Detect a person in a local video file

This sample uses the GA release of person detection in the Video Intelligence API to detect people in a video stored in a local file.

Code sample

Java


import com.google.api.gax.longrunning.OperationFuture;
import com.google.cloud.videointelligence.v1.AnnotateVideoProgress;
import com.google.cloud.videointelligence.v1.AnnotateVideoRequest;
import com.google.cloud.videointelligence.v1.AnnotateVideoResponse;
import com.google.cloud.videointelligence.v1.DetectedAttribute;
import com.google.cloud.videointelligence.v1.DetectedLandmark;
import com.google.cloud.videointelligence.v1.Feature;
import com.google.cloud.videointelligence.v1.PersonDetectionAnnotation;
import com.google.cloud.videointelligence.v1.PersonDetectionConfig;
import com.google.cloud.videointelligence.v1.TimestampedObject;
import com.google.cloud.videointelligence.v1.Track;
import com.google.cloud.videointelligence.v1.VideoAnnotationResults;
import com.google.cloud.videointelligence.v1.VideoContext;
import com.google.cloud.videointelligence.v1.VideoIntelligenceServiceClient;
import com.google.cloud.videointelligence.v1.VideoSegment;
import com.google.protobuf.ByteString;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class DetectPerson {

  public static void detectPerson() throws Exception {
    // TODO(developer): Replace these variables before running the sample.
    String localFilePath = "resources/googlework_short.mp4";
    detectPerson(localFilePath);
  }

  // Detects people in a video stored in a local file using the Cloud Video Intelligence API.
  public static void detectPerson(String localFilePath) throws Exception {
    try (VideoIntelligenceServiceClient videoIntelligenceServiceClient =
        VideoIntelligenceServiceClient.create()) {
      // Reads a local video file and copies its bytes into a ByteString for the request.
      Path path = Paths.get(localFilePath);
      byte[] data = Files.readAllBytes(path);
      ByteString inputContent = ByteString.copyFrom(data);

      PersonDetectionConfig personDetectionConfig =
          PersonDetectionConfig.newBuilder()
              // Must set includeBoundingBoxes to true to get poses and attributes.
              .setIncludeBoundingBoxes(true)
              .setIncludePoseLandmarks(true)
              .setIncludeAttributes(true)
              .build();
      VideoContext videoContext =
          VideoContext.newBuilder().setPersonDetectionConfig(personDetectionConfig).build();

      AnnotateVideoRequest request =
          AnnotateVideoRequest.newBuilder()
              .setInputContent(inputContent)
              .addFeatures(Feature.PERSON_DETECTION)
              .setVideoContext(videoContext)
              .build();

      // Detects people in a video
      // We get the first result because only one video is processed.
      OperationFuture<AnnotateVideoResponse, AnnotateVideoProgress> future =
          videoIntelligenceServiceClient.annotateVideoAsync(request);

      System.out.println("Waiting for operation to complete...");
      AnnotateVideoResponse response = future.get();

      // Gets annotations for video
      VideoAnnotationResults annotationResult = response.getAnnotationResultsList().get(0);

      // Annotations for list of people detected, tracked and recognized in video.
      for (PersonDetectionAnnotation personDetectionAnnotation :
          annotationResult.getPersonDetectionAnnotationsList()) {
        System.out.print("Person detected:\n");
        for (Track track : personDetectionAnnotation.getTracksList()) {
          VideoSegment segment = track.getSegment();
          System.out.printf(
              "\tStart: %.3fs\n",
              segment.getStartTimeOffset().getSeconds()
                  + segment.getStartTimeOffset().getNanos() / 1e9);
          System.out.printf(
              "\tEnd: %.3fs\n",
              segment.getEndTimeOffset().getSeconds()
                  + segment.getEndTimeOffset().getNanos() / 1e9);

          // Each segment includes timestamped objects that capture characteristics
          // of the person detected, e.g. clothes and posture.
          TimestampedObject firstTimestampedObject = track.getTimestampedObjects(0);

          // Attributes describe the person detected, for example unique pieces
          // of clothing; body poses are reported separately as landmarks below.
          for (DetectedAttribute attribute : firstTimestampedObject.getAttributesList()) {
            System.out.printf(
                "\tAttribute: %s; Value: %s\n", attribute.getName(), attribute.getValue());
          }

          // Landmarks in person detection include body parts.
          for (DetectedLandmark landmark : firstTimestampedObject.getLandmarksList()) {
            System.out.printf(
                "\tLandmark: %s; Vertex: %f, %f\n",
                landmark.getName(), landmark.getPoint().getX(), landmark.getPoint().getY());
          }
        }
      }
    }
  }
}

Node.js


// Imports the Google Cloud Video Intelligence library + Node's fs library
const Video = require('@google-cloud/video-intelligence').v1;
const fs = require('fs');
// Creates a client
const video = new Video.VideoIntelligenceServiceClient();

/**
 * TODO(developer): Uncomment the following line before running the sample.
 */
// const path = 'Local file to analyze, e.g. ./my-file.mp4';

// Reads a local video file and converts it to base64
const file = fs.readFileSync(path);
const inputContent = file.toString('base64');

async function detectPerson() {
  const request = {
    inputContent: inputContent,
    features: ['PERSON_DETECTION'],
    videoContext: {
      personDetectionConfig: {
        // Must set includeBoundingBoxes to true to get poses and attributes.
        includeBoundingBoxes: true,
        includePoseLandmarks: true,
        includeAttributes: true,
      },
    },
  };
  // Detects people in a video
  // We get the first result because we only process one video
  const [operation] = await video.annotateVideo(request);
  console.log('Waiting for operation to complete...');
  const results = await operation.promise();

  // Gets annotations for video
  const personAnnotations =
    results[0].annotationResults[0].personDetectionAnnotations;

  for (const {tracks} of personAnnotations) {
    console.log('Person detected:');

    for (const {segment, timestampedObjects} of tracks) {
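      // Unset (zero-valued) seconds/nanos fields come back undefined, so
      // default them to 0 before printing.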
      if (segment.startTimeOffset.seconds === undefined) {
        segment.startTimeOffset.seconds = 0;
      }
      if (segment.startTimeOffset.nanos === undefined) {
        segment.startTimeOffset.nanos = 0;
      }
      if (segment.endTimeOffset.seconds === undefined) {
        segment.endTimeOffset.seconds = 0;
      }
      if (segment.endTimeOffset.nanos === undefined) {
        segment.endTimeOffset.nanos = 0;
      }
      const start =
        Number(segment.startTimeOffset.seconds) +
        segment.startTimeOffset.nanos / 1e9;
      const end =
        Number(segment.endTimeOffset.seconds) +
        segment.endTimeOffset.nanos / 1e9;
      console.log(`\tStart: ${start.toFixed(3)}s`);
      console.log(`\tEnd: ${end.toFixed(3)}s`);

      // Each segment includes timestamped objects that
      // include characteristic--e.g. clothes, posture
      // of the person detected.
      const [firstTimestampedObject] = timestampedObjects;

      // Attributes include unique pieces of clothing, poses (i.e., body
      // landmarks) of the person detected.
      for (const {name, value} of firstTimestampedObject.attributes) {
        console.log(`\tAttribute: ${name}; Value: ${value}`);
      }

      // Landmarks in person detection include body parts.
      for (const {name, point} of firstTimestampedObject.landmarks) {
        console.log(`\tLandmark: ${name}; Vertex: ${point.x}, ${point.y}`);
      }
    }
  }
}

detectPerson();

Python

import io

from google.cloud import videointelligence_v1 as videointelligence


def detect_person(local_file_path="path/to/your/video-file.mp4"):
    """Detects people in a video from a local file."""

    client = videointelligence.VideoIntelligenceServiceClient()

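    # Read the local video file into memory; its bytes are sent inline with
    # the request instead of being referenced from Cloud Storage.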
    with io.open(local_file_path, "rb") as f:
        input_content = f.read()

    # Configure the request
    config = videointelligence.types.PersonDetectionConfig(
        include_bounding_boxes=True,
        include_attributes=True,
        include_pose_landmarks=True,
    )
    context = videointelligence.types.VideoContext(person_detection_config=config)

    # Start the asynchronous request
    operation = client.annotate_video(
        request={
            "features": [videointelligence.Feature.PERSON_DETECTION],
            "input_content": input_content,
            "video_context": context,
        }
    )

    print("\nProcessing video for person detection annotations.")
    result = operation.result(timeout=300)

    print("\nFinished processing.\n")

    # Retrieve the first result, because a single video was processed.
    annotation_result = result.annotation_results[0]

    for annotation in annotation_result.person_detection_annotations:
        print("Person detected:")
        for track in annotation.tracks:
            print(
                "Segment: {}s to {}s".format(
                    track.segment.start_time_offset.seconds
                    + track.segment.start_time_offset.microseconds / 1e6,
                    track.segment.end_time_offset.seconds
                    + track.segment.end_time_offset.microseconds / 1e6,
                )
            )

            # Each segment includes timestamped objects that capture
            # characteristics of the person detected, e.g. clothes and posture.
            # Grab the first timestamped object.
            timestamped_object = track.timestamped_objects[0]
            box = timestamped_object.normalized_bounding_box
            print("Bounding box:")
            print("\tleft  : {}".format(box.left))
            print("\ttop   : {}".format(box.top))
            print("\tright : {}".format(box.right))
            print("\tbottom: {}".format(box.bottom))

            # Attributes include unique pieces of clothing,
            # poses, or hair color.
            print("Attributes:")
            for attribute in timestamped_object.attributes:
                print(
                    "\t{}:{} {}".format(
                        attribute.name, attribute.value, attribute.confidence
                    )
                )

            # Landmarks in person detection include body parts such as
            # left_shoulder, right_ear, and right_ankle
            print("Landmarks:")
            for landmark in timestamped_object.landmarks:
                print(
                    "\t{}: {} (x={}, y={})".format(
                        landmark.name,
                        landmark.confidence,
                        landmark.point.x,  # Normalized vertex
                        landmark.point.y,  # Normalized vertex
                    )
                )
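
To try the sample end to end, call the function with the path to a local video. A minimal sketch, assuming Application Default Credentials are configured for a project with the Video Intelligence API enabled; the file path is a placeholder to replace with your own video:

if __name__ == "__main__":
    # Hypothetical invocation; substitute the path to your own video file.
    detect_person("resources/googlework_short.mp4")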