Detect text in a video on Cloud Storage

Detect text in a video stored in Cloud Storage. The TEXT_DETECTION feature performs OCR on video frames and returns, for each detected string, the segments in which it appears, a confidence score, and per-frame rotated bounding boxes.

Code sample

C#

public static object DetectTextGcs(string gcsUri)
{
    var client = VideoIntelligenceServiceClient.Create();
    var request = new AnnotateVideoRequest
    {
        InputUri = gcsUri,
        Features = { Feature.TextDetection },
    };

    Console.WriteLine("\nProcessing video for text detection.");
    var op = client.AnnotateVideo(request).PollUntilCompleted();

    // Retrieve the first result because only one video was processed.
    var annotationResults = op.Result.AnnotationResults[0];

    // Get only the first result.
    var textAnnotation = annotationResults.TextAnnotations[0];
    Console.WriteLine($"\nText: {textAnnotation.Text}");

    // Get the first text segment.
    var textSegment = textAnnotation.Segments[0];
    var startTime = textSegment.Segment.StartTimeOffset;
    var endTime = textSegment.Segment.EndTimeOffset;
    Console.Write(
        $"Start time: {startTime.Seconds + startTime.Nanos / 1e9}, ");
    Console.WriteLine(
        $"End time: {endTime.Seconds + endTime.Nanos / 1e9}");

    Console.WriteLine($"Confidence: {textSegment.Confidence}");

    // Show the result for the first frame in this segment.
    var frame = textSegment.Frames[0];
    var timeOffset = frame.TimeOffset;
    Console.Write("Time offset for the first frame: ");
    Console.WriteLine(timeOffset.Seconds + timeOffset.Nanos / 1e9);
    Console.WriteLine("Rotated Bounding Box Vertices:");
    foreach (var vertex in frame.RotatedBoundingBox.Vertices)
    {
        Console.WriteLine(
            $"\tVertex x: {vertex.X}, Vertex.y: {vertex.Y}");
    }
    return 0;
}

Go


import (
	"context"
	"fmt"
	"io"

	video "cloud.google.com/go/videointelligence/apiv1"
	"github.com/golang/protobuf/ptypes"
	videopb "google.golang.org/genproto/googleapis/cloud/videointelligence/v1"
)

// textDetectionGCS analyzes a video stored in Cloud Storage and extracts
// the text that appears in it.
func textDetectionGCS(w io.Writer, gcsURI string) error {
	// gcsURI := "gs://python-docs-samples-tests/video/googlework_short.mp4"

	ctx := context.Background()

	// Creates a client.
	client, err := video.NewClient(ctx)
	if err != nil {
		return fmt.Errorf("video.NewClient: %v", err)
	}

	op, err := client.AnnotateVideo(ctx, &videopb.AnnotateVideoRequest{
		InputUri: gcsURI,
		Features: []videopb.Feature{
			videopb.Feature_TEXT_DETECTION,
		},
	})
	if err != nil {
		return fmt.Errorf("AnnotateVideo: %v", err)
	}

	resp, err := op.Wait(ctx)
	if err != nil {
		return fmt.Errorf("Wait: %v", err)
	}

	// Only one video was processed, so get the first result.
	result := resp.GetAnnotationResults()[0]

	for _, annotation := range result.TextAnnotations {
		fmt.Fprintf(w, "Text: %q\n", annotation.GetText())

		// Get the first text segment.
		segment := annotation.GetSegments()[0]
		start, _ := ptypes.Duration(segment.GetSegment().GetStartTimeOffset())
		end, _ := ptypes.Duration(segment.GetSegment().GetEndTimeOffset())
		fmt.Fprintf(w, "\tSegment: %v to %v\n", start, end)

		fmt.Fprintf(w, "\tConfidence: %f\n", segment.GetConfidence())

		// Show the result for the first frame in this segment.
		frame := segment.GetFrames()[0]
		seconds := float32(frame.GetTimeOffset().GetSeconds())
		nanos := float32(frame.GetTimeOffset().GetNanos())
		fmt.Fprintf(w, "\tTime offset of the first frame: %fs\n", seconds+nanos/1e9)

		fmt.Fprintf(w, "\tRotated bounding box vertices:\n")
		for _, vertex := range frame.GetRotatedBoundingBox().GetVertices() {
			fmt.Fprintf(w, "\t\tVertex x=%f, y=%f\n", vertex.GetX(), vertex.GetY())
		}
	}

	return nil
}

Java

/**
 * Detect Text in a video.
 *
 * @param gcsUri the path to the video file to analyze.
 */
public static VideoAnnotationResults detectTextGcs(String gcsUri) throws Exception {
  try (VideoIntelligenceServiceClient client = VideoIntelligenceServiceClient.create()) {
    // Create the request
    AnnotateVideoRequest request =
        AnnotateVideoRequest.newBuilder()
            .setInputUri(gcsUri)
            .addFeatures(Feature.TEXT_DETECTION)
            .build();

    // Asynchronously perform text detection on the video.
    OperationFuture<AnnotateVideoResponse, AnnotateVideoProgress> future =
        client.annotateVideoAsync(request);

    System.out.println("Waiting for operation to complete...");
    // The first result is retrieved because a single video was processed.
    AnnotateVideoResponse response = future.get(300, TimeUnit.SECONDS);
    VideoAnnotationResults results = response.getAnnotationResults(0);

    // Get only the first annotation for demo purposes.
    TextAnnotation annotation = results.getTextAnnotations(0);
    System.out.println("Text: " + annotation.getText());

    // Get the first text segment.
    TextSegment textSegment = annotation.getSegments(0);
    System.out.println("Confidence: " + textSegment.getConfidence());
    // For the text segment, display its time offset.
    VideoSegment videoSegment = textSegment.getSegment();
    Duration startTimeOffset = videoSegment.getStartTimeOffset();
    Duration endTimeOffset = videoSegment.getEndTimeOffset();
    // Display the offset times in seconds; dividing nanos by 1e9 converts them to seconds.
    System.out.println(
        String.format(
            "Start time: %.2f", startTimeOffset.getSeconds() + startTimeOffset.getNanos() / 1e9));
    System.out.println(
        String.format(
            "End time: %.2f", endTimeOffset.getSeconds() + endTimeOffset.getNanos() / 1e9));

    // Show the first result for the first frame in the segment.
    TextFrame textFrame = textSegment.getFrames(0);
    Duration timeOffset = textFrame.getTimeOffset();
    System.out.println(
        String.format(
            "Time offset for the first frame: %.2f",
            timeOffset.getSeconds() + timeOffset.getNanos() / 1e9));

    // Display the rotated bounding box for where the text is on the frame.
    System.out.println("Rotated Bounding Box Vertices:");
    List<NormalizedVertex> vertices = textFrame.getRotatedBoundingBox().getVerticesList();
    for (NormalizedVertex normalizedVertex : vertices) {
      System.out.println(
          String.format(
              "\tVertex.x: %.2f, Vertex.y: %.2f",
              normalizedVertex.getX(), normalizedVertex.getY()));
    }
    return results;
  }
}

Node.js

// Imports the Google Cloud Video Intelligence library
const Video = require('@google-cloud/video-intelligence');
// Creates a client
const video = new Video.VideoIntelligenceServiceClient();

/**
 * TODO(developer): Uncomment the following line before running the sample.
 */
// const gcsUri = 'GCS URI of the video to analyze, e.g. gs://my-bucket/my-video.mp4';

const request = {
  inputUri: gcsUri,
  features: ['TEXT_DETECTION'],
};
// Detects text in a video
const [operation] = await video.annotateVideo(request);
console.log('Waiting for operation to complete...');
const results = await operation.promise();
// Gets annotations for video
const textAnnotations = results[0].annotationResults[0].textAnnotations;
textAnnotations.forEach(textAnnotation => {
  console.log(`Text ${textAnnotation.text} occurs at:`);
  textAnnotation.segments.forEach(segment => {
    const time = segment.segment;
    console.log(
      ` Start: ${time.startTimeOffset.seconds || 0}.${(
        time.startTimeOffset.nanos / 1e6
      ).toFixed(0)}s`
    );
    console.log(
      ` End: ${time.endTimeOffset.seconds || 0}.${(
        time.endTimeOffset.nanos / 1e6
      ).toFixed(0)}s`
    );
    console.log(` Confidence: ${segment.confidence}`);
    segment.frames.forEach(frame => {
      const timeOffset = frame.timeOffset;
      console.log(
        `Time offset for the frame: ${timeOffset.seconds || 0}` +
          `.${(timeOffset.nanos / 1e6).toFixed(0)}s`
      );
      console.log('Rotated Bounding Box Vertices:');
      frame.rotatedBoundingBox.vertices.forEach(vertex => {
        console.log(`Vertex.x:${vertex.x}, Vertex.y:${vertex.y}`);
      });
    });
  });
});

PHP

use Google\Cloud\VideoIntelligence\V1\VideoIntelligenceServiceClient;
use Google\Cloud\VideoIntelligence\V1\Feature;

/** Uncomment and populate these variables in your code */
// $uri = 'The cloud storage object to analyze (gs://your-bucket-name/your-object-name)';
// $options = [];

# Instantiate a client.
$video = new VideoIntelligenceServiceClient();

# Execute a request.
$operation = $video->annotateVideo([
    'inputUri' => $uri,
    'features' => [Feature::TEXT_DETECTION]
]);

# Wait for the request to complete.
$operation->pollUntilComplete($options);

# Print the results.
if ($operation->operationSucceeded()) {
    $results = $operation->getResult()->getAnnotationResults()[0];

    # Process video text annotations.
    foreach ($results->getTextAnnotations() as $text) {
        printf('Video text description: %s' . PHP_EOL, $text->getText());
        foreach ($text->getSegments() as $segment) {
            $start = $segment->getSegment()->getStartTimeOffset();
            $end = $segment->getSegment()->getEndTimeOffset();
            printf('  Segment: %ss to %ss' . PHP_EOL,
                $start->getSeconds() + $start->getNanos()/1000000000.0,
                $end->getSeconds() + $end->getNanos()/1000000000.0);
            printf('  Confidence: %f' . PHP_EOL, $segment->getConfidence());
        }
    }
    print(PHP_EOL);
} else {
    print_r($operation->getError());
}

Python

"""Detect text in a video stored on GCS."""
from google.cloud import videointelligence

# TODO(developer): Uncomment and set input_uri before running the sample.
# input_uri = "gs://YOUR_BUCKET_ID/path/to/your/video.mp4"
video_client = videointelligence.VideoIntelligenceServiceClient()
features = [videointelligence.Feature.TEXT_DETECTION]

operation = video_client.annotate_video(
    request={"features": features, "input_uri": input_uri}
)

print("\nProcessing video for text detection.")
result = operation.result(timeout=600)

# The first result is retrieved because a single video was processed.
annotation_result = result.annotation_results[0]

for text_annotation in annotation_result.text_annotations:
    print("\nText: {}".format(text_annotation.text))

    # Get the first text segment
    text_segment = text_annotation.segments[0]
    start_time = text_segment.segment.start_time_offset
    end_time = text_segment.segment.end_time_offset
    print(
        "start_time: {}, end_time: {}".format(
            start_time.seconds + start_time.microseconds * 1e-6,
            end_time.seconds + end_time.microseconds * 1e-6,
        )
    )

    print("Confidence: {}".format(text_segment.confidence))

    # Show the result for the first frame in this segment.
    frame = text_segment.frames[0]
    time_offset = frame.time_offset
    print(
        "Time offset for the first frame: {}".format(
            time_offset.seconds + time_offset.microseconds * 1e-6
        )
    )
    print("Rotated Bounding Box Vertices:")
    for vertex in frame.rotated_bounding_box.vertices:
        print("\tVertex.x: {}, Vertex.y: {}".format(vertex.x, vertex.y))

Ruby

# path = "Path to a video file on Google Cloud Storage: gs://bucket/video.mp4"

require "google/cloud/video_intelligence"

video = Google::Cloud::VideoIntelligence.video_intelligence_service

# Start the asynchronous video annotation request
operation = video.annotate_video features: [:TEXT_DETECTION], input_uri: path

puts "Processing video for text detection:"
operation.wait_until_done!

raise operation.error.message if operation.error?
puts "Finished Processing."

text_annotations = operation.results.annotation_results.first.text_annotations
print_text_annotations text_annotations
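
The Ruby sample calls a print_text_annotations helper that is defined elsewhere in the sample file. A minimal sketch of such a helper, assuming the V1 response types shown in the other samples (text, segments with confidence values, and per-frame rotated bounding boxes), might look like this:

# A minimal sketch of the print_text_annotations helper assumed above.
# Field names follow the V1 TextAnnotation response types.
def print_text_annotations text_annotations
  text_annotations.each do |text_annotation|
    puts "Text: #{text_annotation.text}"

    # Inspect only the first segment of each annotation.
    text_segment = text_annotation.segments.first
    start_offset = text_segment.segment.start_time_offset
    end_offset = text_segment.segment.end_time_offset
    puts format("Start time: %.2f, End time: %.2f",
                start_offset.seconds + start_offset.nanos / 1e9,
                end_offset.seconds + end_offset.nanos / 1e9)
    puts "Confidence: #{text_segment.confidence}"

    # Show the rotated bounding box for the first frame in the segment.
    frame = text_segment.frames.first
    offset = frame.time_offset
    puts format("Time offset for the first frame: %.2f",
                offset.seconds + offset.nanos / 1e9)
    puts "Rotated bounding box vertices:"
    frame.rotated_bounding_box.vertices.each do |vertex|
      puts format("\tVertex.x: %.2f, Vertex.y: %.2f", vertex.x, vertex.y)
    end
  end
end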