Recognizing text

Text Detection performs Optical Character Recognition (OCR) to detect and extract the text that appears in an input video.

Text detection is available for all the languages supported by the Cloud Vision API.

Request Text Detection for a Video on Google Cloud Storage

The following samples demonstrate text detection on a file located in Cloud Storage.

REST & CMD LINE

Send video annotation request

The following shows how to send a POST request to the videos:annotate method. The example uses the access token for a service account set up for the project using the Cloud SDK. For instructions on installing the Cloud SDK, setting up a project with a service account, and obtaining an access token, see the Video Intelligence API Quickstart.

Before using any of the request data below, make the following replacements:

  • input-uri: a Cloud Storage bucket that contains the file you want to annotate, including the file name. Must start with gs://.
    For example: "inputUri": "gs://cloud-videointelligence-demo/assistant.mp4",
  • language-code: [Optional] A language hint for text detection, for example "en-US"

HTTP method and URL:

POST https://videointelligence.googleapis.com/v1/videos:annotate

Request JSON body:

{
  "inputUri": "input-uri",
  "features": ["TEXT_DETECTION"],
  "videoContext": {
    "textDetectionConfig": {
      "languageHints": ["language-code"]
    }
  }
}

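One way to send the request is with curl. The following is a minimal sketch that assumes you have saved the request JSON body above, with your replacements, to a file named request.json, and that the Cloud SDK is installed and authenticated:

curl -X POST \
     -H "Authorization: Bearer $(gcloud auth print-access-token)" \
     -H "Content-Type: application/json; charset=utf-8" \
     -d @request.json \
     "https://videointelligence.googleapis.com/v1/videos:annotate"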

You should receive a JSON response similar to the following:

{
  "name": "projects/project-number/locations/location-id/operations/operation-id"
}

If the request is successful, the Video Intelligence API returns the name of your operation. The above shows an example of such a response, where:

  • project-number: the number of your project
  • location-id: the Cloud region where annotation should take place. Supported cloud regions are: us-east1, us-west1, europe-west1, asia-east1. If no region is specified, a region will be determined based on video file location.
  • operation-id: the ID of the long-running operation created for the request and provided in the response when you started the operation, for example 12345...

Get annotation results

To retrieve the result of the operation, make a GET request, using the operation name returned from the call to videos:annotate, as shown in the following example.

Before using any of the request data below, make the following replacements:

  • operation-name: the name of the operation as returned by Video Intelligence API. The operation name has the format projects/project-number/locations/location-id/operations/operation-id

HTTP method and URL:

GET https://videointelligence.googleapis.com/v1/operation-name

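To poll the operation, you can again use curl. A minimal sketch, substituting the operation name returned above:

curl -X GET \
     -H "Authorization: Bearer $(gcloud auth print-access-token)" \
     "https://videointelligence.googleapis.com/v1/operation-name"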

When the operation has completed, the response contains the annotation results. Text detection annotations are returned as a textAnnotations list. Note: The done field is returned only when its value is true; it is omitted from responses for operations that have not yet completed.
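
The full response is lengthy; the following abridged sketch only illustrates the general shape of a textAnnotations entry, limited to the fields that the client samples below read (the detected text, its segments with confidence, and per-frame rotated bounding boxes). All field values are placeholders:

{
  "name": "projects/project-number/locations/location-id/operations/operation-id",
  "done": true,
  "response": {
    "@type": "type.googleapis.com/google.cloud.videointelligence.v1.AnnotateVideoResponse",
    "annotationResults": [
      {
        "inputUri": "input-uri",
        "textAnnotations": [
          {
            "text": "EXAMPLE TEXT",
            "segments": [
              {
                "segment": {
                  "startTimeOffset": "0.125s",
                  "endTimeOffset": "0.791666s"
                },
                "confidence": 0.99,
                "frames": [
                  {
                    "rotatedBoundingBox": {
                      "vertices": [
                        { "x": 0.16, "y": 0.27 },
                        { "x": 0.36, "y": 0.27 },
                        { "x": 0.36, "y": 0.31 },
                        { "x": 0.16, "y": 0.31 }
                      ]
                    },
                    "timeOffset": "0.125s"
                  }
                ]
              }
            ]
          }
        ]
      }
    ]
  }
}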

C#

public static object DetectTextGcs(string gcsUri)
{
    var client = VideoIntelligenceServiceClient.Create();
    var request = new AnnotateVideoRequest
    {
        InputUri = gcsUri,
        Features = { Feature.TextDetection },
    };

    Console.WriteLine("\nProcessing video for text detection.");
    var op = client.AnnotateVideo(request).PollUntilCompleted();

    // Retrieve the first result because only one video was processed.
    var annotationResults = op.Result.AnnotationResults[0];

    // Get only the first result.
    var textAnnotation = annotationResults.TextAnnotations[0];
    Console.WriteLine($"\nText: {textAnnotation.Text}");

    // Get the first text segment.
    var textSegment = textAnnotation.Segments[0];
    var startTime = textSegment.Segment.StartTimeOffset;
    var endTime = textSegment.Segment.EndTimeOffset;
    Console.Write(
        $"Start time: {startTime.Seconds + startTime.Nanos / 1e9 }, ");
    Console.WriteLine(
        $"End time: {endTime.Seconds + endTime.Nanos / 1e9 }");

    Console.WriteLine($"Confidence: {textSegment.Confidence}");

    // Show the result for the first frame in this segment.
    var frame = textSegment.Frames[0];
    var timeOffset = frame.TimeOffset;
    Console.Write("Time offset for the first frame: ");
    Console.WriteLine(timeOffset.Seconds + timeOffset.Nanos / 1e9);
    Console.WriteLine("Rotated Bounding Box Vertices:");
    foreach (var vertex in frame.RotatedBoundingBox.Vertices)
    {
        Console.WriteLine(
            $"\tVertex x: {vertex.X}, Vertex.y: {vertex.Y}");
    }
    return 0;
}

Go


import (
	"context"
	"fmt"
	"io"

	video "cloud.google.com/go/videointelligence/apiv1"
	"github.com/golang/protobuf/ptypes"
	videopb "google.golang.org/genproto/googleapis/cloud/videointelligence/v1"
)

// textDetectionGCS analyzes a video stored in Cloud Storage and extracts the text that appears in it.
func textDetectionGCS(w io.Writer, gcsURI string) error {
	// gcsURI := "gs://python-docs-samples-tests/video/googlework_short.mp4"

	ctx := context.Background()

	// Creates a client.
	client, err := video.NewClient(ctx)
	if err != nil {
		return fmt.Errorf("video.NewClient: %v", err)
	}

	op, err := client.AnnotateVideo(ctx, &videopb.AnnotateVideoRequest{
		InputUri: gcsURI,
		Features: []videopb.Feature{
			videopb.Feature_TEXT_DETECTION,
		},
	})
	if err != nil {
		return fmt.Errorf("AnnotateVideo: %v", err)
	}

	resp, err := op.Wait(ctx)
	if err != nil {
		return fmt.Errorf("Wait: %v", err)
	}

	// Only one video was processed, so get the first result.
	result := resp.GetAnnotationResults()[0]

	for _, annotation := range result.TextAnnotations {
		fmt.Fprintf(w, "Text: %q\n", annotation.GetText())

		// Get the first text segment.
		segment := annotation.GetSegments()[0]
		start, _ := ptypes.Duration(segment.GetSegment().GetStartTimeOffset())
		end, _ := ptypes.Duration(segment.GetSegment().GetEndTimeOffset())
		fmt.Fprintf(w, "\tSegment: %v to %v\n", start, end)

		fmt.Fprintf(w, "\tConfidence: %f\n", segment.GetConfidence())

		// Show the result for the first frame in this segment.
		frame := segment.GetFrames()[0]
		seconds := float32(frame.GetTimeOffset().GetSeconds())
		nanos := float32(frame.GetTimeOffset().GetNanos())
		fmt.Fprintf(w, "\tTime offset of the first frame: %fs\n", seconds+nanos/1e9)

		fmt.Fprintf(w, "\tRotated bounding box vertices:\n")
		for _, vertex := range frame.GetRotatedBoundingBox().GetVertices() {
			fmt.Fprintf(w, "\t\tVertex x=%f, y=%f\n", vertex.GetX(), vertex.GetY())
		}
	}

	return nil
}

Java

/**
 * Detect Text in a video.
 *
 * @param gcsUri the path to the video file to analyze.
 */
public static VideoAnnotationResults detectTextGcs(String gcsUri) throws Exception {
  try (VideoIntelligenceServiceClient client = VideoIntelligenceServiceClient.create()) {
    // Create the request
    AnnotateVideoRequest request = AnnotateVideoRequest.newBuilder()
        .setInputUri(gcsUri)
        .addFeatures(Feature.TEXT_DETECTION)
        .build();

    // asynchronously perform text detection on the video
    OperationFuture<AnnotateVideoResponse, AnnotateVideoProgress> future =
        client.annotateVideoAsync(request);

    System.out.println("Waiting for operation to complete...");
    // The first result is retrieved because a single video was processed.
    AnnotateVideoResponse response = future.get(300, TimeUnit.SECONDS);
    VideoAnnotationResults results = response.getAnnotationResults(0);

    // Get only the first annotation for demo purposes.
    TextAnnotation annotation = results.getTextAnnotations(0);
    System.out.println("Text: " + annotation.getText());

    // Get the first text segment.
    TextSegment textSegment = annotation.getSegments(0);
    System.out.println("Confidence: " + textSegment.getConfidence());
    // For the text segment, display its time offset
    VideoSegment videoSegment = textSegment.getSegment();
    Duration startTimeOffset = videoSegment.getStartTimeOffset();
    Duration endTimeOffset = videoSegment.getEndTimeOffset();
    // Display the offset times in seconds, 1e9 is part of the formula to convert nanos to seconds
    System.out.println(String.format("Start time: %.2f",
        startTimeOffset.getSeconds() + startTimeOffset.getNanos() / 1e9));
    System.out.println(String.format("End time: %.2f",
        endTimeOffset.getSeconds() + endTimeOffset.getNanos() / 1e9));

    // Show the first result for the first frame in the segment.
    TextFrame textFrame = textSegment.getFrames(0);
    Duration timeOffset = textFrame.getTimeOffset();
    System.out.println(String.format("Time offset for the first frame: %.2f",
        timeOffset.getSeconds() + timeOffset.getNanos() / 1e9));

    // Display the rotated bounding box for where the text is on the frame.
    System.out.println("Rotated Bounding Box Vertices:");
    List<NormalizedVertex> vertices = textFrame.getRotatedBoundingBox().getVerticesList();
    for (NormalizedVertex normalizedVertex : vertices) {
      System.out.println(String.format(
          "\tVertex.x: %.2f, Vertex.y: %.2f",
          normalizedVertex.getX(),
          normalizedVertex.getY()));
    }
    return results;
  }
}

Node.js

// Imports the Google Cloud Video Intelligence library
const Video = require('@google-cloud/video-intelligence');
// Creates a client
const video = new Video.VideoIntelligenceServiceClient();

/**
 * TODO(developer): Uncomment the following line before running the sample.
 */
// const gcsUri = 'GCS URI of the video to analyze, e.g. gs://my-bucket/my-video.mp4';

const request = {
  inputUri: gcsUri,
  features: ['TEXT_DETECTION'],
};
// Detects text in a video
const [operation] = await video.annotateVideo(request);
console.log('Waiting for operation to complete...');
const results = await operation.promise();
// Gets annotations for video
const textAnnotations = results[0].annotationResults[0].textAnnotations;
textAnnotations.forEach(textAnnotation => {
  console.log(`Text ${textAnnotation.text} occurs at:`);
  textAnnotation.segments.forEach(segment => {
    const time = segment.segment;
    console.log(
      ` Start: ${time.startTimeOffset.seconds || 0}.${(
        time.startTimeOffset.nanos / 1e6
      ).toFixed(0)}s`
    );
    console.log(
      ` End: ${time.endTimeOffset.seconds || 0}.${(
        time.endTimeOffset.nanos / 1e6
      ).toFixed(0)}s`
    );
    console.log(` Confidence: ${segment.confidence}`);
    segment.frames.forEach(frame => {
      const timeOffset = frame.timeOffset;
      console.log(
        `Time offset for the frame: ${timeOffset.seconds || 0}` +
          `.${(timeOffset.nanos / 1e6).toFixed(0)}s`
      );
      console.log('Rotated Bounding Box Vertices:');
      frame.rotatedBoundingBox.vertices.forEach(vertex => {
        console.log(`Vertex.x:${vertex.x}, Vertex.y:${vertex.y}`);
      });
    });
  });
});

Python

"""Detect text in a video stored on GCS."""
from google.cloud import videointelligence

video_client = videointelligence.VideoIntelligenceServiceClient()
features = [videointelligence.enums.Feature.TEXT_DETECTION]

operation = video_client.annotate_video(input_uri=input_uri, features=features)

print("\nProcessing video for text detection.")
result = operation.result(timeout=600)

# The first result is retrieved because a single video was processed.
annotation_result = result.annotation_results[0]

for text_annotation in annotation_result.text_annotations:
    print("\nText: {}".format(text_annotation.text))

    # Get the first text segment
    text_segment = text_annotation.segments[0]
    start_time = text_segment.segment.start_time_offset
    end_time = text_segment.segment.end_time_offset
    print(
        "start_time: {}, end_time: {}".format(
            start_time.seconds + start_time.nanos * 1e-9,
            end_time.seconds + end_time.nanos * 1e-9,
        )
    )

    print("Confidence: {}".format(text_segment.confidence))

    # Show the result for the first frame in this segment.
    frame = text_segment.frames[0]
    time_offset = frame.time_offset
    print(
        "Time offset for the first frame: {}".format(
            time_offset.seconds + time_offset.nanos * 1e-9
        )
    )
    print("Rotated Bounding Box Vertices:")
    for vertex in frame.rotated_bounding_box.vertices:
        print("\tVertex.x: {}, Vertex.y: {}".format(vertex.x, vertex.y))

Ruby

# path = "Path to a video file on Google Cloud Storage: gs://bucket/video.mp4"

require "google/cloud/video_intelligence"

video = Google::Cloud::VideoIntelligence.video_intelligence_service

# Start the asynchronous video annotation request
operation = video.annotate_video features: [:TEXT_DETECTION], input_uri: path

puts "Processing video for text detection:"
operation.wait_until_done!

raise operation.results.message? if operation.error?
puts "Finished Processing."

text_annotations = operation.results.annotation_results.first.text_annotations
print_text_annotations text_annotations

Request Text Detection for a Video from a Local File

The following samples demonstrate text detection on a file stored locally.

REST & CMD LINE

Send video annotation request

To perform annotation on a local video file, be sure to base64-encode the contents of the video file. Include the base64-encoded contents in the inputContent field of the request. For information on how to base64-encode the contents of a video file, see Base64 Encoding.
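
As a quick sketch of the encoding step on Linux (assuming GNU coreutils; the flag differs on macOS), the -w 0 option disables line wrapping so the encoded content can be embedded directly in the JSON request:

base64 -w 0 your_video.mp4 > video_base64.txt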

The following shows how to send a POST request to the videos:annotate method. The example uses the access token for a service account set up for the project using the Cloud SDK. For instructions on installing the Cloud SDK, setting up a project with a service account, and obtaining an access token, see the Video Intelligence API Quickstart.

Before using any of the request data below, make the following replacements:

  • "inputContent": base-64-encoded-content
    For example:
    "UklGRg41AwBBVkkgTElTVAwBAABoZHJsYXZpaDgAAAA1ggAAxPMBAAAAAAAQCAA..."
  • language-code: [Optional] A language hint for text detection, for example "en-US"

HTTP method and URL:

POST https://videointelligence.googleapis.com/v1/videos:annotate

Request JSON body:

{
  "inputContent": "base-64-encoded-content",
  "features": ["TEXT_DETECTION"],
  "videoContext": {
    "textDetectionConfig": {
      "languageHints": ["language-code"]
    }
  }
}

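The curl command is the same as in the Cloud Storage example; only the request body changes. As a rough sketch (assuming video_base64.txt from the encoding step above and an authenticated Cloud SDK), you could build and send the request like this:

cat > request.json <<EOF
{
  "inputContent": "$(cat video_base64.txt)",
  "features": ["TEXT_DETECTION"]
}
EOF

curl -X POST \
     -H "Authorization: Bearer $(gcloud auth print-access-token)" \
     -H "Content-Type: application/json; charset=utf-8" \
     -d @request.json \
     "https://videointelligence.googleapis.com/v1/videos:annotate"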

You should receive a JSON response similar to the following:

{
  "name": "projects/project-number/locations/location-id/operations/operation-id"
}

If the request is successful, the Video Intelligence API returns the name of your operation. The above shows an example of such a response, where project-number is the number of your project and operation-id is the ID of the long-running operation created for the request.

  • operation-id: provided in the response when you started the operation, for example 12345...

Get annotation results

To retrieve the result of the operation, make a GET request, using the operation name returned from the call to videos:annotate, as shown in the following example.

HTTP method and URL:

GET https://videointelligence.googleapis.com/v1/operation-name

To send your request, you can use the same curl command shown in the Cloud Storage example above, substituting the operation name returned for this request.

When the operation has completed, the response contains the annotation results. Text detection annotations are returned as a textAnnotations list, as shown in the Cloud Storage example above. Note: The done field is returned only when its value is true; it is omitted from responses for operations that have not yet completed.

C#

public static object DetectText(string filePath)
{
    var client = VideoIntelligenceServiceClient.Create();
    var request = new AnnotateVideoRequest
    {
        InputContent = Google.Protobuf.ByteString.CopyFrom(File.ReadAllBytes(filePath)),
        Features = { Feature.TextDetection },
    };

    Console.WriteLine("\nProcessing video for text detection.");
    var op = client.AnnotateVideo(request).PollUntilCompleted();

    // Retrieve the first result because only one video was processed.
    var annotationResults = op.Result.AnnotationResults[0];

    // Get only the first result.
    var textAnnotation = annotationResults.TextAnnotations[0];
    Console.WriteLine($"\nText: {textAnnotation.Text}");

    // Get the first text segment.
    var textSegment = textAnnotation.Segments[0];
    var startTime = textSegment.Segment.StartTimeOffset;
    var endTime = textSegment.Segment.EndTimeOffset;
    Console.Write(
        $"Start time: {startTime.Seconds + startTime.Nanos / 1e9 }, ");
    Console.WriteLine(
        $"End time: {endTime.Seconds + endTime.Nanos / 1e9 }");

    Console.WriteLine($"Confidence: {textSegment.Confidence}");

    // Show the result for the first frame in this segment.
    var frame = textSegment.Frames[0];
    var timeOffset = frame.TimeOffset;
    Console.Write("Time offset for the first frame: ");
    Console.WriteLine(timeOffset.Seconds + timeOffset.Nanos / 1e9);
    Console.WriteLine("Rotated Bounding Box Vertices:");
    foreach (var vertex in frame.RotatedBoundingBox.Vertices)
    {
        Console.WriteLine(
            $"\tVertex x: {vertex.X}, Vertex.y: {vertex.Y}");
    }
    return 0;
}

Go


import (
	"context"
	"fmt"
	"io"
	"io/ioutil"

	video "cloud.google.com/go/videointelligence/apiv1"
	"github.com/golang/protobuf/ptypes"
	videopb "google.golang.org/genproto/googleapis/cloud/videointelligence/v1"
)

// textDetection analyzes a local video file and extracts the text that appears in it.
func textDetection(w io.Writer, filename string) error {
	// filename := "../testdata/googlework_short.mp4"

	ctx := context.Background()

	// Creates a client.
	client, err := video.NewClient(ctx)
	if err != nil {
		return fmt.Errorf("video.NewClient: %v", err)
	}

	fileBytes, err := ioutil.ReadFile(filename)
	if err != nil {
		return fmt.Errorf("ioutil.ReadFile: %v", err)
	}

	op, err := client.AnnotateVideo(ctx, &videopb.AnnotateVideoRequest{
		InputContent: fileBytes,
		Features: []videopb.Feature{
			videopb.Feature_TEXT_DETECTION,
		},
	})
	if err != nil {
		return fmt.Errorf("AnnotateVideo: %v", err)
	}

	resp, err := op.Wait(ctx)
	if err != nil {
		return fmt.Errorf("Wait: %v", err)
	}

	// Only one video was processed, so get the first result.
	result := resp.GetAnnotationResults()[0]

	for _, annotation := range result.TextAnnotations {
		fmt.Fprintf(w, "Text: %q\n", annotation.GetText())

		// Get the first text segment.
		segment := annotation.GetSegments()[0]
		start, _ := ptypes.Duration(segment.GetSegment().GetStartTimeOffset())
		end, _ := ptypes.Duration(segment.GetSegment().GetEndTimeOffset())
		fmt.Fprintf(w, "\tSegment: %v to %v\n", start, end)

		fmt.Fprintf(w, "\tConfidence: %f\n", segment.GetConfidence())

		// Show the result for the first frame in this segment.
		frame := segment.GetFrames()[0]
		seconds := float32(frame.GetTimeOffset().GetSeconds())
		nanos := float32(frame.GetTimeOffset().GetNanos())
		fmt.Fprintf(w, "\tTime offset of the first frame: %fs\n", seconds+nanos/1e9)

		fmt.Fprintf(w, "\tRotated bounding box vertices:\n")
		for _, vertex := range frame.GetRotatedBoundingBox().GetVertices() {
			fmt.Fprintf(w, "\t\tVertex x=%f, y=%f\n", vertex.GetX(), vertex.GetY())
		}
	}

	return nil
}

Java

/**
 * Detect text in a video.
 *
 * @param filePath the path to the video file to analyze.
 */
public static VideoAnnotationResults detectText(String filePath) throws Exception {
  try (VideoIntelligenceServiceClient client = VideoIntelligenceServiceClient.create()) {
    // Read file
    Path path = Paths.get(filePath);
    byte[] data = Files.readAllBytes(path);

    // Create the request
    AnnotateVideoRequest request = AnnotateVideoRequest.newBuilder()
        .setInputContent(ByteString.copyFrom(data))
        .addFeatures(Feature.TEXT_DETECTION)
        .build();

    // asynchronously perform text detection on the video
    OperationFuture<AnnotateVideoResponse, AnnotateVideoProgress> future =
        client.annotateVideoAsync(request);

    System.out.println("Waiting for operation to complete...");
    // The first result is retrieved because a single video was processed.
    AnnotateVideoResponse response = future.get(300, TimeUnit.SECONDS);
    VideoAnnotationResults results = response.getAnnotationResults(0);

    // Get only the first annotation for demo purposes.
    TextAnnotation annotation = results.getTextAnnotations(0);
    System.out.println("Text: " + annotation.getText());

    // Get the first text segment.
    TextSegment textSegment = annotation.getSegments(0);
    System.out.println("Confidence: " + textSegment.getConfidence());
    // For the text segment, display its time offset
    VideoSegment videoSegment = textSegment.getSegment();
    Duration startTimeOffset = videoSegment.getStartTimeOffset();
    Duration endTimeOffset = videoSegment.getEndTimeOffset();
    // Display the offset times in seconds, 1e9 is part of the formula to convert nanos to seconds
    System.out.println(String.format("Start time: %.2f",
        startTimeOffset.getSeconds() + startTimeOffset.getNanos() / 1e9));
    System.out.println(String.format("End time: %.2f",
        endTimeOffset.getSeconds() + endTimeOffset.getNanos() / 1e9));

    // Show the first result for the first frame in the segment.
    TextFrame textFrame = textSegment.getFrames(0);
    Duration timeOffset = textFrame.getTimeOffset();
    System.out.println(String.format("Time offset for the first frame: %.2f",
        timeOffset.getSeconds() + timeOffset.getNanos() / 1e9));

    // Display the rotated bounding box for where the text is on the frame.
    System.out.println("Rotated Bounding Box Vertices:");
    List<NormalizedVertex> vertices = textFrame.getRotatedBoundingBox().getVerticesList();
    for (NormalizedVertex normalizedVertex : vertices) {
      System.out.println(String.format(
          "\tVertex.x: %.2f, Vertex.y: %.2f",
          normalizedVertex.getX(),
          normalizedVertex.getY()));
    }
    return results;
  }
}

Node.js

// Imports the Google Cloud Video Intelligence library, plus Node's fs and util libraries
const Video = require('@google-cloud/video-intelligence');
const fs = require('fs');
const util = require('util');
// Creates a client
const video = new Video.VideoIntelligenceServiceClient();

/**
 * TODO(developer): Uncomment the following line before running the sample.
 */
// const path = 'Local file to analyze, e.g. ./my-file.mp4';

// Reads a local video file and converts it to base64
const file = await util.promisify(fs.readFile)(path);
const inputContent = file.toString('base64');

const request = {
  inputContent: inputContent,
  features: ['TEXT_DETECTION'],
};
// Detects text in a video
const [operation] = await video.annotateVideo(request);
console.log('Waiting for operation to complete...');
const results = await operation.promise();

// Gets annotations for video
const textAnnotations = results[0].annotationResults[0].textAnnotations;
textAnnotations.forEach(textAnnotation => {
  console.log(`Text ${textAnnotation.text} occurs at:`);
  textAnnotation.segments.forEach(segment => {
    const time = segment.segment;
    if (time.startTimeOffset.seconds === undefined) {
      time.startTimeOffset.seconds = 0;
    }
    if (time.startTimeOffset.nanos === undefined) {
      time.startTimeOffset.nanos = 0;
    }
    if (time.endTimeOffset.seconds === undefined) {
      time.endTimeOffset.seconds = 0;
    }
    if (time.endTimeOffset.nanos === undefined) {
      time.endTimeOffset.nanos = 0;
    }
    console.log(
      `\tStart: ${time.startTimeOffset.seconds || 0}` +
        `.${(time.startTimeOffset.nanos / 1e6).toFixed(0)}s`
    );
    console.log(
      `\tEnd: ${time.endTimeOffset.seconds || 0}.` +
        `${(time.endTimeOffset.nanos / 1e6).toFixed(0)}s`
    );
    console.log(`\tConfidence: ${segment.confidence}`);
    segment.frames.forEach(frame => {
      const timeOffset = frame.timeOffset;
      console.log(
        `Time offset for the frame: ${timeOffset.seconds || 0}` +
          `.${(timeOffset.nanos / 1e6).toFixed(0)}s`
      );
      console.log('Rotated Bounding Box Vertices:');
      frame.rotatedBoundingBox.vertices.forEach(vertex => {
        console.log(`Vertex.x:${vertex.x}, Vertex.y:${vertex.y}`);
      });
    });
  });
});

Python

"""Detect text in a local video."""
from google.cloud import videointelligence

video_client = videointelligence.VideoIntelligenceServiceClient()
features = [videointelligence.enums.Feature.TEXT_DETECTION]
video_context = videointelligence.types.VideoContext()

with io.open(path, "rb") as file:
    input_content = file.read()

operation = video_client.annotate_video(
    input_content=input_content,  # the bytes of the video file
    features=features,
    video_context=video_context,
)

print("\nProcessing video for text detection.")
result = operation.result(timeout=300)

# The first result is retrieved because a single video was processed.
annotation_result = result.annotation_results[0]

for text_annotation in annotation_result.text_annotations:
    print("\nText: {}".format(text_annotation.text))

    # Get the first text segment
    text_segment = text_annotation.segments[0]
    start_time = text_segment.segment.start_time_offset
    end_time = text_segment.segment.end_time_offset
    print(
        "start_time: {}, end_time: {}".format(
            start_time.seconds + start_time.nanos * 1e-9,
            end_time.seconds + end_time.nanos * 1e-9,
        )
    )

    print("Confidence: {}".format(text_segment.confidence))

    # Show the result for the first frame in this segment.
    frame = text_segment.frames[0]
    time_offset = frame.time_offset
    print(
        "Time offset for the first frame: {}".format(
            time_offset.seconds + time_offset.nanos * 1e-9
        )
    )
    print("Rotated Bounding Box Vertices:")
    for vertex in frame.rotated_bounding_box.vertices:
        print("\tVertex.x: {}, Vertex.y: {}".format(vertex.x, vertex.y))

Ruby

# "Path to a local video file: path/to/file.mp4"

require "google/cloud/video_intelligence"

video = Google::Cloud::VideoIntelligence.video_intelligence_service

video_contents = File.binread path

# Start the asynchronous video annotation request
operation = video.annotate_video features: [:TEXT_DETECTION], input_content: video_contents

puts "Processing video for text detection:"
operation.wait_until_done!

raise operation.results.message? if operation.error?
puts "Finished Processing."

text_annotations = operation.results.annotation_results.first.text_annotations
print_text_annotations text_annotations