获取音轨转录

Video Intelligence API 会将语音从支持的视频文件转录为文本。有两种受支持的模型,即“默认”和“视频”。

请求对视频执行语音转录

REST 和命令行

发送处理请求

下面演示了如何向 videos:annotate 方法发送 POST 请求。本示例针对通过 Cloud SDK 为项目设置的服务帐号使用访问令牌。如需了解有关安装 Cloud SDK、使用服务帐号设置项目以及获取访问令牌的说明,请参阅 Video Intelligence 快速入门

在使用任何请求数据之前,请先进行以下替换:

  • INPUT_URI:包含要添加注释的文件的 Cloud Storage 存储分区(包括文件名)。必须以 gs:// 开头。
    例如:"inputUri": "gs://cloud-videointelligence-demo/assistant.mp4",
  • LANGUAGE_CODE:[可选]请参阅支持的语言

HTTP 方法和网址:

POST https://videointelligence.googleapis.com/v1/videos:annotate

请求 JSON 正文:

{
"inputUri": "INPUT_URI",
  "features": ["SPEECH_TRANSCRIPTION"],
  "videoContext": {
    "speechTranscriptionConfig": {
      "languageCode": "LANGUAGE_CODE",
      "enableAutomaticPunctuation": true,
      "filterProfanity": true
    }
  }
}

如需发送您的请求,请展开以下选项之一:

您应该收到类似以下内容的 JSON 响应:

{
  "name": "projects/PROJECT_NUMBER/locations/LOCATION_ID/operations/OPERATION_ID"
}

如果请求成功,则 Video Intelligence 会为您的操作返回 name。上面的示例展示了此类响应的示例,其中 project-number 是您的项目编号,operation-id 是为请求创建的长时间运行的操作的 ID。

获取结果

要获取请求的结果,您必须使用对 videos:annotate 的调用返回的操作名称发送 GET,如下例所示。

在使用任何请求数据之前,请先进行以下替换:

  • OPERATION_NAME/var>: the name of the operation as returned by Video Intelligence API. The operation name has the format projects/PROJECT_NUMBER/locations/LOCATION_ID/operations/OPERATION_ID

HTTP 方法和网址:

GET https://videointelligence.googleapis.com/v1/OPERATION_NAME

如需发送您的请求,请展开以下选项之一:

您应该收到类似以下内容的 JSON 响应:

下载注释结果

将来源中的注释复制到目标存储分区(请参阅复制文件和对象):

gsutil cp gcs_uri gs://my-bucket

注意:如果输出 gcs uri 由用户提供,则注释存储在该 gcs uri 中。

Go


func speechTranscriptionURI(w io.Writer, file string) error {
	ctx := context.Background()
	client, err := video.NewClient(ctx)
	if err != nil {
		return err
	}
	defer client.Close()

	op, err := client.AnnotateVideo(ctx, &videopb.AnnotateVideoRequest{
		Features: []videopb.Feature{
			videopb.Feature_SPEECH_TRANSCRIPTION,
		},
		VideoContext: &videopb.VideoContext{
			SpeechTranscriptionConfig: &videopb.SpeechTranscriptionConfig{
				LanguageCode:               "en-US",
				EnableAutomaticPunctuation: true,
			},
		},
		InputUri: file,
	})
	if err != nil {
		return err
	}
	resp, err := op.Wait(ctx)
	if err != nil {
		return err
	}

	// A single video was processed. Get the first result.
	result := resp.AnnotationResults[0]

	for _, transcription := range result.SpeechTranscriptions {
		// The number of alternatives for each transcription is limited by
		// SpeechTranscriptionConfig.MaxAlternatives.
		// Each alternative is a different possible transcription
		// and has its own confidence score.
		for _, alternative := range transcription.GetAlternatives() {
			fmt.Fprintf(w, "Alternative level information:\n")
			fmt.Fprintf(w, "\tTranscript: %v\n", alternative.GetTranscript())
			fmt.Fprintf(w, "\tConfidence: %v\n", alternative.GetConfidence())

			fmt.Fprintf(w, "Word level information:\n")
			for _, wordInfo := range alternative.GetWords() {
				startTime := wordInfo.GetStartTime()
				endTime := wordInfo.GetEndTime()
				fmt.Fprintf(w, "\t%4.1f - %4.1f: %v (speaker %v)\n",
					float64(startTime.GetSeconds())+float64(startTime.GetNanos())*1e-9, // start as seconds
					float64(endTime.GetSeconds())+float64(endTime.GetNanos())*1e-9,     // end as seconds
					wordInfo.GetWord(),
					wordInfo.GetSpeakerTag())
			}
		}
	}

	return nil
}

Java

// Instantiate a com.google.cloud.videointelligence.v1.VideoIntelligenceServiceClient
try (VideoIntelligenceServiceClient client = VideoIntelligenceServiceClient.create()) {
  // Set the language code
  SpeechTranscriptionConfig config =
      SpeechTranscriptionConfig.newBuilder()
          .setLanguageCode("en-US")
          .setEnableAutomaticPunctuation(true)
          .build();

  // Set the video context with the above configuration
  VideoContext context = VideoContext.newBuilder().setSpeechTranscriptionConfig(config).build();

  // Create the request
  AnnotateVideoRequest request =
      AnnotateVideoRequest.newBuilder()
          .setInputUri(gcsUri)
          .addFeatures(Feature.SPEECH_TRANSCRIPTION)
          .setVideoContext(context)
          .build();

  // asynchronously perform speech transcription on videos
  OperationFuture<AnnotateVideoResponse, AnnotateVideoProgress> response =
      client.annotateVideoAsync(request);

  System.out.println("Waiting for operation to complete...");
  // Display the results
  for (VideoAnnotationResults results :
      response.get(600, TimeUnit.SECONDS).getAnnotationResultsList()) {
    for (SpeechTranscription speechTranscription : results.getSpeechTranscriptionsList()) {
      try {
        // Print the transcription
        if (speechTranscription.getAlternativesCount() > 0) {
          SpeechRecognitionAlternative alternative = speechTranscription.getAlternatives(0);

          System.out.printf("Transcript: %s\n", alternative.getTranscript());
          System.out.printf("Confidence: %.2f\n", alternative.getConfidence());

          System.out.println("Word level information:");
          for (WordInfo wordInfo : alternative.getWordsList()) {
            double startTime =
                wordInfo.getStartTime().getSeconds() + wordInfo.getStartTime().getNanos() / 1e9;
            double endTime =
                wordInfo.getEndTime().getSeconds() + wordInfo.getEndTime().getNanos() / 1e9;
            System.out.printf(
                "\t%4.2fs - %4.2fs: %s\n", startTime, endTime, wordInfo.getWord());
          }
        } else {
          System.out.println("No transcription found");
        }
      } catch (IndexOutOfBoundsException ioe) {
        System.out.println("Could not retrieve frame: " + ioe.getMessage());
      }
    }
  }
}

Node.js

// Imports the Google Cloud Video Intelligence library
const videoIntelligence = require('@google-cloud/video-intelligence');

// Creates a client
const client = new videoIntelligence.VideoIntelligenceServiceClient();

/**
 * TODO(developer): Uncomment the following line before running the sample.
 */
// const gcsUri = 'GCS URI of video to analyze, e.g. gs://my-bucket/my-video.mp4';

async function analyzeVideoTranscript() {
  const videoContext = {
    speechTranscriptionConfig: {
      languageCode: 'en-US',
      enableAutomaticPunctuation: true,
    },
  };

  const request = {
    inputUri: gcsUri,
    features: ['SPEECH_TRANSCRIPTION'],
    videoContext: videoContext,
  };

  const [operation] = await client.annotateVideo(request);
  console.log('Waiting for operation to complete...');
  const [operationResult] = await operation.promise();
  // There is only one annotation_result since only
  // one video is processed.
  const annotationResults = operationResult.annotationResults[0];

  for (const speechTranscription of annotationResults.speechTranscriptions) {
    // The number of alternatives for each transcription is limited by
    // SpeechTranscriptionConfig.max_alternatives.
    // Each alternative is a different possible transcription
    // and has its own confidence score.
    for (const alternative of speechTranscription.alternatives) {
      console.log('Alternative level information:');
      console.log(`Transcript: ${alternative.transcript}`);
      console.log(`Confidence: ${alternative.confidence}`);

      console.log('Word level information:');
      for (const wordInfo of alternative.words) {
        const word = wordInfo.word;
        const start_time =
          wordInfo.startTime.seconds + wordInfo.startTime.nanos * 1e-9;
        const end_time =
          wordInfo.endTime.seconds + wordInfo.endTime.nanos * 1e-9;
        console.log('\t' + start_time + 's - ' + end_time + 's: ' + word);
      }
    }
  }
}

analyzeVideoTranscript();

Python

"""Transcribe speech from a video stored on GCS."""
from google.cloud import videointelligence

video_client = videointelligence.VideoIntelligenceServiceClient()
features = [videointelligence.Feature.SPEECH_TRANSCRIPTION]

config = videointelligence.SpeechTranscriptionConfig(
    language_code="en-US", enable_automatic_punctuation=True
)
video_context = videointelligence.VideoContext(speech_transcription_config=config)

operation = video_client.annotate_video(
    request={
        "features": features,
        "input_uri": path,
        "video_context": video_context,
    }
)

print("\nProcessing video for speech transcription.")

result = operation.result(timeout=600)

# There is only one annotation_result since only
# one video is processed.
annotation_results = result.annotation_results[0]
for speech_transcription in annotation_results.speech_transcriptions:

    # The number of alternatives for each transcription is limited by
    # SpeechTranscriptionConfig.max_alternatives.
    # Each alternative is a different possible transcription
    # and has its own confidence score.
    for alternative in speech_transcription.alternatives:
        print("Alternative level information:")

        print("Transcript: {}".format(alternative.transcript))
        print("Confidence: {}\n".format(alternative.confidence))

        print("Word level information:")
        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            print(
                "\t{}s - {}s: {}".format(
                    start_time.seconds + start_time.microseconds * 1e-6,
                    end_time.seconds + end_time.microseconds * 1e-6,
                    word,
                )
            )

其他语言

C#:请按照客户端库页面上的 C# 设置说明操作,然后访问 .NET 版 Video Intelligence 参考文档。

PHP:请按照客户端库页面上的 PHP 设置说明操作,然后访问 PHP 版 Video Intelligence 参考文档

Ruby:请按照客户端库页面上的 Ruby 设置说明操作,然后访问 Ruby 版 Video Intelligence 参考文档