Video Intelligence API 会将语音从支持的视频文件转录为文本。有两种受支持的模型,即“默认”和“视频”。
请求对视频执行语音转录
REST
发送处理请求
下面演示了如何向 videos:annotate 方法发送 POST 请求。本示例使用通过 Google Cloud CLI 为项目设置的服务账号的访问令牌。
如需了解有关安装 Google Cloud CLI、使用服务账号设置项目以及获取访问令牌的说明,请参阅 Video Intelligence 快速入门。
如需向 Video Intelligence 进行身份验证,请设置应用默认凭据。
如需了解详情,请参阅为本地开发环境设置身份验证。
funcspeechTranscriptionURI(wio.Writer,filestring)error{ctx:=context.Background()client,err:=video.NewClient(ctx)iferr!=nil{returnerr}deferclient.Close()op,err:=client.AnnotateVideo(ctx,&videopb.AnnotateVideoRequest{Features:[]videopb.Feature{videopb.Feature_SPEECH_TRANSCRIPTION,},VideoContext:&videopb.VideoContext{SpeechTranscriptionConfig:&videopb.SpeechTranscriptionConfig{LanguageCode:"en-US",EnableAutomaticPunctuation:true,},},InputUri:file,})iferr!=nil{returnerr}resp,err:=op.Wait(ctx)iferr!=nil{returnerr}// A single video was processed. Get the first result.result:=resp.AnnotationResults[0]for_,transcription:=rangeresult.SpeechTranscriptions{// The number of alternatives for each transcription is limited by// SpeechTranscriptionConfig.MaxAlternatives.// Each alternative is a different possible transcription// and has its own confidence score.for_,alternative:=rangetranscription.GetAlternatives(){fmt.Fprintf(w,"Alternative level information:\n")fmt.Fprintf(w,"\tTranscript: %v\n",alternative.GetTranscript())fmt.Fprintf(w,"\tConfidence: %v\n",alternative.GetConfidence())fmt.Fprintf(w,"Word level information:\n")for_,wordInfo:=rangealternative.GetWords(){startTime:=wordInfo.GetStartTime()endTime:=wordInfo.GetEndTime()fmt.Fprintf(w,"\t%4.1f - %4.1f: %v (speaker %v)\n",float64(startTime.GetSeconds())+float64(startTime.GetNanos())*1e-9,// start as secondsfloat64(endTime.GetSeconds())+float64(endTime.GetNanos())*1e-9,// end as secondswordInfo.GetWord(),wordInfo.GetSpeakerTag())}}}returnnil}
Java
如需向 Video Intelligence 进行身份验证,请设置应用默认凭据。
如需了解详情,请参阅为本地开发环境设置身份验证。
// Instantiate a com.google.cloud.videointelligence.v1.VideoIntelligenceServiceClienttry(VideoIntelligenceServiceClientclient=VideoIntelligenceServiceClient.create()){// Set the language codeSpeechTranscriptionConfigconfig=SpeechTranscriptionConfig.newBuilder().setLanguageCode("en-US").setEnableAutomaticPunctuation(true).build();// Set the video context with the above configurationVideoContextcontext=VideoContext.newBuilder().setSpeechTranscriptionConfig(config).build();// Create the requestAnnotateVideoRequestrequest=AnnotateVideoRequest.newBuilder().setInputUri(gcsUri).addFeatures(Feature.SPEECH_TRANSCRIPTION).setVideoContext(context).build();// asynchronously perform speech transcription on videosOperationFuture<AnnotateVideoResponse,AnnotateVideoProgress>response=client.annotateVideoAsync(request);System.out.println("Waiting for operation to complete...");// Display the resultsfor(VideoAnnotationResultsresults:response.get(600,TimeUnit.SECONDS).getAnnotationResultsList()){for(SpeechTranscriptionspeechTranscription:results.getSpeechTranscriptionsList()){try{// Print the transcriptionif(speechTranscription.getAlternativesCount() > 0){SpeechRecognitionAlternativealternative=speechTranscription.getAlternatives(0);System.out.printf("Transcript: %s\n",alternative.getTranscript());System.out.printf("Confidence: %.2f\n",alternative.getConfidence());System.out.println("Word level information:");for(WordInfowordInfo:alternative.getWordsList()){doublestartTime=wordInfo.getStartTime().getSeconds()+wordInfo.getStartTime().getNanos()/1e9;doubleendTime=wordInfo.getEndTime().getSeconds()+wordInfo.getEndTime().getNanos()/1e9;System.out.printf("\t%4.2fs - %4.2fs: %s\n",startTime,endTime,wordInfo.getWord());}}else{System.out.println("No transcription found");}}catch(IndexOutOfBoundsExceptionioe){System.out.println("Could not retrieve frame: "+ioe.getMessage());}}}}
Node.js
如需向 Video Intelligence 进行身份验证,请设置应用默认凭据。
如需了解详情,请参阅为本地开发环境设置身份验证。
// Imports the Google Cloud Video Intelligence library
const videoIntelligence = require('@google-cloud/video-intelligence');

// Creates a client
const client = new videoIntelligence.VideoIntelligenceServiceClient();

/**
 * TODO(developer): Uncomment the following line before running the sample.
 */
// const gcsUri = 'GCS URI of video to analyze, e.g. gs://my-bucket/my-video.mp4';

async function analyzeVideoTranscript() {
  const videoContext = {
    speechTranscriptionConfig: {
      languageCode: 'en-US',
      enableAutomaticPunctuation: true,
    },
  };

  const request = {
    inputUri: gcsUri,
    features: ['SPEECH_TRANSCRIPTION'],
    videoContext: videoContext,
  };

  const [operation] = await client.annotateVideo(request);
  console.log('Waiting for operation to complete...');
  const [operationResult] = await operation.promise();

  // There is only one annotation_result since only
  // one video is processed.
  const annotationResults = operationResult.annotationResults[0];

  for (const speechTranscription of annotationResults.speechTranscriptions) {
    // The number of alternatives for each transcription is limited by
    // SpeechTranscriptionConfig.max_alternatives.
    // Each alternative is a different possible transcription
    // and has its own confidence score.
    for (const alternative of speechTranscription.alternatives) {
      console.log('Alternative level information:');
      console.log(`Transcript: ${alternative.transcript}`);
      console.log(`Confidence: ${alternative.confidence}`);

      console.log('Word level information:');
      for (const wordInfo of alternative.words) {
        const word = wordInfo.word;
        const start_time =
          wordInfo.startTime.seconds + wordInfo.startTime.nanos * 1e-9;
        const end_time =
          wordInfo.endTime.seconds + wordInfo.endTime.nanos * 1e-9;
        console.log('\t' + start_time + 's - ' + end_time + 's: ' + word);
      }
    }
  }
}

analyzeVideoTranscript();
Python
如需向 Video Intelligence 进行身份验证,请设置应用默认凭据。
如需了解详情,请参阅为本地开发环境设置身份验证。
"""Transcribe speech from a video stored on GCS."""fromgoogle.cloudimportvideointelligencevideo_client=videointelligence.VideoIntelligenceServiceClient()features=[videointelligence.Feature.SPEECH_TRANSCRIPTION]config=videointelligence.SpeechTranscriptionConfig(language_code="en-US",enable_automatic_punctuation=True)video_context=videointelligence.VideoContext(speech_transcription_config=config)operation=video_client.annotate_video(request={"features":features,"input_uri":path,"video_context":video_context,})print("\nProcessing video for speech transcription.")result=operation.result(timeout=600)# There is only one annotation_result since only# one video is processed.annotation_results=result.annotation_results[0]forspeech_transcriptioninannotation_results.speech_transcriptions:# The number of alternatives for each transcription is limited by# SpeechTranscriptionConfig.max_alternatives.# Each alternative is a different possible transcription# and has its own confidence score.foralternativeinspeech_transcription.alternatives:print("Alternative level information:")print("Transcript: {}".format(alternative.transcript))print("Confidence: {}\n".format(alternative.confidence))print("Word level information:")forword_infoinalternative.words:word=word_info.wordstart_time=word_info.start_timeend_time=word_info.end_timeprint("\t{}s - {}s: {}".format(start_time.seconds+start_time.microseconds*1e-6,end_time.seconds+end_time.microseconds*1e-6,word,))
[[["易于理解","easyToUnderstand","thumb-up"],["解决了我的问题","solvedMyProblem","thumb-up"],["其他","otherUp","thumb-up"]],[["Hard to understand","hardToUnderstand","thumb-down"],["Incorrect information or sample code","incorrectInformationOrSampleCode","thumb-down"],["Missing the information/samples I need","missingTheInformationSamplesINeed","thumb-down"],["翻译问题","translationIssue","thumb-down"],["其他","otherDown","thumb-down"]],["最后更新时间 (UTC):2024-10-13。"],[],[]]