Obtaining an audio track transcription

Video Intelligence transcribes speech to text from supported video files. Two models are supported: "default" and "video".

Requesting speech transcription for a video

REST API and command line

Send the process request

The following shows how to send a POST request to the videos:annotate method. The example uses the access token for a service account set up for the project with the Cloud SDK. For instructions on installing the Cloud SDK, setting up a project with a service account, and obtaining an access token, see the Video Intelligence quickstart.

Before using any of the request data below, make the following replacements:

  • input-uri: the Cloud Storage bucket that contains the file you want to annotate, including the file name. Must start with gs://.
    For example: "inputUri": "gs://cloud-videointelligence-demo/assistant.mp4",
  • language-code: [optional] See the list of supported languages.

HTTP method and URL:

POST https://videointelligence.googleapis.com/v1/videos:annotate

Request JSON body:

{
"inputUri": "input-uri",
  "features": ["SPEECH_TRANSCRIPTION"],
  "videoContext": {
    "speechTranscriptionConfig": {
      "languageCode": "language-code",
      "enableAutomaticPunctuation": true,
      "filterProfanity": true
    }
  }
}

Send the request with an HTTP client of your choice, such as curl.
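As a rough illustration only (not part of the official samples), the following Python sketch sends the same request with the requests library. It assumes an access token has been exported as ACCESS_TOKEN, for example with gcloud auth print-access-token, and that you substitute your own input URI.

import os

import requests

# Assumption: the token was exported beforehand, e.g.
#   export ACCESS_TOKEN="$(gcloud auth print-access-token)"
access_token = os.environ["ACCESS_TOKEN"]

body = {
    "inputUri": "gs://cloud-videointelligence-demo/assistant.mp4",  # replace with your video
    "features": ["SPEECH_TRANSCRIPTION"],
    "videoContext": {
        "speechTranscriptionConfig": {
            "languageCode": "en-US",
            "enableAutomaticPunctuation": True,
            "filterProfanity": True,
        }
    },
}

response = requests.post(
    "https://videointelligence.googleapis.com/v1/videos:annotate",
    headers={"Authorization": f"Bearer {access_token}"},
    json=body,
)
response.raise_for_status()

# The returned name identifies the long-running operation.
print(response.json()["name"])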

You should receive a JSON response similar to the following:

{
  "name": "projects/project-number/locations/location-id/operations/operation-id"
}

If the request is successful, Video Intelligence returns the name of your operation. The example above shows a response of this kind, where project-number is your project number and operation-id is the ID of the long-running operation created for the request.

Get the results

To get the results of your request, send a GET request using the operation name returned by the call to videos:annotate, as shown in the following example.

Before using any of the request data below, make the following replacements:

  • operation-name: the name of the operation as returned by the Video Intelligence API. It has the format projects/project-number/locations/location-id/operations/operation-id.

HTTP method and URL:

GET https://videointelligence.googleapis.com/v1/operation-name

Send the request with an HTTP client of your choice, such as curl.
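Again as a rough illustration only (not part of the official samples), the following Python sketch polls the operation with the requests library. It assumes ACCESS_TOKEN and OPERATION_NAME are exported, the latter being the name returned by videos:annotate.

import os
import time

import requests

# Assumptions: both variables were exported beforehand.
access_token = os.environ["ACCESS_TOKEN"]
operation_name = os.environ["OPERATION_NAME"]  # projects/.../locations/.../operations/...

while True:
    response = requests.get(
        f"https://videointelligence.googleapis.com/v1/{operation_name}",
        headers={"Authorization": f"Bearer {access_token}"},
    )
    response.raise_for_status()
    operation = response.json()
    if operation.get("done"):
        break
    time.sleep(10)  # transcribing a long video can take several minutes

# The transcription results are nested under the operation's response field.
for result in operation["response"]["annotationResults"]:
    for transcription in result.get("speechTranscriptions", []):
        for alternative in transcription.get("alternatives", []):
            print(alternative.get("transcript"), alternative.get("confidence"))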

When the operation has completed, you should receive a JSON response containing the transcription results: each annotation result holds speechTranscriptions with alternatives, and each alternative carries a transcript, a confidence score, and per-word timings. The client library samples below parse exactly these fields.

C#

public static object TranscribeVideo(string uri)
{
    Console.WriteLine("Processing video for speech transcription.");

    var client = VideoIntelligenceServiceClient.Create();
    var request = new AnnotateVideoRequest
    {
        InputUri = uri,
        Features = { Feature.SpeechTranscription },
        VideoContext = new VideoContext
        {
            SpeechTranscriptionConfig = new SpeechTranscriptionConfig
            {
                LanguageCode = "en-US",
                EnableAutomaticPunctuation = true
            }
        },
    };
    var op = client.AnnotateVideo(request).PollUntilCompleted();

    // There is only one annotation result since only one video is
    // processed.
    var annotationResults = op.Result.AnnotationResults[0];
    foreach (var transcription in annotationResults.SpeechTranscriptions)
    {
        // The number of alternatives for each transcription is limited
        // by SpeechTranscriptionConfig.MaxAlternatives.
        // Each alternative is a different possible transcription
        // and has its own confidence score.
        foreach (var alternative in transcription.Alternatives)
        {
            Console.WriteLine("Alternative level information:");

            Console.WriteLine($"Transcript: {alternative.Transcript}");
            Console.WriteLine($"Confidence: {alternative.Confidence}");

            foreach (var wordInfo in alternative.Words)
            {
                Console.WriteLine($"\t{wordInfo.StartTime} - " +
                                  $"{wordInfo.EndTime}:" +
                                  $"{wordInfo.Word}");
            }
        }
    }

    return 0;
}

Go


import (
	"context"
	"fmt"
	"io"
	"io/ioutil"

	video "cloud.google.com/go/videointelligence/apiv1"
	videopb "google.golang.org/genproto/googleapis/cloud/videointelligence/v1"
)

func speechTranscription(w io.Writer, file string) error {
	ctx := context.Background()
	client, err := video.NewClient(ctx)
	if err != nil {
		return err
	}
	defer client.Close()

	fileBytes, err := ioutil.ReadFile(file)
	if err != nil {
		return err
	}

	op, err := client.AnnotateVideo(ctx, &videopb.AnnotateVideoRequest{
		Features: []videopb.Feature{
			videopb.Feature_SPEECH_TRANSCRIPTION,
		},
		VideoContext: &videopb.VideoContext{
			SpeechTranscriptionConfig: &videopb.SpeechTranscriptionConfig{
				LanguageCode:               "en-US",
				EnableAutomaticPunctuation: true,
			},
		},
		InputContent: fileBytes,
	})
	if err != nil {
		return err
	}
	resp, err := op.Wait(ctx)
	if err != nil {
		return err
	}

	// A single video was processed. Get the first result.
	result := resp.AnnotationResults[0]

	for _, transcription := range result.SpeechTranscriptions {
		// The number of alternatives for each transcription is limited by
		// SpeechTranscriptionConfig.MaxAlternatives.
		// Each alternative is a different possible transcription
		// and has its own confidence score.
		for _, alternative := range transcription.GetAlternatives() {
			fmt.Fprintf(w, "Alternative level information:\n")
			fmt.Fprintf(w, "\tTranscript: %v\n", alternative.GetTranscript())
			fmt.Fprintf(w, "\tConfidence: %v\n", alternative.GetConfidence())

			fmt.Fprintf(w, "Word level information:\n")
			for _, wordInfo := range alternative.GetWords() {
				startTime := wordInfo.GetStartTime()
				endTime := wordInfo.GetEndTime()
				fmt.Fprintf(w, "\t%4.1f - %4.1f: %v (speaker %v)\n",
					float64(startTime.GetSeconds())+float64(startTime.GetNanos())*1e-9, // start as seconds
					float64(endTime.GetSeconds())+float64(endTime.GetNanos())*1e-9,     // end as seconds
					wordInfo.GetWord(),
					wordInfo.GetSpeakerTag())
			}
		}
	}

	return nil
}

Java

// Instantiate a com.google.cloud.videointelligence.v1.VideoIntelligenceServiceClient
try (VideoIntelligenceServiceClient client = VideoIntelligenceServiceClient.create()) {
  // Set the language code
  SpeechTranscriptionConfig config = SpeechTranscriptionConfig.newBuilder()
          .setLanguageCode("en-US")
          .setEnableAutomaticPunctuation(true)
          .build();

  // Set the video context with the above configuration
  VideoContext context = VideoContext.newBuilder()
          .setSpeechTranscriptionConfig(config)
          .build();

  // Create the request
  AnnotateVideoRequest request = AnnotateVideoRequest.newBuilder()
          .setInputUri(gcsUri)
          .addFeatures(Feature.SPEECH_TRANSCRIPTION)
          .setVideoContext(context)
          .build();

  // asynchronously perform speech transcription on videos
  OperationFuture<AnnotateVideoResponse, AnnotateVideoProgress> response =
          client.annotateVideoAsync(request);

  System.out.println("Waiting for operation to complete...");
  // Display the results
  for (VideoAnnotationResults results : response.get(600, TimeUnit.SECONDS)
          .getAnnotationResultsList()) {
    for (SpeechTranscription speechTranscription : results.getSpeechTranscriptionsList()) {
      try {
        // Print the transcription
        if (speechTranscription.getAlternativesCount() > 0) {
          SpeechRecognitionAlternative alternative = speechTranscription.getAlternatives(0);

          System.out.printf("Transcript: %s\n", alternative.getTranscript());
          System.out.printf("Confidence: %.2f\n", alternative.getConfidence());

          System.out.println("Word level information:");
          for (WordInfo wordInfo : alternative.getWordsList()) {
            double startTime = wordInfo.getStartTime().getSeconds()
                    + wordInfo.getStartTime().getNanos() / 1e9;
            double endTime = wordInfo.getEndTime().getSeconds()
                    + wordInfo.getEndTime().getNanos() / 1e9;
            System.out.printf("\t%4.2fs - %4.2fs: %s\n",
                    startTime, endTime, wordInfo.getWord());
          }
        } else {
          System.out.println("No transcription found");
        }
      } catch (IndexOutOfBoundsException ioe) {
        System.out.println("Could not retrieve frame: " + ioe.getMessage());
      }
    }
  }
}

Node.js

// Imports the Google Cloud Video Intelligence library
const videoIntelligence = require('@google-cloud/video-intelligence');

// Creates a client
const client = new videoIntelligence.VideoIntelligenceServiceClient();

/**
 * TODO(developer): Uncomment the following line before running the sample.
 */
// const gcsUri = 'GCS URI of video to analyze, e.g. gs://my-bucket/my-video.mp4';

const videoContext = {
  speechTranscriptionConfig: {
    languageCode: 'en-US',
    enableAutomaticPunctuation: true,
  },
};

const request = {
  inputUri: gcsUri,
  features: ['SPEECH_TRANSCRIPTION'],
  videoContext: videoContext,
};

const [operation] = await client.annotateVideo(request);
console.log('Waiting for operation to complete...');
const [operationResult] = await operation.promise();
console.log('Word level information:');
const alternative =
  operationResult.annotationResults[0].speechTranscriptions[0]
    .alternatives[0];
alternative.words.forEach(wordInfo => {
  const start_time =
    wordInfo.startTime.seconds + wordInfo.startTime.nanos * 1e-9;
  const end_time = wordInfo.endTime.seconds + wordInfo.endTime.nanos * 1e-9;
  console.log('\t' + start_time + 's - ' + end_time + 's: ' + wordInfo.word);
});
console.log('Transcription: ' + alternative.transcript);

PHP

use Google\Cloud\VideoIntelligence\V1\VideoIntelligenceServiceClient;
use Google\Cloud\VideoIntelligence\V1\Feature;
use Google\Cloud\VideoIntelligence\V1\VideoContext;
use Google\Cloud\VideoIntelligence\V1\SpeechTranscriptionConfig;

/** Uncomment and populate these variables in your code */
// $uri = 'The cloud storage object to analyze (gs://your-bucket-name/your-object-name)';
// $options = [];

# set configs
$features = [Feature::SPEECH_TRANSCRIPTION];
$speechTranscriptionConfig = (new SpeechTranscriptionConfig())
    ->setLanguageCode('en-US')
    ->setEnableAutomaticPunctuation(true);
$videoContext = (new VideoContext())
    ->setSpeechTranscriptionConfig($speechTranscriptionConfig);

# instantiate a client
$client = new VideoIntelligenceServiceClient();

# execute a request.
$operation = $client->annotateVideo([
    'inputUri' => $uri,
    'features' => $features,
    'videoContext' => $videoContext
]);

print('Processing video for speech transcription...' . PHP_EOL);
# Wait for the request to complete.
$operation->pollUntilComplete($options);

# Print the result.
if ($operation->operationSucceeded()) {
    $result = $operation->getResult();
    # there is only one annotation_result since only
    # one video is processed.
    $annotationResults = $result->getAnnotationResults()[0];
    $speechTranscriptions = $annotationResults->getSpeechTranscriptions();

    foreach ($speechTranscriptions as $transcription) {
        # the number of alternatives for each transcription is limited by
        # $max_alternatives in SpeechTranscriptionConfig
        # each alternative is a different possible transcription
        # and has its own confidence score.
        foreach ($transcription->getAlternatives() as $alternative) {
            print('Alternative level information' . PHP_EOL);

            printf('Transcript: %s' . PHP_EOL, $alternative->getTranscript());
            printf('Confidence: %s' . PHP_EOL, $alternative->getConfidence());

            print('Word level information:' . PHP_EOL);
            foreach ($alternative->getWords() as $wordInfo) {
                printf(
                    '%s s - %s s: %s' . PHP_EOL,
                    $wordInfo->getStartTime()->getSeconds(),
                    $wordInfo->getEndTime()->getSeconds(),
                    $wordInfo->getWord()
                );
            }
        }
    }
}
$client->close();

Python

"""Transcribe speech from a video stored on GCS."""
from google.cloud import videointelligence

video_client = videointelligence.VideoIntelligenceServiceClient()
features = [videointelligence.enums.Feature.SPEECH_TRANSCRIPTION]

config = videointelligence.types.SpeechTranscriptionConfig(
    language_code="en-US", enable_automatic_punctuation=True
)
video_context = videointelligence.types.VideoContext(
    speech_transcription_config=config
)

operation = video_client.annotate_video(
    input_uri=path, features=features, video_context=video_context
)

print("\nProcessing video for speech transcription.")

result = operation.result(timeout=600)

# There is only one annotation_result since only
# one video is processed.
annotation_results = result.annotation_results[0]
for speech_transcription in annotation_results.speech_transcriptions:

    # The number of alternatives for each transcription is limited by
    # SpeechTranscriptionConfig.max_alternatives.
    # Each alternative is a different possible transcription
    # and has its own confidence score.
    for alternative in speech_transcription.alternatives:
        print("Alternative level information:")

        print("Transcript: {}".format(alternative.transcript))
        print("Confidence: {}\n".format(alternative.confidence))

        print("Word level information:")
        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            print(
                "\t{}s - {}s: {}".format(
                    start_time.seconds + start_time.nanos * 1e-9,
                    end_time.seconds + end_time.nanos * 1e-9,
                    word,
                )
            )

Ruby

# path = "Path to a video file on Google Cloud Storage: gs://bucket/video.mp4"

require "google/cloud/video_intelligence"

video = Google::Cloud::VideoIntelligence.video_intelligence_service

context = {
  speech_transcription_config: {
    language_code:                "en-US",
    enable_automatic_punctuation: true
  }
}

# Register a callback during the method call
operation = video.annotate_video features: [:SPEECH_TRANSCRIPTION], input_uri: path, video_context: context

puts "Processing video for speech transcriptions:"
operation.wait_until_done!

raise operation.results.message? if operation.error?
puts "Finished Processing."

transcriptions = operation.results.annotation_results.first.speech_transcriptions
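# print_transcriptions is a helper (not shown here); like the samples above, it is
# expected to iterate over the transcriptions and print each alternative's
# transcript, confidence, and word timings.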
print_transcriptions transcriptions