Send a recognition request with speech adaptation

You can improve the accuracy of the transcription results you get from Speech-to-Text by using speech adaptation.

The following code samples show how to improve transcription accuracy by setting speech contexts in a request sent to the Speech-to-Text API. See the class tokens page for the list of classes available for your language.
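For example, to bias recognition toward spoken times of day, you can pass the $TIME class token as a phrase in a speech context. The fragment below is a minimal sketch of the speechContexts field; the full samples later in this page use the same token:

    "speechContexts": [{
        "phrases": ["$TIME"]
    }]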

REST API and command line

Refer to the speech:recognize API endpoint for complete details.

Before using any of the request data below, make the following replacements:

  • language-code: the BCP-47 code of the language spoken in your audio clip
  • phrases-to-boost: the phrase or phrases that you want Speech-to-Text to boost, provided as an array of strings
  • storage-bucket: a Cloud Storage bucket
  • input-audio: the audio data that you want to transcribe

HTTP method and URL:

POST https://speech.googleapis.com/v1p1beta1/speech:recognize

Request JSON body:

    {
      "config":{
          "languageCode":"language-code",
          "speechContexts":[{
              "phrases":[phrases-to-boost],
              "boost": 2
          }]
      },
      "audio":{
        "uri":"gs:storage-bucket/input-file"
      }
    }
    

To send your request, you can use the curl command-line tool.
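For example, the following sketch assumes that the JSON body above is saved in a file named request.json and that you authenticate with the gcloud CLI; adjust the authentication to your setup:

    curl -X POST \
         -H "Authorization: Bearer $(gcloud auth application-default print-access-token)" \
         -H "Content-Type: application/json; charset=utf-8" \
         -d @request.json \
         "https://speech.googleapis.com/v1p1beta1/speech:recognize"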

You should receive a JSON response similar to the following:

    {
      "results": [
        {
          "alternatives": [
            {
              "transcript": "When deciding whether to bring an umbrella, I consider the weather",
              "confidence": 0.9463943
            }
          ],
          "languageCode": "en-us"
        }
      ]
    }
    

Java

    import com.google.cloud.speech.v1.RecognitionAudio;
    import com.google.cloud.speech.v1.RecognitionConfig;
    import com.google.cloud.speech.v1.RecognizeRequest;
    import com.google.cloud.speech.v1.RecognizeResponse;
    import com.google.cloud.speech.v1.SpeechClient;
    import com.google.cloud.speech.v1.SpeechContext;
    import com.google.cloud.speech.v1.SpeechRecognitionAlternative;
    import com.google.cloud.speech.v1.SpeechRecognitionResult;
    import java.io.IOException;

    class TranscribeContextClasses {

      void transcribeContextClasses() throws IOException {
        // TODO(developer): Replace these variables before running the sample.
        String storageUri = "gs://YOUR_BUCKET_ID/path/to/your/file.wav";
        transcribeContextClasses(storageUri);
      }

      // Provides "hints" to the speech recognizer to favor specific classes of words in the results.
      static void transcribeContextClasses(String storageUri) throws IOException {
        // Initialize client that will be used to send requests. This client only needs to be created
        // once, and can be reused for multiple requests. After completing all of your requests, call
        // the "close" method on the client to safely clean up any remaining background resources.
        try (SpeechClient speechClient = SpeechClient.create()) {
          // SpeechContext: to configure your speech_context see:
          // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
          // Full list of supported phrases (class tokens) here:
          // https://cloud.google.com/speech-to-text/docs/class-tokens
          SpeechContext speechContext = SpeechContext.newBuilder().addPhrases("$TIME").build();

          // RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
          // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
          RecognitionConfig config =
              RecognitionConfig.newBuilder()
                  .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
                  .setSampleRateHertz(8000)
                  .setLanguageCode("en-US")
                  .addSpeechContexts(speechContext)
                  .build();

          // Set the path to your audio file
          RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(storageUri).build();

          // Build the request
          RecognizeRequest request =
              RecognizeRequest.newBuilder().setConfig(config).setAudio(audio).build();

          // Perform the request
          RecognizeResponse response = speechClient.recognize(request);

          for (SpeechRecognitionResult result : response.getResultsList()) {
            // First alternative is the most probable result
            SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
            System.out.printf("Transcript: %s\n", alternative.getTranscript());
          }
        }
      }
    }

Node.js

    // Provides "hints" to the speech recognizer to favor
    // specific classes of words in the results.

    // Imports the Google Cloud client library
    const speech = require('@google-cloud/speech');

    // Creates a client
    const client = new speech.SpeechClient();

    async function transcribeContextClasses() {
      // TODO(developer): Replace this variable before running the sample.
      const storageUri = 'gs://YOUR_BUCKET_ID/path/to/your/file.wav';
      const audio = {
        uri: storageUri,
      };

      // SpeechContext: to configure your speech_context see:
      // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
      // Full list of supported phrases (class tokens) here:
      // https://cloud.google.com/speech-to-text/docs/class-tokens
      const speechContext = {
        phrases: ['$TIME'],
      };

      // RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
      // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
      const config = {
        encoding: 'LINEAR16',
        sampleRateHertz: 8000,
        languageCode: 'en-US',
        speechContexts: [speechContext],
      };

      const request = {
        config: config,
        audio: audio,
      };

      // Detects speech in the audio file.
      const [response] = await client.recognize(request);
      response.results.forEach((result, index) => {
        const transcript = result.alternatives[0].transcript;
        console.log('-'.repeat(20));
        console.log(`First alternative of result ${index}`);
        console.log(`Transcript: ${transcript}`);
      });
    }

    transcribeContextClasses().catch(console.error);

Python

    from google.cloud import speech

    client = speech.SpeechClient()

    # TODO(developer): Replace this variable before running the sample.
    storage_uri = 'gs://YOUR_BUCKET_ID/path/to/your/file.wav'
    audio = speech.RecognitionAudio(uri=storage_uri)

    # SpeechContext: to configure your speech_context see:
    # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
    # Full list of supported phrases (class tokens) here:
    # https://cloud.google.com/speech-to-text/docs/class-tokens
    speech_context = speech.SpeechContext(phrases=['$TIME'])

    # RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
    # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code='en-US',
        speech_contexts=[speech_context],
    )

    # recognize() takes keyword arguments in google-cloud-speech v2+.
    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print('-' * 20)
        print('First alternative of result {}'.format(i))
        print('Transcript: {}'.format(alternative.transcript))

C#


    using Google.Cloud.Speech.V1;
    using System;

    namespace GoogleCloudSamples
    {
        public class SpeechContextClasses
        {
            /// <summary>
            /// Provides "hints" to the speech recognizer to favor specific classes of words in the results.
            /// </summary>
            /// <param name="uriPath">Path to the audio file stored on GCS.</param>
            public static object TranscribeContextClasses(
                string uriPath = "gs://cloud-samples-data/speech/brooklyn_bridge.mp3")
            {
                var speechClient = SpeechClient.Create();
                SpeechContext speechContext = new SpeechContext();
                // SpeechContext: to configure your speech_context see:
                // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
                // Full list of supported phrases (class tokens) here:
                // https://cloud.google.com/speech-to-text/docs/class-tokens
                speechContext.Phrases.Add("$TIME");

                // RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
                // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
                RecognitionConfig recognitionConfig = new RecognitionConfig
                {
                    Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
                    SampleRateHertz = 8000,
                    LanguageCode = "en-US"
                };
                recognitionConfig.SpeechContexts.Add(speechContext);

                // Set the path to your audio file
                RecognitionAudio audio = new RecognitionAudio
                {
                    Uri = uriPath
                };

                // Build the request
                RecognizeRequest request = new RecognizeRequest
                {
                    Config = recognitionConfig,
                    Audio = audio
                };

                // Perform the request
                var response = speechClient.Recognize(request);
                foreach (SpeechRecognitionResult result in response.Results)
                {
                    // First alternative is the most probable result
                    var alternative = result.Alternatives[0];
                    Console.WriteLine($"Transcript: {alternative.Transcript}");
                }
                return 0;
            }
        }
    }

Go


    import (
    	"context"
    	"fmt"
    	"io"
    	"strings"

    	speech "cloud.google.com/go/speech/apiv1"
    	speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
    )

    // contextClasses provides "hints" to the speech recognizer
    // to favour specific classes of words in the results.
    func contextClasses(w io.Writer, gcsURI string) error {
    	ctx := context.Background()

    	client, err := speech.NewClient(ctx)
    	if err != nil {
    		return fmt.Errorf("NewClient: %v", err)
    	}
    	defer client.Close()

    	// SpeechContext: to configure your speech_context see:
    	// https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
    	// Full list of supported phrases (class tokens) here:
    	// https://cloud.google.com/speech-to-text/docs/class-tokens
    	// In this instance, the use of "$TIME" favours time of day detections.
    	speechContext := &speechpb.SpeechContext{Phrases: []string{"$TIME"}}
    	resp, err := client.Recognize(ctx, &speechpb.RecognizeRequest{
    		Config: &speechpb.RecognitionConfig{
    			Encoding:        speechpb.RecognitionConfig_LINEAR16,
    			SampleRateHertz: 8000,
    			LanguageCode:    "en-US",
    			SpeechContexts:  []*speechpb.SpeechContext{speechContext},
    		},
    		Audio: &speechpb.RecognitionAudio{
    			AudioSource: &speechpb.RecognitionAudio_Uri{Uri: gcsURI},
    		},
    	})
    	if err != nil {
    		return fmt.Errorf("Recognize: %v", err)
    	}

    	// Print the results.
    	for i, result := range resp.Results {
    		fmt.Fprintf(w, "%s\n", strings.Repeat("-", 20))
    		fmt.Fprintf(w, "Result %d\n", i+1)
    		for j, alternative := range result.Alternatives {
    			fmt.Fprintf(w, "Alternative %d: %s\n", j+1, alternative.Transcript)
    		}
    	}
    	return nil
    }