Using speech adaptation

You can improve the quality of the transcription results you get from Speech-to-Text by using speech adaptation.

The following code sample demonstrates how to use speech adaptation by setting speech contexts in a request sent to Speech-to-Text API.

REST & CMD LINE

Refer to the speech:recognize API endpoint for complete details.

Before using any of the request data below, make the following replacements:

  • language-code: the BCP-47 code of the language spoken in your audio clip.
  • phrases-to-boost: phrase or phrases that you want Cloud Speech-to-Text to boost, as an array of strings.
  • storage-bucket: a Cloud Storage bucket.
  • input-audio: the audio data that you want to transcribe.

HTTP method and URL:

POST https://speech.googleapis.com/v1p1beta1/speech:recognize

Request JSON body:

{
  "config":{
      "languageCode":"language-code",
      "speechContexts":[{
          "phrases":[phrases-to-boost],
          "boost": 2
      }]
  },
  "audio":{
    "uri":"gs:storage-bucket/input-file"
  }
}

To send your request, expand one of these options:

You should receive a JSON response similar to the following:

{
  "results": [
    {
      "alternatives": [
        {
          "transcript": "When deciding whether to bring an umbrella, I consider the weather",
          "confidence": 0.9463943
        }
      ],
      "languageCode": "en-us"
    }
  ]
}

Java

/*
 * Please include the following imports to run this sample.
 *
 * import com.google.cloud.speech.v1p1beta1.RecognitionAudio;
 * import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
 * import com.google.cloud.speech.v1p1beta1.RecognizeRequest;
 * import com.google.cloud.speech.v1p1beta1.RecognizeResponse;
 * import com.google.cloud.speech.v1p1beta1.SpeechClient;
 * import com.google.cloud.speech.v1p1beta1.SpeechContext;
 * import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
 * import com.google.cloud.speech.v1p1beta1.SpeechRecognitionResult;
 * import java.util.Arrays;
 * import java.util.List;
 */

public static void sampleRecognize() {
  // TODO(developer): Replace these variables before running the sample.
  String storageUri = "gs://cloud-samples-data/speech/brooklyn_bridge.mp3";
  String phrase = "Brooklyn Bridge";
  sampleRecognize(storageUri, phrase);
}

/**
 * Transcribe a short audio file with speech adaptation.
 *
 * @param storageUri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
 * @param phrase Phrase "hints" help recognize the specified phrases from your audio.
 */
public static void sampleRecognize(String storageUri, String phrase) {
  try (SpeechClient speechClient = SpeechClient.create()) {
    List<String> phrases = Arrays.asList(phrase);

    // Hint Boost. This value increases the probability that a specific
    // phrase will be recognized over other similar sounding phrases.
    // The higher the boost, the higher the chance of false positive
    // recognition as well. Can accept wide range of positive values.
    // Most use cases are best served with values between 0 and 20.
    // Using a binary search happroach may help you find the optimal value.
    float boost = 20.0F;
    SpeechContext speechContextsElement =
        SpeechContext.newBuilder().addAllPhrases(phrases).setBoost(boost).build();
    List<SpeechContext> speechContexts = Arrays.asList(speechContextsElement);

    // Sample rate in Hertz of the audio data sent
    int sampleRateHertz = 44100;

    // The language of the supplied audio
    String languageCode = "en-US";

    // Encoding of audio data sent. This sample sets this explicitly.
    // This field is optional for FLAC and WAV audio formats.
    RecognitionConfig.AudioEncoding encoding = RecognitionConfig.AudioEncoding.MP3;
    RecognitionConfig config =
        RecognitionConfig.newBuilder()
            .addAllSpeechContexts(speechContexts)
            .setSampleRateHertz(sampleRateHertz)
            .setLanguageCode(languageCode)
            .setEncoding(encoding)
            .build();
    RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(storageUri).build();
    RecognizeRequest request =
        RecognizeRequest.newBuilder().setConfig(config).setAudio(audio).build();
    RecognizeResponse response = speechClient.recognize(request);
    for (SpeechRecognitionResult result : response.getResultsList()) {
      // First alternative is the most probable result
      SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
      System.out.printf("Transcript: %s\n", alternative.getTranscript());
    }
  } catch (Exception exception) {
    System.err.println("Failed to create the client due to: " + exception);
  }
}

Node.js


const speech = require('@google-cloud/speech').v1p1beta1;

/**
 * Performs synchronous speech recognition with speech adaptation.
 *
 * @param sampleRateHertz {number} Sample rate in Hertz of the audio data sent in all
 * `RecognitionAudio` messages. Valid values are: 8000-48000.
 * @param languageCode {string} The language of the supplied audio.
 * @param phrase {string} Phrase "hints" help Speech-to-Text API recognize the specified phrases from
 * your audio data.
 * @param boost {number} Positive value will increase the probability that a specific phrase will be
 * recognized over other similar sounding phrases.
 * @param uriPath {string} Path to the audio file stored on GCS.
 */
function sampleRecognize(
  sampleRateHertz,
  languageCode,
  phrase,
  boost,
  uriPath
) {
  const client = new speech.SpeechClient();
  // const sampleRateHertz = 44100;
  // const languageCode = 'en-US';
  // const phrase = 'Brooklyn Bridge';
  // const boost = 20.0;
  // const uriPath = 'gs://cloud-samples-data/speech/brooklyn_bridge.mp3';
  const encoding = 'MP3';
  const phrases = [phrase];
  const speechContextsElement = {
    phrases: phrases,
    boost: boost,
  };
  const speechContexts = [speechContextsElement];
  const config = {
    encoding: encoding,
    sampleRateHertz: sampleRateHertz,
    languageCode: languageCode,
    speechContexts: speechContexts,
  };
  const audio = {
    uri: uriPath,
  };
  const request = {
    config: config,
    audio: audio,
  };
  client
    .recognize(request)
    .then(responses => {
      const response = responses[0];
      for (const result of response.results) {
        // First alternative is the most probable result
        const alternative = result.alternatives[0];
        console.log(`Transcript: ${alternative.transcript}`);
      }
    })
    .catch(err => {
      console.error(err);
    });
}

Python

from google.cloud import speech_v1p1beta1
from google.cloud.speech_v1p1beta1 import enums


def sample_recognize(storage_uri, phrase):
    """
    Transcribe a short audio file with speech adaptation.

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
      phrase Phrase "hints" help recognize the specified phrases from your audio.
    """

    client = speech_v1p1beta1.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.mp3'
    # phrase = 'Brooklyn Bridge'
    phrases = [phrase]

    # Hint Boost. This value increases the probability that a specific
    # phrase will be recognized over other similar sounding phrases.
    # The higher the boost, the higher the chance of false positive
    # recognition as well. Can accept wide range of positive values.
    # Most use cases are best served with values between 0 and 20.
    # Using a binary search happroach may help you find the optimal value.
    boost = 20.0
    speech_contexts_element = {"phrases": phrases, "boost": boost}
    speech_contexts = [speech_contexts_element]

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = 44100

    # The language of the supplied audio
    language_code = "en-US"

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.MP3
    config = {
        "speech_contexts": speech_contexts,
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "encoding": encoding,
    }
    audio = {"uri": storage_uri}

    response = client.recognize(config, audio)
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))