Improve transcription accuracy with speech adaptation boost

Speech adaptation boost is an optional feature of speech adaptation. Boost allows you to add numerical weights to words and/or phrases according to how frequently they should be recognized in your audio data.

We recommend that you implement boost if 1) you have already implemented speech adaptation, and 2) you would like to adjust the strength of speech adaptation effects on your transcription results.

See the speech adaptation concepts page for speech adaptation and speech adaptation boost best practices information.

The following code sample demonstrates how to send a request using speech adaptation boost.

REST & CMD LINE

For details about the API endpoint, see speech:recognize.

Before using any of the request data below, make the following replacements:

  • language-code: the BCP-47 code of the language spoken in your audio clip.
  • phrases-to-boost: phrase or phrases that you want Speech-to-Text to boost, as an array of strings.
  • storage-bucket: a Cloud Storage bucket.
  • input-audio: the audio data that you want to transcribe.

HTTP method and URL:

POST https://speech.googleapis.com/v1p1beta1/speech:recognize

Request JSON body:

{
  "config":{
      "languageCode":"language-code",
      "speechContexts":[{
          "phrases":[phrases-to-boost],
          "boost": 2
      }]
  },
  "audio":{
    "uri":"gs:storage-bucket/input-file"
  }
}

To send your request, expand one of these options:

You should receive a JSON response similar to the following:

{
  "results": [
    {
      "alternatives": [
        {
          "transcript": "When deciding whether to bring an umbrella, I consider the weather",
          "confidence": 0.9463943
        }
      ],
      "languageCode": "en-us"
    }
  ]
}

Java

import com.google.cloud.speech.v1p1beta1.RecognitionAudio;
import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
import com.google.cloud.speech.v1p1beta1.RecognizeRequest;
import com.google.cloud.speech.v1p1beta1.RecognizeResponse;
import com.google.cloud.speech.v1p1beta1.SpeechClient;
import com.google.cloud.speech.v1p1beta1.SpeechContext;
import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
import com.google.cloud.speech.v1p1beta1.SpeechRecognitionResult;
import java.io.IOException;

public class SpeechAdaptation {

  public void speechAdaptation() throws IOException {
    String uriPath = "gs://cloud-samples-data/speech/brooklyn_bridge.mp3";
    speechAdaptation(uriPath);
  }

  public static void speechAdaptation(String uriPath) throws IOException {
    // Initialize client that will be used to send requests. This client only needs to be created
    // once, and can be reused for multiple requests. After completing all of your requests, call
    // the "close" method on the client to safely clean up any remaining background resources.
    try (SpeechClient speechClient = SpeechClient.create()) {

      // Provides "hints" to the speech recognizer to favor specific words and phrases in the
      // results.
      // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.SpeechContext
      SpeechContext speechContext =
          SpeechContext.newBuilder().addPhrases("Brooklyn Bridge").setBoost(20.0F).build();
      // Configure recognition config to match your audio file.
      RecognitionConfig config =
          RecognitionConfig.newBuilder()
              .setEncoding(RecognitionConfig.AudioEncoding.MP3)
              .setSampleRateHertz(44100)
              .setLanguageCode("en-US")
              .addSpeechContexts(speechContext)
              .build();
      // Set the path to your audio file
      RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(uriPath).build();

      // Make the request
      RecognizeRequest request =
          RecognizeRequest.newBuilder().setConfig(config).setAudio(audio).build();

      // Display the results
      RecognizeResponse response = speechClient.recognize(request);
      for (SpeechRecognitionResult result : response.getResultsList()) {
        // First alternative is the most probable result
        SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
        System.out.printf("Transcript: %s\n", alternative.getTranscript());
      }
    }
  }
}

Node.js


const speech = require('@google-cloud/speech').v1p1beta1;

/**
 * Performs synchronous speech recognition with speech adaptation.
 *
 * @param sampleRateHertz {number} Sample rate in Hertz of the audio data sent in all
 * `RecognitionAudio` messages. Valid values are: 8000-48000.
 * @param languageCode {string} The language of the supplied audio.
 * @param phrase {string} Phrase "hints" help Speech-to-Text API recognize the specified phrases from
 * your audio data.
 * @param boost {number} Positive value will increase the probability that a specific phrase will be
 * recognized over other similar sounding phrases.
 * @param uriPath {string} Path to the audio file stored on GCS.
 */
function sampleRecognize(
  sampleRateHertz,
  languageCode,
  phrase,
  boost,
  uriPath
) {
  const client = new speech.SpeechClient();
  // const sampleRateHertz = 44100;
  // const languageCode = 'en-US';
  // const phrase = 'Brooklyn Bridge';
  // const boost = 20.0;
  // const uriPath = 'gs://cloud-samples-data/speech/brooklyn_bridge.mp3';
  const encoding = 'MP3';
  const phrases = [phrase];
  const speechContextsElement = {
    phrases: phrases,
    boost: boost,
  };
  const speechContexts = [speechContextsElement];
  const config = {
    encoding: encoding,
    sampleRateHertz: sampleRateHertz,
    languageCode: languageCode,
    speechContexts: speechContexts,
  };
  const audio = {
    uri: uriPath,
  };
  const request = {
    config: config,
    audio: audio,
  };
  client
    .recognize(request)
    .then(responses => {
      const response = responses[0];
      for (const result of response.results) {
        // First alternative is the most probable result
        const alternative = result.alternatives[0];
        console.log(`Transcript: ${alternative.transcript}`);
      }
    })
    .catch(err => {
      console.error(err);
    });
}

Python

from google.cloud import speech_v1p1beta1


def sample_recognize(storage_uri, phrase):
    """
    Transcribe a short audio file with speech adaptation.

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
      phrase Phrase "hints" help recognize the specified phrases from your audio.
    """

    client = speech_v1p1beta1.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.mp3'
    # phrase = 'Brooklyn Bridge'
    phrases = [phrase]

    # Hint Boost. This value increases the probability that a specific
    # phrase will be recognized over other similar sounding phrases.
    # The higher the boost, the higher the chance of false positive
    # recognition as well. Can accept wide range of positive values.
    # Most use cases are best served with values between 0 and 20.
    # Using a binary search happroach may help you find the optimal value.
    boost = 20.0
    speech_contexts_element = {"phrases": phrases, "boost": boost}
    speech_contexts = [speech_contexts_element]

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = 44100

    # The language of the supplied audio
    language_code = "en-US"

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = speech_v1p1beta1.RecognitionConfig.AudioEncoding.MP3
    config = {
        "speech_contexts": speech_contexts,
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "encoding": encoding,
    }
    audio = {"uri": storage_uri}

    response = client.recognize(request={"config": config, "audio": audio})
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))

What's next