Using enhanced models

This page describes how to request an enhanced speech recognition model when you send a transcription request to Speech-to-Text.

There are currently two enhanced models: phone call and video. These models have been optimized to more accurately transcribe audio data from these specific sources. See the supported languages page to see if the enhanced models are available for your language.

Google creates and improves enhanced models based on data collected through data logging. While opting in to data logging is not required in order to use enhanced models, if you do opt in you can help Google improve these models and also enjoy a discount on your usage.

To use the enhanced recognition models set the following fields in RecognitionConfig:

  1. Set useEnhanced to true.
  2. Pass either the phone_call or video string in the model field.

Speech-to-Text supports enhanced models for all speech recognition methods: speech:recognize speech:longrunningrecognize, and Streaming.

The following code samples demonstrate how to request to use an enhanced model for a transcription request.

Protocol

Refer to the speech:recognize API endpoint for complete details.

To perform synchronous speech recognition, make a POST request and provide the appropriate request body. The following shows an example of a POST request using curl. The example uses the access token for a service account set up for the project using the Google Cloud Cloud SDK. For instructions on installing the Cloud SDK, setting up a project with a service account, and obtaining an access token, see the quickstart.

curl -s -H "Content-Type: application/json" \
    -H "Authorization: Bearer $(gcloud auth application-default print-access-token)" \
    https://speech.googleapis.com/v1/speech:recognize \
    --data '{
    "config": {
        "encoding": "LINEAR16",
        "languageCode": "en-US",
        "enableWordTimeOffsets": false,
        "enableAutomaticPunctuation": true,
        "model": "phone_call",
        "useEnhanced": true
    },
    "audio": {
        "uri": "gs://cloud-samples-tests/speech/commercial_mono.wav"
    }
}'

See the RecognitionConfig reference documentation for more information on configuring the request body.

If the request is successful, the server returns a 200 OK HTTP status code and the response in JSON format:

{
  "results": [
    {
      "alternatives": [
        {
          "transcript": "Hi, I'd like to buy a Chromecast. I was wondering whether you could help me with that.",
          "confidence": 0.8930228
        }
      ],
      "resultEndTime": "5.640s"
    },
    {
      "alternatives": [
        {
          "transcript": " Certainly, which color would you like? We are blue black and red.",
          "confidence": 0.9101991
        }
      ],
      "resultEndTime": "10.220s"
    },
    {
      "alternatives": [
        {
          "transcript": " Let's go with the black one.",
          "confidence": 0.8818244
        }
      ],
      "resultEndTime": "13.870s"
    },
    {
      "alternatives": [
        {
          "transcript": " Would you like the new Chromecast Ultra model or the regular Chromecast?",
          "confidence": 0.94733626
        }
      ],
      "resultEndTime": "18.460s"
    },
    {
      "alternatives": [
        {
          "transcript": " Regular Chromecast is fine. Thank you. Okay. Sure. Would you like to ship it regular or Express?",
          "confidence": 0.9519095
        }
      ],
      "resultEndTime": "25.930s"
    },
    {
      "alternatives": [
        {
          "transcript": " Express, please.",
          "confidence": 0.9101229
        }
      ],
      "resultEndTime": "28.260s"
    },
    {
      "alternatives": [
        {
          "transcript": " Terrific. It's on the way. Thank you. Thank you very much. Bye.",
          "confidence": 0.9321616
        }
      ],
      "resultEndTime": "34.150s"
    }
 ]
}

C#

static object SyncRecognizeEnhancedModel(string filePath)
{
    var speech = SpeechClient.Create();
    var response = speech.Recognize(new RecognitionConfig()
    {
        Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
        SampleRateHertz = 8000,
        LanguageCode = "en-US",
        UseEnhanced = true,
        Model = "phone_call",
    }, RecognitionAudio.FromFile(filePath));
    foreach (var result in response.Results)
    {
        foreach (var alternative in result.Alternatives)
        {
            Console.WriteLine(alternative.Transcript);
        }
    }
    return 0;
}

Go


func enhancedModel(w io.Writer, path string) error {
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		return fmt.Errorf("NewClient: %v", err)
	}

	// path = "../testdata/commercial_mono.wav"
	data, err := ioutil.ReadFile(path)
	if err != nil {
		return fmt.Errorf("ReadFile: %v", err)
	}

	resp, err := client.Recognize(ctx, &speechpb.RecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:        speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz: 8000,
			LanguageCode:    "en-US",
			UseEnhanced:     true,
			// A model must be specified to use enhanced model.
			Model: "phone_call",
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Content{Content: data},
		},
	})
	if err != nil {
		return fmt.Errorf("Recognize: %v", err)
	}

	for i, result := range resp.Results {
		fmt.Fprintf(w, "%s\n", strings.Repeat("-", 20))
		fmt.Fprintf(w, "Result %d\n", i+1)
		for j, alternative := range result.Alternatives {
			fmt.Fprintf(w, "Alternative %d: %s\n", j+1, alternative.Transcript)
		}
	}
	return nil
}

Python

import io

from google.cloud import speech

client = speech.SpeechClient()

# path = 'resources/commercial_mono.wav'
with io.open(path, "rb") as audio_file:
    content = audio_file.read()

audio = speech.RecognitionAudio(content=content)
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=8000,
    language_code="en-US",
    # Enhanced models are only available to projects that
    # opt in for audio data collection.
    use_enhanced=True,
    # A model must be specified to use enhanced model.
    model="phone_call",
)

response = client.recognize(config=config, audio=audio)

for i, result in enumerate(response.results):
    alternative = result.alternatives[0]
    print("-" * 20)
    print("First alternative of result {}".format(i))
    print("Transcript: {}".format(alternative.transcript))

Java

/**
 * Transcribe the given audio file using an enhanced model.
 *
 * @param fileName the path to an audio file.
 */
public static void transcribeFileWithEnhancedModel(String fileName) throws Exception {
  Path path = Paths.get(fileName);
  byte[] content = Files.readAllBytes(path);

  try (SpeechClient speechClient = SpeechClient.create()) {
    // Get the contents of the local audio file
    RecognitionAudio recognitionAudio =
        RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();

    // Configure request to enable enhanced models
    RecognitionConfig config =
        RecognitionConfig.newBuilder()
            .setEncoding(AudioEncoding.LINEAR16)
            .setLanguageCode("en-US")
            .setSampleRateHertz(8000)
            .setUseEnhanced(true)
            // A model must be specified to use enhanced model.
            .setModel("phone_call")
            .build();

    // Perform the transcription request
    RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);

    // Print out the results
    for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
      // There can be several alternative transcripts for a given chunk of speech. Just use the
      // first (most likely) one here.
      SpeechRecognitionAlternative alternative = result.getAlternatives(0);
      System.out.format("Transcript: %s\n\n", alternative.getTranscript());
    }
  }
}

Node.js

// Imports the Google Cloud client library for Beta API
/**
 * TODO(developer): Update client library import to use new
 * version of API when desired features become available
 */
const speech = require('@google-cloud/speech').v1p1beta1;
const fs = require('fs');

// Creates a client
const client = new speech.SpeechClient();

/**
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// const filename = 'Local path to audio file, e.g. /path/to/audio.raw';
// const encoding = 'Encoding of the audio file, e.g. LINEAR16';
// const sampleRateHertz = 16000;
// const languageCode = 'BCP-47 language code, e.g. en-US';

const config = {
  encoding: encoding,
  languageCode: languageCode,
  useEnhanced: true,
  model: 'phone_call',
};
const audio = {
  content: fs.readFileSync(filename).toString('base64'),
};

const request = {
  config: config,
  audio: audio,
};

// Detects speech in the audio file
const [response] = await client.recognize(request);
response.results.forEach(result => {
  const alternative = result.alternatives[0];
  console.log(alternative.transcript);
});

PHP

use Google\Cloud\Speech\V1\SpeechClient;
use Google\Cloud\Speech\V1\RecognitionAudio;
use Google\Cloud\Speech\V1\RecognitionConfig;
use Google\Cloud\Speech\V1\RecognitionConfig\AudioEncoding;

/** Uncomment and populate these variables in your code */
// $audioFile = 'path to an audio file';

// change these variables if necessary
$encoding = AudioEncoding::LINEAR16;
$sampleRateHertz = 8000;
$languageCode = 'en-US';

// get contents of a file into a string
$content = file_get_contents($audioFile);

// set string as audio content
$audio = (new RecognitionAudio())
    ->setContent($content);

// set config
$config = (new RecognitionConfig())
    ->setEncoding($encoding)
    ->setSampleRateHertz($sampleRateHertz)
    ->setLanguageCode($languageCode)
    ->setUseEnhanced(true)
    ->setModel('phone_call');

// create the speech client
$client = new SpeechClient();

// make the API call
$response = $client->recognize($config, $audio);
$results = $response->getResults();

// print results
foreach ($results as $result) {
    $alternatives = $result->getAlternatives();
    $mostLikely = $alternatives[0];
    $transcript = $mostLikely->getTranscript();
    $confidence = $mostLikely->getConfidence();
    printf('Transcript: %s' . PHP_EOL, $transcript);
    printf('Confidence: %s' . PHP_EOL, $confidence);
}

$client->close();

Ruby

# audio_file_path = "path/to/audio.wav"

require "google/cloud/speech"

speech = Google::Cloud::Speech.speech

config = {
  encoding:          :LINEAR16,
  sample_rate_hertz: 8000,
  language_code:     "en-US",
  use_enhanced:      true,
  model:             "phone_call"
}

audio_file = File.binread audio_file_path
audio      = { content: audio_file }

operation = speech.long_running_recognize config: config, audio: audio

puts "Operation started"

operation.wait_until_done!

raise operation.results.message if operation.error?

results = operation.response.results

results.each_with_index do |result, i|
  alternative = result.alternatives.first
  puts "-" * 20
  puts "First alternative of result #{i}"
  puts "Transcript: #{alternative.transcript}"
end

What's next

Review how to make synchronous transcription requests.