Selecting a transcription model

This page describes how to use a specific machine learning model for audio transcription requests to Speech-to-Text.

Transcription models

Speech-to-Text detects words in an audio clip by comparing input to one of many machine learning models. Each model has been trained by analyzing millions of examples—in this case, many, many audio recordings of people speaking.

Speech-to-Text has specialized models trained from audio from specific sources, for example phone calls or videos. Because of this training process, these specialized models provide better results when applied towards similar kinds of audio data.

For example, Speech-to-Text has a transcription model trained to recognize speech captured on a phone. When Speech-to-Text uses this model to transcribe phone audio, it produces significantly better results than other models.

The following table shows the transcriptions models available for use with Speech-to-Text.

Model name Description
command_and_search Best for short or single-word utterances like voice commands or voice search.
phone_call Best for audio that originated from a phone call (typically recorded at an 8khz sampling rate).
video

Best for audio that originated from video or that includes more than one person talking. Ideally the audio is recorded at a 16khz or greater sampling rate.

This is a premium model that costs more than the standard rate. See the pricing page for more details.

default Best for audio that does not fit the other audio models, like long-form audio or dictation. Ideally the audio is high-fidelity, recorded at a 16khz or greater sampling rate.

Selecting a model for audio transcription

To specify a specific model to use for audio transcription, you must set the model field to one of the allowed values—video, phone_call, command_and_search, or default—in the RecognitionConfig parameters for the request. Speech-to-Text supports model selection for all speech recognition methods: speech:recognize, speech:longrunningrecognize, and Streaming.

Perform transcription of a local audio file

Protocol

Refer to the [speech:recognize] API endpoint for complete details.

To perform synchronous speech recognition, make a POST request and provide the appropriate request body. The following shows an example of a POST request using curl. The example uses the access token for a service account set up for the project using the Google Cloud Cloud SDK. For instructions on installing the Cloud SDK, setting up a project with a service account, and obtaining an access token, see the quickstart.

curl -s -H "Content-Type: application/json" \
    -H "Authorization: Bearer $(gcloud auth application-default print-access-token)" \
    https://speech.googleapis.com/v1/speech:recognize \
    --data '{
    "config": {
        "encoding": "LINEAR16",
        "sampleRateHertz": 16000,
        "languageCode": "en-US",
        "model": "video"
    },
    "audio": {
        "uri": "gs://cloud-samples-tests/speech/Google_Gnome.wav"
    }
}'

See the RecognitionConfig reference documentation for more information on configuring the request body.

If the request is successful, the server returns a 200 OK HTTP status code and the response in JSON format:

{
  "results": [
    {
      "alternatives": [
        {
          "transcript": "OK Google stream stranger things from
            Netflix to my TV okay stranger things from
            Netflix playing on TV from the people that brought you
            Google home comes the next evolution of the smart home
            and it's just outside your window me Google know hi
            how can I help okay no what's the weather like outside
            the weather outside is sunny and 76 degrees he's right
            okay no turn on the hose I'm holding sure okay no I'm can
            I eat this lemon tree leaf yes what about this Daisy yes
            but I wouldn't recommend it but I could eat it okay
            Nomad milk to my shopping list I'm sorry that sounds like
            an indoor request I keep doing that sorry you do keep
            doing that okay no is this compost really we're all
            compost if you think about it pretty much everything is
            made up of organic matter and will return",
          "confidence": 0.9251011
        }
      ]
    }
  ]
}

C#

static object SyncRecognizeModelSelection(string filePath, string model)
{
    var speech = SpeechClient.Create();
    var response = speech.Recognize(new RecognitionConfig()
    {
        Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
        SampleRateHertz = 16000,
        LanguageCode = "en",
        // The `model` value must be one of the following:
        // "video", "phone_call", "command_and_search", "default"
        Model = model
    }, RecognitionAudio.FromFile(filePath));
    foreach (var result in response.Results)
    {
        foreach (var alternative in result.Alternatives)
        {
            Console.WriteLine(alternative.Transcript);
        }
    }
    return 0;
}

Go


func modelSelection(w io.Writer, path string) error {
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		return fmt.Errorf("NewClient: %v", err)
	}

	// path = "../testdata/Google_Gnome.wav"
	data, err := ioutil.ReadFile(path)
	if err != nil {
		return fmt.Errorf("ReadFile: %v", err)
	}

	req := &speechpb.RecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:        speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz: 16000,
			LanguageCode:    "en-US",
			Model:           "video",
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Content{Content: data},
		},
	}

	resp, err := client.Recognize(ctx, req)
	if err != nil {
		return fmt.Errorf("Recognize: %v", err)
	}

	for i, result := range resp.Results {
		fmt.Fprintf(w, "%s\n", strings.Repeat("-", 20))
		fmt.Fprintf(w, "Result %d\n", i+1)
		for j, alternative := range result.Alternatives {
			fmt.Fprintf(w, "Alternative %d: %s\n", j+1, alternative.Transcript)
		}
	}
	return nil
}

Java

/**
 * Performs transcription of the given audio file synchronously with the selected model.
 *
 * @param fileName the path to a audio file to transcribe
 */
public static void transcribeModelSelection(String fileName) throws Exception {
  Path path = Paths.get(fileName);
  byte[] content = Files.readAllBytes(path);

  try (SpeechClient speech = SpeechClient.create()) {
    // Configure request with video media type
    RecognitionConfig recConfig =
        RecognitionConfig.newBuilder()
            // encoding may either be omitted or must match the value in the file header
            .setEncoding(AudioEncoding.LINEAR16)
            .setLanguageCode("en-US")
            // sample rate hertz may be either be omitted or must match the value in the file
            // header
            .setSampleRateHertz(16000)
            .setModel("video")
            .build();

    RecognitionAudio recognitionAudio =
        RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();

    RecognizeResponse recognizeResponse = speech.recognize(recConfig, recognitionAudio);
    // Just print the first result here.
    SpeechRecognitionResult result = recognizeResponse.getResultsList().get(0);
    // There can be several alternative transcripts for a given chunk of speech. Just use the
    // first (most likely) one here.
    SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
    System.out.printf("Transcript : %s\n", alternative.getTranscript());
  }
}

Node.js

// Imports the Google Cloud client library for Beta API
/**
 * TODO(developer): Update client library import to use new
 * version of API when desired features become available
 */
const speech = require('@google-cloud/speech').v1p1beta1;
const fs = require('fs');

// Creates a client
const client = new speech.SpeechClient();

/**
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// const filename = 'Local path to audio file, e.g. /path/to/audio.raw';
// const model = 'Model to use, e.g. phone_call, video, default';
// const encoding = 'Encoding of the audio file, e.g. LINEAR16';
// const sampleRateHertz = 16000;
// const languageCode = 'BCP-47 language code, e.g. en-US';

const config = {
  encoding: encoding,
  sampleRateHertz: sampleRateHertz,
  languageCode: languageCode,
  model: model,
};
const audio = {
  content: fs.readFileSync(filename).toString('base64'),
};

const request = {
  config: config,
  audio: audio,
};

// Detects speech in the audio file
const [response] = await client.recognize(request);
const transcription = response.results
  .map(result => result.alternatives[0].transcript)
  .join('\n');
console.log('Transcription: ', transcription);

PHP

use Google\Cloud\Speech\V1\SpeechClient;
use Google\Cloud\Speech\V1\RecognitionAudio;
use Google\Cloud\Speech\V1\RecognitionConfig;
use Google\Cloud\Speech\V1\RecognitionConfig\AudioEncoding;

/** Uncomment and populate these variables in your code */
// $audioFile = 'path to an audio file';
// $model = 'video';

// change these variables if necessary
$encoding = AudioEncoding::LINEAR16;
$sampleRateHertz = 32000;
$languageCode = 'en-US';

// get contents of a file into a string
$content = file_get_contents($audioFile);

// set string as audio content
$audio = (new RecognitionAudio())
    ->setContent($content);

// set config
$config = (new RecognitionConfig())
    ->setEncoding($encoding)
    ->setSampleRateHertz($sampleRateHertz)
    ->setLanguageCode($languageCode)
    ->setModel($model);

// create the speech client
$client = new SpeechClient();

// make the API call
$response = $client->recognize($config, $audio);
$results = $response->getResults();

// print results
foreach ($results as $result) {
    $alternatives = $result->getAlternatives();
    $mostLikely = $alternatives[0];
    $transcript = $mostLikely->getTranscript();
    $confidence = $mostLikely->getConfidence();
    printf('Transcript: %s' . PHP_EOL, $transcript);
    printf('Confidence: %s' . PHP_EOL, $confidence);
}

$client->close();

Python

def transcribe_model_selection(speech_file, model):
    """Transcribe the given audio file synchronously with
    the selected model."""
    from google.cloud import speech

    client = speech.SpeechClient()

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        model=model,
    )

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print("First alternative of result {}".format(i))
        print(u"Transcript: {}".format(alternative.transcript))

Ruby

# file_path = "path/to/audio.wav"

require "google/cloud/speech"

speech = Google::Cloud::Speech.speech

config = {
  encoding:          :LINEAR16,
  sample_rate_hertz: 16_000,
  language_code:     "en-US",
  model:             model
}

file  = File.binread file_path
audio = { content: file }

operation = speech.long_running_recognize config: config, audio: audio

puts "Operation started"

operation.wait_until_done!

raise operation.results.message if operation.error?

results = operation.response.results

results.each_with_index do |result, i|
  alternative = result.alternatives.first
  puts "-" * 20
  puts "First alternative of result #{i}"
  puts "Transcript: #{alternative.transcript}"
end

Perform transcription of a Google Cloud Storage audio file

Java

/**
 * Performs transcription of the remote audio file asynchronously with the selected model.
 *
 * @param gcsUri the path to the remote audio file to transcribe.
 */
public static void transcribeModelSelectionGcs(String gcsUri) throws Exception {
  try (SpeechClient speech = SpeechClient.create()) {

    // Configure request with video media type
    RecognitionConfig config =
        RecognitionConfig.newBuilder()
            // encoding may either be omitted or must match the value in the file header
            .setEncoding(AudioEncoding.LINEAR16)
            .setLanguageCode("en-US")
            // sample rate hertz may be either be omitted or must match the value in the file
            // header
            .setSampleRateHertz(16000)
            .setModel("video")
            .build();

    RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();

    // Use non-blocking call for getting file transcription
    OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
        speech.longRunningRecognizeAsync(config, audio);

    while (!response.isDone()) {
      System.out.println("Waiting for response...");
      Thread.sleep(10000);
    }

    List<SpeechRecognitionResult> results = response.get().getResultsList();

    // Just print the first result here.
    SpeechRecognitionResult result = results.get(0);
    // There can be several alternative transcripts for a given chunk of speech. Just use the
    // first (most likely) one here.
    SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
    System.out.printf("Transcript : %s\n", alternative.getTranscript());
  }
}

Node.js

// Imports the Google Cloud client library for Beta API
/**
 * TODO(developer): Update client library import to use new
 * version of API when desired features become available
 */
const speech = require('@google-cloud/speech').v1p1beta1;

// Creates a client
const client = new speech.SpeechClient();

/**
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// const gcsUri = 'gs://my-bucket/audio.raw';
// const model = 'Model to use, e.g. phone_call, video, default';
// const encoding = 'Encoding of the audio file, e.g. LINEAR16';
// const sampleRateHertz = 16000;
// const languageCode = 'BCP-47 language code, e.g. en-US';

const config = {
  encoding: encoding,
  sampleRateHertz: sampleRateHertz,
  languageCode: languageCode,
  model: model,
};
const audio = {
  uri: gcsUri,
};

const request = {
  config: config,
  audio: audio,
};

// Detects speech in the audio file.
const [response] = await client.recognize(request);
const transcription = response.results
  .map(result => result.alternatives[0].transcript)
  .join('\n');
console.log('Transcription: ', transcription);