Transcribing short audio files

This page demonstrates how to transcribe a short audio file to text using synchronous speech recognition.

Synchronous speech recognition returns the recognized text for short audio (less than ~1 minute) in the response as soon as it is processed. To process a speech recognition request for long audio, use Asynchronous Speech Recognition.

Audio content can be sent directly to Speech-to-Text, or it can process audio content that already resides in Google Cloud Storage. See also the audio limits for synchronous speech recognition requests.

Performing synchronous speech recognition on a local file

Here is an example of performing synchronous speech recognition on a local audio file:

Protocol

Refer to the speech:recognize API endpoint for complete details.

To perform synchronous speech recognition, make a POST request and provide the appropriate request body. The following shows an example of a POST request using curl. The example uses the access token for a service account set up for the project using the Google Cloud Cloud SDK. For instructions on installing the Cloud SDK, setting up a project with a service account, and obtaining an access token, see the quickstart.

curl -X POST \
     -H "Authorization: Bearer "$(gcloud auth application-default print-access-token) \
     -H "Content-Type: application/json; charset=utf-8" \
     --data "{
  'config': {
    'encoding': 'LINEAR16',
    'sampleRateHertz': 16000,
    'languageCode': 'en-US',
    'enableWordTimeOffsets': false
  },
  'audio': {
    'content': '/9j/7QBEUGhvdG9zaG9...base64-encoded-audio-content...fXNWzvDEeYxxxzj/Coa6Bax//Z'
  }
}" "https://speech.googleapis.com/v1/speech:recognize"
  

See the RecognitionConfig reference documentation for more information on configuring the request body.

The audio content supplied in the request body is base64-encoded. For more information on how to base64-encode audio, see Base64 Encoding Audio Content. For more information on the content field, see RecognitionAudio.

If the request is successful, the server returns a 200 OK HTTP status code and the response in JSON format:

{
  "results": [
    {
      "alternatives": [
        {
          "transcript": "how old is the Brooklyn Bridge",
          "confidence": 0.98267895
        }
      ]
    }
  ]
}

gcloud command

Refer to recognize command for complete details.

To perform speech recognition on a local file, use the gcloud command line tool, passing in the local filepath of the file to perform speech recognition on.

gcloud ml speech recognize PATH-TO-LOCAL-FILE --language-code='en-US'

If the request is successful, the server returns a response in JSON format:

{
  "results": [
    {
      "alternatives": [
        {
          "confidence": 0.9840146,
          "transcript": "how old is the Brooklyn Bridge"
        }
      ]
    }
  ]
}

C#

static object SyncRecognize(string filePath)
{
    var speech = SpeechClient.Create();
    var response = speech.Recognize(new RecognitionConfig()
    {
        Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
        SampleRateHertz = 16000,
        LanguageCode = "en",
    }, RecognitionAudio.FromFile(filePath));
    foreach (var result in response.Results)
    {
        foreach (var alternative in result.Alternatives)
        {
            Console.WriteLine(alternative.Transcript);
        }
    }
    return 0;
}

Go


func recognize(w io.Writer, file string) error {
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		return err
	}

	data, err := ioutil.ReadFile(file)
	if err != nil {
		return err
	}

	// Send the contents of the audio file with the encoding and
	// and sample rate information to be transcripted.
	resp, err := client.Recognize(ctx, &speechpb.RecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:        speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz: 16000,
			LanguageCode:    "en-US",
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Content{Content: data},
		},
	})

	// Print the results.
	for _, result := range resp.Results {
		for _, alt := range result.Alternatives {
			fmt.Fprintf(w, "\"%v\" (confidence=%3f)\n", alt.Transcript, alt.Confidence)
		}
	}
	return nil
}

Java

/*
 * Please include the following imports to run this sample.
 *
 * import com.google.cloud.speech.v1.RecognitionAudio;
 * import com.google.cloud.speech.v1.RecognitionConfig;
 * import com.google.cloud.speech.v1.RecognizeRequest;
 * import com.google.cloud.speech.v1.RecognizeResponse;
 * import com.google.cloud.speech.v1.SpeechClient;
 * import com.google.cloud.speech.v1.SpeechRecognitionAlternative;
 * import com.google.cloud.speech.v1.SpeechRecognitionResult;
 * import com.google.protobuf.ByteString;
 * import java.nio.file.Files;
 * import java.nio.file.Path;
 * import java.nio.file.Paths;
 */

public static void sampleRecognize() {
  // TODO(developer): Replace these variables before running the sample.
  String localFilePath = "resources/brooklyn_bridge.raw";
  sampleRecognize(localFilePath);
}

/**
 * Transcribe a short audio file using synchronous speech recognition
 *
 * @param localFilePath Path to local audio file, e.g. /path/audio.wav
 */
public static void sampleRecognize(String localFilePath) {
  try (SpeechClient speechClient = SpeechClient.create()) {

    // The language of the supplied audio
    String languageCode = "en-US";

    // Sample rate in Hertz of the audio data sent
    int sampleRateHertz = 16000;

    // Encoding of audio data sent. This sample sets this explicitly.
    // This field is optional for FLAC and WAV audio formats.
    RecognitionConfig.AudioEncoding encoding = RecognitionConfig.AudioEncoding.LINEAR16;
    RecognitionConfig config =
        RecognitionConfig.newBuilder()
            .setLanguageCode(languageCode)
            .setSampleRateHertz(sampleRateHertz)
            .setEncoding(encoding)
            .build();
    Path path = Paths.get(localFilePath);
    byte[] data = Files.readAllBytes(path);
    ByteString content = ByteString.copyFrom(data);
    RecognitionAudio audio = RecognitionAudio.newBuilder().setContent(content).build();
    RecognizeRequest request =
        RecognizeRequest.newBuilder().setConfig(config).setAudio(audio).build();
    RecognizeResponse response = speechClient.recognize(request);
    for (SpeechRecognitionResult result : response.getResultsList()) {
      // First alternative is the most probable result
      SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
      System.out.printf("Transcript: %s\n", alternative.getTranscript());
    }
  } catch (Exception exception) {
    System.err.println("Failed to create the client due to: " + exception);
  }
}

Node.js

// Imports the Google Cloud client library
const fs = require('fs');
const speech = require('@google-cloud/speech');

// Creates a client
const client = new speech.SpeechClient();

/**
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// const filename = 'Local path to audio file, e.g. /path/to/audio.raw';
// const encoding = 'Encoding of the audio file, e.g. LINEAR16';
// const sampleRateHertz = 16000;
// const languageCode = 'BCP-47 language code, e.g. en-US';

const config = {
  encoding: encoding,
  sampleRateHertz: sampleRateHertz,
  languageCode: languageCode,
};
const audio = {
  content: fs.readFileSync(filename).toString('base64'),
};

const request = {
  config: config,
  audio: audio,
};

// Detects speech in the audio file
const [response] = await client.recognize(request);
const transcription = response.results
  .map(result => result.alternatives[0].transcript)
  .join('\n');
console.log('Transcription: ', transcription);

PHP

use Google\Cloud\Speech\V1\SpeechClient;
use Google\Cloud\Speech\V1\RecognitionAudio;
use Google\Cloud\Speech\V1\RecognitionConfig;
use Google\Cloud\Speech\V1\RecognitionConfig\AudioEncoding;

/** Uncomment and populate these variables in your code */
// $audioFile = 'path to an audio file';

// change these variables if necessary
$encoding = AudioEncoding::LINEAR16;
$sampleRateHertz = 32000;
$languageCode = 'en-US';

// get contents of a file into a string
$content = file_get_contents($audioFile);

// set string as audio content
$audio = (new RecognitionAudio())
    ->setContent($content);

// set config
$config = (new RecognitionConfig())
    ->setEncoding($encoding)
    ->setSampleRateHertz($sampleRateHertz)
    ->setLanguageCode($languageCode);

// create the speech client
$client = new SpeechClient();

try {
    $response = $client->recognize($config, $audio);
    foreach ($response->getResults() as $result) {
        $alternatives = $result->getAlternatives();
        $mostLikely = $alternatives[0];
        $transcript = $mostLikely->getTranscript();
        $confidence = $mostLikely->getConfidence();
        printf('Transcript: %s' . PHP_EOL, $transcript);
        printf('Confidence: %s' . PHP_EOL, $confidence);
    }
} finally {
    $client->close();
}

Python

from google.cloud import speech_v1
from google.cloud.speech_v1 import enums
import io


def sample_recognize(local_file_path):
    """
    Transcribe a short audio file using synchronous speech recognition

    Args:
      local_file_path Path to local audio file, e.g. /path/audio.wav
    """

    client = speech_v1.SpeechClient()

    # local_file_path = 'resources/brooklyn_bridge.raw'

    # The language of the supplied audio
    language_code = "en-US"

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = 16000

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "language_code": language_code,
        "sample_rate_hertz": sample_rate_hertz,
        "encoding": encoding,
    }
    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = {"content": content}

    response = client.recognize(config, audio)
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))

Ruby

# audio_file_path = "Path to file on which to perform speech recognition"

require "google/cloud/speech"

speech = Google::Cloud::Speech.speech

audio_file = File.binread audio_file_path
config     = { encoding:          :LINEAR16,
               sample_rate_hertz: 16_000,
               language_code:     "en-US" }
audio      = { content: audio_file }

response = speech.recognize config: config, audio: audio

results = response.results

alternatives = results.first.alternatives
alternatives.each do |alternative|
  puts "Transcription: #{alternative.transcript}"
end

Performing synchronous speech recognition on a remote file

For your convenience, Speech-to-Text API can perform synchronous speech recognition directly on an audio file located in Google Cloud Storage, without the need to send the contents of the audio file in the body of your request.

Here is an example of performing synchronous speech recognition on a file located in Cloud Storage:

Protocol

Refer to the speech:recognize API endpoint for complete details.

To perform synchronous speech recognition, make a POST request and provide the appropriate request body. The following shows an example of a POST request using curl. The example uses the access token for a service account set up for the project using the Google Cloud Cloud SDK. For instructions on installing the Cloud SDK, setting up a project with a service account, and obtaining an access token, see the quickstart.

curl -X POST -H "Authorization: Bearer "$(gcloud auth application-default print-access-token) \
     -H "Content-Type: application/json; charset=utf-8" \
     --data "{
  'config': {
    'encoding': 'LINEAR16',
    'sampleRateHertz': 16000,
    'languageCode': 'en-US'
  },
  'audio': {
    'uri': 'gs://YOUR_BUCKET_NAME/YOUR_FILE_NAME'
  }
}" "https://speech.googleapis.com/v1/speech:recognize"

See the RecognitionConfig reference documentation for more information on configuring the request body.

If the request is successful, the server returns a 200 OK HTTP status code and the response in JSON format:

{
  "results": [
    {
      "alternatives": [
        {
          "transcript": "how old is the Brooklyn Bridge",
          "confidence": 0.98267895
        }
      ]
    }
  ]
}

gcloud command

Refer to recognize command for complete details.

To perform speech recognition on a local file, use the gcloud command line tool, passing in the local filepath of the file to perform speech recognition on.

gcloud ml speech recognize 'gs://cloud-samples-tests/speech/brooklyn.flac' \
--language-code='en-US'

If the request is successful, the server returns a response in JSON format:

{
  "results": [
    {
      "alternatives": [
        {
          "confidence": 0.9840146,
          "transcript": "how old is the Brooklyn Bridge"
        }
      ]
    }
  ]
}

C#

static object SyncRecognizeGcs(string storageUri)
{
    var speech = SpeechClient.Create();
    var response = speech.Recognize(new RecognitionConfig()
    {
        Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
        SampleRateHertz = 16000,
        LanguageCode = "en",
    }, RecognitionAudio.FromStorageUri(storageUri));
    foreach (var result in response.Results)
    {
        foreach (var alternative in result.Alternatives)
        {
            Console.WriteLine(alternative.Transcript);
        }
    }
    return 0;
}

Go


func recognizeGCS(w io.Writer, gcsURI string) error {
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		return err
	}

	// Send the request with the URI (gs://...)
	// and sample rate information to be transcripted.
	resp, err := client.Recognize(ctx, &speechpb.RecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:        speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz: 16000,
			LanguageCode:    "en-US",
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Uri{Uri: gcsURI},
		},
	})

	// Print the results.
	for _, result := range resp.Results {
		for _, alt := range result.Alternatives {
			fmt.Fprintf(w, "\"%v\" (confidence=%3f)\n", alt.Transcript, alt.Confidence)
		}
	}
	return nil
}

Java

/*
 * Please include the following imports to run this sample.
 *
 * import com.google.cloud.speech.v1.RecognitionAudio;
 * import com.google.cloud.speech.v1.RecognitionConfig;
 * import com.google.cloud.speech.v1.RecognizeRequest;
 * import com.google.cloud.speech.v1.RecognizeResponse;
 * import com.google.cloud.speech.v1.SpeechClient;
 * import com.google.cloud.speech.v1.SpeechRecognitionAlternative;
 * import com.google.cloud.speech.v1.SpeechRecognitionResult;
 */

public static void sampleRecognize() {
  // TODO(developer): Replace these variables before running the sample.
  String storageUri = "gs://cloud-samples-data/speech/brooklyn_bridge.raw";
  sampleRecognize(storageUri);
}

/**
 * Transcribe short audio file from Cloud Storage using synchronous speech recognition
 *
 * @param storageUri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
 */
public static void sampleRecognize(String storageUri) {
  try (SpeechClient speechClient = SpeechClient.create()) {

    // Sample rate in Hertz of the audio data sent
    int sampleRateHertz = 16000;

    // The language of the supplied audio
    String languageCode = "en-US";

    // Encoding of audio data sent. This sample sets this explicitly.
    // This field is optional for FLAC and WAV audio formats.
    RecognitionConfig.AudioEncoding encoding = RecognitionConfig.AudioEncoding.LINEAR16;
    RecognitionConfig config =
        RecognitionConfig.newBuilder()
            .setSampleRateHertz(sampleRateHertz)
            .setLanguageCode(languageCode)
            .setEncoding(encoding)
            .build();
    RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(storageUri).build();
    RecognizeRequest request =
        RecognizeRequest.newBuilder().setConfig(config).setAudio(audio).build();
    RecognizeResponse response = speechClient.recognize(request);
    for (SpeechRecognitionResult result : response.getResultsList()) {
      // First alternative is the most probable result
      SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
      System.out.printf("Transcript: %s\n", alternative.getTranscript());
    }
  } catch (Exception exception) {
    System.err.println("Failed to create the client due to: " + exception);
  }
}

Node.js

// Imports the Google Cloud client library
const speech = require('@google-cloud/speech');

// Creates a client
const client = new speech.SpeechClient();

/**
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// const gcsUri = 'gs://my-bucket/audio.raw';
// const encoding = 'Encoding of the audio file, e.g. LINEAR16';
// const sampleRateHertz = 16000;
// const languageCode = 'BCP-47 language code, e.g. en-US';

const config = {
  encoding: encoding,
  sampleRateHertz: sampleRateHertz,
  languageCode: languageCode,
};
const audio = {
  uri: gcsUri,
};

const request = {
  config: config,
  audio: audio,
};

// Detects speech in the audio file
const [response] = await client.recognize(request);
const transcription = response.results
  .map(result => result.alternatives[0].transcript)
  .join('\n');
console.log('Transcription: ', transcription);

PHP

use Google\Cloud\Speech\V1\SpeechClient;
use Google\Cloud\Speech\V1\RecognitionAudio;
use Google\Cloud\Speech\V1\RecognitionConfig;
use Google\Cloud\Speech\V1\RecognitionConfig\AudioEncoding;

/** Uncomment and populate these variables in your code */
// $uri = 'The Cloud Storage object to transcribe (gs://your-bucket-name/your-object-name)';

// change these variables if necessary
$encoding = AudioEncoding::LINEAR16;
$sampleRateHertz = 32000;
$languageCode = 'en-US';

// set string as audio content
$audio = (new RecognitionAudio())
    ->setUri($uri);

// set config
$config = (new RecognitionConfig())
    ->setEncoding($encoding)
    ->setSampleRateHertz($sampleRateHertz)
    ->setLanguageCode($languageCode);

// create the speech client
$client = new SpeechClient();

try {
    $response = $client->recognize($config, $audio);
    foreach ($response->getResults() as $result) {
        $alternatives = $result->getAlternatives();
        $mostLikely = $alternatives[0];
        $transcript = $mostLikely->getTranscript();
        $confidence = $mostLikely->getConfidence();
        printf('Transcript: %s' . PHP_EOL, $transcript);
        printf('Confidence: %s' . PHP_EOL, $confidence);
    }
} finally {
    $client->close();
}

Python

from google.cloud import speech_v1
from google.cloud.speech_v1 import enums


def sample_recognize(storage_uri):
    """
    Transcribe short audio file from Cloud Storage using synchronous speech
    recognition

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    """

    client = speech_v1.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.raw'

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = 16000

    # The language of the supplied audio
    language_code = "en-US"

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "encoding": encoding,
    }
    audio = {"uri": storage_uri}

    response = client.recognize(config, audio)
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))

Ruby

# storage_path = "Path to file in Cloud Storage, eg. gs://bucket/audio.raw"

require "google/cloud/speech"

speech = Google::Cloud::Speech.speech

config = { encoding:          :LINEAR16,
           sample_rate_hertz: 16_000,
           language_code:     "en-US" }
audio  = { uri: storage_path }

response = speech.recognize config: config, audio: audio

results = response.results

alternatives = results.first.alternatives
alternatives.each do |alternative|
  puts "Transcription: #{alternative.transcript}"
end