Asynchronously transcribe an audio file with time offsets

Perform asynchronous transcription including time offsets on an audio file stored in Cloud Storage.

Documentation pages that include this code sample

To view the code sample used in context, see the following documentation:

Code sample

Go


func asyncWords(client *speech.Client, out io.Writer, gcsURI string) error {
	ctx := context.Background()

	// Send the contents of the audio file with the encoding and
	// and sample rate information to be transcripted.
	req := &speechpb.LongRunningRecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:              speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz:       16000,
			LanguageCode:          "en-US",
			EnableWordTimeOffsets: true,
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Uri{Uri: gcsURI},
		},
	}

	op, err := client.LongRunningRecognize(ctx, req)
	if err != nil {
		return err
	}
	resp, err := op.Wait(ctx)
	if err != nil {
		return err
	}

	// Print the results.
	for _, result := range resp.Results {
		for _, alt := range result.Alternatives {
			fmt.Fprintf(out, "\"%v\" (confidence=%3f)\n", alt.Transcript, alt.Confidence)
			for _, w := range alt.Words {
				fmt.Fprintf(out,
					"Word: \"%v\" (startTime=%3f, endTime=%3f)\n",
					w.Word,
					float64(w.StartTime.Seconds)+float64(w.StartTime.Nanos)*1e-9,
					float64(w.EndTime.Seconds)+float64(w.EndTime.Nanos)*1e-9,
				)
			}
		}
	}
	return nil
}

Java

/**
 * Performs non-blocking speech recognition on remote FLAC file and prints the transcription as
 * well as word time offsets.
 *
 * @param gcsUri the path to the remote LINEAR16 audio file to transcribe.
 */
public static void asyncRecognizeWords(String gcsUri) throws Exception {
  // Instantiates a client with GOOGLE_APPLICATION_CREDENTIALS
  try (SpeechClient speech = SpeechClient.create()) {

    // Configure remote file request for FLAC
    RecognitionConfig config =
        RecognitionConfig.newBuilder()
            .setEncoding(AudioEncoding.FLAC)
            .setLanguageCode("en-US")
            .setSampleRateHertz(16000)
            .setEnableWordTimeOffsets(true)
            .build();
    RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();

    // Use non-blocking call for getting file transcription
    OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
        speech.longRunningRecognizeAsync(config, audio);
    while (!response.isDone()) {
      System.out.println("Waiting for response...");
      Thread.sleep(10000);
    }

    List<SpeechRecognitionResult> results = response.get().getResultsList();

    for (SpeechRecognitionResult result : results) {
      // There can be several alternative transcripts for a given chunk of speech. Just use the
      // first (most likely) one here.
      SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
      System.out.printf("Transcription: %s\n", alternative.getTranscript());
      for (WordInfo wordInfo : alternative.getWordsList()) {
        System.out.println(wordInfo.getWord());
        System.out.printf(
            "\t%s.%s sec - %s.%s sec\n",
            wordInfo.getStartTime().getSeconds(),
            wordInfo.getStartTime().getNanos() / 100000000,
            wordInfo.getEndTime().getSeconds(),
            wordInfo.getEndTime().getNanos() / 100000000);
      }
    }
  }
}

Node.js

// Imports the Google Cloud client library
const speech = require('@google-cloud/speech');

// Creates a client
const client = new speech.SpeechClient();

/**
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// const gcsUri = 'gs://my-bucket/audio.raw';
// const encoding = 'Encoding of the audio file, e.g. LINEAR16';
// const sampleRateHertz = 16000;
// const languageCode = 'BCP-47 language code, e.g. en-US';

const config = {
  enableWordTimeOffsets: true,
  encoding: encoding,
  sampleRateHertz: sampleRateHertz,
  languageCode: languageCode,
};

const audio = {
  uri: gcsUri,
};

const request = {
  config: config,
  audio: audio,
};

// Detects speech in the audio file. This creates a recognition job that you
// can wait for now, or get its result later.
const [operation] = await client.longRunningRecognize(request);

// Get a Promise representation of the final result of the job
const [response] = await operation.promise();
response.results.forEach(result => {
  console.log(`Transcription: ${result.alternatives[0].transcript}`);
  result.alternatives[0].words.forEach(wordInfo => {
    // NOTE: If you have a time offset exceeding 2^32 seconds, use the
    // wordInfo.{x}Time.seconds.high to calculate seconds.
    const startSecs =
      `${wordInfo.startTime.seconds}` +
      '.' +
      wordInfo.startTime.nanos / 100000000;
    const endSecs =
      `${wordInfo.endTime.seconds}` +
      '.' +
      wordInfo.endTime.nanos / 100000000;
    console.log(`Word: ${wordInfo.word}`);
    console.log(`\t ${startSecs} secs - ${endSecs} secs`);
  });
});

PHP

use Google\Cloud\Speech\V1\SpeechClient;
use Google\Cloud\Speech\V1\RecognitionAudio;
use Google\Cloud\Speech\V1\RecognitionConfig;
use Google\Cloud\Speech\V1\RecognitionConfig\AudioEncoding;

/** Uncomment and populate these variables in your code */
// $audioFile = 'path to an audio file';

// change these variables if necessary
$encoding = AudioEncoding::LINEAR16;
$sampleRateHertz = 32000;
$languageCode = 'en-US';

if (!extension_loaded('grpc')) {
    throw new \Exception('Install the grpc extension (pecl install grpc)');
}

// When true, time offsets for every word will be included in the response.
$enableWordTimeOffsets = true;

// get contents of a file into a string
$content = file_get_contents($audioFile);

// set string as audio content
$audio = (new RecognitionAudio())
    ->setContent($content);

// set config
$config = (new RecognitionConfig())
    ->setEncoding($encoding)
    ->setSampleRateHertz($sampleRateHertz)
    ->setLanguageCode($languageCode)
    ->setEnableWordTimeOffsets($enableWordTimeOffsets);

// create the speech client
$client = new SpeechClient();

// create the asyncronous recognize operation
$operation = $client->longRunningRecognize($config, $audio);
$operation->pollUntilComplete();

if ($operation->operationSucceeded()) {
    $response = $operation->getResult();

    // each result is for a consecutive portion of the audio. iterate
    // through them to get the transcripts for the entire audio file.
    foreach ($response->getResults() as $result) {
        $alternatives = $result->getAlternatives();
        $mostLikely = $alternatives[0];
        $transcript = $mostLikely->getTranscript();
        $confidence = $mostLikely->getConfidence();
        printf('Transcript: %s' . PHP_EOL, $transcript);
        printf('Confidence: %s' . PHP_EOL, $confidence);
        foreach ($mostLikely->getWords() as $wordInfo) {
            $startTime = $wordInfo->getStartTime();
            $endTime = $wordInfo->getEndTime();
            printf('  Word: %s (start: %s, end: %s)' . PHP_EOL,
                $wordInfo->getWord(),
                $startTime->serializeToJsonString(),
                $endTime->serializeToJsonString());
        }
    }
} else {
    print_r($operation->getError());
}

$client->close();

Python

def transcribe_gcs_with_word_time_offsets(gcs_uri):
    """Transcribe the given audio file asynchronously and output the word time
    offsets."""
    from google.cloud import speech

    client = speech.SpeechClient()

    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_word_time_offsets=True,
    )

    operation = client.long_running_recognize(config=config, audio=audio)

    print("Waiting for operation to complete...")
    result = operation.result(timeout=90)

    for result in result.results:
        alternative = result.alternatives[0]
        print("Transcript: {}".format(alternative.transcript))
        print("Confidence: {}".format(alternative.confidence))

        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time

            print(
                f"Word: {word}, start_time: {start_time.total_seconds()}, end_time: {end_time.total_seconds()}"
            )

Ruby

# storage_path = "Path to file in Cloud Storage, eg. gs://bucket/audio.raw"

require "google/cloud/speech"

speech = Google::Cloud::Speech.speech

config = { encoding:                 :LINEAR16,
           sample_rate_hertz:        16_000,
           language_code:            "en-US",
           enable_word_time_offsets: true }
audio  = { uri: storage_path }

operation = speech.long_running_recognize config: config, audio: audio

puts "Operation started"

operation.wait_until_done!

raise operation.results.message if operation.error?

results = operation.response.results

results.first.alternatives.each do |alternative|
  puts "Transcription: #{alternative.transcript}"

  alternative.words.each do |word|
    start_time = word.start_time.seconds + word.start_time.nanos / 1_000_000_000.0
    end_time   = word.end_time.seconds + word.end_time.nanos / 1_000_000_000.0

    puts "Word: #{word.word} #{start_time} #{end_time}"
  end
end

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.