Transcribe a streaming feed on a local file

Transcribe a streaming audio feed from a microphone on a local file.

Documentation pages that include this code sample

To view the code sample used in context, see the following documentation:

Code sample

Go

import (
	"context"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"

	speech "cloud.google.com/go/speech/apiv1"
	speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
)

func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: %s <AUDIOFILE>\n", filepath.Base(os.Args[0]))
		fmt.Fprintf(os.Stderr, "<AUDIOFILE> must be a path to a local audio file. Audio file must be a 16-bit signed little-endian encoded with a sample rate of 16000.\n")

	}
	flag.Parse()
	if len(flag.Args()) != 1 {
		log.Fatal("Please pass path to your local audio file as a command line argument")
	}
	audioFile := flag.Arg(0)

	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		log.Fatal(err)
	}
	stream, err := client.StreamingRecognize(ctx)
	if err != nil {
		log.Fatal(err)
	}
	// Send the initial configuration message.
	if err := stream.Send(&speechpb.StreamingRecognizeRequest{
		StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{
			StreamingConfig: &speechpb.StreamingRecognitionConfig{
				Config: &speechpb.RecognitionConfig{
					Encoding:        speechpb.RecognitionConfig_LINEAR16,
					SampleRateHertz: 16000,
					LanguageCode:    "en-US",
				},
			},
		},
	}); err != nil {
		log.Fatal(err)
	}

	f, err := os.Open(audioFile)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	go func() {
		buf := make([]byte, 1024)
		for {
			n, err := f.Read(buf)
			if n > 0 {
				if err := stream.Send(&speechpb.StreamingRecognizeRequest{
					StreamingRequest: &speechpb.StreamingRecognizeRequest_AudioContent{
						AudioContent: buf[:n],
					},
				}); err != nil {
					log.Printf("Could not send audio: %v", err)
				}
			}
			if err == io.EOF {
				// Nothing else to pipe, close the stream.
				if err := stream.CloseSend(); err != nil {
					log.Fatalf("Could not close stream: %v", err)
				}
				return
			}
			if err != nil {
				log.Printf("Could not read from %s: %v", audioFile, err)
				continue
			}
		}
	}()

	for {
		resp, err := stream.Recv()
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatalf("Cannot stream results: %v", err)
		}
		if err := resp.Error; err != nil {
			log.Fatalf("Could not recognize: %v", err)
		}
		for _, result := range resp.Results {
			fmt.Printf("Result: %+v\n", result)
		}
	}
}

Java

/**
 * Performs streaming speech recognition on raw PCM audio data.
 *
 * @param fileName the path to a PCM audio file to transcribe.
 */
public static void streamingRecognizeFile(String fileName) throws Exception, IOException {
  Path path = Paths.get(fileName);
  byte[] data = Files.readAllBytes(path);

  // Instantiates a client with GOOGLE_APPLICATION_CREDENTIALS
  try (SpeechClient speech = SpeechClient.create()) {

    // Configure request with local raw PCM audio
    RecognitionConfig recConfig =
        RecognitionConfig.newBuilder()
            .setEncoding(AudioEncoding.LINEAR16)
            .setLanguageCode("en-US")
            .setSampleRateHertz(16000)
            .setModel("default")
            .build();
    StreamingRecognitionConfig config =
        StreamingRecognitionConfig.newBuilder().setConfig(recConfig).build();

    class ResponseApiStreamingObserver<T> implements ApiStreamObserver<T> {
      private final SettableFuture<List<T>> future = SettableFuture.create();
      private final List<T> messages = new java.util.ArrayList<T>();

      @Override
      public void onNext(T message) {
        messages.add(message);
      }

      @Override
      public void onError(Throwable t) {
        future.setException(t);
      }

      @Override
      public void onCompleted() {
        future.set(messages);
      }

      // Returns the SettableFuture object to get received messages / exceptions.
      public SettableFuture<List<T>> future() {
        return future;
      }
    }

    ResponseApiStreamingObserver<StreamingRecognizeResponse> responseObserver =
        new ResponseApiStreamingObserver<>();

    BidiStreamingCallable<StreamingRecognizeRequest, StreamingRecognizeResponse> callable =
        speech.streamingRecognizeCallable();

    ApiStreamObserver<StreamingRecognizeRequest> requestObserver =
        callable.bidiStreamingCall(responseObserver);

    // The first request must **only** contain the audio configuration:
    requestObserver.onNext(
        StreamingRecognizeRequest.newBuilder().setStreamingConfig(config).build());

    // Subsequent requests must **only** contain the audio data.
    requestObserver.onNext(
        StreamingRecognizeRequest.newBuilder()
            .setAudioContent(ByteString.copyFrom(data))
            .build());

    // Mark transmission as completed after sending the data.
    requestObserver.onCompleted();

    List<StreamingRecognizeResponse> responses = responseObserver.future().get();

    for (StreamingRecognizeResponse response : responses) {
      // For streaming recognize, the results list has one is_final result (if available) followed
      // by a number of in-progress results (if iterim_results is true) for subsequent utterances.
      // Just print the first result here.
      StreamingRecognitionResult result = response.getResultsList().get(0);
      // There can be several alternative transcripts for a given chunk of speech. Just use the
      // first (most likely) one here.
      SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
      System.out.printf("Transcript : %s\n", alternative.getTranscript());
    }
  }
}

Node.js

const fs = require('fs');

// Imports the Google Cloud client library
const speech = require('@google-cloud/speech');

// Creates a client
const client = new speech.SpeechClient();

/**
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// const filename = 'Local path to audio file, e.g. /path/to/audio.raw';
// const encoding = 'Encoding of the audio file, e.g. LINEAR16';
// const sampleRateHertz = 16000;
// const languageCode = 'BCP-47 language code, e.g. en-US';

const request = {
  config: {
    encoding: encoding,
    sampleRateHertz: sampleRateHertz,
    languageCode: languageCode,
  },
  interimResults: false, // If you want interim results, set this to true
};

// Stream the audio to the Google Cloud Speech API
const recognizeStream = client
  .streamingRecognize(request)
  .on('error', console.error)
  .on('data', data => {
    console.log(
      `Transcription: ${data.results[0].alternatives[0].transcript}`
    );
  });

// Stream an audio file from disk to the Speech API, e.g. "./resources/audio.raw"
fs.createReadStream(filename).pipe(recognizeStream);

PHP

use Google\Cloud\Speech\V1\SpeechClient;
use Google\Cloud\Speech\V1\RecognitionConfig;
use Google\Cloud\Speech\V1\StreamingRecognitionConfig;
use Google\Cloud\Speech\V1\StreamingRecognizeRequest;
use Google\Cloud\Speech\V1\RecognitionConfig\AudioEncoding;

/** Uncomment and populate these variables in your code */
// $audioFile = 'path to an audio file';

// change these variables if necessary
$encoding = AudioEncoding::LINEAR16;
$sampleRateHertz = 32000;
$languageCode = 'en-US';

// the gRPC extension is required for streaming
if (!extension_loaded('grpc')) {
    throw new \Exception('Install the grpc extension (pecl install grpc)');
}

$speechClient = new SpeechClient();
try {
    $config = (new RecognitionConfig())
        ->setEncoding($encoding)
        ->setSampleRateHertz($sampleRateHertz)
        ->setLanguageCode($languageCode);

    $strmConfig = new StreamingRecognitionConfig();
    $strmConfig->setConfig($config);

    $strmReq = new StreamingRecognizeRequest();
    $strmReq->setStreamingConfig($strmConfig);

    $strm = $speechClient->streamingRecognize();
    $strm->write($strmReq);

    $strmReq = new StreamingRecognizeRequest();
    $content = file_get_contents($audioFile);
    $strmReq->setAudioContent($content);
    $strm->write($strmReq);

    foreach ($strm->closeWriteAndReadAll() as $response) {
        foreach ($response->getResults() as $result) {
            foreach ($result->getAlternatives() as $alt) {
                printf("Transcription: %s\n", $alt->getTranscript());
            }
        }
    }
} finally {
    $speechClient->close();
}

Python

def transcribe_streaming(stream_file):
    """Streams transcription of the given audio file."""
    import io
    from google.cloud import speech

    client = speech.SpeechClient()

    with io.open(stream_file, "rb") as audio_file:
        content = audio_file.read()

    # In practice, stream should be a generator yielding chunks of audio data.
    stream = [content]

    requests = (
        speech.StreamingRecognizeRequest(audio_content=chunk) for chunk in stream
    )

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    streaming_config = speech.StreamingRecognitionConfig(config=config)

    # streaming_recognize returns a generator.
    responses = client.streaming_recognize(
        config=streaming_config,
        requests=requests,
    )

    for response in responses:
        # Once the transcription has settled, the first result will contain the
        # is_final result. The other results will be for subsequent portions of
        # the audio.
        for result in response.results:
            print("Finished: {}".format(result.is_final))
            print("Stability: {}".format(result.stability))
            alternatives = result.alternatives
            # The alternatives are ordered from most likely to least.
            for alternative in alternatives:
                print("Confidence: {}".format(alternative.confidence))
                print(u"Transcript: {}".format(alternative.transcript))

Ruby

# audio_file_path = "Path to file on which to perform speech recognition"

require "google/cloud/speech"

speech = Google::Cloud::Speech.speech

audio_content = File.binread audio_file_path
bytes_total   = audio_content.size
bytes_sent    = 0
chunk_size    = 32_000

input_stream = Gapic::StreamInput.new
output_stream = speech.streaming_recognize input_stream

config = {
  config: {
    encoding:                 :LINEAR16,
    sample_rate_hertz:        16_000,
    language_code:            "en-US",
    enable_word_time_offsets: true
  }
}
input_stream.push streaming_config: config

# Simulated streaming from a microphone
# Stream bytes...
while bytes_sent < bytes_total
  input_stream.push audio_content: audio_content[bytes_sent, chunk_size]
  bytes_sent += chunk_size
  sleep 1
end

puts "Stopped passing"
input_stream.close

results = output_stream

results.each do |result|
  puts "Transcript: #{result}"
end

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser