Transcription of a local file with diarization

Recognize multiple speakers in a local audio file.

Code sample

Java

/*
 * Please include the following imports to run this sample.
 *
 * import com.google.api.gax.longrunning.OperationFuture;
 * import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeMetadata;
 * import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeRequest;
 * import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeResponse;
 * import com.google.cloud.speech.v1p1beta1.RecognitionAudio;
 * import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
 * import com.google.cloud.speech.v1p1beta1.SpeechClient;
 * import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
 * import com.google.cloud.speech.v1p1beta1.SpeechRecognitionResult;
 * import com.google.cloud.speech.v1p1beta1.WordInfo;
 * import com.google.protobuf.ByteString;
 * import java.nio.file.Files;
 * import java.nio.file.Path;
 * import java.nio.file.Paths;
 */

public static void sampleLongRunningRecognize() {
  // TODO(developer): Replace these variables before running the sample.
  String localFilePath = "resources/commercial_mono.wav";
  sampleLongRunningRecognize(localFilePath);
}

/**
 * Separates different speakers in a recording of a local audio file.
 *
 * @param localFilePath Path to local audio file, e.g. /path/audio.wav
 */
public static void sampleLongRunningRecognize(String localFilePath) {
  try (SpeechClient speechClient = SpeechClient.create()) {

    // If enabled, each word in the first alternative of each result will be
    // tagged with a speaker tag to identify the speaker.
    boolean enableSpeakerDiarization = true;

    // Optional. Specifies the estimated number of speakers in the conversation.
    int diarizationSpeakerCount = 2;

    // The language of the supplied audio
    String languageCode = "en-US";
    RecognitionConfig config =
        RecognitionConfig.newBuilder()
            .setEnableSpeakerDiarization(enableSpeakerDiarization)
            .setDiarizationSpeakerCount(diarizationSpeakerCount)
            .setLanguageCode(languageCode)
            .build();
    Path path = Paths.get(localFilePath);
    byte[] data = Files.readAllBytes(path);
    ByteString content = ByteString.copyFrom(data);
    RecognitionAudio audio = RecognitionAudio.newBuilder().setContent(content).build();
    LongRunningRecognizeRequest request =
        LongRunningRecognizeRequest.newBuilder().setConfig(config).setAudio(audio).build();
    OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> future =
        speechClient.longRunningRecognizeAsync(request);

    System.out.println("Waiting for operation to complete...");
    LongRunningRecognizeResponse response = future.get();
    for (SpeechRecognitionResult result : response.getResultsList()) {
      // First alternative has words tagged with speakers
      SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
      System.out.printf("Transcript: %s\n", alternative.getTranscript());
      // Print the speakerTag of each word
      for (WordInfo word : alternative.getWordsList()) {
        System.out.printf("Word: %s\n", word.getWord());
        System.out.printf("Speaker tag: %s\n", word.getSpeakerTag());
      }
    }
  } catch (Exception exception) {
    System.err.println("Speech recognition failed: " + exception);
  }
}

Node.js

const fs = require('fs');

// Imports the Google Cloud client library
const speech = require('@google-cloud/speech').v1p1beta1;

// Creates a client
const client = new speech.SpeechClient();

async function main() {
  /**
   * TODO(developer): Uncomment the following line before running the sample.
   */
  // const fileName = 'Local path to audio file, e.g. /path/to/audio.raw';

  const config = {
    encoding: 'LINEAR16',
    sampleRateHertz: 8000,
    languageCode: 'en-US',
    enableSpeakerDiarization: true,
    diarizationSpeakerCount: 2,
    model: 'phone_call',
  };

  const audio = {
    content: fs.readFileSync(fileName).toString('base64'),
  };

  const request = {
    config: config,
    audio: audio,
  };

  const [response] = await client.recognize(request);
  const transcription = response.results
    .map(result => result.alternatives[0].transcript)
    .join('\n');
  console.log(`Transcription: ${transcription}`);
  console.log('Speaker Diarization:');
  // Note: The transcript within each result is separate and sequential per result.
  // However, the words list within an alternative includes all the words
  // from all the results thus far. Thus, to get all the words with speaker
  // tags, you only have to take the words list from the last result:
  const result = response.results[response.results.length - 1];
  const wordsInfo = result.alternatives[0].words;
  wordsInfo.forEach(a =>
    console.log(` word: ${a.word}, speakerTag: ${a.speakerTag}`)
  );
}

main().catch(console.error);

Python

from google.cloud import speech_v1p1beta1 as speech

client = speech.SpeechClient()

speech_file = "resources/commercial_mono.wav"

with open(speech_file, "rb") as audio_file:
    content = audio_file.read()

audio = speech.RecognitionAudio(content=content)

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=8000,
    language_code="en-US",
    enable_speaker_diarization=True,
    diarization_speaker_count=2,
)

print("Waiting for operation to complete...")
response = client.recognize(config=config, audio=audio)

# The transcript within each result is separate and sequential per result.
# However, the words list within an alternative includes all the words
# from all the results thus far. Thus, to get all the words with speaker
# tags, you only have to take the words list from the last result:
result = response.results[-1]

words_info = result.alternatives[0].words

# Printing out the output:
for word_info in words_info:
    print(f"word: '{word_info.word}', speaker_tag: {word_info.speaker_tag}")
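
The word-level speaker tags can also be folded into one transcript per speaker. The sketch below is not part of the official sample; it assumes the response object produced by the Python example above and simply groups words by their speaker_tag field, reading the cumulative words list from the last result as the comments above describe.

from collections import defaultdict

# Minimal sketch (assumption): `response` comes from the Python sample above.
# Group each diarized word under its speaker tag.
words_info = response.results[-1].alternatives[0].words

per_speaker = defaultdict(list)  # hypothetical helper: speaker_tag -> list of words
for word_info in words_info:
    per_speaker[word_info.speaker_tag].append(word_info.word)

# Print one reconstructed transcript per speaker.
for speaker_tag, words in sorted(per_speaker.items()):
    print(f"Speaker {speaker_tag}: {' '.join(words)}")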

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.