Transcription with diarization

Identify the different speakers in the audio sample.

Code sample


To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

class TranscribeDiarization {

  static void transcribeDiarization() throws IOException {
    // TODO(developer): Replace these variables before running the sample.
    String fileName = "resources/commercial_mono.wav";

  // Transcribe the given audio file using speaker diarization.
  static void transcribeDiarization(String fileName) throws IOException {
    Path path = Paths.get(fileName);
    byte[] content = Files.readAllBytes(path);

    // Initialize client that will be used to send requests. This client only needs to be created
    // once, and can be reused for multiple requests. After completing all of your requests, call
    // the "close" method on the client to safely clean up any remaining background resources.
    try (SpeechClient client = SpeechClient.create()) {
      // Get the contents of the local audio file
      RecognitionAudio recognitionAudio =
      SpeakerDiarizationConfig speakerDiarizationConfig =
      // Configure request to enable Speaker diarization
      RecognitionConfig config =

      // Perform the transcription request
      RecognizeResponse recognizeResponse = client.recognize(config, recognitionAudio);

      // Speaker Tags are only included in the last result object, which has only one alternative.
      SpeechRecognitionAlternative alternative =
          recognizeResponse.getResults(recognizeResponse.getResultsCount() - 1).getAlternatives(0);
      // The alternative is made up of WordInfo objects that contain the speaker_tag.
      WordInfo wordInfo = alternative.getWords(0);
      int currentSpeakerTag = wordInfo.getSpeakerTag();
      // For each word, get all the words associated with one speaker, once the speaker changes,
      // add a new line with the new speaker and their spoken words.
      StringBuilder speakerWords =
          new StringBuilder(
              String.format("Speaker %d: %s", wordInfo.getSpeakerTag(), wordInfo.getWord()));
      for (int i = 1; i < alternative.getWordsCount(); i++) {
        wordInfo = alternative.getWords(i);
        if (currentSpeakerTag == wordInfo.getSpeakerTag()) {
          speakerWords.append(" ");
        } else {
              String.format("\nSpeaker %d: %s", wordInfo.getSpeakerTag(), wordInfo.getWord()));
          currentSpeakerTag = wordInfo.getSpeakerTag();


To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

const fs = require('fs');

// Imports the Google Cloud client library
const speech = require('@google-cloud/speech');

// Creates a client
const client = new speech.SpeechClient();

// Set config for Diarization
const diarizationConfig = {
  enableSpeakerDiarization: true,
  maxSpeakerCount: 2,

const config = {
  encoding: 'LINEAR16',
  sampleRateHertz: 8000,
  languageCode: 'en-US',
  diarizationConfig: diarizationConfig,
  model: 'phone_call',

 * TODO(developer): Uncomment the following lines before running the sample.
// const fileName = 'Local path to audio file, e.g. /path/to/audio.raw';

const audio = {
  content: fs.readFileSync(fileName).toString('base64'),

const request = {
  config: config,
  audio: audio,

const [response] = await client.recognize(request);
const transcription = response.results
  .map(result => result.alternatives[0].transcript)
console.log(`Transcription: ${transcription}`);
console.log('Speaker Diarization:');
const result = response.results[response.results.length - 1];
const wordsInfo = result.alternatives[0].words;
// Note: The transcript within each result is separate and sequential per result.
// However, the words list within an alternative includes all the words
// from all the results thus far. Thus, to get all the words with speaker
// tags, you only have to take the words list from the last result:
wordsInfo.forEach(a =>
  console.log(` word: ${a.word}, speakerTag: ${a.speakerTag}`)

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.