Transcribe a Cloud Storage file with diarization (beta)

Recognize multiple speakers in an audio file stored in Cloud Storage.

Explore further

For detailed documentation that includes this code sample, see the Speech-to-Text documentation on speaker diarization.

Code sample

Go

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Go API reference documentation.

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
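The sample below relies on Application Default Credentials. For illustration only, this minimal sketch shows how the same client could instead be created with an explicit service-account key file; the helper name and key path are placeholders, not part of the original sample.

import (
	"context"

	speech "cloud.google.com/go/speech/apiv1"
	"google.golang.org/api/option"
)

// newClientWithKeyFile is a hypothetical helper: it builds the same client
// as the sample, but reads credentials from an explicit key file instead of
// relying on Application Default Credentials.
func newClientWithKeyFile(ctx context.Context) (*speech.Client, error) {
	return speech.NewClient(ctx, option.WithCredentialsFile("/path/to/key.json"))
}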


import (
	"context"
	"fmt"
	"io"
	"strings"

	speech "cloud.google.com/go/speech/apiv1"
	"cloud.google.com/go/speech/apiv1/speechpb"
)

// transcribeDiarizationGCSBeta transcribes a remote audio file stored in
// Cloud Storage using speaker diarization.
func transcribeDiarizationGCSBeta(w io.Writer) error {
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		return fmt.Errorf("NewClient: %w", err)
	}
	defer client.Close()

	diarizationConfig := &speechpb.SpeakerDiarizationConfig{
		EnableSpeakerDiarization: true,
		MinSpeakerCount:          2,
		MaxSpeakerCount:          2,
	}

	recognitionConfig := &speechpb.RecognitionConfig{
		Encoding:          speechpb.RecognitionConfig_LINEAR16,
		SampleRateHertz:   8000,
		LanguageCode:      "en-US",
		DiarizationConfig: diarizationConfig,
	}

	// Set the remote path for the audio file
	audio := &speechpb.RecognitionAudio{
		AudioSource: &speechpb.RecognitionAudio_Uri{Uri: "gs://cloud-samples-tests/speech/commercial_mono.wav"},
	}

	longRunningRecognizeRequest := &speechpb.LongRunningRecognizeRequest{
		Config: recognitionConfig,
		Audio:  audio,
	}

	operation, err := client.LongRunningRecognize(ctx, longRunningRecognizeRequest)
	if err != nil {
		return fmt.Errorf("error running recognize %w", err)
	}

	response, err := operation.Wait(ctx)
	if err != nil {
		return err
	}

	// Speaker Tags are only included in the last result object, which has only one
	// alternative.
	alternative := response.Results[len(response.Results)-1].Alternatives[0]

	wordInfo := alternative.GetWords()[0]
	currentSpeakerTag := wordInfo.GetSpeakerTag()

	var speakerWords strings.Builder

	speakerWords.WriteString(fmt.Sprintf("Speaker %d: %s", wordInfo.GetSpeakerTag(), wordInfo.GetWord()))

	// For each word, get all the words associated with one speaker, once the speaker changes,
	// add a new line with the new speaker and their spoken words.
	for i := 1; i < len(alternative.Words); i++ {
		wordInfo := alternative.Words[i]
		if currentSpeakerTag == wordInfo.GetSpeakerTag() {
			speakerWords.WriteString(" ")
			speakerWords.WriteString(wordInfo.GetWord())
		} else {
			speakerWords.WriteString(fmt.Sprintf("\nSpeaker %d: %s",
				wordInfo.GetSpeakerTag(), wordInfo.GetWord()))
			currentSpeakerTag = wordInfo.GetSpeakerTag()
		}
	}
	fmt.Fprint(w, speakerWords.String())
	return nil
}
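
To run the Go sample above as a standalone program, a minimal harness along these lines should work; the package layout and the hard-coded output destination are assumptions for illustration, not part of the original sample.

package main

import (
	"log"
	"os"
)

func main() {
	// Print the speaker-labeled transcript produced by the sample to stdout.
	if err := transcribeDiarizationGCSBeta(os.Stdout); err != nil {
		log.Fatal(err)
	}
}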

Java

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Java API reference documentation.

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

/**
 * Transcribe a remote audio file using speaker diarization.
 *
 * @param gcsUri the Cloud Storage URI of the audio file, e.g. gs://[BUCKET]/[FILE]
 */
public static void transcribeDiarizationGcs(String gcsUri) throws Exception {
  try (SpeechClient speechClient = SpeechClient.create()) {
    SpeakerDiarizationConfig speakerDiarizationConfig =
        SpeakerDiarizationConfig.newBuilder()
            .setEnableSpeakerDiarization(true)
            .setMinSpeakerCount(2)
            .setMaxSpeakerCount(2)
            .build();

    // Configure the request to enable speaker diarization
    RecognitionConfig config =
        RecognitionConfig.newBuilder()
            .setEncoding(AudioEncoding.LINEAR16)
            .setLanguageCode("en-US")
            .setSampleRateHertz(8000)
            .setDiarizationConfig(speakerDiarizationConfig)
            .build();

    // Set the remote path for the audio file
    RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();

    // Use a non-blocking call to get the file transcription
    OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
        speechClient.longRunningRecognizeAsync(config, audio);

    while (!response.isDone()) {
      System.out.println("Waiting for response...");
      Thread.sleep(10000);
    }

    // Speaker Tags are only included in the last result object, which has only one alternative.
    LongRunningRecognizeResponse longRunningRecognizeResponse = response.get();
    SpeechRecognitionAlternative alternative =
        longRunningRecognizeResponse
            .getResults(longRunningRecognizeResponse.getResultsCount() - 1)
            .getAlternatives(0);

    // The alternative is made up of WordInfo objects that contain the speaker_tag.
    WordInfo wordInfo = alternative.getWords(0);
    int currentSpeakerTag = wordInfo.getSpeakerTag();

    // For each word, get all the words associated with one speaker, once the speaker changes,
    // add a new line with the new speaker and their spoken words.
    StringBuilder speakerWords =
        new StringBuilder(
            String.format("Speaker %d: %s", wordInfo.getSpeakerTag(), wordInfo.getWord()));

    for (int i = 1; i < alternative.getWordsCount(); i++) {
      wordInfo = alternative.getWords(i);
      if (currentSpeakerTag == wordInfo.getSpeakerTag()) {
        speakerWords.append(" ");
        speakerWords.append(wordInfo.getWord());
      } else {
        speakerWords.append(
            String.format("\nSpeaker %d: %s", wordInfo.getSpeakerTag(), wordInfo.getWord()));
        currentSpeakerTag = wordInfo.getSpeakerTag();
      }
    }

    System.out.println(speakerWords.toString());
  }
}

Node.js

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Node.js API reference documentation.

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

// Imports the Google Cloud client library
const speech = require('@google-cloud/speech').v1p1beta1;

// Creates a client
const client = new speech.SpeechClient();

/**
 * TODO(developer): Uncomment the following line before running the sample.
 */
// const gcsUri = 'gs://bucket/audio.wav';

const config = {
  encoding: 'LINEAR16',
  sampleRateHertz: 8000,
  languageCode: 'en-US',
  diarizationConfig: {
    enableSpeakerDiarization: true,
    minSpeakerCount: 2,
    maxSpeakerCount: 2,
  },
  model: 'phone_call',
};

const audio = {
  uri: gcsUri,
};

const request = {
  config: config,
  audio: audio,
};

const [response] = await client.recognize(request);
const transcription = response.results
  .map(result => result.alternatives[0].transcript)
  .join('\n');
console.log(`Transcription: ${transcription}`);
console.log('Speaker Diarization:');
const result = response.results[response.results.length - 1];
const wordsInfo = result.alternatives[0].words;
// Note: The transcript within each result is separate and sequential per result.
// However, the words list within an alternative includes all the words
// from all the results thus far. Thus, to get all the words with speaker
// tags, you only have to take the words list from the last result:
wordsInfo.forEach(a =>
  console.log(` word: ${a.word}, speakerTag: ${a.speakerTag}`)
);

Python

To learn how to install and use the client library for Speech-to-Text, see Speech-to-Text client libraries. For more information, see the Speech-to-Text Python API reference documentation.

To authenticate to Speech-to-Text, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.


from google.cloud import speech


def transcribe_diarization_gcs_beta(audio_uri: str) -> bool:
    """Transcribe a remote audio file (stored in Google Cloud Storage) using speaker diarization.
    Args:
        audio_uri (str): The Google Cloud Storage path to an audio file.
            E.g., gs://[BUCKET]/[FILE]
    Returns:
        True once the operation has completed successfully; errors raise an exception.
    """

    client = speech.SpeechClient()
    # Configure speaker diarization
    speaker_diarization_config = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=2,  # Set minimum number of speakers
        max_speaker_count=2,  # Adjust max speakers based on expected number of speakers
    )

    # Configure the recognition request
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code="en-US",
        sample_rate_hertz=8000,
        diarization_config=speaker_diarization_config,
    )

    # Set the remote path for the audio file
    audio = speech.RecognitionAudio(
        uri=audio_uri,
    )

    # Start a long-running recognition and wait (up to 300 s) for the result
    response = client.long_running_recognize(
        config=recognition_config, audio=audio
    ).result(timeout=300)

    # The transcript within each result is separate and sequential per result.
    # However, the words list within an alternative includes all the words
    # from all the results thus far. Thus, to get all the words with speaker
    # tags, you only have to take the words list from the last result
    result = response.results[-1]
    words_info = result.alternatives[0].words

    # Print the output
    for word_info in words_info:
        print(f"word: '{word_info.word}', speaker_tag: {word_info.speaker_tag}")
    return True

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.