Transcribing audio with multiple channels

This page describes how to transcribe audio files that include more than one channel with Speech-to-Text.

Audio data often includes a channel for each speaker present in the recording. For example, in audio of two people talking over the phone, the audio might contain two channels where each line is recorded separately.

To transcribe audio data that includes multiple channels, you must provide the number of channels in your request to the Speech-to-Text API. In your request, set the audioChannelCount field to the number of channels present in your audio.
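
If you're not sure how many channels your audio contains, you can inspect the file before building the request. The following is a minimal sketch for a local WAV file using Python's standard wave module; the filename stereo.wav is a placeholder:

import wave

# Placeholder filename; substitute your own WAV file.
with wave.open("stereo.wav", "rb") as f:
    # getnchannels() returns 1 for mono, 2 for stereo, and so on.
    # Use this value for the audioChannelCount field in your request.
    print("Channels:", f.getnchannels())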

When you send a request with multiple channels to Speech-to-Text, it returns a result that identifies the different channels present in the audio, labeling the alternatives for each result with the channelTag field.

The following code samples show how to transcribe audio that contains multiple channels.

Protocol

Refer to the speech:recognize API endpoint for complete details.

To perform synchronous speech recognition, make a POST request and provide the appropriate request body. The following shows an example of a POST request using curl. The example uses an access token for a service account set up for the project using the Google Cloud SDK. For instructions on installing the Cloud SDK, setting up a project with a service account, and obtaining an access token, see the quickstart.

The following example shows how to send a POST request using curl, where the body of the request specifies the number of channels present in the audio sample.

curl -X POST -H "Authorization: Bearer $(gcloud auth application-default print-access-token)" \
     -H "Content-Type: application/json; charset=utf-8" \
     --data '{
    "config": {
        "encoding": "LINEAR16",
        "languageCode": "en-US",
        "audioChannelCount": 2,
        "enableSeparateRecognitionPerChannel": true
    },
    "audio": {
        "uri": "gs://cloud-samples-tests/speech/commercial_stereo.wav"
    }
}' "https://speech.googleapis.com/v1/speech:recognize" > multi-channel.txt

If the request is successful, the server returns a 200 OK HTTP status code and the response in JSON format, saved to a file named multi-channel.txt.

{
  "results": [
    {
      "alternatives": [
        {
          "transcript": "hi I'd like to buy a Chromecast I'm always wondering whether you could help me with that",
          "confidence": 0.8991147
        }
      ],
      "channelTag": 1,
      "languageCode": "en-us"
    },
    {
      "alternatives": [
        {
          "transcript": "certainly which color would you like we have blue black and red",
          "confidence": 0.9408236
        }
      ],
      "channelTag": 2,
      "languageCode": "en-us"
    },
    {
      "alternatives": [
        {
          "transcript": " let's go with the black one",
          "confidence": 0.98783094
        }
      ],
      "channelTag": 1,
      "languageCode": "en-us"
    },
    {
      "alternatives": [
        {
          "transcript": " would you like the new Chromecast Ultra model or the regular Chromecast",
          "confidence": 0.9573053
        }
      ],
      "channelTag": 2,
      "languageCode": "en-us"
    },
    {
      "alternatives": [
        {
          "transcript": " regular Chromecast is fine thank you",
          "confidence": 0.9671048
        }
      ],
      "channelTag": 1,
      "languageCode": "en-us"
    },
    {
      "alternatives": [
        {
          "transcript": " okay sure would you like to ship it regular or Express",
          "confidence": 0.9544821
        }
      ],
      "channelTag": 2,
      "languageCode": "en-us"
    },
    {
      "alternatives": [
        {
          "transcript": " express please",
          "confidence": 0.9487205
        }
      ],
      "channelTag": 1,
      "languageCode": "en-us"
    },
    {
      "alternatives": [
        {
          "transcript": " terrific it's on the way thank you",
          "confidence": 0.97655964
        }
      ],
      "channelTag": 2,
      "languageCode": "en-us"
    },
    {
      "alternatives": [
        {
          "transcript": " thank you very much bye",
          "confidence": 0.9735077
        }
      ],
      "channelTag": 1,
      "languageCode": "en-us"
    }
  ]
}
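
The channelTag values let you reassemble each side of the conversation from the response. The following is a minimal post-processing sketch in Python, assuming the response was saved to multi-channel.txt as in the curl example above:

import json
from collections import defaultdict

# Load the JSON response saved by the curl example.
with open("multi-channel.txt") as f:
    response = json.load(f)

# Group the top (most likely) alternative of each result by its channel.
by_channel = defaultdict(list)
for result in response["results"]:
    by_channel[result["channelTag"]].append(
        result["alternatives"][0]["transcript"])

for channel, lines in sorted(by_channel.items()):
    print("Channel {}:".format(channel))
    for line in lines:
        print("  " + line)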

Go

import (
	"context"
	"fmt"
	"io"
	"io/ioutil"

	speech "cloud.google.com/go/speech/apiv1"
	speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
)

// transcribeMultichannel generates a transcript from a multichannel speech file
// and tags the speech from each channel.
func transcribeMultichannel(w io.Writer, path string) error {
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		return fmt.Errorf("NewClient: %v", err)
	}

	data, err := ioutil.ReadFile(path)
	if err != nil {
		return fmt.Errorf("ReadFile: %v", err)
	}

	resp, err := client.Recognize(ctx, &speechpb.RecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:                            speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz:                     44100,
			LanguageCode:                        "en-US",
			AudioChannelCount:                   2,
			EnableSeparateRecognitionPerChannel: true,
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Content{Content: data},
		},
	})
	if err != nil {
		return fmt.Errorf("Recognize: %v", err)
	}

	// Print the results.
	for _, result := range resp.Results {
		for _, alt := range result.Alternatives {
			fmt.Fprintf(w, "Channel %v: %v\n", result.ChannelTag, alt.Transcript)
		}
	}
	return nil
}

Java

/**
 * Transcribe a remote audio file with multi-channel recognition
 *
 * @param gcsUri the Cloud Storage URI of the audio file
 */
public static void transcribeMultiChannelGcs(String gcsUri) throws Exception {

  try (SpeechClient speechClient = SpeechClient.create()) {

    // Configure request to enable multiple channels
    RecognitionConfig config =
        RecognitionConfig.newBuilder()
            .setEncoding(AudioEncoding.LINEAR16)
            .setLanguageCode("en-US")
            .setSampleRateHertz(44100)
            .setAudioChannelCount(2)
            .setEnableSeparateRecognitionPerChannel(true)
            .build();

    // Set the remote path for the audio file
    RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();

    // Use non-blocking call for getting file transcription
    OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
        speechClient.longRunningRecognizeAsync(config, audio);

    while (!response.isDone()) {
      System.out.println("Waiting for response...");
      Thread.sleep(10000);
    }
    // Iterate over the results and print the first (most likely) alternative of each.
    for (SpeechRecognitionResult result : response.get().getResultsList()) {

      // There can be several alternative transcripts for a given chunk of speech. Just use the
      // first (most likely) one here.
      SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);

      // Print out the result
      System.out.printf("Transcript : %s\n", alternative.getTranscript());
      System.out.printf("Channel Tag : %s\n", result.getChannelTag());
    }
  }
}

Node.js

const speech = require('@google-cloud/speech').v1;

// Creates a client
const client = new speech.SpeechClient();

const config = {
  encoding: 'LINEAR16',
  languageCode: 'en-US',
  audioChannelCount: 2,
  enableSeparateRecognitionPerChannel: true,
};

// URI of the audio file in Cloud Storage; commercial_stereo.wav is the
// two-channel sample used in the curl example above.
const gcsUri = 'gs://cloud-samples-tests/speech/commercial_stereo.wav';

const audio = {
  uri: gcsUri,
};

const request = {
  config: config,
  audio: audio,
};

(async () => {
  const [response] = await client.recognize(request);
  const transcription = response.results
    .map(
      result =>
        `Channel Tag: ${result.channelTag} ${result.alternatives[0].transcript}`
    )
    .join('\n');
  console.log(`Transcription: \n${transcription}`);
})();

Ruby

# storage_path = "Path to file in Cloud Storage, e.g. gs://bucket/audio.raw"

require "google/cloud/speech"

speech = Google::Cloud::Speech.speech

config = {
  encoding:                                :LINEAR16,
  sample_rate_hertz:                       44_100,
  language_code:                           "en-US",
  audio_channel_count:                     2,
  enable_separate_recognition_per_channel: true
}

audio = { uri: storage_path }

response = speech.recognize config: config, audio: audio

results = response.results

results.each_with_index do |result, i|
  alternative = result.alternatives.first
  puts "-" * 20
  puts "First alternative of result #{i}"
  puts "Transcript: #{alternative.transcript}"
  puts "Channel Tag: #{result.channel_tag}"
end

Python

from google.cloud import speech_v1

def sample_recognize(storage_uri):
    """
    Transcribe a short audio file from Cloud Storage with multiple channels

    Args:
      storage_uri: URI of the audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    """

    client = speech_v1.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/multi.wav'

    # The number of channels in the input audio file (optional)
    audio_channel_count = 2

    # When set to true, each audio channel will be recognized separately.
    # The recognition result will contain a channel_tag field to state which
    # channel that result belongs to
    enable_separate_recognition_per_channel = True

    # The language of the supplied audio
    language_code = "en-US"
    config = {
        "audio_channel_count": audio_channel_count,
        "enable_separate_recognition_per_channel": enable_separate_recognition_per_channel,
        "language_code": language_code,
    }
    audio = {"uri": storage_uri}

    response = client.recognize(config, audio)
    for result in response.results:
        # channel_tag to recognize which audio channel this result is for
        print(u"Channel tag: {}".format(result.channel_tag))
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))

C#

static object SyncRecognizeMultipleChannels(string filePath, int channelCount)
{
    var speech = SpeechClient.Create();

    // Create transcription request
    var response = speech.Recognize(new RecognitionConfig()
    {
        Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
        LanguageCode = "en",
        // Configure request to enable multiple channels
        EnableSeparateRecognitionPerChannel = true,
        AudioChannelCount = channelCount
        // Note: Sample uses local file.
    }, RecognitionAudio.FromFile(filePath));

    // Print out the results.
    foreach (var result in response.Results)
    {
        // There can be several transcripts for a chunk of audio.
        // Print out the first (most likely) one here.
        var alternative = result.Alternatives[0];
        Console.WriteLine($"Transcript: {alternative.Transcript}");
        Console.WriteLine($"Channel Tag: {result.ChannelTag}");
    }
    return 0;
}