Aumenta la exactitud de la transcripción con la mejora de la adaptación de voz

La mejora de la adaptación de voz es una función opcional de la adaptación de voz. La mejora te permite agregar pesos numéricos a las palabras o frases según la frecuencia con la que se deben reconocer en tus datos de audio.

Te recomendamos que implementes la mejora si, 1) ya implementaste la adaptación de voz y, 2) deseas ajustar la intensidad de los efectos de la adaptación de voz en los resultados de la transcripción.

Consulta la página de conceptos de adaptación de voz para obtener información sobre las prácticas recomendadas de la adaptación de voz y la mejora de la adaptación de voz.

En la siguiente muestra de código, se indica cómo enviar una solicitud con la mejora de la adaptación de voz.

LÍNEA DE CMD Y REST

Para obtener detalles sobre el extremo de la API, consulta speech:recognize.

Antes de usar cualquiera de los datos de solicitud siguientes, realiza los siguientes reemplazos:

  • language-code: Es el código BCP-47 del idioma que se habla en el clip de audio.
  • phrases-to-boost: Es la frase o las frases que deseas que Speech-to-Text priorice, como un arreglo de strings.
  • storage-bucket: Es un depósito de Cloud Storage.
  • input-audio: Son los datos de audio que deseas transcribir.

Método HTTP y URL:

POST https://speech.googleapis.com/v1p1beta1/speech:recognize

Cuerpo JSON de la solicitud:

{
  "config":{
      "languageCode":"language-code",
      "speechContexts":[{
          "phrases":[phrases-to-boost],
          "boost": 2
      }]
  },
  "audio":{
    "uri":"gs:storage-bucket/input-file"
  }
}

Para enviar tu solicitud, expande una de estas opciones:

Deberías recibir una respuesta JSON similar a la que se muestra a continuación:

{
  "results": [
    {
      "alternatives": [
        {
          "transcript": "When deciding whether to bring an umbrella, I consider the weather",
          "confidence": 0.9463943
        }
      ],
      "languageCode": "en-us"
    }
  ]
}

Java

import com.google.cloud.speech.v1p1beta1.RecognitionAudio;
import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
import com.google.cloud.speech.v1p1beta1.RecognizeRequest;
import com.google.cloud.speech.v1p1beta1.RecognizeResponse;
import com.google.cloud.speech.v1p1beta1.SpeechClient;
import com.google.cloud.speech.v1p1beta1.SpeechContext;
import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
import com.google.cloud.speech.v1p1beta1.SpeechRecognitionResult;
import java.io.IOException;

public class SpeechAdaptation {

  public void speechAdaptation() throws IOException {
    String uriPath = "gs://cloud-samples-data/speech/brooklyn_bridge.mp3";
    speechAdaptation(uriPath);
  }

  public static void speechAdaptation(String uriPath) throws IOException {
    // Initialize client that will be used to send requests. This client only needs to be created
    // once, and can be reused for multiple requests. After completing all of your requests, call
    // the "close" method on the client to safely clean up any remaining background resources.
    try (SpeechClient speechClient = SpeechClient.create()) {

      // Provides "hints" to the speech recognizer to favor specific words and phrases in the
      // results.
      // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.SpeechContext
      SpeechContext speechContext =
          SpeechContext.newBuilder().addPhrases("Brooklyn Bridge").setBoost(20.0F).build();
      // Configure recognition config to match your audio file.
      RecognitionConfig config =
          RecognitionConfig.newBuilder()
              .setEncoding(RecognitionConfig.AudioEncoding.MP3)
              .setSampleRateHertz(44100)
              .setLanguageCode("en-US")
              .addSpeechContexts(speechContext)
              .build();
      // Set the path to your audio file
      RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(uriPath).build();

      // Make the request
      RecognizeRequest request =
          RecognizeRequest.newBuilder().setConfig(config).setAudio(audio).build();

      // Display the results
      RecognizeResponse response = speechClient.recognize(request);
      for (SpeechRecognitionResult result : response.getResultsList()) {
        // First alternative is the most probable result
        SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
        System.out.printf("Transcript: %s\n", alternative.getTranscript());
      }
    }
  }
}

Node.js


const speech = require('@google-cloud/speech').v1p1beta1;

/**
 * Performs synchronous speech recognition with speech adaptation.
 *
 * @param sampleRateHertz {number} Sample rate in Hertz of the audio data sent in all
 * `RecognitionAudio` messages. Valid values are: 8000-48000.
 * @param languageCode {string} The language of the supplied audio.
 * @param phrase {string} Phrase "hints" help Speech-to-Text API recognize the specified phrases from
 * your audio data.
 * @param boost {number} Positive value will increase the probability that a specific phrase will be
 * recognized over other similar sounding phrases.
 * @param uriPath {string} Path to the audio file stored on GCS.
 */
function sampleRecognize(
  sampleRateHertz,
  languageCode,
  phrase,
  boost,
  uriPath
) {
  const client = new speech.SpeechClient();
  // const sampleRateHertz = 44100;
  // const languageCode = 'en-US';
  // const phrase = 'Brooklyn Bridge';
  // const boost = 20.0;
  // const uriPath = 'gs://cloud-samples-data/speech/brooklyn_bridge.mp3';
  const encoding = 'MP3';
  const phrases = [phrase];
  const speechContextsElement = {
    phrases: phrases,
    boost: boost,
  };
  const speechContexts = [speechContextsElement];
  const config = {
    encoding: encoding,
    sampleRateHertz: sampleRateHertz,
    languageCode: languageCode,
    speechContexts: speechContexts,
  };
  const audio = {
    uri: uriPath,
  };
  const request = {
    config: config,
    audio: audio,
  };
  client
    .recognize(request)
    .then(responses => {
      const response = responses[0];
      for (const result of response.results) {
        // First alternative is the most probable result
        const alternative = result.alternatives[0];
        console.log(`Transcript: ${alternative.transcript}`);
      }
    })
    .catch(err => {
      console.error(err);
    });
}

Python

from google.cloud import speech_v1p1beta1
from google.cloud.speech_v1p1beta1 import enums

def sample_recognize(storage_uri, phrase):
    """
    Transcribe a short audio file with speech adaptation.

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
      phrase Phrase "hints" help recognize the specified phrases from your audio.
    """

    client = speech_v1p1beta1.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.mp3'
    # phrase = 'Brooklyn Bridge'
    phrases = [phrase]

    # Hint Boost. This value increases the probability that a specific
    # phrase will be recognized over other similar sounding phrases.
    # The higher the boost, the higher the chance of false positive
    # recognition as well. Can accept wide range of positive values.
    # Most use cases are best served with values between 0 and 20.
    # Using a binary search happroach may help you find the optimal value.
    boost = 20.0
    speech_contexts_element = {"phrases": phrases, "boost": boost}
    speech_contexts = [speech_contexts_element]

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = 44100

    # The language of the supplied audio
    language_code = "en-US"

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.MP3
    config = {
        "speech_contexts": speech_contexts,
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "encoding": encoding,
    }
    audio = {"uri": storage_uri}

    response = client.recognize(config, audio)
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))

Qué sigue