通过增强型语音自适应提高转录准确率

增强型语音自适应是语音自适应的一项可选功能。使用增强型功能,您可以根据音频数据中字词和/或短语的识别频率,为其添加数值权重。

在同时满足以下条件时,我们建议您实现增强型功能:1) 您已实现语音自适应;2) 您希望调整语音自适应对转录结果的影响强度。

如需了解语音自适应和增强型语音自适应的最佳做法,请参阅语音自适应概念页面。

以下代码示例演示了如何使用增强型语音自适应发送请求。

REST

如需详细了解 API 端点,请参阅 speech:recognize。

在使用任何请求数据之前,请先进行以下替换:

  • language-code:音频剪辑中所用语言的 BCP-47 代码。
  • phrases-to-boost:您希望 Speech-to-Text 增强的短语或短语组(以一组字符串的形式提供)。
  • storage-bucket:Cloud Storage 存储分区。
  • input-audio:您要转录的音频数据。

HTTP 方法和网址:

POST https://speech.googleapis.com/v1p1beta1/speech:recognize

请求 JSON 正文:

{
  "config":{
      "languageCode":"language-code",
      "speechContexts":[{
          "phrases":[phrases-to-boost],
          "boost": 2
      }]
  },
  "audio":{
    "uri":"gs://storage-bucket/input-audio"
  }
}

如需发送您的请求,请展开以下选项之一:

您应该收到类似以下内容的 JSON 响应:

{
  "results": [
    {
      "alternatives": [
        {
          "transcript": "When deciding whether to bring an umbrella, I consider the weather",
          "confidence": 0.9463943
        }
      ],
      "languageCode": "en-us"
    }
  ]
}

Java

import com.google.cloud.speech.v1p1beta1.RecognitionAudio;
import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
import com.google.cloud.speech.v1p1beta1.RecognizeRequest;
import com.google.cloud.speech.v1p1beta1.RecognizeResponse;
import com.google.cloud.speech.v1p1beta1.SpeechClient;
import com.google.cloud.speech.v1p1beta1.SpeechContext;
import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
import com.google.cloud.speech.v1p1beta1.SpeechRecognitionResult;
import java.io.IOException;

public class SpeechAdaptation {

  public void speechAdaptation() throws IOException {
    String uriPath = "gs://cloud-samples-data/speech/brooklyn_bridge.mp3";
    speechAdaptation(uriPath);
  }

  public static void speechAdaptation(String uriPath) throws IOException {
    // Initialize client that will be used to send requests. This client only needs to be created
    // once, and can be reused for multiple requests. After completing all of your requests, call
    // the "close" method on the client to safely clean up any remaining background resources.
    try (SpeechClient speechClient = SpeechClient.create()) {

      // Provides "hints" to the speech recognizer to favor specific words and phrases in the
      // results.
      // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1p1beta1#google.cloud.speech.v1p1beta1.SpeechContext
      SpeechContext speechContext =
          SpeechContext.newBuilder().addPhrases("Brooklyn Bridge").setBoost(20.0F).build();
      // Configure recognition config to match your audio file.
      RecognitionConfig config =
          RecognitionConfig.newBuilder()
              .setEncoding(RecognitionConfig.AudioEncoding.MP3)
              .setSampleRateHertz(44100)
              .setLanguageCode("en-US")
              .addSpeechContexts(speechContext)
              .build();
      // Set the path to your audio file
      RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(uriPath).build();

      // Make the request
      RecognizeRequest request =
          RecognizeRequest.newBuilder().setConfig(config).setAudio(audio).build();

      // Display the results
      RecognizeResponse response = speechClient.recognize(request);
      for (SpeechRecognitionResult result : response.getResultsList()) {
        // First alternative is the most probable result
        SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
        System.out.printf("Transcript: %s\n", alternative.getTranscript());
      }
    }
  }
}

Node.js


const speech = require('@google-cloud/speech').v1p1beta1;

/**
 * Sends a synchronous speech recognition request that uses speech adaptation with a boost.
 *
 * A single speech context is built from `phrase` and `boost`, the request is sent for the MP3
 * audio at `uriPath`, and the first transcript of every result is logged. Errors from the
 * request or from result handling are written to `console.error`.
 *
 * @param sampleRateHertz {number} Sample rate in Hertz of the audio data sent in all
 * `RecognitionAudio` messages, e.g. 44100. Valid values are: 8000-48000.
 * @param languageCode {string} The language of the supplied audio, e.g. 'en-US'.
 * @param phrase {string} Phrase "hint" that helps Speech-to-Text API recognize the specified
 * phrase in your audio data, e.g. 'Brooklyn Bridge'.
 * @param boost {number} Positive value that increases the probability that the phrase is
 * recognized over other similar sounding phrases, e.g. 20.0.
 * @param uriPath {string} Path to the audio file stored on GCS, e.g.
 * 'gs://cloud-samples-data/speech/brooklyn_bridge.mp3'.
 */
function sampleRecognize(
  sampleRateHertz,
  languageCode,
  phrase,
  boost,
  uriPath
) {
  // Logs the first (most probable) alternative of each result.
  const logTranscripts = responses => {
    const {results} = responses[0];
    for (const entry of results) {
      const best = entry.alternatives[0];
      console.log(`Transcript: ${best.transcript}`);
    }
  };

  const reportFailure = err => {
    console.error(err);
  };

  const client = new speech.SpeechClient();

  const request = {
    config: {
      encoding: 'MP3',
      sampleRateHertz,
      languageCode,
      // One context carrying the hinted phrase together with its boost.
      speechContexts: [{phrases: [phrase], boost}],
    },
    audio: {uri: uriPath},
  };

  client.recognize(request).then(logTranscripts).catch(reportFailure);
}

Python

from google.cloud import speech_v1p1beta1 as speech

def sample_recognize(
    storage_uri,
    phrase,
    boost=20.0,
    sample_rate_hertz=44100,
    language_code="en-US",
):
    """
    Transcribe a short audio file with boosted speech adaptation.

    Sends one synchronous recognize request for MP3 audio and prints the
    transcript of the first alternative of every result.

    Args:
      storage_uri: URI for audio file in Cloud Storage, e.g.
        gs://cloud-samples-data/speech/brooklyn_bridge.mp3
      phrase: Phrase "hint" that helps recognize the specified phrase from
        your audio, e.g. 'Brooklyn Bridge'.
      boost: Hint boost. Increases the probability that the phrase will be
        recognized over other similar sounding phrases. The higher the
        boost, the higher the chance of false positive recognition as well.
        Can accept a wide range of positive values; most use cases are best
        served with values between 0 and 20. Using a binary search approach
        may help you find the optimal value. Defaults to 20.0.
      sample_rate_hertz: Sample rate in Hertz of the audio data sent.
        Defaults to 44100.
      language_code: The language of the supplied audio. Defaults to "en-US".
    """

    client = speech.SpeechClient()

    # A single speech context carries the hinted phrase together with its boost.
    speech_contexts = [{"phrases": [phrase], "boost": boost}]

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats
    encoding = speech.RecognitionConfig.AudioEncoding.MP3

    config = {
        "speech_contexts": speech_contexts,
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "encoding": encoding,
    }
    audio = {"uri": storage_uri}

    response = client.recognize(config=config, audio=audio)

    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        print("Transcript: {}".format(alternative.transcript))

后续步骤