通过语音自适应,您可以提高从 Speech-to-Text 获得的转录结果的准确率。借助语音自适应功能,您可以指定 Speech-to-Text 应在您的音频数据中比其他可能的替代字词更频繁地识别的字词和/或短语。在以下情况中,语音自适应可显著提升转录的准确性:
- 您的音频中包含可能会频繁出现的字词/短语。
- 您的音频可能包含罕见的字词(例如专有名词)或者常规用语中不使用的字词。
- 您的音频包含噪音或不够清晰。
如需了解语音自适应和语音自适应增强型最佳做法,请参阅语音自适应概念页面。
以下代码示例演示了如何在发送至 Speech-to-Text API 的请求中设置语音上下文来提高转录准确率。请查看类令牌页面,了解适用于您所用语言的类列表。
REST 和命令行
如需了解完整的详细信息,请参阅 speech:recognize
API 端点。
在使用任何请求数据之前,请先进行以下替换:
- language-code:音频剪辑中所用语言的 BCP-47 代码。
- phrases-to-boost:您希望 Speech-to-Text 增强的短语或短语组(以一组字符串的形式提供)。
- storage-bucket:包含音频文件的 Cloud Storage 存储分区。
- input-audio:您要转录的音频数据。
HTTP 方法和网址:
POST https://speech.googleapis.com/v1p1beta1/speech:recognize
请求 JSON 正文:
{ "config":{ "languageCode":"language-code", "speechContexts":[{ "phrases":[phrases-to-boost], "boost": 2 }] }, "audio":{ "uri":"gs://storage-bucket/input-audio" } }
如需发送您的请求,请展开以下选项之一:
curl(Linux、macOS 或 Cloud Shell)
将请求正文保存在名为 request.json
的文件中,然后执行以下命令:
curl -X POST \
-H "Authorization: Bearer "$(gcloud auth application-default print-access-token) \
-H "Content-Type: application/json; charset=utf-8" \
-d @request.json \
"https://speech.googleapis.com/v1p1beta1/speech:recognize"
PowerShell (Windows)
将请求正文保存在名为 request.json
的文件中,然后执行以下命令:
$cred = gcloud auth application-default print-access-token
$headers = @{ "Authorization" = "Bearer $cred" }
Invoke-WebRequest `
-Method POST `
-Headers $headers `
-ContentType: "application/json; charset=utf-8" `
-InFile request.json `
-Uri "https://speech.googleapis.com/v1p1beta1/speech:recognize" | Select-Object -Expand Content
您应会收到如下所示的 JSON 响应:
{ "results": [ { "alternatives": [ { "transcript": "When deciding whether to bring an umbrella, I consider the weather", "confidence": 0.9463943 } ], "languageCode": "en-us" } ] }
Java
import com.google.cloud.speech.v1.RecognitionAudio;
import com.google.cloud.speech.v1.RecognitionConfig;
import com.google.cloud.speech.v1.RecognizeRequest;
import com.google.cloud.speech.v1.RecognizeResponse;
import com.google.cloud.speech.v1.SpeechClient;
import com.google.cloud.speech.v1.SpeechContext;
import com.google.cloud.speech.v1.SpeechRecognitionAlternative;
import com.google.cloud.speech.v1.SpeechRecognitionResult;
import java.io.IOException;
class TranscribeContextClasses {

  void transcribeContextClasses() throws IOException {
    // TODO(developer): Replace these variables before running the sample.
    String storageUri = "gs://YOUR_BUCKET_ID/path/to/your/file.wav";
    transcribeContextClasses(storageUri);
  }

  // Provides "hints" to the speech recognizer to favor specific classes of words in the results.
  static void transcribeContextClasses(String storageUri) throws IOException {
    // Initialize client that will be used to send requests. This client only needs to be created
    // once, and can be reused for multiple requests. The try-with-resources statement calls the
    // "close" method on the client to safely clean up any remaining background resources.
    try (SpeechClient speechClient = SpeechClient.create()) {
      // SpeechContext: to configure your speech_context see:
      // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
      // Full list of supported phrases (class tokens) here:
      // https://cloud.google.com/speech-to-text/docs/class-tokens
      SpeechContext timeContext = SpeechContext.newBuilder().addPhrases("$TIME").build();

      // RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
      // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
      RecognitionConfig recognitionConfig =
          RecognitionConfig.newBuilder()
              .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
              .setSampleRateHertz(8000)
              .setLanguageCode("en-US")
              .addSpeechContexts(timeContext)
              .build();

      // Set the path to your audio file
      RecognitionAudio recognitionAudio =
          RecognitionAudio.newBuilder().setUri(storageUri).build();

      // Build the request
      RecognizeRequest recognizeRequest =
          RecognizeRequest.newBuilder()
              .setConfig(recognitionConfig)
              .setAudio(recognitionAudio)
              .build();

      // Perform the request
      RecognizeResponse response = speechClient.recognize(recognizeRequest);

      for (SpeechRecognitionResult result : response.getResultsList()) {
        // First alternative is the most probable result
        SpeechRecognitionAlternative best = result.getAlternativesList().get(0);
        System.out.printf("Transcript: %s\n", best.getTranscript());
      }
    }
  }
}
Node.js
// Provides "hints" to the speech recognizer to favor
// specific classes of words in the results.

// Imports the Google Cloud client library
const speech = require('@google-cloud/speech');

// Creates a client
const client = new speech.SpeechClient();

async function transcribeContextClasses() {
  // TODO(developer): Replace with the Cloud Storage URI of your audio file.
  // (Previously this variable was only shown in a comment, so the sample
  // crashed with a ReferenceError when run.)
  const storageUri = 'gs://YOUR_BUCKET_ID/path/to/your/file.wav';

  // The audio to transcribe, referenced by its Cloud Storage URI.
  const audio = {
    uri: storageUri,
  };

  // SpeechContext: to configure your speech_context see:
  // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
  // Full list of supported phrases (class tokens) here:
  // https://cloud.google.com/speech-to-text/docs/class-tokens
  // In this instance, the use of "$TIME" favors time-of-day detections.
  const speechContext = {
    phrases: ['$TIME'],
  };

  // RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
  // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
  const config = {
    encoding: 'LINEAR16',
    sampleRateHertz: 8000,
    languageCode: 'en-US',
    speechContexts: [speechContext],
  };

  const request = {
    config: config,
    audio: audio,
  };

  // Detects speech in the audio file.
  const [response] = await client.recognize(request);

  response.results.forEach((result, index) => {
    const transcript = result.alternatives[0].transcript;
    console.log('-'.repeat(20));
    console.log(`First alternative of result ${index}`);
    console.log(`Transcript: ${transcript}`);
  });
}

// Await the async call's outcome so failures are reported instead of
// becoming an unhandled promise rejection.
transcribeContextClasses().catch(console.error);
Python
from google.cloud import speech

client = speech.SpeechClient()

# TODO(developer): Replace with the Cloud Storage URI of the audio file to
# transcribe. (Previously this variable only appeared in a comment, so the
# sample failed with a NameError when run.)
storage_uri = "gs://YOUR_BUCKET_ID/path/to/your/file.wav"

# The audio to transcribe, referenced by its Cloud Storage URI.
audio = speech.RecognitionAudio(uri=storage_uri)

# SpeechContext: to configure your speech_context see:
# https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
# Full list of supported phrases (class tokens) here:
# https://cloud.google.com/speech-to-text/docs/class-tokens
# In this instance, the use of "$TIME" favors time-of-day detections.
speech_context = speech.SpeechContext(phrases=["$TIME"])

# RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
# https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=8000,
    language_code="en-US",
    speech_contexts=[speech_context],
)

response = client.recognize(config=config, audio=audio)

for i, result in enumerate(response.results):
    # First alternative is the most probable result.
    alternative = result.alternatives[0]
    print("-" * 20)
    print("First alternative of result {}".format(i))
    print("Transcript: {}".format(alternative.transcript))
Go
import (
"context"
"fmt"
"io"
"strings"
speech "cloud.google.com/go/speech/apiv1"
speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
)
// contextClasses provides "hints" to the speech recognizer
// to favour specific classes of words in the results.
// Results are written to w; gcsURI is the Cloud Storage URI
// (gs://...) of the audio file to transcribe.
func contextClasses(w io.Writer, gcsURI string) error {
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		// %w preserves the error chain for errors.Is/errors.As;
		// the formatted message is unchanged from %v.
		return fmt.Errorf("NewClient: %w", err)
	}
	defer client.Close()

	// SpeechContext: to configure your speech_context see:
	// https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
	// Full list of supported phrases (class tokens) here:
	// https://cloud.google.com/speech-to-text/docs/class-tokens
	// In this instance, the use of "$TIME" favours time of day detections.
	speechContext := &speechpb.SpeechContext{Phrases: []string{"$TIME"}}

	resp, err := client.Recognize(ctx, &speechpb.RecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:        speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz: 8000,
			LanguageCode:    "en-US",
			SpeechContexts:  []*speechpb.SpeechContext{speechContext},
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Uri{Uri: gcsURI},
		},
	})
	if err != nil {
		return fmt.Errorf("Recognize: %w", err)
	}

	// Print the results.
	for i, result := range resp.Results {
		fmt.Fprintf(w, "%s\n", strings.Repeat("-", 20))
		fmt.Fprintf(w, "Result %d\n", i+1)
		for j, alternative := range result.Alternatives {
			fmt.Fprintf(w, "Alternative %d: %s\n", j+1, alternative.Transcript)
		}
	}
	return nil
}