Sending a recognition request with speech adaptation

You can use speech adaptation to improve the accuracy of the transcription results that you get from Speech-to-Text.

The following code samples demonstrate how to improve transcription accuracy by setting a speech context in a request sent to the Speech-to-Text API. For a list of the classes available for your language, see the class tokens page.

REST & command line

For details, see the speech:recognize API endpoint.

Before using any of the request data below, make the following replacements:

  • language-code: the BCP-47 code of the language spoken in your audio clip.
  • phrases-to-boost: an array of strings containing the phrases that you want Speech-to-Text to boost.
  • storage-bucket: your Cloud Storage bucket.
  • input-audio: the audio data that you want to transcribe.

HTTP method and URL:

POST https://speech.googleapis.com/v1p1beta1/speech:recognize

JSON request body:

{
  "config":{
      "languageCode":"language-code",
      "speechContexts":[{
          "phrases":[phrases-to-boost],
          "boost": 2
      }]
  },
  "audio":{
    "uri":"gs:storage-bucket/input-file"
  }
}
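
For example, a request that boosts the phrases "weather" and "umbrella" (illustrative values; substitute your own) could look like the following. The boost field takes a positive number; higher values make the recognizer more likely to favor the listed phrases:

{
  "config":{
      "languageCode":"en-US",
      "speechContexts":[{
          "phrases":["weather", "umbrella"],
          "boost": 2
      }]
  },
  "audio":{
    "uri":"gs://my-bucket/my-audio.wav"
  }
}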

To send your request, save the request body above in a file named request.json, then use a tool such as curl. The following is a minimal sketch, assuming you have the gcloud command-line tool installed and have authenticated with application-default credentials:
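
curl -X POST \
     -H "Authorization: Bearer $(gcloud auth application-default print-access-token)" \
     -H "Content-Type: application/json; charset=utf-8" \
     --data @request.json \
     "https://speech.googleapis.com/v1p1beta1/speech:recognize"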

You should receive a JSON response similar to the following:

{
  "results": [
    {
      "alternatives": [
        {
          "transcript": "When deciding whether to bring an umbrella, I consider the weather",
          "confidence": 0.9463943
        }
      ],
      "languageCode": "en-us"
    }
  ]
}

Java

import com.google.cloud.speech.v1.RecognitionAudio;
import com.google.cloud.speech.v1.RecognitionConfig;
import com.google.cloud.speech.v1.RecognizeRequest;
import com.google.cloud.speech.v1.RecognizeResponse;
import com.google.cloud.speech.v1.SpeechClient;
import com.google.cloud.speech.v1.SpeechContext;
import com.google.cloud.speech.v1.SpeechRecognitionAlternative;
import com.google.cloud.speech.v1.SpeechRecognitionResult;
import java.io.IOException;

class TranscribeContextClasses {

  void transcribeContextClasses() throws IOException {
    // TODO(developer): Replace these variables before running the sample.
    String storageUri = "gs://YOUR_BUCKET_ID/path/to/your/file.wav";
    transcribeContextClasses(storageUri);
  }

  // Provides "hints" to the speech recognizer to favor specific classes of words in the results.
  static void transcribeContextClasses(String storageUri) throws IOException {
    // Initialize client that will be used to send requests. This client only needs to be created
    // once, and can be reused for multiple requests. After completing all of your requests, call
    // the "close" method on the client to safely clean up any remaining background resources.
    try (SpeechClient speechClient = SpeechClient.create()) {
      // SpeechContext: to configure your speech_context see:
      // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
      // Full list of supported phrases (class tokens) here:
      // https://cloud.google.com/speech-to-text/docs/class-tokens
      SpeechContext speechContext = SpeechContext.newBuilder().addPhrases("$TIME").build();

      // RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
      // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
      RecognitionConfig config =
          RecognitionConfig.newBuilder()
              .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
              .setSampleRateHertz(8000)
              .setLanguageCode("en-US")
              .addSpeechContexts(speechContext)
              .build();

      // Set the path to your audio file
      RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(storageUri).build();

      // Build the request
      RecognizeRequest request =
          RecognizeRequest.newBuilder().setConfig(config).setAudio(audio).build();

      // Perform the request
      RecognizeResponse response = speechClient.recognize(request);

      for (SpeechRecognitionResult result : response.getResultsList()) {
        // First alternative is the most probable result
        SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
        System.out.printf("Transcript: %s\n", alternative.getTranscript());
      }
    }
  }
}

Node.js

// Provides "hints" to the speech recognizer to favor
// specific classes of words in the results.

// Imports the Google Cloud client library
const speech = require('@google-cloud/speech');

// Creates a client
const client = new speech.SpeechClient();

async function transcribeContextClasses() {
  // TODO(developer): Replace this variable before running the sample.
  const storageUri = 'gs://YOUR_BUCKET_ID/path/to/your/file.wav';

  const audio = {
    uri: storageUri,
  };

  // SpeechContext: to configure your speech_context see:
  // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
  // Full list of supported phrases (class tokens) here:
  // https://cloud.google.com/speech-to-text/docs/class-tokens
  const speechContext = {
    phrases: ['$TIME'],
  };

  // RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
  // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
  const config = {
    encoding: 'LINEAR16',
    sampleRateHertz: 8000,
    languageCode: 'en-US',
    speechContexts: [speechContext],
  };

  const request = {
    config: config,
    audio: audio,
  };

  // Detects speech in the audio file.
  const [response] = await client.recognize(request);
  response.results.forEach((result, index) => {
    const transcript = result.alternatives[0].transcript;
    console.log('-'.repeat(20));
    console.log(`First alternative of result ${index}`);
    console.log(`Transcript: ${transcript}`);
  });
}

transcribeContextClasses().catch(console.error);

Python

from google.cloud import speech

# This sample targets google-cloud-speech >= 2.0.
client = speech.SpeechClient()

# TODO(developer): Replace this variable before running the sample.
storage_uri = "gs://YOUR_BUCKET_ID/path/to/your/file.wav"

audio = speech.RecognitionAudio(uri=storage_uri)

# SpeechContext: to configure your speech_context see:
# https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
# Full list of supported phrases (class tokens) here:
# https://cloud.google.com/speech-to-text/docs/class-tokens
speech_context = speech.SpeechContext(phrases=["$TIME"])

# RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
# https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=8000,
    language_code="en-US",
    speech_contexts=[speech_context],
)

response = client.recognize(config=config, audio=audio)

for i, result in enumerate(response.results):
    alternative = result.alternatives[0]
    print("-" * 20)
    print(f"First alternative of result {i}")
    print(f"Transcript: {alternative.transcript}")

C#

using Google.Cloud.Speech.V1;
using System;

namespace GoogleCloudSamples
{
    public class SpeechContextClasses
    {
        /// <summary>
        /// Provides "hints" to the speech recognizer to favor specific classes of words in the results.
        /// </summary>
        /// <param name="uriPath">Path to the audio file stored on GCS.</param>
        public static object TranscribeContextClasses(
            string uriPath = "gs://cloud-samples-data/speech/brooklyn_bridge.mp3")
        {
            var speechClient = SpeechClient.Create();
            SpeechContext speechContext = new SpeechContext();
            // SpeechContext: to configure your speech_context see:
            // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
            // Full list of supported phrases (class tokens) here:
            // https://cloud.google.com/speech-to-text/docs/class-tokens
            speechContext.Phrases.Add("$TIME");

            // RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
            // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
            RecognitionConfig recognitionConfig = new RecognitionConfig
            {
                Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
                SampleRateHertz = 8000,
                LanguageCode = "en-US"
            };
            recognitionConfig.SpeechContexts.Add(speechContext);

            // Set the path to your audio file
            RecognitionAudio audio = new RecognitionAudio
            {
                Uri = uriPath
            };

            // Build the request
            RecognizeRequest request = new RecognizeRequest
            {
                Config = recognitionConfig,
                Audio = audio
            };

            // Perform the request
            var response = speechClient.Recognize(request);
            foreach (SpeechRecognitionResult result in response.Results)
            {
                // First alternative is the most probable result
                var alternative = result.Alternatives[0];
                Console.WriteLine($"Transcript: {alternative.Transcript}");
            }
            return 0;
        }
    }
}

Go

import (
	"context"
	"fmt"
	"io"
	"strings"

	speech "cloud.google.com/go/speech/apiv1"
	speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
)

// contextClasses provides "hints" to the speech recognizer
// to favour specific classes of words in the results.
func contextClasses(w io.Writer, gcsURI string) error {
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		return fmt.Errorf("NewClient: %v", err)
	}
	defer client.Close()

	// SpeechContext: to configure your speech_context see:
	// https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
	// Full list of supported phrases (class tokens) here:
	// https://cloud.google.com/speech-to-text/docs/class-tokens
	// In this instance, the use of "$TIME" favours time of day detections.
	speechContext := &speechpb.SpeechContext{Phrases: []string{"$TIME"}}
	resp, err := client.Recognize(ctx, &speechpb.RecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:        speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz: 8000,
			LanguageCode:    "en-US",
			SpeechContexts:  []*speechpb.SpeechContext{speechContext},
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Uri{Uri: gcsURI},
		},
	})
	if err != nil {
		return fmt.Errorf("Recognize: %v", err)
	}

	// Print the results.
	for i, result := range resp.Results {
		fmt.Fprintf(w, "%s\n", strings.Repeat("-", 20))
		fmt.Fprintf(w, "Result %d\n", i+1)
		for j, alternative := range result.Alternatives {
			fmt.Fprintf(w, "Alternative %d: %s\n", j+1, alternative.Transcript)
		}
	}
	return nil
}