음성 적응이 포함된 인식 요청 보내기

음성 적응을 사용하면 Speech-to-Text(STT)에서 가져온 텍스트 변환 결과의 정확도를 향상시킬 수 있습니다. 음성 적응 기능을 사용하면 오디오 데이터에서 STT가 그렇지 않았다면 제안했을 다른 대안보다 더 자주 인식해야 하는 단어 또는 문구를 지정할 수 있습니다. 음성 적응은 다음과 같은 경우에 텍스트 변환 정확도를 향상시키는 데 특히 유용합니다.

  1. 오디오에 매우 자주 나타날 가능성이 높은 단어/문구가 포함되어 있습니다.
  2. 오디오에 고유 이름처럼 일반적인 오디오에서는 잘 쓰이지 않는 사용 빈도가 낮은 단어가 포함될 가능성이 높습니다.
  3. 오디오에 노이즈가 있거나 명확하게 들리지 않습니다.

음성 적응 및 음성 적응 부스트 권장사항 정보는 음성 적응 개념 페이지를 참조하세요.

다음 코드 샘플은 Speech-to-Text API로 전송되는 요청에 음성 컨텍스트를 설정하여 텍스트 변환 정확도를 향상시키는 방법을 보여줍니다. 사용 중인 언어에 사용 가능한 클래스 목록은 클래스 토큰 페이지를 참조하세요.

REST 및 명령줄

자세한 내용은 speech:recognize API 엔드포인트를 참조하세요.

아래의 요청 데이터를 사용하기 전에 다음을 바꿉니다.

  • language-code: 오디오 클립에서 사용된 언어의 BCP-47 코드입니다.
  • phrases-to-boost: Speech-to-Text에서 부스트할 구문이 포함된 문자열 배열입니다.
  • storage-bucket: Cloud Storage 버킷입니다.
  • input-audio: 변환할 오디오 데이터입니다.

HTTP 메서드 및 URL:

POST https://speech.googleapis.com/v1p1beta1/speech:recognize

JSON 요청 본문:

{
  "config":{
      "languageCode":"language-code",
      "speechContexts":[{
          "phrases":[phrases-to-boost],
          "boost": 2
      }]
  },
  "audio":{
    "uri":"gs://storage-bucket/input-audio"
  }
}

요청을 보내려면 다음 옵션 중 하나를 펼칩니다.

다음과 비슷한 JSON 응답이 표시됩니다.

{
  "results": [
    {
      "alternatives": [
        {
          "transcript": "When deciding whether to bring an umbrella, I consider the weather",
          "confidence": 0.9463943
        }
      ],
      "languageCode": "en-us"
    }
  ]
}

Java

import com.google.cloud.speech.v1.RecognitionAudio;
import com.google.cloud.speech.v1.RecognitionConfig;
import com.google.cloud.speech.v1.RecognizeRequest;
import com.google.cloud.speech.v1.RecognizeResponse;
import com.google.cloud.speech.v1.SpeechClient;
import com.google.cloud.speech.v1.SpeechContext;
import com.google.cloud.speech.v1.SpeechRecognitionAlternative;
import com.google.cloud.speech.v1.SpeechRecognitionResult;
import java.io.IOException;

class TranscribeContextClasses {

  void transcribeContextClasses() throws IOException {
    // TODO(developer): Replace these variables before running the sample.
    String storageUri = "gs://YOUR_BUCKET_ID/path/to/your/file.wav";
    transcribeContextClasses(storageUri);
  }

  // Provides "hints" to the speech recognizer to favor specific classes of words in the results.
  static void transcribeContextClasses(String storageUri) throws IOException {
    // Initialize client that will be used to send requests. This client only needs to be created
    // once, and can be reused for multiple requests. After completing all of your requests, call
    // the "close" method on the client to safely clean up any remaining background resources.
    try (SpeechClient speechClient = SpeechClient.create()) {
      // SpeechContext: to configure your speech_context see:
      // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
      // Full list of supported phrases (class tokens) here:
      // https://cloud.google.com/speech-to-text/docs/class-tokens
      SpeechContext speechContext = SpeechContext.newBuilder().addPhrases("$TIME").build();

      // RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
      // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
      RecognitionConfig config =
          RecognitionConfig.newBuilder()
              .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
              .setSampleRateHertz(8000)
              .setLanguageCode("en-US")
              .addSpeechContexts(speechContext)
              .build();

      // Set the path to your audio file
      RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(storageUri).build();

      // Build the request
      RecognizeRequest request =
          RecognizeRequest.newBuilder().setConfig(config).setAudio(audio).build();

      // Perform the request
      RecognizeResponse response = speechClient.recognize(request);

      for (SpeechRecognitionResult result : response.getResultsList()) {
        // First alternative is the most probable result
        SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
        System.out.printf("Transcript: %s\n", alternative.getTranscript());
      }
    }
  }
}

Node.js

// Provides "hints" to the speech recognizer to favor
// specific classes of words in the results.

// Imports the Google Cloud client library
const speech = require('@google-cloud/speech');

// Creates a client
const client = new speech.SpeechClient();

/**
 * Sends a recognize request for the audio at `storageUri` with the "$TIME"
 * class token as a speech context phrase, then logs the first alternative
 * transcript of every result.
 *
 * @param {string} [storageUri] URI of the audio file to transcribe. Defaults
 *     to a placeholder that must be replaced before running the sample.
 * @returns {Promise<void>}
 */
async function transcribeContextClasses(
  // TODO(developer): Replace this value before running the sample.
  storageUri = 'gs://YOUR_BUCKET_ID/path/to/your/file.wav'
) {
  const audio = {
    uri: storageUri,
  };

  // SpeechContext: to configure your speech_context see:
  // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
  // Full list of supported phrases(class tokens) here:
  // https://cloud.google.com/speech-to-text/docs/class-tokens
  const speechContext = {
    phrases: ['$TIME'],
  };

  // RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
  // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
  const config = {
    encoding: 'LINEAR16',
    sampleRateHertz: 8000,
    languageCode: 'en-US',
    speechContexts: [speechContext],
  };

  const request = {
    config,
    audio,
  };

  // Detects speech in the audio file.
  const [response] = await client.recognize(request);
  response.results.forEach((result, index) => {
    // Only the first alternative of each result is printed.
    const transcript = result.alternatives[0].transcript;
    console.log('-'.repeat(20));
    console.log(`First alternative of result ${index}`);
    console.log(`Transcript: ${transcript}`);
  });
}

// Any rejection (request failure, bad URI, ...) is reported on stderr.
transcribeContextClasses().catch(console.error);

Python

from google.cloud import speech

# Create the client used to send the recognition request.
client = speech.SpeechClient()

# TODO(developer): Replace this variable before running the sample.
storage_uri = "gs://YOUR_BUCKET_ID/path/to/your/file.wav"
audio = speech.RecognitionAudio(uri=storage_uri)

# SpeechContext: to configure your speech_context see:
# https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
# Full list of supported phrases (class tokens) here:
# https://cloud.google.com/speech-to-text/docs/class-tokens
speech_context = speech.SpeechContext(phrases=["$TIME"])

# RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
# https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=8000,
    language_code="en-US",
    speech_contexts=[speech_context],
)

response = client.recognize(request={"config": config, "audio": audio})

# Print only the first alternative of each result.
for i, result in enumerate(response.results):
    alternative = result.alternatives[0]
    print("-" * 20)
    print(f"First alternative of result {i}")
    print(f"Transcript: {alternative.transcript}")

Go


import (
	"context"
	"fmt"
	"io"
	"strings"

	speech "cloud.google.com/go/speech/apiv1"
	speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
)

// contextClasses provides "hints" to the speech recognizer
// to favour specific classes of words in the results.
//
// It sends a Recognize request for the audio at gcsURI with a SpeechContext
// containing the "$TIME" class token, then writes every alternative
// transcript of each result to w. Errors from creating the client or from
// the Recognize call are returned wrapped (%w), so callers can still inspect
// the underlying error with errors.Is / errors.As.
func contextClasses(w io.Writer, gcsURI string) error {
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		return fmt.Errorf("NewClient: %w", err)
	}
	defer client.Close()

	// SpeechContext: to configure your speech_context see:
	// https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
	// Full list of supported phrases (class tokens) here:
	// https://cloud.google.com/speech-to-text/docs/class-tokens
	// In this instance, the use of "$TIME" favours time of day detections.
	speechContext := &speechpb.SpeechContext{Phrases: []string{"$TIME"}}

	resp, err := client.Recognize(ctx, &speechpb.RecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:        speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz: 8000,
			LanguageCode:    "en-US",
			SpeechContexts:  []*speechpb.SpeechContext{speechContext},
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Uri{Uri: gcsURI},
		},
	})
	if err != nil {
		return fmt.Errorf("Recognize: %w", err)
	}

	// Print the results, numbering results and alternatives from 1.
	for i, result := range resp.Results {
		fmt.Fprintf(w, "%s\n", strings.Repeat("-", 20))
		fmt.Fprintf(w, "Result %d\n", i+1)
		for j, alternative := range result.Alternatives {
			fmt.Fprintf(w, "Alternative %d: %s\n", j+1, alternative.Transcript)
		}
	}
	return nil
}