음성 적응이 포함된 인식 요청 보내기

음성 적응을 사용하면 Speech-to-Text에서 가져온 스크립트 작성 결과의 정확도를 향상시킬 수 있습니다. 음성 적응 기능을 사용하면 오디오 데이터에서 다른 경우에 추천될 수 있는 대안보다 STT가 더 자주 인식해야 하는 단어 또는 문구를 지정할 수 있습니다. 음성 적응은 다음과 같은 경우에 스크립트 작성 정확도를 향상시키는 데 특히 유용합니다.

  1. 오디오에 매우 자주 나타날 가능성이 높은 단어/문구가 포함되어 있는 경우
  2. 고유 이름과 같이 일반적인 오디오에 포함될 가능성이 높지 않은 사용 빈도가 낮은 단어가 오디오에 포함될 가능성이 높은 경우
  3. 오디오에 노이즈가 있거나 명확하게 들리지 않는 경우

음성 적응 및 음성 적응 부스트 권장사항 정보는 음성 적응 개념 페이지를 참조하세요.

다음 코드 샘플은 Speech-to-Text API로 전송되는 요청에 음성 컨텍스트를 설정하여 스크립트 작성 정확도를 향상시키는 방법을 보여줍니다. 사용 중인 언어에 사용 가능한 클래스 목록은 클래스 토큰 페이지를 참조하세요.

REST 및 명령줄

자세한 내용은 speech:recognize API 엔드포인트를 참조하세요.

요청 데이터를 사용하기 전에 다음을 바꿉니다.

  • language-code: 오디오 클립에서 사용된 언어의 BCP-47 코드입니다.
  • phrases-to-boost: Speech-to-Text에서 부스트할 구문이 포함된 문자열 배열입니다.
  • storage-bucket: Cloud Storage 버킷입니다.
  • input-audio: 변환할 오디오 데이터입니다.

HTTP 메서드 및 URL:

POST https://speech.googleapis.com/v1p1beta1/speech:recognize

JSON 요청 본문:

{
  "config":{
      "languageCode":"language-code",
      "speechContexts":[{
          "phrases":[phrases-to-boost],
          "boost": 2
      }]
  },
  "audio":{
    "uri":"gs://storage-bucket/input-audio"
  }
}

요청을 보내려면 다음 옵션 중 하나를 펼칩니다.

다음과 비슷한 JSON 응답이 표시됩니다.

{
  "results": [
    {
      "alternatives": [
        {
          "transcript": "When deciding whether to bring an umbrella, I consider the weather",
          "confidence": 0.9463943
        }
      ],
      "languageCode": "en-us"
    }
  ]
}

자바

import com.google.cloud.speech.v1.RecognitionAudio;
import com.google.cloud.speech.v1.RecognitionConfig;
import com.google.cloud.speech.v1.RecognizeRequest;
import com.google.cloud.speech.v1.RecognizeResponse;
import com.google.cloud.speech.v1.SpeechClient;
import com.google.cloud.speech.v1.SpeechContext;
import com.google.cloud.speech.v1.SpeechRecognitionAlternative;
import com.google.cloud.speech.v1.SpeechRecognitionResult;
import java.io.IOException;

class TranscribeContextClasses {

  void transcribeContextClasses() throws IOException {
    // TODO(developer): Replace these variables before running the sample.
    String storageUri = "gs://YOUR_BUCKET_ID/path/to/your/file.wav";
    transcribeContextClasses(storageUri);
  }

  // Provides "hints" to the speech recognizer to favor specific classes of words in the results.
  static void transcribeContextClasses(String storageUri) throws IOException {
    // try-with-resources closes the client automatically. A client only needs
    // to be created once and can be reused for multiple requests.
    try (SpeechClient client = SpeechClient.create()) {
      // SpeechContext: to configure your speech_context see:
      // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
      // Full list of supported phrases (class tokens) here:
      // https://cloud.google.com/speech-to-text/docs/class-tokens
      // "$TIME" biases recognition toward time-of-day phrases.
      SpeechContext context = SpeechContext.newBuilder().addPhrases("$TIME").build();

      // RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
      // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
      RecognitionConfig.Builder configBuilder = RecognitionConfig.newBuilder();
      configBuilder.setEncoding(RecognitionConfig.AudioEncoding.LINEAR16);
      configBuilder.setSampleRateHertz(8000);
      configBuilder.setLanguageCode("en-US");
      configBuilder.addSpeechContexts(context);

      // Point the request at the Cloud Storage object to transcribe.
      RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(storageUri).build();

      RecognizeRequest request =
          RecognizeRequest.newBuilder()
              .setConfig(configBuilder.build())
              .setAudio(audio)
              .build();

      // Synchronous recognition: blocks until transcription is complete.
      RecognizeResponse response = client.recognize(request);

      for (SpeechRecognitionResult result : response.getResultsList()) {
        // Alternatives are ordered by confidence; index 0 is the most probable.
        SpeechRecognitionAlternative best = result.getAlternativesList().get(0);
        System.out.printf("Transcript: %s\n", best.getTranscript());
      }
    }
  }
}

Node.js

// Provides "hints" to the speech recognizer to favor
// specific classes of words in the results.

// Imports the Google Cloud client library
const speech = require('@google-cloud/speech');

// Creates a client
const client = new speech.SpeechClient();

/**
 * Transcribes the audio at `storageUri`, biasing recognition toward the
 * `$TIME` class token so time-of-day phrases are detected more reliably.
 *
 * Fix over the original sample: `storageUri` was referenced without ever
 * being declared (it only appeared in a comment), which threw a
 * ReferenceError. It is now a parameter with a placeholder default, so the
 * existing no-argument call keeps working.
 *
 * @param {string} [storageUri] - Cloud Storage URI of the audio file.
 * @returns {Promise<void>} Resolves after printing each result's transcript.
 */
async function transcribeContextClasses(
  // TODO(developer): Replace with the URI of your audio file.
  storageUri = 'gs://YOUR_BUCKET_ID/path/to/your/file.wav'
) {
  const audio = {
    uri: storageUri,
  };

  // SpeechContext: to configure your speech_context see:
  // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
  // Full list of supported phrases (class tokens) here:
  // https://cloud.google.com/speech-to-text/docs/class-tokens
  const speechContext = {
    phrases: ['$TIME'],
  };

  // RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
  // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
  const config = {
    encoding: 'LINEAR16',
    sampleRateHertz: 8000,
    languageCode: 'en-US',
    speechContexts: [speechContext],
  };

  const request = {
    config: config,
    audio: audio,
  };

  // Detects speech in the audio file.
  const [response] = await client.recognize(request);
  response.results.forEach((result, index) => {
    // Alternatives are ordered by confidence; index 0 is the most probable.
    const transcript = result.alternatives[0].transcript;
    console.log('-'.repeat(20));
    console.log(`First alternative of result ${index}`);
    console.log(`Transcript: ${transcript}`);
  });
}

transcribeContextClasses();

Python

from google.cloud import speech

client = speech.SpeechClient()

# Fix over the original sample: `storage_uri` was used below without ever
# being assigned (it only existed as a comment), raising NameError.
# TODO(developer): Replace with the URI of your audio file.
storage_uri = "gs://YOUR_BUCKET_ID/path/to/your/file.wav"
audio = speech.RecognitionAudio(uri=storage_uri)

# SpeechContext: to configure your speech_context see:
# https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
# Full list of supported phrases (class tokens) here:
# https://cloud.google.com/speech-to-text/docs/class-tokens
# "$TIME" biases recognition toward time-of-day phrases.
speech_context = speech.SpeechContext(phrases=["$TIME"])

# RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
# https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=8000,
    language_code="en-US",
    speech_contexts=[speech_context],
)

# Synchronous recognition: blocks until transcription is complete.
response = client.recognize(config=config, audio=audio)

for i, result in enumerate(response.results):
    # Alternatives are ordered by confidence; index 0 is the most probable.
    alternative = result.alternatives[0]
    print("-" * 20)
    print("First alternative of result {}".format(i))
    print("Transcript: {}".format(alternative.transcript))
Go


import (
	"context"
	"fmt"
	"io"
	"strings"

	speech "cloud.google.com/go/speech/apiv1"
	speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
)

// contextClasses provides "hints" to the speech recognizer
// to favor specific classes of words in the results.
func contextClasses(w io.Writer, gcsURI string) error {
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		return fmt.Errorf("NewClient: %v", err)
	}
	defer client.Close()

	// SpeechContext: to configure your speech_context see:
	// https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
	// Full list of supported phrases (class tokens) here:
	// https://cloud.google.com/speech-to-text/docs/class-tokens
	// Here "$TIME" biases recognition toward time-of-day phrases.
	timeContext := &speechpb.SpeechContext{Phrases: []string{"$TIME"}}

	config := &speechpb.RecognitionConfig{
		Encoding:        speechpb.RecognitionConfig_LINEAR16,
		SampleRateHertz: 8000,
		LanguageCode:    "en-US",
		SpeechContexts:  []*speechpb.SpeechContext{timeContext},
	}
	audio := &speechpb.RecognitionAudio{
		AudioSource: &speechpb.RecognitionAudio_Uri{Uri: gcsURI},
	}

	// Synchronous recognition: blocks until transcription is complete.
	response, err := client.Recognize(ctx, &speechpb.RecognizeRequest{
		Config: config,
		Audio:  audio,
	})
	if err != nil {
		return fmt.Errorf("Recognize: %v", err)
	}

	// Print every alternative of every result, separated by a dashed rule.
	for i, result := range response.Results {
		fmt.Fprintf(w, "%s\n", strings.Repeat("-", 20))
		fmt.Fprintf(w, "Result %d\n", i+1)
		for j, alt := range result.Alternatives {
			fmt.Fprintf(w, "Alternative %d: %s\n", j+1, alt.Transcript)
		}
	}
	return nil
}