Erkennungsanfrage mit Sprachanpassung senden

Mit der Sprachanpassung können Sie die Genauigkeit der Transkriptionsergebnisse verbessern, die Sie von Speech-to-Text erhalten. Dazu geben Sie Wörter und/oder Wortgruppen an, die Speech-to-Text in Ihren Audiodaten mit höherer Wahrscheinlichkeit erkennen soll als andere Alternativen. Die Sprachanpassung ist in den folgenden Fällen besonders nützlich, um die Transkriptionsgenauigkeit zu verbessern:

  1. Ihre Audiodaten enthalten Wörter oder Wortgruppen, die voraussichtlich sehr häufig vorkommen.
  2. Ihre Audiodaten enthalten wahrscheinlich seltene Wörter (z. B. Eigennamen) oder Wörter, die nicht zum üblichen Sprachgebrauch gehören.
  3. Ihre Audiodaten enthalten Rauschen oder sind anderweitig undeutlich.

Auf der Seite Konzepte zur Sprachanpassung finden Sie Informationen zu den Best Practices für die Sprachanpassung und die Optimierung der Sprachanpassung.

Das folgende Codebeispiel zeigt, wie Sie die Transkriptionsgenauigkeit durch Festlegen von Sprachkontexten in einer Anfrage an die Speech-to-Text API verbessern. Eine Liste der für Ihre Sprache verfügbaren Klassen finden Sie auf der Seite zu den Klassentokens.

REST UND BEFEHLSZEILE

Ausführliche Informationen finden Sie unter dem API-Endpunkt speech:recognize.

Ersetzen Sie diese Werte in den folgenden Anweisungen:

  • language-code: Der BCP-47-Code der Sprache, die in Ihrem Audioclip gesprochen wird.
  • phrases-to-boost: Wortgruppe(n), die von Speech-to-Text optimiert werden sollen, als Array von Strings.
  • storage-bucket: Ein Cloud Storage-Bucket.
  • input-audio: Die zu transkribierenden Audiodaten.

HTTP-Methode und URL:

POST https://speech.googleapis.com/v1p1beta1/speech:recognize

JSON-Text der Anfrage:

{
  "config":{
      "languageCode":"language-code",
      "speechContexts":[{
          "phrases":[phrases-to-boost],
          "boost": 2
      }]
  },
  "audio":{
    "uri":"gs://storage-bucket/input-audio"
  }
}

Wenn Sie die Anfrage senden möchten, maximieren Sie eine der folgenden Optionen:

Sie sollten in etwa folgende JSON-Antwort erhalten:

{
  "results": [
    {
      "alternatives": [
        {
          "transcript": "When deciding whether to bring an umbrella, I consider the weather",
          "confidence": 0.9463943
        }
      ],
      "languageCode": "en-us"
    }
  ]
}

Java

import com.google.cloud.speech.v1.RecognitionAudio;
import com.google.cloud.speech.v1.RecognitionConfig;
import com.google.cloud.speech.v1.RecognizeRequest;
import com.google.cloud.speech.v1.RecognizeResponse;
import com.google.cloud.speech.v1.SpeechClient;
import com.google.cloud.speech.v1.SpeechContext;
import com.google.cloud.speech.v1.SpeechRecognitionAlternative;
import com.google.cloud.speech.v1.SpeechRecognitionResult;
import java.io.IOException;

class TranscribeContextClasses {

  void transcribeContextClasses() throws IOException {
    // TODO(developer): Replace these variables before running the sample.
    String storageUri = "gs://YOUR_BUCKET_ID/path/to/your/file.wav";
    transcribeContextClasses(storageUri);
  }

  // Provides "hints" to the speech recognizer to favor specific classes of words in the results.
  static void transcribeContextClasses(String storageUri) throws IOException {
    // Initialize client that will be used to send requests. This client only needs to be created
    // once, and can be reused for multiple requests. After completing all of your requests, call
    // the "close" method on the client to safely clean up any remaining background resources.
    try (SpeechClient speechClient = SpeechClient.create()) {
      // SpeechContext: to configure your speech_context see:
      // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
      // Full list of supported phrases (class tokens) here:
      // https://cloud.google.com/speech-to-text/docs/class-tokens
      SpeechContext speechContext = SpeechContext.newBuilder().addPhrases("$TIME").build();

      // RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
      // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
      RecognitionConfig config =
          RecognitionConfig.newBuilder()
              .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
              .setSampleRateHertz(8000)
              .setLanguageCode("en-US")
              .addSpeechContexts(speechContext)
              .build();

      // Set the path to your audio file
      RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(storageUri).build();

      // Build the request
      RecognizeRequest request =
          RecognizeRequest.newBuilder().setConfig(config).setAudio(audio).build();

      // Perform the request
      RecognizeResponse response = speechClient.recognize(request);

      for (SpeechRecognitionResult result : response.getResultsList()) {
        // First alternative is the most probable result
        SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
        System.out.printf("Transcript: %s\n", alternative.getTranscript());
      }
    }
  }
}

Node.js

// Provides "hints" to the speech recognizer to favor
// specific classes of words in the results.

// Imports the Google Cloud client library
const speech = require('@google-cloud/speech');

// Creates a client
const client = new speech.SpeechClient();

/**
 * Transcribes audio referenced by a storage URI while giving the speech
 * recognizer "hints" that favor a specific class of words (`$TIME`).
 *
 * Logs the first alternative of every result to the console.
 *
 * @param {string} [storageUri] - URI of the audio to transcribe. Defaults to
 *   a placeholder that has to be replaced before the request can succeed.
 * @returns {Promise<void>} Settles after all transcripts have been logged.
 */
async function transcribeContextClasses(
  // TODO(developer): Replace this placeholder or pass the URI explicitly.
  storageUri = 'gs://YOUR_BUCKET_ID/path/to/your/file.wav'
) {
  const audio = {
    uri: storageUri,
  };

  // SpeechContext: to configure your speech_context see:
  // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
  // Full list of supported phrases(class tokens) here:
  // https://cloud.google.com/speech-to-text/docs/class-tokens
  const speechContext = {
    phrases: ['$TIME'],
  };

  // RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
  // https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
  const config = {
    encoding: 'LINEAR16',
    sampleRateHertz: 8000,
    languageCode: 'en-US',
    speechContexts: [speechContext],
  };

  const request = {
    config,
    audio,
  };

  // Detects speech in the audio file.
  const [response] = await client.recognize(request);
  response.results.forEach((result, index) => {
    // Only the first alternative of each result is reported.
    const transcript = result.alternatives[0].transcript;
    console.log('-'.repeat(20));
    console.log(`First alternative of result ${index}`);
    console.log(`Transcript: ${transcript}`);
  });
}

// The audio URI is taken from the first command-line argument; when it is
// absent, `undefined` selects the placeholder default. The rejection handler
// reports failures instead of leaving the promise floating.
transcribeContextClasses(process.argv[2]).catch(err => {
  console.error(err);
  process.exitCode = 1;
});

Python

from google.cloud import speech

# Provides "hints" to the speech recognizer to favor specific classes of
# words in the results, then prints the first alternative of every result.
client = speech.SpeechClient()

# TODO(developer): Replace this placeholder with the URI of your audio file
# before running the sample.
storage_uri = "gs://YOUR_BUCKET_ID/path/to/your/file.wav"
audio = speech.RecognitionAudio(uri=storage_uri)

# SpeechContext: to configure your speech_context see:
# https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
# Full list of supported phrases (class tokens) here:
# https://cloud.google.com/speech-to-text/docs/class-tokens
speech_context = speech.SpeechContext(phrases=["$TIME"])

# RecognitionConfig: to configure your encoding and sample_rate_hertz, see:
# https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=8000,
    language_code="en-US",
    speech_contexts=[speech_context],
)

response = client.recognize(config=config, audio=audio)

for i, result in enumerate(response.results):
    # Only the first alternative of each result is reported.
    alternative = result.alternatives[0]
    print("-" * 20)
    print(f"First alternative of result {i}")
    print(f"Transcript: {alternative.transcript}")

Go


import (
	"context"
	"fmt"
	"io"
	"strings"

	speech "cloud.google.com/go/speech/apiv1"
	speechpb "google.golang.org/genproto/googleapis/cloud/speech/v1"
)

// contextClasses transcribes the audio at gcsURI while providing "hints" to
// the speech recognizer to favour specific classes of words in the results,
// and writes every alternative of every result to w.
//
// Errors from creating the client or from the Recognize call are returned
// wrapped with the name of the failing step, so callers can still inspect the
// underlying error with errors.Is or errors.As.
func contextClasses(w io.Writer, gcsURI string) error {
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		return fmt.Errorf("NewClient: %w", err)
	}
	// Close the client when the function returns, on every path.
	defer client.Close()

	// SpeechContext: to configure your speech_context see:
	// https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
	// Full list of supported phrases (class tokens) here:
	// https://cloud.google.com/speech-to-text/docs/class-tokens
	// In this instance, the use of "$TIME" favours time of day detections.
	speechContext := &speechpb.SpeechContext{Phrases: []string{"$TIME"}}

	resp, err := client.Recognize(ctx, &speechpb.RecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:        speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz: 8000,
			LanguageCode:    "en-US",
			SpeechContexts:  []*speechpb.SpeechContext{speechContext},
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Uri{Uri: gcsURI},
		},
	})
	if err != nil {
		return fmt.Errorf("Recognize: %w", err)
	}

	// Print the results; results and alternatives are numbered from 1.
	for i, result := range resp.Results {
		fmt.Fprintf(w, "%s\n", strings.Repeat("-", 20))
		fmt.Fprintf(w, "Result %d\n", i+1)
		for j, alternative := range result.Alternatives {
			fmt.Fprintf(w, "Alternative %d: %s\n", j+1, alternative.Transcript)
		}
	}
	return nil
}