转录短音频文件

本页面演示了如何使用同步语音识别将短音频文件转录为文字。

同步语音识别功能会在处理短音频(不到 1 分钟)文件后立即作出响应,返回相应的识别文字。要处理长音频的语音识别请求,请使用异步语音识别

Speech-to-Text 可以直接接收音频内容,也可以处理已存在于 Google Cloud Storage 中的音频内容。另请参阅同步语音识别请求的音频限制

对本地文件执行同步语音识别

以下是对本地音频文件执行同步语音识别的示例:

协议

如需了解完整的详细信息,请参阅 speech:recognize API 端点。

如需执行同步语音识别,请发出 POST 请求并提供相应的请求正文。以下示例展示了一个使用 curl 发出的 POST 请求。该示例使用通过 Google Cloud Cloud SDK 为项目设置的服务帐号的访问令牌。如需了解有关安装 Cloud SDK、建立项目和服务帐号以及获取访问令牌的说明,请参阅快速入门

curl -X POST \
     -H "Authorization: Bearer "$(gcloud auth application-default print-access-token) \
     -H "Content-Type: application/json; charset=utf-8" \
     --data "{
  'config': {
    'encoding': 'LINEAR16',
    'sampleRateHertz': 16000,
    'languageCode': 'en-US',
    'enableWordTimeOffsets': false
  },
  'audio': {
    'content': '/9j/7QBEUGhvdG9zaG9...base64-encoded-audio-content...fXNWzvDEeYxxxzj/Coa6Bax//Z'
  }
}" "https://speech.googleapis.com/v1/speech:recognize"
  

如需详细了解如何配置请求体,请参阅 RecognitionConfig 参考文档。

请求正文中提供的音频内容采用 base64 编码。如需详细了解如何对音频执行 base64 编码,请参阅 Base64 编码音频内容。如需详细了解 content 字段,请参阅 RecognitionAudio

如果请求成功,服务器将返回一个 200 OK HTTP 状态代码以及 JSON 格式的响应:

{
  "results": [
    {
      "alternatives": [
        {
          "transcript": "how old is the Brooklyn Bridge",
          "confidence": 0.98267895
        }
      ]
    }
  ]
}

gcloud 命令

如需详细了解全部信息,请参阅 recognize 命令。

要对本地文件执行语音识别,请使用 gcloud 命令行工具,并传入要对其执行语音识别的文件的本地文件路径。

gcloud ml speech recognize PATH-TO-LOCAL-FILE --language-code='en-US'

如果请求成功,则服务器返回 JSON 格式的响应:

{
  "results": [
    {
      "alternatives": [
        {
          "confidence": 0.9840146,
          "transcript": "how old is the Brooklyn Bridge"
        }
      ]
    }
  ]
}

C#

static object SyncRecognize(string filePath)
{
    var speech = SpeechClient.Create();
    var response = speech.Recognize(new RecognitionConfig()
    {
        Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
        SampleRateHertz = 16000,
        LanguageCode = "en",
    }, RecognitionAudio.FromFile(filePath));
    foreach (var result in response.Results)
    {
        foreach (var alternative in result.Alternatives)
        {
            Console.WriteLine(alternative.Transcript);
        }
    }
    return 0;
}

Go


func recognize(w io.Writer, file string) error {
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		return err
	}

	data, err := ioutil.ReadFile(file)
	if err != nil {
		return err
	}

	// Send the contents of the audio file with the encoding and
	// and sample rate information to be transcripted.
	resp, err := client.Recognize(ctx, &speechpb.RecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:        speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz: 16000,
			LanguageCode:    "en-US",
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Content{Content: data},
		},
	})

	// Print the results.
	for _, result := range resp.Results {
		for _, alt := range result.Alternatives {
			fmt.Fprintf(w, "\"%v\" (confidence=%3f)\n", alt.Transcript, alt.Confidence)
		}
	}
	return nil
}

Java

/*
 * Please include the following imports to run this sample.
 *
 * import com.google.cloud.speech.v1.RecognitionAudio;
 * import com.google.cloud.speech.v1.RecognitionConfig;
 * import com.google.cloud.speech.v1.RecognizeRequest;
 * import com.google.cloud.speech.v1.RecognizeResponse;
 * import com.google.cloud.speech.v1.SpeechClient;
 * import com.google.cloud.speech.v1.SpeechRecognitionAlternative;
 * import com.google.cloud.speech.v1.SpeechRecognitionResult;
 * import com.google.protobuf.ByteString;
 * import java.nio.file.Files;
 * import java.nio.file.Path;
 * import java.nio.file.Paths;
 */

public static void sampleRecognize() {
  // TODO(developer): Replace these variables before running the sample.
  String localFilePath = "resources/brooklyn_bridge.raw";
  sampleRecognize(localFilePath);
}

/**
 * Transcribe a short audio file using synchronous speech recognition
 *
 * @param localFilePath Path to local audio file, e.g. /path/audio.wav
 */
public static void sampleRecognize(String localFilePath) {
  try (SpeechClient speechClient = SpeechClient.create()) {

    // The language of the supplied audio
    String languageCode = "en-US";

    // Sample rate in Hertz of the audio data sent
    int sampleRateHertz = 16000;

    // Encoding of audio data sent. This sample sets this explicitly.
    // This field is optional for FLAC and WAV audio formats.
    RecognitionConfig.AudioEncoding encoding = RecognitionConfig.AudioEncoding.LINEAR16;
    RecognitionConfig config =
        RecognitionConfig.newBuilder()
            .setLanguageCode(languageCode)
            .setSampleRateHertz(sampleRateHertz)
            .setEncoding(encoding)
            .build();
    Path path = Paths.get(localFilePath);
    byte[] data = Files.readAllBytes(path);
    ByteString content = ByteString.copyFrom(data);
    RecognitionAudio audio = RecognitionAudio.newBuilder().setContent(content).build();
    RecognizeRequest request =
        RecognizeRequest.newBuilder().setConfig(config).setAudio(audio).build();
    RecognizeResponse response = speechClient.recognize(request);
    for (SpeechRecognitionResult result : response.getResultsList()) {
      // First alternative is the most probable result
      SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
      System.out.printf("Transcript: %s\n", alternative.getTranscript());
    }
  } catch (Exception exception) {
    System.err.println("Failed to create the client due to: " + exception);
  }
}

Node.js

// Imports the Google Cloud client library
const fs = require('fs');
const speech = require('@google-cloud/speech');

// Creates a client
const client = new speech.SpeechClient();

/**
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// const filename = 'Local path to audio file, e.g. /path/to/audio.raw';
// const encoding = 'Encoding of the audio file, e.g. LINEAR16';
// const sampleRateHertz = 16000;
// const languageCode = 'BCP-47 language code, e.g. en-US';

const config = {
  encoding: encoding,
  sampleRateHertz: sampleRateHertz,
  languageCode: languageCode,
};
const audio = {
  content: fs.readFileSync(filename).toString('base64'),
};

const request = {
  config: config,
  audio: audio,
};

// Detects speech in the audio file
const [response] = await client.recognize(request);
const transcription = response.results
  .map(result => result.alternatives[0].transcript)
  .join('\n');
console.log('Transcription: ', transcription);

PHP

use Google\Cloud\Speech\V1\SpeechClient;
use Google\Cloud\Speech\V1\RecognitionAudio;
use Google\Cloud\Speech\V1\RecognitionConfig;
use Google\Cloud\Speech\V1\RecognitionConfig\AudioEncoding;

/** Uncomment and populate these variables in your code */
// $audioFile = 'path to an audio file';

// change these variables if necessary
$encoding = AudioEncoding::LINEAR16;
$sampleRateHertz = 32000;
$languageCode = 'en-US';

// get contents of a file into a string
$content = file_get_contents($audioFile);

// set string as audio content
$audio = (new RecognitionAudio())
    ->setContent($content);

// set config
$config = (new RecognitionConfig())
    ->setEncoding($encoding)
    ->setSampleRateHertz($sampleRateHertz)
    ->setLanguageCode($languageCode);

// create the speech client
$client = new SpeechClient();

try {
    $response = $client->recognize($config, $audio);
    foreach ($response->getResults() as $result) {
        $alternatives = $result->getAlternatives();
        $mostLikely = $alternatives[0];
        $transcript = $mostLikely->getTranscript();
        $confidence = $mostLikely->getConfidence();
        printf('Transcript: %s' . PHP_EOL, $transcript);
        printf('Confidence: %s' . PHP_EOL, $confidence);
    }
} finally {
    $client->close();
}

Python

from google.cloud import speech_v1
from google.cloud.speech_v1 import enums
import io

def sample_recognize(local_file_path):
    """
    Transcribe a short audio file using synchronous speech recognition

    Args:
      local_file_path Path to local audio file, e.g. /path/audio.wav
    """

    client = speech_v1.SpeechClient()

    # local_file_path = 'resources/brooklyn_bridge.raw'

    # The language of the supplied audio
    language_code = "en-US"

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = 16000

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "language_code": language_code,
        "sample_rate_hertz": sample_rate_hertz,
        "encoding": encoding,
    }
    with io.open(local_file_path, "rb") as f:
        content = f.read()
    audio = {"content": content}

    response = client.recognize(config, audio)
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))

Ruby

# audio_file_path = "Path to file on which to perform speech recognition"

require "google/cloud/speech"

speech = Google::Cloud::Speech.speech

audio_file = File.binread audio_file_path
config     = { encoding:          :LINEAR16,
               sample_rate_hertz: 16_000,
               language_code:     "en-US" }
audio      = { content: audio_file }

response = speech.recognize config: config, audio: audio

results = response.results

alternatives = results.first.alternatives
alternatives.each do |alternative|
  puts "Transcription: #{alternative.transcript}"
end

对远程文件执行同步语音识别

为方便您使用,Speech-to-Text API 可以直接对位于 Google Cloud Storage 的音频文件执行同步语音识别,而您无需在请求正文中发送音频文件的内容。

以下是对 Cloud Storage 中的文件执行同步语音识别的示例:

协议

如需了解完整的详细信息,请参阅 speech:recognize API 端点。

如需执行同步语音识别,请发出 POST 请求并提供相应的请求正文。以下示例展示了一个使用 curl 发出的 POST 请求。该示例使用通过 Google Cloud Cloud SDK 为项目设置的服务帐号的访问令牌。如需了解有关安装 Cloud SDK、建立项目和服务帐号以及获取访问令牌的说明,请参阅快速入门

curl -X POST -H "Authorization: Bearer "$(gcloud auth application-default print-access-token) \
     -H "Content-Type: application/json; charset=utf-8" \
     --data "{
  'config': {
    'encoding': 'LINEAR16',
    'sampleRateHertz': 16000,
    'languageCode': 'en-US'
  },
  'audio': {
    'uri': 'gs://YOUR_BUCKET_NAME/YOUR_FILE_NAME'
  }
}" "https://speech.googleapis.com/v1/speech:recognize"

如需详细了解如何配置请求体,请参阅 RecognitionConfig 参考文档。

如果请求成功,服务器将返回一个 200 OK HTTP 状态代码以及 JSON 格式的响应:

{
  "results": [
    {
      "alternatives": [
        {
          "transcript": "how old is the Brooklyn Bridge",
          "confidence": 0.98267895
        }
      ]
    }
  ]
}

gcloud 命令

如需详细了解全部信息,请参阅 recognize 命令。

要对本地文件执行语音识别,请使用 gcloud 命令行工具,并传入要对其执行语音识别的文件的本地文件路径。

gcloud ml speech recognize 'gs://cloud-samples-tests/speech/brooklyn.flac' \
--language-code='en-US'

如果请求成功,则服务器返回 JSON 格式的响应:

{
  "results": [
    {
      "alternatives": [
        {
          "confidence": 0.9840146,
          "transcript": "how old is the Brooklyn Bridge"
        }
      ]
    }
  ]
}

C#

static object SyncRecognizeGcs(string storageUri)
{
    var speech = SpeechClient.Create();
    var response = speech.Recognize(new RecognitionConfig()
    {
        Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
        SampleRateHertz = 16000,
        LanguageCode = "en",
    }, RecognitionAudio.FromStorageUri(storageUri));
    foreach (var result in response.Results)
    {
        foreach (var alternative in result.Alternatives)
        {
            Console.WriteLine(alternative.Transcript);
        }
    }
    return 0;
}

Go


func recognizeGCS(w io.Writer, gcsURI string) error {
	ctx := context.Background()

	client, err := speech.NewClient(ctx)
	if err != nil {
		return err
	}

	// Send the request with the URI (gs://...)
	// and sample rate information to be transcripted.
	resp, err := client.Recognize(ctx, &speechpb.RecognizeRequest{
		Config: &speechpb.RecognitionConfig{
			Encoding:        speechpb.RecognitionConfig_LINEAR16,
			SampleRateHertz: 16000,
			LanguageCode:    "en-US",
		},
		Audio: &speechpb.RecognitionAudio{
			AudioSource: &speechpb.RecognitionAudio_Uri{Uri: gcsURI},
		},
	})

	// Print the results.
	for _, result := range resp.Results {
		for _, alt := range result.Alternatives {
			fmt.Fprintf(w, "\"%v\" (confidence=%3f)\n", alt.Transcript, alt.Confidence)
		}
	}
	return nil
}

Java

/*
 * Please include the following imports to run this sample.
 *
 * import com.google.cloud.speech.v1.RecognitionAudio;
 * import com.google.cloud.speech.v1.RecognitionConfig;
 * import com.google.cloud.speech.v1.RecognizeRequest;
 * import com.google.cloud.speech.v1.RecognizeResponse;
 * import com.google.cloud.speech.v1.SpeechClient;
 * import com.google.cloud.speech.v1.SpeechRecognitionAlternative;
 * import com.google.cloud.speech.v1.SpeechRecognitionResult;
 */

public static void sampleRecognize() {
  // TODO(developer): Replace these variables before running the sample.
  String storageUri = "gs://cloud-samples-data/speech/brooklyn_bridge.raw";
  sampleRecognize(storageUri);
}

/**
 * Transcribe short audio file from Cloud Storage using synchronous speech recognition
 *
 * @param storageUri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
 */
public static void sampleRecognize(String storageUri) {
  try (SpeechClient speechClient = SpeechClient.create()) {

    // Sample rate in Hertz of the audio data sent
    int sampleRateHertz = 16000;

    // The language of the supplied audio
    String languageCode = "en-US";

    // Encoding of audio data sent. This sample sets this explicitly.
    // This field is optional for FLAC and WAV audio formats.
    RecognitionConfig.AudioEncoding encoding = RecognitionConfig.AudioEncoding.LINEAR16;
    RecognitionConfig config =
        RecognitionConfig.newBuilder()
            .setSampleRateHertz(sampleRateHertz)
            .setLanguageCode(languageCode)
            .setEncoding(encoding)
            .build();
    RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(storageUri).build();
    RecognizeRequest request =
        RecognizeRequest.newBuilder().setConfig(config).setAudio(audio).build();
    RecognizeResponse response = speechClient.recognize(request);
    for (SpeechRecognitionResult result : response.getResultsList()) {
      // First alternative is the most probable result
      SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
      System.out.printf("Transcript: %s\n", alternative.getTranscript());
    }
  } catch (Exception exception) {
    System.err.println("Failed to create the client due to: " + exception);
  }
}

Node.js

// Imports the Google Cloud client library
const speech = require('@google-cloud/speech');

// Creates a client
const client = new speech.SpeechClient();

/**
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// const gcsUri = 'gs://my-bucket/audio.raw';
// const encoding = 'Encoding of the audio file, e.g. LINEAR16';
// const sampleRateHertz = 16000;
// const languageCode = 'BCP-47 language code, e.g. en-US';

const config = {
  encoding: encoding,
  sampleRateHertz: sampleRateHertz,
  languageCode: languageCode,
};
const audio = {
  uri: gcsUri,
};

const request = {
  config: config,
  audio: audio,
};

// Detects speech in the audio file
const [response] = await client.recognize(request);
const transcription = response.results
  .map(result => result.alternatives[0].transcript)
  .join('\n');
console.log('Transcription: ', transcription);

PHP

use Google\Cloud\Speech\V1\SpeechClient;
use Google\Cloud\Speech\V1\RecognitionAudio;
use Google\Cloud\Speech\V1\RecognitionConfig;
use Google\Cloud\Speech\V1\RecognitionConfig\AudioEncoding;

/** Uncomment and populate these variables in your code */
// $uri = 'The Cloud Storage object to transcribe (gs://your-bucket-name/your-object-name)';

// change these variables if necessary
$encoding = AudioEncoding::LINEAR16;
$sampleRateHertz = 32000;
$languageCode = 'en-US';

// set string as audio content
$audio = (new RecognitionAudio())
    ->setUri($uri);

// set config
$config = (new RecognitionConfig())
    ->setEncoding($encoding)
    ->setSampleRateHertz($sampleRateHertz)
    ->setLanguageCode($languageCode);

// create the speech client
$client = new SpeechClient();

try {
    $response = $client->recognize($config, $audio);
    foreach ($response->getResults() as $result) {
        $alternatives = $result->getAlternatives();
        $mostLikely = $alternatives[0];
        $transcript = $mostLikely->getTranscript();
        $confidence = $mostLikely->getConfidence();
        printf('Transcript: %s' . PHP_EOL, $transcript);
        printf('Confidence: %s' . PHP_EOL, $confidence);
    }
} finally {
    $client->close();
}

Python

from google.cloud import speech_v1
from google.cloud.speech_v1 import enums

def sample_recognize(storage_uri):
    """
    Transcribe short audio file from Cloud Storage using synchronous speech
    recognition

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    """

    client = speech_v1.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.raw'

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = 16000

    # The language of the supplied audio
    language_code = "en-US"

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "sample_rate_hertz": sample_rate_hertz,
        "language_code": language_code,
        "encoding": encoding,
    }
    audio = {"uri": storage_uri}

    response = client.recognize(config, audio)
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        print(u"Transcript: {}".format(alternative.transcript))

Ruby

# storage_path = "Path to file in Cloud Storage, eg. gs://bucket/audio.raw"

require "google/cloud/speech"

speech = Google::Cloud::Speech.speech

config = { encoding:          :LINEAR16,
           sample_rate_hertz: 16_000,
           language_code:     "en-US" }
audio  = { uri: storage_path }

response = speech.recognize config: config, audio: audio

results = response.results

alternatives = results.first.alternatives
alternatives.each do |alternative|
  puts "Transcription: #{alternative.transcript}"
end