创建语音音频文件

您可以使用 Text-to-Speech 将字词和句子转换为自然人类语音的 base64 编码音频数据。然后,您可以通过对 base64 数据进行解码,将音频数据转换为 MP3 等可播放的音频文件。Text-to-Speech 接受的输入数据包括原始文本或语音合成标记语言 (SSML)。

本文档介绍如何使用 Text-to-Speech 从文本或 SSML 输入创建音频文件。如果您不熟悉语音合成或 SSML 等概念,建议您阅读文章 Text-to-Speech 基础知识。

这些示例要求您已设置 gcloud,并且已创建并激活服务帐号。如需了解如何设置 gcloud 以及如何创建和激活服务帐号,请参阅快速入门:Text-to-Speech。

将文本转换为合成语音音频

以下代码示例演示如何将字符串转换为音频数据。

您可以通过多种方式配置语音合成的输出,包括选择独特的语音,或调整输出的音高、音量、语速和采样率。

协议

如需了解完整的详细信息,请参阅 text:synthesize API 端点。

要从文本合成音频,请向 text:synthesize 端点发出 HTTP POST 请求。在 POST 请求正文的 voice 配置部分指定要合成的语音类型,在 input 部分的 text 字段中指定要合成的文本,并在 audioConfig 部分指定要创建的音频类型。

以下代码段将向 text:synthesize 端点发送合成请求并将结果保存到名为 synthesize-text.txt 的文件中。

# Send a synthesis request to the text:synthesize endpoint, authenticating with
# the gcloud application-default access token. The request asks for MP3 audio
# of the given text using the en-GB-Standard-A voice; the JSON response is
# redirected to synthesize-text.txt.
curl -H "Authorization: Bearer $(gcloud auth application-default print-access-token)" \
  -H "Content-Type: application/json; charset=utf-8" \
  --data "{
    'input':{
      'text':'Android is a mobile operating system developed by Google,
         based on the Linux kernel and designed primarily for
         touchscreen mobile devices such as smartphones and tablets.'
    },
    'voice':{
      'languageCode':'en-gb',
      'name':'en-GB-Standard-A',
      'ssmlGender':'FEMALE'
    },
    'audioConfig':{
      'audioEncoding':'MP3'
    }
  }" "https://texttospeech.googleapis.com/v1/text:synthesize" > synthesize-text.txt

Text-to-Speech API 会以 base64 编码数据的形式返回合成音频,并将其包含在 JSON 输出中。synthesize-text.txt 文件中的 JSON 输出类似于以下代码片段。

{
  "audioContent": "//NExAASCCIIAAhEAGAAEMW4kAYPnwwIKw/BBTpwTvB+IAxIfghUfW.."
}

要将 Text-to-Speech API 的合成结果解码为 MP3 音频文件,请从 synthesize-text.txt 文件所在的目录运行以下命令。

# Pull the audioContent line out of the JSON response, strip the key name and
# all JSON punctuation/whitespace so only the base64 payload remains, decode
# that payload into an MP3 file, then remove the temporary file.
grep 'audioContent' synthesize-text.txt \
  | sed 's|audioContent| |' \
  | tr -d '\n ":{},' > tmp.txt \
  && base64 tmp.txt --decode > synthesize-text-audio.mp3 \
  && rm tmp.txt

Go


// SynthesizeText sends the plain-text string text to the Text-to-Speech API,
// requesting MP3 audio from an en-US female voice, writes the returned audio
// bytes to outputFile, and reports the destination path on w.
//
// Any error from creating the client, from the synthesis call, or from
// writing the file is returned unchanged.
func SynthesizeText(w io.Writer, text, outputFile string) error {
	ctx := context.Background()

	ttsClient, err := texttospeech.NewClient(ctx)
	if err != nil {
		return err
	}
	defer ttsClient.Close()

	// The voice is selected by language and gender here; it can also be
	// selected by name, using a name obtained from client.ListVoices().
	voice := &texttospeechpb.VoiceSelectionParams{
		LanguageCode: "en-US",
		SsmlGender:   texttospeechpb.SsmlVoiceGender_FEMALE,
	}
	request := &texttospeechpb.SynthesizeSpeechRequest{
		Input: &texttospeechpb.SynthesisInput{
			InputSource: &texttospeechpb.SynthesisInput_Text{Text: text},
		},
		Voice: voice,
		AudioConfig: &texttospeechpb.AudioConfig{
			AudioEncoding: texttospeechpb.AudioEncoding_MP3,
		},
	}

	result, err := ttsClient.SynthesizeSpeech(ctx, request)
	if err != nil {
		return err
	}

	if err := ioutil.WriteFile(outputFile, result.AudioContent, 0644); err != nil {
		return err
	}
	fmt.Fprintf(w, "Audio content written to file: %v\n", outputFile)
	return nil
}

Java

/**
 * Synthesizes raw text with the Text-to-Speech client, requesting MP3 audio from an en-US female
 * voice, and writes the resulting audio to the file "output.mp3".
 *
 * @param text the raw text to be synthesized. (e.g., "Hello there!")
 * @return the synthesized audio content that was written to the file.
 * @throws Exception on TextToSpeechClient Errors.
 */
public static ByteString synthesizeText(String text) throws Exception {
  // The client is closed automatically when this block exits.
  try (TextToSpeechClient ttsClient = TextToSpeechClient.create()) {
    // Build the input, voice selection and audio configuration inline, issue the request,
    // and keep only the audio payload of the response.
    final ByteString synthesizedAudio =
        ttsClient
            .synthesizeSpeech(
                SynthesisInput.newBuilder().setText(text).build(),
                VoiceSelectionParams.newBuilder()
                    .setLanguageCode("en-US")
                    .setSsmlGender(SsmlVoiceGender.FEMALE)
                    .build(),
                AudioConfig.newBuilder().setAudioEncoding(AudioEncoding.MP3).build())
            .getAudioContent();

    // Persist the audio bytes; the stream is closed before the value is returned.
    try (OutputStream mp3File = new FileOutputStream("output.mp3")) {
      mp3File.write(synthesizedAudio.toByteArray());
      System.out.println("Audio content written to file \"output.mp3\"");
    }
    return synthesizedAudio;
  }
}

Node.js

const textToSpeech = require('@google-cloud/text-to-speech');
const fs = require('fs');
const util = require('util');

const client = new textToSpeech.TextToSpeechClient();

/**
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// const text = 'Text to synthesize, eg. hello';
// const outputFile = 'Local path to save audio file to, e.g. output.mp3';

/**
 * Requests MP3 speech for `text` from an en-US female voice and writes the
 * returned audio to `outputFile`.
 *
 * The calls are wrapped in an async function because this sample loads its
 * dependencies with `require` (CommonJS), where `await` is not permitted at
 * the top level of the script.
 *
 * @returns {Promise<void>} Resolves once the audio file has been written.
 */
async function synthesizeText() {
  const request = {
    input: {text},
    voice: {languageCode: 'en-US', ssmlGender: 'FEMALE'},
    audioConfig: {audioEncoding: 'MP3'},
  };

  const [response] = await client.synthesizeSpeech(request);
  const writeFile = util.promisify(fs.writeFile);
  await writeFile(outputFile, response.audioContent, 'binary');
  console.log(`Audio content written to file: ${outputFile}`);
}

// Surface request or file-system failures instead of leaving the promise
// rejection unhandled.
synthesizeText().catch(err => {
  console.error(err);
  process.exitCode = 1;
});

Python

def synthesize_text(text):
    """Convert a plain-text string to speech and save it as "output.mp3".

    Sends ``text`` to the Text-to-Speech API, requesting MP3 audio from the
    en-US female voice "en-US-Standard-C", and writes the returned audio
    bytes to the relative path ``output.mp3``.
    """
    from google.cloud import texttospeech

    tts_client = texttospeech.TextToSpeechClient()

    # The voice is pinned by name below; other valid names can be looked up
    # with client.list_voices().
    synthesis_request = {
        "input": texttospeech.SynthesisInput(text=text),
        "voice": texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Standard-C",
            ssml_gender=texttospeech.SsmlVoiceGender.FEMALE,
        ),
        "audio_config": texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3
        ),
    }
    result = tts_client.synthesize_speech(request=synthesis_request)

    # audio_content holds raw bytes, so the file is opened in binary mode.
    with open("output.mp3", "wb") as mp3_file:
        mp3_file.write(result.audio_content)
        print('Audio content written to file "output.mp3"')

其他语言

C#:请按照客户端库页面上的 C# 设置说明操作,然后访问 .NET 的 Text-to-Speech 参考文档

PHP:请按照客户端库页面上的 PHP 设置说明操作,然后访问 PHP 的 Text-to-Speech 参考文档。

Ruby:请按照客户端库页面上的 Ruby 设置说明操作,然后访问 Ruby 的 Text-to-Speech 参考文档

将 SSML 转换为合成语音音频

在您的音频合成请求中使用 SSML 可以生成更像自然人类语音的音频。具体来说,SSML 可以让您更精细地控制音频输出如何表示语音中的暂停或音频中日期、时间、首字母缩写词和缩写词的发音。

如需详细了解 Text-to-Speech API 支持的 SSML 元素,请参阅 SSML 参考。

协议

如需了解完整的详细信息,请参阅 text:synthesize API 端点。

要从 SSML 合成音频,请向 text:synthesize 端点发出 HTTP POST 请求。在 POST 请求正文的 voice 配置部分指定要合成的语音类型,在 input 部分的 ssml 字段中指定要合成的 SSML,并在 audioConfig 部分指定要创建的音频类型。

以下代码段将向 text:synthesize 端点发送合成请求并将结果保存到名为 synthesize-ssml.txt 的文件中。

# Send a synthesis request to the text:synthesize endpoint, authenticating with
# the gcloud application-default access token. The input is an SSML document
# (note the escaped double quotes inside the double-quoted payload); the request
# asks for MP3 audio using the en-US-Standard-B voice, and the JSON response is
# redirected to synthesize-ssml.txt.
curl -H "Authorization: Bearer $(gcloud auth application-default print-access-token)" \
  -H "Content-Type: application/json; charset=utf-8" --data "{
    'input':{
     'ssml':'<speak>The <say-as interpret-as=\"characters\">SSML</say-as> standard
          is defined by the <sub alias=\"World Wide Web Consortium\">W3C</sub>.</speak>'
    },
    'voice':{
      'languageCode':'en-us',
      'name':'en-US-Standard-B',
      'ssmlGender':'MALE'
    },
    'audioConfig':{
      'audioEncoding':'MP3'
    }
  }" "https://texttospeech.googleapis.com/v1/text:synthesize" > synthesize-ssml.txt

Text-to-Speech API 会以 base64 编码数据的形式返回合成音频,并将其包含在 JSON 输出中。synthesize-ssml.txt 文件中的 JSON 输出类似于以下代码片段。

{
  "audioContent": "//NExAASCCIIAAhEAGAAEMW4kAYPnwwIKw/BBTpwTvB+IAxIfghUfW.."
}

要将 Text-to-Speech API 的合成结果解码为 MP3 音频文件,请从 synthesize-ssml.txt 文件所在的目录运行以下命令。

# Pull the audioContent line out of the JSON response, strip the key name and
# all JSON punctuation/whitespace so only the base64 payload remains, decode
# that payload into an MP3 file, then remove the temporary file.
grep 'audioContent' synthesize-ssml.txt \
  | sed 's|audioContent| |' \
  | tr -d '\n ":{},' > tmp.txt \
  && base64 tmp.txt --decode > synthesize-ssml-audio.mp3 \
  && rm tmp.txt

Go


// SynthesizeSSML sends the SSML document ssml to the Text-to-Speech API,
// requesting MP3 audio from an en-US female voice, writes the returned audio
// bytes to outputFile, and reports the destination path on w.
//
// ssml must be well-formed according to:
//   https://www.w3.org/TR/speech-synthesis/
// Example: <speak>Hello there.</speak>
//
// Any error from creating the client, from the synthesis call, or from
// writing the file is returned unchanged.
func SynthesizeSSML(w io.Writer, ssml, outputFile string) error {
	ctx := context.Background()

	ttsClient, err := texttospeech.NewClient(ctx)
	if err != nil {
		return err
	}
	defer ttsClient.Close()

	// The voice is selected by language and gender here; it can also be
	// selected by name, using a name obtained from client.ListVoices().
	voice := &texttospeechpb.VoiceSelectionParams{
		LanguageCode: "en-US",
		SsmlGender:   texttospeechpb.SsmlVoiceGender_FEMALE,
	}
	request := &texttospeechpb.SynthesizeSpeechRequest{
		Input: &texttospeechpb.SynthesisInput{
			InputSource: &texttospeechpb.SynthesisInput_Ssml{Ssml: ssml},
		},
		Voice: voice,
		AudioConfig: &texttospeechpb.AudioConfig{
			AudioEncoding: texttospeechpb.AudioEncoding_MP3,
		},
	}

	result, err := ttsClient.SynthesizeSpeech(ctx, request)
	if err != nil {
		return err
	}

	if err := ioutil.WriteFile(outputFile, result.AudioContent, 0644); err != nil {
		return err
	}
	fmt.Fprintf(w, "Audio content written to file: %v\n", outputFile)
	return nil
}

Java

/**
 * Synthesizes an SSML document with the Text-to-Speech client, requesting MP3 audio from an
 * en-US female voice, and writes the resulting audio to the file "output.mp3".
 *
 * <p>Note: ssml must be well-formed according to https://www.w3.org/TR/speech-synthesis/
 * Example: {@code <speak>Hello there.</speak>}
 *
 * @param ssml the ssml document to be synthesized. (e.g., {@code <speak>Hello there.</speak>})
 * @return the synthesized audio content that was written to the file.
 * @throws Exception on TextToSpeechClient Errors.
 */
public static ByteString synthesizeSsml(String ssml) throws Exception {
  // The client is closed automatically when this block exits.
  try (TextToSpeechClient ttsClient = TextToSpeechClient.create()) {
    // Build the input, voice selection and audio configuration inline, issue the request,
    // and keep only the audio payload of the response.
    final ByteString synthesizedAudio =
        ttsClient
            .synthesizeSpeech(
                SynthesisInput.newBuilder().setSsml(ssml).build(),
                VoiceSelectionParams.newBuilder()
                    .setLanguageCode("en-US")
                    .setSsmlGender(SsmlVoiceGender.FEMALE)
                    .build(),
                AudioConfig.newBuilder().setAudioEncoding(AudioEncoding.MP3).build())
            .getAudioContent();

    // Persist the audio bytes; the stream is closed before the value is returned.
    try (OutputStream mp3File = new FileOutputStream("output.mp3")) {
      mp3File.write(synthesizedAudio.toByteArray());
      System.out.println("Audio content written to file \"output.mp3\"");
    }
    return synthesizedAudio;
  }
}

Node.js

const textToSpeech = require('@google-cloud/text-to-speech');
const fs = require('fs');
const util = require('util');

const client = new textToSpeech.TextToSpeechClient();

/**
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// const ssml = '<speak>Hello there.</speak>';
// const outputFile = 'Local path to save audio file to, e.g. output.mp3';

/**
 * Requests MP3 speech for the SSML document `ssml` from an en-US female voice
 * and writes the returned audio to `outputFile`.
 *
 * The calls are wrapped in an async function because this sample loads its
 * dependencies with `require` (CommonJS), where `await` is not permitted at
 * the top level of the script.
 *
 * @returns {Promise<void>} Resolves once the audio file has been written.
 */
async function synthesizeSsml() {
  const request = {
    input: {ssml},
    voice: {languageCode: 'en-US', ssmlGender: 'FEMALE'},
    audioConfig: {audioEncoding: 'MP3'},
  };

  const [response] = await client.synthesizeSpeech(request);
  const writeFile = util.promisify(fs.writeFile);
  await writeFile(outputFile, response.audioContent, 'binary');
  console.log(`Audio content written to file: ${outputFile}`);
}

// Surface request or file-system failures instead of leaving the promise
// rejection unhandled.
synthesizeSsml().catch(err => {
  console.error(err);
  process.exitCode = 1;
});

Python

def synthesize_ssml(ssml):
    """Convert an SSML document to speech and save it as "output.mp3".

    Sends ``ssml`` to the Text-to-Speech API, requesting MP3 audio from the
    en-US female voice "en-US-Standard-C", and writes the returned audio
    bytes to the relative path ``output.mp3``.

    Note: ssml must be well-formed according to:
        https://www.w3.org/TR/speech-synthesis/

    Example: <speak>Hello there.</speak>
    """
    from google.cloud import texttospeech

    tts_client = texttospeech.TextToSpeechClient()

    ssml_input = texttospeech.SynthesisInput(ssml=ssml)

    # The voice is pinned by name below; other valid names can be looked up
    # with client.list_voices().
    voice_params = texttospeech.VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Standard-C",
        ssml_gender=texttospeech.SsmlVoiceGender.FEMALE,
    )

    mp3_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )

    result = tts_client.synthesize_speech(
        input=ssml_input, voice=voice_params, audio_config=mp3_config
    )

    # audio_content holds raw bytes, so the file is opened in binary mode.
    with open("output.mp3", "wb") as mp3_file:
        mp3_file.write(result.audio_content)
        print('Audio content written to file "output.mp3"')

其他语言

C#:请按照客户端库页面上的 C# 设置说明操作,然后访问 .NET 的 Text-to-Speech 参考文档

PHP:请按照客户端库页面上的 PHP 设置说明操作,然后访问 PHP 的 Text-to-Speech 参考文档。

Ruby:请按照客户端库页面上的 Ruby 设置说明操作,然后访问 Ruby 的 Text-to-Speech 参考文档

亲自尝试

如果您是 Google Cloud 新手,请创建一个帐号来评估 Text-to-Speech 在实际场景中的表现。新客户还可获享 $300 赠金,用于运行、测试和部署工作负载。

免费试用 Text-to-Speech