从音频输入文件中检测意图

本页面介绍如何使用 API 将音频输入发送给检测意图请求。Dialogflow 会处理音频并将其转换为文字,然后再尝试匹配意图。这种转换称为音频输入、语音识别语音转文字 (STT)。

准备工作

此功能仅适用于使用 API 与最终用户互动的情况。如果您使用的是集成服务,则可以跳过本指南。

在阅读本指南之前,请先完成以下事项:

  1. 阅读 Dialogflow 基础知识
  2. 执行设置步骤

创建代理

如果尚未创建代理,请立即创建一个:

  1. 转到 Dialogflow 控制台
  2. 如果系统要求登录 Dialogflow 控制台,请登录。如需了解详情,请参阅 Dialogflow 控制台概览
  3. 点击左侧边栏菜单中的创建代理 (Create Agent)。如果您已有其他代理,请点击代理名称,滚动到底部,然后点击创建新代理 (Create new agent)。
  4. 输入您的代理名称、默认语言和默认时区。
  5. 如果您已经创建了项目,请输入该项目。如果要允许 Dialogflow 控制台创建项目,请选择创建新 Google 项目 (Create a new Google project)。
  6. 点击创建 (Create) 按钮。

将示例文件导入代理

本指南中的步骤对您的代理进行了假设,因此您需要导入为本指南准备的代理。 导入时,这些步骤使用“恢复”(restore) 选项,该选项会覆盖所有代理设置、意图和实体。

如需导入文件,请按以下步骤操作:

  1. 下载 room-booking-agent.zip 文件。
  2. 转到 Dialogflow 控制台
  3. 选择您的代理。
  4. 点击代理名称旁边的设置 按钮。
  5. 选择导出和导入 (Export and Import) 标签页。
  6. 选择从 ZIP 文件恢复 (Restore from ZIP),然后按照说明恢复下载的 zip 文件。

检测意图

REST 和命令行

如需检测意图,请对 Sessions 类型调用 detectIntent 方法。

下载 book-a-room.wav 示例输入视频文件,其中有语音提示“book a room”。此示例中的音频文件必须采用 base64 编码,以便能够在下面的 JSON 请求中添加该音频文件。 下面是 Linux 平台上的一个示例:

wget https://cloud.google.com/dialogflow/docs/data/book-a-room.wav
base64 -w 0 book-a-room.wav > book-a-room.b64

如需查看其它平台上的示例,请参阅 Cloud Speech-to-Text API 文档上的 Base64 编码音频内容

在使用下面的请求数据之前,请先进行以下替换:

  • project-id:您的 GCP 项目 ID
  • audio:base64 编码的音频内容

HTTP 方法和网址:

POST https://dialogflow.googleapis.com/v2/projects/project-id/agent/sessions/123456789:detectIntent

请求 JSON 正文:

{
  "queryInput": {
    "audioConfig": {
      "languageCode": "en-US"
    }
  },
  "inputAudio": "audio"
}

如需发送您的请求,请展开以下选项之一:

您应该收到类似以下内容的 JSON 响应:

{
  "responseId": "3c1e5a89-75b9-4c3f-b63d-4b1351dd5e32",
  "queryResult": {
    "queryText": "book a room",
    "action": "room.reservation",
    "parameters": {
      "time": "",
      "date": "",
      "guests": "",
      "duration": "",
      "location": ""
    },
    "fulfillmentText": "I can help with that. Where would you like to reserve a room?",
    "fulfillmentMessages": [
      {
        "text": {
          "text": [
            "I can help with that. Where would you like to reserve a room?"
          ]
        }
      }
    ],
    "intent": {
      "name": "projects/project-id/agent/intents/e8f6a63e-73da-4a1a-8bfc-857183f71228",
      "displayName": "room.reservation"
    },
    "intentDetectionConfidence": 1,
    "diagnosticInfo": {},
    "languageCode": "en-us"
  }
}

请注意,queryResult.action 字段的值为“room.reservation”,queryResult.fulfillmentMessages[0|1].text.text[0] 字段的值要求用户提供更多信息。

Go

func DetectIntentAudio(projectID, sessionID, audioFile, languageCode string) (string, error) {
	ctx := context.Background()

	sessionClient, err := dialogflow.NewSessionsClient(ctx)
	if err != nil {
		return "", err
	}
	defer sessionClient.Close()

	if projectID == "" || sessionID == "" {
		return "", errors.New(fmt.Sprintf("Received empty project (%s) or session (%s)", projectID, sessionID))
	}

	sessionPath := fmt.Sprintf("projects/%s/agent/sessions/%s", projectID, sessionID)

	// In this example, we hard code the encoding and sample rate for simplicity.
	audioConfig := dialogflowpb.InputAudioConfig{AudioEncoding: dialogflowpb.AudioEncoding_AUDIO_ENCODING_LINEAR_16, SampleRateHertz: 16000, LanguageCode: languageCode}

	queryAudioInput := dialogflowpb.QueryInput_AudioConfig{AudioConfig: &audioConfig}

	audioBytes, err := ioutil.ReadFile(audioFile)
	if err != nil {
		return "", err
	}

	queryInput := dialogflowpb.QueryInput{Input: &queryAudioInput}
	request := dialogflowpb.DetectIntentRequest{Session: sessionPath, QueryInput: &queryInput, InputAudio: audioBytes}

	response, err := sessionClient.DetectIntent(ctx, &request)
	if err != nil {
		return "", err
	}

	queryResult := response.GetQueryResult()
	fulfillmentText := queryResult.GetFulfillmentText()
	return fulfillmentText, nil
}

Java


import com.google.api.gax.rpc.ApiException;
import com.google.cloud.dialogflow.v2.AudioEncoding;
import com.google.cloud.dialogflow.v2.DetectIntentRequest;
import com.google.cloud.dialogflow.v2.DetectIntentResponse;
import com.google.cloud.dialogflow.v2.InputAudioConfig;
import com.google.cloud.dialogflow.v2.QueryInput;
import com.google.cloud.dialogflow.v2.QueryResult;
import com.google.cloud.dialogflow.v2.SessionName;
import com.google.cloud.dialogflow.v2.SessionsClient;
import com.google.protobuf.ByteString;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;

public class DetectIntentAudio {

  // DialogFlow API Detect Intent sample with audio files.
  public static QueryResult detectIntentAudio(
      String projectId, String audioFilePath, String sessionId, String languageCode)
      throws IOException, ApiException {
    // Instantiates a client
    try (SessionsClient sessionsClient = SessionsClient.create()) {
      // Set the session name using the sessionId (UUID) and projectID (my-project-id)
      SessionName session = SessionName.of(projectId, sessionId);
      System.out.println("Session Path: " + session.toString());

      // Note: hard coding audioEncoding and sampleRateHertz for simplicity.
      // Audio encoding of the audio content sent in the query request.
      AudioEncoding audioEncoding = AudioEncoding.AUDIO_ENCODING_LINEAR_16;
      int sampleRateHertz = 16000;

      // Instructs the speech recognizer how to process the audio content.
      InputAudioConfig inputAudioConfig =
          InputAudioConfig.newBuilder()
              .setAudioEncoding(
                  audioEncoding) // audioEncoding = AudioEncoding.AUDIO_ENCODING_LINEAR_16
              .setLanguageCode(languageCode) // languageCode = "en-US"
              .setSampleRateHertz(sampleRateHertz) // sampleRateHertz = 16000
              .build();

      // Build the query with the InputAudioConfig
      QueryInput queryInput = QueryInput.newBuilder().setAudioConfig(inputAudioConfig).build();

      // Read the bytes from the audio file
      byte[] inputAudio = Files.readAllBytes(Paths.get(audioFilePath));

      // Build the DetectIntentRequest
      DetectIntentRequest request =
          DetectIntentRequest.newBuilder()
              .setSession(session.toString())
              .setQueryInput(queryInput)
              .setInputAudio(ByteString.copyFrom(inputAudio))
              .build();

      // Performs the detect intent request
      DetectIntentResponse response = sessionsClient.detectIntent(request);

      // Display the query result
      QueryResult queryResult = response.getQueryResult();
      System.out.println("====================");
      System.out.format("Query Text: '%s'\n", queryResult.getQueryText());
      System.out.format(
          "Detected Intent: %s (confidence: %f)\n",
          queryResult.getIntent().getDisplayName(), queryResult.getIntentDetectionConfidence());
      System.out.format("Fulfillment Text: '%s'\n", queryResult.getFulfillmentText());

      return queryResult;
    }
  }
}

Node.js

const fs = require('fs');
const util = require('util');
const {struct} = require('pb-util');
// Imports the Dialogflow library
const dialogflow = require('@google-cloud/dialogflow');

// Instantiates a session client
const sessionClient = new dialogflow.SessionsClient();

// The path to identify the agent that owns the created intent.
const sessionPath = sessionClient.projectAgentSessionPath(
  projectId,
  sessionId
);

// Read the content of the audio file and send it as part of the request.
const readFile = util.promisify(fs.readFile);
const inputAudio = await readFile(filename);
const request = {
  session: sessionPath,
  queryInput: {
    audioConfig: {
      audioEncoding: encoding,
      sampleRateHertz: sampleRateHertz,
      languageCode: languageCode,
    },
  },
  inputAudio: inputAudio,
};

// Recognizes the speech in the audio and detects its intent.
const [response] = await sessionClient.detectIntent(request);

console.log('Detected intent:');
const result = response.queryResult;
// Instantiates a context client
const contextClient = new dialogflow.ContextsClient();

console.log(`  Query: ${result.queryText}`);
console.log(`  Response: ${result.fulfillmentText}`);
if (result.intent) {
  console.log(`  Intent: ${result.intent.displayName}`);
} else {
  console.log('  No intent matched.');
}
const parameters = JSON.stringify(struct.decode(result.parameters));
console.log(`  Parameters: ${parameters}`);
if (result.outputContexts && result.outputContexts.length) {
  console.log('  Output contexts:');
  result.outputContexts.forEach(context => {
    const contextId = contextClient.matchContextFromProjectAgentSessionContextName(
      context.name
    );
    const contextParameters = JSON.stringify(
      struct.decode(context.parameters)
    );
    console.log(`    ${contextId}`);
    console.log(`      lifespan: ${context.lifespanCount}`);
    console.log(`      parameters: ${contextParameters}`);
  });
}

PHP

namespace Google\Cloud\Samples\Dialogflow;

use Google\Cloud\Dialogflow\V2\SessionsClient;
use Google\Cloud\Dialogflow\V2\AudioEncoding;
use Google\Cloud\Dialogflow\V2\InputAudioConfig;
use Google\Cloud\Dialogflow\V2\QueryInput;

/**
* Returns the result of detect intent with an audio file as input.
* Using the same `session_id` between requests allows continuation
* of the conversation.
*/
function detect_intent_audio($projectId, $path, $sessionId, $languageCode = 'en-US')
{
    // new session
    $sessionsClient = new SessionsClient();
    $session = $sessionsClient->sessionName($projectId, $sessionId ?: uniqid());
    printf('Session path: %s' . PHP_EOL, $session);

    // load audio file
    $inputAudio = file_get_contents($path);

    // hard coding audio_encoding and sample_rate_hertz for simplicity
    $audioConfig = new InputAudioConfig();
    $audioConfig->setAudioEncoding(AudioEncoding::AUDIO_ENCODING_LINEAR_16);
    $audioConfig->setLanguageCode($languageCode);
    $audioConfig->setSampleRateHertz(16000);

    // create query input
    $queryInput = new QueryInput();
    $queryInput->setAudioConfig($audioConfig);

    // get response and relevant info
    $response = $sessionsClient->detectIntent($session, $queryInput, ['inputAudio' => $inputAudio]);
    $queryResult = $response->getQueryResult();
    $queryText = $queryResult->getQueryText();
    $intent = $queryResult->getIntent();
    $displayName = $intent->getDisplayName();
    $confidence = $queryResult->getIntentDetectionConfidence();
    $fulfilmentText = $queryResult->getFulfillmentText();

    // output relevant info
    print(str_repeat("=", 20) . PHP_EOL);
    printf('Query text: %s' . PHP_EOL, $queryText);
    printf('Detected intent: %s (confidence: %f)' . PHP_EOL, $displayName,
        $confidence);
    print(PHP_EOL);
    printf('Fulfilment text: %s' . PHP_EOL, $fulfilmentText);

    $sessionsClient->close();
}

Python

def detect_intent_audio(project_id, session_id, audio_file_path,
                        language_code):
    """Returns the result of detect intent with an audio file as input.

    Using the same `session_id` between requests allows continuation
    of the conversation."""
    import dialogflow_v2 as dialogflow

    session_client = dialogflow.SessionsClient()

    # Note: hard coding audio_encoding and sample_rate_hertz for simplicity.
    audio_encoding = dialogflow.enums.AudioEncoding.AUDIO_ENCODING_LINEAR_16
    sample_rate_hertz = 16000

    session = session_client.session_path(project_id, session_id)
    print('Session path: {}\n'.format(session))

    with open(audio_file_path, 'rb') as audio_file:
        input_audio = audio_file.read()

    audio_config = dialogflow.types.InputAudioConfig(
        audio_encoding=audio_encoding, language_code=language_code,
        sample_rate_hertz=sample_rate_hertz)
    query_input = dialogflow.types.QueryInput(audio_config=audio_config)

    response = session_client.detect_intent(
        session=session, query_input=query_input,
        input_audio=input_audio)

    print('=' * 20)
    print('Query text: {}'.format(response.query_result.query_text))
    print('Detected intent: {} (confidence: {})\n'.format(
        response.query_result.intent.display_name,
        response.query_result.intent_detection_confidence))
    print('Fulfillment text: {}\n'.format(
        response.query_result.fulfillment_text))

Ruby

# project_id = "Your Google Cloud project ID"
# session_id = "mysession"
# audio_file_path = "resources/book_a_room.wav"
# language_code = "en-US"

require "google/cloud/dialogflow"

session_client = Google::Cloud::Dialogflow.sessions
session = session_client.session_path project: project_id,
                                      session: session_id
puts "Session path: #{session}"

begin
  audio_file = File.open audio_file_path, "rb"
  input_audio = audio_file.read
ensure
  audio_file.close
end

audio_config = {
  audio_encoding:    :AUDIO_ENCODING_LINEAR_16,
  sample_rate_hertz: 16_000,
  language_code:     language_code
}

query_input = { audio_config: audio_config }

response = session_client.detect_intent session:     session,
                                        query_input: query_input,
                                        input_audio: input_audio
query_result = response.query_result

puts "Query text:        #{query_result.query_text}"
puts "Intent detected:   #{query_result.intent.display_name}"
puts "Intent confidence: #{query_result.intent_detection_confidence}"
puts "Fulfillment text:  #{query_result.fulfillment_text}"