Detect intent with audio input file

This guide shows how to send audio input to a detect intent request using the API. Dialogflow processes the audio and converts it to text before attempting an intent match. This conversion is known as audio input, speech recognition, speech-to-text, or STT.

Before you begin

This feature is only applicable when using the API for end-user interactions. If you are using an integration, you can skip this guide.

You should do the following before reading this guide:

  1. Read Dialogflow basics.
  2. Perform setup steps.

Create an agent

If you have not already created an agent, create one now:

  1. Go to the Dialogflow ES Console.
  2. If requested, sign in to the Dialogflow Console. See Dialogflow console overview for more information.
  3. Click Create Agent in the left sidebar menu. (If you already have other agents, click the agent name, scroll to the bottom and click Create new agent.)
  4. Enter your agent's name, default language, and default time zone.
  5. If you have already created a project, enter that project. If you want to allow the Dialogflow Console to create the project, select Create a new Google project.
  6. Click the Create button.

Import the example file to your agent

The steps in this guide make assumptions about your agent, so you need to import an agent prepared for this guide. When importing, these steps use the restore option, which overwrites all agent settings, intents, and entities.

To import the file, follow these steps:

  1. Download the room-booking-agent.zip file.
  2. Go to the Dialogflow ES Console.
  3. Select your agent.
  4. Click the settings button next to the agent name.
  5. Select the Export and Import tab.
  6. Select Restore From Zip and follow instructions to restore the zip file that you downloaded.

Detect intent

REST & CMD LINE

To detect intent, call the detectIntent method on the Sessions type.

Download the book-a-room.wav sample input audio file, which says "book a room". The audio file must be base64 encoded for this example, so it can be provided in the JSON request below. Here is a Linux example:

wget https://cloud.google.com/dialogflow/docs/data/book-a-room.wav
base64 -w 0 book-a-room.wav > book-a-room.b64

For examples on other platforms, see Base64 encoding audio content in the Cloud Speech-to-Text API documentation.

Before using any of the request data below, make the following replacements:

  • project-id: your GCP project ID
  • audio: the base64 encoded audio content

HTTP method and URL:

POST https://dialogflow.googleapis.com/v2/projects/project-id/agent/sessions/123456789:detectIntent

Request JSON body:

{
  "queryInput": {
    "audioConfig": {
      "languageCode": "en-US"
    }
  },
  "inputAudio": "audio"
}

To send your request, expand one of these options:

You should receive a JSON response similar to the following:

{
  "responseId": "3c1e5a89-75b9-4c3f-b63d-4b1351dd5e32",
  "queryResult": {
    "queryText": "book a room",
    "action": "room.reservation",
    "parameters": {
      "time": "",
      "date": "",
      "guests": "",
      "duration": "",
      "location": ""
    },
    "fulfillmentText": "I can help with that. Where would you like to reserve a room?",
    "fulfillmentMessages": [
      {
        "text": {
          "text": [
            "I can help with that. Where would you like to reserve a room?"
          ]
        }
      }
    ],
    "intent": {
      "name": "projects/project-id/agent/intents/e8f6a63e-73da-4a1a-8bfc-857183f71228",
      "displayName": "room.reservation"
    },
    "intentDetectionConfidence": 1,
    "diagnosticInfo": {},
    "languageCode": "en-us"
  }
}

Notice that the value of the queryResult.action field is "room.reservation", and the value of the queryResult.fulfillmentMessages[0|1].text.text[0] field asks the user for more information.

Go

func DetectIntentAudio(projectID, sessionID, audioFile, languageCode string) (string, error) {
	ctx := context.Background()

	sessionClient, err := dialogflow.NewSessionsClient(ctx)
	if err != nil {
		return "", err
	}
	defer sessionClient.Close()

	if projectID == "" || sessionID == "" {
		return "", errors.New(fmt.Sprintf("Received empty project (%s) or session (%s)", projectID, sessionID))
	}

	sessionPath := fmt.Sprintf("projects/%s/agent/sessions/%s", projectID, sessionID)

	// In this example, we hard code the encoding and sample rate for simplicity.
	audioConfig := dialogflowpb.InputAudioConfig{AudioEncoding: dialogflowpb.AudioEncoding_AUDIO_ENCODING_LINEAR_16, SampleRateHertz: 16000, LanguageCode: languageCode}

	queryAudioInput := dialogflowpb.QueryInput_AudioConfig{AudioConfig: &audioConfig}

	audioBytes, err := ioutil.ReadFile(audioFile)
	if err != nil {
		return "", err
	}

	queryInput := dialogflowpb.QueryInput{Input: &queryAudioInput}
	request := dialogflowpb.DetectIntentRequest{Session: sessionPath, QueryInput: &queryInput, InputAudio: audioBytes}

	response, err := sessionClient.DetectIntent(ctx, &request)
	if err != nil {
		return "", err
	}

	queryResult := response.GetQueryResult()
	fulfillmentText := queryResult.GetFulfillmentText()
	return fulfillmentText, nil
}

Java


import com.google.api.gax.rpc.ApiException;
import com.google.cloud.dialogflow.v2.AudioEncoding;
import com.google.cloud.dialogflow.v2.DetectIntentRequest;
import com.google.cloud.dialogflow.v2.DetectIntentResponse;
import com.google.cloud.dialogflow.v2.InputAudioConfig;
import com.google.cloud.dialogflow.v2.QueryInput;
import com.google.cloud.dialogflow.v2.QueryResult;
import com.google.cloud.dialogflow.v2.SessionName;
import com.google.cloud.dialogflow.v2.SessionsClient;
import com.google.protobuf.ByteString;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;

public class DetectIntentAudio {

  // DialogFlow API Detect Intent sample with audio files.
  public static QueryResult detectIntentAudio(
      String projectId, String audioFilePath, String sessionId, String languageCode)
      throws IOException, ApiException {
    // Instantiates a client
    try (SessionsClient sessionsClient = SessionsClient.create()) {
      // Set the session name using the sessionId (UUID) and projectID (my-project-id)
      SessionName session = SessionName.of(projectId, sessionId);
      System.out.println("Session Path: " + session.toString());

      // Note: hard coding audioEncoding and sampleRateHertz for simplicity.
      // Audio encoding of the audio content sent in the query request.
      AudioEncoding audioEncoding = AudioEncoding.AUDIO_ENCODING_LINEAR_16;
      int sampleRateHertz = 16000;

      // Instructs the speech recognizer how to process the audio content.
      InputAudioConfig inputAudioConfig =
          InputAudioConfig.newBuilder()
              .setAudioEncoding(
                  audioEncoding) // audioEncoding = AudioEncoding.AUDIO_ENCODING_LINEAR_16
              .setLanguageCode(languageCode) // languageCode = "en-US"
              .setSampleRateHertz(sampleRateHertz) // sampleRateHertz = 16000
              .build();

      // Build the query with the InputAudioConfig
      QueryInput queryInput = QueryInput.newBuilder().setAudioConfig(inputAudioConfig).build();

      // Read the bytes from the audio file
      byte[] inputAudio = Files.readAllBytes(Paths.get(audioFilePath));

      // Build the DetectIntentRequest
      DetectIntentRequest request =
          DetectIntentRequest.newBuilder()
              .setSession(session.toString())
              .setQueryInput(queryInput)
              .setInputAudio(ByteString.copyFrom(inputAudio))
              .build();

      // Performs the detect intent request
      DetectIntentResponse response = sessionsClient.detectIntent(request);

      // Display the query result
      QueryResult queryResult = response.getQueryResult();
      System.out.println("====================");
      System.out.format("Query Text: '%s'\n", queryResult.getQueryText());
      System.out.format(
          "Detected Intent: %s (confidence: %f)\n",
          queryResult.getIntent().getDisplayName(), queryResult.getIntentDetectionConfidence());
      System.out.format("Fulfillment Text: '%s'\n", queryResult.getFulfillmentText());

      return queryResult;
    }
  }
}

Node.js

const fs = require('fs');
const util = require('util');
const {struct} = require('pb-util');
// Imports the Dialogflow library
const dialogflow = require('@google-cloud/dialogflow');

// Instantiates a session client
const sessionClient = new dialogflow.SessionsClient();

// The path to identify the agent that owns the created intent.
const sessionPath = sessionClient.projectAgentSessionPath(
  projectId,
  sessionId
);

// Read the content of the audio file and send it as part of the request.
const readFile = util.promisify(fs.readFile);
const inputAudio = await readFile(filename);
const request = {
  session: sessionPath,
  queryInput: {
    audioConfig: {
      audioEncoding: encoding,
      sampleRateHertz: sampleRateHertz,
      languageCode: languageCode,
    },
  },
  inputAudio: inputAudio,
};

// Recognizes the speech in the audio and detects its intent.
const [response] = await sessionClient.detectIntent(request);

console.log('Detected intent:');
const result = response.queryResult;
// Instantiates a context client
const contextClient = new dialogflow.ContextsClient();

console.log(`  Query: ${result.queryText}`);
console.log(`  Response: ${result.fulfillmentText}`);
if (result.intent) {
  console.log(`  Intent: ${result.intent.displayName}`);
} else {
  console.log('  No intent matched.');
}
const parameters = JSON.stringify(struct.decode(result.parameters));
console.log(`  Parameters: ${parameters}`);
if (result.outputContexts && result.outputContexts.length) {
  console.log('  Output contexts:');
  result.outputContexts.forEach(context => {
    const contextId = contextClient.matchContextFromProjectAgentSessionContextName(
      context.name
    );
    const contextParameters = JSON.stringify(
      struct.decode(context.parameters)
    );
    console.log(`    ${contextId}`);
    console.log(`      lifespan: ${context.lifespanCount}`);
    console.log(`      parameters: ${contextParameters}`);
  });
}

PHP

namespace Google\Cloud\Samples\Dialogflow;

use Google\Cloud\Dialogflow\V2\SessionsClient;
use Google\Cloud\Dialogflow\V2\AudioEncoding;
use Google\Cloud\Dialogflow\V2\InputAudioConfig;
use Google\Cloud\Dialogflow\V2\QueryInput;

/**
* Returns the result of detect intent with an audio file as input.
* Using the same `session_id` between requests allows continuation
* of the conversation.
*/
function detect_intent_audio($projectId, $path, $sessionId, $languageCode = 'en-US')
{
    // new session
    $sessionsClient = new SessionsClient();
    $session = $sessionsClient->sessionName($projectId, $sessionId ?: uniqid());
    printf('Session path: %s' . PHP_EOL, $session);

    // load audio file
    $inputAudio = file_get_contents($path);

    // hard coding audio_encoding and sample_rate_hertz for simplicity
    $audioConfig = new InputAudioConfig();
    $audioConfig->setAudioEncoding(AudioEncoding::AUDIO_ENCODING_LINEAR_16);
    $audioConfig->setLanguageCode($languageCode);
    $audioConfig->setSampleRateHertz(16000);

    // create query input
    $queryInput = new QueryInput();
    $queryInput->setAudioConfig($audioConfig);

    // get response and relevant info
    $response = $sessionsClient->detectIntent($session, $queryInput, ['inputAudio' => $inputAudio]);
    $queryResult = $response->getQueryResult();
    $queryText = $queryResult->getQueryText();
    $intent = $queryResult->getIntent();
    $displayName = $intent->getDisplayName();
    $confidence = $queryResult->getIntentDetectionConfidence();
    $fulfilmentText = $queryResult->getFulfillmentText();

    // output relevant info
    print(str_repeat("=", 20) . PHP_EOL);
    printf('Query text: %s' . PHP_EOL, $queryText);
    printf('Detected intent: %s (confidence: %f)' . PHP_EOL, $displayName,
        $confidence);
    print(PHP_EOL);
    printf('Fulfilment text: %s' . PHP_EOL, $fulfilmentText);

    $sessionsClient->close();
}

Python

def detect_intent_audio(project_id, session_id, audio_file_path,
                        language_code):
    """Returns the result of detect intent with an audio file as input.

    Using the same `session_id` between requests allows continuation
    of the conversation."""
    import dialogflow_v2 as dialogflow

    session_client = dialogflow.SessionsClient()

    # Note: hard coding audio_encoding and sample_rate_hertz for simplicity.
    audio_encoding = dialogflow.enums.AudioEncoding.AUDIO_ENCODING_LINEAR_16
    sample_rate_hertz = 16000

    session = session_client.session_path(project_id, session_id)
    print('Session path: {}\n'.format(session))

    with open(audio_file_path, 'rb') as audio_file:
        input_audio = audio_file.read()

    audio_config = dialogflow.types.InputAudioConfig(
        audio_encoding=audio_encoding, language_code=language_code,
        sample_rate_hertz=sample_rate_hertz)
    query_input = dialogflow.types.QueryInput(audio_config=audio_config)

    response = session_client.detect_intent(
        session=session, query_input=query_input,
        input_audio=input_audio)

    print('=' * 20)
    print('Query text: {}'.format(response.query_result.query_text))
    print('Detected intent: {} (confidence: {})\n'.format(
        response.query_result.intent.display_name,
        response.query_result.intent_detection_confidence))
    print('Fulfillment text: {}\n'.format(
        response.query_result.fulfillment_text))

Ruby

# project_id = "Your Google Cloud project ID"
# session_id = "mysession"
# audio_file_path = "resources/book_a_room.wav"
# language_code = "en-US"

require "google/cloud/dialogflow"

session_client = Google::Cloud::Dialogflow.sessions
session = session_client.session_path project: project_id,
                                      session: session_id
puts "Session path: #{session}"

begin
  audio_file = File.open audio_file_path, "rb"
  input_audio = audio_file.read
ensure
  audio_file.close
end

audio_config = {
  audio_encoding:    :AUDIO_ENCODING_LINEAR_16,
  sample_rate_hertz: 16_000,
  language_code:     language_code
}

query_input = { audio_config: audio_config }

response = session_client.detect_intent session:     session,
                                        query_input: query_input,
                                        input_audio: input_audio
query_result = response.query_result

puts "Query text:        #{query_result.query_text}"
puts "Intent detected:   #{query_result.intent.display_name}"
puts "Intent confidence: #{query_result.intent_detection_confidence}"
puts "Fulfillment text:  #{query_result.fulfillment_text}"