Detecting Intent from Audio

This page shows how to detect intent by sending audio input to a Dialogflow agent.

Set up your GCP project and authentication

Create an agent

Import the example file to your agent

Importing will add intents and entities to your agent. If any existing intents or entities have the same name as those in the imported file, they will be replaced.

To import the file, follow these steps:

  1. Download the RoomReservation.zip file
  2. Go to the Dialogflow Console
  3. Select your agent
  4. Click the settings button next to the agent name
  5. Select the Export and Import tab
  6. Select Import From Zip and import the zip file that you downloaded

Detect intent

curl command

After you have imported the sample agent, you can pass audio user input to the detectIntent method to determine what the user has requested and which action to take. The audio must be base64 encoded. For information on how to encode and decode audio using base64 encoding, see Embedding Base64 encoded audio in the Cloud Speech API documentation.
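
For example, a minimal Python sketch (assuming a local recording named book_a_room.wav, a placeholder file name) produces the string to use in the inputAudio field:

    import base64

    # Placeholder file name; substitute your own recording.
    with open('book_a_room.wav', 'rb') as audio_file:
        input_audio = base64.b64encode(audio_file.read()).decode('utf-8')

    # Paste this value into the "inputAudio" field of the request body.
    print(input_audio)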

Follow these steps to determine the intent from base64-encoded audio. In this example, the audio says "book a room".

  1. Use the following curl command to call the detectIntent method, passing the base64-encoded audio in the inputAudio field. Replace project-id with your Google Cloud project ID. The 123456789 in the URL is an arbitrary session ID; reusing the same session ID between requests continues the conversation.

    curl \
    -H "Authorization: Bearer "$(gcloud auth application-default print-access-token) \
    -H "Content-Type: application/json; charset=utf-8" \
    --data "{
      'queryInput': {
        'audioConfig': {
          'languageCode': 'en-US'
        }
      },
      'inputAudio': 'UklGRkp5AABXQVZFZm10IBAAAAABAAEAgD4AAAB9AAACABAARkxMUsw...'
    }" "https://dialogflow.googleapis.com/v2/projects/project-id/agent/sessions/123456789:detectIntent"
    

    You should see a response similar to the following. Notice that the value of the queryResult.action field is "room.reservation", and that each entry in queryResult.fulfillmentMessages (the queryResult.fulfillmentMessages[i].text.text[0] values) asks the user for more information.

    {
      "responseId": "3c1e5a89-75b9-4c3f-b63d-4b1351dd5e32",
      "queryResult": {
        "queryText": "book a room",
        "action": "room.reservation",
        "parameters": {
          "time": "",
          "date": "",
          "guests": "",
          "duration": "",
          "location": ""
        },
        "fulfillmentText": "I can help with that. Where would you like to reserve a room?",
        "fulfillmentMessages": [
          {
            "text": {
              "text": [
                "I can help with that. Where would you like to reserve a room?"
              ]
            },
            "platform": "FACEBOOK"
          },
          {
            "text": {
              "text": [
                "I can help with that. Where would you like to reserve a room?"
              ]
            }
          }
        ],
        "outputContexts": [
          {
            "name": "projects/myproject/agent/sessions/123456789/contexts/e8f6a63e-73da-4a1a-8bfc-857183f71228_id_dialog_context",
            "lifespanCount": 2,
            "parameters": {
              "date": "",
              "guests": "",
              "duration": "",
              "location.original": "",
              "guests.original": "",
              "location": "",
              "date.original": "",
              "time.original": "",
              "time": "",
              "duration.original": ""
            }
          },
          {
            "name": "projects/myproject/agent/sessions/123456789/contexts/room_reservation_dialog_params_location",
            "lifespanCount": 1,
            "parameters": {
              "date.original": "",
              "time.original": "",
              "time": "",
              "duration.original": "",
              "date": "",
              "guests": "",
              "duration": "",
              "location.original": "",
              "guests.original": "",
              "location": ""
            }
          },
          {
            "name": "projects/myproject/agent/sessions/123456789/contexts/room_reservation_dialog_context",
            "lifespanCount": 2,
            "parameters": {
              "time.original": "",
              "time": "",
              "duration.original": "",
              "date": "",
              "guests": "",
              "duration": "",
              "location.original": "",
              "guests.original": "",
              "location": "",
              "date.original": ""
            }
          }
        ],
        "intent": {
          "name": "projects/myproject/agent/intents/e8f6a63e-73da-4a1a-8bfc-857183f71228",
          "displayName": "room.reservation"
        },
        "intentDetectionConfidence": 1,
        "diagnosticInfo": {},
        "languageCode": "en-us"
      }
    }
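
    To work with these fields programmatically, here is a minimal Python sketch; it assumes you saved the curl response above to a file named response.json (a hypothetical file name):

    import json

    # Load the detectIntent response captured from the curl call above.
    with open('response.json') as f:
        query_result = json.load(f)['queryResult']

    print(query_result['action'])  # "room.reservation"
    for message in query_result['fulfillmentMessages']:
        print(message['text']['text'][0])  # asks the user for the location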
    

Go

import (
	"context"
	"fmt"
	"io/ioutil"

	dialogflow "cloud.google.com/go/dialogflow/apiv2"
	dialogflowpb "google.golang.org/genproto/googleapis/cloud/dialogflow/v2"
)

func DetectIntentAudio(projectID, sessionID, audioFile, languageCode string) (string, error) {
	ctx := context.Background()

	sessionClient, err := dialogflow.NewSessionsClient(ctx)
	if err != nil {
		return "", err
	}
	defer sessionClient.Close()

	if projectID == "" || sessionID == "" {
		return "", errors.New(fmt.Sprintf("Received empty project (%s) or session (%s)", projectID, sessionID))
	}

	sessionPath := fmt.Sprintf("projects/%s/agent/sessions/%s", projectID, sessionID)

	// In this example, we hard code the encoding and sample rate for simplicity.
	audioConfig := dialogflowpb.InputAudioConfig{
		AudioEncoding:   dialogflowpb.AudioEncoding_AUDIO_ENCODING_LINEAR_16,
		SampleRateHertz: 16000,
		LanguageCode:    languageCode,
	}

	queryAudioInput := dialogflowpb.QueryInput_AudioConfig{AudioConfig: &audioConfig}

	audioBytes, err := ioutil.ReadFile(audioFile)
	if err != nil {
		return "", err
	}

	queryInput := dialogflowpb.QueryInput{Input: &queryAudioInput}
	request := dialogflowpb.DetectIntentRequest{Session: sessionPath, QueryInput: &queryInput, InputAudio: audioBytes}

	response, err := sessionClient.DetectIntent(ctx, &request)
	if err != nil {
		return "", err
	}

	queryResult := response.GetQueryResult()
	fulfillmentText := queryResult.GetFulfillmentText()
	return fulfillmentText, nil
}

Java

import com.google.cloud.dialogflow.v2.AudioEncoding;
import com.google.cloud.dialogflow.v2.DetectIntentRequest;
import com.google.cloud.dialogflow.v2.DetectIntentResponse;
import com.google.cloud.dialogflow.v2.InputAudioConfig;
import com.google.cloud.dialogflow.v2.QueryInput;
import com.google.cloud.dialogflow.v2.QueryResult;
import com.google.cloud.dialogflow.v2.SessionName;
import com.google.cloud.dialogflow.v2.SessionsClient;
import com.google.protobuf.ByteString;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Returns the result of detect intent with an audio file as input.
 *
 * Using the same `session_id` between requests allows continuation of the conversation.
 *
 * @param projectId     Project/Agent Id.
 * @param audioFilePath Path to the audio file.
 * @param sessionId     Identifier of the DetectIntent session.
 * @param languageCode  Language code of the query.
 * @return QueryResult for the request.
 */
public static QueryResult detectIntentAudio(
    String projectId,
    String audioFilePath,
    String sessionId,
    String languageCode)
    throws Exception {
  // Instantiates a client
  try (SessionsClient sessionsClient = SessionsClient.create()) {
    // Set the session name using the sessionId (UUID) and projectID (my-project-id)
    SessionName session = SessionName.of(projectId, sessionId);
    System.out.println("Session Path: " + session.toString());

    // Note: hard coding audioEncoding and sampleRateHertz for simplicity.
    // Audio encoding of the audio content sent in the query request.
    AudioEncoding audioEncoding = AudioEncoding.AUDIO_ENCODING_LINEAR_16;
    int sampleRateHertz = 16000;

    // Instructs the speech recognizer how to process the audio content.
    InputAudioConfig inputAudioConfig = InputAudioConfig.newBuilder()
        .setAudioEncoding(audioEncoding) // audioEncoding = AudioEncoding.AUDIO_ENCODING_LINEAR_16
        .setLanguageCode(languageCode) // languageCode = "en-US"
        .setSampleRateHertz(sampleRateHertz) // sampleRateHertz = 16000
        .build();

    // Build the query with the InputAudioConfig
    QueryInput queryInput = QueryInput.newBuilder().setAudioConfig(inputAudioConfig).build();

    // Read the bytes from the audio file
    byte[] inputAudio = Files.readAllBytes(Paths.get(audioFilePath));

    // Build the DetectIntentRequest
    DetectIntentRequest request = DetectIntentRequest.newBuilder()
        .setSession(session.toString())
        .setQueryInput(queryInput)
        .setInputAudio(ByteString.copyFrom(inputAudio))
        .build();

    // Performs the detect intent request
    DetectIntentResponse response = sessionsClient.detectIntent(request);

    // Display the query result
    QueryResult queryResult = response.getQueryResult();
    System.out.println("====================");
    System.out.format("Query Text: '%s'\n", queryResult.getQueryText());
    System.out.format("Detected Intent: %s (confidence: %f)\n",
        queryResult.getIntent().getDisplayName(), queryResult.getIntentDetectionConfidence());
    System.out.format("Fulfillment Text: '%s'\n", queryResult.getFulfillmentText());

    return queryResult;
  }
}

Node.js

// Imports the Dialogflow library, plus fs and util for reading the audio file
const dialogflow = require('dialogflow');
const fs = require('fs');
const util = require('util');

// TODO: set these values before running the sample
// const projectId = 'my-project-id';
// const sessionId = 'my-session-id';
// const filename = './resources/book_a_room.wav';
// const encoding = 'AUDIO_ENCODING_LINEAR_16';
// const sampleRateHertz = 16000;
// const languageCode = 'en-US';

// Instantiates a session client
const sessionClient = new dialogflow.SessionsClient();

// The path to identify the session within the agent.
const sessionPath = sessionClient.sessionPath(projectId, sessionId);

// Read the content of the audio file and send it as part of the request.
const readFile = util.promisify(fs.readFile);
const inputAudio = await readFile(filename);
const request = {
  session: sessionPath,
  queryInput: {
    audioConfig: {
      audioEncoding: encoding,
      sampleRateHertz: sampleRateHertz,
      languageCode: languageCode,
    },
  },
  inputAudio: inputAudio,
};

// Recognizes the speech in the audio and detects its intent.
const [response] = await sessionClient.detectIntent(request);

console.log('Detected intent:');
logQueryResult(sessionClient, response.queryResult);

PHP

namespace Google\Cloud\Samples\Dialogflow;

use Google\Cloud\Dialogflow\V2\SessionsClient;
use Google\Cloud\Dialogflow\V2\AudioEncoding;
use Google\Cloud\Dialogflow\V2\InputAudioConfig;
use Google\Cloud\Dialogflow\V2\QueryInput;

/**
* Returns the result of detect intent with an audio file as input.
* Using the same `session_id` between requests allows continuation
* of the conversation.
*/
function detect_intent_audio($projectId, $path, $sessionId, $languageCode = 'en-US')
{
    // new session
    $sessionsClient = new SessionsClient();
    $session = $sessionsClient->sessionName($projectId, $sessionId ?: uniqid());
    printf('Session path: %s' . PHP_EOL, $session);

    // load audio file
    $inputAudio = file_get_contents($path);

    // hard coding audio_encoding and sample_rate_hertz for simplicity
    $audioConfig = new InputAudioConfig();
    $audioConfig->setAudioEncoding(AudioEncoding::AUDIO_ENCODING_LINEAR_16);
    $audioConfig->setLanguageCode($languageCode);
    $audioConfig->setSampleRateHertz(16000);

    // create query input
    $queryInput = new QueryInput();
    $queryInput->setAudioConfig($audioConfig);

    // get response and relevant info
    $response = $sessionsClient->detectIntent($session, $queryInput, ['inputAudio' => $inputAudio]);
    $queryResult = $response->getQueryResult();
    $queryText = $queryResult->getQueryText();
    $intent = $queryResult->getIntent();
    $displayName = $intent->getDisplayName();
    $confidence = $queryResult->getIntentDetectionConfidence();
    $fulfillmentText = $queryResult->getFulfillmentText();

    // output relevant info
    print(str_repeat("=", 20) . PHP_EOL);
    printf('Query text: %s' . PHP_EOL, $queryText);
    printf('Detected intent: %s (confidence: %f)' . PHP_EOL, $displayName,
        $confidence);
    print(PHP_EOL);
    printf('Fulfillment text: %s' . PHP_EOL, $fulfillmentText);

    $sessionsClient->close();
}

Python

def detect_intent_audio(project_id, session_id, audio_file_path,
                        language_code):
    """Returns the result of detect intent with an audio file as input.

    Using the same `session_id` between requests allows continuation
    of the conversation."""
    import dialogflow_v2 as dialogflow

    session_client = dialogflow.SessionsClient()

    # Note: hard coding audio_encoding and sample_rate_hertz for simplicity.
    audio_encoding = dialogflow.enums.AudioEncoding.AUDIO_ENCODING_LINEAR_16
    sample_rate_hertz = 16000

    session = session_client.session_path(project_id, session_id)
    print('Session path: {}\n'.format(session))

    with open(audio_file_path, 'rb') as audio_file:
        input_audio = audio_file.read()

    audio_config = dialogflow.types.InputAudioConfig(
        audio_encoding=audio_encoding, language_code=language_code,
        sample_rate_hertz=sample_rate_hertz)
    query_input = dialogflow.types.QueryInput(audio_config=audio_config)

    response = session_client.detect_intent(
        session=session, query_input=query_input,
        input_audio=input_audio)

    print('=' * 20)
    print('Query text: {}'.format(response.query_result.query_text))
    print('Detected intent: {} (confidence: {})\n'.format(
        response.query_result.intent.display_name,
        response.query_result.intent_detection_confidence))
    print('Fulfillment text: {}\n'.format(
        response.query_result.fulfillment_text))
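
For example, you might call the function like this (the project ID, session ID, and audio path below are placeholders):

detect_intent_audio(
    'my-project-id',              # placeholder GCP project ID
    'my-session-id',              # any string; reuse it to continue the conversation
    'resources/book_a_room.wav',  # placeholder path to a 16 kHz linear PCM WAV file
    'en-US')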

Ruby

# project_id = "Your Google Cloud project ID"
# session_id = "mysession"
# audio_file_path = "resources/book_a_room.wav"
# language_code = "en-US"

require "google/cloud/dialogflow"

session_client = Google::Cloud::Dialogflow::Sessions.new
session = session_client.class.session_path project_id, session_id
puts "Session path: #{session}"

# Read the raw audio bytes from the file.
input_audio = File.binread audio_file_path

audio_config = {
  audio_encoding: :AUDIO_ENCODING_LINEAR_16,
  sample_rate_hertz: 16000,
  language_code: language_code
}

query_input = { audio_config: audio_config }

response = session_client.detect_intent session, query_input, input_audio: input_audio
query_result = response.query_result

puts "Query text:        #{query_result.query_text}"
puts "Intent detected:   #{query_result.intent.display_name}"
puts "Intent confidence: #{query_result.intent_detection_confidence}"
puts "Fulfillment text:  #{query_result.fulfillment_text}"
