Detecting Intent from a Stream

The following examples show how to detect intent by streaming audio to a Dialogflow agent and receiving results as the audio is processed.

Before you begin, set up your GCP project and authentication, and create an agent.

Import the example file to your agent

Importing will add intents and entities to your agent. If any existing intents or entities have the same name as those in the imported file, they will be replaced.

To import the file, follow these steps (a programmatic alternative is sketched after the list):

  1. Download the RoomReservation.zip file
  2. Go to the Dialogflow Console
  3. Select your agent
  4. Click the settings button next to the agent name
  5. Select the Export and Import tab
  6. Select Import From Zip and import the zip file that you downloaded
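If you prefer to script this step, the Dialogflow API also exposes the import as a long-running ImportAgent operation. Below is a minimal sketch using the Go client; the importAgentZip helper and the exact request field names are illustrative assumptions to verify against your client library version.

// Sketch only: a programmatic equivalent of the console's "Import From Zip",
// assuming the Go client's AgentsClient.ImportAgent long-running operation.
package main

import (
	"context"
	"io/ioutil"
	"log"

	dialogflow "cloud.google.com/go/dialogflow/apiv2"
	dialogflowpb "google.golang.org/genproto/googleapis/cloud/dialogflow/v2"
)

func importAgentZip(projectID, zipPath string) error {
	ctx := context.Background()
	client, err := dialogflow.NewAgentsClient(ctx)
	if err != nil {
		return err
	}
	defer client.Close()

	content, err := ioutil.ReadFile(zipPath) // e.g. RoomReservation.zip
	if err != nil {
		return err
	}

	// Like the console import, this adds intents and entities, replacing
	// any existing ones with the same name.
	op, err := client.ImportAgent(ctx, &dialogflowpb.ImportAgentRequest{
		Parent: "projects/" + projectID,
		Agent:  &dialogflowpb.ImportAgentRequest_AgentContent{AgentContent: content},
	})
	if err != nil {
		return err
	}
	return op.Wait(ctx) // wait for the long-running import to finish
}

func main() {
	if err := importAgentZip("my-project-id", "RoomReservation.zip"); err != nil {
		log.Fatal(err)
	}
}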

Detect intent
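In each sample below, the client opens a bidirectional stream. The first StreamingDetectIntentRequest carries only the session name and the audio configuration; every subsequent request carries a chunk of raw audio. Responses arrive as the audio is processed: intermediate messages contain a recognition result with the running transcript, and the final message contains the query result with the detected intent and fulfillment text.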

Go

import (
	"context"
	"fmt"
	"io"
	"log"
	"os"

	dialogflow "cloud.google.com/go/dialogflow/apiv2"
	dialogflowpb "google.golang.org/genproto/googleapis/cloud/dialogflow/v2"
)

func DetectIntentStream(projectID, sessionID, audioFile, languageCode string) (string, error) {
	ctx := context.Background()

	sessionClient, err := dialogflow.NewSessionsClient(ctx)
	if err != nil {
		return "", err
	}
	defer sessionClient.Close()

	if projectID == "" || sessionID == "" {
		return "", errors.New(fmt.Sprintf("Received empty project (%s) or session (%s)", projectID, sessionID))
	}

	sessionPath := fmt.Sprintf("projects/%s/agent/sessions/%s", projectID, sessionID)

	// In this example, we hard code the encoding and sample rate for simplicity.
	audioConfig := dialogflowpb.InputAudioConfig{AudioEncoding: dialogflowpb.AudioEncoding_AUDIO_ENCODING_LINEAR_16, SampleRateHertz: 16000, LanguageCode: languageCode}

	queryAudioInput := dialogflowpb.QueryInput_AudioConfig{AudioConfig: &audioConfig}

	queryInput := dialogflowpb.QueryInput{Input: &queryAudioInput}

	streamer, err := sessionClient.StreamingDetectIntent(ctx)
	if err != nil {
		return "", err
	}

	f, err := os.Open(audioFile)
	if err != nil {
		return "", err
	}

	defer f.Close()

	go func() {
		audioBytes := make([]byte, 1024)

		// The first request carries only the session and the audio configuration.
		request := dialogflowpb.StreamingDetectIntentRequest{Session: sessionPath, QueryInput: &queryInput}
		if err := streamer.Send(&request); err != nil {
			log.Fatal(err)
		}

		// All following requests carry raw audio chunks. Send only the bytes
		// actually read from the file, not the whole buffer.
		for {
			n, err := f.Read(audioBytes)
			if err == io.EOF {
				streamer.CloseSend()
				break
			}
			if err != nil {
				log.Fatal(err)
			}

			request = dialogflowpb.StreamingDetectIntentRequest{InputAudio: audioBytes[:n]}
			if err := streamer.Send(&request); err != nil {
				log.Fatal(err)
			}
		}
	}()

	var queryResult *dialogflowpb.QueryResult

	for {
		response, err := streamer.Recv()
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatal(err)
		}

		// Intermediate responses contain a recognition result; the final
		// response contains the query result.
		if result := response.GetRecognitionResult(); result != nil {
			log.Printf("Recognition transcript: %s\n", result.GetTranscript())
		}

		queryResult = response.GetQueryResult()
	}

	fulfillmentText := queryResult.GetFulfillmentText()
	return fulfillmentText, nil
}
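For reference, a minimal caller for the function above might look like this; the project ID, session ID, and audio file name are placeholder values, and the audio must match the hard-coded 16 kHz linear16 configuration.

func main() {
	// Placeholder values; supply your own project, session, and audio file.
	fulfillmentText, err := DetectIntentStream("my-project-id", "my-session-id", "book_a_room.raw", "en-US")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("Fulfillment text: %s\n", fulfillmentText)
}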

Java

/**
 * Returns the result of detect intent with streaming audio as input.
 *
 * Using the same `session_id` between requests allows continuation of the conversation.
 *
 * @param projectId     Project/Agent Id.
 * @param audioFilePath The audio file to be processed.
 * @param sessionId     Identifier of the DetectIntent session.
 * @param languageCode  Language code of the query.
 * @return The list of StreamingDetectIntentResponses for the streamed audio input.
 */
public static List<StreamingDetectIntentResponse> detectIntentStream(
    String projectId,
    String audioFilePath,
    String sessionId,
    String languageCode) throws Throwable {
  // Start bi-directional StreamingDetectIntent stream.
  final CountDownLatch notification = new CountDownLatch(1);
  final List<Throwable> responseThrowables = new ArrayList<>();
  final List<StreamingDetectIntentResponse> responses = new ArrayList<>();

  // Instantiates a client
  try (SessionsClient sessionsClient = SessionsClient.create()) {
    // Set the session name using the sessionId (UUID) and projectID (my-project-id)
    SessionName session = SessionName.of(projectId, sessionId);
    System.out.println("Session Path: " + session.toString());

    // Note: hard coding audioEncoding and sampleRateHertz for simplicity.
    // Audio encoding of the audio content sent in the query request.
    AudioEncoding audioEncoding = AudioEncoding.AUDIO_ENCODING_LINEAR_16;
    int sampleRateHertz = 16000;

    // Instructs the speech recognizer how to process the audio content.
    InputAudioConfig inputAudioConfig = InputAudioConfig.newBuilder()
        .setAudioEncoding(audioEncoding) // audioEncoding = AudioEncoding.AUDIO_ENCODING_LINEAR_16
        .setLanguageCode(languageCode) // languageCode = "en-US"
        .setSampleRateHertz(sampleRateHertz) // sampleRateHertz = 16000
        .build();

    ApiStreamObserver<StreamingDetectIntentResponse> responseObserver =
        new ApiStreamObserver<StreamingDetectIntentResponse>() {
          @Override
          public void onNext(StreamingDetectIntentResponse response) {
            // Do something when a response is received.
            responses.add(response);
          }

          @Override
          public void onError(Throwable t) {
            // Add error-handling
            responseThrowables.add(t);
          }

          @Override
          public void onCompleted() {
            // Do something when complete.
            notification.countDown();
          }
        };

    // Performs the streaming detect intent callable request
    ApiStreamObserver<StreamingDetectIntentRequest> requestObserver =
        sessionsClient.streamingDetectIntentCallable().bidiStreamingCall(responseObserver);

    // Build the query with the InputAudioConfig
    QueryInput queryInput = QueryInput.newBuilder().setAudioConfig(inputAudioConfig).build();

    try (FileInputStream audioStream = new FileInputStream(audioFilePath)) {
      // The first request contains the configuration
      StreamingDetectIntentRequest request = StreamingDetectIntentRequest.newBuilder()
          .setSession(session.toString())
          .setQueryInput(queryInput)
          .build();

      // Make the first request
      requestObserver.onNext(request);

      // Following messages: audio chunks. We just read the file in fixed-size chunks. In reality
      // you would split the user input by time.
      byte[] buffer = new byte[4096];
      int bytes;
      while ((bytes = audioStream.read(buffer)) != -1) {
        requestObserver.onNext(
            StreamingDetectIntentRequest.newBuilder()
                .setInputAudio(ByteString.copyFrom(buffer, 0, bytes))
                .build());
      }
    } catch (RuntimeException e) {
      // Cancel stream.
      requestObserver.onError(e);
    }
    // Half-close the stream.
    requestObserver.onCompleted();
    // Wait for the final response (without explicit timeout).
    notification.await();
    // Process errors/responses.
    if (!responseThrowables.isEmpty()) {
      throw responseThrowables.get(0);
    }
    if (responses.isEmpty()) {
      throw new RuntimeException("No response from Dialogflow.");
    }

    for (StreamingDetectIntentResponse response : responses) {
      if (response.hasRecognitionResult()) {
        System.out.format(
            "Intermediate transcript: '%s'\n", response.getRecognitionResult().getTranscript());
      }
    }

    // Display the last query result
    QueryResult queryResult = responses.get(responses.size() - 1).getQueryResult();
    System.out.println("====================");
    System.out.format("Query Text: '%s'\n", queryResult.getQueryText());
    System.out.format("Detected Intent: %s (confidence: %f)\n",
        queryResult.getIntent().getDisplayName(), queryResult.getIntentDetectionConfidence());
    System.out.format("Fulfillment Text: '%s'\n", queryResult.getFulfillmentText());

    return responses;
  }
}
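Note the concurrency pattern here: requests are written on the calling thread while responses arrive asynchronously on the ApiStreamObserver, so the CountDownLatch blocks until onCompleted fires and the collected responses can be read safely.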

Node.js

// Imports the Dialogflow library and the stream helpers used below
const dialogflow = require('dialogflow');
const fs = require('fs');
const pump = require('pump');
const through2 = require('through2');

// Instantiates a session client
const sessionClient = new dialogflow.SessionsClient();

// The path to the local file on which to perform speech recognition, e.g.
// /path/to/audio.raw
// const filename = '/path/to/audio.raw';

// The encoding of the audio file, e.g. 'AUDIO_ENCODING_LINEAR_16'
// const encoding = 'AUDIO_ENCODING_LINEAR_16';

// The sample rate of the audio file in hertz, e.g. 16000
// const sampleRateHertz = 16000;

// The BCP-47 language code to use, e.g. 'en-US'
// const languageCode = 'en-US';
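// The project and session IDs, e.g.
// const projectId = 'my-project-id';
// const sessionId = 'my-session-id';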
const sessionPath = sessionClient.sessionPath(projectId, sessionId);

const initialStreamRequest = {
  session: sessionPath,
  queryInput: {
    audioConfig: {
      audioEncoding: encoding,
      sampleRateHertz: sampleRateHertz,
      languageCode: languageCode,
    },
    singleUtterance: true,
  },
};

// Create a stream for the streaming request.
const detectStream = sessionClient
  .streamingDetectIntent()
  .on('error', console.error)
  .on('data', data => {
    if (data.recognitionResult) {
      console.log(
        `Intermediate transcript: ${data.recognitionResult.transcript}`
      );
    } else {
      console.log(`Detected intent:`);
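      // logQueryResult is a helper defined elsewhere in the full sample;
      // it prints details of the final query result.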
      logQueryResult(sessionClient, data.queryResult);
    }
  });

// Write the initial stream request to config for audio input.
detectStream.write(initialStreamRequest);

// Stream an audio file from disk to the Dialogflow API, e.g.
// "./resources/audio.raw"
pump(
  fs.createReadStream(filename),
  // Format the audio stream into the request format.
  through2.obj((obj, _, next) => {
    next(null, {inputAudio: obj});
  }),
  detectStream
);
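Here pump pipes the file stream through a through2 object-mode transform, which wraps each raw audio chunk as an {inputAudio: ...} request, and into the detect stream, propagating errors and cleanup across all three streams.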

PHP

namespace Google\Cloud\Samples\Dialogflow;

use Google\Cloud\Dialogflow\V2\SessionsClient;
use Google\Cloud\Dialogflow\V2\AudioEncoding;
use Google\Cloud\Dialogflow\V2\InputAudioConfig;
use Google\Cloud\Dialogflow\V2\QueryInput;
use Google\Cloud\Dialogflow\V2\StreamingDetectIntentRequest;

/**
* Returns the result of detect intent with streaming audio as input.
* Using the same `session_id` between requests allows continuation
* of the conversation.
*/
function detect_intent_stream($projectId, $path, $sessionId, $languageCode = 'en-US')
{
    // need to use gRPC
    if (!defined('Grpc\STATUS_OK')) {
        throw new \Exception('Install the grpc extension ' .
            '(pecl install grpc)');
    }

    // new session
    $sessionsClient = new SessionsClient();
    $session = $sessionsClient->sessionName($projectId, $sessionId ?: uniqid());
    printf('Session path: %s' . PHP_EOL, $session);

    // hard coding audio_encoding and sample_rate_hertz for simplicity
    $audioConfig = new InputAudioConfig();
    $audioConfig->setAudioEncoding(AudioEncoding::AUDIO_ENCODING_LINEAR_16);
    $audioConfig->setLanguageCode($languageCode);
    $audioConfig->setSampleRateHertz(16000);

    // create query input
    $queryInput = new QueryInput();
    $queryInput->setAudioConfig($audioConfig);

    // first request contains the configuration
    $request = new StreamingDetectIntentRequest();
    $request->setSession($session);
    $request->setQueryInput($queryInput);
    $requests = [$request];

    // we are going to read small chunks of audio data from
    // a local audio file. in practice, these chunks should
    // come from an audio input device.
    $audioStream = fopen($path, 'rb');
    while (true) {
        $chunk = stream_get_contents($audioStream, 4096);
        if (!$chunk) {
            break;
        }
        $request = new StreamingDetectIntentRequest();
        $request->setInputAudio($chunk);
        $requests[] = $request;
    }

    // intermediate transcript info
    print(PHP_EOL . str_repeat("=", 20) . PHP_EOL);
    $stream = $sessionsClient->streamingDetectIntent();
    foreach ($requests as $request) {
        $stream->write($request);
    }
    foreach ($stream->closeWriteAndReadAll() as $response) {
        $recognitionResult = $response->getRecognitionResult();
        if ($recognitionResult) {
            $transcript = $recognitionResult->getTranscript();
            printf('Intermediate transcript: %s' . PHP_EOL, $transcript);
        }
    }

    // get final response and relevant info
    if ($response) {
        print(str_repeat("=", 20) . PHP_EOL);
        $queryResult = $response->getQueryResult();
        $queryText = $queryResult->getQueryText();
        $intent = $queryResult->getIntent();
        $displayName = $intent->getDisplayName();
        $confidence = $queryResult->getIntentDetectionConfidence();
        $fulfillmentText = $queryResult->getFulfillmentText();

        // output relevant info
        printf('Query text: %s' . PHP_EOL, $queryText);
        printf('Detected intent: %s (confidence: %f)' . PHP_EOL, $displayName,
            $confidence);
        print(PHP_EOL);
        printf('Fulfillment text: %s' . PHP_EOL, $fulfillmentText);
    }

    $sessionsClient->close();
}
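Note that this sample buffers all requests in the $requests array before opening the stream, which works for a short prerecorded file; for live input from a microphone you would write each chunk to the stream as it is captured.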

Python

def detect_intent_stream(project_id, session_id, audio_file_path,
                         language_code):
    """Returns the result of detect intent with streaming audio as input.

    Using the same `session_id` between requests allows continuation
    of the conversation."""
    import dialogflow_v2 as dialogflow
    session_client = dialogflow.SessionsClient()

    # Note: hard coding audio_encoding and sample_rate_hertz for simplicity.
    audio_encoding = dialogflow.enums.AudioEncoding.AUDIO_ENCODING_LINEAR_16
    sample_rate_hertz = 16000

    session_path = session_client.session_path(project_id, session_id)
    print('Session path: {}\n'.format(session_path))

    def request_generator(audio_config, audio_file_path):
        query_input = dialogflow.types.QueryInput(audio_config=audio_config)

        # The first request contains the configuration.
        yield dialogflow.types.StreamingDetectIntentRequest(
            session=session_path, query_input=query_input)

        # Here we are reading small chunks of audio data from a local
        # audio file.  In practice these chunks should come from
        # an audio input device.
        with open(audio_file_path, 'rb') as audio_file:
            while True:
                chunk = audio_file.read(4096)
                if not chunk:
                    break
                # The later requests contain audio data.
                yield dialogflow.types.StreamingDetectIntentRequest(
                    input_audio=chunk)

    audio_config = dialogflow.types.InputAudioConfig(
        audio_encoding=audio_encoding, language_code=language_code,
        sample_rate_hertz=sample_rate_hertz)

    requests = request_generator(audio_config, audio_file_path)
    responses = session_client.streaming_detect_intent(requests)

    print('=' * 20)
    for response in responses:
        print('Intermediate transcript: "{}".'.format(
                response.recognition_result.transcript))

    # Note: The result from the last response is the final transcript along
    # with the detected content.
    query_result = response.query_result

    print('=' * 20)
    print('Query text: {}'.format(query_result.query_text))
    print('Detected intent: {} (confidence: {})\n'.format(
        query_result.intent.display_name,
        query_result.intent_detection_confidence))
    print('Fulfillment text: {}\n'.format(
        query_result.fulfillment_text))
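Because request_generator is a generator, requests are produced lazily: the client library pulls the next chunk only as the stream is ready to send it, which is why the same pattern works unchanged with a live audio source.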

Ruby

# project_id = "Your Google Cloud project ID"
# session_id = "mysession"
# audio_file_path = "resources/book_a_room.wav"
# language_code = "en-US"

require "google/cloud/dialogflow"
require "monitor"

session_client = Google::Cloud::Dialogflow::Sessions.new
session = session_client.class.session_path project_id, session_id
puts "Session path: #{session}"

audio_config = {
  audio_encoding: :AUDIO_ENCODING_LINEAR_16,
  sample_rate_hertz: 16000,
  language_code: language_code
}
query_input = { audio_config: audio_config }
streaming_config = { session: session, query_input: query_input }

# To signal the main thread when all responses have been processed
completed = false

# Use session_client as the sentinel to signal the end of queue
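# (EnumeratorQueue is a helper class defined in the full sample; it exposes
# a thread-safe queue as an Enumerator for the streaming call.)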
request_queue = EnumeratorQueue.new(session_client)

# The first request needs to be the configuration.
request_queue.push(streaming_config)

# Consume the queue and process responses in a separate thread
Thread.new do
  session_client.streaming_detect_intent(request_queue.each_item).each do |response|
    if response.recognition_result
      puts "Intermediate transcript: #{response.recognition_result.transcript}\n"
    else
      # the last response has the actual query result
      query_result = response.query_result
      puts "Query text:        #{query_result.query_text}"
      puts "Intent detected:   #{query_result.intent.display_name}"
      puts "Intent confidence: #{query_result.intent_detection_confidence}"
      puts "Fulfillment text:  #{query_result.fulfillment_text}\n"
    end
  end
  completed = true
end

# Meanwhile, the main thread adds chunks of audio data to the queue
begin
  audio_file = File.open(audio_file_path, "rb")
  while true
    chunk = audio_file.read 4096
    break unless chunk
    request_queue.push({ input_audio: chunk })
    sleep 0.5
  end
ensure
  audio_file.close
  # push the sentinel session_client to end the streaming queue
  request_queue.push(session_client)
end

# Do not exit the main thread until the processing thread has completed
sleep 1 until completed
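The sample uses a simple producer/consumer split: the main thread pushes audio chunks (throttled with sleep to approximate real-time capture) while a background thread consumes the streaming responses, and the session_client object serves as a sentinel that ends the request enumeration.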
