오디오 스트림에서 인텐트 감지

이 페이지에서는 API를 사용하여 인텐트 감지 요청으로 오디오 입력을 스트리밍하는 방법을 보여줍니다. Dialogflow는 오디오를 처리하고 텍스트로 변환한 후 인텐트 일치를 시도합니다. 이 변환을 음성 인식, 음성 텍스트 변환 또는 STT라고 합니다.

GCP 프로젝트 및 인증 설정

에이전트 만들기

에이전트로 예제 파일 가져오기

가져오기를 수행하면 에이전트에 인텐트와 항목이 추가됩니다. 기존 인텐트 또는 항목이 가져온 파일에 있는 인텐트 또는 항목과 이름이 같으면 대체됩니다.

파일을 가져오려면 다음 단계를 따르세요.

  1. RoomReservation.zip 파일을 다운로드합니다.
  2. Dialogflow 콘솔로 이동합니다.
  3. 에이전트를 선택합니다.
  4. 에이전트 이름 옆의 설정 버튼을 클릭합니다.
  5. 내보내기 및 가져오기 탭을 선택합니다.
  6. ZIP 파일에서 가져오기를 선택하고 다운로드한 ZIP 파일을 가져옵니다.

인텐트 감지

Go

이 샘플은 SessionsClient를 사용하여 인텐트를 감지합니다.

func DetectIntentStream(projectID, sessionID, audioFile, languageCode string) (string, error) {
	ctx := context.Background()

	sessionClient, err := dialogflow.NewSessionsClient(ctx)
	if err != nil {
		return "", err
	}
	defer sessionClient.Close()

	if projectID == "" || sessionID == "" {
		return "", errors.New(fmt.Sprintf("Received empty project (%s) or session (%s)", projectID, sessionID))
	}

	sessionPath := fmt.Sprintf("projects/%s/agent/sessions/%s", projectID, sessionID)

	// In this example, we hard code the encoding and sample rate for simplicity.
	audioConfig := dialogflowpb.InputAudioConfig{AudioEncoding: dialogflowpb.AudioEncoding_AUDIO_ENCODING_LINEAR_16, SampleRateHertz: 16000, LanguageCode: languageCode}

	queryAudioInput := dialogflowpb.QueryInput_AudioConfig{AudioConfig: &audioConfig}

	queryInput := dialogflowpb.QueryInput{Input: &queryAudioInput}

	streamer, err := sessionClient.StreamingDetectIntent(ctx)
	if err != nil {
		return "", err
	}

	f, err := os.Open(audioFile)
	if err != nil {
		return "", err
	}

	defer f.Close()

	go func() {
		audioBytes := make([]byte, 1024)

		request := dialogflowpb.StreamingDetectIntentRequest{Session: sessionPath, QueryInput: &queryInput}
		err = streamer.Send(&request)
		if err != nil {
			log.Fatal(err)
		}

		for {
			_, err := f.Read(audioBytes)
			if err == io.EOF {
				streamer.CloseSend()
				break
			}
			if err != nil {
				log.Fatal(err)
			}

			request = dialogflowpb.StreamingDetectIntentRequest{InputAudio: audioBytes}
			err = streamer.Send(&request)
			if err != nil {
				log.Fatal(err)
			}
		}
	}()

	var queryResult *dialogflowpb.QueryResult

	for {
		response, err := streamer.Recv()
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatal(err)
		}

		recognitionResult := response.GetRecognitionResult()
		transcript := recognitionResult.GetTranscript()
		log.Printf("Recognition transcript: %s\n", transcript)

		queryResult = response.GetQueryResult()
	}

	fulfillmentText := queryResult.GetFulfillmentText()
	return fulfillmentText, nil
}

자바

이 샘플은 SessionsClient를 사용하여 인텐트를 감지합니다.

// Imports the Google Cloud client library
import com.google.api.gax.rpc.BidiStream;
import com.google.cloud.dialogflow.v2.AudioEncoding;
import com.google.cloud.dialogflow.v2.InputAudioConfig;
import com.google.cloud.dialogflow.v2.QueryInput;
import com.google.cloud.dialogflow.v2.QueryResult;
import com.google.cloud.dialogflow.v2.SessionName;
import com.google.cloud.dialogflow.v2.SessionsClient;
import com.google.cloud.dialogflow.v2.StreamingDetectIntentRequest;
import com.google.cloud.dialogflow.v2.StreamingDetectIntentResponse;
import com.google.protobuf.ByteString;
import java.io.FileInputStream;
import java.io.IOException;

/**
 * DialogFlow API Detect Intent sample with audio files processes as an audio stream.
 */
class DetectIntentStream {

  static void detectIntentStream(String projectId, String audioFilePath, String sessionId) {
    // String projectId = "YOUR_PROJECT_ID";
    // String audioFilePath = "path_to_your_audio_file";
    // Using the same `sessionId` between requests allows continuation of the conversation.
    // String sessionId = "Identifier of the DetectIntent session";

    // Instantiates a client
    try (SessionsClient sessionsClient = SessionsClient.create()) {
      // Set the session name using the sessionId (UUID) and projectID (my-project-id)
      SessionName session = SessionName.of(projectId, sessionId);

      // Instructs the speech recognizer how to process the audio content.
      // Note: hard coding audioEncoding and sampleRateHertz for simplicity.
      // Audio encoding of the audio content sent in the query request.
      InputAudioConfig inputAudioConfig = InputAudioConfig.newBuilder()
          .setAudioEncoding(AudioEncoding.AUDIO_ENCODING_LINEAR_16)
          .setLanguageCode("en-US") // languageCode = "en-US"
          .setSampleRateHertz(16000) // sampleRateHertz = 16000
          .build();

      // Build the query with the InputAudioConfig
      QueryInput queryInput = QueryInput.newBuilder().setAudioConfig(inputAudioConfig).build();

      // Create the Bidirectional stream
      BidiStream<StreamingDetectIntentRequest, StreamingDetectIntentResponse> bidiStream =
          sessionsClient.streamingDetectIntentCallable().call();

      // The first request must **only** contain the audio configuration:
      bidiStream.send(StreamingDetectIntentRequest.newBuilder()
          .setSession(session.toString())
          .setQueryInput(queryInput)
          .build());

      try (FileInputStream audioStream = new FileInputStream(audioFilePath)) {
        // Subsequent requests must **only** contain the audio data.
        // Following messages: audio chunks. We just read the file in fixed-size chunks. In reality
        // you would split the user input by time.
        byte[] buffer = new byte[4096];
        int bytes;
        while ((bytes = audioStream.read(buffer)) != -1) {
          bidiStream.send(
              StreamingDetectIntentRequest.newBuilder()
                  .setInputAudio(ByteString.copyFrom(buffer, 0, bytes))
                  .build());
        }
      }

      // Tell the service you are done sending data
      bidiStream.closeSend();

      for (StreamingDetectIntentResponse response : bidiStream) {
        QueryResult queryResult = response.getQueryResult();
        System.out.println("====================");
        System.out.format("Intent Display Name: %s\n", queryResult.getIntent().getDisplayName());
        System.out.format("Query Text: '%s'\n", queryResult.getQueryText());
        System.out.format("Detected Intent: %s (confidence: %f)\n",
            queryResult.getIntent().getDisplayName(), queryResult.getIntentDetectionConfidence());
        System.out.format("Fulfillment Text: '%s'\n", queryResult.getFulfillmentText());

      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}

Node.js

이 샘플은 SessionsClient를 사용하여 인텐트를 감지합니다.

// Imports the Dialogflow library
const dialogflow = require('dialogflow');

// Instantiates a session client
const sessionClient = new dialogflow.SessionsClient();

// The path to the local file on which to perform speech recognition, e.g.
// /path/to/audio.raw const filename = '/path/to/audio.raw';

// The encoding of the audio file, e.g. 'AUDIO_ENCODING_LINEAR_16'
// const encoding = 'AUDIO_ENCODING_LINEAR_16';

// The sample rate of the audio file in hertz, e.g. 16000
// const sampleRateHertz = 16000;

// The BCP-47 language code to use, e.g. 'en-US'
// const languageCode = 'en-US';
const sessionPath = sessionClient.sessionPath(projectId, sessionId);

const initialStreamRequest = {
  session: sessionPath,
  queryParams: {
    session: sessionClient.sessionPath(projectId, sessionId),
  },
  queryInput: {
    audioConfig: {
      audioEncoding: encoding,
      sampleRateHertz: sampleRateHertz,
      languageCode: languageCode,
    },
    singleUtterance: true,
  },
};

// Create a stream for the streaming request.
const detectStream = sessionClient
  .streamingDetectIntent()
  .on('error', console.error)
  .on('data', data => {
    if (data.recognitionResult) {
      console.log(
        `Intermediate transcript: ${data.recognitionResult.transcript}`
      );
    } else {
      console.log(`Detected intent:`);
      logQueryResult(sessionClient, data.queryResult);
    }
  });

// Write the initial stream request to config for audio input.
detectStream.write(initialStreamRequest);

// Stream an audio file from disk to the Conversation API, e.g.
// "./resources/audio.raw"
pump(
  fs.createReadStream(filename),
  // Format the audio stream into the request format.
  through2.obj((obj, _, next) => {
    next(null, {inputAudio: obj});
  }),
  detectStream
);

PHP

이 샘플은 SessionsClient를 사용하여 인텐트를 감지합니다.

namespace Google\Cloud\Samples\Dialogflow;

use Google\Cloud\Dialogflow\V2\SessionsClient;
use Google\Cloud\Dialogflow\V2\AudioEncoding;
use Google\Cloud\Dialogflow\V2\InputAudioConfig;
use Google\Cloud\Dialogflow\V2\QueryInput;
use Google\Cloud\Dialogflow\V2\StreamingDetectIntentRequest;

/**
* Returns the result of detect intent with streaming audio as input.
* Using the same `session_id` between requests allows continuation
* of the conversation.
*/
function detect_intent_stream($projectId, $path, $sessionId, $languageCode = 'en-US')
{
    // need to use gRPC
    if (!defined('Grpc\STATUS_OK')) {
        throw new \Exception('Install the grpc extension ' .
            '(pecl install grpc)');
    }

    // new session
    $sessionsClient = new SessionsClient();
    $session = $sessionsClient->sessionName($projectId, $sessionId ?: uniqid());
    printf('Session path: %s' . PHP_EOL, $session);

    // hard coding audio_encoding and sample_rate_hertz for simplicity
    $audioConfig = new InputAudioConfig();
    $audioConfig->setAudioEncoding(AudioEncoding::AUDIO_ENCODING_LINEAR_16);
    $audioConfig->setLanguageCode($languageCode);
    $audioConfig->setSampleRateHertz(16000);

    // create query input
    $queryInput = new QueryInput();
    $queryInput->setAudioConfig($audioConfig);

    // first request contains the configuration
    $request = new StreamingDetectIntentRequest();
    $request->setSession($session);
    $request->setQueryInput($queryInput);
    $requests = [$request];

    // we are going to read small chunks of audio data from
    // a local audio file. in practice, these chunks should
    // come from an audio input device.
    $audioStream = fopen($path, 'rb');
    while (true) {
        $chunk = stream_get_contents($audioStream, 4096);
        if (!$chunk) {
            break;
        }
        $request = new StreamingDetectIntentRequest();
        $request->setInputAudio($chunk);
        $requests[] = $request;
    }

    // intermediate transcript info
    print(PHP_EOL . str_repeat("=", 20) . PHP_EOL);
    $stream = $sessionsClient->streamingDetectIntent();
    foreach ($requests as $request) {
        $stream->write($request);
    }
    foreach ($stream->closeWriteAndReadAll() as $response) {
        $recognitionResult = $response->getRecognitionResult();
        if ($recognitionResult) {
            $transcript = $recognitionResult->getTranscript();
            printf('Intermediate transcript: %s' . PHP_EOL, $transcript);
        }
    }

    // get final response and relevant info
    if ($response) {
        print(str_repeat("=", 20) . PHP_EOL);
        $queryResult = $response->getQueryResult();
        $queryText = $queryResult->getQueryText();
        $intent = $queryResult->getIntent();
        $displayName = $intent->getDisplayName();
        $confidence = $queryResult->getIntentDetectionConfidence();
        $fulfilmentText = $queryResult->getFulfillmentText();

        // output relevant info
        printf('Query text: %s' . PHP_EOL, $queryText);
        printf('Detected intent: %s (confidence: %f)' . PHP_EOL, $displayName,
            $confidence);
        print(PHP_EOL);
        printf('Fulfilment text: %s' . PHP_EOL, $fulfilmentText);
    }

    $sessionsClient->close();
}

Python

이 샘플은 SessionsClient를 사용하여 인텐트를 감지합니다.

def detect_intent_stream(project_id, session_id, audio_file_path,
                         language_code):
    """Returns the result of detect intent with streaming audio as input.

    Using the same `session_id` between requests allows continuation
    of the conversation."""
    import dialogflow_v2 as dialogflow
    session_client = dialogflow.SessionsClient()

    # Note: hard coding audio_encoding and sample_rate_hertz for simplicity.
    audio_encoding = dialogflow.enums.AudioEncoding.AUDIO_ENCODING_LINEAR_16
    sample_rate_hertz = 16000

    session_path = session_client.session_path(project_id, session_id)
    print('Session path: {}\n'.format(session_path))

    def request_generator(audio_config, audio_file_path):
        query_input = dialogflow.types.QueryInput(audio_config=audio_config)

        # The first request contains the configuration.
        yield dialogflow.types.StreamingDetectIntentRequest(
            session=session_path, query_input=query_input)

        # Here we are reading small chunks of audio data from a local
        # audio file.  In practice these chunks should come from
        # an audio input device.
        with open(audio_file_path, 'rb') as audio_file:
            while True:
                chunk = audio_file.read(4096)
                if not chunk:
                    break
                # The later requests contains audio data.
                yield dialogflow.types.StreamingDetectIntentRequest(
                    input_audio=chunk)

    audio_config = dialogflow.types.InputAudioConfig(
        audio_encoding=audio_encoding, language_code=language_code,
        sample_rate_hertz=sample_rate_hertz)

    requests = request_generator(audio_config, audio_file_path)
    responses = session_client.streaming_detect_intent(requests)

    print('=' * 20)
    for response in responses:
        print('Intermediate transcript: "{}".'.format(
                response.recognition_result.transcript))

    # Note: The result from the last response is the final transcript along
    # with the detected content.
    query_result = response.query_result

    print('=' * 20)
    print('Query text: {}'.format(query_result.query_text))
    print('Detected intent: {} (confidence: {})\n'.format(
        query_result.intent.display_name,
        query_result.intent_detection_confidence))
    print('Fulfillment text: {}\n'.format(
        query_result.fulfillment_text))

Ruby

이 샘플은 SessionsClient를 사용하여 인텐트를 감지합니다.

# project_id = "Your Google Cloud project ID"
# session_id = "mysession"
# audio_file_path = "resources/book_a_room.wav"
# language_code = "en-US"

require "google/cloud/dialogflow"
require "monitor"

session_client = Google::Cloud::Dialogflow::Sessions.new
session = session_client.class.session_path project_id, session_id
puts "Session path: #{session}"

audio_config = {
  audio_encoding:    :AUDIO_ENCODING_LINEAR_16,
  sample_rate_hertz: 16_000,
  language_code:     language_code
}
query_input = { audio_config: audio_config }
streaming_config = { session: session, query_input: query_input }

# To signal the main thread when all responses have been processed
completed = false

# Use session_client as the sentinel to signal the end of queue
request_queue = EnumeratorQueue.new session_client

# The first request needs to be the configuration.
request_queue.push streaming_config

# Consume the queue and process responses in a separate thread
Thread.new do
  session_client.streaming_detect_intent(request_queue.each_item).each do |response|
    if response.recognition_result
      puts "Intermediate transcript: #{response.recognition_result.transcript}\n"
    else
      # the last response has the actual query result
      query_result = response.query_result
      puts "Query text:        #{query_result.query_text}"
      puts "Intent detected:   #{query_result.intent.display_name}"
      puts "Intent confidence: #{query_result.intent_detection_confidence}"
      puts "Fulfillment text:  #{query_result.fulfillment_text}\n"
    end
  end
  completed = true
end

# While the main thread adds chunks of audio data to the queue
begin
  audio_file = File.open audio_file_path, "rb"
  loop do
    chunk = audio_file.read 4096
    break unless chunk
    request_queue.push input_audio: chunk
    sleep 0.5
  end
ensure
  audio_file.close
  # pushing the sentinel session_client to end the streaming queues
  request_queue.push session_client
end

# Do not exit the main thread until the processing thread is completed
sleep 1 until completed

이 페이지가 도움이 되었나요? 평가를 부탁드립니다.

다음에 대한 의견 보내기...

Dialogflow 문서
도움이 필요하시나요? 지원 페이지를 방문하세요.