This page shows you how to stream audio input to a detect intent request using the API. Dialogflow processes the audio and converts it to text before attempting an intent match. This conversion is known as audio input, speech recognition, speech-to-text, or STT.
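At a high level, every streaming call follows the same request sequence: the first StreamingDetectIntentRequest carries only the session name and the audio configuration, and each later request carries only a chunk of raw audio. The condensed Python sketch below illustrates just that ordering with the same google-cloud-dialogflow-cx types used in the full samples that follow; the session path, file path, chunk size, and 16 kHz LINEAR16 encoding are placeholder assumptions for illustration.

from google.cloud.dialogflowcx_v3.types import audio_config, session

def streaming_requests(session_path, audio_file_path):
    """Yields the request sequence expected by streaming_detect_intent."""
    config = audio_config.InputAudioConfig(
        audio_encoding=audio_config.AudioEncoding.AUDIO_ENCODING_LINEAR_16,
        sample_rate_hertz=16000,  # assumption: must match the audio file
    )
    # First request: session and audio configuration only, no audio bytes.
    yield session.StreamingDetectIntentRequest(
        session=session_path,
        query_input=session.QueryInput(
            audio=session.AudioInput(config=config), language_code="en-US"
        ),
    )
    # Subsequent requests: audio chunks only.
    with open(audio_file_path, "rb") as audio_file:
        while chunk := audio_file.read(4096):
            yield session.StreamingDetectIntentRequest(
                query_input=session.QueryInput(audio=session.AudioInput(audio=chunk))
            )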
Java
import com.google.api.gax.rpc.ApiException;
import com.google.api.gax.rpc.BidiStream;
import com.google.cloud.dialogflow.cx.v3beta1.AudioEncoding;
import com.google.cloud.dialogflow.cx.v3beta1.AudioInput;
import com.google.cloud.dialogflow.cx.v3beta1.InputAudioConfig;
import com.google.cloud.dialogflow.cx.v3beta1.QueryInput;
import com.google.cloud.dialogflow.cx.v3beta1.QueryResult;
import com.google.cloud.dialogflow.cx.v3beta1.SessionName;
import com.google.cloud.dialogflow.cx.v3beta1.SessionsClient;
import com.google.cloud.dialogflow.cx.v3beta1.SessionsSettings;
import com.google.cloud.dialogflow.cx.v3beta1.StreamingDetectIntentRequest;
import com.google.cloud.dialogflow.cx.v3beta1.StreamingDetectIntentResponse;
import com.google.protobuf.ByteString;
import java.io.FileInputStream;
import java.io.IOException;
public class DetectIntentStream {
  // Dialogflow CX API Detect Intent sample with an audio file processed as an audio stream.
  public static void detectIntentStream(
      String projectId, String locationId, String agentId, String sessionId, String audioFilePath)
      throws ApiException, IOException {
    SessionsSettings.Builder sessionsSettingsBuilder = SessionsSettings.newBuilder();
    if (locationId.equals("global")) {
      sessionsSettingsBuilder.setEndpoint("dialogflow.googleapis.com:443");
    } else {
      sessionsSettingsBuilder.setEndpoint(locationId + "-dialogflow.googleapis.com:443");
    }
    SessionsSettings sessionsSettings = sessionsSettingsBuilder.build();

    // Instantiates a client
    try (SessionsClient sessionsClient = SessionsClient.create(sessionsSettings)) {
      // Set the session name using the projectID (my-project-id), locationID (global), agentID
      // (UUID), and sessionId (UUID).
      // Using the same `sessionId` between requests allows continuation of the conversation.
      SessionName session = SessionName.of(projectId, locationId, agentId, sessionId);

      // Instructs the speech recognizer how to process the audio content.
      // Note: hard coding audioEncoding and sampleRateHertz for simplicity.
      // Audio encoding of the audio content sent in the query request.
      InputAudioConfig inputAudioConfig =
          InputAudioConfig.newBuilder()
              .setAudioEncoding(AudioEncoding.AUDIO_ENCODING_LINEAR_16)
              .setSampleRateHertz(16000) // sampleRateHertz = 16000
              .build();

      // Build the AudioInput with the InputAudioConfig.
      AudioInput audioInput = AudioInput.newBuilder().setConfig(inputAudioConfig).build();

      // Build the query with the InputAudioConfig.
      QueryInput queryInput =
          QueryInput.newBuilder()
              .setAudio(audioInput)
              .setLanguageCode("en-US") // languageCode = "en-US"
              .build();

      // Create the Bidirectional stream
      BidiStream<StreamingDetectIntentRequest, StreamingDetectIntentResponse> bidiStream =
          sessionsClient.streamingDetectIntentCallable().call();

      // The first request must **only** contain the audio configuration:
      bidiStream.send(
          StreamingDetectIntentRequest.newBuilder()
              .setSession(session.toString())
              .setQueryInput(queryInput)
              .build());

      try (FileInputStream audioStream = new FileInputStream(audioFilePath)) {
        // Subsequent requests must **only** contain the audio data.
        // Following messages: audio chunks. Here we simply read the file in fixed-size chunks;
        // in a real application you would split the user input by time.
        byte[] buffer = new byte[4096];
        int bytes;
        while ((bytes = audioStream.read(buffer)) != -1) {
          AudioInput subAudioInput =
              AudioInput.newBuilder().setAudio(ByteString.copyFrom(buffer, 0, bytes)).build();
          QueryInput subQueryInput =
              QueryInput.newBuilder()
                  .setAudio(subAudioInput)
                  .setLanguageCode("en-US") // languageCode = "en-US"
                  .build();
          bidiStream.send(
              StreamingDetectIntentRequest.newBuilder().setQueryInput(subQueryInput).build());
        }
      }

      // Tell the service you are done sending data.
      bidiStream.closeSend();

      for (StreamingDetectIntentResponse response : bidiStream) {
        QueryResult queryResult = response.getDetectIntentResponse().getQueryResult();
        System.out.println("====================");
        System.out.format("Query Text: '%s'\n", queryResult.getTranscript());
        System.out.format(
            "Detected Intent: %s (confidence: %f)\n",
            queryResult.getIntent().getDisplayName(), queryResult.getIntentDetectionConfidence());
      }
    }
  }
}
Node.js
/**
* TODO(developer): Uncomment these variables before running the sample.
*/
// const projectId = 'my-project';
// const location = 'global';
// const agentId = 'my-agent';
// const audioFileName = '/path/to/audio.raw';
// const encoding = 'AUDIO_ENCODING_LINEAR_16';
// const sampleRateHertz = 16000;
// const languageCode = 'en'
// Imports the Dialogflow CX library
const {SessionsClient} = require('@google-cloud/dialogflow-cx');
/**
* Example for regional endpoint:
* const location = 'us-central1'
* const client = new SessionsClient({apiEndpoint: 'us-central1-dialogflow.googleapis.com'})
*/
const client = new SessionsClient();
const fs = require('fs');
const util = require('util');
const {Transform, pipeline} = require('stream');
const pump = util.promisify(pipeline);
async function detectIntentAudio() {
  const sessionId = Math.random().toString(36).substring(7);
  const sessionPath = client.projectLocationAgentSessionPath(
    projectId,
    location,
    agentId,
    sessionId
  );
  console.info(sessionPath);

  // Create a stream for the streaming request.
  const detectStream = client
    .streamingDetectIntent()
    .on('error', console.error)
    .on('data', data => {
      if (data.recognitionResult) {
        console.log(
          `Intermediate Transcript: ${data.recognitionResult.transcript}`
        );
      } else {
        console.log('Detected Intent:');
        const result = data.detectIntentResponse.queryResult;
        console.log(`User Query: ${result.transcript}`);
        for (const message of result.responseMessages) {
          if (message.text) {
            console.log(`Agent Response: ${message.text.text}`);
          }
        }
        if (result.match.intent) {
          console.log(`Matched Intent: ${result.match.intent.displayName}`);
        }
        console.log(`Current Page: ${result.currentPage.displayName}`);
      }
    });

  // Write the initial stream request to configure the audio input.
  const initialStreamRequest = {
    session: sessionPath,
    queryInput: {
      audio: {
        config: {
          audioEncoding: encoding,
          sampleRateHertz: sampleRateHertz,
          singleUtterance: true,
        },
      },
      languageCode: languageCode,
    },
  };
  detectStream.write(initialStreamRequest);

  // Stream the audio from the audio file to Dialogflow.
  await pump(
    fs.createReadStream(audioFileName),
    // Format the audio stream into the request format.
    new Transform({
      objectMode: true,
      transform: (obj, _, next) => {
        next(null, {queryInput: {audio: {audio: obj}}});
      },
    }),
    detectStream
  );
}
detectIntentAudio();
Python
import uuid

from google.cloud.dialogflowcx_v3.services.agents import AgentsClient
from google.cloud.dialogflowcx_v3.services.sessions import SessionsClient
from google.cloud.dialogflowcx_v3.types import audio_config
from google.cloud.dialogflowcx_v3.types import session


def run_sample():
    # TODO(developer): Replace these values when running the function
    project_id = "YOUR-PROJECT-ID"
    # For more information about regionalization see https://cloud.google.com/dialogflow/cx/docs/how/region
    location_id = "YOUR-LOCATION-ID"
    # For more info on agents see https://cloud.google.com/dialogflow/cx/docs/concept/agent
    agent_id = "YOUR-AGENT-ID"
    agent = f"projects/{project_id}/locations/{location_id}/agents/{agent_id}"
    # For more information on sessions see https://cloud.google.com/dialogflow/cx/docs/concept/session
    session_id = uuid.uuid4()
    audio_file_path = "YOUR-AUDIO-FILE-PATH"
    # For more supported languages see https://cloud.google.com/dialogflow/es/docs/reference/language
    language_code = "en-us"

    detect_intent_stream(agent, session_id, audio_file_path, language_code)


def detect_intent_stream(agent, session_id, audio_file_path, language_code):
    """Returns the result of detect intent with streaming audio as input.

    Using the same `session_id` between requests allows continuation
    of the conversation."""
    session_path = f"{agent}/sessions/{session_id}"
    print(f"Session path: {session_path}\n")
    client_options = None
    agent_components = AgentsClient.parse_agent_path(agent)
    location_id = agent_components["location"]
    if location_id != "global":
        api_endpoint = f"{location_id}-dialogflow.googleapis.com:443"
        print(f"API Endpoint: {api_endpoint}\n")
        client_options = {"api_endpoint": api_endpoint}
    session_client = SessionsClient(client_options=client_options)

    input_audio_config = audio_config.InputAudioConfig(
        audio_encoding=audio_config.AudioEncoding.AUDIO_ENCODING_LINEAR_16,
        sample_rate_hertz=24000,
    )

    def request_generator():
        audio_input = session.AudioInput(config=input_audio_config)
        query_input = session.QueryInput(audio=audio_input, language_code=language_code)

        # The first request contains the configuration.
        yield session.StreamingDetectIntentRequest(
            session=session_path, query_input=query_input
        )

        # Here we are reading small chunks of audio data from a local
        # audio file. In practice these chunks should come from
        # an audio input device.
        with open(audio_file_path, "rb") as audio_file:
            while True:
                chunk = audio_file.read(4096)
                if not chunk:
                    break
                # Subsequent requests contain only the audio data.
                audio_input = session.AudioInput(audio=chunk)
                query_input = session.QueryInput(audio=audio_input)
                yield session.StreamingDetectIntentRequest(query_input=query_input)

    responses = session_client.streaming_detect_intent(requests=request_generator())

    print("=" * 20)
    for response in responses:
        print(f'Intermediate transcript: "{response.recognition_result.transcript}".')

    # Note: The result from the last response is the final transcript along
    # with the detected content.
    response = response.detect_intent_response
    print(f"Query text: {response.query_result.transcript}")
    response_messages = [
        " ".join(msg.text.text) for msg in response.query_result.response_messages
    ]
    print(f"Response text: {' '.join(response_messages)}\n")