Trascrivi un file audio locale, includendo i metadati di riconoscimento nella richiesta.
Esempio di codice
Java
/**
 * Transcribes a local audio file, attaching recognition metadata to the request, and prints the
 * first alternative transcript of every result to standard output.
 *
 * @param fileName the path to an audio file.
 * @throws Exception if reading the file or any client call fails.
 */
public static void transcribeFileWithMetadata(String fileName) throws Exception {
  // Load the whole audio file into memory before the client is created.
  byte[] audioBytes = Files.readAllBytes(Paths.get(fileName));

  try (SpeechClient client = SpeechClient.create()) {
    // Wrap the raw bytes as the audio payload of the request.
    RecognitionAudio audio =
        RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(audioBytes)).build();

    // Describe the recording. Most metadata fields take enum values
    // (InteractionType, MicrophoneDistance, RecordingDeviceType).
    RecognitionMetadata.Builder metadataBuilder = RecognitionMetadata.newBuilder();
    metadataBuilder.setInteractionType(InteractionType.DISCUSSION);
    metadataBuilder.setMicrophoneDistance(MicrophoneDistance.NEARFIELD);
    metadataBuilder.setRecordingDeviceType(RecordingDeviceType.SMARTPHONE);
    // Some metadata fields are free-form strings.
    metadataBuilder.setRecordingDeviceName("Pixel 2 XL");
    // Others are integers, for instance the 6 digit NAICS code
    // (https://www.naics.com/search/).
    metadataBuilder.setIndustryNaicsCodeOfAudio(519190);
    RecognitionMetadata recordingInfo = metadataBuilder.build();

    // Build the recognition config and attach the metadata to it.
    RecognitionConfig.Builder configBuilder = RecognitionConfig.newBuilder();
    configBuilder.setEncoding(AudioEncoding.LINEAR16);
    configBuilder.setLanguageCode("en-US");
    configBuilder.setSampleRateHertz(8000);
    configBuilder.setMetadata(recordingInfo);
    RecognitionConfig recognitionConfig = configBuilder.build();

    // Send the synchronous recognition request.
    RecognizeResponse response = client.recognize(recognitionConfig, audio);

    // A result can carry several alternative transcripts for the same chunk of speech;
    // only the first one is printed here.
    for (SpeechRecognitionResult chunk : response.getResultsList()) {
      SpeechRecognitionAlternative best = chunk.getAlternatives(0);
      System.out.printf("Transcript: %s\n\n", best.getTranscript());
    }
  }
}
Node.js
// Imports the v1p1beta1 (beta) surface of the Google Cloud Speech client library.
/**
 * TODO(developer): Update client library import to use new
 * version of API when desired features become available
 */
const speech = require('@google-cloud/speech').v1p1beta1;
// File-system module, used to read the local audio file.
const fs = require('fs');
// Creates the Speech client that sends the recognition request.
const client = new speech.SpeechClient();
/**
 * Transcribes a local audio file with a synchronous recognize request whose
 * config carries recognition metadata, then logs the first alternative
 * transcript of each result.
 *
 * Relies on `filename`, `encoding`, `sampleRateHertz` and `languageCode`
 * being defined; see the TODO at the top of the body.
 *
 * @returns {Promise<void>} Settles once every transcript has been logged.
 */
async function syncRecognizeWithMetaData() {
  /**
   * TODO(developer): Uncomment the following lines before running the sample.
   */
  // const filename = 'Local path to audio file, e.g. /path/to/audio.raw';
  // const encoding = 'Encoding of the audio file, e.g. LINEAR16';
  // const sampleRateHertz = 16000;
  // const languageCode = 'BCP-47 language code, e.g. en-US';

  // Metadata describing the recording; enum-valued fields are given by name.
  const recognitionMetadata = {
    interactionType: 'DISCUSSION',
    microphoneDistance: 'NEARFIELD',
    recordingDeviceType: 'SMARTPHONE',
    recordingDeviceName: 'Pixel 2 XL',
    industryNaicsCodeOfAudio: 519190,
  };

  const config = {
    encoding,
    sampleRateHertz,
    languageCode,
    // Attach the metadata to the recognition config.
    metadata: recognitionMetadata,
  };

  const audio = {
    // The audio bytes are sent inline as a base64-encoded string.
    content: fs.readFileSync(filename).toString('base64'),
  };

  const request = {config, audio};

  // Detects speech in the audio file
  const [response] = await client.recognize(request);
  for (const result of response.results) {
    // Only the first alternative of each result is logged.
    const alternative = result.alternatives[0];
    console.log(alternative.transcript);
  }
}
Python
from google.cloud import speech_v1p1beta1 as speech

client = speech.SpeechClient()

speech_file = "resources/commercial_mono.wav"

# Read the raw audio bytes. The built-in open() is used instead of io.open()
# because the io module is not imported in this sample.
with open(speech_file, "rb") as audio_file:
    content = audio_file.read()

# Here we construct a recognition metadata object.
# Most metadata fields are specified as enums nested under
# speech.RecognitionMetadata
metadata = speech.RecognitionMetadata()
metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION
metadata.microphone_distance = (
    speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD
)
metadata.recording_device_type = (
    speech.RecognitionMetadata.RecordingDeviceType.SMARTPHONE
)

# Some metadata fields are free form strings
metadata.recording_device_name = "Pixel 2 XL"

# And some are integers, for instance the 6 digit NAICS code
# https://www.naics.com/search/
metadata.industry_naics_code_of_audio = 519190

audio = speech.RecognitionAudio(content=content)
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=8000,
    language_code="en-US",
    # Add this in the request to send metadata.
    metadata=metadata,
)

# Send the synchronous recognition request.
response = client.recognize(config=config, audio=audio)

# Print only the first alternative of each result.
for i, result in enumerate(response.results):
    alternative = result.alternatives[0]
    print("-" * 20)
    print("First alternative of result {}".format(i))
    print("Transcript: {}".format(alternative.transcript))
Passaggi successivi
Per cercare e filtrare esempi di codice per altri prodotti Google Cloud, consulta il browser di esempi di Google Cloud.