const fs = require('fs');
// Imports the Google Cloud client library
const speech = require('@google-cloud/speech').v1p1beta1;
// Creates a client
const client = new speech.SpeechClient();
/**
* TODO(developer): Uncomment the following lines before running the sample.
*/
// const fileName = 'Local path to audio file, e.g. /path/to/audio.raw';
const config = {
encoding: 'LINEAR16',
sampleRateHertz: 8000,
languageCode: 'en-US',
enableSpeakerDiarization: true,
minSpeakerCount: 2,
maxSpeakerCount: 2,
model: 'phone_call',
};
const audio = {
content: fs.readFileSync(fileName).toString('base64'),
};
const request = {
config: config,
audio: audio,
};
const [response] = await client.recognize(request);
const transcription = response.results
.map(result => result.alternatives[0].transcript)
.join('\n');
console.log(`Transcription: ${transcription}`);
console.log('Speaker Diarization:');
const result = response.results[response.results.length - 1];
const wordsInfo = result.alternatives[0].words;
// Note: The transcript within each result is separate and sequential per result.
// However, the words list within an alternative includes all the words
// from all the results thus far. Thus, to get all the words with speaker
// tags, you only have to take the words list from the last result:
wordsInfo.forEach(a =>
console.log(` word: ${a.word}, speakerTag: ${a.speakerTag}`)
);
from google.cloud import speech_v1p1beta1 as speech
client = speech.SpeechClient()
speech_file = "resources/commercial_mono.wav"
with open(speech_file, "rb") as audio_file:
content = audio_file.read()
audio = speech.RecognitionAudio(content=content)
diarization_config = speech.SpeakerDiarizationConfig(
enable_speaker_diarization=True,
min_speaker_count=2,
max_speaker_count=10,
)
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=8000,
language_code="en-US",
diarization_config=diarization_config,
)
print("Waiting for operation to complete...")
response = client.recognize(config=config, audio=audio)
# The transcript within each result is separate and sequential per result.
# However, the words list within an alternative includes all the words
# from all the results thus far. Thus, to get all the words with speaker
# tags, you only have to take the words list from the last result:
result = response.results[-1]
words_info = result.alternatives[0].words
# Printing out the output:
for word_info in words_info:
print(
u"word: '{}', speaker_tag: {}".format(word_info.word, word_info.speaker_tag)
)