// Imports the Google Cloud client library
const fs = require('fs');
const speech = require('@google-cloud/speech');
// Creates a client
const client = new speech.SpeechClient();

// TODO(developer): Uncomment the following lines before running the sample.
// const filename = 'Local path to audio file, e.g. /path/to/audio.raw';
// const encoding = 'Encoding of the audio file, e.g. LINEAR16';
// const sampleRateHertz = 16000;
// const languageCode = 'BCP-47 language code, e.g. en-US';

/**
 * Formats a word time offset as a decimal number of seconds.
 *
 * The fractional part is built from `nanos` zero-padded to nine digits, so
 * 50000000 nanos renders as ".05" instead of being glued on as ".0.5".
 * Trailing zeros are dropped, but at least one fractional digit is kept
 * (e.g. "1.0").
 *
 * NOTE(review): `seconds` is stringified as-is; it is presumably a number,
 * a numeric string, or a Long-like object with a meaningful toString() --
 * confirm against the client library version in use.
 *
 * @param {{seconds: *, nanos: number}} offset - Offset with whole seconds
 *     and a nanosecond remainder (assumed to be in the range 0..999999999).
 * @returns {string} The offset in seconds, e.g. "1.5".
 */
function formatOffsetSeconds(offset) {
  const wholeSeconds = String(offset.seconds || 0);
  const fraction = String(offset.nanos || 0)
    .padStart(9, '0')
    .replace(/0+$/, '');
  return `${wholeSeconds}.${fraction || '0'}`;
}

/**
 * Sends the local audio file for synchronous recognition with word time
 * offsets enabled, then logs each transcription followed by the start and
 * end offset of every word in its first alternative.
 *
 * Wrapped in an async function because `await` is not allowed at the top
 * level of a CommonJS (`require`-based) script.
 *
 * @returns {Promise<void>} Resolves once all results have been logged.
 */
async function main() {
  const config = {
    enableWordTimeOffsets: true,
    encoding: encoding,
    sampleRateHertz: sampleRateHertz,
    languageCode: languageCode,
  };

  // The audio bytes are sent inline, base64-encoded.
  const audio = {
    content: fs.readFileSync(filename).toString('base64'),
  };

  const request = {
    config: config,
    audio: audio,
  };

  // Detects speech in the audio file
  const [response] = await client.recognize(request);
  response.results.forEach(result => {
    const bestAlternative = result.alternatives[0];
    console.log('Transcription: ', bestAlternative.transcript);
    bestAlternative.words.forEach(wordInfo => {
      const startSecs = formatOffsetSeconds(wordInfo.startTime);
      const endSecs = formatOffsetSeconds(wordInfo.endTime);
      console.log(`Word: ${wordInfo.word}`);
      console.log(`\t ${startSecs} secs - ${endSecs} secs`);
    });
  });
}

// Surface failures (missing file, API errors) instead of leaving an
// unhandled promise rejection.
main().catch(err => {
  console.error(err);
  process.exitCode = 1;
});
# Transcribes a local audio file with word time offsets enabled and prints
# every alternative's transcription followed by each word with its start and
# end offset in seconds.
#
# TODO(developer): Uncomment and set the following line before running.
# audio_file_path = "Path to file on which to perform speech recognition"

require "google/cloud/speech"

speech = Google::Cloud::Speech.speech

# Read the audio as raw bytes; it is sent inline in the request.
audio_file = File.binread audio_file_path

config = { encoding:                 :LINEAR16,
           sample_rate_hertz:        16_000,
           language_code:            "en-US",
           enable_word_time_offsets: true }
audio  = { content: audio_file }

response = speech.recognize config: config, audio: audio

# Walk every result rather than only the first one: an empty response then
# prints nothing instead of raising NoMethodError on nil, and results after
# the first are no longer dropped.
response.results.each do |result|
  result.alternatives.each do |alternative|
    puts "Transcription: #{alternative.transcript}"

    alternative.words.each do |word|
      # Combine whole seconds and the nanosecond remainder into a Float.
      start_time = word.start_time.seconds + (word.start_time.nanos / 1_000_000_000.0)
      end_time   = word.end_time.seconds + (word.end_time.nanos / 1_000_000_000.0)

      puts "Word: #{word.word} #{start_time} #{end_time}"
    end
  end
end
