Transcript an audio file with Gemini 1.5 Pro

This sample shows you how to use an audio file to generate a podcast transcript with timestamps. This sample works with Gemini 1.5 Pro only.

Explore further

For detailed documentation that includes this code sample, see the following:

Code sample

C#

Before trying this sample, follow the C# setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI C# API reference documentation.

To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.


using Google.Cloud.AIPlatform.V1;
using System;
using System.Threading.Tasks;

public class AudioInputTranscription
{
    public async Task<string> TranscribeAudio(
        string projectId = "your-project-id",
        string location = "us-central1",
        string publisher = "google",
        string model = "gemini-1.5-pro-preview-0409")
    {

        var predictionServiceClient = new PredictionServiceClientBuilder
        {
            Endpoint = $"{location}-aiplatform.googleapis.com"
        }.Build();

        string prompt = @"Can you transcribe this interview, in the format of timecode, speaker, caption.
Use speaker A, speaker B, etc. to identify speakers.";

        var generateContentRequest = new GenerateContentRequest
        {
            Model = $"projects/{projectId}/locations/{location}/publishers/{publisher}/models/{model}",
            Contents =
            {
                new Content
                {
                    Role = "USER",
                    Parts =
                    {
                        new Part { Text = prompt },
                        new Part { FileData = new() { MimeType = "audio/mp3", FileUri = "gs://cloud-samples-data/generative-ai/audio/pixel.mp3" } }
                    }
                }
            }
        };

        GenerateContentResponse response = await predictionServiceClient.GenerateContentAsync(generateContentRequest);

        string responseText = response.Candidates[0].Content.Parts[0].Text;
        Console.WriteLine(responseText);

        return responseText;
    }
}

Go

Before trying this sample, follow the Go setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Go API reference documentation.

To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

import (
	"context"
	"errors"
	"fmt"
	"io"
	"mime"
	"path/filepath"

	"cloud.google.com/go/vertexai/genai"
)

// audioPrompt is a sample prompt type consisting of one audio asset, and a text question.
type audioPrompt struct {
	// audio is a Google Cloud Storage path starting with "gs://"
	audio string
	// question asked to the model
	question string
}

// transcribeAudio generates a response into w, based upon the prompt
// and audio provided.
// audio is a Google Cloud Storage path starting with "gs://"
func transcribeAudio(w io.Writer, prompt audioPrompt, projectID, location, modelName string) error {
	// prompt := audioPrompt{
	// 	audio: "gs://cloud-samples-data/generative-ai/audio/pixel.mp3",
	// 	question: `
	// 		Can you transcribe this interview, in the format of timecode, speaker, caption.
	// 		Use speaker A, speaker B, etc. to identify speakers.
	// 	`,
	// },
	// location := "us-central1"
	// modelName := "gemini-1.5-pro-preview-0409"
	ctx := context.Background()

	client, err := genai.NewClient(ctx, projectID, location)
	if err != nil {
		return fmt.Errorf("unable to create client: %w", err)
	}
	defer client.Close()

	model := client.GenerativeModel(modelName)

	// Optional: set an explicit temperature
	model.SetTemperature(0.4)

	// Given an audio file URL, prepare audio file as genai.Part
	img := genai.FileData{
		MIMEType: mime.TypeByExtension(filepath.Ext(prompt.audio)),
		FileURI:  prompt.audio,
	}

	res, err := model.GenerateContent(ctx, img, genai.Text(prompt.question))
	if err != nil {
		return fmt.Errorf("unable to generate contents: %w", err)
	}

	if len(res.Candidates) == 0 ||
		len(res.Candidates[0].Content.Parts) == 0 {
		return errors.New("empty response from model")
	}

	fmt.Fprintf(w, "generated transcript:\n%s\n", res.Candidates[0].Content.Parts[0])
	return nil
}

Java

Before trying this sample, follow the Java setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Java API reference documentation.

To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

import com.google.cloud.vertexai.VertexAI;
import com.google.cloud.vertexai.api.GenerateContentResponse;
import com.google.cloud.vertexai.generativeai.ContentMaker;
import com.google.cloud.vertexai.generativeai.GenerativeModel;
import com.google.cloud.vertexai.generativeai.PartMaker;
import com.google.cloud.vertexai.generativeai.ResponseHandler;
import java.io.IOException;

public class AudioInputTranscription {

  public static void main(String[] args) throws IOException {
    // TODO(developer): Replace these variables before running the sample.
    String projectId = "your-google-cloud-project-id";
    String location = "us-central1";
    String modelName = "gemini-1.5-pro-preview-0409";

    transcribeAudio(projectId, location, modelName);
  }

  // Analyzes the given audio input.
  public static String transcribeAudio(String projectId, String location, String modelName)
      throws IOException {
    // Initialize client that will be used to send requests. This client only needs
    // to be created once, and can be reused for multiple requests.
    try (VertexAI vertexAI = new VertexAI(projectId, location)) {
      String audioUri = "gs://cloud-samples-data/generative-ai/audio/pixel.mp3";

      GenerativeModel model = new GenerativeModel(modelName, vertexAI);
      GenerateContentResponse response = model.generateContent(
          ContentMaker.fromMultiModalData(
              "Can you transcribe this interview, in the format of timecode, speaker, caption.\n"
                  + "Use speaker A, speaker B, etc. to identify speakers.",
              PartMaker.fromMimeTypeAndData("audio/mp3", audioUri)
          ));

      String output = ResponseHandler.getText(response);
      System.out.println(output);

      return output;
    }
  }
}

Node.js

Before trying this sample, follow the Node.js setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Node.js API reference documentation.

To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

const {VertexAI} = require('@google-cloud/vertexai');

/**
 * TODO(developer): Update these variables before running the sample.
 */
async function transcript_audio(projectId = 'PROJECT_ID') {
  const vertexAI = new VertexAI({project: projectId, location: 'us-central1'});

  const generativeModel = vertexAI.getGenerativeModel({
    model: 'gemini-1.5-pro-preview-0409',
  });

  const filePart = {
    file_data: {
      file_uri: 'gs://cloud-samples-data/generative-ai/audio/pixel.mp3',
      mime_type: 'audio/mpeg',
    },
  };
  const textPart = {
    text: `
    Can you transcribe this interview, in the format of timecode, speaker, caption?
    Use speaker A, speaker B, etc. to identify speakers.`,
  };

  const request = {
    contents: [{role: 'user', parts: [filePart, textPart]}],
  };

  const resp = await generativeModel.generateContent(request);
  const contentResponse = await resp.response;
  console.log(JSON.stringify(contentResponse));
}

Python

Before trying this sample, follow the Python setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Python API reference documentation.

To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.


  import vertexai
  from vertexai.generative_models import GenerativeModel, Part

  # TODO(developer): Update and un-comment below lines
  # project_id = "PROJECT_ID"

  vertexai.init(project=project_id, location="us-central1")

  model = GenerativeModel(model_name="gemini-1.5-flash-preview-0514")

  prompt = """
  Can you transcribe this interview, in the format of timecode, speaker, caption.
  Use speaker A, speaker B, etc. to identify speakers.
"""

  audio_file_uri = "gs://cloud-samples-data/generative-ai/audio/pixel.mp3"
  audio_file = Part.from_uri(audio_file_uri, mime_type="audio/mpeg")

  contents = [audio_file, prompt]

  response = model.generate_content(contents)
  print(response.text)

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.