This sample demonstrates how to use the Gemini model to generate text from a multimodal prompt. The prompt consists of three images and two text prompts. The model generates a text response that describes the images and the text prompts.
Explore further
For detailed documentation that includes this code sample, see the following:
Code sample
C#
Before trying this sample, follow the C# setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI C# API reference documentation.
To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
using Google.Api.Gax.Grpc;
using Google.Cloud.AIPlatform.V1;
using Google.Protobuf;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
public class MultimodalMultiImage
{
    public async Task<string> GenerateContent(
        string projectId = "your-project-id",
        string location = "us-central1",
        string publisher = "google",
        string model = "gemini-1.0-pro-vision"
    )
    {
        var predictionServiceClient = new PredictionServiceClientBuilder
        {
            Endpoint = $"{location}-aiplatform.googleapis.com"
        }.Build();

        ByteString colosseum = await ReadImageFileAsync(
            "https://storage.googleapis.com/cloud-samples-data/vertex-ai/llm/prompts/landmark1.png");

        ByteString forbiddenCity = await ReadImageFileAsync(
            "https://storage.googleapis.com/cloud-samples-data/vertex-ai/llm/prompts/landmark2.png");

        ByteString christRedeemer = await ReadImageFileAsync(
            "https://storage.googleapis.com/cloud-samples-data/vertex-ai/llm/prompts/landmark3.png");

        var generateContentRequest = new GenerateContentRequest
        {
            Model = $"projects/{projectId}/locations/{location}/publishers/{publisher}/models/{model}",
            Contents =
            {
                new Content
                {
                    Role = "USER",
                    Parts =
                    {
                        new Part { InlineData = new() { MimeType = "image/png", Data = colosseum } },
                        new Part { Text = "city: Rome, Landmark: the Colosseum" },
                        new Part { InlineData = new() { MimeType = "image/png", Data = forbiddenCity } },
                        new Part { Text = "city: Beijing, Landmark: Forbidden City" },
                        new Part { InlineData = new() { MimeType = "image/png", Data = christRedeemer } }
                    }
                }
            }
        };

        using PredictionServiceClient.StreamGenerateContentStream response = predictionServiceClient.StreamGenerateContent(generateContentRequest);

        StringBuilder fullText = new();

        AsyncResponseStream<GenerateContentResponse> responseStream = response.GetResponseStream();
        await foreach (GenerateContentResponse responseItem in responseStream)
        {
            fullText.Append(responseItem.Candidates[0].Content.Parts[0].Text);
        }
        return fullText.ToString();
    }
    private static async Task<ByteString> ReadImageFileAsync(string url)
    {
        using HttpClient client = new();
        using var response = await client.GetAsync(url);
        // Surface failed downloads instead of silently converting an error page to image bytes.
        response.EnsureSuccessStatusCode();
        byte[] imageBytes = await response.Content.ReadAsByteArrayAsync();
        return ByteString.CopyFrom(imageBytes);
    }
}
Go
Before trying this sample, follow the Go setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Go API reference documentation.
To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
package main

import (
    "context"
    "fmt"
    "io"
    "log"
    "net/http"
    "net/url"
    "os"
    "strings"

    "cloud.google.com/go/vertexai/genai"
)
func main() {
    projectID := os.Getenv("GOOGLE_CLOUD_PROJECT")
    location := "us-central1"
    modelName := "gemini-1.0-pro-vision"
    temperature := 0.4

    if projectID == "" {
        log.Fatal("require environment variable GOOGLE_CLOUD_PROJECT")
    }

    // Construct this multimodal prompt:
    // [image of colosseum] city: Rome, Landmark: the Colosseum
    // [image of forbidden city] city: Beijing, Landmark: the Forbidden City
    // [new image]

    // Create the prompt image parts.
    // Colosseum
    colosseum, err := partFromImageURL("https://storage.googleapis.com/cloud-samples-data/vertex-ai/llm/prompts/landmark1.png")
    if err != nil {
        log.Fatalf("unable to read image: %v", err)
    }

    // Forbidden City
    forbiddenCity, err := partFromImageURL("https://storage.googleapis.com/cloud-samples-data/vertex-ai/llm/prompts/landmark2.png")
    if err != nil {
        log.Fatalf("unable to read image: %v", err)
    }

    // New image
    newImage, err := partFromImageURL("https://storage.googleapis.com/cloud-samples-data/vertex-ai/llm/prompts/landmark3.png")
    if err != nil {
        log.Fatalf("unable to read image: %v", err)
    }

    // Assemble the multimodal (multipart) prompt.
    prompt := []genai.Part{
        colosseum,
        genai.Text("city: Rome, Landmark: the Colosseum "),
        forbiddenCity,
        genai.Text("city: Beijing, Landmark: the Forbidden City "),
        newImage,
    }

    // Generate the response.
    err = generateMultimodalContent(os.Stdout, prompt, projectID, location, modelName, float32(temperature))
    if err != nil {
        log.Fatalf("unable to generate: %v", err)
    }
}
// generateMultimodalContent generates a response for the given multimodal prompt parts
// and writes the generated text to w.
func generateMultimodalContent(w io.Writer, parts []genai.Part, projectID, location, modelName string, temperature float32) error {
    ctx := context.Background()

    client, err := genai.NewClient(ctx, projectID, location)
    if err != nil {
        return fmt.Errorf("unable to create client: %v", err)
    }
    defer client.Close()

    model := client.GenerativeModel(modelName)
    model.SetTemperature(temperature)

    res, err := model.GenerateContent(ctx, parts...)
    if err != nil {
        return fmt.Errorf("unable to generate contents: %v", err)
    }

    fmt.Fprintf(w, "generated response: %s\n", res.Candidates[0].Content.Parts[0])
    return nil
}
// partFromImageURL creates a multimodal prompt part from an image URL.
func partFromImageURL(image string) (genai.Part, error) {
    var img genai.Blob

    imageURL, err := url.Parse(image)
    if err != nil {
        return img, err
    }
    res, err := http.Get(image)
    if err != nil {
        return img, err
    }
    defer res.Body.Close()
    // Return an explicit error for non-200 responses instead of an empty part with a nil error.
    if res.StatusCode != http.StatusOK {
        return img, fmt.Errorf("unexpected HTTP status fetching image: %s", res.Status)
    }

    data, err := io.ReadAll(res.Body)
    if err != nil {
        return img, fmt.Errorf("unable to read from http: %v", err)
    }

    position := strings.LastIndex(imageURL.Path, ".")
    if position == -1 {
        return img, fmt.Errorf("couldn't find a period to indicate a file extension")
    }
    ext := imageURL.Path[position+1:]

    img = genai.ImageData(ext, data)
    return img, nil
}
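If you want to print the Go output as it arrives rather than waiting for the full response (the approach the streaming C# sample above takes), the same Go client also exposes a streaming call. The following is a minimal, optional sketch, assuming the same cloud.google.com/go/vertexai/genai client used in this sample plus an additional import of google.golang.org/api/iterator; the helper name streamMultimodalContent is illustrative and not part of the official sample.
// streamMultimodalContent is a hypothetical streaming counterpart of generateMultimodalContent.
// It requires the additional import "google.golang.org/api/iterator".
func streamMultimodalContent(w io.Writer, parts []genai.Part, projectID, location, modelName string) error {
    ctx := context.Background()

    client, err := genai.NewClient(ctx, projectID, location)
    if err != nil {
        return fmt.Errorf("unable to create client: %v", err)
    }
    defer client.Close()

    model := client.GenerativeModel(modelName)

    // GenerateContentStream returns an iterator over partial responses.
    iter := model.GenerateContentStream(ctx, parts...)
    for {
        resp, err := iter.Next()
        if err == iterator.Done {
            break
        }
        if err != nil {
            return fmt.Errorf("error while streaming: %v", err)
        }
        // Print each chunk of generated text as it arrives.
        if len(resp.Candidates) > 0 && resp.Candidates[0].Content != nil && len(resp.Candidates[0].Content.Parts) > 0 {
            fmt.Fprintf(w, "%s", resp.Candidates[0].Content.Parts[0])
        }
    }
    fmt.Fprintln(w)
    return nil
}
Such a helper could be called with the same prompt slice that main builds, for example streamMultimodalContent(os.Stdout, prompt, projectID, location, modelName).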
Java
Before trying this sample, follow the Java setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Java API reference documentation.
To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
import com.google.cloud.vertexai.VertexAI;
import com.google.cloud.vertexai.api.Content;
import com.google.cloud.vertexai.api.GenerateContentResponse;
import com.google.cloud.vertexai.generativeai.ContentMaker;
import com.google.cloud.vertexai.generativeai.GenerativeModel;
import com.google.cloud.vertexai.generativeai.PartMaker;
import com.google.cloud.vertexai.generativeai.ResponseHandler;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
public class MultimodalMultiImage {

  public static void main(String[] args) throws IOException {
    // TODO(developer): Replace these variables before running the sample.
    String projectId = "your-google-cloud-project-id";
    String location = "us-central1";
    String modelName = "gemini-1.0-pro-vision-001";

    multimodalMultiImage(projectId, location, modelName);
  }

  // Generates content from multiple input images.
  public static void multimodalMultiImage(String projectId, String location, String modelName)
      throws IOException {
    // Initialize client that will be used to send requests. This client only needs
    // to be created once, and can be reused for multiple requests.
    try (VertexAI vertexAI = new VertexAI(projectId, location)) {
      GenerativeModel model = new GenerativeModel(modelName, vertexAI);

      Content content = ContentMaker.fromMultiModalData(
          PartMaker.fromMimeTypeAndData("image/png", readImageFile(
              "https://storage.googleapis.com/cloud-samples-data/vertex-ai/llm/prompts/landmark1.png")),
          "city: Rome, Landmark: the Colosseum",
          PartMaker.fromMimeTypeAndData("image/png", readImageFile(
              "https://storage.googleapis.com/cloud-samples-data/vertex-ai/llm/prompts/landmark2.png")),
          "city: Beijing, Landmark: Forbidden City",
          PartMaker.fromMimeTypeAndData("image/png", readImageFile(
              "https://storage.googleapis.com/cloud-samples-data/vertex-ai/llm/prompts/landmark3.png"))
      );

      GenerateContentResponse response = model.generateContent(content);

      String output = ResponseHandler.getText(response);
      System.out.println(output);
    }
  }

  // Reads the image data from the given URL.
  public static byte[] readImageFile(String url) throws IOException {
    URL urlObj = new URL(url);
    HttpURLConnection connection = (HttpURLConnection) urlObj.openConnection();
    connection.setRequestMethod("GET");

    int responseCode = connection.getResponseCode();

    if (responseCode == HttpURLConnection.HTTP_OK) {
      InputStream inputStream = connection.getInputStream();
      ByteArrayOutputStream outputStream = new ByteArrayOutputStream();

      byte[] buffer = new byte[1024];
      int bytesRead;
      while ((bytesRead = inputStream.read(buffer)) != -1) {
        outputStream.write(buffer, 0, bytesRead);
      }

      return outputStream.toByteArray();
    } else {
      throw new RuntimeException("Error fetching file: " + responseCode);
    }
  }
}
Node.js
Before trying this sample, follow the Node.js setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Node.js API reference documentation.
To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
const {VertexAI} = require('@google-cloud/vertexai');
const axios = require('axios');

async function getBase64(url) {
  const image = await axios.get(url, {responseType: 'arraybuffer'});
  return Buffer.from(image.data).toString('base64');
}

/**
 * TODO(developer): Update these variables before running the sample.
 */
async function sendMultiModalPromptWithImage(
  projectId = 'PROJECT_ID',
  location = 'us-central1',
  model = 'gemini-1.0-pro-vision-001'
) {
  // For images, the SDK supports base64 strings
  const landmarkImage1 = await getBase64(
    'https://storage.googleapis.com/cloud-samples-data/vertex-ai/llm/prompts/landmark1.png'
  );
  const landmarkImage2 = await getBase64(
    'https://storage.googleapis.com/cloud-samples-data/vertex-ai/llm/prompts/landmark2.png'
  );
  const landmarkImage3 = await getBase64(
    'https://storage.googleapis.com/cloud-samples-data/vertex-ai/llm/prompts/landmark3.png'
  );

  // Initialize Vertex with your Cloud project and location
  const vertexAI = new VertexAI({project: projectId, location: location});

  const generativeVisionModel = vertexAI.getGenerativeModel({
    model: model,
  });

  // Pass multimodal prompt
  const request = {
    contents: [
      {
        role: 'user',
        parts: [
          {
            inlineData: {
              data: landmarkImage1,
              mimeType: 'image/png',
            },
          },
          {
            text: 'city: Rome, Landmark: the Colosseum',
          },
          {
            inlineData: {
              data: landmarkImage2,
              mimeType: 'image/png',
            },
          },
          {
            text: 'city: Beijing, Landmark: Forbidden City',
          },
          {
            inlineData: {
              data: landmarkImage3,
              mimeType: 'image/png',
            },
          },
        ],
      },
    ],
  };

  // Create the response
  const response = await generativeVisionModel.generateContent(request);
  // Wait for the response to complete
  const aggregatedResponse = await response.response;
  // Select the text from the response
  const fullTextResponse =
    aggregatedResponse.candidates[0].content.parts[0].text;

  console.log(fullTextResponse);
}
What's next
To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.