# Generate embeddings for Images, Videos and Text
This code sample shows how to use the multimodal model to generate embeddings for image, text, and video data.
Explore further
---------------

For detailed documentation that includes this code sample, see the following:

- [Get multimodal embeddings](/vertex-ai/generative-ai/docs/embeddings/get-multimodal-embeddings)
- [Multimodal embeddings API](/vertex-ai/generative-ai/docs/model-reference/multimodal-embeddings-api)
Code sample
-----------
[[["易于理解","easyToUnderstand","thumb-up"],["解决了我的问题","solvedMyProblem","thumb-up"],["其他","otherUp","thumb-up"]],[["很难理解","hardToUnderstand","thumb-down"],["信息或示例代码不正确","incorrectInformationOrSampleCode","thumb-down"],["没有我需要的信息/示例","missingTheInformationSamplesINeed","thumb-down"],["翻译问题","translationIssue","thumb-down"],["其他","otherDown","thumb-down"]],[],[],[],null,["# Generate embeddings for Images, Videos and Text\n\nThis code sample shows how to use the multimodal model to generate embeddings for image, text and video data.\n\nExplore further\n---------------\n\n\nFor detailed documentation that includes this code sample, see the following:\n\n- [Get multimodal embeddings](/vertex-ai/generative-ai/docs/embeddings/get-multimodal-embeddings)\n- [Multimodal embeddings API](/vertex-ai/generative-ai/docs/model-reference/multimodal-embeddings-api)\n\nCode sample\n-----------\n\n### Go\n\n\nBefore trying this sample, follow the Go setup instructions in the\n[Vertex AI quickstart using\nclient libraries](/vertex-ai/docs/start/client-libraries).\n\n\nFor more information, see the\n[Vertex AI Go API\nreference documentation](/go/docs/reference/cloud.google.com/go/aiplatform/latest/apiv1).\n\n\nTo authenticate to Vertex AI, set up Application Default Credentials.\nFor more information, see\n\n[Set up authentication for a local development environment](/docs/authentication/set-up-adc-local-dev-environment).\n\n import (\n \t\"context\"\n \t\"encoding/json\"\n \t\"fmt\"\n \t\"io\"\n \t\"time\"\n\n \taiplatform \"cloud.google.com/go/aiplatform/apiv1beta1\"\n \taiplatformpb \"cloud.google.com/go/aiplatform/apiv1beta1/aiplatformpb\"\n \t\"google.golang.org/api/option\"\n \t\"google.golang.org/protobuf/encoding/protojson\"\n \t\"google.golang.org/protobuf/types/known/structpb\"\n )\n\n // generateForImageTextAndVideo shows how to use the multimodal model to generate embeddings for\n // image, text and video data.\n func generateForImageTextAndVideo(w io.Writer, project, location string) error {\n \t// location = \"us-central1\"\n\n \t// The default context timeout may be not enough to process a video input.\n \tctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)\n \tdefer cancel()\n\n \tapiEndpoint := fmt.Sprintf(\"%s-aiplatform.googleapis.com:443\", location)\n \tclient, err := aiplatform.https://cloud.google.com/go/docs/reference/cloud.google.com/go/aiplatform/latest/apiv1beta1.html#cloud_google_com_go_aiplatform_apiv1beta1_PredictionClient_NewPredictionClient(ctx, option.WithEndpoint(apiEndpoint))\n \tif err != nil {\n \t\treturn fmt.Errorf(\"failed to construct API client: %w\", err)\n \t}\n \tdefer client.Close()\n\n \tmodel := \"multimodalembedding@001\"\n \tendpoint := fmt.Sprintf(\"projects/%s/locations/%s/publishers/google/models/%s\", project, location, model)\n\n \t// This is the input to the model's prediction call. 
For schema, see:\n \t// https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/multimodal-embeddings-api#request_body\n \tinstance, err := structpb.NewValue(map[string]any{\n \t\t\"text\": \"Domestic cats in natural conditions\",\n \t\t\"image\": map[string]any{\n \t\t\t// Image and video inputs can be provided either as a Google Cloud Storage URI or as\n \t\t\t// base64-encoded bytes using the \"bytesBase64Encoded\" field.\n \t\t\t\"gcsUri\": \"gs://cloud-samples-data/generative-ai/image/320px-Felis_catus-cat_on_snow.jpg\",\n \t\t},\n \t\t\"video\": map[string]any{\n \t\t\t\"gcsUri\": \"gs://cloud-samples-data/video/cat.mp4\",\n \t\t},\n \t})\n \tif err != nil {\n \t\treturn fmt.Errorf(\"failed to construct request payload: %w\", err)\n \t}\n\n \treq := &aiplatformpb.PredictRequest{\n \t\tEndpoint: endpoint,\n \t\t// The model supports only 1 instance per request.\n \t\tInstances: []*structpb.Value{instance},\n \t}\n\n \tresp, err := client.Predict(ctx, req)\n \tif err != nil {\n \t\treturn fmt.Errorf(\"failed to generate embeddings: %w\", err)\n \t}\n\n \tinstanceEmbeddingsJson, err := protojson.Marshal(resp.GetPredictions()[0])\n \tif err != nil {\n \t\treturn fmt.Errorf(\"failed to convert protobuf value to JSON: %w\", err)\n \t}\n \t// For response schema, see:\n \t// https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/multimodal-embeddings-api#response-body\n \tvar instanceEmbeddings struct {\n \t\tImageEmbeddings []float32 `json:\"imageEmbedding\"`\n \t\tTextEmbeddings []float32 `json:\"textEmbedding\"`\n \t\tVideoEmbeddings []struct {\n \t\t\tEmbedding []float32 `json:\"embedding\"`\n \t\t\tStartOffsetSec float64 `json:\"startOffsetSec\"`\n \t\t\tEndOffsetSec float64 `json:\"endOffsetSec\"`\n \t\t} `json:\"videoEmbeddings\"`\n \t}\n \tif err := json.Unmarshal(instanceEmbeddingsJson, &instanceEmbeddings); err != nil {\n \t\treturn fmt.Errorf(\"failed to unmarshal JSON: %w\", err)\n \t}\n\n \timageEmbedding := instanceEmbeddings.ImageEmbeddings\n \ttextEmbedding := instanceEmbeddings.TextEmbeddings\n \t// Get the embedding for our single video segment (`.videoEmbeddings` object has one entry per\n \t// each processed segment).\n \tvideoEmbedding := instanceEmbeddings.VideoEmbeddings[0].Embedding\n\n \tfmt.Fprintf(w, \"Image embedding (length=%d): %v\\n\", len(imageEmbedding), imageEmbedding)\n \tfmt.Fprintf(w, \"Text embedding (length=%d): %v\\n\", len(textEmbedding), textEmbedding)\n \tfmt.Fprintf(w, \"Video embedding (length=%d): %v\\n\", len(videoEmbedding), videoEmbedding)\n \t// Example response:\n \t// Image embedding (length=1408): [-0.01558477 0.0258355 0.016342038 ... ]\n \t// Text embedding (length=1408): [-0.005894961 0.008349559 0.015355394 ... ]\n \t// Video embedding (length=1408): [-0.018867437 0.013997682 0.0012682161 ... 
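The sample above only defines `generateForImageTextAndVideo`. As a minimal sketch, not part of the original sample, you could call it from a `main` function, assuming the function lives in the same `main` package; the project ID and region below are placeholder values:

    package main

    import (
        "log"
        "os"
    )

    func main() {
        // Placeholder values: replace with your own project ID and a region that
        // supports the multimodal embedding model.
        if err := generateForImageTextAndVideo(os.Stdout, "your-project-id", "us-central1"); err != nil {
            log.Fatal(err)
        }
    }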
### Python

Before trying this sample, follow the Python setup instructions in the [Vertex AI quickstart using client libraries](/vertex-ai/docs/start/client-libraries).

For more information, see the [Vertex AI Python API reference documentation](/python/docs/reference/aiplatform/latest).

To authenticate to Vertex AI, set up Application Default Credentials. For more information, see [Set up authentication for a local development environment](/docs/authentication/set-up-adc-local-dev-environment).

    import vertexai

    from vertexai.vision_models import Image, MultiModalEmbeddingModel, Video
    from vertexai.vision_models import VideoSegmentConfig

    # TODO(developer): Update & uncomment line below
    # PROJECT_ID = "your-project-id"
    vertexai.init(project=PROJECT_ID, location="us-central1")

    model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding@001")

    image = Image.load_from_file(
        "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png"
    )
    video = Video.load_from_file(
        "gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4"
    )

    embeddings = model.get_embeddings(
        image=image,
        video=video,
        video_segment_config=VideoSegmentConfig(end_offset_sec=1),
        contextual_text="Cars on Highway",
    )

    print(f"Image Embedding: {embeddings.image_embedding}")

    # Video embeddings are segmented based on the video_segment_config.
    print("Video Embeddings:")
    for video_embedding in embeddings.video_embeddings:
        print(
            f"Video Segment: {video_embedding.start_offset_sec} - {video_embedding.end_offset_sec}"
        )
        print(f"Embedding: {video_embedding.embedding}")

    print(f"Text Embedding: {embeddings.text_embedding}")
    # Example response:
    # Image Embedding: [-0.0123144267, 0.0727186054, 0.000201397663, ...]
    # Video Embeddings:
    # Video Segment: 0.0 - 1.0
    # Embedding: [-0.0206376351, 0.0345234685, ...]
    # Text Embedding: [-0.0207006838, -0.00251058186, ...]
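Because the image, text, and video embeddings share the same vector space (the example output above shows 1408-dimensional vectors), they can be compared directly. The following is a minimal sketch, not part of the original sample, that uses NumPy to compute the cosine similarity between the image and contextual-text embeddings returned by `get_embeddings`:

    import numpy as np

    # Sketch only: compare the image and contextual-text vectors returned above.
    image_vec = np.array(embeddings.image_embedding)
    text_vec = np.array(embeddings.text_embedding)

    # Cosine similarity = dot product divided by the product of the vector norms.
    similarity = float(
        np.dot(image_vec, text_vec)
        / (np.linalg.norm(image_vec) * np.linalg.norm(text_vec))
    )
    print(f"Image-text cosine similarity: {similarity:.4f}")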
What's next
-----------

To search and filter code samples for other Google Cloud products, see the [Google Cloud sample browser](/docs/samples?product=generativeaionvertexai).