이미지, 동영상, 텍스트의 임베딩 생성

이 코드 샘플은 멀티모달 모델을 사용하여 이미지, 텍스트, 동영상 데이터의 임베딩을 생성하는 방법을 보여줍니다.

더 살펴보기

이 코드 샘플이 포함된 자세한 문서는 다음을 참조하세요.

코드 샘플

Go

이 샘플을 사용해 보기 전에 Vertex AI 빠른 시작: 클라이언트 라이브러리 사용Go 설정 안내를 따르세요. 자세한 내용은 Vertex AI Go API 참고 문서를 참조하세요.

Vertex AI에 인증하려면 애플리케이션 기본 사용자 인증 정보를 설정합니다. 자세한 내용은 로컬 개발 환경의 인증 설정을 참조하세요.

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"time"

	aiplatform "cloud.google.com/go/aiplatform/apiv1beta1"
	aiplatformpb "cloud.google.com/go/aiplatform/apiv1beta1/aiplatformpb"
	"google.golang.org/api/option"
	"google.golang.org/protobuf/encoding/protojson"
	"google.golang.org/protobuf/types/known/structpb"
)

// generateForImageTextAndVideo shows how to use the multimodal model to generate embeddings for
// image, text and video data.
func generateForImageTextAndVideo(w io.Writer, project, location string) error {
	// location = "us-central1"

	// The default context timeout may be not enough to process a video input.
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()

	apiEndpoint := fmt.Sprintf("%s-aiplatform.googleapis.com:443", location)
	client, err := aiplatform.NewPredictionClient(ctx, option.WithEndpoint(apiEndpoint))
	if err != nil {
		return fmt.Errorf("failed to construct API client: %w", err)
	}
	defer client.Close()

	model := "multimodalembedding@001"
	endpoint := fmt.Sprintf("projects/%s/locations/%s/publishers/google/models/%s", project, location, model)

	// This is the input to the model's prediction call. For schema, see:
	// https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/multimodal-embeddings-api#request_body
	instance, err := structpb.NewValue(map[string]any{
		"text": "Domestic cats in natural conditions",
		"image": map[string]any{
			// Image and video inputs can be provided either as a Google Cloud Storage URI or as
			// base64-encoded bytes using the "bytesBase64Encoded" field.
			"gcsUri": "gs://cloud-samples-data/generative-ai/image/320px-Felis_catus-cat_on_snow.jpg",
		},
		"video": map[string]any{
			"gcsUri": "gs://cloud-samples-data/video/cat.mp4",
		},
	})
	if err != nil {
		return fmt.Errorf("failed to construct request payload: %w", err)
	}

	req := &aiplatformpb.PredictRequest{
		Endpoint: endpoint,
		// The model supports only 1 instance per request.
		Instances: []*structpb.Value{instance},
	}

	resp, err := client.Predict(ctx, req)
	if err != nil {
		return fmt.Errorf("failed to generate embeddings: %w", err)
	}

	instanceEmbeddingsJson, err := protojson.Marshal(resp.GetPredictions()[0])
	if err != nil {
		return fmt.Errorf("failed to convert protobuf value to JSON: %w", err)
	}
	// For response schema, see:
	// https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/multimodal-embeddings-api#response-body
	var instanceEmbeddings struct {
		ImageEmbeddings []float32 `json:"imageEmbedding"`
		TextEmbeddings  []float32 `json:"textEmbedding"`
		VideoEmbeddings []struct {
			Embedding      []float32 `json:"embedding"`
			StartOffsetSec float64   `json:"startOffsetSec"`
			EndOffsetSec   float64   `json:"endOffsetSec"`
		} `json:"videoEmbeddings"`
	}
	if err := json.Unmarshal(instanceEmbeddingsJson, &instanceEmbeddings); err != nil {
		return fmt.Errorf("failed to unmarshal JSON: %w", err)
	}

	imageEmbedding := instanceEmbeddings.ImageEmbeddings
	textEmbedding := instanceEmbeddings.TextEmbeddings
	// Get the embedding for our single video segment (`.videoEmbeddings` object has one entry per
	// each processed segment).
	videoEmbedding := instanceEmbeddings.VideoEmbeddings[0].Embedding

	fmt.Fprintf(w, "Image embedding (length=%d): %v\n", len(imageEmbedding), imageEmbedding)
	fmt.Fprintf(w, "Text embedding (length=%d): %v\n", len(textEmbedding), textEmbedding)
	fmt.Fprintf(w, "Video embedding (length=%d): %v\n", len(videoEmbedding), videoEmbedding)
	// Example response:
	// Image embedding (length=1408): [-0.01558477 0.0258355 0.016342038 ... ]
	// Text embedding (length=1408): [-0.005894961 0.008349559 0.015355394 ... ]
	// Video embedding (length=1408): [-0.018867437 0.013997682 0.0012682161 ... ]

	return nil
}

Python

이 샘플을 사용해 보기 전에 Vertex AI 빠른 시작: 클라이언트 라이브러리 사용Python 설정 안내를 따르세요. 자세한 내용은 Vertex AI Python API 참고 문서를 참조하세요.

Vertex AI에 인증하려면 애플리케이션 기본 사용자 인증 정보를 설정합니다. 자세한 내용은 로컬 개발 환경의 인증 설정을 참조하세요.

import vertexai

from vertexai.vision_models import Image, MultiModalEmbeddingModel, Video
from vertexai.vision_models import VideoSegmentConfig

# TODO(developer): Update & uncomment line below
# PROJECT_ID = "your-project-id"
vertexai.init(project=PROJECT_ID, location="us-central1")

model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding@001")

image = Image.load_from_file(
    "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png"
)
video = Video.load_from_file(
    "gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4"
)

embeddings = model.get_embeddings(
    image=image,
    video=video,
    video_segment_config=VideoSegmentConfig(end_offset_sec=1),
    contextual_text="Cars on Highway",
)

print(f"Image Embedding: {embeddings.image_embedding}")

# Video Embeddings are segmented based on the video_segment_config.
print("Video Embeddings:")
for video_embedding in embeddings.video_embeddings:
    print(
        f"Video Segment: {video_embedding.start_offset_sec} - {video_embedding.end_offset_sec}"
    )
    print(f"Embedding: {video_embedding.embedding}")

print(f"Text Embedding: {embeddings.text_embedding}")
# Example response:
# Image Embedding: [-0.0123144267, 0.0727186054, 0.000201397663, ...]
# Video Embeddings:
# Video Segment: 0.0 - 1.0
# Embedding: [-0.0206376351, 0.0345234685, ...]
# Text Embedding: [-0.0207006838, -0.00251058186, ...]

다음 단계

다른 Google Cloud 제품의 코드 샘플을 검색하고 필터링하려면 Google Cloud 샘플 브라우저를 참조하세요.