Generate embeddings for images, videos, and text
This code sample shows how to use the multimodal model to generate embeddings for image, text and video data.
Code sample
Go
Before trying this sample, follow the Go setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Go API reference documentation.
To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
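Application Default Credentials are picked up automatically by the Go client library, so the sample below needs no explicit credential options. If you instead want to point the client at a service-account key file directly, a minimal sketch looks like the following; the key path and region are placeholders for illustration and are not part of the sample below.

package main

import (
	"context"
	"log"

	aiplatform "cloud.google.com/go/aiplatform/apiv1beta1"
	"google.golang.org/api/option"
)

func main() {
	ctx := context.Background()
	// Placeholder key path for illustration; when Application Default Credentials are
	// configured, the WithCredentialsFile option can simply be omitted.
	client, err := aiplatform.NewPredictionClient(ctx,
		option.WithEndpoint("us-central1-aiplatform.googleapis.com:443"),
		option.WithCredentialsFile("/path/to/service-account-key.json"),
	)
	if err != nil {
		log.Fatalf("failed to construct API client: %v", err)
	}
	defer client.Close()
}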
import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"time"

	aiplatform "cloud.google.com/go/aiplatform/apiv1beta1"
	aiplatformpb "cloud.google.com/go/aiplatform/apiv1beta1/aiplatformpb"
	"google.golang.org/api/option"
	"google.golang.org/protobuf/encoding/protojson"
	"google.golang.org/protobuf/types/known/structpb"
)
// generateForImageTextAndVideo shows how to use the multimodal model to generate embeddings for
// image, text and video data.
func generateForImageTextAndVideo(w io.Writer, project, location string) error {
	// location = "us-central1"

	// The default context timeout may not be enough to process a video input.
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()

	apiEndpoint := fmt.Sprintf("%s-aiplatform.googleapis.com:443", location)
	client, err := aiplatform.NewPredictionClient(ctx, option.WithEndpoint(apiEndpoint))
	if err != nil {
		return fmt.Errorf("failed to construct API client: %w", err)
	}
	defer client.Close()

	model := "multimodalembedding@001"
	endpoint := fmt.Sprintf("projects/%s/locations/%s/publishers/google/models/%s", project, location, model)

	// This is the input to the model's prediction call. For schema, see:
	// https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/multimodal-embeddings-api#request_body
	instance, err := structpb.NewValue(map[string]any{
		"text": "Domestic cats in natural conditions",
		"image": map[string]any{
			// Image and video inputs can be provided either as a Google Cloud Storage URI or as
			// base64-encoded bytes using the "bytesBase64Encoded" field.
			"gcsUri": "gs://cloud-samples-data/generative-ai/image/320px-Felis_catus-cat_on_snow.jpg",
		},
		"video": map[string]any{
			"gcsUri": "gs://cloud-samples-data/video/cat.mp4",
		},
	})
	if err != nil {
		return fmt.Errorf("failed to construct request payload: %w", err)
	}

	req := &aiplatformpb.PredictRequest{
		Endpoint: endpoint,
		// The model supports only 1 instance per request.
		Instances: []*structpb.Value{instance},
	}

	resp, err := client.Predict(ctx, req)
	if err != nil {
		return fmt.Errorf("failed to generate embeddings: %w", err)
	}

	instanceEmbeddingsJson, err := protojson.Marshal(resp.GetPredictions()[0])
	if err != nil {
		return fmt.Errorf("failed to convert protobuf value to JSON: %w", err)
	}
	// For response schema, see:
	// https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/multimodal-embeddings-api#response-body
	var instanceEmbeddings struct {
		ImageEmbeddings []float32 `json:"imageEmbedding"`
		TextEmbeddings  []float32 `json:"textEmbedding"`
		VideoEmbeddings []struct {
			Embedding      []float32 `json:"embedding"`
			StartOffsetSec float64   `json:"startOffsetSec"`
			EndOffsetSec   float64   `json:"endOffsetSec"`
		} `json:"videoEmbeddings"`
	}
	if err := json.Unmarshal(instanceEmbeddingsJson, &instanceEmbeddings); err != nil {
		return fmt.Errorf("failed to unmarshal JSON: %w", err)
	}

	imageEmbedding := instanceEmbeddings.ImageEmbeddings
	textEmbedding := instanceEmbeddings.TextEmbeddings
	// Get the embedding for our single video segment (the `videoEmbeddings` array has one entry
	// for each processed segment).
	videoEmbedding := instanceEmbeddings.VideoEmbeddings[0].Embedding

	fmt.Fprintf(w, "Image embedding (length=%d): %v\n", len(imageEmbedding), imageEmbedding)
	fmt.Fprintf(w, "Text embedding (length=%d): %v\n", len(textEmbedding), textEmbedding)
	fmt.Fprintf(w, "Video embedding (length=%d): %v\n", len(videoEmbedding), videoEmbedding)
	// Example response:
	// Image embedding (length=1408): [-0.01558477 0.0258355 0.016342038 ... ]
	// Text embedding (length=1408): [-0.005894961 0.008349559 0.015355394 ... ]
	// Video embedding (length=1408): [-0.018867437 0.013997682 0.0012682161 ... ]

	return nil
}
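The function writes its results to the supplied io.Writer. One way to run it locally is a minimal sketch like the one below, which assumes the function lives in the same main package and uses placeholder project and region values.

package main

import (
	"log"
	"os"
)

func main() {
	// Placeholder project ID and region; replace with your own values.
	if err := generateForImageTextAndVideo(os.Stdout, "your-project-id", "us-central1"); err != nil {
		log.Fatal(err)
	}
}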
Python
Before trying this sample, follow the Python setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Python API reference documentation.
To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
import vertexai

from vertexai.vision_models import Image, MultiModalEmbeddingModel, Video
from vertexai.vision_models import VideoSegmentConfig

# TODO(developer): Update & uncomment line below
# PROJECT_ID = "your-project-id"
vertexai.init(project=PROJECT_ID, location="us-central1")

model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding@001")

image = Image.load_from_file(
    "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png"
)
video = Video.load_from_file(
    "gs://cloud-samples-data/vertex-ai-vision/highway_vehicles.mp4"
)

embeddings = model.get_embeddings(
    image=image,
    video=video,
    video_segment_config=VideoSegmentConfig(end_offset_sec=1),
    contextual_text="Cars on Highway",
)

print(f"Image Embedding: {embeddings.image_embedding}")

# Video Embeddings are segmented based on the video_segment_config.
print("Video Embeddings:")
for video_embedding in embeddings.video_embeddings:
    print(
        f"Video Segment: {video_embedding.start_offset_sec} - {video_embedding.end_offset_sec}"
    )
    print(f"Embedding: {video_embedding.embedding}")

print(f"Text Embedding: {embeddings.text_embedding}")
# Example response:
# Image Embedding: [-0.0123144267, 0.0727186054, 0.000201397663, ...]
# Video Embeddings:
# Video Segment: 0.0 - 1.0
# Embedding: [-0.0206376351, 0.0345234685, ...]
# Text Embedding: [-0.0207006838, -0.00251058186, ...]
What's next
To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.