Bounding box detection

In this experimental launch, we are providing developers with a powerful tool for object detection and localization within images and video. By accurately identifying and delineating objects with bounding boxes, developers can unlock a wide range of applications and enhance the intelligence of their projects.

Key Benefits:

  • Simple: Integrate object detection capabilities into your applications with ease, regardless of your computer vision expertise.
  • Customizable: Produce bounding boxes based on custom instructions (e.g. "I want to see bounding boxes of all the green objects in this image"), without having to train a custom model.

Technical Details:

  • Input: Your prompt and associated images or video frames.
  • Output: Bounding boxes in the [y_min, x_min, y_max, x_max] format. The origin is the top-left corner of the image; the x-axis runs horizontally and the y-axis runs vertically. Coordinate values are normalized to the range 0–1000 for every image.
  • Visualization: AI Studio users will see bounding boxes plotted within the UI. Vertex AI users should visualize their bounding boxes through custom visualization code.

Python

Install

pip install --upgrade google-genai

To learn more, see the SDK reference documentation.

Set environment variables to use the Gen AI SDK with Vertex AI:

# Replace the `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION` values
# with appropriate values for your project.
export GOOGLE_CLOUD_PROJECT=GOOGLE_CLOUD_PROJECT
export GOOGLE_CLOUD_LOCATION=global
export GOOGLE_GENAI_USE_VERTEXAI=True

import requests
from google import genai
from google.genai.types import (
    GenerateContentConfig,
    HarmBlockThreshold,
    HarmCategory,
    HttpOptions,
    Part,
    SafetySetting,
)
from PIL import Image, ImageColor, ImageDraw
from pydantic import BaseModel
# Helper class to represent a bounding box
class BoundingBox(BaseModel):
    """
    Represents a bounding box with its 2D coordinates and associated label.

    Attributes:
        box_2d (list[int]): A list of integers representing the 2D coordinates of the bounding box,
            typically in the format [y_min, x_min, y_max, x_max].
        label (str): A string representing the label or class associated with the object within the bounding box.
    """

    box_2d: list[int]
    label: str
# Helper function to plot bounding boxes on an image
def plot_bounding_boxes(image_uri: str, bounding_boxes: list[BoundingBox]) -> None:
    """
    Plots bounding boxes on an image with labels, using PIL and normalized coordinates.

    The image is fetched over HTTP, each box is outlined in a color taken from
    PIL's named color map (cycling when there are more boxes than colors), and
    the annotated image is shown with ``im.show()``.

    Args:
        image_uri: The URI of the image file.
        bounding_boxes: A list of BoundingBox objects. Each box's coordinates are in
            normalized [y_min, x_min, y_max, x_max] format.
    """
    with Image.open(requests.get(image_uri, stream=True, timeout=10).raw) as im:
        width, height = im.size
        draw = ImageDraw.Draw(im)

        colors = list(ImageColor.colormap.keys())

        for i, bbox in enumerate(bounding_boxes):
            # Scale normalized coordinates to image dimensions
            abs_y_min = int(bbox.box_2d[0] / 1000 * height)
            abs_x_min = int(bbox.box_2d[1] / 1000 * width)
            abs_y_max = int(bbox.box_2d[2] / 1000 * height)
            abs_x_max = int(bbox.box_2d[3] / 1000 * width)

            color = colors[i % len(colors)]

            # Draw the rectangle using the correct (x, y) pairs
            draw.rectangle(
                ((abs_x_min, abs_y_min), (abs_x_max, abs_y_max)),
                outline=color,
                width=4,
            )
            if bbox.label:
                # Position the text at the top-left corner of the box
                draw.text((abs_x_min + 8, abs_y_min + 6), bbox.label, fill=color)

        # Show the result while the image is still open.
        im.show()
# Create the Gen AI client, pinned to the v1 API.
client = genai.Client(http_options=HttpOptions(api_version="v1"))

# Ask for structured JSON output that matches list[BoundingBox].
config = GenerateContentConfig(
    system_instruction="""
    Return bounding boxes as an array with labels.
    Never return masks. Limit to 25 objects.
    If an object is present multiple times, give each object a unique label
    according to its distinct characteristics (colors, size, position, etc..).
    """,
    temperature=0.5,
    safety_settings=[
        SafetySetting(
            category=HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,
        ),
    ],
    response_mime_type="application/json",
    response_schema=list[BoundingBox],
)

image_uri = "https://storage.googleapis.com/generativeai-downloads/images/socks.jpg"

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        Part.from_uri(
            file_uri=image_uri,
            mime_type="image/jpeg",
        ),
        "Output the positions of the socks with a face. Label according to position in the image.",
    ],
    config=config,
)
# Print the raw JSON text, then draw the parsed boxes on the image.
print(response.text)
plot_bounding_boxes(image_uri, response.parsed)

# Example response:
# [
#     {"box_2d": [6, 246, 386, 526], "label": "top-left light blue sock with cat face"},
#     {"box_2d": [234, 649, 650, 863], "label": "top-right light blue sock with cat face"},
# ]

Go

Learn how to install or update the Gen AI SDK for Go.

To learn more, see the SDK reference documentation.

Set environment variables to use the Gen AI SDK with Vertex AI:

# Replace the `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION` values
# with appropriate values for your project.
export GOOGLE_CLOUD_PROJECT=GOOGLE_CLOUD_PROJECT
export GOOGLE_CLOUD_LOCATION=global
export GOOGLE_GENAI_USE_VERTEXAI=True

import(
	"context"
	"encoding/json"
	"fmt"
	"image"
	"image/color"
	"image/draw"
	"image/jpeg"
	"io"
	"net/http"
	"google.golang.org/genai"
)
// BoundingBox is one detected object: its box coordinates plus a text label.
// The JSON field names ("box_2d", "label") are the keys used when a value is
// encoded to or decoded from JSON.
type BoundingBox struct {
	// Box2D holds the box coordinates, decoded from the "box_2d" JSON array.
	Box2D []int `json:"box_2d"`
	// Label is the description attached to the box.
	Label string `json:"label"`
}
// plotBoundingBoxes downloads the JPEG image at imageURI and outlines every
// entry of boundingBoxes in red on an in-memory RGBA copy of it.
//
// Each BoundingBox.Box2D must hold [y_min, x_min, y_max, x_max] normalized to
// the range 0-1000; the values are scaled to the decoded image's dimensions and
// clamped to the image so that edges at 0 or 1000 are still drawn.
//
// An error is returned when the download fails, the server answers with a
// status other than 200, the body cannot be decoded as JPEG, or a box has
// fewer than four coordinates.
//
// NOTE(review): the annotated image only exists in memory; nothing here
// encodes, saves, or returns it, so callers get no visible output. Confirm
// whether it should be written out (for example with jpeg.Encode).
func plotBoundingBoxes(imageURI string, boundingBoxes []BoundingBox) error {
	// NOTE: http.Get goes through http.DefaultClient, which sets no timeout.
	resp, err := http.Get(imageURI)
	if err != nil {
		return fmt.Errorf("failed to download image: %w", err)
	}
	defer resp.Body.Close()

	// Fail on HTTP errors instead of handing an error body to the JPEG decoder.
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("failed to download image: unexpected status %s", resp.Status)
	}

	img, err := jpeg.Decode(resp.Body)
	if err != nil {
		return fmt.Errorf("failed to decode image: %w", err)
	}

	bounds := img.Bounds()
	rgba := image.NewRGBA(bounds)
	draw.Draw(rgba, bounds, img, bounds.Min, draw.Src)

	// Simple red color for bounding boxes
	red := color.RGBA{R: 255, G: 0, B: 0, A: 255}

	for i, bbox := range boundingBoxes {
		// A malformed model response must not panic on the index expressions below.
		if len(bbox.Box2D) < 4 {
			return fmt.Errorf("bounding box %d (%q): expected 4 coordinates, got %d",
				i, bbox.Label, len(bbox.Box2D))
		}

		// scale normalized coordinates [0-1000] to absolute pixels
		yMin := scaleCoord(bbox.Box2D[0], bounds.Min.Y, bounds.Dy())
		xMin := scaleCoord(bbox.Box2D[1], bounds.Min.X, bounds.Dx())
		yMax := scaleCoord(bbox.Box2D[2], bounds.Min.Y, bounds.Dy())
		xMax := scaleCoord(bbox.Box2D[3], bounds.Min.X, bounds.Dx())

		// Tolerate swapped corners so all four sides are always drawn.
		if xMin > xMax {
			xMin, xMax = xMax, xMin
		}
		if yMin > yMax {
			yMin, yMax = yMax, yMin
		}

		// draw rectangle border
		for x := xMin; x <= xMax; x++ {
			rgba.Set(x, yMin, red)
			rgba.Set(x, yMax, red)
		}
		for y := yMin; y <= yMax; y++ {
			rgba.Set(xMin, y, red)
			rgba.Set(xMax, y, red)
		}
	}
	return nil
}

// scaleCoord maps a coordinate normalized to 0-1000 onto the pixel range
// [origin, origin+size-1], clamping inputs that fall outside 0-1000.
func scaleCoord(norm, origin, size int) int {
	if size <= 0 {
		return origin
	}
	if norm < 0 {
		norm = 0
	} else if norm > 1000 {
		norm = 1000
	}
	p := origin + norm*size/1000
	if last := origin + size - 1; p > last {
		// norm == 1000 lands one past the last pixel; pull it back inside.
		return last
	}
	return p
}
// generateBoundingBoxesWithText asks the model to locate objects in a sample
// image, writes the model's raw text response to w, decodes that text as a
// JSON array of BoundingBox values, and passes the boxes to plotBoundingBoxes.
//
// It returns an error if the client cannot be created, the generation request
// fails, the response text is not valid JSON for []BoundingBox, or
// plotBoundingBoxes fails.
func generateBoundingBoxesWithText(w io.Writer) error {
	ctx := context.Background()

	client, err := genai.NewClient(ctx, &genai.ClientConfig{
		HTTPOptions: genai.HTTPOptions{APIVersion: "v1"},
	})
	if err != nil {
		return fmt.Errorf("failed to create genai client: %w", err)
	}

	imageURI := "https://storage.googleapis.com/generativeai-downloads/images/socks.jpg"

	// Schema definition for []BoundingBox
	schema := &genai.Schema{
		Type: genai.TypeArray,
		Items: &genai.Schema{
			Type: genai.TypeObject,
			Properties: map[string]*genai.Schema{
				"box_2d": {
					Type:  genai.TypeArray,
					Items: &genai.Schema{Type: genai.TypeInteger},
				},
				"label": {Type: genai.TypeString},
			},
			Required: []string{"box_2d", "label"},
		},
	}

	config := &genai.GenerateContentConfig{
		SystemInstruction: &genai.Content{
			Parts: []*genai.Part{{
				Text: "Return bounding boxes as an array with labels. Never return masks. Limit to 25 objects.",
			}},
		},
		Temperature:      float32Ptr(0.5),
		ResponseMIMEType: "application/json",
		ResponseSchema:   schema,
		SafetySettings: []*genai.SafetySetting{
			{
				Category:  genai.HarmCategoryDangerousContent,
				Threshold: genai.HarmBlockThresholdBlockOnlyHigh,
			},
		},
	}

	// One user turn: the image reference followed by the text instruction.
	contents := []*genai.Content{
		{
			Role: "user",
			Parts: []*genai.Part{
				{
					FileData: &genai.FileData{
						FileURI:  imageURI,
						MIMEType: "image/jpeg",
					},
				},
				{Text: "Output the positions of the socks with a face. Label according to position in the image."},
			},
		},
	}

	resp, err := client.Models.GenerateContent(ctx, "gemini-2.5-flash", contents, config)
	if err != nil {
		return fmt.Errorf("failed to generate content: %w", err)
	}

	// Extract the text once and reuse it for both printing and parsing.
	text := resp.Text()
	fmt.Fprintln(w, text)

	// Parse into []BoundingBox
	var boxes []BoundingBox
	if err := json.Unmarshal([]byte(text), &boxes); err != nil {
		return fmt.Errorf("failed to parse bounding boxes: %w", err)
	}

	// The request asks for JSON matching the schema above, so the printed text
	// is expected to be an array of objects, for example:
	//	[
	//	  {"box_2d": [6, 246, 386, 526], "label": "top-left light blue sock with cat face"},
	//	  {"box_2d": [234, 649, 650, 863], "label": "top-right light blue sock with cat face"}
	//	]
	return plotBoundingBoxes(imageURI, boxes)
}
// float32Ptr returns a pointer to a fresh float32 holding v, for struct
// fields that take an optional *float32.
func float32Ptr(v float32) *float32 {
	p := new(float32)
	*p = v
	return p
}

Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License, and code samples are licensed under the Apache 2.0 License. For details, see the Google Developers Site Policies. Java is a registered trademark of Oracle and/or its affiliates.

Last updated 2025-10-15 UTC.