Bounding box detection
Stay organized with collections
Save and categorize content based on your preferences.
In this experimental launch, we are providing developers with a powerful tool for object detection and localization within images and video. By accurately identifying and delineating objects with bounding boxes, developers can unlock a wide range of applications and enhance the intelligence of their projects.
Key Benefits:
- Simple: Integrate object detection capabilities into your applications with ease, regardless of your computer vision expertise.
- Customizable: Produce bounding boxes based on custom instructions (e.g. "I want to see bounding boxes of all the green objects in this image"), without having to train a custom model.
Technical Details:
- Input: Your prompt and associated images or video frames.
- Output: Bounding boxes in the `[y_min, x_min, y_max, x_max]` format. The top-left corner is the origin. The `x` and `y` axes run horizontally and vertically, respectively. Coordinate values are normalized to 0-1000 for every image.
- Visualization: AI Studio users will see bounding boxes plotted within the UI. Vertex AI users should visualize their bounding boxes through custom visualization code.
Python
Install
pip install --upgrade google-genai
To learn more, see the SDK reference documentation.
Set environment variables to use the Gen AI SDK with Vertex AI:
# Replace the `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION` values
# with appropriate values for your project.
export GOOGLE_CLOUD_PROJECT=GOOGLE_CLOUD_PROJECT
export GOOGLE_CLOUD_LOCATION=global
export GOOGLE_GENAI_USE_VERTEXAI=True
importrequests
fromgoogleimport genai
fromgoogle.genai.typesimport (
GenerateContentConfig,
HarmBlockThreshold,
HarmCategory,
HttpOptions,
Part,
SafetySetting,
)
fromPILimport Image, ImageColor, ImageDraw
frompydanticimport BaseModel
# Helper class to represent a bounding box
class BoundingBox(BaseModel):
    """
    Represents a bounding box with its 2D coordinates and associated label.

    Attributes:
        box_2d (list[int]): A list of integers representing the 2D coordinates of the bounding box,
            typically in the format [y_min, x_min, y_max, x_max].
        label (str): A string representing the label or class associated with the object within the bounding box.
    """

    # Coordinates of the box; indexed as [y_min, x_min, y_max, x_max] by the plotting helper.
    box_2d: list[int]
    # Text drawn next to the box when it is non-empty.
    label: str
# Helper function to plot bounding boxes on an image
def plot_bounding_boxes(image_uri: str, bounding_boxes: list[BoundingBox]) -> None:
    """
    Plots bounding boxes on an image with labels, using PIL and normalized coordinates.

    The image is downloaded from ``image_uri``, every box is outlined in its own
    color (cycling through PIL's named colors), and the result is opened with
    ``Image.show()``.

    Args:
        image_uri: The URI of the image file.
        bounding_boxes: A list of BoundingBox objects. Each box's coordinates are in
            normalized [y_min, x_min, y_max, x_max] format (0-1000 per axis).

    Raises:
        requests.HTTPError: If the download answers with a 4xx/5xx status.
    """
    # The context manager releases the streamed connection once plotting is done.
    with requests.get(image_uri, stream=True, timeout=10) as response:
        # Fail here with a clear HTTP error instead of handing an error page to PIL.
        response.raise_for_status()
        # Ask urllib3 to undo any Content-Encoding so PIL receives the image bytes themselves.
        response.raw.decode_content = True

        with Image.open(response.raw) as im:
            width, height = im.size
            draw = ImageDraw.Draw(im)
            colors = list(ImageColor.colormap.keys())

            for i, bbox in enumerate(bounding_boxes):
                # Scale normalized coordinates to image dimensions
                abs_y_min = int(bbox.box_2d[0] / 1000 * height)
                abs_x_min = int(bbox.box_2d[1] / 1000 * width)
                abs_y_max = int(bbox.box_2d[2] / 1000 * height)
                abs_x_max = int(bbox.box_2d[3] / 1000 * width)

                color = colors[i % len(colors)]

                # Draw the rectangle using the correct (x, y) pairs
                draw.rectangle(
                    ((abs_x_min, abs_y_min), (abs_x_max, abs_y_max)),
                    outline=color,
                    width=4,
                )
                if bbox.label:
                    # Position the text at the top-left corner of the box
                    draw.text((abs_x_min + 8, abs_y_min + 6), bbox.label, fill=color)

            im.show()
# Gen AI client using the "v1" API version.
client = genai.Client(http_options=HttpOptions(api_version="v1"))

# Instruction text sent as the system instruction of the request.
system_instruction = """
Return bounding boxes as an array with labels.
Never return masks. Limit to 25 objects.
If an object is present multiple times, give each object a unique label
according to its distinct characteristics (colors, size, position, etc..).
"""

# Safety setting for the dangerous-content category, using the BLOCK_ONLY_HIGH threshold.
safety_settings = [
    SafetySetting(
        category=HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        threshold=HarmBlockThreshold.BLOCK_ONLY_HIGH,
    ),
]

# Request JSON output described by a list of BoundingBox models.
config = GenerateContentConfig(
    system_instruction=system_instruction,
    temperature=0.5,
    safety_settings=safety_settings,
    response_mime_type="application/json",
    response_schema=list[BoundingBox],
)

# The same URI is sent to the model and later downloaded again for plotting.
image_uri = "https://storage.googleapis.com/generativeai-downloads/images/socks.jpg"
image_part = Part.from_uri(
    file_uri=image_uri,
    mime_type="image/jpeg",
)
prompt = "Output the positions of the socks with a face. Label according to position in the image."

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[image_part, prompt],
    config=config,
)

# Print the raw reply text, then draw the parsed boxes on the image.
print(response.text)
plot_bounding_boxes(image_uri, response.parsed)
# Example response:
# [
# {"box_2d": [6, 246, 386, 526], "label": "top-left light blue sock with cat face"},
# {"box_2d": [234, 649, 650, 863], "label": "top-right light blue sock with cat face"},
# ]
Go
Learn how to install or update the Gen AI SDK for Go.
To learn more, see the SDK reference documentation.
Set environment variables to use the Gen AI SDK with Vertex AI:
# Replace the `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_LOCATION` values
# with appropriate values for your project.
export GOOGLE_CLOUD_PROJECT=GOOGLE_CLOUD_PROJECT
export GOOGLE_CLOUD_LOCATION=global
export GOOGLE_GENAI_USE_VERTEXAI=True
import (
	"context"
	"encoding/json"
	"fmt"
	"image"
	"image/color"
	"image/draw"
	"image/jpeg"
	"io"
	"net/http"
	"time"

	"google.golang.org/genai"
)
// BoundingBox is a single labeled box, decoded from a JSON object with the
// keys "box_2d" and "label".
type BoundingBox struct {
	// Box2D holds the box coordinates; the plotting code reads them in the
	// order [y_min, x_min, y_max, x_max].
	Box2D []int `json:"box_2d"`
	// Label is the text associated with the box.
	Label string `json:"label"`
}
// plotBoundingBoxes downloads the JPEG at imageURI and outlines every entry of
// boundingBoxes in red on an in-memory RGBA copy of the image.
//
// Each Box2D must contain exactly four values, [y_min, x_min, y_max, x_max],
// normalized to the 0-1000 range; they are scaled to pixels and clamped to the
// image so that a value of 1000 still lands on the last row or column.
//
// An error is returned when the download fails or times out, the server
// replies with a status other than 200, the body cannot be decoded as JPEG, or
// a box does not have exactly four coordinates.
//
// NOTE(review): the annotated image only exists in memory; this function does
// not encode, save, or return it.
func plotBoundingBoxes(imageURI string, boundingBoxes []BoundingBox) error {
	// Bound the download so a stalled server cannot block the caller forever.
	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Get(imageURI)
	if err != nil {
		return fmt.Errorf("failed to download image: %w", err)
	}
	defer resp.Body.Close()
	// Report HTTP failures directly instead of as a confusing JPEG decode error.
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("failed to download image: unexpected status %s", resp.Status)
	}

	img, err := jpeg.Decode(resp.Body)
	if err != nil {
		return fmt.Errorf("failed to decode image: %w", err)
	}

	bounds := img.Bounds()
	rgba := image.NewRGBA(bounds)
	draw.Draw(rgba, bounds, img, bounds.Min, draw.Src)

	// Simple red color for bounding boxes
	red := color.RGBA{255, 0, 0, 255}

	for i, bbox := range boundingBoxes {
		// The boxes come from model output; reject malformed ones instead of
		// panicking on an out-of-range index.
		if len(bbox.Box2D) != 4 {
			return fmt.Errorf("bounding box %d (%q): expected 4 coordinates, got %d",
				i, bbox.Label, len(bbox.Box2D))
		}

		// scale normalized coordinates [0–1000] to absolute pixels
		yMin := scaleCoord(bbox.Box2D[0], bounds.Min.Y, bounds.Dy())
		xMin := scaleCoord(bbox.Box2D[1], bounds.Min.X, bounds.Dx())
		yMax := scaleCoord(bbox.Box2D[2], bounds.Min.Y, bounds.Dy())
		xMax := scaleCoord(bbox.Box2D[3], bounds.Min.X, bounds.Dx())

		// draw rectangle border
		for x := xMin; x <= xMax; x++ {
			rgba.Set(x, yMin, red)
			rgba.Set(x, yMax, red)
		}
		for y := yMin; y <= yMax; y++ {
			rgba.Set(xMin, y, red)
			rgba.Set(xMax, y, red)
		}
	}
	return nil
}

// scaleCoord converts a 0-1000 normalized value into a pixel coordinate along
// an axis that starts at origin and spans size pixels, clamped to that axis.
func scaleCoord(norm, origin, size int) int {
	px := norm * size / 1000
	if px > size-1 {
		px = size - 1
	}
	if px < 0 {
		px = 0
	}
	return origin + px
}
// generateBoundingBoxesWithText sends an image URI and a text prompt to the
// model, writes the response text to w, decodes that text as a JSON array of
// BoundingBox values, and hands the boxes to plotBoundingBoxes.
//
// It returns an error if the client cannot be created, the generation call
// fails, the response text is not valid JSON for []BoundingBox, or plotting
// fails.
func generateBoundingBoxesWithText(w io.Writer) error {
	const (
		modelName = "gemini-2.5-flash"
		imageURI  = "https://storage.googleapis.com/generativeai-downloads/images/socks.jpg"
	)

	ctx := context.Background()

	client, err := genai.NewClient(ctx, &genai.ClientConfig{
		HTTPOptions: genai.HTTPOptions{APIVersion: "v1"},
	})
	if err != nil {
		return fmt.Errorf("failed to create genai client: %w", err)
	}

	// Schema definition for []BoundingBox: an array of objects, each with an
	// integer array "box_2d" and a string "label", both required.
	coordsSchema := &genai.Schema{
		Type:  genai.TypeArray,
		Items: &genai.Schema{Type: genai.TypeInteger},
	}
	boxSchema := &genai.Schema{
		Type: genai.TypeObject,
		Properties: map[string]*genai.Schema{
			"box_2d": coordsSchema,
			"label":  {Type: genai.TypeString},
		},
		Required: []string{"box_2d", "label"},
	}
	schema := &genai.Schema{
		Type:  genai.TypeArray,
		Items: boxSchema,
	}

	systemInstruction := &genai.Content{
		Parts: []*genai.Part{{
			Text: "Return bounding boxes as an array with labels. Never return masks. Limit to 25 objects.",
		}},
	}
	safety := []*genai.SafetySetting{
		{
			Category:  genai.HarmCategoryDangerousContent,
			Threshold: genai.HarmBlockThresholdBlockOnlyHigh,
		},
	}
	config := &genai.GenerateContentConfig{
		SystemInstruction: systemInstruction,
		Temperature:       float32Ptr(0.5),
		ResponseMIMEType:  "application/json",
		ResponseSchema:    schema,
		SafetySettings:    safety,
	}

	// A single user turn: the image reference followed by the text prompt.
	imagePart := &genai.Part{
		FileData: &genai.FileData{
			FileURI:  imageURI,
			MIMEType: "image/jpeg",
		},
	}
	promptPart := &genai.Part{
		Text: "Output the positions of the socks with a face. Label according to position in the image.",
	}
	contents := []*genai.Content{
		{
			Role:  "user",
			Parts: []*genai.Part{imagePart, promptPart},
		},
	}

	resp, err := client.Models.GenerateContent(ctx, modelName, contents, config)
	if err != nil {
		return fmt.Errorf("failed to generate content: %w", err)
	}

	fmt.Fprintln(w, resp.Text())

	// Parse into []BoundingBox
	var boxes []BoundingBox
	err = json.Unmarshal([]byte(resp.Text()), &boxes)
	if err != nil {
		return fmt.Errorf("failed to parse bounding boxes: %w", err)
	}

	// Example response:
	// Box: (962,113)-(2158,1631) Label: top left sock with face
	// Box: (2656,721)-(3953,2976) Label: top right sock with face
	//...

	return plotBoundingBoxes(imageURI, boxes)
}
// float32Ptr returns a pointer to a float32 holding the value v.
func float32Ptr(v float32) *float32 {
	p := new(float32)
	*p = v
	return p
}