feat: add coordinate space abstraction for open weights LLM support #282

Original file line number	Diff line number	Diff line change
Expand Up		@@ -175,6 +175,72 @@ class MyVlmProvider(VlmProvider):

		---

	## Image Scaling

	Before a screenshot is sent to a model, it is preprocessed by an image scaler. The scaler resizes the image to match the model's optimal input resolution, which affects both token cost and coordinate precision.

	All scalers inherit from `ImageScaler`:

	\| Class \| Behaviour \| Used by \|
	\|-------\|-----------\|---------\|
	\| `PatchOptimizedImageScaler` \| Finds the largest aspect-preserving size within a patch-based token budget (`max_edge`, `max_tokens`, `patch_size`) \| `AskUIVlmProvider`, `AnthropicVlmProvider`, `OpenAIVlmProvider` \|
	\| `ContainedImageScaler` \| Fits within `max_width` × `max_height` bounds \| Default in `VlmProvider` base class \|

	### Configuring the Maximum Image Edge

	All built-in providers accept an `image_edge_max` parameter that controls the maximum pixel dimension of screenshots sent to the model. You can also set it via the `ASKUI_VLM_MAX_IMAGE_EDGE` environment variable:

	```
	ASKUI_VLM_MAX_IMAGE_EDGE=1568
	```

	Or pass it directly:

	```python
	from askui import AgentSettings, ComputerAgent
	from askui.model_providers import AnthropicVlmProvider

	with ComputerAgent(settings=AgentSettings(
	vlm_provider=AnthropicVlmProvider(image_edge_max=1568),
	)) as agent:
	agent.act("Open settings")
	```

	### Using a Custom Image Scaler

	To fully replace the scaling strategy, pass an `image_scaler` instance. When provided, `image_edge_max` is ignored:

	```python
	from askui import AgentSettings, ComputerAgent
	from askui.model_providers import (
	AnthropicVlmProvider,
	ContainedImageScaler,
	)

	with ComputerAgent(settings=AgentSettings(
	vlm_provider=AnthropicVlmProvider(
	image_scaler=ContainedImageScaler(max_width=1280, max_height=720),
	),
	)) as agent:
	agent.act("Open settings")
	```

	### Implementing a Custom Image Scaler

	For fully custom scaling logic, subclass `ImageScaler`:

	```python
	from PIL import Image
	from askui.model_providers import ImageScaler

	class MyImageScaler(ImageScaler):
	def __call__(self, image: Image.Image) -> Image.Image:
	# Your custom scaling logic
	return image.resize((1024, 768), Image.Resampling.LANCZOS)
	```

	---

		## Advanced: Injecting a Custom Client

		For full control over HTTP settings (timeouts, proxies, retries), you can inject a pre-configured client:
Expand Down

12 changes: 12 additions & 0 deletions docs/07_tools.md

Show comments View file Open in desktop

Original file line number	Diff line number	Diff line change
Expand Up		@@ -181,6 +181,18 @@ A tool’s __call__ method may return:
		- None
		- a list or tuple containing any of the above

	Image size limit: When a tool returns a `PIL.Image.Image`, it is the tool's responsibility to ensure the image does not exceed **2000×2000 px** (longest side ≤ 2000 px). The Claude API enforces a 2000×2000 px per-image limit when more than 20 images are sent in a single request, which is common in agentic loops. Use `downscale_image()` from `askui.utils.llm_image_utils` to downscale images that may be too large:

	```python
	from PIL import Image
	from askui.utils.llm_image_utils import downscale_image

	image: Image.Image = ... # your image
	image = downscale_image(image, max_dimension=2000)
	```

	This preserves the original aspect ratio and only downscales images whose longest side exceeds the limit.

		### Complete Example

		Here’s a greeting tool that demonstrates all the key concepts:
Expand Down

6 changes: 5 additions & 1 deletion src/askui/android_agent.py

Show comments View file Open in desktop

Original file line number	Diff line number	Diff line change
Expand Up		@@ -87,7 +87,6 @@ def __init__(
		) -> None:
		reporter = CompositeReporter(reporters=reporters)
		self.os = PpadbAgentOs(device_identifier=device, reporter=reporter)
	self.act_agent_os_facade = AndroidAgentOsFacade(self.os)
		super().__init__(
		reporter=reporter,
		retry=retry,
Expand All		@@ -97,6 +96,11 @@ def __init__(
		callbacks=callbacks,
		truncation_strategy=truncation_strategy,
		)
	self.act_agent_os_facade = AndroidAgentOsFacade(
	self.os,
	coordinate_space=self._vlm_provider.coordinate_space,
	image_scaler=self._vlm_provider.image_scaler,
	)
		self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
		# Override default act settings with Android-specific settings
		self.act_settings = ActSettings(
Expand Down

4 changes: 3 additions & 1 deletion src/askui/computer_agent.py

Show comments View file Open in desktop

Original file line number	Diff line number	Diff line change
Expand Up		@@ -130,7 +130,9 @@ def __init__(
		truncation_strategy=truncation_strategy,
		)
		self.act_agent_os_facade: ComputerAgentOsFacade = ComputerAgentOsFacade(
	self.tools.os
	self.tools.os,
	coordinate_space=self._vlm_provider.coordinate_space,
	image_scaler=self._vlm_provider.image_scaler,
		)
		self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
		# Override default act settings with computer-specific settings
Expand Down

20 changes: 19 additions & 1 deletion src/askui/model_providers/__init__.py

Show comments View file Open in desktop

Original file line number	Diff line number	Diff line change
Expand Up		@@ -35,6 +35,17 @@
		from askui.model_providers.openai_image_qa_provider import OpenAIImageQAProvider
		from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
		from askui.model_providers.vlm_provider import VlmProvider
	from askui.models.shared.coordinate_space import (
	NormalizedCoordinateSpace,
	PixelCoordinateSpace,
	ScaledCoordinateSpace,
	VlmCoordinateSpace,
	)
	from askui.models.shared.image_scaler import (
	ContainedImageScaler,
	ImageScaler,
	PatchOptimizedImageScaler,
	)
		from askui.utils.model_pricing import ModelPricing

		__all__ = [
Expand All		@@ -46,11 +57,18 @@
		"DetectionProvider",
		"GoogleImageQAProvider",
		"ImageQAProvider",
	"ContainedImageScaler",
	"ImageScaler",
		"ModelPricing",
	"PatchOptimizedImageScaler",
	"NormalizedCoordinateSpace",
		"OllamaImageQAProvider",
		"OllamaVlmProvider",
	"OpenAICompatibleVlmProvider",
		"OpenAIImageQAProvider",
		"OpenAIVlmProvider",
	"OpenAICompatibleVlmProvider",
	"PixelCoordinateSpace",
	"ScaledCoordinateSpace",
	"VlmCoordinateSpace",
		"VlmProvider",
		]

24 changes: 24 additions & 0 deletions src/askui/model_providers/anthropic_vlm_provider.py

Show comments View file Open in desktop

Original file line number	Diff line number	Diff line change
Expand Up		@@ -14,11 +14,13 @@
		ThinkingConfigParam,
		ToolChoiceParam,
		)
	from askui.models.shared.image_scaler import ImageScaler, PatchOptimizedImageScaler
		from askui.models.shared.prompts import SystemPrompt
		from askui.models.shared.tools import ToolCollection
		from askui.utils.model_pricing import ModelPricing

		_DEFAULT_MODEL_ID = "claude-sonnet-4-6"
	_DEFAULT_MAX_IMAGE_EDGE = 1024


		class AnthropicVlmProvider(VlmProvider):
Expand Down Expand Up		@@ -46,6 +48,13 @@ class AnthropicVlmProvider(VlmProvider):
		cost in USD per 1M output tokens.
		cache_write_cost_per_million_tokens (float \| None, optional): Override
		cost in USD per 1M cache write input tokens.
	image_scaler (`ImageScaler` \| None, optional): Custom image preprocessing
	callable. If ``None``, uses Anthropic-optimized patch-based scaling
	controlled by ``image_edge_max``.
	image_edge_max (int \| None, optional): Maximum edge length (in pixels)
	for screenshots sent to the model. Only used when ``image_scaler``
	is not provided. Reads ``ASKUI_VLM_MAX_IMAGE_EDGE`` from the
	environment if not provided. Defaults to 1024.
		cache_read_cost_per_million_tokens (float \| None, optional): Override
		cost in USD per 1M cache read input tokens.

Expand All		@@ -70,6 +79,8 @@ def __init__(
		auth_token: str \| None = None,
		model_id: str \| None = None,
		client: Anthropic \| None = None,
	image_scaler: ImageScaler \| None = None,
	image_edge_max: int \| None = None,
		input_cost_per_million_tokens: float \| None = None,
		output_cost_per_million_tokens: float \| None = None,
		cache_write_cost_per_million_tokens: float \| None = None,
Expand All		@@ -78,6 +89,14 @@ def __init__(
		self._model_id_value = (
		model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID
		)
	resolved_edge_max = (
	image_edge_max
	or int(os.environ.get("ASKUI_VLM_MAX_IMAGE_EDGE", "0"))
	or _DEFAULT_MAX_IMAGE_EDGE
	)
	self._image_scaler = image_scaler or PatchOptimizedImageScaler(
	max_edge=resolved_edge_max
	)
		if client is not None:
		self.client = client
		else:
Expand All		@@ -104,6 +123,11 @@ def model_id(self) -> str:
		def pricing(self) -> ModelPricing \| None:
		return self._pricing

	@property
	@override
	def image_scaler(self) -> ImageScaler:
	"""Return the image scaler selected in ``__init__``.

	This is the caller-supplied ``image_scaler`` when one was given,
	otherwise a ``PatchOptimizedImageScaler`` whose ``max_edge`` is the
	resolved maximum image edge.
	"""
	return self._image_scaler

		@cached_property
		def _messages_api(self) -> AnthropicMessagesApi:
		"""Lazily initialise the AnthropicMessagesApi on first use."""
Expand Down

42 changes: 32 additions & 10 deletions src/askui/model_providers/askui_vlm_provider.py

Show comments View file Open in desktop

Original file line number	Diff line number	Diff line change
Expand Up		@@ -15,10 +15,12 @@
		ThinkingConfigParam,
		ToolChoiceParam,
		)
	from askui.models.shared.image_scaler import ImageScaler, PatchOptimizedImageScaler
		from askui.models.shared.prompts import SystemPrompt
		from askui.models.shared.tools import ToolCollection

		_DEFAULT_MODEL_ID = "claude-sonnet-4-6"
	_DEFAULT_MAX_IMAGE_EDGE = 1024


		class AskUIVlmProvider(VlmProvider):
Expand All		@@ -29,23 +31,28 @@ class AskUIVlmProvider(VlmProvider):
		on the first API call, not at construction time.

		Args:
	workspace_id (str \| None, optional): AskUI workspace ID. Reads
	`ASKUI_WORKSPACE_ID` from the environment if not provided.
	token (str \| None, optional): AskUI API token. Reads `ASKUI_TOKEN`
	from the environment if not provided.
	model_id (str, optional): Claude model to use. Defaults to
	`"claude-sonnet-4-6"`.
	client (Anthropic \| None, optional): Pre-configured Anthropic client.
	If provided, `workspace_id` and `token` are ignored.
	askui_settings (`AskUiInferenceApiSettings` \| None, optional):
	Connection settings (workspace ID, token, base URL). Reads
	from environment variables if not provided.
	model_id (str \| None, optional): Claude model to use. Defaults to
	``"claude-sonnet-4-6"``.
	client (`Anthropic` \| None, optional): Pre-configured Anthropic client.
	If provided, ``askui_settings`` is only used for the base URL.
	image_scaler (`ImageScaler` \| None, optional): Custom image preprocessing
	callable. If ``None``, uses Anthropic-optimized patch-based scaling
	controlled by ``image_edge_max``.
	image_edge_max (int \| None, optional): Maximum edge length (in pixels)
	for screenshots sent to the model. Only used when ``image_scaler``
	is not provided. Reads ``ASKUI_VLM_MAX_IMAGE_EDGE`` from the
	environment if not provided. Defaults to 1024.

		Example:
		```python
		from askui import AgentSettings, ComputerAgent
		from askui.model_providers import AskUIVlmProvider

		agent = ComputerAgent(settings=AgentSettings(
		vlm_provider=AskUIVlmProvider(
	workspace_id="my-workspace",
	token="my-token",
		model_id="claude-opus-4-6-20260401",
		)
		))
Expand All		@@ -57,18 +64,33 @@ def __init__(
		askui_settings: AskUiInferenceApiSettings \| None = None,
		model_id: str \| None = None,
		client: Anthropic \| None = None,
	image_scaler: ImageScaler \| None = None,
	image_edge_max: int \| None = None,
		) -> None:
		self._askui_settings = askui_settings or AskUiInferenceApiSettings()
		self._model_id_value = (
		model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID
		)
		self._injected_client = client
	resolved_edge_max = (
	image_edge_max
	or int(os.environ.get("ASKUI_VLM_MAX_IMAGE_EDGE", "0"))
	or _DEFAULT_MAX_IMAGE_EDGE
	)
	self._image_scaler = image_scaler or PatchOptimizedImageScaler(
	max_edge=resolved_edge_max
	)

		@property
		@override
		def model_id(self) -> str:
		return self._model_id_value

	@property
	@override
	def image_scaler(self) -> ImageScaler:
	"""Return the image scaler selected in ``__init__``.

	This is the caller-supplied ``image_scaler`` when one was given,
	otherwise a ``PatchOptimizedImageScaler`` whose ``max_edge`` is the
	resolved maximum image edge.
	"""
	return self._image_scaler

		@cached_property
		def _messages_api(self) -> AnthropicMessagesApi:
		"""Lazily initialise the AnthropicMessagesApi on first use."""
Expand Down

49 changes: 49 additions & 0 deletions src/askui/model_providers/ollama_vlm_provider.py

Show comments View file Open in desktop

Original file line number	Diff line number	Diff line change
		@@ -1,26 +1,54 @@
		"""OllamaVlmProvider — VLM access via a local Ollama instance."""

		from openai import OpenAI
	from typing_extensions import override

		from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
	from askui.models.shared.coordinate_space import (
	PixelCoordinateSpace,
	ScaledCoordinateSpace,
	VlmCoordinateSpace,
	)
	from askui.models.shared.image_scaler import ImageScaler

		_DEFAULT_BASE_URL = "http://localhost:11434/v1"
		_DEFAULT_MODEL_ID = "qwen3.5"

	_QWEN_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)
	_HOLO_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)
	_KIMI_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)


		class OllamaVlmProvider(OpenAIVlmProvider):
		"""VLM provider that routes requests to a local Ollama instance.

		Thin convenience wrapper around `OpenAIVlmProvider` with Ollama
		defaults (``base_url``, ``api_key``, ``model_id``).

	Qwen, Holo and Kimi models are automatically detected and their
	coordinate space is set to ``ScaledCoordinateSpace(width=1000, height=1000)``.
	Pass ``coordinate_space`` explicitly to override auto-detection.

		Args:
		model_id (str, optional): Ollama model to use. Defaults to
		``"qwen3.5"``.
		base_url (str, optional): Base URL for the Ollama OpenAI-compatible
		API. Defaults to ``"http://localhost:11434/v1"``.
		client (`OpenAI` \| None, optional): Pre-configured OpenAI client.
		If provided, ``base_url`` is ignored.
	coordinate_space (VlmCoordinateSpace \| None, optional): The coordinate
	grid the model emits coordinates in. ``None`` (the default)
	enables auto-detection based on ``model_id``.
	image_scaler (`ImageScaler` \| None, optional): Custom image preprocessing
	callable. If ``None``, inherits from `OpenAIVlmProvider`.
	image_edge_max (int \| None, optional): Maximum edge length (in pixels)
	for screenshots sent to the model. Only used when ``image_scaler``
	is not provided. Reads ``ASKUI_VLM_MAX_IMAGE_EDGE`` from the
	environment if not provided. Inherits the default from
	`OpenAIVlmProvider` (1024).

		Example:
		```python
Expand All		@@ -40,10 +68,31 @@ def __init__(
		model_id: str = _DEFAULT_MODEL_ID,
		base_url: str = _DEFAULT_BASE_URL,
		client: OpenAI \| None = None,
	coordinate_space: VlmCoordinateSpace \| None = None,
	image_scaler: ImageScaler \| None = None,
	image_edge_max: int \| None = None,
		) -> None:
	self._coordinate_space_override = coordinate_space
		super().__init__(
		model_id=model_id,
		api_key="ollama", # Ollama requires no auth; OpenAI SDK needs a value
		base_url=base_url,
		client=client,
	coordinate_space=coordinate_space or PixelCoordinateSpace(),
	image_scaler=image_scaler,
	image_edge_max=image_edge_max,
		)

	@property
	@override
	def coordinate_space(self) -> VlmCoordinateSpace:
	    """Return the coordinate grid used to interpret the model's output.

	    A ``coordinate_space`` passed explicitly to the constructor always
	    wins. Otherwise the lowercased model id is searched for the
	    ``"qwen"``, ``"holo"`` and ``"kimi"`` markers, in that order, and the
	    matching module-level coordinate space is returned. When no marker
	    matches, the value stored in ``self._coordinate_space`` is returned.
	    """
	    explicit = self._coordinate_space_override
	    if explicit is not None:
	        return explicit

	    # Order matters: the first marker found in the model id decides.
	    family_spaces = (
	        ("qwen", _QWEN_COORDINATE_SPACE),
	        ("holo", _HOLO_COORDINATE_SPACE),
	        ("kimi", _KIMI_COORDINATE_SPACE),
	    )
	    model_name = self._model_id_value.lower()
	    for marker, space in family_spaces:
	        if marker in model_name:
	            return space
	    return self._coordinate_space

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

feat: add coordinate space abstraction for open weights LLM support #282

Are you sure you want to change the base?

Uh oh!

feat: add coordinate space abstraction for open weights LLM support #282

Filter by extension

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Uh oh!