Copied to Clipboard
\n {title}\n{'=' * 60}")
def _parse(result: str) -> dict:
return json.loads(result)
def main() -> None:
    """Run the three demo scenarios against the vision tools and print the results.

    Creates the sample images first when the fixtures directory is missing or
    holds no PNG files, prints the model name and health-check mode, and warns
    when the health check fails outside mock mode (the scenarios still run).
    It then calls describe_image, ocr_document and compare_images in turn and
    prints each reply's "result", falling back to "error" and finally to the
    whole reply when neither key is present.
    """

    def _show(heading: str, reply: dict) -> None:
        # Print a section heading followed by the tool's answer or error.
        print(heading)
        print(reply.get("result", reply.get("error", reply)))

    # NOTE(review): vision_backend reads this variable at import time, so this
    # default probably does not change vb.VISION_MODEL — confirm intent.
    os.environ.setdefault("OLLAMA_VISION_MODEL", "minicpm-v4.6")

    have_fixtures = FIXTURES.exists() and list(FIXTURES.glob("*.png"))
    if not have_fixtures:
        from generate_fixtures import main as gen  # noqa: E402

        gen()

    status = vb.health_check()
    _banner("MiniCPM-V 4.6 Vision MCP — Agent Demo")
    print(f"Model: {vb.VISION_MODEL} · Mode: {status.get('mode', '?')}")
    if not (status.get("ok") or vb.MOCK):
        print("\n⚠️ Ollama offline — re-run with OLLAMA_MOCK=1 or start Ollama.\n")

    # Scenario 1 — describe screenshot
    _banner("Scenario 1 — describe_image")
    print('[Tool: describe_image] path=fixtures/diagram_v2.png')
    described = describe_image(str(FIXTURES / "diagram_v2.png"), "What services are shown?")
    _show("\n## Architecture summary\n", _parse(described))

    # Scenario 2 — OCR receipt
    _banner("Scenario 2 — ocr_document")
    print('[Tool: ocr_document] path=fixtures/sample_receipt.png')
    extracted = ocr_document(str(FIXTURES / "sample_receipt.png"))
    _show("\n## Receipt OCR\n", _parse(extracted))

    # Scenario 3 — compare before/after
    _banner("Scenario 3 — compare_images")
    print("[Tool: compare_images] v1 → v2 pipeline diagrams")
    before = str(FIXTURES / "diagram_v1.png")
    after = str(FIXTURES / "diagram_v2.png")
    compared = compare_images(before, after, focus="new components and labels")
    _show("\n## Visual diff\n", _parse(compared))

    _banner("Done — wire examples/server.py into Cursor MCP settings")
    print("See examples/cursor_mcp.json.example")
# Run the demo only when this file is executed as a script, not when imported.
# The guard string must be exactly "__main__"; with a leading space the
# comparison is never true and the script exits without doing anything.
if __name__ == "__main__":
    main()
Part 4 — The vision backend
examples/vision_backend.py encodes images as base64 and POSTs to OLLAMA_HOST/api/chat:
# Body POSTed to OLLAMA_HOST/api/chat: a single user turn carrying the prompt
# text and the base64-encoded images, with streaming turned off.
payload = {
    "model": VISION_MODEL,  # minicpm-v4.6
    "messages": [
        {"role": "user", "content": prompt, "images": images_b64},
    ],
    "stream": False,
}
vision_backend.py
"""Ollama vision backend for MiniCPM-V 4.6 — shared by MCP server and demos."""
from __future__ import annotations
import base64
import os
from pathlib import Path
import httpx
# Base URL of the Ollama server; trailing slashes are stripped so endpoint
# paths such as "/api/chat" can be appended directly.
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://127.0.0.1:11434").rstrip("/")
# Model name sent with every request.
VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL", "minicpm-v4.6")
# Mock mode is on only when OLLAMA_MOCK is exactly "1".
MOCK = os.getenv("OLLAMA_MOCK", "0") == "1"
# File suffixes (compared lower-cased) that _encode_image accepts.
SUPPORTED_SUFFIXES = {".bmp", ".gif", ".jpeg", ".jpg", ".png", ".webp"}
class VisionError(Exception):
    """Raised when an image cannot be used or the Ollama request fails."""
def _encode_image(path: Path) -> str:
    """Return the bytes of the image at *path* as base64 ASCII text.

    Raises:
        VisionError: if *path* is not an existing file, or if its suffix
            (compared case-insensitively) is not in SUPPORTED_SUFFIXES.
    """
    if not path.is_file():
        raise VisionError(f"Image not found: {path}")

    suffix = path.suffix
    if suffix.lower() not in SUPPORTED_SUFFIXES:
        raise VisionError(f"Unsupported image type: {suffix}")

    raw = path.read_bytes()
    return base64.b64encode(raw).decode("ascii")
def _mock_response(prompt: str, image_count: int) -> str:
    """Build the canned reply used in place of a live model call.

    The text names the configured model, reports *image_count*, echoes the
    first 120 characters of *prompt* (always followed by "..."), and ends with
    a hint on how to switch to live inference.
    """
    lines = [
        f"[mock {VISION_MODEL}] Processed {image_count} image(s).",
        f"Prompt preview: {prompt[:120]}...",
        "Set OLLAMA_MOCK=0 and run `ollama pull minicpm-v4.6` for live inference.",
    ]
    return "\n".join(lines)
def chat_vision(prompt: str, image_paths: list[Path], *, timeout: float = 120.0) -> str:
    """Send a vision chat request to Ollama and return the model's reply text.

    Args:
        prompt: Instruction or question sent as the single user message.
        image_paths: Images to attach; each is validated and base64-encoded.
        timeout: Timeout in seconds handed to the HTTP client.

    Returns:
        The stripped ``message.content`` of the response. When MOCK is set, a
        canned reply is returned instead and no file or network access happens.

    Raises:
        VisionError: for an unusable image, an unreachable server, a timeout,
            an HTTP error status, any other HTTP transport failure, a body
            that is not valid JSON, or an empty reply.
    """
    if MOCK:
        return _mock_response(prompt, len(image_paths))

    images_b64 = [_encode_image(p) for p in image_paths]
    payload = {
        "model": VISION_MODEL,
        "messages": [{"role": "user", "content": prompt, "images": images_b64}],
        "stream": False,
    }
    try:
        with httpx.Client(timeout=timeout) as client:
            resp = client.post(f"{OLLAMA_HOST}/api/chat", json=payload)
            resp.raise_for_status()
            data = resp.json()
    except httpx.ConnectError as exc:
        raise VisionError(
            f"Cannot reach Ollama at {OLLAMA_HOST}. Start Ollama and run: ollama pull {VISION_MODEL}"
        ) from exc
    except httpx.HTTPStatusError as exc:
        raise VisionError(f"Ollama error {exc.response.status_code}: {exc.response.text[:300]}") from exc
    except httpx.TimeoutException as exc:
        # Callers only handle VisionError, so a slow model must not leak a raw
        # httpx timeout out of the tool.
        raise VisionError(
            f"Ollama request timed out after {timeout:g}s ({VISION_MODEL} at {OLLAMA_HOST})"
        ) from exc
    except httpx.HTTPError as exc:
        # Catch-all for the remaining httpx request/transport failures.
        raise VisionError(f"Request to Ollama at {OLLAMA_HOST} failed: {exc}") from exc
    except ValueError as exc:
        # resp.json() raises a ValueError subclass when the body is not JSON.
        raise VisionError("Ollama returned a response that is not valid JSON") from exc

    # Tolerate a missing/null "message" or "content" instead of crashing on
    # attribute access; all such cases are reported as an empty response.
    message = data.get("message") if isinstance(data, dict) else None
    content = message.get("content") if isinstance(message, dict) else None
    text = content.strip() if isinstance(content, str) else ""
    if not text:
        raise VisionError("Empty response from Ollama")
    return text
def health_check() -> dict:
    """Report whether the configured vision model appears to be available.

    Returns a dict with "ok", "mode" and "model" keys. In mock mode no request
    is made and the result is always ok. Otherwise Ollama's /api/tags endpoint
    is queried (5 second timeout) and "ok" says whether the model name,
    ignoring any ":tag" part, is among the listed models. Any exception is
    reported as mode "offline" with the message under "error".
    """
    if MOCK:
        return {"ok": True, "mode": "mock", "model": VISION_MODEL}

    try:
        with httpx.Client(timeout=5.0) as client:
            resp = client.get(f"{OLLAMA_HOST}/api/tags")
            resp.raise_for_status()
            installed = set()
            for entry in resp.json().get("models", []):
                installed.add(entry.get("name", "").split(":")[0])
        wanted = VISION_MODEL.split(":")[0]
        available = wanted in installed or VISION_MODEL in installed
        return {
            "ok": available,
            "mode": "live",
            "model": VISION_MODEL,
            "ollama_host": OLLAMA_HOST,
        }
    except Exception as exc:  # noqa: BLE001 — demo helper
        return {"ok": False, "mode": "offline", "model": VISION_MODEL, "error": str(exc)}
Environment variables (see .env.example):
.env.example
# Ollama host (default local)
OLLAMA_HOST=http://127.0.0.1:11434
# Vision model — 1.6 GB, text + image, 256K context
OLLAMA_VISION_MODEL=minicpm-v4.6
# Set to 1 to run agent_demo without Ollama (offline smoke test)
OLLAMA_MOCK=0
Part 5 — The three tools
examples/server.py is a FastMCP server.
server.py
#!/usr/bin/env python3
"""MiniCPM-V MCP server — vision tools for Cursor, Claude Desktop, and Hermes.
Exposes three tools over MCP:
describe_image(path, question?) → general image understanding
ocr_document(path) → structured text extraction
compare_images(path_a, path_b, focus?) → side-by-side visual diff
Powered by MiniCPM-V 4.6 via Ollama (~1.6 GB, text + image, 256K context).
Run: python examples/server.py # stdio transport (for MCP hosts)
"""
from __future__ import annotations
import json
from pathlib import Path
from mcp.server.fastmcp import FastMCP
try:
from . import vision_backend as vb
except ImportError: # pragma: no cover
import vision_backend as vb # type: ignore
# FastMCP server instance named "minicpm-vision"; the @mcp.tool() and
# @mcp.resource() decorators in this file register on it.
mcp = FastMCP("minicpm-vision")
# Prompt used by describe_image when the caller supplies no question.
DESCRIBE_DEFAULT = (
    "Describe this image in detail. "
    "Include objects, text visible, layout, colors, "
    "and anything notable for a developer reviewing a screenshot."
)
# Prompt used by ocr_document; asks for markdown-structured output.
OCR_PROMPT = " ".join(
    (
        "Extract all readable text from this document or screenshot.",
        "Preserve structure with markdown headings and bullet lists where appropriate.",
        "If tables are present, format them as markdown tables.",
    )
)
# Base prompt used by compare_images; an optional focus hint is appended to it.
COMPARE_DEFAULT = (
    "Compare these two images. "
    "List similarities and differences. "
    "Note UI changes, text changes, and layout shifts."
)
def _resolve(path: str) -> Path:
p = Path(path).expanduser().resolve()
if not p.is_file():
raise FileNotFoundError(f"Not a file: {p}")
return p
def _tool_result(text: str, **meta) -> str:
return json.dumps({"result": text, **meta}, indent=2)
@mcp.tool()
def describe_image(path: str, question: str = "") -> str:
    """Describe or answer questions about a single image using MiniCPM-V 4.6.

    Args:
        path: Absolute or relative path to a PNG, JPG, WEBP, or GIF file.
        question: Optional specific question about the image. Leave empty for
            a general description.

    Returns JSON with the model's answer and metadata.
    """
    try:
        image = _resolve(path)
        # A blank or whitespace-only question falls back to the general prompt.
        prompt = question.strip() or DESCRIBE_DEFAULT
        reply = vb.chat_vision(prompt, [image])
    except (FileNotFoundError, vb.VisionError) as exc:
        # Failures are reported to the caller as an {"error": ...} JSON object.
        return json.dumps({"error": str(exc)}, indent=2)
    return _tool_result(
        reply,
        tool="describe_image",
        path=str(image),
        model=vb.VISION_MODEL,
    )
@mcp.tool()
def ocr_document(path: str) -> str:
    """OCR a document, receipt, whiteboard photo, or screenshot to markdown text.

    Args:
        path: Absolute or relative path to the image file.

    Returns JSON with extracted text in markdown format.
    """
    try:
        image = _resolve(path)
        extracted = vb.chat_vision(OCR_PROMPT, [image])
    except (FileNotFoundError, vb.VisionError) as exc:
        # Failures are reported to the caller as an {"error": ...} JSON object.
        return json.dumps({"error": str(exc)}, indent=2)
    return _tool_result(
        extracted,
        tool="ocr_document",
        path=str(image),
        model=vb.VISION_MODEL,
    )
@mcp.tool()
def compare_images(path_a: str, path_b: str, focus: str = "") -> str:
    """Compare two images and report visual differences.

    Args:
        path_a: Path to the first image (e.g. before screenshot).
        path_b: Path to the second image (e.g. after screenshot).
        focus: Optional aspect to focus on (e.g. "navigation bar", "error message").

    Returns JSON with a structured comparison.
    """
    try:
        first = _resolve(path_a)
        second = _resolve(path_b)
        # Append the focus hint only when it contains non-whitespace text.
        wanted = focus.strip()
        if wanted:
            prompt = f"{COMPARE_DEFAULT}\n\nFocus especially on: {wanted}"
        else:
            prompt = COMPARE_DEFAULT
        verdict = vb.chat_vision(prompt, [first, second])
    except (FileNotFoundError, vb.VisionError) as exc:
        # Failures are reported to the caller as an {"error": ...} JSON object.
        return json.dumps({"error": str(exc)}, indent=2)
    return _tool_result(
        verdict,
        tool="compare_images",
        path_a=str(first),
        path_b=str(second),
        model=vb.VISION_MODEL,
    )
@mcp.resource("minicpm-vision://model")
def model_info() -> str:
    """Capability hint: which vision model and host this server uses."""
    # Probe the backend first, then describe the server alongside that status.
    health = vb.health_check()
    info = {
        "model": vb.VISION_MODEL,
        "ollama_host": vb.OLLAMA_HOST,
        "tools": ["describe_image", "ocr_document", "compare_images"],
    }
    info["status"] = health
    return json.dumps(info, indent=2)
# Start the MCP server over stdio only when this file is run as a script.
# The guard string must be exactly "__main__"; with a leading space the
# comparison is never true and the server would never start.
if __name__ == "__main__":
    mcp.run(transport="stdio")
describe_image
General-purpose image Q&A. Pass a custom question for targeted queries.
Sample input — architecture diagram the demo describes:
ocr_document
Structured OCR prompt — markdown headings, bullet lists, tables. Ideal for receipts, invoices, and whiteboard photos.
Sample input — coffee shop receipt:
compare_images
Two paths + optional focus (e.g. "navigation bar"). Returns similarities, differences, and UI change notes.
Sample inputs — before and after pipeline:
Each tool returns JSON with result, tool, paths, and model.
Part 6 — Agent demo (terminal walkthrough)
examples/agent_demo.py runs all three scenarios:
python examples/generate_fixtures.py
python examples/agent_demo.py
agent_demo.py
#!/usr/bin/env python3
"""End-to-end demo — MiniCPM-V MCP vision tools (works offline with OLLAMA_MOCK=1).
Simulates what Cursor / Claude Desktop sees when the agent calls vision tools.
"""
from __future__ import annotations
import json
import os
import sys
from pathlib import Path
# Directory that contains this script.
ROOT = Path( __file__ ).resolve().parent
# Put that directory first on sys.path before the imports below run, so that
# vision_backend and server can be found there (hence the E402 suppressions).
sys.path.insert(0, str(ROOT))
import vision_backend as vb # noqa: E402
from server import compare_images, describe_image, ocr_document # noqa: E402
# Directory holding the sample images the demo scenarios read.
FIXTURES = ROOT / "fixtures"
def _banner(title: str) -> None:
print(f"\n{'=' * 60}\n{title}\n{'=' * 60}")
def _parse(result: str) -> dict:
return json.loads(result)
def main() -> None:
    """Run the three demo scenarios against the vision tools and print the results.

    Creates the sample images first when the fixtures directory is missing or
    holds no PNG files, prints the model name and health-check mode, and warns
    when the health check fails outside mock mode (the scenarios still run).
    It then calls describe_image, ocr_document and compare_images in turn and
    prints each reply's "result", falling back to "error" and finally to the
    whole reply when neither key is present.
    """

    def _show(heading: str, reply: dict) -> None:
        # Print a section heading followed by the tool's answer or error.
        print(heading)
        print(reply.get("result", reply.get("error", reply)))

    # NOTE(review): vision_backend reads this variable at import time, so this
    # default probably does not change vb.VISION_MODEL — confirm intent.
    os.environ.setdefault("OLLAMA_VISION_MODEL", "minicpm-v4.6")

    have_fixtures = FIXTURES.exists() and list(FIXTURES.glob("*.png"))
    if not have_fixtures:
        from generate_fixtures import main as gen  # noqa: E402

        gen()

    status = vb.health_check()
    _banner("MiniCPM-V 4.6 Vision MCP — Agent Demo")
    print(f"Model: {vb.VISION_MODEL} · Mode: {status.get('mode', '?')}")
    if not (status.get("ok") or vb.MOCK):
        print("\n⚠️ Ollama offline — re-run with OLLAMA_MOCK=1 or start Ollama.\n")

    # Scenario 1 — describe screenshot
    _banner("Scenario 1 — describe_image")
    print('[Tool: describe_image] path=fixtures/diagram_v2.png')
    described = describe_image(str(FIXTURES / "diagram_v2.png"), "What services are shown?")
    _show("\n## Architecture summary\n", _parse(described))

    # Scenario 2 — OCR receipt
    _banner("Scenario 2 — ocr_document")
    print('[Tool: ocr_document] path=fixtures/sample_receipt.png')
    extracted = ocr_document(str(FIXTURES / "sample_receipt.png"))
    _show("\n## Receipt OCR\n", _parse(extracted))

    # Scenario 3 — compare before/after
    _banner("Scenario 3 — compare_images")
    print("[Tool: compare_images] v1 → v2 pipeline diagrams")
    before = str(FIXTURES / "diagram_v1.png")
    after = str(FIXTURES / "diagram_v2.png")
    compared = compare_images(before, after, focus="new components and labels")
    _show("\n## Visual diff\n", _parse(compared))

    _banner("Done — wire examples/server.py into Cursor MCP settings")
    print("See examples/cursor_mcp.json.example")
# Run the demo only when this file is executed as a script, not when imported.
# The guard string must be exactly "__main__"; with a leading space the
# comparison is never true and the script exits without doing anything.
if __name__ == "__main__":
    main()
The terminal shows the same flow your MCP host runs:
[Tool: describe_image] path=fixtures/diagram_v2.png
[Tool: ocr_document] path=fixtures/sample_receipt.png
[Tool: compare_images] v1 → v2 pipeline diagrams
Offline smoke test (no Ollama):
OLLAMA_MOCK=1 python examples/agent_demo.py
Part 7 — Wire into Cursor
Copy examples/cursor_mcp.json.example into Cursor → Settings → MCP. Use absolute paths for cwd.
Restart Cursor — you should see describe_image, ocr_document, compare_images.
Try: "Use ocr_document on /path/to/receipt.png and summarize the total."
{"mcpServers":{"minicpm-vision":{"command":"python","args":["examples/server.py"],"cwd":"/absolute/path/to/guides/minicpm-v-mcp-server","env":{"OLLAMA_VISION_MODEL":"minicpm-v4.6","OLLAMA_HOST":"http://127.0.0.1:11434"}}}}
Part 8 — Wire into Claude Desktop
Add the server block from examples/claude_desktop_config.json.example to ~/Library/Application Support/Claude/claude_desktop_config.json on macOS.
Restart Claude Desktop. Vision tools appear alongside your other MCP servers.
{"mcpServers":{"minicpm-vision":{"command":"python","args":["/absolute/path/to/guides/minicpm-v-mcp-server/examples/server.py"],"env":{"OLLAMA_VISION_MODEL":"minicpm-v4.6","OLLAMA_HOST":"http://127.0.0.1:11434"}}}}
Conclusion
Give your agent eyes — without giving away your pixels.
Most coding agents are brilliant at text and terrible at images. The usual fix is a cloud vision API: API keys in config, latency on every screenshot, and your receipts, UI mocks, and whiteboard photos leaving your machine.
MiniCPM-V 4.6 flips that. At 1.3B parameters and ~1.6 GB on Ollama, it runs comfortably on a 16 GB Mac and handles text + image input with a 256K context window. Wrap it in a small MCP server, and you get three reusable tools — describe_image, ocr_document, and compare_images — that Cursor, Claude Desktop, and any other MCP host can discover at connect time.
Thank you so much for reading!
Like | Follow | Subscribe to the newsletter.
Catch us on
Website: https://www.techlatest.net/
Newsletter: https://substack.com/@parvezmohammed
Twitter: https://twitter.com/TechlatestNet
LinkedIn: https://www.linkedin.com/in/techlatest-net/
YouTube: https://www.youtube.com/@techlatest_net/
Blogs: https://medium.com/@techlatest.net
Reddit Community: https://www.reddit.com/user/techlatest_net/