MiniCPM-V MCP Server — Give Your Agent Eyes

DEV Community

\n {title}\n{'=' * 60}") def _parse(result: str) -> dict: return json.loads(result) def main() -> None: os.environ.setdefault("OLLAMA_VISION_MODEL", "minicpm-v4.6") if not FIXTURES.exists() or not list(FIXTURES.glob("*.png")): from generate_fixtures import main as gen # noqa: E402 gen() status = vb.health_check() _banner("MiniCPM-V 4.6 Vision MCP — Agent Demo") print(f"Model: {vb.VISION_MODEL} · Mode: {status.get('mode', '?')}") if not status.get("ok") and not vb.MOCK: print("\n⚠️ Ollama offline — re-run with OLLAMA_MOCK=1 or start Ollama.\n") # Scenario 1 — describe screenshot _banner("Scenario 1 — describe_image") print('[Tool: describe_image] path=fixtures/diagram_v2.png') r1 = _parse(describe_image(str(FIXTURES / "diagram_v2.png"), "What services are shown?")) print("\n## Architecture summary\n") print(r1.get("result", r1.get("error", r1))) # Scenario 2 — OCR receipt _banner("Scenario 2 — ocr_document") print('[Tool: ocr_document] path=fixtures/sample_receipt.png') r2 = _parse(ocr_document(str(FIXTURES / "sample_receipt.png"))) print("\n## Receipt OCR\n") print(r2.get("result", r2.get("error", r2))) # Scenario 3 — compare before/after _banner("Scenario 3 — compare_images") print("[Tool: compare_images] v1 → v2 pipeline diagrams") r3 = _parse( compare_images( str(FIXTURES / "diagram_v1.png"), str(FIXTURES / "diagram_v2.png"), focus="new components and labels", ) ) print("\n## Visual diff\n") print(r3.get("result", r3.get("error", r3))) _banner("Done — wire examples/server.py into Cursor MCP settings") print("See examples/cursor_mcp.json.example") if __name__ == " __main__": main()

Part 4 — The vision backend

examples/vision_backend.py encodes images as base64 and POSTs to OLLAMA_HOST/api/chat:

payload = {
 "model": VISION_MODEL, # minicpm-v4.6
 "messages": [{"role": "user", "content": prompt, "images": images_b64}],
 "stream": False,
}
vision_backend.py
"""Ollama vision backend for MiniCPM-V 4.6 — shared by MCP server and demos."""
from __future__ import annotations
import base64
import os
from pathlib import Path
import httpx
# Base URL of the Ollama server; a trailing slash is removed so endpoint
# paths can be appended directly.
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://127.0.0.1:11434").rstrip("/")
# Model name sent in the "model" field of every chat request.
VISION_MODEL = os.getenv("OLLAMA_VISION_MODEL", "minicpm-v4.6")
# Mock mode is on only when OLLAMA_MOCK is exactly "1".
MOCK = os.getenv("OLLAMA_MOCK", "0") == "1"
# File extensions (compared lower-cased) that _encode_image accepts.
SUPPORTED_SUFFIXES = {".bmp", ".gif", ".jpeg", ".jpg", ".png", ".webp"}
class VisionError(Exception):
    """Raised when an image is rejected or the Ollama request fails."""
def _encode_image(path: Path) -> str:
    """Return the base64 (ASCII) encoding of the image file at *path*.

    Args:
        path: Location of the image on disk.

    Returns:
        The file's bytes, base64-encoded and decoded to an ASCII string.

    Raises:
        VisionError: if *path* is not an existing file, its extension is not
            in SUPPORTED_SUFFIXES, or the file cannot be read.
    """
    if not path.is_file():
        raise VisionError(f"Image not found: {path}")
    if path.suffix.lower() not in SUPPORTED_SUFFIXES:
        raise VisionError(f"Unsupported image type: {path.suffix}")
    try:
        raw = path.read_bytes()
    except OSError as exc:
        # e.g. permission denied, or the file vanished after the is_file()
        # check; report it with the error type callers already handle.
        raise VisionError(f"Cannot read image {path}: {exc}") from exc
    return base64.b64encode(raw).decode("ascii")
def _mock_response(prompt: str, image_count: int) -> str:
    """Build the canned reply used in place of real inference in mock mode.

    The reply names the configured model, the number of images, and the first
    120 characters of *prompt* (always followed by "...").
    """
    preview = prompt[:120]
    lines = [
        f"[mock {VISION_MODEL}] Processed {image_count} image(s).",
        f"Prompt preview: {preview}...",
        "Set OLLAMA_MOCK=0 and run `ollama pull minicpm-v4.6` for live inference.",
    ]
    return "\n".join(lines)
def chat_vision(prompt: str, image_paths: list[Path], *, timeout: float = 120.0) -> str:
    """Send a vision chat request to Ollama and return the model's reply text.

    Args:
        prompt: Text sent as the content of the single user message.
        image_paths: Image files to attach; each one is base64-encoded.
        timeout: Timeout in seconds handed to the HTTP client.

    Returns:
        The stripped ``message.content`` text from Ollama's response. When
        MOCK is enabled, a canned reply is returned instead and neither the
        image files nor the network are touched.

    Raises:
        VisionError: if an image is rejected, Ollama cannot be reached, the
            request times out or otherwise fails, the response body is not
            the expected JSON, or the reply is empty.
    """
    if MOCK:
        return _mock_response(prompt, len(image_paths))
    images_b64 = [_encode_image(p) for p in image_paths]
    payload = {
        "model": VISION_MODEL,
        "messages": [{"role": "user", "content": prompt, "images": images_b64}],
        "stream": False,
    }
    try:
        with httpx.Client(timeout=timeout) as client:
            resp = client.post(f"{OLLAMA_HOST}/api/chat", json=payload)
            resp.raise_for_status()
            data = resp.json()
    except httpx.ConnectError as exc:
        raise VisionError(
            f"Cannot reach Ollama at {OLLAMA_HOST}. Start Ollama and run: ollama pull {VISION_MODEL}"
        ) from exc
    except httpx.TimeoutException as exc:
        # Vision inference can be slow; a timeout must not escape as a raw
        # httpx error because callers only handle VisionError.
        raise VisionError(
            f"Ollama at {OLLAMA_HOST} did not respond within {timeout}s"
        ) from exc
    except httpx.HTTPStatusError as exc:
        raise VisionError(f"Ollama error {exc.response.status_code}: {exc.response.text[:300]}") from exc
    except httpx.HTTPError as exc:
        # Any remaining transport/protocol failure (read error, bad URL, ...).
        raise VisionError(f"Ollama request failed: {exc}") from exc
    except ValueError as exc:
        # resp.json() raises a ValueError subclass when the body is not JSON.
        raise VisionError("Ollama returned a non-JSON response") from exc
    # Be defensive about the response shape: "message" or "content" may be
    # missing or null rather than a dict / string.
    message = data.get("message") if isinstance(data, dict) else None
    content = message.get("content") if isinstance(message, dict) else None
    text = content.strip() if isinstance(content, str) else ""
    if not text:
        raise VisionError("Empty response from Ollama")
    return text
def health_check() -> dict:
    """Return model + connectivity status for demos.

    Never raises: any failure while querying Ollama is reported in the
    returned dict instead.

    Returns:
        A dict with ``ok``, ``mode`` ("mock", "live" or "offline") and
        ``model``. Live results also carry ``ollama_host``; offline results
        carry ``error`` with the exception text.
    """
    if MOCK:
        return {"ok": True, "mode": "mock", "model": VISION_MODEL}
    try:
        with httpx.Client(timeout=5.0) as client:
            resp = client.get(f"{OLLAMA_HOST}/api/tags")
            resp.raise_for_status()
            models = resp.json().get("models") or []
        names = {m.get("name", "") for m in models}
        if ":" in VISION_MODEL:
            # An explicit tag was configured, so only that exact entry counts;
            # matching on the base name alone would report a missing tag as OK.
            available = VISION_MODEL in names
        else:
            # No tag configured: accept any installed tag of the model.
            available = VISION_MODEL in {name.split(":")[0] for name in names}
        return {
            "ok": available,
            "mode": "live",
            "model": VISION_MODEL,
            "ollama_host": OLLAMA_HOST,
        }
    except Exception as exc:  # noqa: BLE001 — demo helper
        return {"ok": False, "mode": "offline", "model": VISION_MODEL, "error": str(exc)}

Environment variables (see .env.example):

.env.example
# Ollama host (default local)
OLLAMA_HOST=http://127.0.0.1:11434
# Vision model — 1.6 GB, text + image, 256K context
OLLAMA_VISION_MODEL=minicpm-v4.6
# Set to 1 to run agent_demo without Ollama (offline smoke test)
OLLAMA_MOCK=0

Part 5 — The three tools

examples/server.py is a FastMCP server.


server.py
#!/usr/bin/env python3
"""MiniCPM-V MCP server — vision tools for Cursor, Claude Desktop, and Hermes.
Exposes three tools over MCP:
 describe_image(path, question?) → general image understanding
 ocr_document(path) → structured text extraction
 compare_images(path_a, path_b, focus?) → side-by-side visual diff
Powered by MiniCPM-V 4.6 via Ollama (~1.6 GB, text + image, 256K context).
Run: python examples/server.py # stdio transport (for MCP hosts)
"""
from __future__ import annotations
import json
from pathlib import Path
from mcp.server.fastmcp import FastMCP
try:
 from . import vision_backend as vb
except ImportError: # pragma: no cover
 import vision_backend as vb # type: ignore

# FastMCP server instance named "minicpm-vision"; the @mcp.tool() and
# @mcp.resource() decorators in this file register their functions on it.
mcp = FastMCP("minicpm-vision")
# Prompt used by describe_image when the caller gives no question.
DESCRIBE_DEFAULT = (
    "Describe this image in detail. "
    "Include objects, text visible, layout, colors, "
    "and anything notable for a developer reviewing a screenshot."
)
# Fixed prompt used by ocr_document.
OCR_PROMPT = " ".join(
    (
        "Extract all readable text from this document or screenshot.",
        "Preserve structure with markdown headings and bullet lists where appropriate.",
        "If tables are present, format them as markdown tables.",
    )
)
# Base prompt used by compare_images; an optional focus hint is appended to it.
COMPARE_DEFAULT = (
    "Compare these two images. "
    "List similarities and differences. "
    "Note UI changes, text changes, and layout shifts."
)
def _resolve(path: str) -> Path:
    """Expand ``~`` in *path* and resolve it to the absolute path of a file.

    Args:
        path: Absolute or relative path as supplied by the tool caller.

    Returns:
        The resolved absolute path.

    Raises:
        FileNotFoundError: if the path cannot be resolved or does not point
            to an existing regular file.
    """
    try:
        p = Path(path).expanduser().resolve()
    except RuntimeError as exc:
        # pathlib reports an unknown home directory (and, on older Pythons, a
        # symlink loop) as RuntimeError; surface it as the error type the
        # tools already turn into a JSON error.
        raise FileNotFoundError(f"Not a file: {path} ({exc})") from exc
    if not p.is_file():
        raise FileNotFoundError(f"Not a file: {p}")
    return p
def _tool_result(text: str, **meta) -> str:
    """Serialize a successful tool answer as 2-space-indented JSON.

    The payload starts with ``"result": text``; keyword metadata is merged in
    afterwards, in the order given.
    """
    payload = {"result": text}
    payload.update(meta)
    return json.dumps(payload, indent=2)
@mcp.tool()
def describe_image(path: str, question: str = "") -> str:
    """Describe or answer questions about a single image using MiniCPM-V 4.6.

    Args:
        path: Absolute or relative path to a PNG, JPG/JPEG, WEBP, GIF, or BMP file.
        question: Optional specific question about the image. Leave empty for
            a general description.

    Returns JSON with the model's answer and metadata, or JSON with a single
    "error" key if the image cannot be read or the model call fails.
    """
    try:
        img = _resolve(path)
        prompt = question.strip() or DESCRIBE_DEFAULT
        answer = vb.chat_vision(prompt, [img])
    except (OSError, vb.VisionError) as exc:
        # OSError covers FileNotFoundError from _resolve as well as other
        # filesystem failures (e.g. PermissionError), so the tool always
        # answers with JSON instead of raising into the MCP host.
        return json.dumps({"error": str(exc)}, indent=2)
    return _tool_result(answer, tool="describe_image", path=str(img), model=vb.VISION_MODEL)
@mcp.tool()
def ocr_document(path: str) -> str:
    """OCR a document, receipt, whiteboard photo, or screenshot to markdown text.

    Args:
        path: Absolute or relative path to the image file.

    Returns JSON with extracted text in markdown format, or JSON with a single
    "error" key if the image cannot be read or the model call fails.
    """
    try:
        img = _resolve(path)
        answer = vb.chat_vision(OCR_PROMPT, [img])
    except (OSError, vb.VisionError) as exc:
        # OSError covers FileNotFoundError from _resolve as well as other
        # filesystem failures (e.g. PermissionError), so the tool always
        # answers with JSON instead of raising into the MCP host.
        return json.dumps({"error": str(exc)}, indent=2)
    return _tool_result(answer, tool="ocr_document", path=str(img), model=vb.VISION_MODEL)
@mcp.tool()
def compare_images(path_a: str, path_b: str, focus: str = "") -> str:
    """Compare two images and report visual differences.

    Args:
        path_a: Path to the first image (e.g. before screenshot).
        path_b: Path to the second image (e.g. after screenshot).
        focus: Optional aspect to focus on (e.g. "navigation bar", "error message").

    Returns JSON with a structured comparison, or JSON with a single "error"
    key if either image cannot be read or the model call fails.
    """
    try:
        a, b = _resolve(path_a), _resolve(path_b)
        prompt = COMPARE_DEFAULT
        hint = focus.strip()
        if hint:
            prompt += f"\n\nFocus especially on: {hint}"
        answer = vb.chat_vision(prompt, [a, b])
    except (OSError, vb.VisionError) as exc:
        # OSError covers FileNotFoundError from _resolve as well as other
        # filesystem failures (e.g. PermissionError), so the tool always
        # answers with JSON instead of raising into the MCP host.
        return json.dumps({"error": str(exc)}, indent=2)
    return _tool_result(
        answer,
        tool="compare_images",
        path_a=str(a),
        path_b=str(b),
        model=vb.VISION_MODEL,
    )
@mcp.resource("minicpm-vision://model")
def model_info() -> str:
    """Capability hint: which vision model and host this server uses."""
    # Query the backend first, then assemble and serialize the payload.
    health = vb.health_check()
    info = {
        "model": vb.VISION_MODEL,
        "ollama_host": vb.OLLAMA_HOST,
        "tools": ["describe_image", "ocr_document", "compare_images"],
        "status": health,
    }
    return json.dumps(info, indent=2)
# Entry point. The guard must compare against exactly "__main__" (no leading
# space), otherwise running the script does nothing.
if __name__ == "__main__":
    # stdio transport, as used when an MCP host launches this script.
    mcp.run(transport="stdio")

describe_image

General-purpose image Q&A. Pass a custom question for targeted queries.

Sample input — architecture diagram the demo describes:

ocr_document

Structured OCR prompt — markdown headings, bullet lists, tables. Ideal for receipts, invoices, and whiteboard photos.

Sample input — coffee shop receipt:

compare_images

Two paths + optional focus (e.g. "navigation bar"). Returns similarities, differences, and UI change notes.

Sample inputs — before and after pipeline:

On success, each tool returns JSON with result, tool, the input path(s), and model; on failure it returns JSON with a single error field.

Part 6 — Agent demo (terminal walkthrough)

examples/agent_demo.py runs all three scenarios:

python examples/generate_fixtures.py
python examples/agent_demo.py
agent_demo.py
#!/usr/bin/env python3
"""End-to-end demo — MiniCPM-V MCP vision tools (works offline with OLLAMA_MOCK=1).
Simulates what Cursor / Claude Desktop sees when the agent calls vision tools.
"""
from __future__ import annotations
import json
import os
import sys
from pathlib import Path
# Directory that contains this script.
ROOT = Path( __file__ ).resolve().parent
# Put that directory first on sys.path BEFORE the two imports below, so the
# sibling modules are importable regardless of the current working directory.
# Do not reorder these statements.
sys.path.insert(0, str(ROOT))
import vision_backend as vb # noqa: E402
from server import compare_images, describe_image, ocr_document # noqa: E402

# Directory holding the PNG fixtures the demo scenarios read.
FIXTURES = ROOT / "fixtures"
def _banner(title: str) -> None:
    """Print a blank line, then *title* framed by two 60-character ``=`` rules."""
    rule = "=" * 60
    print("", rule, title, rule, sep="\n")
def _parse(result: str) -> dict:
    """Decode the JSON text returned by a tool call."""
    decoded = json.loads(result)
    return decoded
def main() -> None:
    """Run the three demo scenarios and print each tool's output.

    Generates the fixture images first when none are present, prints the
    backend status, then calls describe_image, ocr_document and
    compare_images directly and prints the "result" (or "error") of each.
    """
    # vision_backend read this variable when it was imported at module load,
    # so this only sets a default for anything reading the environment later.
    os.environ.setdefault("OLLAMA_VISION_MODEL", "minicpm-v4.6")
    if not FIXTURES.exists() or not any(FIXTURES.glob("*.png")):
        from generate_fixtures import main as gen  # noqa: E402

        gen()
    status = vb.health_check()
    _banner("MiniCPM-V 4.6 Vision MCP — Agent Demo")
    print(f"Model: {vb.VISION_MODEL} · Mode: {status.get('mode', '?')}")
    if not status.get("ok") and not vb.MOCK:
        if status.get("mode") == "live":
            # Ollama answered the health check, so it is not offline; the
            # configured model is simply not in its model list.
            print(
                f"\n⚠️ Ollama is running but {vb.VISION_MODEL} is not installed — "
                f"run: ollama pull {vb.VISION_MODEL}\n"
            )
        else:
            print("\n⚠️ Ollama offline — re-run with OLLAMA_MOCK=1 or start Ollama.\n")
    # Scenario 1 — describe screenshot
    _banner("Scenario 1 — describe_image")
    print('[Tool: describe_image] path=fixtures/diagram_v2.png')
    _report(
        "## Architecture summary",
        describe_image(str(FIXTURES / "diagram_v2.png"), "What services are shown?"),
    )
    # Scenario 2 — OCR receipt
    _banner("Scenario 2 — ocr_document")
    print('[Tool: ocr_document] path=fixtures/sample_receipt.png')
    _report("## Receipt OCR", ocr_document(str(FIXTURES / "sample_receipt.png")))
    # Scenario 3 — compare before/after
    _banner("Scenario 3 — compare_images")
    print("[Tool: compare_images] v1 → v2 pipeline diagrams")
    _report(
        "## Visual diff",
        compare_images(
            str(FIXTURES / "diagram_v1.png"),
            str(FIXTURES / "diagram_v2.png"),
            focus="new components and labels",
        ),
    )
    _banner("Done — wire examples/server.py into Cursor MCP settings")
    print("See examples/cursor_mcp.json.example")


def _report(heading: str, raw: str) -> None:
    """Print *heading*, then the "result" (or "error") field of a tool's JSON reply."""
    payload = _parse(raw)
    print(f"\n{heading}\n")
    # Fall back to the error text, then to the whole payload, so that
    # something is always shown.
    print(payload.get("result", payload.get("error", payload)))
# Entry point. The guard must compare against exactly "__main__" (no leading
# space), otherwise running the script prints nothing.
if __name__ == "__main__":
    main()

The terminal shows the same flow your MCP host runs:

[Tool: describe_image] path=fixtures/diagram_v2.png
[Tool: ocr_document] path=fixtures/sample_receipt.png
[Tool: compare_images] v1 → v2 pipeline diagrams

Offline smoke test (no Ollama):

OLLAMA_MOCK=1 python examples/agent_demo.py

Part 7 — Wire into Cursor

Copy examples/cursor_mcp.json.example into Cursor → Settings → MCP. Use absolute paths for cwd.

Restart Cursor — you should see describe_image, ocr_document, compare_images.

Try: "Use ocr_document on /path/to/receipt.png and summarize the total."

{"mcpServers":{"minicpm-vision":{"command":"python","args":["examples/server.py"],"cwd":"/absolute/path/to/guides/minicpm-v-mcp-server","env":{"OLLAMA_VISION_MODEL":"minicpm-v4.6","OLLAMA_HOST":"http://127.0.0.1:11434"}}}}

Part 8 — Wire into Claude Desktop

Add the server block from examples/claude_desktop_config.json.example to ~/Library/Application Support/Claude/claude_desktop_config.json on macOS.

Restart Claude Desktop. Vision tools appear alongside your other MCP servers.

{"mcpServers":{"minicpm-vision":{"command":"python","args":["/absolute/path/to/guides/minicpm-v-mcp-server/examples/server.py"],"env":{"OLLAMA_VISION_MODEL":"minicpm-v4.6","OLLAMA_HOST":"http://127.0.0.1:11434"}}}}

Conclusion

Give your agent eyes — without giving away your pixels.

Most coding agents are brilliant at text and terrible at images. The usual fix is a cloud vision API: API keys in config, latency on every screenshot, and your receipts, UI mocks, and whiteboard photos leaving your machine.

MiniCPM-V 4.6 flips that. At 1.3B parameters and ~1.6 GB on Ollama, it runs comfortably on a 16 GB Mac and handles text + image input with a 256K context window. Wrap it in a small MCP server, and you get three reusable tools — describe_image, ocr_document, and compare_images — that Cursor, Claude Desktop, and any other MCP host can discover at connect time.

Thank you so much for reading

Like | Follow | Subscribe to the newsletter.

Catch us on

Website: https://www.techlatest.net/

Newsletter: https://substack.com/@parvezmohammed

Twitter: https://twitter.com/TechlatestNet

LinkedIn: https://www.linkedin.com/in/techlatest-net/

YouTube: https://www.youtube.com/@techlatest_net/

Blogs: https://medium.com/@techlatest.net

Reddit Community: https://www.reddit.com/user/techlatest_net/