`).join('');
+ const providerWarnings = (d.providerWarnings || []).map(w => `
+
${fmt.ago(r.timestamp)}
@@ -279,7 +291,7 @@
${card(`
Configured Providers
-
${providerCards}
+
${providerCards}${providerWarnings}
`)}
diff --git a/scripts/check-native.js b/scripts/check-native.js
new file mode 100644
index 0000000..eba34ca
--- /dev/null
+++ b/scripts/check-native.js
@@ -0,0 +1,97 @@
+#!/usr/bin/env node
+/**
+ * Native module ABI guard (postinstall).
+ *
+ * better-sqlite3 (and the other native optionalDependencies) are compiled
+ * against a specific Node ABI. When Node is upgraded, the prebuilt/compiled
+ * binary stops loading with:
+ *
+ * "was compiled against a different Node.js version using
+ * NODE_MODULE_VERSION 115. This version of Node.js requires
+ * NODE_MODULE_VERSION 141."
+ *
+ * The failure is silent at runtime — telemetry, request logs, and the memory
+ * store all sit behind try/catch and simply go empty. This probe detects the
+ * mismatch and rebuilds the native modules so it self-heals on `npm install`.
+ *
+ * It is intentionally best-effort: it NEVER exits non-zero, so it can't break
+ * `npm install` on machines without a build toolchain (the modules are
+ * optional and the app degrades gracefully without them).
+ */
+
+const { execSync } = require("child_process");
+
+// Native optionalDependencies that are ABI-sensitive. If Node changed, all of
+// them are stale, so we rebuild the set in one pass.
+// Every name listed here is passed to a single `npm rebuild` invocation in
+// main(); better-sqlite3 is also the canary module that probe() instantiates.
+const NATIVE_DEPS = [
+  "better-sqlite3",
+  "hnswlib-node",
+  "tree-sitter",
+  "tree-sitter-javascript",
+  "tree-sitter-python",
+  "tree-sitter-typescript",
+];
+
+/** Print one status line to stdout, tagged with the script name. */
+function log(msg) {
+  const tagged = `[check-native] ${msg}`;
+  console.log(tagged);
+}
+
+/**
+ * Canary check against better-sqlite3.
+ *
+ * Loading the JS wrapper is not sufficient: the native addon is only loaded
+ * once a Database is constructed, so an in-memory database is opened and
+ * closed to force that load.
+ *
+ * @returns {"ok"|"absent"|"mismatch"} "absent" when the package cannot be
+ *   resolved, "mismatch" when loading fails in a way a rebuild may fix,
+ *   "ok" otherwise (including instantiation errors unrelated to the ABI).
+ */
+function probe() {
+  // Stage 1: resolve the package itself.
+  let Ctor;
+  try {
+    Ctor = require("better-sqlite3");
+  } catch (loadErr) {
+    return loadErr && loadErr.code === "MODULE_NOT_FOUND" ? "absent" : "mismatch";
+  }
+
+  // Stage 2: force the native addon to load.
+  try {
+    const handle = new Ctor(":memory:");
+    handle.close();
+  } catch (openErr) {
+    const looksLikeAbiFailure =
+      /NODE_MODULE_VERSION|different Node\.js version|invalid ELF|dlopen|\.node/i.test(openErr.message || "");
+    // Anything else is some other instantiation error — not an ABI issue a
+    // rebuild can fix, so it is reported as "ok".
+    return looksLikeAbiFailure ? "mismatch" : "ok";
+  }
+  return "ok";
+}
+
+/**
+ * Probe the canary module and, only when it reports a mismatch, rebuild every
+ * module in NATIVE_DEPS and report whether the rebuild made it loadable.
+ */
+function main() {
+  const initial = probe();
+
+  // "absent": optional dependency not installed (e.g. build skipped).
+  // "ok": addon loads fine. Either way there is nothing to do.
+  if (initial === "absent" || initial === "ok") {
+    return;
+  }
+
+  log("native module ABI mismatch detected (Node was likely upgraded). Rebuilding native modules...");
+  try {
+    const rebuildCommand = `npm rebuild ${NATIVE_DEPS.join(" ")}`;
+    execSync(rebuildCommand, { stdio: "inherit" });
+  } catch {
+    log("rebuild did not complete (a build toolchain may be missing). Continuing — native features will be disabled until you run: npm rebuild better-sqlite3");
+    return;
+  }
+
+  // Re-probe to report the outcome.
+  const healed = probe() === "ok";
+  if (healed) {
+    log("native modules rebuilt successfully.");
+    return;
+  }
+  log("native modules still not loadable after rebuild. Run `npm rebuild better-sqlite3` manually.");
+}
+
+// Script entry point. Never fail the install: whatever main() throws is
+// reported and swallowed so the process still exits with code 0.
+try {
+  main();
+} catch (err) {
+  // `err` is not guaranteed to be an Error (anything can be thrown); reading
+  // `.message` off null/undefined would itself throw from inside this handler
+  // and turn a best-effort probe into a failed `npm install`.
+  const reason = err && err.message ? err.message : String(err);
+  log(`skipped (${reason})`);
+}
diff --git a/src/clients/databricks.js b/src/clients/databricks.js
index ef9e244..5d31c79 100644
--- a/src/clients/databricks.js
+++ b/src/clients/databricks.js
@@ -1506,10 +1506,16 @@ async function invokeMoonshot(body) {
"claude-haiku-4-5-20251001": "kimi-k2-turbo-preview",
"claude-haiku-4-5": "kimi-k2-turbo-preview",
"claude-3-haiku": "kimi-k2-turbo-preview",
+ // moonshot-v1-auto 400s with "tokenization failed" (its server-side auto
+ // context-size pass fails on large tool-bearing payloads). Remap to a
+ // fixed model that's broadly available on api.moonshot.ai.
+ "moonshot-v1-auto": "moonshot-v1-128k",
};
const requestedModel = body._tierModel || body.model || config.moonshot.model;
- const mappedModel = modelMap[requestedModel] || config.moonshot.model || "kimi-k2-turbo-preview";
+ let mappedModel = modelMap[requestedModel] || config.moonshot.model || "kimi-k2-turbo-preview";
+ // Guard against the deprecated auto model arriving via config too.
+ if (mappedModel === "moonshot-v1-auto") mappedModel = "moonshot-v1-128k";
// Convert messages using existing utility
const messages = convertAnthropicMessagesToOpenRouter(body.messages || []);
@@ -1522,12 +1528,18 @@ async function invokeMoonshot(body) {
messages.unshift({ role: "system", content: systemContent });
}
+ // kimi-k2.x (k2.5 / k2.6 ...) are thinking models that only accept
+ // temperature: 1 — any other value 400s with "invalid temperature".
+ const isKimiThinking = /^kimi-k2/i.test(mappedModel);
+
const moonshotBody = {
model: mappedModel,
messages,
max_tokens: body.max_tokens || 16384,
- temperature: body.temperature ?? 0.7,
- top_p: body.top_p ?? 1.0,
+ // kimi-k2.x thinking models pin sampling params: temperature must be 1
+ // and top_p must be 0.95 — any other value 400s.
+ temperature: isKimiThinking ? 1 : (body.temperature ?? 0.7),
+ top_p: isKimiThinking ? 0.95 : (body.top_p ?? 1.0),
stream: false, // Force non-streaming - OpenAI SSE to Anthropic SSE conversion not implemented
};
@@ -2027,6 +2039,65 @@ async function invokeCodex(body) {
};
}
+// Models already warned about, so the "no pricing" warning is logged once per model.
+const _unknownCostWarned = new Set();
+
+/**
+ * Compute request cost in USD from model pricing × token usage.
+ * Registry returns per-1M-token prices ({ input, output }); returns null when
+ * pricing is unknown so we don't record misleading zeros.
+ *
+ * @param {string} model - Model id looked up in the registry.
+ * @param {number} inputTokens - Prompt tokens (falsy counts as 0).
+ * @param {number} outputTokens - Completion tokens (falsy counts as 0).
+ * @returns {number|null} Cost rounded to 6 decimals, or null when the
+ *   registry, the price entry, or both prices are unavailable, or when
+ *   anything in the lookup throws.
+ */
+function computeCostUsd(model, inputTokens, outputTokens) {
+  try {
+    const { getModelRegistrySync } = require("../routing/model-registry");
+    const registry = getModelRegistrySync && getModelRegistrySync();
+    const pricing = registry?.getCost?.(model);
+    if (!pricing) return null;
+
+    if (pricing.unknown) {
+      // Unknown model → record null (not a fabricated default), warn once so
+      // the gap is visible and can be fixed via MODEL_PRICE_OVERRIDES.
+      const firstSighting = model && !_unknownCostWarned.has(model);
+      if (firstSighting) {
+        _unknownCostWarned.add(model);
+        logger.warn({ model }, "[Cost] No pricing for model — recording cost_usd=null. Set MODEL_PRICE_OVERRIDES to fix.");
+      }
+      return null;
+    }
+
+    if (pricing.input == null && pricing.output == null) return null;
+
+    // Prices are per one million tokens; a missing side contributes 0.
+    const perMillion = (tokens, rate) => ((tokens || 0) / 1e6) * (rate || 0);
+    const totalUsd = perMillion(inputTokens, pricing.input) + perMillion(outputTokens, pricing.output);
+    return Number(totalUsd.toFixed(6));
+  } catch {
+    return null;
+  }
+}
+
+// Telemetry prompt/response text is always captured (truncated) to build the
+// routing ML training corpus. Stored locally in .lynkr/telemetry.db only.
+const TELEMETRY_TEXT_MAXLEN = 2000;
+
+/**
+ * Flatten the latest user message to plain text (for telemetry capture).
+ *
+ * Walks the messages from newest to oldest and returns the first user message
+ * that yields non-empty text, cut to TELEMETRY_TEXT_MAXLEN characters.
+ *
+ * @param {object} body - Request body; `body.messages` is read.
+ * @returns {string|null} Truncated text, or null when there is none.
+ */
+function captureRequestText(body) {
+  const messages = body?.messages;
+  if (!Array.isArray(messages)) return null;
+
+  // String content is used as-is; block arrays contribute only "text" blocks.
+  const flatten = (content) => {
+    if (typeof content === "string") return content;
+    if (!Array.isArray(content)) return "";
+    return content
+      .filter((block) => block?.type === "text")
+      .map((block) => block.text || "")
+      .join(" ");
+  };
+
+  let idx = messages.length;
+  while (idx-- > 0) {
+    const msg = messages[idx];
+    if (msg?.role !== "user") continue;
+    const flat = flatten(msg.content);
+    if (flat) return flat.slice(0, TELEMETRY_TEXT_MAXLEN);
+  }
+  return null;
+}
+
+/**
+ * Flatten an Anthropic response's text blocks to plain text (for telemetry).
+ *
+ * @param {object} resultJson - Response JSON; `resultJson.content` is read.
+ * @returns {string|null} Space-joined text of the "text" blocks, cut to
+ *   TELEMETRY_TEXT_MAXLEN characters, or null when there is no text.
+ */
+function captureResponseText(resultJson) {
+  const blocks = resultJson?.content;
+  if (!Array.isArray(blocks)) return null;
+
+  const pieces = [];
+  for (const block of blocks) {
+    if (block?.type === "text") pieces.push(block.text || "");
+  }
+
+  const joined = pieces.join(" ");
+  if (!joined) return null;
+  return joined.slice(0, TELEMETRY_TEXT_MAXLEN);
+}
+
async function invokeModel(body, options = {}) {
const { determineProviderSmart, isFallbackEnabled, getFallbackProvider } = require("./routing");
const metricsCollector = getMetricsCollector();
@@ -2233,6 +2304,9 @@ async function invokeModel(body, options = {}) {
circuit_breaker_state: breaker.state,
quality_score: qualityScore,
tokens_per_second: outputTokens && latency> 0 ? outputTokens / (latency / 1000) : null,
+ cost_usd: computeCostUsd(routingDecision.model || body._tierModel, inputTokens, outputTokens),
+ request_text: captureRequestText(body),
+ response_text: captureResponseText(result.json),
});
// Return result with provider info and routing decision for headers
@@ -2394,6 +2468,9 @@ async function invokeModel(body, options = {}) {
{ status_code: 200, output_tokens: fbOutputTokens, tool_calls_made: fbToolCalls, was_fallback: true, retry_count: 0, latency_ms: Date.now() - startTime }
),
tokens_per_second: fbOutputTokens && fallbackLatency> 0 ? fbOutputTokens / (fallbackLatency / 1000) : null,
+ cost_usd: computeCostUsd(routingDecision.model || body._tierModel, fbInputTokens, fbOutputTokens),
+ request_text: captureRequestText(body),
+ response_text: captureResponseText(fallbackResult.json),
});
// Return result with actual provider used (fallback provider) and routing decision
diff --git a/src/clients/openrouter-utils.js b/src/clients/openrouter-utils.js
index 1a2daba..7978f8c 100644
--- a/src/clients/openrouter-utils.js
+++ b/src/clients/openrouter-utils.js
@@ -176,6 +176,21 @@ function convertAnthropicMessagesToOpenRouter(anthropicMessages) {
}
}
+  // Kimi/Moonshot (and some OpenAI-compatible APIs) reject a message whose
+  // content is an empty string with "Invalid request: tokenization failed".
+  // This happens when a turn had only non-text blocks (thinking / image /
+  // stripped content) and flattened to "". Replace empty/whitespace-only
+  // content with a single space — but never touch an assistant message that
+  // carries tool_calls, where content: null is intentional and required.
+  for (const m of converted) {
+    if (m.role === 'tool') continue;
+    const hasToolCalls = Array.isArray(m.tool_calls) && m.tool_calls.length > 0;
+    if (hasToolCalls) continue;
+    // A non-empty array is structured (multi-part) content, not "empty":
+    // overwriting it with ' ' would silently discard the whole turn.
+    // NOTE(review): whether this converter ever emits array content is not
+    // visible here — confirm; if it never does, this guard is a no-op.
+    if (Array.isArray(m.content) && m.content.length > 0) continue;
+    if (typeof m.content !== 'string' || m.content.trim() === '') {
+      m.content = ' ';
+    }
+  }
+
// Log the converted messages for debugging
logger.debug({
inputCount: anthropicMessages.length,
diff --git a/src/config/index.js b/src/config/index.js
index 729f2fc..e4ac410 100644
--- a/src/config/index.js
+++ b/src/config/index.js
@@ -208,6 +208,11 @@ const tokenBudgetWarning = Number.parseInt(process.env.TOKEN_BUDGET_WARNING ?? "
const tokenBudgetMax = Number.parseInt(process.env.TOKEN_BUDGET_MAX ?? "180000", 10);
const tokenBudgetEnforcement = process.env.TOKEN_BUDGET_ENFORCEMENT !== "false"; // default true
+// Caveman terse-output injection (opt-in, off by default)
+const cavemanEnabled = process.env.CAVEMAN_ENABLED === "true";
+const cavemanLevel = (process.env.CAVEMAN_LEVEL ?? "lite").toLowerCase();
+
+
// TOON payload compression (opt-in)
const toonEnabled = process.env.TOON_ENABLED === "true"; // default false
const toonMinBytes = Number.parseInt(process.env.TOON_MIN_BYTES ?? "4096", 10);
@@ -641,6 +646,10 @@ var config = {
toolResultCompression: {
enabled: true,
},
+ caveman: {
+ enabled: cavemanEnabled,
+ level: cavemanLevel,
+ },
server: {
jsonLimit: process.env.REQUEST_JSON_LIMIT ?? "1gb",
},
diff --git a/src/context/caveman.js b/src/context/caveman.js
new file mode 100644
index 0000000..550b201
--- /dev/null
+++ b/src/context/caveman.js
@@ -0,0 +1,94 @@
+/**
+ * Caveman Terse-Output Injector
+ *
+ * Appends a brevity instruction to the system prompt so the model produces
+ * terser responses, reducing OUTPUT tokens. Opt-in and off by default — it
+ * changes model behavior, so it's only applied when explicitly enabled.
+ *
+ * Enable with CAVEMAN_ENABLED=true. Level via CAVEMAN_LEVEL=lite|full|ultra
+ * (default: lite). Adapted from 9router's caveman injector / the caveman skill
+ * (https://github.com/JuliusBrussee/caveman).
+ *
+ * @module context/caveman
+ */
+
+const config = require("../config");
+const logger = require("../logger");
+
+const LEVELS = ["lite", "full", "ultra"];
+
+// Shared guardrails so brevity never corrupts the substance that matters.
+const BOUNDARIES =
+  "Code blocks, file paths, commands, errors, URLs: keep exact. " +
+  "Security warnings, irreversible-action confirmations, and multi-step ordered " +
+  "sequences: write in full normal prose. Resume terse style afterward.";
+
+const EXAMPLES =
+  'Not: "Sure! I\'d be happy to help. The issue is likely caused by..." ' +
+  'Yes: "Bug in auth middleware. Token expiry uses `<` not `<=`. Fix:"';
+
+const PERSISTENCE = "Apply this to every response unless a guardrail above applies.";
+
+const PROMPTS = {
+  lite: [
+    "Respond tersely. Keep grammar and full sentences but drop filler, hedging, and pleasantries (just/really/basically/sure/of course/I'd be happy to).",
+    "Pattern: state the thing, the action, the reason. Then the next step.",
+    EXAMPLES,
+    BOUNDARIES,
+    PERSISTENCE,
+  ].join(" "),
+
+  full: [
+    "Respond like a terse caveman. All technical substance stays exact; only fluff dies.",
+    "Drop articles (a/an/the), filler (just/really/basically/actually/simply), pleasantries, and hedging. Fragments OK. Prefer short synonyms (big not extensive, fix not implement a solution for).",
+    "Pattern: [thing] [action] [reason]. [next step].",
+    EXAMPLES,
+    BOUNDARIES,
+    PERSISTENCE,
+  ].join(" "),
+
+  ultra: [
+    "Respond ultra-terse. Maximum compression. Telegraphic.",
+    "Abbreviate (DB/auth/config/req/res/fn/impl), strip conjunctions, use arrows for causality (X → Y). One word when one word is enough.",
+    "Pattern: [thing] → [result]. [fix].",
+    EXAMPLES,
+    BOUNDARIES,
+    PERSISTENCE,
+  ].join(" "),
+};
+
+const MARKER = "[brevity]";
+
+/** Resolve the configured level, falling back to "lite". */
+function resolveLevel(level) {
+  const l = String(level || config.caveman?.level || "lite").toLowerCase();
+  return LEVELS.includes(l) ? l : "lite";
+}
+
+/**
+ * Append the brevity instruction to a system prompt string.
+ * Idempotent — won't double-inject if the marker is already present.
+ *
+ * NOTE(review): `system` is handled purely as a string (`includes`, then
+ * `+`). A block-array system prompt would be string-coerced by the
+ * concatenation — confirm every caller passes a flattened string.
+ *
+ * @param {string} system - Existing system prompt (may be empty).
+ * @param {object} [opts]
+ * @param {boolean} [opts.enabled] - Override config enablement.
+ * @param {string} [opts.level] - Override level.
+ * @returns {string} system prompt, possibly with brevity instruction appended.
+ */
+function injectCaveman(system, opts = {}) {
+  const enabled = opts.enabled ?? config.caveman?.enabled === true;
+  if (!enabled) return system || "";
+
+  const base = system || "";
+  if (base.includes(MARKER)) return base;
+
+  const level = resolveLevel(opts.level);
+  const instruction = `\n\n${MARKER} ${PROMPTS[level]}`;
+  logger.debug({ level }, "[Caveman] Injected brevity instruction into system prompt");
+  return base + instruction;
+}
+
+module.exports = {
+  injectCaveman,
+  LEVELS,
+};
diff --git a/src/context/tool-dedup.js b/src/context/tool-dedup.js
new file mode 100644
index 0000000..65f0aba
--- /dev/null
+++ b/src/context/tool-dedup.js
@@ -0,0 +1,95 @@
+/**
+ * MCP-aware Tool Dedup
+ *
+ * Strips built-in tool definitions when an equivalent MCP tool is present in
+ * the request. Sending both wastes tool-schema tokens and gives the model
+ * redundant choices. Rule-based and deterministic.
+ *
+ * Example: if the Exa or Tavily MCP search tools are present, the built-in
+ * WebSearch/WebFetch tools are redundant and dropped.
+ *
+ * Ported from 9router's toolDeduper. Always on — purely removes redundant
+ * tool definitions, never adds.
+ *
+ * @module context/tool-dedup
+ */
+
+const logger = require("../logger");
+
+// Each rule: if any `triggers` tool is present, strip any tools matching
+// `strip`. Patterns may be exact strings or RegExp (matched against the name).
+const DEDUP_RULES = [
+  {
+    // Exa MCP present → drop built-in web tools (Exa is preferred).
+    triggers: ["mcp__exa__web_search_exa", "mcp__exa__web_fetch_exa"],
+    strip: ["WebSearch", "WebFetch", "web_search", "web_fetch", "mcp__workspace__web_fetch"],
+  },
+  {
+    // Tavily MCP present → drop built-in web tools.
+    triggers: ["mcp__tavily__tavily_search", "mcp__tavily__tavily_extract"],
+    strip: ["WebSearch", "WebFetch", "web_search", "web_fetch", "mcp__workspace__web_fetch"],
+  },
+  {
+    // Browser MCP present → drop a duplicate Chrome-connector tool family.
+    triggers: [/^mcp__browsermcp__/],
+    strip: [/^mcp__Claude_in_Chrome__/],
+  },
+];
+
+function getToolName(t) {
+  return t?.name || t?.function?.name || "";
+}
+
+function matches(name, pattern) {
+  if (typeof pattern === "string") return name === pattern;
+  return pattern instanceof RegExp ? pattern.test(name) : false;
+}
+
+/**
+ * Remove redundant built-in tools that are superseded by present MCP tools.
+ *
+ * The input array is not modified; when nothing is stripped the same array
+ * reference is returned.
+ *
+ * @param {Array} tools - Tool definitions (Anthropic or OpenAI shape).
+ * @returns {{tools: Array, stripped: string[]}} filtered tools + names removed.
+ */
+function dedupeTools(tools) {
+  if (!Array.isArray(tools) || tools.length === 0) return { tools, stripped: [] };
+
+  const names = tools.map(getToolName);
+  const toStrip = new Set();
+
+  for (const rule of DEDUP_RULES) {
+    const hasTrigger = names.some((n) => rule.triggers.some((p) => matches(n, p)));
+    if (!hasTrigger) continue;
+    for (const n of names) {
+      // Never strip a tool that is itself a trigger.
+      if (rule.triggers.some((p) => matches(n, p))) continue;
+      if (rule.strip.some((p) => matches(n, p))) toStrip.add(n);
+    }
+  }
+
+  if (toStrip.size === 0) return { tools, stripped: [] };
+
+  const out = tools.filter((t) => !toStrip.has(getToolName(t)));
+  return { tools: out, stripped: Array.from(toStrip) };
+}
+
+/**
+ * Apply tool dedup to a payload in place. No-op when nothing is stripped.
+ *
+ * A tool that the request forces through `tool_choice` is never removed, even
+ * when a dedup rule would strip it: dropping it would leave `tool_choice`
+ * naming a tool that is no longer in `tools`.
+ *
+ * @param {object} payload - Request body with a `tools` array.
+ * @returns {string[]} names of stripped tools.
+ */
+function applyToolDedup(payload) {
+  if (!payload || !Array.isArray(payload.tools)) return [];
+  const { stripped } = dedupeTools(payload.tools);
+  if (stripped.length === 0) return [];
+
+  // Forced tool name: `{ type: "tool", name }` or
+  // `{ type: "function", function: { name } }`. String forms ("auto", "any",
+  // "none", "required") force no particular tool.
+  const choice = payload.tool_choice;
+  const forcedName =
+    (choice && typeof choice === "object" && (choice.name || choice.function?.name)) || "";
+
+  const removable = new Set(stripped.filter((name) => name !== forcedName));
+  if (removable.size === 0) return [];
+
+  payload.tools = payload.tools.filter((t) => !removable.has(getToolName(t)));
+  const removed = Array.from(removable);
+  logger.debug({ stripped: removed }, "[ToolDedup] Stripped redundant built-in tools (MCP equivalents present)");
+  return removed;
+}
+
+module.exports = {
+ dedupeTools,
+ applyToolDedup,
+};
diff --git a/src/context/tool-result-compressor.js b/src/context/tool-result-compressor.js
index c538d5b..9171b16 100644
--- a/src/context/tool-result-compressor.js
+++ b/src/context/tool-result-compressor.js
@@ -455,6 +455,107 @@ function compressContainerOutput(text) {
return `${header}\n${dataLines.slice(0, 10).join("\n")}\n... +${dataLines.length - 10} more (${dataLines.length} total)`;
}
+// 11. Grep / ripgrep output ("file:lineno:content"), per-file match cap.
+// Ported from 9router RTK grep filter (rtk/src/cmds/system/pipe_cmd.rs).
+const GREP_PER_FILE_MAX = 10;
+
+/**
+ * Group grep-style matches by file and cap the matches shown per file.
+ *
+ * Lines that do not parse as `file:lineno:content` are ignored. Clock-style
+ * prefixes ("12:34:56 ...", "2024-01-01T12:34:56 ...") also have the shape
+ * `x:digits:rest`, so they are rejected explicitly; otherwise any timestamped
+ * log would be rewritten as if it were grep output.
+ *
+ * @param {string} text - Raw tool output.
+ * @returns {string|null} Compressed summary, or null when fewer than 5 lines
+ *   look like grep matches.
+ */
+function compressGrep(text) {
+  const byFile = new Map();
+  let total = 0;
+
+  for (const line of text.split("\n")) {
+    // splitn(3, ':') — only split on the first two colons.
+    const first = line.indexOf(":");
+    if (first <= 0) continue; // no colon at all, or an empty file part
+    const second = line.indexOf(":", first + 1);
+    if (second === -1) continue;
+    const file = line.slice(0, first);
+    const lineNumStr = line.slice(first + 1, second);
+    const content = line.slice(second + 1);
+    if (!/^\d+$/.test(lineNumStr)) continue;
+
+    // HH:MM:SS — hour at the end of the "file" part, two-digit minute as the
+    // "line number", two-digit second opening the "content".
+    const looksLikeClock =
+      /(?:^|[T\s\[])\d{1,2}$/.test(file) &&
+      lineNumStr.length === 2 &&
+      /^\d{2}(?:\D|$)/.test(content);
+    if (looksLikeClock) continue;
+
+    total++;
+    if (!byFile.has(file)) byFile.set(file, []);
+    byFile.get(file).push([lineNumStr, content]);
+  }
+
+  // Require a meaningful number of matches so we don't mangle prose that
+  // happens to contain a "word:123:..." line.
+  if (total < 5) return null;
+
+  const files = Array.from(byFile.keys()).sort();
+  let out = `${total} matches in ${files.length}F:\n\n`;
+  for (const file of files) {
+    const matches = byFile.get(file);
+    out += `[file] ${file} (${matches.length}):\n`;
+    for (const [lineNum, content] of matches.slice(0, GREP_PER_FILE_MAX)) {
+      out += ` ${lineNum.padStart(4)}: ${content.trim()}\n`;
+    }
+    if (matches.length > GREP_PER_FILE_MAX) {
+      out += ` +${matches.length - GREP_PER_FILE_MAX}\n`;
+    }
+    out += "\n";
+  }
+  return out;
+}
+
+// 12. Generic log de-duplication: collapse consecutive duplicate lines and
+// runs of blank lines, with a hard line cap. Ported from 9router RTK dedupLog.
+const DEDUP_LINE_MAX = 2000;
+
+/**
+ * Collapse consecutive duplicate lines and runs of blank lines.
+ *
+ * @param {string} text - Raw tool output.
+ * @returns {string|null} De-duplicated text, or null when nothing was
+ *   collapsed or truncated — the same "not applicable" signal the other
+ *   compressors in this file use, so an untouched input is not reported as
+ *   handled.
+ */
+function compressDedupLog(text) {
+  const lines = text.split("\n");
+  const out = [];
+  let prev = null;
+  let runCount = 0;
+  let blankStreak = 0;
+  let changed = false;
+
+  // Emit the "N duplicate lines" marker for the run that just ended.
+  const flushRun = () => {
+    if (prev !== null && runCount > 1) {
+      out.push(` ... (${runCount - 1} duplicate lines)`);
+    }
+  };
+
+  for (const line of lines) {
+    if (line.trim() === "") {
+      // Close the current run BEFORE emitting the blank line so the marker
+      // sits directly under the line it refers to.
+      flushRun();
+      prev = null;
+      runCount = 0;
+      if (blankStreak < 1) out.push(line);
+      else changed = true;
+      blankStreak += 1;
+      continue;
+    }
+    blankStreak = 0;
+    if (line === prev) {
+      runCount += 1;
+      changed = true;
+      continue;
+    }
+    flushRun();
+    out.push(line);
+    prev = line;
+    runCount = 1;
+    if (out.length >= DEDUP_LINE_MAX) {
+      out.push(`... (truncated at ${DEDUP_LINE_MAX} lines)`);
+      return out.join("\n");
+    }
+  }
+  flushRun();
+  return changed ? out.join("\n") : null;
+}
+
+// 13. Last-resort generic truncation: keep head + tail lines, drop the middle.
+// Only kicks in for very long output no specific compressor matched.
+// Ported from 9router RTK smartTruncate.
+// Returns null for inputs shorter than SMART_TRUNCATE_MIN_LINES lines;
+// otherwise keeps the first SMART_TRUNCATE_HEAD and last SMART_TRUNCATE_TAIL
+// lines around a one-line marker giving the number of lines dropped.
+const SMART_TRUNCATE_HEAD = 120;
+const SMART_TRUNCATE_TAIL = 60;
+const SMART_TRUNCATE_MIN_LINES = 250;
+function compressSmartTruncate(text) {
+  const lines = text.split("\n");
+  if (lines.length < SMART_TRUNCATE_MIN_LINES) return null;
+
+  const head = lines.slice(0, SMART_TRUNCATE_HEAD);
+  const tail = lines.slice(lines.length - SMART_TRUNCATE_TAIL);
+  const cut = lines.length - head.length - tail.length;
+  return [...head, `... +${cut} lines truncated`, ...tail].join("\n");
+}
+
 // ── Compression Pipeline ─────────────────────────────────────────────
 const COMPRESSORS = [
@@ -466,8 +567,13 @@ const COMPRESSORS = [
   { name: "build_output", fn: compressBuildOutput },
   { name: "container_output", fn: compressContainerOutput },
   { name: "json_response", fn: compressJSON },
+  { name: "grep_output", fn: compressGrep },
   { name: "directory_listing", fn: compressDirectoryListing },
   { name: "large_file", fn: compressLargeFile },
+  // Generic fallbacks last: dedup exact-duplicate spam, then hard head/tail
+  // truncation only if nothing more specific applied.
+  { name: "dedup_log", fn: compressDedupLog },
+  { name: "smart_truncate", fn: compressSmartTruncate },
 ];
 // Compression levels tied to routing tiers
diff --git a/src/dashboard/api.js b/src/dashboard/api.js
index 5e0399c..58c4373 100644
--- a/src/dashboard/api.js
+++ b/src/dashboard/api.js
@@ -5,24 +5,74 @@ const metrics = require('../metrics');
 const { getMetricsCollector } = require('../observability/metrics');
 const { TIER_DEFINITIONS } = require('../routing/model-tiers');
-function getConfiguredProviders() {
+// Per-provider type + whether its credentials/endpoint are actually present.
+// Returns a fresh object keyed by provider name; `configured` is always a
+// boolean derived from the current `config` values.
+function providerMeta() {
   const c = config;
-  const providers = [];
-  const add = (name, type, ok) => ok && providers.push({ name, type });
-
-  add('databricks', 'cloud', c.databricks?.url && c.databricks?.apiKey);
-  add('azure-anthropic','cloud', c.azureAnthropic?.endpoint && c.azureAnthropic?.apiKey);
-  add('bedrock', 'cloud', c.bedrock?.apiKey);
-  add('openrouter', 'cloud', c.openrouter?.apiKey);
-  add('openai', 'cloud', c.openai?.apiKey);
-  add('azure-openai', 'cloud', c.azureOpenAI?.endpoint && c.azureOpenAI?.apiKey);
-  add('vertex', 'cloud', c.vertex?.projectId);
-  add('moonshot', 'cloud', c.moonshot?.apiKey);
-  add('ollama', 'local', c.ollama?.endpoint);
-  add('llamacpp', 'local', c.llamacpp?.endpoint);
-  add('lmstudio', 'local', c.lmstudio?.endpoint);
-
-  return providers;
+  return {
+    databricks: { type: 'cloud', configured: !!(c.databricks?.url && c.databricks?.apiKey) },
+    'azure-anthropic': { type: 'cloud', configured: !!(c.azureAnthropic?.endpoint && c.azureAnthropic?.apiKey) },
+    bedrock: { type: 'cloud', configured: !!c.bedrock?.apiKey },
+    openrouter: { type: 'cloud', configured: !!c.openrouter?.apiKey },
+    openai: { type: 'cloud', configured: !!c.openai?.apiKey },
+    'azure-openai': { type: 'cloud', configured: !!(c.azureOpenAI?.endpoint && c.azureOpenAI?.apiKey) },
+    vertex: { type: 'cloud', configured: !!c.vertex?.projectId },
+    moonshot: { type: 'cloud', configured: !!c.moonshot?.apiKey },
+    ollama: { type: 'local', configured: !!c.ollama?.endpoint },
+    llamacpp: { type: 'local', configured: !!c.llamacpp?.endpoint },
+    lmstudio: { type: 'local', configured: !!c.lmstudio?.endpoint },
+  };
+}
+
+// Providers the active routing config actually points at: the provider prefix
+// of each TIER_* value (format `provider:model[:variant]`) plus the base
+// MODEL_PROVIDER. Returns a Map from lowercased provider name to the list of
+// labels (tier names, or 'default' for the base provider) that reference it.
+function getReferencedProviders() {
+  const byProvider = new Map();
+
+  // Register `label` under the normalised provider name; blank names are ignored.
+  const record = (provider, label) => {
+    const name = String(provider || '').trim().toLowerCase();
+    if (name === '') return;
+    let labels = byProvider.get(name);
+    if (labels === undefined) {
+      labels = [];
+      byProvider.set(name, labels);
+    }
+    if (label && !labels.includes(label)) labels.push(label);
+  };
+
+  const tierConfig = config.modelTiers || {};
+  Object.keys(tierConfig).forEach((tier) => {
+    const spec = tierConfig[tier];
+    if (typeof spec !== 'string' || !spec.trim()) return;
+    record(spec.split(':')[0], tier);
+  });
+  record(config.modelProvider?.type, 'default');
+
+  return byProvider;
+}
+
+// Providers used by the routing config that have credentials/endpoints set.
+// Unknown providers (no metadata) are included optimistically since we can't
+// verify their credentials; they are reported with type 'cloud'.
+function getConfiguredProviders() {
+  const meta = providerMeta();
+  return Array.from(getReferencedProviders())
+    .filter(([name]) => !meta[name] || meta[name].configured)
+    .map(([name, tiers]) => ({ name, type: meta[name]?.type || 'cloud', tiers }));
+}
+
+// Tiers pointing at a known provider whose credentials/endpoint are missing —
+// surfaced as a warning so a misconfigured tier is visible. Providers without
+// metadata are never reported here.
+function getProviderWarnings() {
+  const meta = providerMeta();
+  const warnings = [];
+  getReferencedProviders().forEach((tiers, name) => {
+    const info = meta[name];
+    if (!info || info.configured) return;
+    warnings.push({ name, type: info.type, tiers });
+  });
+  return warnings;
 }
// Noise provider names injected by unit tests — filter them out of UI
@@ -92,7 +142,8 @@ function overview(req, res) {
port: config.port,
version: process.env.npm_package_version || '9.0.2',
modelProvider: config.modelProvider?.type || 'unknown',
- providers: getConfiguredProviders(),
+ providers: getConfiguredProviders(),
+ providerWarnings: getProviderWarnings(),
statsWindow: win.label,
metrics: {
requestsTotal: snap.requestsTotal,
diff --git a/src/orchestrator/bypass.js b/src/orchestrator/bypass.js
new file mode 100644
index 0000000..b47a567
--- /dev/null
+++ b/src/orchestrator/bypass.js
@@ -0,0 +1,135 @@
+/**
+ * Request Bypass
+ *
+ * Short-circuits Claude Code CLI housekeeping requests that don't need a real
+ * model call:
+ * - "Warmup" pings the CLI sends to prime a connection
+ * - Topic/title extraction (the CLI asks for {"isNewTopic":..,"title":..})
+ * - Single-word "count" / "Warmup" probes
+ *
+ * Returning a canned response here saves a full provider round-trip (latency
+ * and tokens) on every session. Inspired by 9router's bypassHandler.
+ *
+ * Always on — only ever returns a canned response for unambiguous Claude CLI
+ * housekeeping traffic, never for real work.
+ *
+ * @module orchestrator/bypass
+ */
+
+const logger = require("../logger");
+
+/**
+ * Flatten Anthropic content (string | block[]) into plain text.
+ * Strings pass through; arrays contribute the `text` of each well-formed
+ * text block, joined by single spaces; anything else yields "".
+ */
+function getText(content) {
+  if (typeof content === "string") return content;
+  if (!Array.isArray(content)) return "";
+
+  const parts = [];
+  for (const block of content) {
+    if (block && block.type === "text" && typeof block.text === "string") {
+      parts.push(block.text);
+    }
+  }
+  return parts.join(" ");
+}
+
+/**
+ * Flatten the top-level Anthropic `system` field (string | block[]).
+ * Same rules as message content: strings pass through, text blocks are
+ * space-joined, every other shape becomes "".
+ */
+function getSystemText(system) {
+  if (typeof system === "string") return system;
+  if (!Array.isArray(system)) return "";
+
+  const isTextBlock = (entry) =>
+    Boolean(entry) && entry.type === "text" && typeof entry.text === "string";
+  return system
+    .filter(isTextBlock)
+    .map(({ text }) => text)
+    .join(" ");
+}
+
+/**
+ * Decide whether a request is a bypassable Claude CLI housekeeping call.
+ *
+ * Checks, in order: (1) an assistant prefill of just "{", (2) a system prompt
+ * mentioning "isNewTopic", (3) a lone user message that is exactly "Warmup"
+ * or "count". Requests whose user-agent does not contain "claude-cli" are
+ * never bypassed.
+ *
+ * @param {object} args
+ * @param {object} args.payload - The Anthropic request body.
+ * @param {object} [args.headers] - Lowercased request headers.
+ * @returns {{kind: string, text: string}|null} bypass descriptor or null.
+ */
+function detectBypass({ payload, headers = {} }) {
+  if (!payload || !Array.isArray(payload.messages) || payload.messages.length === 0) {
+    return null;
+  }
+
+  // Only bypass Claude CLI traffic — other clients use these endpoints for
+  // real work and must never receive a canned response.
+  const ua = String(headers["user-agent"] || "").toLowerCase();
+  if (!ua.includes("claude-cli")) return null;
+
+  const messages = payload.messages;
+  const lastMsg = messages[messages.length - 1];
+
+  // Pattern 1: Title prefill — the CLI seeds an assistant turn with just "{"
+  // to coax a JSON object out of the model.
+  if (lastMsg?.role === "assistant") {
+    const firstBlockText =
+      Array.isArray(lastMsg.content) && lastMsg.content[0]?.type === "text"
+        ? lastMsg.content[0].text
+        : typeof lastMsg.content === "string"
+          ? lastMsg.content
+          : "";
+    if (firstBlockText.trim() === "{") {
+      return { kind: "title_prefill", text: "{}" };
+    }
+  }
+
+  // Pattern 2: Topic/title extraction — system prompt asks for isNewTopic.
+  // Synthesize a title from the first user message instead of calling a model.
+  const systemText = getSystemText(payload.system);
+  if (systemText.includes("isNewTopic")) {
+    // `m?.role`: a null/undefined entry must not throw here — every other
+    // message access in this function is already null-safe.
+    const userMsg = messages.find((m) => m?.role === "user");
+    const userText = getText(userMsg?.content).trim();
+    const title = userText.split(/\s+/).filter(Boolean).slice(0, 3).join(" ");
+    return {
+      kind: "title_extraction",
+      text: JSON.stringify({ isNewTopic: true, title }),
+    };
+  }
+
+  // Pattern 3: Warmup / count probes — a single short user message.
+  if (messages.length === 1 && messages[0]?.role === "user") {
+    const firstText = getText(messages[0].content).trim();
+    if (firstText === "Warmup" || firstText === "count") {
+      return { kind: firstText.toLowerCase(), text: "OK" };
+    }
+  }
+
+  return null;
+}
+
+/**
+ * Build the processMessage-shaped response for a bypass descriptor.
+ * Matches the `{ status, body, terminationReason }` contract the router
+ * consumes (same shape as the prompt-cache early returns).
+ *
+ * @param {{kind: string, text: string}} bypass
+ * @param {string} model - Model id to echo back.
+ * @returns {{status: number, body: object, terminationReason: string}}
+ */
+function buildBypassResponse(bypass, model) {
+  const { kind, text } = bypass;
+  logger.info({ kind }, "[Bypass] Short-circuiting CLI housekeeping request");
+
+  // Anthropic-style message envelope carrying the canned text.
+  const body = {
+    id: `msg_bypass_${Date.now()}`,
+    type: "message",
+    role: "assistant",
+    content: [{ type: "text", text }],
+    model: model || "claude-3-unknown",
+    stop_reason: "end_turn",
+    stop_sequence: null,
+    usage: { input_tokens: 1, output_tokens: 1 },
+    lynkr_bypass: { kind },
+  };
+
+  return { status: 200, body, terminationReason: `bypass_${kind}` };
+}
+
+module.exports = {
+ detectBypass,
+ buildBypassResponse,
+};
diff --git a/src/orchestrator/index.js b/src/orchestrator/index.js
index f1144b6..87d2cce 100644
--- a/src/orchestrator/index.js
+++ b/src/orchestrator/index.js
@@ -18,6 +18,7 @@ const { createAuditLogger } = require("../logger/audit-logger");
const { getResolvedIp, runWithDnsContext } = require("../clients/dns-logger");
const { getShuttingDown } = require("../api/health");
const { tryPreflight, buildSatisfiedResponse: buildPreflightResponse } = require("./preflight");
+const { detectBypass, buildBypassResponse } = require("./bypass");
const crypto = require("crypto");
const { asyncClone, asyncTransform, getPoolStats } = require("../workers/helpers");
const { getSemanticCache, isSemanticCacheEnabled } = require("../cache/semantic");
@@ -1362,8 +1363,12 @@ function sanitizePayload(payload) {
delete clean.tool_choice;
}
- // Smart tool selection (universal, applies to all providers)
- if (config.smartToolSelection?.enabled && Array.isArray(clean.tools) && clean.tools.length> 0) {
+ // Smart tool selection (server mode only). In client/passthrough mode the
+ // client (e.g. Claude Code) owns tool execution, so stripping its tools would
+ // make the model emit calls for tools we removed — they then get dropped as
+ // "hallucinated" and the session makes no progress. Pass tools through intact.
+ const inClientMode = config.toolExecutionMode === "client" || config.toolExecutionMode === "passthrough";
+ if (!inClientMode && config.smartToolSelection?.enabled && Array.isArray(clean.tools) && clean.tools.length> 0) {
const classification = classifyRequestType(clean);
const selectedTools = selectToolsSmartly(clean.tools, classification, {
provider: providerType,
@@ -1977,6 +1982,12 @@ IMPORTANT TOOL USAGE RULES:
cleanPayload._tenantPolicy = options.tenantPolicy;
}
+ // Thread session id for provider affinity — keeps a tool-bearing
+ // conversation on one provider so tool_call_id linkage doesn't break.
+ if (session?.id) {
+ cleanPayload._sessionId = session.id;
+ }
+
// RTK-inspired tool result compression: compress large tool_results
// before they reach the model (saves 60-90% on test/git/lint output)
if (config.toolResultCompression?.enabled !== false) {
@@ -1985,6 +1996,18 @@ IMPORTANT TOOL USAGE RULES:
compressToolResults(cleanPayload.messages, { tier });
}
+ // MCP-aware tool dedup: drop built-in tools superseded by present MCP tools
+ // (e.g. WebSearch/WebFetch when Exa/Tavily MCP is available). Always on.
+ const { applyToolDedup } = require("../context/tool-dedup");
+ applyToolDedup(cleanPayload);
+
+ // Caveman terse-output injection (opt-in): nudge the model toward shorter
+ // responses to reduce output tokens.
+ if (config.caveman?.enabled === true) {
+ const { injectCaveman } = require("../context/caveman");
+ cleanPayload.system = injectCaveman(cleanPayload.system);
+ }
+
if (agentTimer) agentTimer.mark("preInvokeModel");
let databricksResponse;
try {
@@ -3735,6 +3758,14 @@ async function processMessage({ payload, headers, session, cwd, options = {} })
};
}
+ // === REQUEST BYPASS ===
+ // Claude CLI housekeeping (Warmup pings, topic/title extraction) doesn't
+ // need a model call — return a canned response and skip the provider.
+ const bypass = detectBypass({ payload, headers });
+ if (bypass) {
+ return buildBypassResponse(bypass, requestedModel);
+ }
+
// === PREFLIGHT CHECK ===
// If the request supplied preflight_commands and they all pass in
// the workspace, the work is already done — short-circuit with a
diff --git a/src/routing/index.js b/src/routing/index.js
index 93c270b..b760fc3 100644
--- a/src/routing/index.js
+++ b/src/routing/index.js
@@ -138,7 +138,46 @@ function getBestLocalProvider() {
* @param {Object} options - Routing options
* @returns {Object} Routing decision with provider and metadata
*/
+const sessionAffinity = require('./session-affinity');
+
+/**
+ * Provider routing with session affinity.
+ *
+ * When a conversation already carries tool history, reuse the provider the
+ * session is pinned to so tool-call IDs don't break across providers; each
+ * reuse re-pins so an active session's pin stays fresh. Other turns route
+ * normally and refresh the session's pinned provider.
async function determineProviderSmart(payload, options = {}) {
+  const sessionId = payload?._sessionId || null;
+
+  // Enforce affinity only for in-flight tool exchanges — the turns that 400
+  // if the provider changes. Fresh turns keep full per-turn tier routing.
+  if (sessionId && !options.forceProvider && sessionAffinity.payloadHasToolHistory(payload)) {
+    const pinned = sessionAffinity.getPinned(sessionId);
+    if (pinned) {
+      // Re-pin on reuse: the pin's timestamp and eviction order only advance in setPinned().
+      sessionAffinity.setPinned(sessionId, pinned);
+      logger.debug({ sessionId, provider: pinned.provider, tier: pinned.tier },
+        '[Routing] Session affinity — reusing provider for tool-bearing turn');
+      return {
+        provider: pinned.provider,
+        model: pinned.model,
+        tier: pinned.tier,
+        method: 'session_affinity',
+        reason: 'tool_history_provider_pin',
+      };
+    }
+  }
+
+  const decision = await _determineProviderSmartInner(payload, options);
+
+  // Remember the chosen provider so later tool-bearing turns stay consistent.
+  if (sessionId && decision?.provider && !options.forceProvider) sessionAffinity.setPinned(sessionId, decision);
+
+  return decision;
+}
+
+async function _determineProviderSmartInner(payload, options = {}) {
const primaryProvider = config.modelProvider?.type ?? 'databricks';
// Risk analysis runs orthogonally to complexity. We compute it once
diff --git a/src/routing/model-registry.js b/src/routing/model-registry.js
index e52258b..ac87804 100644
--- a/src/routing/model-registry.js
+++ b/src/routing/model-registry.js
@@ -54,9 +54,41 @@ const DATABRICKS_FALLBACK = {
'databricks-bge-large-en': { input: 0.02, output: 0, context: 512 },
};
-// Default cost for unknown models
+// Default cost for unknown models. Returned with `unknown: true` so callers can
+// distinguish a real price from a fabricated guess.
const DEFAULT_COST = { input: 1.0, output: 3.0, context: 128000 };
+// Curated name aliases (exact, one-directional). Maps a name a caller might use
+// to the canonical key likely present in the pricing data. Misses are harmless
+// (resolution simply continues down the ladder).
+const MODEL_ALIASES = {
+ 'claude-sonnet-4-5': 'claude-sonnet-4-5-20250929',
+ 'claude-opus-4-1': 'claude-opus-4-1-20250805',
+ 'claude-3-5-sonnet': 'claude-3-5-sonnet-20241022',
+};
+
+/**
+ * Parse operator price overrides from the MODEL_PRICE_OVERRIDES env var, a JSON object
+ * shaped like { "<model>": { "input": <number>, "output": <number>, "context"?: <number> } }.
+ * Returns a Map keyed by lowercased, trimmed model name; malformed entries are skipped.
+ */
+function _loadOverrides() {
+  const out = new Map();
+  const raw = process.env.MODEL_PRICE_OVERRIDES;
+  if (!raw) return out;
+  try {
+    for (const [name, info] of Object.entries(JSON.parse(raw))) {
+      // Normalize keys the way getCost() normalizes lookups, else a padded key never matches.
+      const key = name.toLowerCase().trim();
+      const priced = Number.isFinite(info?.input) && Number.isFinite(info?.output);
+      if (key && priced) out.set(key, { context: 128000, ...info });
+    }
+  } catch (err) {
+    logger.warn({ err: err.message }, '[ModelRegistry] Failed to parse MODEL_PRICE_OVERRIDES');
+  }
+  return out;
+}
+
class ModelRegistry {
constructor() {
this.litellmPrices = {};
@@ -64,6 +96,7 @@ class ModelRegistry {
this.loaded = false;
this.lastFetch = 0;
this.modelIndex = new Map();
+ this.overrides = _loadOverrides();
}
/**
@@ -255,40 +288,70 @@ class ModelRegistry {
* @returns {Object} Cost info { input, output, context, ... }
*/
getCost(modelName) {
- if (!modelName) return { ...DEFAULT_COST, source: 'default' };
+ if (!modelName) return { ...DEFAULT_COST, source: 'default', unknown: true };
- const normalizedName = modelName.toLowerCase();
+ const name = String(modelName).toLowerCase().trim();
+ const hit = this._resolveCost(name);
+ if (hit) return hit;
- // Direct lookup
- if (this.modelIndex.has(normalizedName)) {
- return this.modelIndex.get(normalizedName);
- }
+ // Nothing matched — report unknown rather than silently fabricating a price.
+ logger.debug({ model: modelName }, '[ModelRegistry] Model not found — cost unknown');
+ return { ...DEFAULT_COST, source: 'default', unknown: true };
+ }
- // Try common variations
- const variations = [
- normalizedName,
- normalizedName.replace('databricks-', ''),
- normalizedName.replace('azure/', ''),
- normalizedName.replace('bedrock/', ''),
- normalizedName.replace('anthropic.', ''),
- normalizedName.split('/').pop(),
- ];
-
- for (const variant of variations) {
- if (this.modelIndex.has(variant)) {
- return this.modelIndex.get(variant);
- }
+ /**
+ * Deterministic price resolution: override → exact → provider-prefix strip → alias →
+ * date/version-suffix strip → longest-prefix. No bidirectional substring matching; the
+ * one loose step (longest-prefix) is one-directional and needs a registry key of 6+ chars.
+ * @param {string} name - already lowercased/trimmed
+ * @returns {object|null} cost tagged with `resolution` (plus `matchedAs` if another key matched), or null
+ */
+ _resolveCost(name) {
+ const tag = (value, resolution, matchedAs) => ({
+ ...value,
+ resolution,
+ ...(matchedAs && matchedAs !== name ? { matchedAs } : {}),
+ });
+
+ // 1. Operator overrides (exact) — ground truth, checked before the registry.
+ if (this.overrides.has(name)) return tag({ ...this.overrides.get(name), source: 'override' }, 'override');
+
+ // 2. Exact registry hit.
+ if (this.modelIndex.has(name)) return tag(this.modelIndex.get(name), 'exact');
+
+ // 3. Provider-prefix strip (exact). Each candidate is tried against overrides, then the registry.
+ const stripped = [
+ name.replace(/^databricks-/, ''),
+ name.replace(/^azure\//, ''),
+ name.replace(/^bedrock\//, ''),
+ name.replace(/^anthropic\./, ''),
+ name.replace(/^openai\//, ''),
+ name.includes('/') ? name.split('/').pop() : null,
+ ].filter((v) => v && v !== name);
+ for (const v of stripped) {
+ if (this.overrides.has(v)) return tag({ ...this.overrides.get(v), source: 'override' }, 'prefix-strip', v);
+ if (this.modelIndex.has(v)) return tag(this.modelIndex.get(v), 'prefix-strip', v);
}
- // Fuzzy match for partial names
+ // 4. Curated alias (exact). Only the registry index is consulted for the alias target.
+ const alias = MODEL_ALIASES[name];
+ if (alias && this.modelIndex.has(alias)) return tag(this.modelIndex.get(alias), 'alias', alias);
+
+ // 5. Date/version-suffix normalization (e.g. -20250929, -2025-09-29, -v2).
+ const dateless = name.replace(/[-@](\d{8}|\d{4}-\d{2}-\d{2}|v\d+)$/, '');
+ if (dateless !== name && this.modelIndex.has(dateless)) return tag(this.modelIndex.get(dateless), 'date-normalize', dateless);
+
+ // 6. Longest registry key that is a prefix of the requested name, e.g.
+ // "gpt-5.2-chat-2026" → "gpt-5.2-chat". Keys shorter than 6 chars are never considered.
+ let best = null;
for (const [key, value] of this.modelIndex.entries()) {
- if (key.includes(normalizedName) || normalizedName.includes(key)) {
- return value;
+ if (key.length>= 6 && name.startsWith(key) && (!best || key.length> best.key.length)) {
+ best = { key, value };
}
}
+ if (best) return tag(best.value, 'longest-prefix', best.key);
- logger.debug({ model: modelName }, '[ModelRegistry] Model not found, using default');
- return { ...DEFAULT_COST, source: 'default' };
+ return null;
}
/**
diff --git a/src/routing/risk-analyzer.js b/src/routing/risk-analyzer.js
index efd8281..78c402c 100644
--- a/src/routing/risk-analyzer.js
+++ b/src/routing/risk-analyzer.js
@@ -13,13 +13,18 @@ const { extractContent } = require('./complexity-analyzer');
// Substring keywords found in file paths or instruction text.
// Matched case-insensitively as raw substrings, so "auth" hits
// "src/auth/login.ts" and "authentication".
+// NOTE: keywords are matched as case-insensitive *substrings* against file
+// paths, so overly generic terms cause false positives. 'session' and 'token'
+// were removed because they match benign paths (src/sessions/*, tokenizer.js,
+// token-budget.js) and were force-escalating ordinary requests to COMPLEX —
+// real secrets/credentials are still covered by the keywords below.
const PROTECTED_PATH_KEYWORDS = [
- 'auth', 'oauth', 'jwt', 'session', 'security', 'permission', 'rbac',
+ 'auth', 'oauth', 'jwt', 'security', 'permission', 'rbac',
'payment', 'payments', 'billing', 'invoice', 'subscription',
'migration', 'migrations', 'schema',
'infra', 'terraform', 'kustomize', 'helm', 'kubernetes',
'.github/workflows', '.env', 'secret', 'credential',
- 'api-key', 'api_key', 'apikey', 'token',
+ 'api-key', 'api_key', 'apikey',
'webhook', 'admin',
];
diff --git a/src/routing/session-affinity.js b/src/routing/session-affinity.js
new file mode 100644
index 0000000..5f76f82
--- /dev/null
+++ b/src/routing/session-affinity.js
@@ -0,0 +1,96 @@
+/**
+ * Session → Provider Affinity
+ *
+ * A multi-turn agentic conversation builds up tool_use / tool_result history
+ * whose tool-call IDs are formatted for the provider that produced them. If a
+ * later turn re-routes to a *different* provider (because per-turn complexity
+ * or risk changed), that provider rejects the orphaned tool linkage:
+ *
+ * Azure: 400 "No tool call found for function call output with call_id ..."
+ * Moonshot: 400 "Invalid request: tool_call_id is not found"
+ *
+ * To prevent that, once a session has chosen a provider we keep subsequent
+ * turns on it *while the payload carries tool history*. Fresh turns (no tool
+ * state) still route normally, so per-turn tier routing is preserved.
+ *
+ * @module routing/session-affinity
+ */
+
+const MAX_ENTRIES = 2000; // max pinned sessions kept; beyond this the oldest-inserted pin is dropped
+const TTL_MS = 60 * 60 * 1000; // 1 hour — pins older than this are discarded on read
+
+/** @type {Map<string, {provider: string, model: string|null, tier: string|null, ts: number}>} sessionId → pin */
+const pins = new Map();
+
+/** Drop the oldest-inserted pin once the store exceeds MAX_ENTRIES. */
+function _evictIfNeeded() {
+  if (pins.size <= MAX_ENTRIES) return;
+  // Map preserves insertion order — the first key is the oldest.
+  const oldest = pins.keys().next().value;
+  if (oldest !== undefined) pins.delete(oldest);
+}
+
+/**
+ * True when the payload contains an in-flight tool exchange — i.e. a prior
+ * assistant tool_use or a user tool_result. These are the turns whose
+ * tool-call IDs break if the provider changes.
+ *
+ * Only array-form message content is inspected; string content and
+ * malformed entries are skipped.
+ * @param {object} payload - Request payload; `messages` may be missing or non-array.
+ * @returns {boolean}
+ */
+function payloadHasToolHistory(payload) {
+  const messages = payload?.messages;
+  if (!Array.isArray(messages)) return false;
+  const isToolBlock = (block) => block?.type === "tool_use" || block?.type === "tool_result";
+  return messages.some((msg) => Array.isArray(msg?.content) && msg.content.some(isToolBlock));
+}
+
+/**
+ * Return the pinned routing decision for a session, or null if none / expired.
+ * An expired pin (older than TTL_MS) is removed from the store as a side effect.
+ * @param {string} sessionId
+ * @returns {{provider: string, model: string|null, tier: string|null, ts: number}|null}
+ */
+function getPinned(sessionId) {
+  if (!sessionId) return null;
+  const entry = pins.get(sessionId);
+  if (!entry) return null;
+  // Lazy expiry: a stale pin is only purged when it is looked up.
+  if (Date.now() - entry.ts > TTL_MS) {
+    pins.delete(sessionId);
+    return null;
+  }
+  return entry;
+}
+
+/**
+ * Pin a session to the provider it was just routed to so later tool-bearing
+ * turns can reuse it. No-op when the session id or provider is missing.
+ * @param {string} sessionId
+ * @param {{provider:string, model?:string|null, tier?:string|null}} decision
+ */
+function setPinned(sessionId, decision) {
+  const provider = decision?.provider;
+  if (!(sessionId && provider)) return;
+  const { model = null, tier = null } = decision;
+  const pin = { provider, model, tier, ts: Date.now() };
+  // Delete-then-set moves the key to the end of the Map's insertion order,
+  // so recently pinned sessions are the last to be evicted.
+  pins.delete(sessionId);
+  pins.set(sessionId, pin);
+  _evictIfNeeded();
+}
+
+/** Test/maintenance helper: removes every pinned session from the in-memory store. */
+function _clear() {
+ pins.clear();
+}
+
+module.exports = {
+ payloadHasToolHistory,
+ getPinned,
+ setPinned,
+ _clear,
+};
diff --git a/src/routing/telemetry.js b/src/routing/telemetry.js
index 5d2a504..e606d35 100644
--- a/src/routing/telemetry.js
+++ b/src/routing/telemetry.js
@@ -94,7 +94,9 @@ function init() {
circuit_breaker_state TEXT,
quality_score REAL,
tokens_per_second REAL,
- cost_efficiency REAL
+ cost_efficiency REAL,
+ request_text TEXT,
+ response_text TEXT
);
CREATE INDEX IF NOT EXISTS idx_telemetry_provider
@@ -110,6 +112,15 @@ function init() {
ON routing_telemetry(session_id, timestamp);
`);
+ // Migration: add columns to pre-existing tables (CREATE TABLE IF NOT EXISTS
+ // won't add them to a DB created before these columns existed).
+ const existingCols = new Set(db.prepare("PRAGMA table_info(routing_telemetry)").all().map((c) => c.name));
+ for (const col of ["request_text", "response_text"]) {
+ if (!existingCols.has(col)) {
+ db.exec(`ALTER TABLE routing_telemetry ADD COLUMN ${col} TEXT`);
+ }
+ }
+
logger.info({ dbPath }, "Routing telemetry database initialised");
return true;
} catch (err) {
@@ -163,14 +174,14 @@ function record(data) {
provider, model, routing_method, was_fallback, output_tokens,
latency_ms, status_code, error_type, cost_usd, tool_calls_made,
retry_count, circuit_breaker_state, quality_score, tokens_per_second,
- cost_efficiency
+ cost_efficiency, request_text, response_text
) VALUES (
@request_id, @session_id, @timestamp, @complexity_score, @tier,
@agentic_type, @tool_count, @input_tokens, @message_count, @request_type,
@provider, @model, @routing_method, @was_fallback, @output_tokens,
@latency_ms, @status_code, @error_type, @cost_usd, @tool_calls_made,
@retry_count, @circuit_breaker_state, @quality_score, @tokens_per_second,
- @cost_efficiency
+ @cost_efficiency, @request_text, @response_text
)`
);
if (!insert) return;
@@ -201,6 +212,8 @@ function record(data) {
quality_score: data.quality_score ?? null,
tokens_per_second: data.tokens_per_second ?? null,
cost_efficiency: data.cost_efficiency ?? null,
+ request_text: data.request_text ?? null,
+ response_text: data.response_text ?? null,
});
} catch (err) {
logger.debug({ err: err.message }, "Telemetry record failed");
diff --git a/test/model-registry-cost.test.js b/test/model-registry-cost.test.js
new file mode 100644
index 0000000..d0836cd
--- /dev/null
+++ b/test/model-registry-cost.test.js
@@ -0,0 +1,50 @@
+const assert = require("assert");
+const { describe, it } = require("node:test");
+
+const { getModelRegistrySync } = require("../src/routing/model-registry");
+
+const reg = getModelRegistrySync();
+
+describe("model-registry cost resolution ladder", () => {
+  it("resolves a known model exactly", () => {
+    const { unknown, input, output } = reg.getCost("gpt-5.2-chat");
+    assert.strictEqual(unknown, undefined);
+    assert.ok(input > 0 && output > 0);
+  });
+
+  it("strips a provider prefix to resolve", () => {
+    const { unknown, input } = reg.getCost("databricks-claude-sonnet-4-5");
+    assert.ok(!unknown);
+    assert.ok(input > 0);
+  });
+
+  it("matches a dated/suffixed name via longest-prefix", () => {
+    const baseCost = reg.getCost("gpt-5.2-chat");
+    const datedCost = reg.getCost("gpt-5.2-chat-2026");
+    assert.ok(!datedCost.unknown);
+    assert.strictEqual(datedCost.input, baseCost.input);
+    assert.strictEqual(datedCost.matchedAs, "gpt-5.2-chat");
+  });
+
+  it("returns unknown (not a fabricated price) for a garbage name", () => {
+    const { unknown, resolution } = reg.getCost("totally-made-up-model-xyz");
+    assert.strictEqual(unknown, true);
+    assert.strictEqual(resolution, undefined);
+  });
+
+  it("does not false-match a too-short name", () => {
+    const { unknown } = reg.getCost("xx");
+    assert.strictEqual(unknown, true);
+  });
+
+  it("treats empty/missing model as unknown", () => {
+    for (const missing of ["", null]) assert.strictEqual(reg.getCost(missing).unknown, true);
+  });
+
+  it("never does a bidirectional substring match (the old fuzzy hazard)", () => {
+    // The registry key sits in the middle of this name, not at its start, so
+    // prefix matching must not pick it up.
+    const { unknown } = reg.getCost("my-custom-gpt-5.2-chat-wrapper");
+    assert.strictEqual(unknown, true);
+  });
+});
diff --git a/test/session-affinity.test.js b/test/session-affinity.test.js
new file mode 100644
index 0000000..8533d99
--- /dev/null
+++ b/test/session-affinity.test.js
@@ -0,0 +1,64 @@
+const assert = require("assert");
+const { describe, it, beforeEach } = require("node:test");
+
+const affinity = require("../src/routing/session-affinity");
+
+describe("session-affinity: payloadHasToolHistory", () => {
+  // Wraps a message list in a payload so each case reads "messages in → boolean out".
+  const hasToolHistory = (messages) => affinity.payloadHasToolHistory({ messages });
+
+  it("is false for a plain text conversation", () => {
+    const messages = [{ role: "user", content: "explain this repo" }];
+    assert.strictEqual(hasToolHistory(messages), false);
+  });
+
+  it("is true when an assistant tool_use is present", () => {
+    const toolUse = { type: "tool_use", id: "t1", name: "Read", input: {} };
+    const messages = [
+      { role: "user", content: "read the file" },
+      { role: "assistant", content: [toolUse] },
+    ];
+    assert.strictEqual(hasToolHistory(messages), true);
+  });
+
+  it("is true when a user tool_result is present", () => {
+    const toolResult = { type: "tool_result", tool_use_id: "t1", content: "ok" };
+    const messages = [{ role: "user", content: [toolResult] }];
+    assert.strictEqual(hasToolHistory(messages), true);
+  });
+
+  it("handles missing/!array messages safely", () => {
+    // Payloads without a usable `messages` array are passed directly, not via the wrapper.
+    for (const payload of [{}, null, { messages: "x" }]) {
+      assert.strictEqual(affinity.payloadHasToolHistory(payload), false);
+    }
+  });
+});
+
+describe("session-affinity: pin lifecycle", () => {
+  beforeEach(() => affinity._clear());
+
+  it("returns null when nothing is pinned", () => {
+    assert.strictEqual(affinity.getPinned("s1"), null);
+  });
+
+  it("round-trips a pinned decision", () => {
+    const decision = { provider: "moonshot", model: "moonshot-v1-auto", tier: "COMPLEX" };
+    affinity.setPinned("s1", decision);
+    const pinned = affinity.getPinned("s1");
+    for (const field of Object.keys(decision)) {
+      assert.strictEqual(pinned[field], decision[field]);
+    }
+  });
+
+  it("ignores empty session id or provider", () => {
+    affinity.setPinned("", { provider: "ollama" });
+    affinity.setPinned("s2", { provider: undefined });
+    assert.strictEqual(affinity.getPinned("s2"), null);
+  });
+
+  it("keeps the latest provider for a session", () => {
+    for (const provider of ["ollama", "azure-openai"]) affinity.setPinned("s1", { provider });
+    assert.strictEqual(affinity.getPinned("s1").provider, "azure-openai");
+  });
+});
diff --git a/test/token-reduction.test.js b/test/token-reduction.test.js
new file mode 100644
index 0000000..01363ef
--- /dev/null
+++ b/test/token-reduction.test.js
@@ -0,0 +1,182 @@
+const assert = require("assert");
+const { describe, it } = require("node:test");
+
+const { compressToolResults, getMetrics } = require("../src/context/tool-result-compressor");
+const { detectBypass, buildBypassResponse } = require("../src/orchestrator/bypass");
+const { dedupeTools } = require("../src/context/tool-dedup");
+const { injectCaveman } = require("../src/context/caveman");
+
+// Helper: run the compressor over a single user message holding one tool_result
+// block; returns the content found on that block afterwards plus the compressor result.
+function compressOne(text, tier = "SIMPLE") {
+  const toolResult = { type: "tool_result", tool_use_id: "t1", content: text };
+  const messages = [{ role: "user", content: [toolResult] }];
+  const res = compressToolResults(messages, { tier });
+  return { out: messages[0].content[0].content, res };
+}
+
+describe("RTK filters — grep", () => {
+  it("groups grep matches by file and caps per-file output", () => {
+    const lines = [];
+    for (let i = 1; i <= 30; i++) lines.push(`src/app.js:${i}:const x = ${i};`);
+    for (let i = 1; i <= 5; i++) lines.push(`src/util.js:${i}:helper(${i});`);
+    const { out } = compressOne(lines.join("\n"));
+    assert.ok(out.includes("35 matches in 2F"), `got: ${out.slice(0, 80)}`);
+    assert.ok(out.includes("[file] src/app.js (30)"));
+    assert.ok(out.includes("+20"), "should cap at 10 per file and note the rest");
+    // tee recovery pointer is appended
+    assert.ok(/\[full: tee_/.test(out));
+  });
+
+  it("ignores prose that is not grep output", () => {
+    const text = "This is a normal paragraph.\nNo file:line:content here.\n".repeat(40);
+    const { out } = compressOne(text);
+    // grep should not fire; dedup_log collapses the repeated lines instead — but
+    // the point is the result is still valid text, not a grep summary.
+    assert.ok(!out.includes("matches in"));
+  });
+});
+
+describe("RTK filters — dedup log", () => {
+  it("collapses consecutive duplicate lines", () => {
+    const text = "starting\n" + "retrying connection...\n".repeat(200) + "done\n";
+    const { out } = compressOne(text);
+    assert.ok(out.includes("duplicate lines"), `got: ${out.slice(0, 120)}`);
+    assert.ok(out.length < text.length * 0.7);
+  });
+});
+
+describe("RTK filters — smart truncate", () => {
+  it("keeps head and tail of very long unmatched output", () => {
+    const lines = [];
+    for (let i = 0; i < 400; i++) lines.push(`unique log line number ${i} ${Math.random()}`);
+    const { out } = compressOne(lines.join("\n"));
+    assert.ok(out.includes("lines truncated"), `got tail: ${out.slice(-80)}`);
+    assert.ok(out.includes("unique log line number 0"));
+    assert.ok(out.includes("unique log line number 399"));
+  });
+});
+
+describe("request bypass", () => {
+  const cliHeaders = { "user-agent": "claude-cli/1.0.0" };
+
+  it("bypasses Warmup pings from the Claude CLI", () => {
+    const b = detectBypass({
+      payload: { messages: [{ role: "user", content: "Warmup" }] },
+      headers: cliHeaders,
+    });
+    assert.ok(b, "expected bypass");
+    assert.strictEqual(b.kind, "warmup");
+  });
+
+  it("synthesizes a title for topic-extraction requests", () => {
+    const b = detectBypass({
+      payload: {
+        system: "Analyze if this is a new topic. Respond with isNewTopic and title.",
+        messages: [{ role: "user", content: "refactor the auth middleware please" }],
+      },
+      headers: cliHeaders,
+    });
+    assert.ok(b);
+    assert.strictEqual(b.kind, "title_extraction");
+    const parsed = JSON.parse(b.text);
+    assert.strictEqual(parsed.isNewTopic, true);
+    assert.strictEqual(parsed.title, "refactor the auth");
+  });
+
+  it("handles the '{' title-prefill pattern", () => {
+    const b = detectBypass({
+      payload: {
+        messages: [
+          { role: "user", content: "hi" },
+          { role: "assistant", content: [{ type: "text", text: "{" }] },
+        ],
+      },
+      headers: cliHeaders,
+    });
+    assert.ok(b);
+    assert.strictEqual(b.kind, "title_prefill");
+  });
+
+  it("does NOT bypass non-CLI clients", () => {
+    const b = detectBypass({
+      payload: { messages: [{ role: "user", content: "Warmup" }] },
+      headers: { "user-agent": "cursor/0.4" },
+    });
+    assert.strictEqual(b, null);
+  });
+
+  it("does NOT bypass a real coding question from the CLI", () => {
+    const b = detectBypass({
+      payload: { messages: [{ role: "user", content: "write a binary search in python" }] },
+      headers: cliHeaders,
+    });
+    assert.strictEqual(b, null);
+  });
+
+  it("builds a valid Anthropic message response", () => {
+    const r = buildBypassResponse({ kind: "warmup", text: "OK" }, "claude-x");
+    assert.strictEqual(r.status, 200);
+    assert.strictEqual(r.body.type, "message");
+    assert.strictEqual(r.body.content[0].text, "OK");
+    assert.strictEqual(r.body.model, "claude-x");
+    assert.strictEqual(r.terminationReason, "bypass_warmup");
+  });
+});
+
+describe("MCP-aware tool dedup", () => {
+  // Builders for the two tool-definition shapes exercised below, plus a name lookup.
+  const anthropicTool = (name) => ({ name });
+  const openAiTool = (name) => ({ type: "function", function: { name } });
+  const hasTool = (list, name) => list.some((t) => t.name === name);
+
+  it("strips built-in web tools when Exa MCP is present", () => {
+    const names = ["mcp__exa__web_search_exa", "WebSearch", "WebFetch", "Read"];
+    const { tools: out, stripped } = dedupeTools(names.map(anthropicTool));
+    // sort() so the assertion does not depend on the order tools were stripped in.
+    assert.deepStrictEqual(stripped.sort(), ["WebFetch", "WebSearch"]);
+    assert.ok(hasTool(out, "mcp__exa__web_search_exa"));
+    assert.ok(hasTool(out, "Read"));
+    assert.ok(!hasTool(out, "WebSearch"));
+  });
+
+  it("is a no-op when no trigger MCP tool is present", () => {
+    // Without an MCP trigger tool nothing should be removed.
+    const input = ["WebSearch", "Read"].map(anthropicTool);
+    const result = dedupeTools(input);
+    assert.deepStrictEqual(result.stripped, []);
+    assert.strictEqual(result.tools.length, 2);
+  });
+
+  it("supports OpenAI-shaped tool definitions", () => {
+    // OpenAI-shaped definitions nest the name under `function`.
+    const input = ["mcp__tavily__tavily_search", "WebFetch"].map(openAiTool);
+    const { stripped } = dedupeTools(input);
+    assert.deepStrictEqual(stripped, ["WebFetch"]);
+  });
+});
+
+describe("caveman injector", () => {
+  // Fresh options object per call: enabled, plus any extra fields.
+  const enabledWith = (extra = {}) => ({ enabled: true, ...extra });
+
+  it("is a no-op when disabled", () => {
+    const sys = "You are a helpful assistant.";
+    assert.strictEqual(injectCaveman(sys, { enabled: false }), sys);
+  });
+
+  it("appends a brevity instruction when enabled", () => {
+    const out = injectCaveman("base prompt", enabledWith({ level: "lite" }));
+    assert.ok(out.startsWith("base prompt"));
+    for (const marker of ["[brevity]", "terse"]) assert.ok(out.includes(marker));
+  });
+
+  it("is idempotent (no double injection)", () => {
+    const once = injectCaveman("base", enabledWith());
+    assert.strictEqual(once, injectCaveman(once, enabledWith()));
+  });
+
+  it("falls back to lite for an unknown level", () => {
+    assert.ok(injectCaveman("", enabledWith({ level: "bogus" })).includes("[brevity]"));
+  });
+});