From aec13579dc2f9b786ad35308d7ce30a3151bf94e Mon Sep 17 00:00:00 2001 From: leihua Date: Fri, 12 Jun 2026 20:44:57 +0800 Subject: [PATCH 1/6] feat: add Erlang language support (`.erl`, `.hrl`) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire the WhatsApp tree-sitter-erlang grammar into the extraction pipeline with a custom visitNode extractor that handles Erlang's fun_decl/function_clause AST structure. Extracts modules, functions, records, type/opaque declarations, macros, imports/includes, exports, and call edges. Benchmarked on poolboy, cowboy, and EMQX — codegraph reduces file reads to zero across all three. --- .claude/skills/agent-eval/corpus.json | 5 + CHANGELOG.md | 1 + README.md | 3 +- __tests__/extraction.test.ts | 136 ++++++++- src/extraction/grammars.ts | 6 +- src/extraction/languages/erlang.ts | 293 ++++++++++++++++++++ src/extraction/languages/index.ts | 2 + src/extraction/wasm/tree-sitter-erlang.wasm | Bin 0 -> 421639 bytes src/types.ts | 1 + 9 files changed, 444 insertions(+), 3 deletions(-) create mode 100644 src/extraction/languages/erlang.ts create mode 100755 src/extraction/wasm/tree-sitter-erlang.wasm diff --git a/.claude/skills/agent-eval/corpus.json b/.claude/skills/agent-eval/corpus.json index 2cfedac4f..a9c92c9be 100644 --- a/.claude/skills/agent-eval/corpus.json +++ b/.claude/skills/agent-eval/corpus.json @@ -94,5 +94,10 @@ { "name": "react-native-segmented-control", "repo": "https://github.com/react-native-segmented-control/segmented-control", "size": "Small", "files": "~25", "question": "How does JSX `` reach the native onChange handler on iOS/Android?" }, { "name": "react-native-screens", "repo": "https://github.com/software-mansion/react-native-screens", "size": "Medium", "files": "~1200", "question": "How does JSX `` reach the native RNSScreenStackView component?" 
}, { "name": "react-native-skia", "repo": "https://github.com/Shopify/react-native-skia", "size": "Large", "files": "~1000", "question": "How does a `` JSX usage reach the iOS / Android native renderer?" } + ], + "Erlang": [ + { "name": "poolboy", "repo": "https://github.com/devinus/poolboy", "size": "Small", "files": "~15", "question": "How does poolboy manage worker checkout, checkin, and overflow?" }, + { "name": "cowboy", "repo": "https://github.com/ninenines/cowboy", "size": "Medium", "files": "~100", "question": "How does Cowboy route an HTTP request to its handler?" }, + { "name": "emqx", "repo": "https://github.com/emqx/emqx", "size": "Large", "files": "~1500", "question": "How does EMQX handle an MQTT CONNECT packet from a client?" } ] } diff --git a/CHANGELOG.md b/CHANGELOG.md index fbf4c5d4e..874468af4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ### New Features +- **CodeGraph now indexes Erlang** (`.erl`, `.hrl`) — modules, functions (including multi-clause), records, type/opaque declarations, macros, imports/includes, exports, and call edges. Benchmarked on poolboy, cowboy, and EMQX. - **Subagents and non-MCP agents can now reach CodeGraph.** Two new CLI commands — `codegraph explore ""` and `codegraph node ` — print exactly what the matching MCP tools return (relevant symbols' source + call paths; one symbol's source + callers; file reads with line numbers), so any agent with a shell can use the graph. And `codegraph install` now writes a small marker-fenced CodeGraph section into each agent's instructions file (`CLAUDE.md` / `AGENTS.md` / `GEMINI.md`) pointing at both surfaces — that file is what Task-tool subagents actually see, where the MCP server's own guidance only reaches the main agent. 
Measured on a delegated code-exploration task: subagents went from almost never using CodeGraph (~1 in 9 runs) to using it in every run, including runs with zero grep/file-reading fallback. The section is small, survives your own content, upgrades cleanly from the old long block, and `codegraph uninstall` removes it. Thanks @liuyao37511. (#704) - **The MCP tool list is now a focused default of four** — `codegraph_explore`, `codegraph_node`, `codegraph_search`, and `codegraph_callers`. The other four (`codegraph_callees`, `codegraph_impact`, `codegraph_files`, `codegraph_status`) remain fully functional — the CLI and library API are unchanged, and `CODEGRAPH_MCP_TOOLS` re-enables any of them — but they're no longer listed to agents by default: measured agent behavior shows they're never or rarely picked, and the information they carry already arrives inline on the tools agents do use (explore's blast-radius section, node's dependents note, a symbol's own body as its callee list). A leaner list saves context tokens every session and steers agents to the right tool by presence alone. - **CodeGraph now goes quiet instead of failing loudly in unindexed projects.** When an AI agent's session starts in a workspace that has no CodeGraph index, the MCP server now announces itself as inactive with a short note and lists no tools at all — instead of presenting the full toolset and erroring on every call, which taught agents to distrust CodeGraph even where it works. Querying another project that isn't indexed likewise returns clear guidance (use your regular tools for that codebase; the user can run `codegraph init` there to enable CodeGraph) instead of an error, and genuine internal errors now tell the agent to retry once rather than give up on CodeGraph entirely. Indexing stays your decision — agents are told not to run it themselves. 
(#769) diff --git a/README.md b/README.md index bb86a697b..570279608 100644 --- a/README.md +++ b/README.md @@ -225,7 +225,7 @@ CodeGraph cuts **tokens, tool calls, and wall-clock time on every repo** — acr | **Full-Text Search** | Find code by name instantly across your entire codebase, powered by FTS5 | | **Impact Analysis** | Trace callers, callees, and the full impact radius of any symbol before making changes | | **Always Fresh** | File watcher uses native OS events (FSEvents/inotify/ReadDirectoryChangesW) with debounced auto-sync — the graph stays current as you code, zero config | -| **20+ Languages** | TypeScript, JavaScript, Python, Go, Rust, Java, C#, PHP, Ruby, C, C++, Objective-C, Swift, Kotlin, Scala, Dart, Lua, Luau, Svelte, Vue, Astro, Liquid, Pascal/Delphi | +| **20+ Languages** | TypeScript, JavaScript, Python, Go, Rust, Java, C#, PHP, Ruby, C, C++, Objective-C, Swift, Kotlin, Scala, Dart, Lua, Luau, Svelte, Vue, Astro, Liquid, Pascal/Delphi, Erlang | | **Framework-aware Routes** | Recognizes web-framework routing files and links URL patterns to their handlers across 17 frameworks | | **Mixed iOS / React Native / Expo** | Closes cross-language flows that static parsing misses: Swift ↔ ObjC bridging, React Native legacy bridge + TurboModules + Fabric view components, native → JS event emitters, Expo Modules | | **100% Local** | No data leaves your machine. No API keys. No external services. 
SQLite database only | @@ -644,6 +644,7 @@ is written): | Pascal / Delphi | `.pas`, `.dpr`, `.dpk`, `.lpr` | Full support (classes, records, interfaces, enums, DFM/FMX form files) | | Lua | `.lua` | Full support (functions, methods with receivers, local variables, `require` imports, call edges) | | Luau | `.luau` | Full support (everything in Lua, plus `type`/`export type` aliases, typed signatures, and Roblox instance-path `require`) | +| Erlang | `.erl`, `.hrl` | Full support (modules, functions, records, types, macros, imports, call edges) | ## Measured cross-file coverage diff --git a/__tests__/extraction.test.ts b/__tests__/extraction.test.ts index 7f2d13f5f..5a65912a3 100644 --- a/__tests__/extraction.test.ts +++ b/__tests__/extraction.test.ts @@ -101,6 +101,11 @@ describe('Language Detection', () => { expect(detectLanguage('stdio.h', '#ifndef STDIO_H\nvoid printf();\n#endif\n')).toBe('c'); }); + it('should detect Erlang files', () => { + expect(detectLanguage('server.erl')).toBe('erlang'); + expect(detectLanguage('records.hrl')).toBe('erlang'); + }); + it('should return unknown for unsupported extensions', () => { expect(detectLanguage('styles.css')).toBe('unknown'); expect(detectLanguage('data.json')).toBe('unknown'); @@ -2742,9 +2747,138 @@ import 'package:flutter/material.dart'; }); // ============================================================================= -// Pascal / Delphi Extraction +// Erlang Extraction // ============================================================================= +describe('Erlang Extraction', () => { + describe('Language detection', () => { + it('should detect Erlang files', () => { + expect(detectLanguage('server.erl')).toBe('erlang'); + expect(detectLanguage('records.hrl')).toBe('erlang'); + }); + }); + + it('should extract module declarations', () => { + const code = ` +-module(my_server). +-behaviour(gen_server). + +-export([start_link/0, init/1]). 
+ +start_link() -> + gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). + +init([]) -> + {ok, #state{}}. +`; + const result = extractFromSource('my_server.erl', code); + + const moduleNode = result.nodes.find((n) => n.kind === 'module'); + expect(moduleNode).toBeDefined(); + expect(moduleNode?.name).toBe('my_server'); + + const functions = result.nodes.filter((n) => n.kind === 'function'); + expect(functions.length).toBeGreaterThanOrEqual(2); + expect(functions.some((f) => f.name === 'start_link')).toBe(true); + expect(functions.some((f) => f.name === 'init')).toBe(true); + }); + + it('should extract record declarations', () => { + const code = ` +-module(records). +-record(user, { + id :: integer(), + name :: string(), + email :: string() +}). + +get_user(Id) -> + #user{id = Id, name = "test"}. +`; + const result = extractFromSource('records.erl', code); + + const structNode = result.nodes.find((n) => n.kind === 'struct'); + expect(structNode).toBeDefined(); + expect(structNode?.name).toBe('user'); + + const fields = result.nodes.filter((n) => n.kind === 'field'); + expect(fields.length).toBe(3); + expect(fields.map((f) => f.name).sort()).toEqual(['email', 'id', 'name']); + }); + + it('should extract type declarations', () => { + const code = ` +-module(types). +-type user_id() :: integer(). +-type user() :: #{id => user_id(), name => string()}. +-opaque handle() :: term(). + +make_handle() -> + {}. +`; + const result = extractFromSource('types.erl', code); + + const typeNodes = result.nodes.filter((n) => n.kind === 'type_alias'); + expect(typeNodes.length).toBe(3); + expect(typeNodes.map((t) => t.name).sort()).toEqual(['handle', 'user', 'user_id']); + }); + + it('should extract imports and includes', () => { + const code = ` +-module(my_app). +-include_lib("kernel/include/logger.hrl"). +-include("my_app.hrl"). +-import(lists, [map/2, filter/2]). + +main() -> + ok. 
+`; + const result = extractFromSource('my_app.erl', code); + + const imports = result.nodes.filter((n) => n.kind === 'import'); + expect(imports.length).toBeGreaterThanOrEqual(2); + const importNames = imports.map((i) => i.name); + expect(importNames.some((n) => n.includes('kernel'))).toBe(true); + expect(importNames.some((n) => n.includes('lists'))).toBe(true); + }); + + it('should extract macros as constants', () => { + const code = ` +-module(config). +-define(SERVER, ?MODULE). +-define(MAX_RETRIES, 3). + +start() -> + ok. +`; + const result = extractFromSource('config.erl', code); + + const constants = result.nodes.filter((n) => n.kind === 'constant'); + expect(constants.length).toBeGreaterThanOrEqual(2); + expect(constants.some((c) => c.name === 'SERVER')).toBe(true); + expect(constants.some((c) => c.name === 'MAX_RETRIES')).toBe(true); + }); + + it('should extract multi-clause functions', () => { + const code = ` +-module(patterns). +handle_msg({ping, From}, State) -> + From ! {pong, self()}, + {noreply, State}; +handle_msg({stop, Reason}, State) -> + {stop, Reason, State}. 
+`; + const result = extractFromSource('patterns.erl', code); + + const functions = result.nodes.filter((n) => n.kind === 'function'); + // The grammar emits separate fun_decl nodes for each clause + expect(functions.length).toBe(2); + expect(functions.every((f) => f.name === 'handle_msg')).toBe(true); + }); +}); + + + describe('Pascal / Delphi Extraction', () => { describe('Language detection', () => { it('should detect Pascal files', () => { diff --git a/src/extraction/grammars.ts b/src/extraction/grammars.ts index eabdb598e..e2e802efa 100644 --- a/src/extraction/grammars.ts +++ b/src/extraction/grammars.ts @@ -38,6 +38,7 @@ const WASM_GRAMMAR_FILES: Record = { lua: 'tree-sitter-lua.wasm', luau: 'tree-sitter-luau.wasm', objc: 'tree-sitter-objc.wasm', + erlang: 'tree-sitter-erlang.wasm', }; /** @@ -106,6 +107,8 @@ export const EXTENSION_MAP: Record = { '.luau': 'luau', '.m': 'objc', '.mm': 'objc', + '.erl': 'erlang', + '.hrl': 'erlang', // XML: file-level tracking; the MyBatis extractor matches `