diff --git a/docs/plans/2026-06-20-v04a-impl-progress.md b/docs/plans/2026-06-20-v04a-impl-progress.md new file mode 100644 index 0000000..5db3d55 --- /dev/null +++ b/docs/plans/2026-06-20-v04a-impl-progress.md @@ -0,0 +1,71 @@ +# Locode v0.4a — Deterministic Retrieval Core — Implementation Progress + +**Started:** 2026-06-20 +**Branch:** `feat/v04a-deterministic-retrieval` +**Design:** [v0.4 — Codebase Intelligence](2026-03-10-v04-codebase-intelligence.md) +**Scope:** v0.4 phase A only — file index, symbol index, indexer, context retriever, budget manager, `symbol_lookup` tool. No embedding index, no dependency graph (those are v0.4b/v0.4c). + +--- + +## Definition of Done + +- [ ] `index` + `context_retrieval` config sections in schema with defaults synced to `locode.yaml` +- [ ] `FileIndex` scans a repo respecting `.gitignore` + config ignore patterns, records path/lang/size/hash +- [ ] `SymbolIndex` extracts functions/classes/types/interfaces/enums for TypeScript + JavaScript (+ Python) +- [ ] `CodebaseIndexer` orchestrates file + symbol indexes, supports incremental updates via file hashes +- [ ] `BudgetManager` allocates a token budget across files by priority weight +- [ ] `ContextRetriever` pipeline: mentioned files → symbol search → sibling tests → rank → truncate to budget +- [ ] `symbol_lookup` tool registered in the default tool registry +- [ ] `ContextRetriever` optionally wired into `CodingAgent.ANALYZE` (backwards-compatible — no-op when index absent) +- [ ] Index persists to disk and loads on subsequent runs +- [ ] All tests pass, `npm run build` succeeds, lint clean + +--- + +## Implementation Decisions + +### Symbol extraction: regex-based, not tree-sitter (for now) + +The v0.4 design spec calls for `web-tree-sitter` (WASM) for AST-based symbol extraction. This v0.4a slice ships a **regex-based extractor** instead, behind a `SymbolExtractor` interface so `web-tree-sitter` can be swapped in later without changing `SymbolIndex`'s API. 
+ +**Why:** +- Keeps the bundle small (AGENTS.md: don't add deps without considering bundle size). `web-tree-sitter` + per-language `.wasm` grammars add ~2-5 MB of static assets. +- Regex extraction is fully testable without loading WASM, and covers the common cases (top-level functions, classes, interfaces, exported symbols) that the agent needs for `symbol_lookup`. +- The `SymbolExtractor` interface means the tree-sitter adapter is a drop-in replacement later — no API churn. + +**Trade-off:** regex extraction misses nested scopes, overloaded signatures, and some edge cases. Acceptable for v0.4a's "fast path first" goal; the LLM-driven ANALYZE fallback still covers complex cases. + +### `find_references` tool deferred to v0.4b + +The design spec lists `find_references` as a v0.4 tool, but it depends on the dependency graph (import tracking) which is v0.4b. Shipping `symbol_lookup` only in v0.4a. + +--- + +## Findings / Things to Improve + +Observed while implementing v0.4a. Not blocking; captured for future work. + +### 1. Config defaults: `.default({})` does not recursively apply inner defaults in Zod + +`SomeSchema.default({})` sets the default to a literal `{}`, NOT the schema-parsed result. Inner field defaults are NOT applied. The fix is `.default(SomeSchema.parse({}))` (or reuse a pre-parsed `DEFAULT_X` constant). + +**Impact:** Any future config section added with `.default({})` will silently produce empty objects instead of defaulted ones. The existing `runtime`, `performance`, and `agent` sections all pass explicit default objects, which masks this. Consider a helper or a lint rule. + +**Location:** `src/config/schema.ts` — `index` and `context_retrieval` now use `DEFAULT_INDEX_CONFIG` / `DEFAULT_CONTEXT_RETRIEVAL_CONFIG`. + +### 2. `CONFIG_TEMPLATE` in `src/cli/setup.ts` is a third source of truth (known) + +Already noted in `docs/plans/misc-todos.md` under "Config template duplication". 
The template is intentionally minimal (only required fields), so new defaulted sections like `index` and `context_retrieval` don't need to be added — schema defaults cover them. But the duplication risk remains for any future *required* config field. The proposed fix (single `DEFAULT_CONFIG` constant) would resolve this. + +### 3. `CodingAgent.analyze` fast-path is ripe for `ContextRetriever` integration + +The existing `analyze()` in `src/coding/coding-agent.ts:219` already does mentioned-file extraction + sibling-test discovery + LLM fallback — essentially a hand-rolled mini-retriever. v0.4a's `ContextRetriever` generalizes this. Once the retriever is wired in, the agent's `extractMentionedFiles` / `findLikelyTestFiles` / `pushBudgetedFile` logic could be delegated to it, reducing duplication. Deferred to avoid changing working behavior in this slice. + +--- + +## Verification Status + +- `npm test` passes — 398 tests (60 new for v0.4a) +- `npm run build` passes +- `npm run lint` passes + diff --git a/locode.yaml b/locode.yaml index 601964b..83b03da 100644 --- a/locode.yaml +++ b/locode.yaml @@ -64,6 +64,32 @@ performance: max_prompt_chars: 24000 lazy_semantic_search: true +index: + enabled: true + ignore: + - node_modules + - dist + - .git + - coverage + - "*.min.js" + - "*.lock" + languages: + - typescript + - javascript + - python + - go + - rust + chunk_size: 50 + storage_dir: .locode/index + auto_update: true + +context_retrieval: + max_files: 5 + max_tokens_per_file: 2000 + max_total_tokens: 8000 + strategy: deterministic-first + confidence_threshold: 0.7 + # mcp_servers: # linear: # type: remote diff --git a/src/coding/coding-agent.ts b/src/coding/coding-agent.ts index ee0765c..225da37 100644 --- a/src/coding/coding-agent.ts +++ b/src/coding/coding-agent.ts @@ -19,6 +19,7 @@ import type { Planner } from './planner' import type { AgentMemory } from './memory' import type { PerformanceConfig } from '../config/schema' import { PersistentContextCache } from 
'../runtime/persistent-context-cache' +import type { ContextRetriever } from '../context/context-retriever' interface LLMAgent { run(prompt: string, previousSummary?: string, repoContext?: string): Promise @@ -80,6 +81,7 @@ export class CodingAgent extends EventEmitter { private config: AgentConfig, private performance?: PerformanceConfig, private persistentCache: PersistentContextCache | null = null, + private contextRetriever: ContextRetriever | null = null, ) { super() } @@ -243,6 +245,34 @@ export class CodingAgent extends EventEmitter { } } + if (this.contextRetriever) { + const retrieved = await this.contextRetriever.retrieve(prompt) + if (retrieved.confidence >= (this.performance?.lazy_semantic_search !== false ? 0.7 : 0.5)) { + const gathered = this.applyPromptBudgetToGatheredContext( + { + files: retrieved.files, + searchResults: retrieved.searchResults, + memory: retrieved.memory, + }, + promptBudget, + ) + if (this.performance?.cache_context) { + this.contextCache.set(cacheKey, gathered) + if (this.persistentCache) { + await this.persistentCache.set(prompt, gathered) + } + } + for (const file of retrieved.files) { + this.memory.record({ type: 'file_read', detail: file.path }) + } + return { + gathered, + tokensUsed: { input: 0, output: 0 }, + toolCalls: [], + } + } + } + const files: GatheredContext['files'] = [] const searchResults: GatheredContext['searchResults'] = [] diff --git a/src/config/schema.test.ts b/src/config/schema.test.ts index 214d0dd..8cd59c5 100644 --- a/src/config/schema.test.ts +++ b/src/config/schema.test.ts @@ -143,4 +143,88 @@ describe('ConfigSchema', () => { expect(result.performance.max_prompt_chars).toBe(24000) expect(result.performance.lazy_semantic_search).toBe(true) }) + + it('defaults index config when omitted', () => { + const result = ConfigSchema.parse(baseConfig) + expect(result.index.enabled).toBe(true) + expect(result.index.ignore).toContain('node_modules') + expect(result.index.ignore).toContain('dist') + 
expect(result.index.ignore).toContain('.git') + expect(result.index.languages).toContain('typescript') + expect(result.index.languages).toContain('javascript') + expect(result.index.chunk_size).toBe(50) + expect(result.index.storage_dir).toBe('.locode/index') + expect(result.index.auto_update).toBe(true) + }) + + it('accepts custom index config', () => { + const result = ConfigSchema.parse({ + ...baseConfig, + index: { + enabled: false, + ignore: ['build', 'vendor'], + languages: ['typescript', 'go'], + chunk_size: 100, + storage_dir: '.cache/index', + auto_update: false, + }, + }) + expect(result.index.enabled).toBe(false) + expect(result.index.ignore).toEqual(['build', 'vendor']) + expect(result.index.languages).toEqual(['typescript', 'go']) + expect(result.index.chunk_size).toBe(100) + expect(result.index.storage_dir).toBe('.cache/index') + expect(result.index.auto_update).toBe(false) + }) + + it('rejects non-positive chunk_size', () => { + expect(() => ConfigSchema.parse({ + ...baseConfig, + index: { chunk_size: 0 }, + })).toThrow() + }) + + it('defaults context_retrieval config when omitted', () => { + const result = ConfigSchema.parse(baseConfig) + expect(result.context_retrieval.max_files).toBe(5) + expect(result.context_retrieval.max_tokens_per_file).toBe(2000) + expect(result.context_retrieval.max_total_tokens).toBe(8000) + expect(result.context_retrieval.strategy).toBe('deterministic-first') + expect(result.context_retrieval.confidence_threshold).toBe(0.7) + }) + + it('accepts custom context_retrieval config', () => { + const result = ConfigSchema.parse({ + ...baseConfig, + context_retrieval: { + max_files: 10, + max_tokens_per_file: 4000, + max_total_tokens: 16000, + strategy: 'semantic-first', + confidence_threshold: 0.85, + }, + }) + expect(result.context_retrieval.max_files).toBe(10) + expect(result.context_retrieval.max_total_tokens).toBe(16000) + expect(result.context_retrieval.strategy).toBe('semantic-first') + 
expect(result.context_retrieval.confidence_threshold).toBe(0.85) + }) + + it('rejects context_retrieval confidence_threshold out of range', () => { + expect(() => ConfigSchema.parse({ + ...baseConfig, + context_retrieval: { confidence_threshold: 1.5 }, + })).toThrow() + expect(() => ConfigSchema.parse({ + ...baseConfig, + context_retrieval: { confidence_threshold: -0.1 }, + })).toThrow() + }) + + it('rejects invalid context_retrieval strategy', () => { + expect(() => ConfigSchema.parse({ + ...baseConfig, + context_retrieval: { strategy: 'random' }, + })).toThrow() + }) }) diff --git a/src/config/schema.ts b/src/config/schema.ts index 1dccc6e..d48269f 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -44,8 +44,31 @@ export const PerformanceConfigSchema = z.object({ lazy_semantic_search: z.boolean().default(true), }) +export const IndexConfigSchema = z.object({ + enabled: z.boolean().default(true), + ignore: z.array(z.string()).default([ + 'node_modules', 'dist', '.git', 'coverage', '*.min.js', '*.lock', + ]), + languages: z.array(z.string()).default([ + 'typescript', 'javascript', 'python', 'go', 'rust', + ]), + chunk_size: z.number().int().positive().default(50), + storage_dir: z.string().default('.locode/index'), + auto_update: z.boolean().default(true), +}) + +export const ContextRetrievalConfigSchema = z.object({ + max_files: z.number().int().min(1).default(5), + max_tokens_per_file: z.number().int().positive().default(2000), + max_total_tokens: z.number().int().positive().default(8000), + strategy: z.enum(['deterministic-first', 'semantic-first']).default('deterministic-first'), + confidence_threshold: z.number().min(0).max(1).default(0.7), +}) + export const DEFAULT_RUNTIME_CONFIG = RuntimeConfigSchema.parse({}) export const DEFAULT_PERFORMANCE_CONFIG = PerformanceConfigSchema.parse({}) +export const DEFAULT_INDEX_CONFIG = IndexConfigSchema.parse({}) +export const DEFAULT_CONTEXT_RETRIEVAL_CONFIG = ContextRetrievalConfigSchema.parse({}) export 
const ConfigSchema = z.object({ local_llm: z.object({ @@ -95,6 +118,8 @@ export const ConfigSchema = z.object({ max_prompt_chars: 24000, lazy_semantic_search: true, }), + index: IndexConfigSchema.default(DEFAULT_INDEX_CONFIG), + context_retrieval: ContextRetrievalConfigSchema.default(DEFAULT_CONTEXT_RETRIEVAL_CONFIG), mcp_servers: z.record(z.string(), McpServerSchema).default({}), safety: z.object({ always_confirm: z.array(z.string()).default([]), @@ -111,3 +136,5 @@ export const ConfigSchema = z.object({ export type Config = z.infer export type PerformanceConfig = z.infer +export type IndexConfig = z.infer +export type ContextRetrievalConfig = z.infer diff --git a/src/context/budget-manager.test.ts b/src/context/budget-manager.test.ts new file mode 100644 index 0000000..e42e60b --- /dev/null +++ b/src/context/budget-manager.test.ts @@ -0,0 +1,85 @@ +import { describe, it, expect } from 'vitest' +import { BudgetManager } from './budget-manager' +import type { BudgetPriority } from './types' + +describe('BudgetManager', () => { + it('allocates full budget to a single file', () => { + const mgr = new BudgetManager(1000) + const result = mgr.allocate([ + { path: 'a.ts', content: 'x'.repeat(500), priority: 'direct_match' as BudgetPriority }, + ]) + expect(result).toHaveLength(1) + expect(result[0].tokensUsed).toBe(500) + expect(result[0].truncated).toBe(false) + }) + + it('truncates a file that exceeds max_tokens_per_file', () => { + const mgr = new BudgetManager(10000, { maxPerFile: 200 }) + const result = mgr.allocate([ + { path: 'a.ts', content: 'x'.repeat(500), priority: 'direct_match' as BudgetPriority }, + ]) + expect(result[0].tokensUsed).toBe(200) + expect(result[0].truncated).toBe(true) + expect(result[0].content).toHaveLength(200) + }) + + it('gives higher priority files more budget', () => { + const mgr = new BudgetManager(600, { maxPerFile: 1000 }) + const result = mgr.allocate([ + { path: 'low.ts', content: 'x'.repeat(400), priority: 'dependency' as 
BudgetPriority }, + { path: 'high.ts', content: 'x'.repeat(400), priority: 'direct_match' as BudgetPriority }, + ]) + const highFile = result.find(r => r.path === 'high.ts')! + const lowFile = result.find(r => r.path === 'low.ts')! + expect(highFile.tokensUsed).toBeGreaterThan(lowFile.tokensUsed) + }) + + it('does not exceed total budget', () => { + const mgr = new BudgetManager(300, { maxPerFile: 1000 }) + const result = mgr.allocate([ + { path: 'a.ts', content: 'x'.repeat(200), priority: 'direct_match' as BudgetPriority }, + { path: 'b.ts', content: 'x'.repeat(200), priority: 'symbol_match' as BudgetPriority }, + ]) + const total = result.reduce((sum, r) => sum + r.tokensUsed, 0) + expect(total).toBeLessThanOrEqual(300) + }) + + it('returns empty array for empty input', () => { + const mgr = new BudgetManager(1000) + expect(mgr.allocate([])).toEqual([]) + }) + + it('marks files as truncated when total budget is exhausted', () => { + const mgr = new BudgetManager(100, { maxPerFile: 1000 }) + const result = mgr.allocate([ + { path: 'a.ts', content: 'x'.repeat(80), priority: 'direct_match' as BudgetPriority }, + { path: 'b.ts', content: 'x'.repeat(80), priority: 'direct_match' as BudgetPriority }, + ]) + const total = result.reduce((sum, r) => sum + r.tokensUsed, 0) + expect(total).toBe(100) + const truncated = result.filter(r => r.truncated) + expect(truncated.length).toBeGreaterThan(0) + }) + + it('respects max_files limit', () => { + const mgr = new BudgetManager(10000, { maxPerFile: 1000, maxFiles: 2 }) + const result = mgr.allocate([ + { path: 'a.ts', content: 'x', priority: 'direct_match' as BudgetPriority }, + { path: 'b.ts', content: 'x', priority: 'direct_match' as BudgetPriority }, + { path: 'c.ts', content: 'x', priority: 'direct_match' as BudgetPriority }, + ]) + expect(result).toHaveLength(2) + }) + + it('sorts output by priority (direct_match first)', () => { + const mgr = new BudgetManager(10000, { maxPerFile: 1000 }) + const result = mgr.allocate([ + 
import type { BudgetPriority, BudgetedFile } from './types'

/**
 * Relative weight of each priority class. A larger weight sorts a file
 * earlier in the output and earns it a larger share of the budget.
 */
const PRIORITY_WEIGHTS: Record<BudgetPriority, number> = {
  direct_match: 1.0,
  symbol_match: 0.8,
  semantic_match: 0.6,
  dependency: 0.4,
  git_context: 0.3,
}

/** Optional caps applied on top of the total budget. */
export interface BudgetManagerOptions {
  /** Upper bound for a single file. Defaults to the total budget. */
  maxPerFile?: number
  /** Maximum number of files kept after priority sorting. Defaults to unlimited. */
  maxFiles?: number
}

/**
 * Splits a fixed budget across candidate files according to their priority.
 *
 * Note: the budget is counted in characters of `content` (one "token" per
 * character); no tokenizer is involved.
 */
export class BudgetManager {
  private readonly maxPerFile: number
  private readonly maxFiles: number

  /**
   * @param totalTokens Total budget shared by all files.
   * @param opts Optional per-file and file-count caps.
   */
  constructor(
    private readonly totalTokens: number,
    opts: BudgetManagerOptions = {},
  ) {
    this.maxPerFile = opts.maxPerFile ?? totalTokens
    this.maxFiles = opts.maxFiles ?? Infinity
  }

  /**
   * Ranks `files` by priority (stable for equal priorities), keeps at most
   * `maxFiles` of them and assigns each a slice of the budget.
   *
   * Each file's share is proportional to its weight among the files that are
   * still waiting, applied to the budget that is still unspent. Budget a file
   * does not need (short content, or capped by `maxPerFile`) therefore flows
   * on to the files after it instead of being lost.
   *
   * @param files Candidates to budget; the input array is not mutated.
   * @returns One entry per kept file, highest priority first. `truncated` is
   *   true only when part of the file's content was actually cut off.
   */
  allocate(
    files: Array<{ path: string; content: string; priority: BudgetPriority }>,
  ): BudgetedFile[] {
    if (files.length === 0) return []

    const ranked = [...files]
      .sort((a, b) => PRIORITY_WEIGHTS[b.priority] - PRIORITY_WEIGHTS[a.priority])
      .slice(0, Math.max(0, this.maxFiles))

    let remainingWeight = ranked.reduce(
      (sum, f) => sum + PRIORITY_WEIGHTS[f.priority], 0,
    )
    // Negative or NaN budgets behave like an empty budget.
    let remaining = Math.max(0, this.totalTokens) || 0

    const result: BudgetedFile[] = []

    for (const [i, file] of ranked.entries()) {
      const weight = PRIORITY_WEIGHTS[file.priority]
      // The last file may take everything that is left; this also avoids
      // losing a character to floating-point drift in `remainingWeight`.
      const isLast = i === ranked.length - 1
      const share = isLast
        ? remaining
        : Math.floor((weight / remainingWeight) * remaining)
      // Clamp to a non-negative integer so `slice` never counts from the end
      // of the string and `tokensUsed` always equals `content.length`.
      const fileBudget = Math.max(
        0,
        Math.floor(Math.min(share, this.maxPerFile, remaining)),
      )
      const usedChars = Math.min(file.content.length, fileBudget)

      result.push({
        path: file.path,
        content: file.content.slice(0, usedChars),
        tokensUsed: usedChars,
        truncated: usedChars < file.content.length,
      })

      remaining -= usedChars
      remainingWeight -= weight
    }

    return result
  }
}
config: RetrievalConfig + let indexConfig: IndexConfig + + beforeEach(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'locode-retriever-')) + indexConfig = { + root: tmpDir, + ignore: ['node_modules', 'dist', '.git', 'coverage'], + languages: ['typescript', 'javascript'], + storage_dir: path.join(tmpDir, '.locode', 'index'), + auto_update: true, + } + config = { + max_files: 5, + max_tokens_per_file: 2000, + max_total_tokens: 8000, + strategy: 'deterministic-first', + confidence_threshold: 0.7, + } + indexer = new CodebaseIndexer(indexConfig) + }) + + afterEach(() => { + fs.rmSync(tmpDir, { recursive: true, force: true }) + }) + + function writeFile(rel: string, content: string): void { + const full = path.join(tmpDir, rel) + fs.mkdirSync(path.dirname(full), { recursive: true }) + fs.writeFileSync(full, content, 'utf8') + } + + async function buildIndex(): Promise { + await indexer.buildAll() + } + + it('resolves files mentioned by name in the prompt', async () => { + writeFile('src/router.ts', 'export function route() { return "local" }\n') + await buildIndex() + + const retriever = new ContextRetriever(indexer, config, { root: tmpDir, memory: EMPTY_MEMORY }) + const result = await retriever.retrieve('fix the bug in router.ts') + + expect(result.strategyUsed).toContain('mentioned-path') + expect(result.files.some(f => f.path === 'src/router.ts')).toBe(true) + expect(result.confidence).toBeGreaterThanOrEqual(0.7) + }) + + it('finds symbols mentioned in the prompt', async () => { + writeFile('src/orchestrator.ts', 'export function processTask() { return null }\n') + await buildIndex() + + const retriever = new ContextRetriever(indexer, config, { root: tmpDir, memory: EMPTY_MEMORY }) + const result = await retriever.retrieve('how does processTask work?') + + expect(result.strategyUsed).toContain('symbol-index') + expect(result.files.some(f => f.path === 'src/orchestrator.ts')).toBe(true) + }) + + it('discovers sibling test files for mentioned source files', 
async () => { + writeFile('src/utils.ts', 'export function helper() { return 1 }\n') + writeFile('src/utils.test.ts', 'import { helper } from "./utils"\n') + await buildIndex() + + const retriever = new ContextRetriever(indexer, config, { root: tmpDir, memory: EMPTY_MEMORY }) + const result = await retriever.retrieve('update utils.ts to handle edge cases') + + expect(result.strategyUsed).toContain('test-discovery') + expect(result.files.some(f => f.path === 'src/utils.test.ts')).toBe(true) + }) + + it('includes recent files from memory', async () => { + writeFile('src/recent.ts', 'export const x = 1\n') + await buildIndex() + + const memory: MemorySnapshot = { + ...EMPTY_MEMORY, + recentFiles: ['src/recent.ts'], + } + const retriever = new ContextRetriever(indexer, config, { root: tmpDir, memory }) + const result = await retriever.retrieve('continue working on this') + + expect(result.files.some(f => f.path === 'src/recent.ts')).toBe(true) + }) + + it('respects max_files limit', async () => { + writeFile('a.ts', 'export const a = 1\n') + writeFile('b.ts', 'export const b = 2\n') + writeFile('c.ts', 'export const c = 3\n') + await buildIndex() + + config.max_files = 2 + const retriever = new ContextRetriever(indexer, config, { root: tmpDir, memory: EMPTY_MEMORY }) + const result = await retriever.retrieve('look at a.ts b.ts c.ts') + + expect(result.files.length).toBeLessThanOrEqual(2) + }) + + it('respects max_total_tokens budget', async () => { + writeFile('big.ts', 'x'.repeat(5000)) + writeFile('big2.ts', 'y'.repeat(5000)) + await buildIndex() + + config.max_total_tokens = 100 + config.max_files = 5 + const retriever = new ContextRetriever(indexer, config, { root: tmpDir, memory: EMPTY_MEMORY }) + const result = await retriever.retrieve('look at big.ts and big2.ts') + + const totalChars = result.files.reduce((sum, f) => sum + f.content.length, 0) + expect(totalChars).toBeLessThanOrEqual(100) + }) + + it('returns low confidence when no files are found', async () => 
{ + writeFile('unrelated.ts', 'export const z = 0\n') + await buildIndex() + + const retriever = new ContextRetriever(indexer, config, { root: tmpDir, memory: EMPTY_MEMORY }) + const result = await retriever.retrieve('something completely unrelated') + + expect(result.confidence).toBeLessThan(config.confidence_threshold) + }) + + it('deduplicates files found by multiple strategies', async () => { + writeFile('src/foo.ts', 'export function foo() {}\n') + writeFile('src/foo.test.ts', 'import { foo } from "./foo"\n') + await buildIndex() + + const retriever = new ContextRetriever(indexer, config, { root: tmpDir, memory: EMPTY_MEMORY }) + const result = await retriever.retrieve('fix foo in foo.ts') + + const fooCount = result.files.filter(f => f.path === 'src/foo.ts').length + expect(fooCount).toBe(1) + }) + + it('includes search results from symbol lookup', async () => { + writeFile('src/api.ts', 'export function handleRequest() { return 200 }\n') + await buildIndex() + + const retriever = new ContextRetriever(indexer, config, { root: tmpDir, memory: EMPTY_MEMORY }) + const result = await retriever.retrieve('where is handleRequest defined?') + + expect(result.searchResults.length).toBeGreaterThan(0) + expect(result.searchResults.some(s => s.match.includes('handleRequest'))).toBe(true) + }) + + it('returns empty context for empty repo', async () => { + await buildIndex() + const retriever = new ContextRetriever(indexer, config, { root: tmpDir, memory: EMPTY_MEMORY }) + const result = await retriever.retrieve('fix the bug in main.ts') + + expect(result.files).toHaveLength(0) + expect(result.confidence).toBeLessThan(config.confidence_threshold) + }) + + it('returns memory snapshot in gathered context', async () => { + writeFile('a.ts', 'export const a = 1\n') + await buildIndex() + + const memory: MemorySnapshot = { + ...EMPTY_MEMORY, + recentFiles: ['a.ts'], + sessionStart: 12345, + } + const retriever = new ContextRetriever(indexer, config, { root: tmpDir, memory }) + 
const result = await retriever.retrieve('look at a.ts') + + expect(result.memory).toEqual(memory) + }) + + it('marks relevance reason for each file', async () => { + writeFile('src/router.ts', 'export function route() {}\n') + writeFile('src/router.test.ts', 'import { route } from "./router"\n') + await buildIndex() + + const retriever = new ContextRetriever(indexer, config, { root: tmpDir, memory: EMPTY_MEMORY }) + const result = await retriever.retrieve('fix bug in router.ts') + + const router = result.files.find(f => f.path === 'src/router.ts') + expect(router).toBeDefined() + expect(router!.relevance).toBeTruthy() + }) +}) diff --git a/src/context/context-retriever.ts b/src/context/context-retriever.ts new file mode 100644 index 0000000..d8e5fd3 --- /dev/null +++ b/src/context/context-retriever.ts @@ -0,0 +1,302 @@ +import fs from 'fs' +import path from 'path' +import type { CodebaseIndexer } from '../index/indexer' +import type { RetrievalConfig, RetrievedContext, ContextSource, BudgetPriority, MemorySnapshot, BudgetedFile } from './types' +import type { GatheredContext } from '../coding/types' +import { BudgetManager } from './budget-manager' + +const FILE_EXTENSION_PATTERN = /\b([\w./-]+\.(?:ts|js|tsx|jsx|py|rs|go|java|json|yaml|yml|md|css|html|sh))\b/g + +export interface ContextRetrieverOptions { + root: string + memory: MemorySnapshot +} + +interface CandidateFile { + path: string + content: string + sources: ContextSource[] + priority: BudgetPriority +} + +export class ContextRetriever { + constructor( + private indexer: CodebaseIndexer, + private config: RetrievalConfig, + private opts: ContextRetrieverOptions, + ) {} + + async retrieve(query: string): Promise { + const strategiesUsed: ContextSource[] = [] + const candidates: Map = new Map() + + const addCandidate = (file: string, source: ContextSource, priority: BudgetPriority, content?: string) => { + const existing = candidates.get(file) + if (existing) { + if (!existing.sources.includes(source)) { + 
existing.sources.push(source) + } + existing.priority = this.higherPriority(existing.priority, priority) + if (!existing.content && content) { + existing.content = content + } + } else { + candidates.set(file, { + path: file, + content: content ?? '', + sources: [source], + priority, + }) + } + } + + const mentionedFiles = this.extractMentionedFiles(query) + for (const file of mentionedFiles) { + const content = this.readFileContent(file) + if (content !== null) { + addCandidate(file, 'mentioned-path', 'direct_match', content) + if (!strategiesUsed.includes('mentioned-path')) { + strategiesUsed.push('mentioned-path') + } + } + } + + for (const file of mentionedFiles) { + for (const testFile of this.findSiblingTests(file)) { + const content = this.readFileContent(testFile) + if (content !== null) { + addCandidate(testFile, 'test-discovery', 'symbol_match', content) + if (!strategiesUsed.includes('test-discovery')) { + strategiesUsed.push('test-discovery') + } + } + } + } + + const symbolResults = this.searchSymbols(query) + for (const result of symbolResults) { + const content = this.readFileContent(result.file) + if (content !== null) { + addCandidate(result.file, 'symbol-index', 'symbol_match', content) + if (!strategiesUsed.includes('symbol-index')) { + strategiesUsed.push('symbol-index') + } + } + } + + for (const file of this.opts.memory.recentFiles) { + if (candidates.has(file)) continue + const content = this.readFileContent(file) + if (content !== null) { + addCandidate(file, 'recent-files', 'dependency', content) + if (!strategiesUsed.includes('recent-files')) { + strategiesUsed.push('recent-files') + } + } + } + + const candidateList = [...candidates.values()] + if (candidateList.length === 0) { + return { + files: [], + searchResults: [], + memory: this.opts.memory, + confidence: 0, + strategyUsed: strategiesUsed, + } + } + + const budgetMgr = new BudgetManager(this.config.max_total_tokens, { + maxPerFile: this.config.max_tokens_per_file, + maxFiles: 
this.config.max_files, + }) + + const budgeted = budgetMgr.allocate( + candidateList.map(c => ({ path: c.path, content: c.content, priority: c.priority })), + ) + + const files: GatheredContext['files'] = budgeted + .filter((f): f is BudgetedFile => f.tokensUsed > 0) + .map(f => { + const candidate = candidates.get(f.path)! + return { + path: f.path, + content: f.content, + relevance: candidate.sources.join(', '), + } + }) + + const searchResults = symbolResults.map(s => ({ + file: s.file, + line: s.lineStart, + match: s.signature ?? s.name, + })) + + const confidence = this.computeConfidence(candidateList, mentionedFiles, symbolResults) + + return { + files, + searchResults, + memory: this.opts.memory, + confidence, + strategyUsed: strategiesUsed, + } + } + + private extractMentionedFiles(query: string): string[] { + const files: string[] = [] + let match + FILE_EXTENSION_PATTERN.lastIndex = 0 + while ((match = FILE_EXTENSION_PATTERN.exec(query)) !== null) { + const candidate = match[1] + const resolved = this.resolveFilePath(candidate) + if (resolved && !files.includes(resolved)) { + files.push(resolved) + } + } + return files + } + + private resolveFilePath(candidate: string): string | null { + if (this.fileExists(candidate)) { + return candidate + } + const exactMatches = this.indexer.files.find(candidate) + if (exactMatches.length > 0) { + return exactMatches[0].path + } + const basename = path.basename(candidate) + const matches = this.indexer.files.all().filter(e => path.basename(e.path) === basename) + if (matches.length === 1) { + return matches[0].path + } + if (matches.length > 1) { + const ext = path.extname(candidate) + const baseWithoutExt = path.basename(candidate, ext) + const dirMatch = matches.find(m => + path.dirname(m.path).endsWith(path.dirname(candidate)) || + path.basename(m.path, ext) === baseWithoutExt, + ) + if (dirMatch) return dirMatch.path + return matches[0].path + } + return null + } + + private searchSymbols(query: string): Array<{ 
file: string; lineStart: number; name: string; signature?: string }> { + const tokens = this.extractSymbolTokens(query) + const results: Array<{ file: string; lineStart: number; name: string; signature?: string }> = [] + const seen = new Set() + + for (const token of tokens) { + const symbols = this.indexer.symbols.search(token) + for (const sym of symbols.slice(0, 5)) { + const key = `${sym.file}:${sym.name}` + if (!seen.has(key)) { + seen.add(key) + results.push({ + file: sym.file, + lineStart: sym.lineStart, + name: sym.name, + signature: sym.signature, + }) + } + } + } + + return results + } + + private extractSymbolTokens(query: string): string[] { + const words = query.match(/\b[a-z][a-zA-Z0-9_]+\b/g) ?? [] + const stopWords = new Set([ + 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being', + 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', + 'should', 'may', 'might', 'must', 'can', 'need', 'in', 'on', 'at', + 'to', 'for', 'of', 'with', 'by', 'from', 'as', 'and', 'or', 'not', + 'but', 'if', 'then', 'else', 'when', 'where', 'how', 'what', 'why', + 'who', 'this', 'that', 'these', 'those', 'it', 'its', 'fix', 'add', + 'update', 'change', 'modify', 'remove', 'delete', 'create', 'write', + 'read', 'get', 'set', 'put', 'show', 'find', 'look', 'see', 'check', + 'make', 'run', 'test', 'bug', 'error', 'issue', 'problem', 'work', + ]) + return words + .filter(w => w.length >= 3 && !stopWords.has(w.toLowerCase())) + .slice(0, 10) + } + + private findSiblingTests(filePath: string): string[] { + const tests: string[] = [] + const ext = path.extname(filePath) + const base = path.basename(filePath, ext) + const dir = path.dirname(filePath) + + const testPatterns = [ + path.join(dir, `${base}.test${ext}`), + path.join(dir, `${base}.spec${ext}`), + path.join(dir, '__tests__', `${base}.test${ext}`), + path.join(dir, `${base}_test${ext}`), + ] + + for (const testPattern of testPatterns) { + const resolved = this.resolveFilePath(testPattern) + 
/**
 * Picks the stronger of two budget priorities.
 *
 * @param a - First priority; returned when both weigh the same.
 * @param b - Second priority.
 * @returns Whichever priority has the larger weight in the table below.
 */
private higherPriority(a: BudgetPriority, b: BudgetPriority): BudgetPriority {
  // `Record` needs both type arguments; keying it by BudgetPriority also makes
  // the compiler flag a priority that is added to the union but not weighted.
  const weights: Record<BudgetPriority, number> = {
    direct_match: 5,
    symbol_match: 4,
    semantic_match: 3,
    dependency: 2,
    git_context: 1,
  }
  return weights[a] >= weights[b] ? a : b
}
'dependency' + | 'git_context' + +export interface RetrievedContext extends GatheredContext { + confidence: number + strategyUsed: ContextSource[] +} + +export interface BudgetedFile { + path: string + content: string + tokensUsed: number + truncated: boolean +} + +export interface RetrievalConfig { + max_files: number + max_tokens_per_file: number + max_total_tokens: number + strategy: 'deterministic-first' | 'semantic-first' + confidence_threshold: number +} diff --git a/src/index/file-index.test.ts b/src/index/file-index.test.ts new file mode 100644 index 0000000..15a9d99 --- /dev/null +++ b/src/index/file-index.test.ts @@ -0,0 +1,186 @@ +import { describe, it, expect, beforeEach, afterEach } from 'vitest' +import { FileIndex } from './file-index' +import fs from 'fs' +import path from 'path' +import os from 'os' +import type { IndexConfig } from './types' + +describe('FileIndex', () => { + let tmpDir: string + let index: FileIndex + let config: IndexConfig + + beforeEach(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'locode-findex-')) + config = { + root: tmpDir, + ignore: ['node_modules', 'dist', '.git', 'coverage', '*.min.js', '*.lock'], + languages: ['typescript', 'javascript', 'python', 'go', 'rust'], + storage_dir: path.join(tmpDir, '.locode', 'index'), + auto_update: true, + } + index = new FileIndex() + }) + + afterEach(() => { + fs.rmSync(tmpDir, { recursive: true, force: true }) + }) + + function writeFile(rel: string, content: string): void { + const full = path.join(tmpDir, rel) + fs.mkdirSync(path.dirname(full), { recursive: true }) + fs.writeFileSync(full, content, 'utf8') + } + + it('scans files and records path, language, size, hash', async () => { + writeFile('src/a.ts', 'export const x = 1\n') + writeFile('src/b.js', 'module.exports = 2\n') + writeFile('README.md', '# hello\n') + + await index.build(config) + + const all = index.all() + expect(all.length).toBeGreaterThanOrEqual(3) + const a = index.find('src/a.ts')[0] + 
expect(a).toBeDefined() + expect(a!.language).toBe('typescript') + expect(a!.size).toBe(Buffer.byteLength('export const x = 1\n')) + expect(a!.hash).toHaveLength(64) // sha-256 hex + expect(a!.lastIndexed).toBeGreaterThan(0) + }) + + it('detects language from file extension', async () => { + writeFile('a.ts', '') + writeFile('b.js', '') + writeFile('c.py', '') + writeFile('d.go', '') + writeFile('e.rs', '') + writeFile('f.json', '{}') + + await index.build(config) + + expect(index.find('a.ts')[0]?.language).toBe('typescript') + expect(index.find('b.js')[0]?.language).toBe('javascript') + expect(index.find('c.py')[0]?.language).toBe('python') + expect(index.find('d.go')[0]?.language).toBe('go') + expect(index.find('e.rs')[0]?.language).toBe('rust') + expect(index.find('f.json')[0]?.language).toBe('json') + }) + + it('respects ignore patterns (directory names)', async () => { + writeFile('src/main.ts', 'export const x = 1\n') + writeFile('node_modules/lib.ts', 'export const y = 2\n') + writeFile('dist/build.js', 'var z = 3\n') + + await index.build(config) + + const paths = index.all().map(e => e.path) + expect(paths).toContain('src/main.ts') + expect(paths).not.toContain('node_modules/lib.ts') + expect(paths).not.toContain('dist/build.js') + }) + + it('respects glob ignore patterns (*.min.js, *.lock)', async () => { + writeFile('app.min.js', 'var a=1\n') + writeFile('package.lock', '{}') + writeFile('app.js', 'var b=2\n') + + await index.build(config) + + const paths = index.all().map(e => e.path) + expect(paths).toContain('app.js') + expect(paths).not.toContain('app.min.js') + expect(paths).not.toContain('package.lock') + }) + + it('respects .gitignore in the repo root', async () => { + writeFile('.gitignore', 'secrets/\n*.env\n') + writeFile('src/index.ts', 'export const x = 1\n') + writeFile('secrets/key.txt', 'SECRET\n') + writeFile('.env', 'TOKEN=abc\n') + + await index.build(config) + + const paths = index.all().map(e => e.path) + 
expect(paths).toContain('src/index.ts') + expect(paths).not.toContain('secrets/key.txt') + expect(paths).not.toContain('.env') + }) + + it('find by glob pattern', async () => { + writeFile('src/a.ts', '') + writeFile('src/b.ts', '') + writeFile('src/sub/c.ts', '') + writeFile('src/d.js', '') + + await index.build(config) + + const tsFiles = index.find('src/*.ts') + expect(tsFiles.map(e => e.path).sort()).toEqual(['src/a.ts', 'src/b.ts']) + }) + + it('findByLanguage returns files of a given language', async () => { + writeFile('a.ts', '') + writeFile('b.ts', '') + writeFile('c.js', '') + + await index.build(config) + + const ts = index.findByLanguage('typescript') + expect(ts.map(e => e.path).sort()).toEqual(['a.ts', 'b.ts']) + }) + + it('update detects added, removed, and changed files', async () => { + writeFile('a.ts', 'export const x = 1\n') + writeFile('b.ts', 'export const y = 2\n') + + await index.build(config) + expect(index.all()).toHaveLength(2) + + writeFile('c.ts', 'export const z = 3\n') + fs.unlinkSync(path.join(tmpDir, 'b.ts')) + writeFile('a.ts', 'export const x = 999\n') + + const result = await index.update() + + expect(result.added).toEqual(['c.ts']) + expect(result.removed).toEqual(['b.ts']) + expect(result.changed).toEqual(['a.ts']) + expect(index.all()).toHaveLength(2) + }) + + it('save and load round-trips the index', async () => { + writeFile('a.ts', 'export const x = 1\n') + writeFile('b.js', 'var y = 2\n') + + await index.build(config) + await index.save(config.storage_dir) + + const reloaded = new FileIndex() + await reloaded.load(config.storage_dir) + + expect(reloaded.all().map(e => e.path).sort()).toEqual(['a.ts', 'b.js']) + expect(reloaded.find('a.ts')[0]?.hash).toBe(index.find('a.ts')[0]?.hash) + }) + + it('isIndexed returns false before build, true after', async () => { + expect(index.isIndexed()).toBe(false) + writeFile('a.ts', '') + await index.build(config) + expect(index.isIndexed()).toBe(true) + }) + + it('indexes all known file 
/** Maps a lower-cased file extension to the language label stored on each entry. */
const EXTENSION_TO_LANGUAGE: Record<string, string> = {
  '.ts': 'typescript',
  '.tsx': 'typescript',
  '.js': 'javascript',
  '.jsx': 'javascript',
  '.mjs': 'javascript',
  '.cjs': 'javascript',
  '.py': 'python',
  '.go': 'go',
  '.rs': 'rust',
  '.json': 'json',
  '.yaml': 'yaml',
  '.yml': 'yaml',
  '.md': 'markdown',
  '.css': 'css',
  '.html': 'html',
  '.sh': 'shell',
}

/** Extensions the walker accepts; files with any other extension are never indexed. */
const LANGUAGE_EXTENSIONS = new Set<string>(Object.keys(EXTENSION_TO_LANGUAGE))

/**
 * In-memory index of the files under a repository root.
 *
 * Each entry records the root-relative path (always `/`-separated), language,
 * byte size, SHA-256 content hash and the time it was indexed. The index can
 * be rebuilt from scratch, refreshed incrementally by comparing hashes, and
 * round-tripped through `files.json` in a storage directory.
 */
export class FileIndex {
  private files: Map<string, FileEntry> = new Map()
  private indexed = false
  private lastConfig: IndexConfig | null = null

  /**
   * Discards the current entries and rescans `config.root`.
   * The config is remembered so `update()` can rescan with the same settings.
   */
  async build(config: IndexConfig): Promise<void> {
    this.files.clear()
    for (const filePath of this.scan(config)) {
      const entry = this.describeFile(config.root, filePath)
      if (entry) this.files.set(entry.path, entry)
    }
    this.indexed = true
    this.lastConfig = config
  }

  /**
   * Rescans the root and reconciles the index with what is on disk.
   *
   * @returns Paths that appeared, disappeared, or whose content hash changed.
   * @throws Error when no scan config is known, i.e. neither `build()` nor
   *   `load(dir, config)` has been called.
   */
  async update(): Promise<IncrementalUpdateResult> {
    if (!this.indexed || !this.lastConfig) {
      throw new Error('FileIndex.update() called before build()')
    }
    const config = this.lastConfig
    const currentPaths = new Set<string>()
    const added: string[] = []
    const changed: string[] = []

    for (const filePath of this.scan(config)) {
      const entry = this.describeFile(config.root, filePath)
      // A file that vanished mid-scan is left out of currentPaths, so it is
      // reported as removed below if it was indexed before.
      if (!entry) continue
      currentPaths.add(entry.path)
      const existing = this.files.get(entry.path)
      if (!existing) {
        added.push(entry.path)
        this.files.set(entry.path, entry)
      } else if (existing.hash !== entry.hash) {
        changed.push(entry.path)
        this.files.set(entry.path, entry)
      }
    }

    const removed: string[] = []
    for (const rel of [...this.files.keys()]) {
      if (!currentPaths.has(rel)) {
        removed.push(rel)
        this.files.delete(rel)
      }
    }

    return { added, removed, changed }
  }

  /**
   * Returns the entries whose path matches `glob`.
   *
   * `*` and `?` never cross a `/`; `**` does. A glob with no `/` and no
   * single-star or `?` wildcard (for example `a.ts`) matches that name as a
   * path segment at any depth; every other glob must match the whole path.
   */
  find(glob: string): FileEntry[] {
    const matcher = this.globToRegex(glob)
    return this.all().filter(e => matcher.test(e.path))
  }

  /** Returns the entries whose detected language equals `lang`. */
  findByLanguage(lang: string): FileEntry[] {
    return this.all().filter(e => e.language === lang)
  }

  /** Returns a snapshot array of every entry. */
  all(): FileEntry[] {
    return [...this.files.values()]
  }

  /** True once `build()` or `load()` has completed. */
  isIndexed(): boolean {
    return this.indexed
  }

  /** Writes all entries to `<dir>/files.json`, creating `dir` if needed. */
  async save(dir: string): Promise<void> {
    fs.mkdirSync(dir, { recursive: true })
    const data = JSON.stringify([...this.files.values()], null, 2)
    fs.writeFileSync(path.join(dir, 'files.json'), data, 'utf8')
  }

  /**
   * Replaces the entries with those stored in `<dir>/files.json`.
   *
   * @param dir - Storage directory previously passed to `save()`.
   * @param config - Optional scan config. Supplying it lets `update()` run on
   *   a loaded index; without it `update()` still requires a `build()` first,
   *   because the persisted file does not contain the config.
   * @throws If the file is missing, is not valid JSON, or is not an array.
   */
  async load(dir: string, config?: IndexConfig): Promise<void> {
    const file = path.join(dir, 'files.json')
    const parsed: unknown = JSON.parse(fs.readFileSync(file, 'utf8'))
    if (!Array.isArray(parsed)) {
      throw new Error(`FileIndex.load(): ${file} does not contain an array of entries`)
    }
    this.files.clear()
    for (const entry of parsed as FileEntry[]) {
      // Skip anything that cannot be keyed; a hand-edited file must not poison the map.
      if (entry && typeof entry.path === 'string') {
        this.files.set(entry.path, entry)
      }
    }
    this.indexed = true
    if (config) this.lastConfig = config
  }

  /** Lists the absolute paths of all indexable files for `config`. */
  private scan(config: IndexConfig): string[] {
    return this.walk(
      config.root,
      this.loadIgnorePatterns(config),
      this.storageDirInsideRoot(config),
    )
  }

  /**
   * Builds the entry for one file, or returns null when it cannot be read
   * (for example, deleted between the directory listing and the read).
   */
  private describeFile(root: string, filePath: string): FileEntry | null {
    let content: Buffer
    try {
      content = fs.readFileSync(filePath)
    } catch {
      return null
    }
    const rel = this.toPosix(path.relative(root, filePath))
    return {
      path: rel,
      language: this.detectLanguage(rel),
      // Size and hash come from the same read, so they always agree.
      size: content.length,
      hash: crypto.createHash('sha256').update(content).digest('hex'),
      lastIndexed: Date.now(),
    }
  }

  /**
   * Combines the configured ignore patterns with the lines of the root
   * `.gitignore`. Blank lines and `#` comments are dropped. `!negation`
   * lines are dropped as well: re-including paths is not supported, and
   * treating them as literal patterns would be meaningless.
   */
  private loadIgnorePatterns(config: IndexConfig): string[] {
    const patterns = [...config.ignore]
    let content: string
    try {
      content = fs.readFileSync(path.join(config.root, '.gitignore'), 'utf8')
    } catch {
      return patterns // no readable .gitignore
    }
    for (const line of content.split('\n')) {
      const trimmed = line.trim()
      if (!trimmed || trimmed.startsWith('#') || trimmed.startsWith('!')) continue
      patterns.push(trimmed)
    }
    return patterns
  }

  /**
   * Returns the absolute storage directory when it lies strictly inside the
   * root, else null. `save()` writes `files.json` there and `.json` is an
   * indexed extension, so without this exclusion the index would list its
   * own storage file.
   */
  private storageDirInsideRoot(config: IndexConfig): string | null {
    if (!config.storage_dir) return null
    const storage = path.resolve(config.storage_dir)
    const rel = path.relative(path.resolve(config.root), storage)
    const inside = rel !== '' && !rel.startsWith('..') && !path.isAbsolute(rel)
    return inside ? storage : null
  }

  /**
   * Walks `root` depth-first and returns the absolute paths of regular files
   * with a known extension. Ignored directories are pruned, unreadable
   * directories are skipped, and `excludeDir` is never descended into.
   */
  private walk(root: string, ignorePatterns: string[], excludeDir: string | null = null): string[] {
    const matchers: RegExp[] = []
    for (const pattern of ignorePatterns) {
      const matcher = this.ignoreToRegex(pattern)
      if (matcher) matchers.push(matcher)
    }

    const results: string[] = []
    const stack: string[] = [root]
    for (let dir = stack.pop(); dir !== undefined; dir = stack.pop()) {
      let entries: fs.Dirent[]
      try {
        entries = fs.readdirSync(dir, { withFileTypes: true })
      } catch {
        continue
      }
      for (const entry of entries) {
        const full = path.join(dir, entry.name)
        const rel = this.toPosix(path.relative(root, full))
        if (matchers.some(m => m.test(rel))) continue
        if (entry.isDirectory()) {
          if (excludeDir !== null && path.resolve(full) === excludeDir) continue
          stack.push(full)
        } else if (entry.isFile() && LANGUAGE_EXTENSIONS.has(path.extname(entry.name).toLowerCase())) {
          results.push(full)
        }
      }
    }
    return results
  }

  /** Returns the language label for a path's extension, or `'unknown'`. */
  private detectLanguage(relPath: string): string {
    const ext = path.extname(relPath).toLowerCase()
    return EXTENSION_TO_LANGUAGE[ext] ?? 'unknown'
  }

  /** Converts platform separators to `/` so globs behave the same everywhere. */
  private toPosix(p: string): string {
    return path.sep === '/' ? p : p.split(path.sep).join('/')
  }

  /**
   * Compiles an ignore pattern with `.gitignore`-style anchoring:
   * a trailing `/` is dropped (`secrets/` ignores the `secrets` directory),
   * a leading or inner `/` ties the pattern to the root, and a pattern
   * without any `/` matches a path segment at any depth.
   * Returns null for a pattern that is empty once its slashes are stripped.
   */
  private ignoreToRegex(pattern: string): RegExp | null {
    const rooted = pattern.startsWith('/')
    const core = pattern.replace(/^\/+/, '').replace(/\/+$/, '')
    if (core === '') return null
    const body = this.globBody(core)
    return rooted || core.includes('/')
      ? new RegExp(`^${body}($|/)`)
      : new RegExp(`(^|/)${body}($|/)`)
  }

  /**
   * Compiles a `find()` glob. The anchoring test looks at the translated
   * body, whose `*` and `?` expansions contain `/`, so any glob using those
   * wildcards is matched against the whole path, exactly as before.
   */
  private globToRegex(glob: string): RegExp {
    const body = this.globBody(glob)
    return body.includes('/')
      ? new RegExp(`^${body}$`)
      : new RegExp(`(^|/)${body}($|/)`)
  }

  /** Translates glob syntax into regular-expression source, without anchors. */
  private globBody(glob: string): string {
    // `$` is escaped too; unescaped it acted as an end anchor mid-pattern.
    const escaped = glob.replace(/[.+^${}()|[\]\\]/g, '\\$&')
    let body = ''
    let i = 0
    while (i < escaped.length) {
      const ch = escaped[i]
      if (ch === '*' && escaped[i + 1] === '*') {
        const startsSegment = i === 0 || escaped[i - 1] === '/'
        if (startsSegment && escaped[i + 2] === '/') {
          // `**/` spans zero or more directories, so `src/**/c.ts` also matches `src/c.ts`.
          body += '(?:.*/)?'
          i += 3
        } else {
          body += '.*'
          i += 2
        }
      } else if (ch === '*') {
        body += '[^/]*'
        i += 1
      } else if (ch === '?') {
        body += '[^/]'
        i += 1
      } else {
        body += ch
        i += 1
      }
    }
    return body
  }
}
writeFile('src/b.ts', [ + 'export class Bar {', + ' method() {}', + '}', + ].join('\n')) + writeFile('README.md', '# hello\n') + + const indexer = new CodebaseIndexer(config) + const stats = await indexer.buildAll() + + expect(stats.files).toBeGreaterThanOrEqual(3) + expect(stats.symbols).toBeGreaterThanOrEqual(3) // foo, Bar, method + expect(stats.buildTimeMs).toBeGreaterThanOrEqual(0) + expect(indexer.isIndexed()).toBe(true) + }) + + it('isIndexed returns false before build', () => { + const indexer = new CodebaseIndexer(config) + expect(indexer.isIndexed()).toBe(false) + }) + + it('get files returns the FileIndex', async () => { + writeFile('a.ts', '') + const indexer = new CodebaseIndexer(config) + await indexer.buildAll() + expect(indexer.files.find('a.ts')).toHaveLength(1) + }) + + it('get symbols returns the SymbolIndex', async () => { + writeFile('a.ts', 'export function foo() {}') + const indexer = new CodebaseIndexer(config) + await indexer.buildAll() + expect(indexer.symbols.search('foo')).toHaveLength(1) + }) + + it('update re-indexes only changed files', async () => { + writeFile('a.ts', 'export function foo() { return 1 }\n') + writeFile('b.ts', 'export function bar() { return 2 }\n') + + const indexer = new CodebaseIndexer(config) + await indexer.buildAll() + expect(indexer.symbols.search('foo')).toHaveLength(1) + expect(indexer.symbols.search('bar')).toHaveLength(1) + + writeFile('c.ts', 'export function baz() { return 3 }\n') + fs.unlinkSync(path.join(tmpDir, 'b.ts')) + writeFile('a.ts', 'export function foo() { return 999 }\n') + + const stats = await indexer.update() + + expect(stats.files).toBe(2) // a.ts changed + c.ts added + expect(indexer.symbols.search('bar')).toHaveLength(0) // b.ts removed + expect(indexer.symbols.search('baz')).toHaveLength(1) // c.ts added + }) + + it('save persists to storage_dir and load restores', async () => { + writeFile('a.ts', 'export function foo() {}') + writeFile('b.ts', 'export class Bar {}') + + const 
/**
 * Orchestrates the file index and the symbol index for one repository.
 *
 * `buildAll()` scans everything, `update()` re-indexes only what changed on
 * disk, and `save()` / `load()` persist both indexes under
 * `config.storage_dir`. Symbols are extracted only for files whose language
 * is listed in `config.languages`.
 */
export class CodebaseIndexer {
  private fileIndex: FileIndex
  private symbolIndex: SymbolIndex
  private indexed = false
  // True between load() and the next full scan. FileIndex.update() refuses to
  // run until FileIndex.build() has been called, so the first update after a
  // load has to diff against a snapshot instead (see rescanAgainstSnapshot).
  private restoredFromDisk = false

  constructor(private config: IndexConfig) {
    this.fileIndex = new FileIndex()
    this.symbolIndex = new SymbolIndex(config.languages)
  }

  /** Rebuilds both indexes from scratch and returns totals plus elapsed time. */
  async buildAll(): Promise<IndexStats> {
    const start = Date.now()
    await this.fileIndex.build(this.config)
    this.restoredFromDisk = false

    this.symbolIndex = new SymbolIndex(this.config.languages)
    for (const entry of this.fileIndex.all()) {
      await this.indexSymbols(entry.path, entry.language)
    }

    this.indexed = true
    return this.stats(start)
  }

  /**
   * Brings both indexes in line with the files currently on disk.
   *
   * @returns Totals after the update; `files` and `symbols` count the whole
   *   index, not just the changed part.
   * @throws Error if neither `buildAll()` nor `load()` has run.
   */
  async update(): Promise<IndexStats> {
    if (!this.indexed) {
      throw new Error('CodebaseIndexer.update() called before buildAll()')
    }
    const start = Date.now()
    const changes: IncrementalUpdateResult = this.restoredFromDisk
      ? await this.rescanAgainstSnapshot()
      : await this.fileIndex.update()

    // Drop symbols of removed files, and of changed files before re-indexing.
    // Whether SymbolIndex.indexFile replaces or appends is not visible here;
    // clearing first keeps re-indexing free of duplicates either way.
    for (const relPath of [...changes.removed, ...changes.changed]) {
      this.symbolIndex.removeFile(relPath)
    }

    // Exact-path lookup. FileIndex.find() treats its argument as a glob, so
    // `find('a.ts')` may also return `lib/a.ts`.
    const languageByPath = new Map<string, string>()
    for (const entry of this.fileIndex.all()) {
      languageByPath.set(entry.path, entry.language)
    }

    for (const relPath of [...changes.added, ...changes.changed]) {
      const language = languageByPath.get(relPath)
      if (language !== undefined) {
        await this.indexSymbols(relPath, language)
      }
    }

    return this.stats(start)
  }

  /** True once `buildAll()` or `load()` has completed. */
  isIndexed(): boolean {
    return this.indexed
  }

  /** The underlying file index. */
  get files(): FileIndex {
    return this.fileIndex
  }

  /** The underlying symbol index. */
  get symbols(): SymbolIndex {
    return this.symbolIndex
  }

  /** Persists both indexes to `config.storage_dir`. */
  async save(): Promise<void> {
    await this.fileIndex.save(this.config.storage_dir)
    await this.symbolIndex.save(this.config.storage_dir)
  }

  /** Restores both indexes from `config.storage_dir`. */
  async load(): Promise<void> {
    await this.fileIndex.load(this.config.storage_dir)
    await this.symbolIndex.load(this.config.storage_dir)
    this.indexed = true
    this.restoredFromDisk = true
  }

  /**
   * Computes the added / removed / changed sets for an index that was loaded
   * from disk: snapshots the stored hashes, runs a full file scan, and diffs
   * the two. The scan also gives FileIndex the config it needs, so later
   * updates go through `FileIndex.update()` again.
   */
  private async rescanAgainstSnapshot(): Promise<IncrementalUpdateResult> {
    const previousHashes = new Map<string, string>()
    for (const entry of this.fileIndex.all()) {
      previousHashes.set(entry.path, entry.hash)
    }

    await this.fileIndex.build(this.config)
    this.restoredFromDisk = false

    const added: string[] = []
    const changed: string[] = []
    for (const entry of this.fileIndex.all()) {
      const previous = previousHashes.get(entry.path)
      if (previous === undefined) {
        added.push(entry.path)
      } else if (previous !== entry.hash) {
        changed.push(entry.path)
      }
      previousHashes.delete(entry.path)
    }
    // Whatever is left in the snapshot no longer exists on disk.
    return { added, removed: [...previousHashes.keys()], changed }
  }

  /** Extracts symbols for one file if its language is configured. */
  private async indexSymbols(relPath: string, language: string): Promise<void> {
    if (!this.config.languages.includes(language)) return
    try {
      const content = fs.readFileSync(path.join(this.config.root, relPath), 'utf8')
      await this.symbolIndex.indexFile(relPath, content, language)
    } catch {
      // file may have been deleted between scan and read
    }
  }

  /** Current totals plus the time elapsed since `start`. */
  private stats(start: number): IndexStats {
    return {
      files: this.fileIndex.all().length,
      symbols: this.symbolIndex.all().length,
      buildTimeMs: Date.now() - start,
    }
  }
}
100644 index 0000000..66e0f43 --- /dev/null +++ b/src/index/symbol-index.test.ts @@ -0,0 +1,279 @@ +import { describe, it, expect, beforeEach, afterEach } from 'vitest' +import { SymbolIndex, RegexSymbolExtractor } from './symbol-index' +import fs from 'fs' +import path from 'path' +import os from 'os' + +describe('RegexSymbolExtractor', () => { + const extractor = new RegexSymbolExtractor() + + it('extracts exported TypeScript functions', () => { + const content = [ + 'export function foo(x: string): boolean {', + ' return true', + '}', + ].join('\n') + const symbols = extractor.extract('a.ts', content, 'typescript') + const foo = symbols.find(s => s.name === 'foo') + expect(foo).toBeDefined() + expect(foo!.type).toBe('function') + expect(foo!.file).toBe('a.ts') + expect(foo!.lineStart).toBe(1) + expect(foo!.exported).toBe(true) + expect(foo!.signature).toContain('foo') + }) + + it('extracts non-exported TypeScript functions', () => { + const content = 'function bar() { return 1 }' + const symbols = extractor.extract('a.ts', content, 'typescript') + const bar = symbols.find(s => s.name === 'bar') + expect(bar).toBeDefined() + expect(bar!.exported).toBe(false) + }) + + it('extracts TypeScript classes and methods', () => { + const content = [ + 'export class Foo {', + ' private bar(): void {}', + ' public baz(x: number): number { return x }', + '}', + ].join('\n') + const symbols = extractor.extract('a.ts', content, 'typescript') + const cls = symbols.find(s => s.name === 'Foo' && s.type === 'class') + expect(cls).toBeDefined() + expect(cls!.exported).toBe(true) + expect(cls!.lineStart).toBe(1) + + const bar = symbols.find(s => s.name === 'bar' && s.type === 'method') + expect(bar).toBeDefined() + expect(bar!.lineStart).toBe(2) + + const baz = symbols.find(s => s.name === 'baz' && s.type === 'method') + expect(baz).toBeDefined() + }) + + it('extracts TypeScript interfaces and types', () => { + const content = [ + 'export interface Config {', + ' name: string', + 
'}', + 'type Result = { ok: boolean }', + ].join('\n') + const symbols = extractor.extract('a.ts', content, 'typescript') + const iface = symbols.find(s => s.name === 'Config' && s.type === 'interface') + expect(iface).toBeDefined() + expect(iface!.exported).toBe(true) + expect(iface!.lineStart).toBe(1) + + const type = symbols.find(s => s.name === 'Result' && s.type === 'type') + expect(type).toBeDefined() + expect(type!.exported).toBe(false) + }) + + it('extracts TypeScript enums', () => { + const content = 'export enum Color { Red, Green, Blue }' + const symbols = extractor.extract('a.ts', content, 'typescript') + const e = symbols.find(s => s.name === 'Color' && s.type === 'enum') + expect(e).toBeDefined() + expect(e!.exported).toBe(true) + }) + + it('extracts arrow function const exports', () => { + const content = [ + 'export const handler = (req: Request) => {', + ' return req.json()', + '}', + ].join('\n') + const symbols = extractor.extract('a.ts', content, 'typescript') + const handler = symbols.find(s => s.name === 'handler' && s.type === 'function') + expect(handler).toBeDefined() + expect(handler!.exported).toBe(true) + }) + + it('extracts JavaScript functions and classes', () => { + const content = [ + 'function foo() { return 1 }', + 'class Bar {', + ' constructor() {}', + '}', + ].join('\n') + const symbols = extractor.extract('b.js', content, 'javascript') + expect(symbols.find(s => s.name === 'foo' && s.type === 'function')).toBeDefined() + expect(symbols.find(s => s.name === 'Bar' && s.type === 'class')).toBeDefined() + }) + + it('extracts CommonJS module.exports', () => { + const content = 'module.exports = function thing() { return 42 }' + const symbols = extractor.extract('c.js', content, 'javascript') + const thing = symbols.find(s => s.name === 'thing' && s.type === 'function') + expect(thing).toBeDefined() + expect(thing!.exported).toBe(true) + }) + + it('extracts Python functions and classes', () => { + const content = [ + 'def foo(x):', + 
' return x + 1', + '', + 'class Bar:', + ' def method(self):', + ' pass', + ].join('\n') + const symbols = extractor.extract('d.py', content, 'python') + const foo = symbols.find(s => s.name === 'foo' && s.type === 'function') + expect(foo).toBeDefined() + expect(foo!.lineStart).toBe(1) + + const cls = symbols.find(s => s.name === 'Bar' && s.type === 'class') + expect(cls).toBeDefined() + expect(cls!.lineStart).toBe(4) + + const method = symbols.find(s => s.name === 'method' && s.type === 'method') + expect(method).toBeDefined() + expect(method!.lineStart).toBe(5) + }) + + it('returns empty array for unsupported languages', () => { + const symbols = extractor.extract('e.go', 'package main', 'go') + expect(symbols).toEqual([]) + }) + + it('ignores commented-out declarations', () => { + const content = [ + '// export function fake() { return 1 }', + '/* function alsoFake() {} */', + 'export function real() { return 2 }', + ].join('\n') + const symbols = extractor.extract('a.ts', content, 'typescript') + expect(symbols.find(s => s.name === 'fake')).toBeUndefined() + expect(symbols.find(s => s.name === 'alsoFake')).toBeUndefined() + expect(symbols.find(s => s.name === 'real')).toBeDefined() + }) +}) + +describe('SymbolIndex', () => { + let tmpDir: string + + beforeEach(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'locode-sym-')) + }) + + afterEach(() => { + fs.rmSync(tmpDir, { recursive: true, force: true }) + }) + + function writeFile(name: string, content: string): string { + const filePath = path.join(tmpDir, name) + fs.mkdirSync(path.dirname(filePath), { recursive: true }) + fs.writeFileSync(filePath, content, 'utf8') + return filePath + } + + it('indexFile stores symbols and search finds them by name', async () => { + const index = new SymbolIndex() + const content = 'export function myFunc() { return 1 }' + await index.indexFile('a.ts', content, 'typescript') + + const results = index.search('myFunc') + expect(results).toHaveLength(1) + 
expect(results[0].name).toBe('myFunc') + expect(results[0].type).toBe('function') + }) + + it('search is case-insensitive and supports partial match', async () => { + const index = new SymbolIndex() + const content = 'export function getUserData() { return null }' + await index.indexFile('a.ts', content, 'typescript') + + const results = index.search('userdata') + expect(results).toHaveLength(1) + expect(results[0].name).toBe('getUserData') + }) + + it('search filters by type', async () => { + const index = new SymbolIndex() + const content = [ + 'export function foo() {}', + 'export class Foo {}', + ].join('\n') + await index.indexFile('a.ts', content, 'typescript') + + const fns = index.search('foo', { type: 'function' }) + expect(fns).toHaveLength(1) + expect(fns[0].type).toBe('function') + + const classes = index.search('foo', { type: 'class' }) + expect(classes).toHaveLength(1) + expect(classes[0].type).toBe('class') + }) + + it('forFile returns all symbols in a file', async () => { + const index = new SymbolIndex() + const content = [ + 'export function foo() {}', + 'export function bar() {}', + ].join('\n') + await index.indexFile('a.ts', content, 'typescript') + + const symbols = index.forFile('a.ts') + expect(symbols).toHaveLength(2) + expect(symbols.map(s => s.name).sort()).toEqual(['bar', 'foo']) + }) + + it('getCode returns the source lines for a symbol', async () => { + const index = new SymbolIndex() + const content = [ + 'export function foo() {', + ' return 42', + '}', + 'export function bar() {}', + ].join('\n') + const filePath = writeFile('a.ts', content) + await index.indexFile(filePath, content, 'typescript') + + const foo = index.search('foo')[0] + const code = await index.getCode(foo) + expect(code).toContain('export function foo() {') + }) + + it('removeFile clears symbols for that file', async () => { + const index = new SymbolIndex() + await index.indexFile('a.ts', 'export function foo() {}', 'typescript') + await index.indexFile('b.ts', 
'export function bar() {}', 'typescript') + + index.removeFile('a.ts') + expect(index.search('foo')).toHaveLength(0) + expect(index.search('bar')).toHaveLength(1) + }) + + it('save and load round-trips the symbol index', async () => { + const index = new SymbolIndex() + await index.indexFile('a.ts', 'export function foo() {}', 'typescript') + + const storageDir = path.join(tmpDir, 'index') + await index.save(storageDir) + + const reloaded = new SymbolIndex() + await reloaded.load(storageDir) + + expect(reloaded.search('foo')).toHaveLength(1) + expect(reloaded.forFile('a.ts')).toHaveLength(1) + }) + + it('all() returns every indexed symbol', async () => { + const index = new SymbolIndex() + await index.indexFile('a.ts', 'export function foo() {}\nexport class Bar {}', 'typescript') + await index.indexFile('b.ts', 'export function baz() {}', 'typescript') + + expect(index.all()).toHaveLength(3) + }) + + it('respects languages config — only extracts symbols for configured languages', async () => { + const index = new SymbolIndex(['typescript']) + await index.indexFile('a.ts', 'export function foo() {}', 'typescript') + await index.indexFile('b.py', 'def bar(): pass', 'python') + + expect(index.search('foo')).toHaveLength(1) + expect(index.search('bar')).toHaveLength(0) + }) +}) diff --git a/src/index/symbol-index.ts b/src/index/symbol-index.ts new file mode 100644 index 0000000..308aa7d --- /dev/null +++ b/src/index/symbol-index.ts @@ -0,0 +1,280 @@ +import fs from 'fs' +import path from 'path' +import type { SymbolEntry, SymbolType } from './types' + +export interface SymbolExtractor { + extract(file: string, content: string, language: string): SymbolEntry[] +} + +export class RegexSymbolExtractor implements SymbolExtractor { + extract(file: string, content: string, language: string): SymbolEntry[] { + if (language === 'typescript' || language === 'javascript') { + return this.extractTsJs(file, content) + } + if (language === 'python') { + return 
/**
 * Extract symbols from TypeScript/JavaScript source by testing every non-comment
 * line against each declaration matcher, in a fixed order.
 *
 * Matching is line-based: a declaration is only recognised when its name (and, for
 * callables, its whole parameter list) sits on one line. Every entry is emitted with
 * `lineEnd === lineStart` because the extent of the body is not computed.
 *
 * @param file - Path recorded on every emitted entry.
 * @param content - Full source text of the file.
 * @returns Entries in source order.
 */
private extractTsJs(file: string, content: string): SymbolEntry[] {
  const symbols: SymbolEntry[] = []
  const lines = content.split('\n')

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i]
    const lineNum = i + 1
    if (this.isComment(line)) continue

    this.matchExportedFunction(file, line, lineNum, symbols)
    this.matchFunction(file, line, lineNum, symbols)
    this.matchArrowConst(file, line, lineNum, symbols)
    this.matchClass(file, line, lineNum, symbols)
    this.matchMethod(file, line, lineNum, symbols)
    this.matchInterface(file, line, lineNum, symbols)
    this.matchType(file, line, lineNum, symbols)
    this.matchEnum(file, line, lineNum, symbols)
    this.matchCommonJSExport(file, line, lineNum, symbols)
  }

  return symbols
}

/**
 * Record `export [default] [async] function[*] name[<T>](params)` declared at column 0.
 * Type parameters and the generator star are accepted but left out of the signature.
 */
private matchExportedFunction(
  file: string, line: string, lineNum: number, symbols: SymbolEntry[],
): void {
  const m = line.match(
    /^export\s+(?:default\s+)?(?:async\s+)?function(?:\s*\*\s*|\s+)(\w+)\s*(?:<[^(]*>)?\s*\(([^)]*)\)/,
  )
  if (!m) return
  symbols.push({
    name: m[1], type: 'function', file, lineStart: lineNum, lineEnd: lineNum,
    signature: `function ${m[1]}(${m[2]})`, exported: true,
  })
}

/**
 * Record a non-exported `[async] function[*] name[<T>](params)` declared at column 0.
 * The pattern is anchored on `function`/`async`, so exported declarations never match here.
 */
private matchFunction(
  file: string, line: string, lineNum: number, symbols: SymbolEntry[],
): void {
  const m = line.match(/^(?:async\s+)?function(?:\s*\*\s*|\s+)(\w+)\s*(?:<[^(]*>)?\s*\(([^)]*)\)/)
  if (!m) return
  symbols.push({
    name: m[1], type: 'function', file, lineStart: lineNum, lineEnd: lineNum,
    signature: `function ${m[1]}(${m[2]})`, exported: false,
  })
}

/**
 * Record `export const name = [async] (params) =>` as a function.
 * An optional type annotation on the constant, type parameters before the parameter
 * list, and a return-type annotation before `=>` are tolerated. Only exported
 * constants with a parenthesised parameter list are considered.
 */
private matchArrowConst(
  file: string, line: string, lineNum: number, symbols: SymbolEntry[],
): void {
  const m = line.match(
    /^export\s+const\s+(\w+)\s*(?::[^=]*)?=\s*(?:async\s+)?(?:<[^(]*>\s*)?\(([^)]*)\)\s*(?::[^=]*)?=>/,
  )
  if (!m) return
  symbols.push({
    name: m[1], type: 'function', file, lineStart: lineNum, lineEnd: lineNum,
    signature: `const ${m[1]} = (${m[2]}) =>`, exported: true,
  })
}

/** Record `[export [default]] [abstract] class Name` declared at column 0. */
private matchClass(
  file: string, line: string, lineNum: number, symbols: SymbolEntry[],
): void {
  const m = line.match(/^(export\s+(?:default\s+)?)?(?:abstract\s+)?class\s+(\w+)/)
  if (!m) return
  symbols.push({
    name: m[2], type: 'class', file, lineStart: lineNum, lineEnd: lineNum,
    exported: !!m[1],
  })
}

/**
 * Record `name(params) {` / `name(params):` lines as methods.
 *
 * Each modifier must be followed by whitespace. Without that requirement a modifier
 * could swallow the start of an identifier (`staticMethod()` was indexed as `Method`).
 * Lines that use the `function` keyword or start with `export` are left to the other
 * matchers, and control-flow statements that share the `word (…) {` shape are ignored.
 */
private matchMethod(
  file: string, line: string, lineNum: number, symbols: SymbolEntry[],
): void {
  // Whole-word test: an identifier such as `functionName` must not disqualify the line.
  if (/\bfunction\b/.test(line) || line.startsWith('export')) return

  const m = line.match(
    /^\s*(?:(?:public|private|protected|static|async|readonly|override|abstract|get|set)\s+)*(\w+)\s*(?:<[^(]*>)?\s*\(([^)]*)\)\s*[:{]/,
  )
  if (!m) return

  const name = m[1]
  if (['if', 'for', 'while', 'switch', 'catch', 'return', 'throw'].includes(name)) return
  symbols.push({
    name, type: 'method', file, lineStart: lineNum, lineEnd: lineNum,
    signature: `${name}(${m[2]})`, exported: false,
  })
}

/** Record `[export] interface Name` declared at column 0. */
private matchInterface(
  file: string, line: string, lineNum: number, symbols: SymbolEntry[],
): void {
  const m = line.match(/^(export\s+)?interface\s+(\w+)/)
  if (!m) return
  symbols.push({
    name: m[2], type: 'interface', file, lineStart: lineNum, lineEnd: lineNum,
    exported: !!m[1],
  })
}

/** Record `[export] type Name[<T>] =` aliases declared at column 0. */
private matchType(
  file: string, line: string, lineNum: number, symbols: SymbolEntry[],
): void {
  const m = line.match(/^(export\s+)?type\s+(\w+)\s*(?:<.*>)?\s*=/)
  if (!m) return
  symbols.push({
    name: m[2], type: 'type', file, lineStart: lineNum, lineEnd: lineNum,
    exported: !!m[1],
  })
}

/** Record `[export] [declare] [const] enum Name` declared at column 0. */
private matchEnum(
  file: string, line: string, lineNum: number, symbols: SymbolEntry[],
): void {
  const m = line.match(/^(export\s+)?(?:declare\s+)?(?:const\s+)?enum\s+(\w+)/)
  if (!m) return
  symbols.push({
    name: m[2], type: 'enum', file, lineStart: lineNum, lineEnd: lineNum,
    exported: !!m[1],
  })
}

/** Record `module.exports = function name` as an exported function (no signature is stored). */
private matchCommonJSExport(
  file: string, line: string, lineNum: number, symbols: SymbolEntry[],
): void {
  const m = line.match(/^module\.exports\s*=\s*function\s+(\w+)/)
  if (!m) return
  symbols.push({
    name: m[1], type: 'function', file, lineStart: lineNum, lineEnd: lineNum,
    exported: true,
  })
}
/**
 * In-memory index of the symbols extracted from source files, with JSON persistence.
 *
 * Entries live in a flat list (used by `search` and `all`) and are mirrored in a
 * per-file map (used by `forFile` and to replace a file's entries on re-index).
 * Arrays handed to callers are copies, so mutating a result never alters the index.
 */
export class SymbolIndex {
  private symbols: SymbolEntry[] = []
  private byFile: Map<string, SymbolEntry[]> = new Map()
  private languages: Set<string> | null = null
  private extractor: SymbolExtractor

  /**
   * @param languages - When given, only files whose language is in this list are indexed.
   * @param extractor - Symbol extraction strategy; defaults to `RegexSymbolExtractor`.
   */
  constructor(languages?: string[], extractor?: SymbolExtractor) {
    if (languages) {
      this.languages = new Set(languages)
    }
    this.extractor = extractor ?? new RegexSymbolExtractor()
  }

  /** Structural check applied to every entry read back from disk. */
  private static isSymbolEntry(value: unknown): value is SymbolEntry {
    if (typeof value !== 'object' || value === null) return false
    const v = value as Record<string, unknown>
    return typeof v.name === 'string'
      && typeof v.type === 'string'
      && typeof v.file === 'string'
      && typeof v.lineStart === 'number'
      && typeof v.lineEnd === 'number'
      && typeof v.exported === 'boolean'
  }

  /**
   * (Re)index one file, replacing whatever was previously recorded for `filePath`.
   *
   * @returns The symbols extracted from `content`, or `[]` when `language` is filtered out.
   */
  async indexFile(filePath: string, content: string, language: string): Promise<SymbolEntry[]> {
    // Drop stale entries before the language check so a path that is no longer
    // eligible (e.g. an index loaded from disk under a wider language list) does
    // not keep serving outdated symbols.
    this.removeFile(filePath)
    if (this.languages && !this.languages.has(language)) {
      return []
    }

    const extracted = this.extractor.extract(filePath, content, language)
    // A plain loop avoids the argument-count limit of `push(...extracted)`.
    for (const entry of extracted) {
      this.symbols.push(entry)
    }
    this.byFile.set(filePath, [...extracted])
    return extracted
  }

  /**
   * Case-insensitive substring search on symbol names.
   *
   * @param opts - Optional exact-match filters on symbol type and file path.
   */
  search(query: string, opts?: { type?: SymbolType; file?: string }): SymbolEntry[] {
    const lower = query.toLowerCase()
    return this.symbols.filter(s => {
      if (opts?.type && s.type !== opts.type) return false
      if (opts?.file && s.file !== opts.file) return false
      return s.name.toLowerCase().includes(lower)
    })
  }

  /** Symbols recorded for `filePath` (empty when the file is not indexed). */
  forFile(filePath: string): SymbolEntry[] {
    return [...(this.byFile.get(filePath) ?? [])]
  }

  /**
   * Read lines `lineStart`..`lineEnd` (1-based, inclusive) of the symbol's file.
   *
   * @returns The source text, or `''` when the file cannot be read.
   */
  async getCode(symbol: SymbolEntry): Promise<string> {
    try {
      const content = fs.readFileSync(symbol.file, 'utf8')
      const lines = content.split('\n')
      const start = Math.max(0, symbol.lineStart - 1)
      const end = Math.min(lines.length, symbol.lineEnd)
      return lines.slice(start, end).join('\n')
    } catch {
      // Best effort: a file deleted or moved since indexing yields no code.
      return ''
    }
  }

  /** Forget every symbol recorded for `filePath`; a no-op for unknown paths. */
  removeFile(filePath: string): void {
    if (!this.byFile.has(filePath)) return
    this.symbols = this.symbols.filter(s => s.file !== filePath)
    this.byFile.delete(filePath)
  }

  /** Snapshot of every indexed symbol. */
  all(): SymbolEntry[] {
    return [...this.symbols]
  }

  /** Write the index to `<dir>/symbols.json`, creating `dir` when needed. */
  async save(dir: string): Promise<void> {
    fs.mkdirSync(dir, { recursive: true })
    fs.writeFileSync(path.join(dir, 'symbols.json'), JSON.stringify(this.symbols, null, 2), 'utf8')
  }

  /**
   * Replace the index with the contents of `<dir>/symbols.json`.
   *
   * The file is validated and regrouped before any state is swapped, so a failed
   * load leaves the current index untouched.
   *
   * @throws When the file is missing, is not valid JSON, or is not an array of symbol entries.
   */
  async load(dir: string): Promise<void> {
    const file = path.join(dir, 'symbols.json')
    const parsed: unknown = JSON.parse(fs.readFileSync(file, 'utf8'))
    if (!Array.isArray(parsed) || !parsed.every(SymbolIndex.isSymbolEntry)) {
      throw new Error(`Invalid symbol index at ${file}: expected a JSON array of symbol entries`)
    }

    const entries: SymbolEntry[] = parsed
    const grouped = new Map<string, SymbolEntry[]>()
    for (const entry of entries) {
      const list = grouped.get(entry.file)
      if (list) {
        list.push(entry)
      } else {
        grouped.set(entry.file, [entry])
      }
    }

    this.symbols = entries
    this.byFile = grouped
  }
}
RunArtifactStore private persistentContextCache: PersistentContextCache + private codebaseIndexer: CodebaseIndexer | null = null + private contextRetriever: ContextRetriever | null = null constructor(config: Config, localAgent?: LocalAgent, claudeAgent?: ClaudeAgent, options?: OrchestratorOptions) { this.config = config @@ -84,6 +91,7 @@ export class Orchestrator { this.verbose = options?.verbose ?? false this.repoContext = loadRepoContext(config.context.repo_context_files, config.context.max_file_bytes) + this.initCodebaseIndex(registry) this.rebuildCodingAgent(safetyGate) } @@ -305,9 +313,68 @@ export class Orchestrator { return this.codingAgent } + getCodebaseIndexer(): CodebaseIndexer | null { + return this.codebaseIndexer + } + + async buildCodebaseIndex(): Promise<{ files: number; symbols: number; buildTimeMs: number } | null> { + if (!this.codebaseIndexer) return null + const stats = await this.codebaseIndexer.buildAll() + await this.codebaseIndexer.save() + const registry = this.toolExecutor.registry + if (!registry.get('symbol_lookup')) { + registry.register(createSymbolLookupTool(this.codebaseIndexer)) + } + this.buildContextRetriever() + this.rebuildCodingAgent(new SafetyGate(this.config.safety)) + return stats + } + getStats() { return this.tracker.getStats() } resetStats() { this.tracker.reset() } + private initCodebaseIndex(registry: ReturnType): void { + if (!this.config.index?.enabled) return + const indexConfig: IndexerConfig = { + root: process.cwd(), + ignore: this.config.index.ignore, + languages: this.config.index.languages, + storage_dir: this.config.index.storage_dir, + auto_update: this.config.index.auto_update, + } + this.codebaseIndexer = new CodebaseIndexer(indexConfig) + try { + this.codebaseIndexer.load().then(() => { + if (this.codebaseIndexer?.isIndexed()) { + registry.register(createSymbolLookupTool(this.codebaseIndexer)) + this.buildContextRetriever() + this.rebuildCodingAgent(new SafetyGate(this.config.safety)) + } + }).catch(() => 
{ + // No saved index — user can build one with `locode index` (future command) + }) + } catch { + // Index loading is non-fatal + } + } + + private buildContextRetriever(): void { + if (!this.codebaseIndexer || !this.codebaseIndexer.isIndexed()) return + const crConfig = this.config.context_retrieval + const retrievalConfig: RetrievalConfig = { + max_files: crConfig.max_files, + max_tokens_per_file: crConfig.max_tokens_per_file, + max_total_tokens: crConfig.max_total_tokens, + strategy: crConfig.strategy, + confidence_threshold: crConfig.confidence_threshold, + } + this.contextRetriever = new ContextRetriever( + this.codebaseIndexer, + retrievalConfig, + { root: process.cwd(), memory: new AgentMemory().getSnapshot() }, + ) + } + private rebuildCodingAgent(safetyGate: SafetyGate): void { if (!this.config.agent) { this.codingAgent = null @@ -316,6 +383,19 @@ export class Orchestrator { const codeEditor = new CodeEditor(safetyGate, process.cwd()) const planner = new Planner(this.localAgent, this.claudeAgent) const agentMemory = new AgentMemory() + if (this.contextRetriever && this.codebaseIndexer?.isIndexed()) { + this.contextRetriever = new ContextRetriever( + this.codebaseIndexer, + { + max_files: this.config.context_retrieval.max_files, + max_tokens_per_file: this.config.context_retrieval.max_tokens_per_file, + max_total_tokens: this.config.context_retrieval.max_total_tokens, + strategy: this.config.context_retrieval.strategy, + confidence_threshold: this.config.context_retrieval.confidence_threshold, + }, + { root: process.cwd(), memory: agentMemory.getSnapshot() }, + ) + } this.codingAgent = new CodingAgent( this.localAgent, this.localOnly ? 
null : this.claudeAgent, @@ -326,6 +406,7 @@ export class Orchestrator { this.config.agent, this.config.performance, this.persistentContextCache, + this.contextRetriever, ) } diff --git a/src/tools/definitions/symbol-lookup.test.ts b/src/tools/definitions/symbol-lookup.test.ts new file mode 100644 index 0000000..f9fc111 --- /dev/null +++ b/src/tools/definitions/symbol-lookup.test.ts @@ -0,0 +1,78 @@ +import { describe, it, expect, beforeEach, afterEach } from 'vitest' +import { createSymbolLookupTool } from './symbol-lookup' +import { CodebaseIndexer } from '../../index/indexer' +import fs from 'fs' +import path from 'path' +import os from 'os' +import type { IndexConfig } from '../../index/types' + +describe('createSymbolLookupTool', () => { + let tmpDir: string + let indexer: CodebaseIndexer + + beforeEach(async () => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'locode-symtool-')) + const config: IndexConfig = { + root: tmpDir, + ignore: ['node_modules', 'dist', '.git'], + languages: ['typescript'], + storage_dir: path.join(tmpDir, '.locode', 'index'), + auto_update: true, + } + fs.writeFileSync(path.join(tmpDir, 'a.ts'), 'export function foo() { return 1 }\nexport class Bar { method() {} }\n') + indexer = new CodebaseIndexer(config) + await indexer.buildAll() + }) + + afterEach(() => { + fs.rmSync(tmpDir, { recursive: true, force: true }) + }) + + it('finds symbols by name', async () => { + const tool = createSymbolLookupTool(indexer) + const result = await tool.handler({ name: 'foo' }) + expect(result.success).toBe(true) + const symbols = JSON.parse(result.output) + expect(symbols).toHaveLength(1) + expect(symbols[0].name).toBe('foo') + expect(symbols[0].type).toBe('function') + }) + + it('filters by type', async () => { + const tool = createSymbolLookupTool(indexer) + const result = await tool.handler({ name: 'Bar', type: 'class' }) + expect(result.success).toBe(true) + const symbols = JSON.parse(result.output) + expect(symbols).toHaveLength(1) + 
/**
 * Build the `symbol_lookup` tool backed by a codebase index.
 *
 * The handler validates its arguments, refuses to run while the index is not built,
 * and returns the best `maxResults` matches as a JSON array of
 * `{ name, type, file, line, signature, exported }`. Matches are ordered exact name
 * first, then names starting with the query, then other substring matches, so the
 * result cap cannot hide an exact hit.
 *
 * @param indexer - Index to query; only `isIndexed()` and `symbols.search()` are used.
 * @param maxResults - Maximum number of symbols returned (default 10; values below 1
 *   or non-finite values fall back to 10).
 */
export function createSymbolLookupTool(indexer: CodebaseIndexer, maxResults = 10): ToolDefinition {
  const symbolTypes = ['function', 'class', 'method', 'variable', 'type', 'interface', 'enum'] as const
  type LookupType = (typeof symbolTypes)[number]
  const isLookupType = (value: string): value is LookupType =>
    (symbolTypes as readonly string[]).includes(value)

  const limit = Number.isFinite(maxResults) && maxResults >= 1 ? Math.floor(maxResults) : 10
  const fail = (error: string) => ({ success: false, output: '', error })

  return {
    name: 'symbol_lookup',
    description: 'Find function, class, or variable definitions by name in the codebase index. Returns symbol name, type, file, and line number.',
    inputSchema: {
      type: 'object',
      properties: {
        name: { type: 'string', description: 'Symbol name to search for (partial match, case-insensitive)' },
        type: { type: 'string', description: 'Filter by symbol type: function, class, method, variable, type, interface, enum' },
      },
      required: ['name'],
    },
    category: 'search',
    requiresConfirmation: false,
    async handler(args) {
      // A missing name used to be coerced to the string "undefined" and searched for.
      const name = typeof args.name === 'string' ? args.name.trim() : ''
      if (name === '') {
        return fail('symbol_lookup requires a non-empty string `name`.')
      }

      // An unknown type used to produce a silent empty result; report it instead so
      // the caller can correct the request.
      let typeFilter: LookupType | undefined
      if (args.type !== undefined && args.type !== null && args.type !== '') {
        const requested = typeof args.type === 'string' ? args.type.trim().toLowerCase() : ''
        if (!isLookupType(requested)) {
          return fail(`Unknown symbol type "${String(args.type)}". Expected one of: ${symbolTypes.join(', ')}.`)
        }
        typeFilter = requested
      }

      if (!indexer.isIndexed()) {
        return fail('Codebase index not built. Run `locode index` first.')
      }

      const matches = indexer.symbols.search(name, typeFilter ? { type: typeFilter } : undefined)

      const needle = name.toLowerCase()
      const rank = (symbolName: string): number => {
        const lower = symbolName.toLowerCase()
        if (lower === needle) return 0
        return lower.startsWith(needle) ? 1 : 2
      }
      // Array.prototype.sort is stable, so index order is kept within each rank.
      const ranked = [...matches].sort((a, b) => rank(a.name) - rank(b.name))

      return {
        success: true,
        output: JSON.stringify(ranked.slice(0, limit).map(s => ({
          name: s.name,
          type: s.type,
          file: s.file,
          line: s.lineStart,
          signature: s.signature,
          exported: s.exported,
        }))),
      }
    },
  }
}