chocks · chocks · Jun 20, 2026 · Jun 20, 2026
diff --git a/docs/plans/2026-06-20-v04a-impl-progress.md b/docs/plans/2026-06-20-v04a-impl-progress.md
@@ -0,0 +1,71 @@
+# Locode v0.4a — Deterministic Retrieval Core — Implementation Progress
+
+**Started:** 2026-06-20
+**Branch:** `feat/v04a-deterministic-retrieval`
+**Design:** [v0.4 — Codebase Intelligence](2026-03-10-v04-codebase-intelligence.md)
+**Scope:** v0.4 phase A only — file index, symbol index, indexer, context retriever, budget manager, `symbol_lookup` tool. No embedding index, no dependency graph (those are v0.4b/v0.4c).
+
+---
+
+## Definition of Done
+
+- [ ] `index` + `context_retrieval` config sections in schema with defaults synced to `locode.yaml`
+- [ ] `FileIndex` scans a repo respecting `.gitignore` + config ignore patterns, records path/lang/size/hash
+- [ ] `SymbolIndex` extracts functions/classes/types/interfaces/enums for TypeScript + JavaScript (+ Python)
+- [ ] `CodebaseIndexer` orchestrates file + symbol indexes, supports incremental updates via file hashes
+- [ ] `BudgetManager` allocates a token budget across files by priority weight
+- [ ] `ContextRetriever` pipeline: mentioned files → symbol search → sibling tests → rank → truncate to budget
+- [ ] `symbol_lookup` tool registered in the default tool registry
+- [ ] `ContextRetriever` optionally wired into `CodingAgent.ANALYZE` (backwards-compatible — no-op when index absent)
+- [ ] Index persists to disk and loads on subsequent runs
+- [ ] All tests pass, `npm run build` succeeds, lint clean
+
+---
+
+## Implementation Decisions
+
+### Symbol extraction: regex-based, not tree-sitter (for now)
+
+The v0.4 design spec calls for `web-tree-sitter` (WASM) for AST-based symbol extraction. This v0.4a slice ships a **regex-based extractor** instead, behind a `SymbolExtractor` interface so `web-tree-sitter` can be swapped in later without changing `SymbolIndex`'s API.
+
+**Why:**
+- Keeps the bundle small (AGENTS.md: don't add deps without considering bundle size). `web-tree-sitter` + per-language `.wasm` grammars add ~2-5 MB of static assets.
+- Regex extraction is fully testable without loading WASM, and covers the common cases (top-level functions, classes, interfaces, exported symbols) that the agent needs for `symbol_lookup`.
+- The `SymbolExtractor` interface means the tree-sitter adapter is a drop-in replacement later — no API churn.
+
+**Trade-off:** regex extraction misses nested scopes, overloaded signatures, and some edge cases. Acceptable for v0.4a's "fast path first" goal; the LLM-driven ANALYZE fallback still covers complex cases.
+
+### `find_references` tool deferred to v0.4b
+
+The design spec lists `find_references` as a v0.4 tool, but it depends on the dependency graph (import tracking), which is v0.4b scope. v0.4a therefore ships `symbol_lookup` only.
+
+---
+
+## Findings / Things to Improve
+
+Observed while implementing v0.4a. Not blocking; captured for future work.
+
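+### 1. Config defaults: `.default({})` does not recursively apply inner defaults in Zod 4
+
+In Zod 4, `SomeSchema.default({})` short-circuits when the input is `undefined` and returns the literal `{}` as-is, NOT the schema-parsed result, so inner field defaults are NOT applied. (Zod 3 passed the default value back through the schema, which is why this pattern appears in older examples.) The fix is `.default(SomeSchema.parse({}))` (or reuse a pre-parsed `DEFAULT_X` constant); Zod 4's `.prefault({})` is the library-provided way to get the pre-parse behavior.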
+
+**Impact:** Any future config section added with `.default({})` will silently produce empty objects instead of defaulted ones. The existing `runtime`, `performance`, and `agent` sections all pass explicit default objects, which masks this. Consider a helper or a lint rule.
+
+**Location:** `src/config/schema.ts` — `index` and `context_retrieval` now use `DEFAULT_INDEX_CONFIG` / `DEFAULT_CONTEXT_RETRIEVAL_CONFIG`.
+
+### 2. `CONFIG_TEMPLATE` in `src/cli/setup.ts` is a third source of truth (known)
+
+Already noted in `docs/plans/misc-todos.md` under "Config template duplication". The template is intentionally minimal (only required fields), so new defaulted sections like `index` and `context_retrieval` don't need to be added — schema defaults cover them. But the duplication risk remains for any future *required* config field. The proposed fix (single `DEFAULT_CONFIG` constant) would resolve this.
+
+### 3. `CodingAgent.analyze` fast-path is ripe for `ContextRetriever` integration
+
+The existing `analyze()` in `src/coding/coding-agent.ts:219` already does mentioned-file extraction + sibling-test discovery + LLM fallback — essentially a hand-rolled mini-retriever. v0.4a's `ContextRetriever` generalizes this. Once the retriever is wired in, the agent's `extractMentionedFiles` / `findLikelyTestFiles` / `pushBudgetedFile` logic could be delegated to it, reducing duplication. Deferred to avoid changing working behavior in this slice.
+
+---
+
+## Verification Status
+
+- `npm test` passes — 398 tests (60 new for v0.4a)
+- `npm run build` passes
+- `npm run lint` passes
+
diff --git a/locode.yaml b/locode.yaml
@@ -64,6 +64,32 @@ performance:
   max_prompt_chars: 24000
   lazy_semantic_search: true
 
+index:
+  enabled: true
+  ignore:
+    - node_modules
+    - dist
+    - .git
+    - coverage
+    - "*.min.js"
+    - "*.lock"
+  languages:
+    - typescript
+    - javascript
+    - python
+    - go
+    - rust
+  chunk_size: 50
+  storage_dir: .locode/index
+  auto_update: true
+
+context_retrieval:
+  max_files: 5
+  max_tokens_per_file: 2000
+  max_total_tokens: 8000
+  strategy: deterministic-first
+  confidence_threshold: 0.7
+
 # mcp_servers:
 #   linear:
 #     type: remote

diff --git a/src/coding/coding-agent.ts b/src/coding/coding-agent.ts
@@ -19,6 +19,7 @@ import type { Planner } from './planner'
 import type { AgentMemory } from './memory'
 import type { PerformanceConfig } from '../config/schema'
 import { PersistentContextCache } from '../runtime/persistent-context-cache'
+import type { ContextRetriever } from '../context/context-retriever'
 
 interface LLMAgent {
   run(prompt: string, previousSummary?: string, repoContext?: string): Promise<AgentResult>
@@ -80,6 +81,7 @@ export class CodingAgent extends EventEmitter {
     private config: AgentConfig,
     private performance?: PerformanceConfig,
     private persistentCache: PersistentContextCache | null = null,
+    private contextRetriever: ContextRetriever | null = null,
   ) {
     super()
   }
@@ -243,6 +245,34 @@ export class CodingAgent extends EventEmitter {
       }
     }
 
+    if (this.contextRetriever) {
+      const retrieved = await this.contextRetriever.retrieve(prompt)
+      if (retrieved.confidence >= (this.performance?.lazy_semantic_search !== false ? 0.7 : 0.5)) {
+        const gathered = this.applyPromptBudgetToGatheredContext(
+          {
+            files: retrieved.files,
+            searchResults: retrieved.searchResults,
+            memory: retrieved.memory,
+          },
+          promptBudget,
+        )
+        if (this.performance?.cache_context) {
+          this.contextCache.set(cacheKey, gathered)
+          if (this.persistentCache) {
+            await this.persistentCache.set(prompt, gathered)
+          }
+        }
+        for (const file of retrieved.files) {
+          this.memory.record({ type: 'file_read', detail: file.path })
+        }
+        return {
+          gathered,
+          tokensUsed: { input: 0, output: 0 },
+          toolCalls: [],
+        }
+      }
+    }
+
     const files: GatheredContext['files'] = []
     const searchResults: GatheredContext['searchResults'] = []
 

diff --git a/src/config/schema.test.ts b/src/config/schema.test.ts
@@ -143,4 +143,88 @@ describe('ConfigSchema', () => {
     expect(result.performance.max_prompt_chars).toBe(24000)
     expect(result.performance.lazy_semantic_search).toBe(true)
   })
+
+  it('defaults index config when omitted', () => {
+    const result = ConfigSchema.parse(baseConfig)
+    expect(result.index.enabled).toBe(true)
+    expect(result.index.ignore).toContain('node_modules')
+    expect(result.index.ignore).toContain('dist')
+    expect(result.index.ignore).toContain('.git')
+    expect(result.index.languages).toContain('typescript')
+    expect(result.index.languages).toContain('javascript')
+    expect(result.index.chunk_size).toBe(50)
+    expect(result.index.storage_dir).toBe('.locode/index')
+    expect(result.index.auto_update).toBe(true)
+  })
+
+  it('accepts custom index config', () => {
+    const result = ConfigSchema.parse({
+      ...baseConfig,
+      index: {
+        enabled: false,
+        ignore: ['build', 'vendor'],
+        languages: ['typescript', 'go'],
+        chunk_size: 100,
+        storage_dir: '.cache/index',
+        auto_update: false,
+      },
+    })
+    expect(result.index.enabled).toBe(false)
+    expect(result.index.ignore).toEqual(['build', 'vendor'])
+    expect(result.index.languages).toEqual(['typescript', 'go'])
+    expect(result.index.chunk_size).toBe(100)
+    expect(result.index.storage_dir).toBe('.cache/index')
+    expect(result.index.auto_update).toBe(false)
+  })
+
+  it('rejects non-positive chunk_size', () => {
+    expect(() => ConfigSchema.parse({
+      ...baseConfig,
+      index: { chunk_size: 0 },
+    })).toThrow()
+  })
+
+  it('defaults context_retrieval config when omitted', () => {
+    const result = ConfigSchema.parse(baseConfig)
+    expect(result.context_retrieval.max_files).toBe(5)
+    expect(result.context_retrieval.max_tokens_per_file).toBe(2000)
+    expect(result.context_retrieval.max_total_tokens).toBe(8000)
+    expect(result.context_retrieval.strategy).toBe('deterministic-first')
+    expect(result.context_retrieval.confidence_threshold).toBe(0.7)
+  })
+
+  it('accepts custom context_retrieval config', () => {
+    const result = ConfigSchema.parse({
+      ...baseConfig,
+      context_retrieval: {
+        max_files: 10,
+        max_tokens_per_file: 4000,
+        max_total_tokens: 16000,
+        strategy: 'semantic-first',
+        confidence_threshold: 0.85,
+      },
+    })
+    expect(result.context_retrieval.max_files).toBe(10)
+    expect(result.context_retrieval.max_total_tokens).toBe(16000)
+    expect(result.context_retrieval.strategy).toBe('semantic-first')
+    expect(result.context_retrieval.confidence_threshold).toBe(0.85)
+  })
+
+  it('rejects context_retrieval confidence_threshold out of range', () => {
+    expect(() => ConfigSchema.parse({
+      ...baseConfig,
+      context_retrieval: { confidence_threshold: 1.5 },
+    })).toThrow()
+    expect(() => ConfigSchema.parse({
+      ...baseConfig,
+      context_retrieval: { confidence_threshold: -0.1 },
+    })).toThrow()
+  })
+
+  it('rejects invalid context_retrieval strategy', () => {
+    expect(() => ConfigSchema.parse({
+      ...baseConfig,
+      context_retrieval: { strategy: 'random' },
+    })).toThrow()
+  })
 })
diff --git a/src/config/schema.ts b/src/config/schema.ts
@@ -44,8 +44,31 @@ export const PerformanceConfigSchema = z.object({
   lazy_semantic_search: z.boolean().default(true),
 })
 
+export const IndexConfigSchema = z.object({
+  enabled: z.boolean().default(true),
+  ignore: z.array(z.string()).default([
+    'node_modules', 'dist', '.git', 'coverage', '*.min.js', '*.lock',
+  ]),
+  languages: z.array(z.string()).default([
+    'typescript', 'javascript', 'python', 'go', 'rust',
+  ]),
+  chunk_size: z.number().int().positive().default(50),
+  storage_dir: z.string().default('.locode/index'),
+  auto_update: z.boolean().default(true),
+})
+
+export const ContextRetrievalConfigSchema = z.object({
+  max_files: z.number().int().min(1).default(5),
+  max_tokens_per_file: z.number().int().positive().default(2000),
+  max_total_tokens: z.number().int().positive().default(8000),
+  strategy: z.enum(['deterministic-first', 'semantic-first']).default('deterministic-first'),
+  confidence_threshold: z.number().min(0).max(1).default(0.7),
+})
+
 export const DEFAULT_RUNTIME_CONFIG = RuntimeConfigSchema.parse({})
 export const DEFAULT_PERFORMANCE_CONFIG = PerformanceConfigSchema.parse({})
+export const DEFAULT_INDEX_CONFIG = IndexConfigSchema.parse({})
+export const DEFAULT_CONTEXT_RETRIEVAL_CONFIG = ContextRetrievalConfigSchema.parse({})
 
 export const ConfigSchema = z.object({
   local_llm: z.object({
@@ -95,6 +118,8 @@ export const ConfigSchema = z.object({
     max_prompt_chars: 24000,
     lazy_semantic_search: true,
   }),
+  index: IndexConfigSchema.default(DEFAULT_INDEX_CONFIG),
+  context_retrieval: ContextRetrievalConfigSchema.default(DEFAULT_CONTEXT_RETRIEVAL_CONFIG),
   mcp_servers: z.record(z.string(), McpServerSchema).default({}),
   safety: z.object({
     always_confirm: z.array(z.string()).default([]),
@@ -111,3 +136,5 @@ export const ConfigSchema = z.object({
 
 export type Config = z.infer<typeof ConfigSchema>
 export type PerformanceConfig = z.infer<typeof PerformanceConfigSchema>
+export type IndexConfig = z.infer<typeof IndexConfigSchema>
+export type ContextRetrievalConfig = z.infer<typeof ContextRetrievalConfigSchema>
diff --git a/src/context/budget-manager.test.ts b/src/context/budget-manager.test.ts
@@ -0,0 +1,85 @@
+import { describe, it, expect } from 'vitest'
+import { BudgetManager } from './budget-manager'
+import type { BudgetPriority } from './types'
+
+describe('BudgetManager', () => {
+  it('allocates full budget to a single file', () => {
+    const mgr = new BudgetManager(1000)
+    const result = mgr.allocate([
+      { path: 'a.ts', content: 'x'.repeat(500), priority: 'direct_match' as BudgetPriority },
+    ])
+    expect(result).toHaveLength(1)
+    expect(result[0].tokensUsed).toBe(500)
+    expect(result[0].truncated).toBe(false)
+  })
+
+  it('truncates a file that exceeds max_tokens_per_file', () => {
+    const mgr = new BudgetManager(10000, { maxPerFile: 200 })
+    const result = mgr.allocate([
+      { path: 'a.ts', content: 'x'.repeat(500), priority: 'direct_match' as BudgetPriority },
+    ])
+    expect(result[0].tokensUsed).toBe(200)
+    expect(result[0].truncated).toBe(true)
+    expect(result[0].content).toHaveLength(200)
+  })
+
+  it('gives higher priority files more budget', () => {
+    const mgr = new BudgetManager(600, { maxPerFile: 1000 })
+    const result = mgr.allocate([
+      { path: 'low.ts', content: 'x'.repeat(400), priority: 'dependency' as BudgetPriority },
+      { path: 'high.ts', content: 'x'.repeat(400), priority: 'direct_match' as BudgetPriority },
+    ])
+    const highFile = result.find(r => r.path === 'high.ts')!
+    const lowFile = result.find(r => r.path === 'low.ts')!
+    expect(highFile.tokensUsed).toBeGreaterThan(lowFile.tokensUsed)
+  })
+
+  it('does not exceed total budget', () => {
+    const mgr = new BudgetManager(300, { maxPerFile: 1000 })
+    const result = mgr.allocate([
+      { path: 'a.ts', content: 'x'.repeat(200), priority: 'direct_match' as BudgetPriority },
+      { path: 'b.ts', content: 'x'.repeat(200), priority: 'symbol_match' as BudgetPriority },
+    ])
+    const total = result.reduce((sum, r) => sum + r.tokensUsed, 0)
+    expect(total).toBeLessThanOrEqual(300)
+  })
+
+  it('returns empty array for empty input', () => {
+    const mgr = new BudgetManager(1000)
+    expect(mgr.allocate([])).toEqual([])
+  })
+
+  it('marks files as truncated when total budget is exhausted', () => {
+    const mgr = new BudgetManager(100, { maxPerFile: 1000 })
+    const result = mgr.allocate([
+      { path: 'a.ts', content: 'x'.repeat(80), priority: 'direct_match' as BudgetPriority },
+      { path: 'b.ts', content: 'x'.repeat(80), priority: 'direct_match' as BudgetPriority },
+    ])
+    const total = result.reduce((sum, r) => sum + r.tokensUsed, 0)
+    expect(total).toBe(100)
+    const truncated = result.filter(r => r.truncated)
+    expect(truncated.length).toBeGreaterThan(0)
+  })
+
+  it('respects max_files limit', () => {
+    const mgr = new BudgetManager(10000, { maxPerFile: 1000, maxFiles: 2 })
+    const result = mgr.allocate([
+      { path: 'a.ts', content: 'x', priority: 'direct_match' as BudgetPriority },
+      { path: 'b.ts', content: 'x', priority: 'direct_match' as BudgetPriority },
+      { path: 'c.ts', content: 'x', priority: 'direct_match' as BudgetPriority },
+    ])
+    expect(result).toHaveLength(2)
+  })
+
+  it('sorts output by priority (direct_match first)', () => {
+    const mgr = new BudgetManager(10000, { maxPerFile: 1000 })
+    const result = mgr.allocate([
+      { path: 'dep.ts', content: 'x', priority: 'dependency' as BudgetPriority },
+      { path: 'direct.ts', content: 'x', priority: 'direct_match' as BudgetPriority },
+      { path: 'sym.ts', content: 'x', priority: 'symbol_match' as BudgetPriority },
+    ])
+    expect(result[0].path).toBe('direct.ts')
+    expect(result[1].path).toBe('sym.ts')
+    expect(result[2].path).toBe('dep.ts')
+  })
+})