Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
734ec7b
FE-864: Plan orchestrator brownfield enhancements
kostandinang Jun 15, 2026
1c119e8
FE-864: Add agent-extension-host contract (dual-mode pi harness)
kostandinang Jun 15, 2026
1edf1ed
FE-864: Base the Arc-1 linear stack on agent-extension-host
kostandinang Jun 15, 2026
1137502
FE-864: Reconcile brunch-detect + harness-dep-install scope with code
kostandinang Jun 15, 2026
bb36ee8
FE-864: Correct CLI surface — commands are plan/cook/serve, brigade n…
kostandinang Jun 16, 2026
4dcc365
FE-879: lazy per-slice cook worktrees + shared node_modules
kostandinang Jun 16, 2026
0d747ca
FE-879: register cook-worktree-laziness in PLAN.md
kostandinang Jun 16, 2026
bda74ce
FE-879: fail loudly when a slice id collides with a parent entry
kostandinang Jun 17, 2026
5fcaa55
FE-867: agent-extension-host mode-neutral contract (slice 1)
kostandinang Jun 15, 2026
367828b
FE-867: tighten agent-extension-host neutrality & witness proofs
kostandinang Jun 15, 2026
64831fd
FE-871: brunch toolchain detection — detectProfile (slice 1)
kostandinang Jun 15, 2026
36f0fdf
FE-871: wire brownfield toolchain detection into plan emission
kostandinang Jun 15, 2026
59fc1f3
FE-864: frame app-runtime-probe + integration-oracle as harness-owned…
kostandinang Jun 15, 2026
24df422
FE-871: fail loud on ambiguous evidence; drop per-stack detection bra…
kostandinang Jun 16, 2026
bb1cfee
FE-871: co-locate generated tests in the repo's own test dir (slice 3)
kostandinang Jun 16, 2026
c0b6fee
FE-871: monorepo-robust test-dir + workspace runner detection (slice 4)
kostandinang Jun 16, 2026
1a54d18
FE-871: detect root-level test layouts
kostandinang Jun 17, 2026
d55c7a2
FE-872: classify test-run failures as infra vs test (slice 1)
kostandinang Jun 15, 2026
8934af0
FE-872: name the toolchain cause in infra-failure halt reason (slice 2)
kostandinang Jun 15, 2026
a684367
FE-872: pin greenfield dep manifest/lockfile capture on promotion (sl…
kostandinang Jun 15, 2026
6591d29
FE-864: record dogfood-spike verdict (brownfield cook end-to-end)
kostandinang Jun 16, 2026
c6097eb
FE-872: unify test execution on one runner + verification seam (slice 4)
kostandinang Jun 16, 2026
31dd180
FE-872: classify only missing runner spawn errors as infra
kostandinang Jun 16, 2026
8f8ec2d
FE-872: avoid overclaiming infra halt details
kostandinang Jun 16, 2026
1ddf2c1
FE-875: app runtime probe — boot + HTTP probe + reachability classifi…
kostandinang Jun 16, 2026
1f01f23
FE-875: harness-owned ProbeSpec resolution — buildProbeSpec allocates…
kostandinang Jun 16, 2026
f5dd005
FE-875: bound the app probe's HTTP calls so a hung app can't hang the…
kostandinang Jun 16, 2026
e55bc2e
FE-875: keep app probe deadlines strict
kostandinang Jun 16, 2026
dbbcc08
FE-876: integration oracle Half A — fold runProbe reachability into t…
kostandinang Jun 16, 2026
3cb4bd2
FE-876: integration oracle Half B seam — reachability intent + inject…
kostandinang Jun 16, 2026
6a1f2f2
FE-877: brownfield promotion — commit the cook result onto cook/<runI…
kostandinang Jun 16, 2026
ddf6315
FE-878: brunch serve — one-shot plan-then-cook capstone (closes Arc 1)
kostandinang Jun 16, 2026
3e5ffd0
FE-878: thread launch cwd into brunch serve's cook stage
kostandinang Jun 16, 2026
0aaf8d7
FE-864: reconcile SPEC invariant drift from the Arc-1 stack
kostandinang Jun 16, 2026
343370e
FE-878: extract shared completed-spec gate for plan/serve CLI
kostandinang Jun 16, 2026
4d75ec1
FE-878: extract CLI presentation seam; migrate plan surface (slice 1a)
kostandinang Jun 16, 2026
fe2ef1b
FE-878: migrate cook surface to the presentation seam (slice 1b)
kostandinang Jun 16, 2026
3d7b8d4
FE-878: Ink TUI presenter — egg logo + brigade tracker (slice 2a)
kostandinang Jun 16, 2026
a5026de
FE-878: live waiting-state — pending panel + wait brackets (slice 2b)
kostandinang Jun 16, 2026
f2b98d6
FE-878: own the bus lifecycle so the TUI tears down (ln-review #1)
kostandinang Jun 16, 2026
613e398
FE-878: golden-test the cook banner + summary line strings (ln-review…
kostandinang Jun 16, 2026
38107be
FE-878: brunch wordmark header in brand gradient; revert brigade to m…
kostandinang Jun 16, 2026
48b28bc
FE-878: calm the pending-panel timer to whole seconds
kostandinang Jun 16, 2026
a467303
FE-878: align serve flag handling with cook
kostandinang Jun 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 183 additions & 2 deletions memory/PLAN.md

Large diffs are not rendered by default.

15 changes: 11 additions & 4 deletions memory/SPEC.md

Large diffs are not rendered by default.

539 changes: 535 additions & 4 deletions package-lock.json

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
"drizzle-orm": "^0.45.2",
"embla-carousel-react": "^8.6.0",
"express": "^5.2.1",
"ink": "^7.0.6",
"lucide-react": "^1.8.0",
"md-pen": "^1.2.0",
"motion": "^12.38.0",
Expand Down Expand Up @@ -118,6 +119,7 @@
"code-inspector-plugin": "^1.5.1",
"drizzle-kit": "^0.31.10",
"happy-dom": "^20.8.9",
"ink-testing-library": "^4.0.0",
"oxfmt": "^0.43.0",
"oxlint": "^1.58.0",
"oxlint-tsgolint": "^0.19.0",
Expand Down
144 changes: 144 additions & 0 deletions src/agent-extension-host.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import { readFileSync } from 'node:fs';
import { dirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';

import { describe, expect, it } from 'vitest';

import { type AgentExtensionConsumerWitness, flattenCapabilityIds } from './agent-extension-host.js';
import { createPiActions } from './orchestrator/src/pi-actions.js';
import type { InterviewerTools } from './server/interview.js';
import { createExplorationTools } from './server/tools/index.js';

const here = dirname(fileURLToPath(import.meta.url));

// The cook (`execute`) consumer, described as host plugins — one cook action per
// capability, one capability per plugin. Proven below against the real
// `createPiActions()` surface: the flattened capability ids must equal its keys.
//
// Every `handler` is `null` because the contract carries metadata only, and
// every plugin repeats the witness's own mode (`execute`), which the per-mode
// registration test asserts. `as const satisfies` checks the literal against
// `AgentExtensionConsumerWitness` without widening its literal types.
const cookWitness = {
consumerId: 'cook',
mode: 'execute',
plugins: [
{
id: 'execute.evaluate-done',
mode: 'execute',
capabilities: [
{
id: 'evaluate-done',
summary: 'Decide a slice is done by running its verification targets.',
handler: null,
},
],
},
{
id: 'execute.write-tests',
mode: 'execute',
capabilities: [{ id: 'write-tests', summary: 'Write failing tests for a slice.', handler: null }],
},
{
id: 'execute.write-code',
mode: 'execute',
capabilities: [{ id: 'write-code', summary: 'Write code to make a slice pass.', handler: null }],
},
{
id: 'execute.assess-semantic',
mode: 'execute',
capabilities: [
{ id: 'assess-semantic', summary: 'Assess semantic satisfaction of a slice.', handler: null },
],
},
{
id: 'execute.verify-epic',
mode: 'execute',
capabilities: [{ id: 'verify-epic', summary: 'Write + run an epic integration test.', handler: null }],
},
],
} as const satisfies AgentExtensionConsumerWitness;

// The interview (`elicit`) consumer as the neutrality WITNESS. The interview keeps
// its own runtime (Vercel AI SDK); this only proves its capability surface fits
// the same host contract. `as const` preserves the literal ids for the type-level
// coverage proof below.
//
// The capability ids are compared against real tool keys, so they must be
// spelled exactly as those keys are. The first three plugins hold the native
// interviewer tools, which are covered at the type level only; the
// `elicit.workspace-exploration` plugin is compared at runtime against the keys
// returned by `createExplorationTools`.
const interviewWitness = {
consumerId: 'interview',
mode: 'elicit',
plugins: [
{
id: 'elicit.ask-question',
mode: 'elicit',
capabilities: [{ id: 'ask_question', summary: 'Ask the user a structured question.', handler: null }],
},
{
id: 'elicit.preface',
mode: 'elicit',
capabilities: [
{ id: 'present_preface', summary: 'Present a provisional context preface.', handler: null },
],
},
{
id: 'elicit.phase-closure',
mode: 'elicit',
capabilities: [
{ id: 'propose_phase_closure', summary: 'Propose closing the current phase.', handler: null },
],
},
{
// Looked up by this exact plugin id in the exploration-surface test.
id: 'elicit.workspace-exploration',
mode: 'elicit',
capabilities: [
{ id: 'read_file', summary: 'Read a workspace file.', handler: null },
{ id: 'grep', summary: 'Search workspace file contents.', handler: null },
{ id: 'find_files', summary: 'Find workspace files.', handler: null },
{ id: 'list_directory', summary: 'List a workspace directory.', handler: null },
],
},
],
} as const satisfies AgentExtensionConsumerWitness;

describe('agent-extension-host contract is a mode-neutral core', () => {
  it('the contract module is dependency-free, which is what keeps it mode-neutral', () => {
    const src = readFileSync(join(here, 'agent-extension-host.ts'), 'utf8');
    // No imports is the load-bearing guarantee: a module that references no
    // other module cannot name an `execute`-only type (Slice/Epic/Plan/Toolchain/
    // worktree…) or an SDK type. That makes neutrality structural rather than a
    // denylist of names we have to remember to update.
    //
    // Every syntactic route to another module is checked, not only the import
    // statement: a TypeScript import type or a re-export reaches a foreign type
    // just as well and would otherwise slip past this test unnoticed.
    //
    // These are textual checks, so a comment in the contract file that spells
    // out one of these forms verbatim will trip them too — keep such examples
    // out of that file.
    const moduleReferences = [
      // Import statements, including side-effect imports with or without a
      // space before the quoted specifier.
      /^\s*import[\s{*'"]/m,
      // Call-shaped imports: dynamic imports and TypeScript import types.
      /\bimport\s*\(/,
      // Re-exports that pull names in from another module. `[^;]` keeps the
      // match inside a single export statement.
      /^\s*export\b[^;]*?\bfrom\s*['"]/m,
    ];
    for (const pattern of moduleReferences) {
      expect(src).not.toMatch(pattern);
    }
  });

  it('a consumer witness only loads plugins of its own mode (per-mode registration)', () => {
    // A witness describes one consumer driving the host in one mode, so every
    // plugin it lists must declare that same mode.
    for (const witness of [cookWitness, interviewWitness]) {
      for (const plugin of witness.plugins) {
        expect(plugin.mode).toBe(witness.mode);
      }
    }
  });
});

describe('two-consumer proof — both real surfaces fit the host contract', () => {
  it('the cook execute surface matches the registered capabilities exactly', () => {
    const registeredIds = flattenCapabilityIds(cookWitness);
    // Capability ids are the host's dispatch keys, so a repeated id is a
    // collision. Check this before building the Set below, which would silently
    // collapse duplicates and let the "exact match" pass anyway.
    expect(registeredIds).toHaveLength(new Set(registeredIds).size);

    const registered = new Set(registeredIds);
    const actual = new Set(Object.keys(createPiActions()));
    expect(registered).toEqual(actual);
  });

  it('the interview exploration plugin matches the real tool surface exactly', () => {
    // `createExplorationTools` is DB-free, so this family is proven bidirectionally
    // against live code: the witness may neither omit a real tool nor invent a
    // phantom one. The three native interviewer tools (ask_question /
    // present_preface / propose_phase_closure) can't be checked this way —
    // constructing them needs a live DB — so their coverage is type-level only
    // (the `keyof InterviewerTools` assertion below), which is superset-only: it
    // proves the witness omits no real tool, not that it invents none.
    const explorationPlugin = interviewWitness.plugins.find((p) => p.id === 'elicit.workspace-exploration');
    // Fail on the lookup itself if the plugin id is ever renamed; otherwise the
    // optional chain below degrades to an empty set and the failure reads as
    // "every tool is missing" instead of "plugin not found".
    expect(explorationPlugin).toBeDefined();

    const witnessed = new Set(explorationPlugin?.capabilities.map((c) => c.id));
    const actual = new Set(Object.keys(createExplorationTools(here)));
    expect(witnessed).toEqual(actual);
  });

  it('the interview witness covers every interviewer tool id (type-enforced under lint --type-check)', () => {
    type ElicitCapabilityId = (typeof interviewWitness.plugins)[number]['capabilities'][number]['id'];
    // If the interview adds a tool not represented in the witness, `Covered`
    // becomes `false` and this assignment fails the type-aware lint gate. The
    // runtime assertion only keeps the test from being empty; the proof is the
    // type of `covered`.
    type Covered = keyof InterviewerTools extends ElicitCapabilityId ? true : false;
    const covered: Covered = true;
    expect(covered).toBe(true);
  });
});
58 changes: 58 additions & 0 deletions src/agent-extension-host.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// Agent extension host — the mode-neutral contract (FE-867).
//
// The pi harness is reused across two jobs: driving specification (`elicit`)
// and driving cook (`execute`). Rather than two harnesses, treat it as one
// dual-mode *agent-extension host*: a mode-agnostic core that consumers extend
// by registering capabilities as per-mode plugins. Modes differ only by which
// plugins they load.
//
// This module is the serialization point with the parallel pi-harness work that
// owns the core *implementation*. It deliberately defines only transport-safe
// contract metadata — no session lifecycle, no stream/dispatch runtime, no SDK
// types — so it stays neutral across both consumers (cook via the pi SDK, the
// interview via the Vercel AI SDK) and across whichever runtime lands later.
//
// Invariant (checkable): this file has no imports and names no `execute`-only
// concept (slice / epic / plan / worktree / test-runner / toolchain). If it did,
// it would no longer be a mode-neutral core. See agent-extension-host.test.ts.

/**
 * The two ways the shared agent-extension host is driven: `elicit` drives
 * specification (the interview consumer) and `execute` drives cook.
 */
export type AgentExtensionMode = 'elicit' | 'execute';

/**
 * Transport-safe descriptor of one capability a consumer registers against the
 * host. Mirrors `capability-registry.ts`: metadata only — the executable handler
 * lives behind the host's dispatch, so this contract never owns runtime semantics.
 */
export interface AgentExtensionCapabilityContract {
/** Capability identifier; `flattenCapabilityIds` enumerates these as the host's dispatch keys. */
id: string;
/** Human-readable description of what the capability does. */
summary: string;
/** Always `null`: the descriptor is metadata only and never carries the executable handler. */
handler: null;
}

/**
 * A plugin is the unit of per-mode registration: a named bundle of capabilities
 * loaded into one mode. "Modes differ only by which plugins they load" is exactly
 * this — `execute` loads the cook plugins, `elicit` loads the interview plugins.
 */
export interface AgentExtensionPluginContract {
/** Name of this plugin (the bundle), as opposed to the ids of the capabilities inside it. */
id: string;
/** The single mode this plugin is loaded into. */
mode: AgentExtensionMode;
/** Capability descriptors bundled by this plugin; read-only. */
capabilities: readonly AgentExtensionCapabilityContract[];
}

/**
 * A consumer (e.g. cook, the interview) described as the set of plugins it loads
 * into a single mode. Used to prove a real consumer fits the host contract
 * without migrating its runtime — the "witness" of mode-neutrality.
 */
export interface AgentExtensionConsumerWitness {
/** Name of the consumer being described. */
consumerId: string;
/** The one mode in which this consumer drives the host. */
mode: AgentExtensionMode;
/**
 * Plugins the consumer loads. Each is expected to declare the same mode as the
 * witness; this type does not enforce that — agent-extension-host.test.ts does.
 */
plugins: readonly AgentExtensionPluginContract[];
}

/**
 * List every capability id a consumer registers — the host's dispatch keys.
 *
 * Ids are returned in declaration order (plugin by plugin, capability by
 * capability) and are not de-duplicated.
 *
 * @param witness The consumer whose plugins are walked.
 * @returns One id per registered capability; empty when there are none.
 */
export function flattenCapabilityIds(witness: AgentExtensionConsumerWitness): string[] {
  const ids: string[] = [];
  for (const { capabilities } of witness.plugins) {
    for (const { id } of capabilities) {
      ids.push(id);
    }
  }
  return ids;
}
170 changes: 170 additions & 0 deletions src/orchestrator/src/app-probe.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
// The probe boots a *real* app process in a tmp worktree and exercises it over
// the wire — no mocks — so these tests pin the actual boot/ready/probe/teardown
// behavior the orphan check depends on. Apps are zero-dep `node:http` scripts.

import { mkdtempSync, rmSync, writeFileSync } from 'node:fs';
import { tmpdir } from 'node:os';
import { join } from 'node:path';

import { afterEach, describe, expect, it } from 'vitest';

import { buildProbeSpec, runProbe } from './app-probe.js';
import type { ProbeSpec } from './types.js';

// Sandbox directories created by the running test, removed once it finishes.
const dirs: string[] = [];

afterEach(() => {
  // Drain the list first so the next test starts from an empty registry, then
  // delete each directory; `force` tolerates one that is already gone.
  const created = dirs.splice(0);
  created.forEach((dir) => {
    rmSync(dir, { recursive: true, force: true });
  });
});

/**
 * Create a throwaway directory containing `server.js` with the given source and
 * register it for removal after the current test.
 *
 * @param serverSource JavaScript written verbatim to `server.js`.
 * @returns Absolute path of the new directory.
 */
function sandbox(serverSource: string): string {
  const prefix = join(tmpdir(), 'app-probe-');
  const root = mkdtempSync(prefix);
  dirs.push(root);

  const entryPoint = join(root, 'server.js');
  writeFileSync(entryPoint, serverSource);
  return root;
}

/**
 * Source for a zero-dependency `node:http` app that answers each path in
 * `routes` with its mapped status code and anything else with 404. The app
 * listens on 127.0.0.1 at the port given by the `PORT` environment variable.
 */
const appServing = (routes: Record<string, number>): string => {
  const lines = [
    `const http = require('node:http');`,
    `const routes = ${JSON.stringify(routes)};`,
    `http.createServer((req, res) => {`,
    ` const status = routes[req.url] ?? 404;`,
    ` res.writeHead(status); res.end(String(status));`,
    `}).listen(Number(process.env.PORT), '127.0.0.1');`,
  ];
  // Every line, including the last, is newline-terminated.
  return lines.map((line) => `${line}\n`).join('');
};

// Exercises the harness-owned spec builder the way a real caller would: the test
// names only the boot argv and the two paths, and `buildProbeSpec` picks the port
// and assembles the URLs the app boots on. The spec is resolved before the
// sandbox directory is created.
async function specFor(routes: Record<string, number>): Promise<{ spec: ProbeSpec; dir: string }> {
  const spec = await buildProbeSpec({
    boot: ['node', 'server.js'],
    readyPath: '/health',
    featurePath: '/feature',
  });
  const serverSource = appServing(routes);
  const dir = sandbox(serverSource);
  return { spec, dir };
}

describe('runProbe classifies real app reachability', () => {
  // Placeholder URL for specs whose boot command never brings an app up.
  const unusedUrl = 'http://127.0.0.1:1/x';

  it('an app whose feature endpoint answers 2xx → reachable', async () => {
    const probe = await specFor({ '/health': 200, '/feature': 200 });
    const { kind, reachable, status } = await runProbe(probe.spec, probe.dir);
    expect(kind).toBe('reachable');
    expect(reachable).toBe(true);
    expect(status).toBe(200);
  });

  it('an app that boots but 404s the feature endpoint → not-reachable (the orphan)', async () => {
    // A feature module that exists but was never wired in looks like this from
    // the outside: the server is up, the route is absent.
    const probe = await specFor({ '/health': 200 });
    const { kind, reachable, status } = await runProbe(probe.spec, probe.dir);
    expect(kind).toBe('not-reachable');
    expect(reachable).toBe(false);
    expect(status).toBe(404);
  });

  it('a boot command that exits immediately → infra (distinct from not-reachable)', async () => {
    const dir = sandbox('process.exit(1);\n');
    const { kind, reachable } = await runProbe(
      { boot: ['node', 'server.js'], readyUrl: unusedUrl, featureUrl: unusedUrl },
      dir,
    );
    expect(kind).toBe('infra');
    expect(reachable).toBe(false);
  });

  it('a missing boot binary → infra, not a crash', async () => {
    const dir = sandbox(appServing({ '/health': 200 }));
    const started = Date.now();
    const { kind } = await runProbe(
      {
        boot: ['definitely-not-a-real-binary-xyz'],
        readyUrl: unusedUrl,
        featureUrl: unusedUrl,
      },
      dir,
    );
    expect(kind).toBe('infra');
    // The probe must return promptly when the binary cannot be launched.
    expect(Date.now() - started).toBeLessThan(1_000);
  });
});

describe('runProbe bounds its HTTP calls so a hung app cannot hang the probe', () => {
  // Source for a server that accepts connections (and the HTTP request) but
  // answers only the paths in `readyRoutes`; every other request is left hanging.
  // This is the case a wall-clock deadline alone can't catch, because a bare
  // `await fetch` would block forever between deadline checks.
  const neverResponds = (readyRoutes: Record<string, number> = {}): string =>
    [
      `const http = require('node:http');\n`,
      `const ready = ${JSON.stringify(readyRoutes)};\n`,
      `http.createServer((req, res) => {\n`,
      ` if (ready[req.url] !== undefined) { res.writeHead(ready[req.url]); res.end('ok'); return; }\n`,
      ` /* otherwise: never respond */\n`,
      `}).listen(Number(process.env.PORT), '127.0.0.1');\n`,
    ].join('');

  // Both cases boot the same way and differ only in which paths the app answers.
  const hungAppSpec = () =>
    buildProbeSpec({
      boot: ['node', 'server.js'],
      readyPath: '/health',
      featurePath: '/feature',
    });

  it('a ready path that accepts connections but never responds → infra within the deadline', async () => {
    const spec = await hungAppSpec();
    const dir = sandbox(neverResponds());
    const started = Date.now();
    const { kind } = await runProbe(spec, dir, { readyTimeoutMs: 600, readyAttemptMs: 2_000 });
    expect(kind).toBe('infra');
    expect(Date.now() - started).toBeLessThan(1_200);
  });

  it('a booted app whose feature endpoint never responds → infra, not a hang', async () => {
    const spec = await hungAppSpec();
    const dir = sandbox(neverResponds({ '/health': 200 }));
    const { kind, output } = await runProbe(spec, dir, { requestTimeoutMs: 300 });
    expect(kind).toBe('infra');
    expect(output).toMatch(/feature probe request failed/);
  });
});

describe('runProbe tears the boot process down', () => {
  it('the booted app is no longer listening after the probe returns', async () => {
    const probe = await specFor({ '/health': 200, '/feature': 200 });
    await runProbe(probe.spec, probe.dir);
    // With the app gone, nothing should be left on the port it bound, so a new
    // request to the feature URL has to fail.
    const requestAfterTeardown = fetch(probe.spec.featureUrl);
    await expect(requestAfterTeardown).rejects.toThrow();
  });
});

describe('buildProbeSpec resolves a target into a runnable spec', () => {
  it('allocates a port and assembles ready/feature URLs from the paths', async () => {
    const { boot, env, readyUrl, featureUrl } = await buildProbeSpec({
      boot: ['node', 'server.js'],
      readyPath: '/health',
      featurePath: '/feature',
    });
    const port = Number(env?.PORT);
    const origin = `http://127.0.0.1:${port}`;
    expect(port).toBeGreaterThan(0);
    expect(readyUrl).toBe(`${origin}/health`);
    expect(featureUrl).toBe(`${origin}/feature`);
    // The boot argv is passed through untouched.
    expect(boot).toEqual(['node', 'server.js']);
  });

  it('layers caller env under the allocated PORT so PORT always wins', async () => {
    const { env } = await buildProbeSpec({
      boot: ['node', 'server.js'],
      readyPath: '/',
      featurePath: '/',
      env: { NODE_ENV: 'test', PORT: '1' },
    });
    // Caller variables survive, but the caller's PORT is replaced by the
    // allocated one.
    expect(env?.NODE_ENV).toBe('test');
    expect(Number(env?.PORT)).toBeGreaterThan(1);
  });

  it('hands out distinct ports across concurrent allocations', async () => {
    const pending = Array.from({ length: 8 }, () =>
      buildProbeSpec({ boot: ['x'], readyPath: '/', featurePath: '/' }),
    );
    const specs = await Promise.all(pending);
    const ports = specs.map(({ env }) => Number(env?.PORT));
    expect(new Set(ports).size).toBe(ports.length);
  });
});
Loading
Loading