hashintel · kostandinang · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
diff --git a/memory/PLAN.md b/memory/PLAN.md
diff --git a/memory/SPEC.md b/memory/SPEC.md
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -83,6 +83,7 @@
     "drizzle-orm": "^0.45.2",
     "embla-carousel-react": "^8.6.0",
     "express": "^5.2.1",
+    "ink": "^7.0.6",
     "lucide-react": "^1.8.0",
     "md-pen": "^1.2.0",
     "motion": "^12.38.0",
@@ -118,6 +119,7 @@
     "code-inspector-plugin": "^1.5.1",
     "drizzle-kit": "^0.31.10",
     "happy-dom": "^20.8.9",
+    "ink-testing-library": "^4.0.0",
     "oxfmt": "^0.43.0",
     "oxlint": "^1.58.0",
     "oxlint-tsgolint": "^0.19.0",

diff --git a/src/agent-extension-host.test.ts b/src/agent-extension-host.test.ts
@@ -0,0 +1,144 @@
+import { readFileSync } from 'node:fs';
+import { dirname, join } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+import { describe, expect, it } from 'vitest';
+
+import { type AgentExtensionConsumerWitness, flattenCapabilityIds } from './agent-extension-host.js';
+import { createPiActions } from './orchestrator/src/pi-actions.js';
+import type { InterviewerTools } from './server/interview.js';
+import { createExplorationTools } from './server/tools/index.js';
+
+const here = dirname(fileURLToPath(import.meta.url)); // directory containing this test file
+
+// The cook (`execute`) consumer as host plugins: one plugin per cook action, each
+// carrying a single metadata-only capability. Proven below against `createPiActions()`.
+const cookWitness = {
+  consumerId: 'cook',
+  mode: 'execute',
+  plugins: [
+    {
+      id: 'execute.evaluate-done',
+      mode: 'execute',
+      capabilities: [
+        {
+          id: 'evaluate-done',
+          summary: 'Decide a slice is done by running its verification targets.',
+          handler: null,
+        },
+      ],
+    },
+    {
+      id: 'execute.write-tests',
+      mode: 'execute',
+      capabilities: [{ id: 'write-tests', summary: 'Write failing tests for a slice.', handler: null }],
+    },
+    {
+      id: 'execute.write-code',
+      mode: 'execute',
+      capabilities: [{ id: 'write-code', summary: 'Write code to make a slice pass.', handler: null }],
+    },
+    {
+      id: 'execute.assess-semantic',
+      mode: 'execute',
+      capabilities: [
+        { id: 'assess-semantic', summary: 'Assess semantic satisfaction of a slice.', handler: null },
+      ],
+    },
+    {
+      id: 'execute.verify-epic',
+      mode: 'execute',
+      capabilities: [{ id: 'verify-epic', summary: 'Write + run an epic integration test.', handler: null }],
+    },
+  ],
+} as const satisfies AgentExtensionConsumerWitness;
+
+// The interview (`elicit`) consumer as the neutrality WITNESS. The interview keeps
+// its own runtime (Vercel AI SDK); this only proves its capability surface fits the
+// same host contract. Capability ids are the tool keys verbatim, and `as const` keeps
+// them as literal types — the runtime and type-level coverage proofs below need both.
+const interviewWitness = {
+  consumerId: 'interview',
+  mode: 'elicit',
+  plugins: [
+    {
+      id: 'elicit.ask-question',
+      mode: 'elicit',
+      capabilities: [{ id: 'ask_question', summary: 'Ask the user a structured question.', handler: null }],
+    },
+    {
+      id: 'elicit.preface',
+      mode: 'elicit',
+      capabilities: [
+        { id: 'present_preface', summary: 'Present a provisional context preface.', handler: null },
+      ],
+    },
+    {
+      id: 'elicit.phase-closure',
+      mode: 'elicit',
+      capabilities: [
+        { id: 'propose_phase_closure', summary: 'Propose closing the current phase.', handler: null },
+      ],
+    },
+    {
+      id: 'elicit.workspace-exploration',
+      mode: 'elicit',
+      capabilities: [
+        { id: 'read_file', summary: 'Read a workspace file.', handler: null },
+        { id: 'grep', summary: 'Search workspace file contents.', handler: null },
+        { id: 'find_files', summary: 'Find workspace files.', handler: null },
+        { id: 'list_directory', summary: 'List a workspace directory.', handler: null },
+      ],
+    },
+  ],
+} as const satisfies AgentExtensionConsumerWitness;
+
+describe('agent-extension-host contract is a mode-neutral core', () => {
+  it('the contract module is dependency-free, which is what keeps it mode-neutral', () => {
+    const src = readFileSync(join(here, 'agent-extension-host.ts'), 'utf8');
+    // No imports is the load-bearing guarantee: a module that imports nothing cannot
+    // reference an `execute`-only type (Slice/Epic/Plan/Toolchain/worktree…) or an SDK
+    // type — neutrality is structural, not a denylist of names. `import\b` also catches
+    // `import'x'` and `import(…)`; the second arm catches `export … from '…'` re-exports.
+    expect(src).not.toMatch(/^\s*(?:import\b|export\b[^;\n]*\bfrom\s*['"])/m);
+  });
+
+  it('a consumer witness only loads plugins of its own mode (per-mode registration)', () => {
+    for (const witness of [cookWitness, interviewWitness]) {
+      for (const plugin of witness.plugins) {
+        expect(plugin.mode).toBe(witness.mode);
+      }
+    }
+  });
+});
+
+describe('two-consumer proof — both real surfaces fit the host contract', () => {
+  it('the cook execute surface matches the registered capabilities exactly', () => {
+    const registered = flattenCapabilityIds(cookWitness).sort(); // arrays, not Sets: a duplicated id fails
+    const actual = Object.keys(createPiActions()).sort();
+    expect(registered).toEqual(actual);
+  });
+
+  it('the interview exploration plugin matches the real tool surface exactly', () => {
+    // `createExplorationTools` is DB-free, so this family is proven bidirectionally
+    // against live code: the witness may neither omit a real tool, invent a phantom
+    // one, nor list an id twice (sorted arrays, not Sets, so duplicates still fail).
+    // The three native interviewer tools (ask_question / present_preface /
+    // propose_phase_closure) can't be checked this way — constructing them needs a
+    // live DB — so their coverage is type-level only (the `keyof InterviewerTools`
+    // assertion below), which is superset-only: no real tool omitted, phantoms unchecked.
+    const explorationPlugin = interviewWitness.plugins.find((p) => p.id === 'elicit.workspace-exploration');
+    const witnessed = [...(explorationPlugin?.capabilities.map((c) => c.id) ?? [])].sort();
+    const actual = Object.keys(createExplorationTools(here)).sort();
+    expect(witnessed).toEqual(actual);
+  });
+
+  it('the interview witness covers every interviewer tool id (type-enforced under lint --type-check)', () => {
+    type ElicitCapabilityId = (typeof interviewWitness.plugins)[number]['capabilities'][number]['id'];
+    // If the interview adds a tool not represented in the witness, `Covered`
+    // becomes `false` and this assignment fails the type-aware lint gate.
+    type Covered = keyof InterviewerTools extends ElicitCapabilityId ? true : false;
+    const covered: Covered = true;
+    expect(covered).toBe(true);
+  });
+});
diff --git a/src/agent-extension-host.ts b/src/agent-extension-host.ts
@@ -0,0 +1,58 @@
+// Agent extension host — the mode-neutral contract (FE-867).
+//
+// The pi harness is reused across two jobs: driving specification (`elicit`)
+// and driving cook (`execute`). Rather than two harnesses, treat it as one
+// dual-mode *agent-extension host*: a mode-agnostic core that consumers extend
+// by registering capabilities as per-mode plugins. Modes differ only by which
+// plugins they load.
+//
+// This module is the serialization point with the parallel pi-harness work that
+// owns the core *implementation*. It deliberately defines only transport-safe
+// contract metadata — no session lifecycle, no stream/dispatch runtime, no SDK
+// types — so it stays neutral across both consumers (cook via the pi SDK, the
+// interview via the Vercel AI SDK) and across whichever runtime lands later.
+//
+// Invariant (checkable): this file has no imports, so its code cannot reference an
+// `execute`-only type (slice / epic / plan / worktree / test-runner / toolchain) or an
+// SDK type; otherwise it would not be a mode-neutral core. See agent-extension-host.test.ts.
+
+/** How the shared host is driven: `elicit` drives specification, `execute` drives cook. */
+export type AgentExtensionMode = 'elicit' | 'execute';
+
+/**
+ * Transport-safe descriptor of one capability a consumer registers against the
+ * host. Mirrors `capability-registry.ts`: metadata only — the executable handler
+ * lives behind the host's dispatch, so this contract never owns runtime semantics.
+ */
+export interface AgentExtensionCapabilityContract {
+  id: string; // the value `flattenCapabilityIds` collects — the host's dispatch key
+  summary: string; // short description of what the capability does
+  handler: null; // always null: the contract carries metadata, never an executable
+}
+
+/**
+ * A plugin is the unit of per-mode registration: a named bundle of capabilities
+ * loaded into one mode. "Modes differ only by which plugins they load" is exactly
+ * this — `execute` loads the cook plugins, `elicit` loads the interview plugins.
+ */
+export interface AgentExtensionPluginContract {
+  id: string; // plugin name; the test witnesses use `<mode>.<feature>`, e.g. `execute.write-code`
+  mode: AgentExtensionMode; // the single mode this plugin is loaded into
+  capabilities: readonly AgentExtensionCapabilityContract[]; // the bundle registered together
+}
+
+/**
+ * A consumer (e.g. cook, the interview) described as the set of plugins it loads
+ * into a single mode. Used to prove a real consumer fits the host contract
+ * without migrating its runtime — the "witness" of mode-neutrality.
+ */
+export interface AgentExtensionConsumerWitness {
+  consumerId: string; // e.g. 'cook' or 'interview'
+  mode: AgentExtensionMode; // the one mode this consumer runs in
+  plugins: readonly AgentExtensionPluginContract[]; // mode agreement is asserted by the test, not by this type
+}
+
+/** List every capability id across a consumer's plugins, in plugin order — the host's dispatch keys. */
+export function flattenCapabilityIds(witness: AgentExtensionConsumerWitness): string[] {
+  return witness.plugins.flatMap(({ capabilities }) => capabilities.map(({ id }) => id));
+}
diff --git a/src/orchestrator/src/app-probe.test.ts b/src/orchestrator/src/app-probe.test.ts
@@ -0,0 +1,170 @@
+// The probe boots a *real* app process in a tmp worktree and exercises it over
+// the wire — no mocks — so these tests pin the actual boot/ready/probe/teardown
+// behavior the orphan check depends on. Apps are zero-dep `node:http` scripts.
+
+import { mkdtempSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+
+import { afterEach, describe, expect, it } from 'vitest';
+
+import { buildProbeSpec, runProbe } from './app-probe.js';
+import type { ProbeSpec } from './types.js';
+
+const dirs: string[] = []; // sandbox directories created by the running test
+
+afterEach(() => {
+  dirs.splice(0).forEach((dir) => rmSync(dir, { recursive: true, force: true }));
+});
+
+function sandbox(serverSource: string): string {
+  const root = mkdtempSync(join(tmpdir(), 'app-probe-'));
+  dirs.push(root); // registered before the write, so afterEach removes it even if the write throws
+  writeFileSync(join(root, 'server.js'), serverSource);
+  return root;
+}
+
+/** Source of a zero-dep HTTP app on 127.0.0.1:$PORT: each `routes` path answers its status, others 404. */
+const appServing = (routes: Record<string, number>): string =>
+  `const http = require('node:http');
+const routes = ${JSON.stringify(routes)};
+http.createServer((req, res) => {
+  const status = routes[req.url] ?? 404;
+  res.writeHead(status); res.end(String(status));
+}).listen(Number(process.env.PORT), '127.0.0.1');\n`;
+
+// Dogfoods the harness: the test gives argv + paths; `buildProbeSpec` allocates the port and URLs.
+async function specFor(routes: Record<string, number>): Promise<{ spec: ProbeSpec; dir: string }> {
+  const spec = await buildProbeSpec({
+    boot: ['node', 'server.js'],
+    readyPath: '/health',
+    featurePath: '/feature',
+  });
+  const dir = sandbox(appServing(routes));
+  return { dir, spec };
+}
+
+describe('runProbe classifies real app reachability', () => {
+  it('an app whose feature endpoint answers 2xx → reachable', async () => {
+    const { spec, dir } = await specFor({ '/health': 200, '/feature': 200 });
+    const result = await runProbe(spec, dir);
+    expect(result.kind).toBe('reachable');
+    expect(result.reachable).toBe(true);
+    expect(result.status).toBe(200);
+  });
+
+  it('an app that boots but 404s the feature endpoint → not-reachable (the orphan)', async () => {
+    // The orphan case — a feature module present but unwired — shows up as: server up, route absent.
+    const { spec, dir } = await specFor({ '/health': 200 });
+    const result = await runProbe(spec, dir);
+    expect(result.kind).toBe('not-reachable');
+    expect(result.reachable).toBe(false);
+    expect(result.status).toBe(404);
+  });
+
+  it('a boot command that exits immediately → infra (distinct from not-reachable)', async () => {
+    const dir = sandbox('process.exit(1);\n'); // the "app" quits before it could listen on anything
+    const result = await runProbe(
+      { boot: ['node', 'server.js'], readyUrl: 'http://127.0.0.1:1/x', featureUrl: 'http://127.0.0.1:1/x' },
+      dir,
+    );
+    expect(result.kind).toBe('infra');
+    expect(result.reachable).toBe(false);
+  });
+
+  it('a missing boot binary → infra, not a crash', async () => {
+    const dir = sandbox(appServing({ '/health': 200 }));
+    const started = Date.now();
+    const result = await runProbe(
+      {
+        boot: ['definitely-not-a-real-binary-xyz'],
+        readyUrl: 'http://127.0.0.1:1/x',
+        featureUrl: 'http://127.0.0.1:1/x',
+      },
+      dir,
+    );
+    expect(result.kind).toBe('infra');
+    // NOTE(review): presumably guards against polling the ready URL until its timeout — confirm.
+    expect(Date.now() - started).toBeLessThan(1_000);
+  });
+});
+
+describe('runProbe bounds its HTTP calls so a hung app cannot hang the probe', () => {
+  // A server that accepts connections (and the HTTP request) but only answers the
+  // paths in `readyRoutes`; any other request is left hanging forever — the case a
+  // wall-clock deadline alone can't catch, since a bare `await fetch` would block.
+  const neverResponds = (readyRoutes: Record<string, number> = {}): string =>
+    `const http = require('node:http');\n` +
+    `const ready = ${JSON.stringify(readyRoutes)};\n` +
+    `http.createServer((req, res) => {\n` +
+    `  if (ready[req.url] !== undefined) { res.writeHead(ready[req.url]); res.end('ok'); return; }\n` +
+    `  /* otherwise: never respond */\n` +
+    `}).listen(Number(process.env.PORT), '127.0.0.1');\n`;
+
+  it('a ready path that accepts connections but never responds → infra within the deadline', async () => {
+    const spec = await buildProbeSpec({
+      boot: ['node', 'server.js'],
+      readyPath: '/health',
+      featurePath: '/feature',
+    });
+    const dir = sandbox(neverResponds());
+    const started = Date.now();
+    const result = await runProbe(spec, dir, { readyTimeoutMs: 600, readyAttemptMs: 2_000 });
+    expect(result.kind).toBe('infra');
+    expect(Date.now() - started).toBeLessThan(1_200); // below the 2s `readyAttemptMs` passed above
+  });
+
+  it('a booted app whose feature endpoint never responds → infra, not a hang', async () => {
+    const spec = await buildProbeSpec({
+      boot: ['node', 'server.js'],
+      readyPath: '/health',
+      featurePath: '/feature',
+    });
+    const dir = sandbox(neverResponds({ '/health': 200 })); // ready path answers; `/feature` hangs
+    const result = await runProbe(spec, dir, { requestTimeoutMs: 300 });
+    expect(result.kind).toBe('infra');
+    expect(result.output).toMatch(/feature probe request failed/);
+  });
+});
+
+describe('runProbe tears the boot process down', () => {
+  it('the booted app is no longer listening after the probe returns', async () => {
+    const { spec, dir } = await specFor({ '/health': 200, '/feature': 200 });
+    await runProbe(spec, dir);
+    // A live app would answer 200 here, so a rejected fetch means nothing is left listening on its port.
+    await expect(fetch(spec.featureUrl)).rejects.toThrow();
+  });
+});
+
+describe('buildProbeSpec resolves a target into a runnable spec', () => {
+  it('allocates a port and assembles ready/feature URLs from the paths', async () => {
+    const spec = await buildProbeSpec({
+      boot: ['node', 'server.js'],
+      readyPath: '/health',
+      featurePath: '/feature',
+    });
+    const port = Number(spec.env?.PORT); // the allocated port is exposed through the spec's env
+    expect(port).toBeGreaterThan(0);
+    expect(spec.readyUrl).toBe(`http://127.0.0.1:${port}/health`);
+    expect(spec.featureUrl).toBe(`http://127.0.0.1:${port}/feature`);
+    expect(spec.boot).toEqual(['node', 'server.js']);
+  });
+
+  it('layers caller env under the allocated PORT so PORT always wins', async () => {
+    const spec = await buildProbeSpec({
+      boot: ['node', 'server.js'],
+      readyPath: '/',
+      featurePath: '/',
+      env: { NODE_ENV: 'test', PORT: '1' }, // the caller's PORT must be overridden; NODE_ENV must survive
+    });
+    expect(spec.env?.NODE_ENV).toBe('test');
+    expect(Number(spec.env?.PORT)).toBeGreaterThan(1);
+  });
+
+  it('hands out distinct ports across concurrent allocations', async () => {
+    const specs = await Promise.all(
+      Array.from({ length: 8 }, () => buildProbeSpec({ boot: ['x'], readyPath: '/', featurePath: '/' })),
+    );
+    const ports = specs.map((s) => Number(s.env?.PORT)); // `boot` is never run; only PORT is inspected
+    expect(new Set(ports).size).toBe(ports.length);
+  });
+});