@forwardimpact/libeval
Advanced tools
| /** | ||
| * Test-only mock factory for AgentRunner. Yields pre-scripted responses, | ||
| * and (when an `onBatch` callback is set) fires it at the same boundaries | ||
| * the real AgentRunner would: assistant messages with at least one text | ||
| * block, and the terminal `result` message. If the callback calls | ||
| * `abort()`, the mock stops iterating that response's messages and | ||
| * reports `aborted: true`. | ||
| * | ||
| * Intentionally a regular module (not a test file) so describe/test blocks | ||
| * here would not run. Lives under test/ to make its scope explicit. | ||
| */ | ||
| import { PassThrough } from "node:stream"; | ||
| import { AgentRunner } from "@forwardimpact/libeval"; | ||
| /** | ||
| * Whether a scripted message should trigger an onBatch flush. Mirrors the | ||
| * real AgentRunner: assistant-with-text-block or terminal `result` message. | ||
| * Tool-only or string-content messages accumulate without flushing. | ||
| * @param {object} message | ||
| * @returns {boolean} | ||
| */ | ||
| export function shouldFlush(message) { | ||
| if (message.type === "result") return true; | ||
| if (message.type !== "assistant") return false; | ||
| const content = message.message?.content ?? message.content; | ||
| if (!Array.isArray(content)) return false; | ||
| for (const block of content) { | ||
| if (block.type === "text" && block.text) return true; | ||
| } | ||
| return false; | ||
| } | ||
| /** | ||
| * Create a mock AgentRunner that yields pre-scripted responses. Each call | ||
| * to `run()` or `resume()` pops the next response from the array. | ||
| * @param {object[]} responses - Array of {text, success} objects | ||
| * @param {object[]} [messages] - Messages to buffer per response | ||
| * @returns {AgentRunner} | ||
| */ | ||
| export function createMockRunner(responses, messages) { | ||
| const output = new PassThrough(); | ||
| let callIndex = 0; | ||
| const runner = new AgentRunner({ | ||
| cwd: "/tmp", | ||
| query: async function* () {}, | ||
| output, | ||
| }); | ||
| const consume = async (msgs) => { | ||
| let aborted = false; | ||
| for (const m of msgs) { | ||
| const line = JSON.stringify(m); | ||
| runner.buffer.push(line); | ||
| if (runner.onLine) runner.onLine(line); | ||
| if (runner.onBatch && shouldFlush(m)) { | ||
| await runner.onBatch([line], { | ||
| abort: () => { | ||
| aborted = true; | ||
| }, | ||
| }); | ||
| if (aborted) break; | ||
| } | ||
| } | ||
| return aborted; | ||
| }; | ||
| runner.run = async (_task) => { | ||
| const resp = responses[callIndex++]; | ||
| const msgs = messages?.[callIndex - 1] ?? [ | ||
| { type: "assistant", content: resp.text }, | ||
| ]; | ||
| const aborted = await consume(msgs); | ||
| runner.sessionId = "mock-session"; | ||
| return { | ||
| success: resp.success ?? true, | ||
| text: resp.text, | ||
| sessionId: "mock-session", | ||
| aborted, | ||
| error: null, | ||
| }; | ||
| }; | ||
| runner.resume = async (_prompt) => { | ||
| const resp = responses[callIndex++]; | ||
| const msgs = messages?.[callIndex - 1] ?? [ | ||
| { type: "assistant", content: resp.text }, | ||
| ]; | ||
| const aborted = await consume(msgs); | ||
| return { | ||
| success: resp.success ?? true, | ||
| text: resp.text, | ||
| sessionId: runner.sessionId, | ||
| aborted, | ||
| error: null, | ||
| }; | ||
| }; | ||
| return runner; | ||
| } |
| import { describe, test } from "node:test"; | ||
| import assert from "node:assert"; | ||
| import { PassThrough } from "node:stream"; | ||
| import { Supervisor } from "@forwardimpact/libeval"; | ||
| import { isIntervention } from "../src/supervisor.js"; | ||
| import { createMockRunner } from "./mock-runner.js"; | ||
| describe("isIntervention", () => { | ||
| test("detects EVALUATION_INTERVENTION on its own line", () => { | ||
| assert.strictEqual(isIntervention("EVALUATION_INTERVENTION"), true); | ||
| assert.strictEqual( | ||
| isIntervention("Some text\nEVALUATION_INTERVENTION\nMore text"), | ||
| true, | ||
| ); | ||
| assert.strictEqual( | ||
| isIntervention("Stop.\n\nEVALUATION_INTERVENTION"), | ||
| true, | ||
| ); | ||
| }); | ||
| test("tolerates markdown formatting around the signal", () => { | ||
| assert.strictEqual(isIntervention("**EVALUATION_INTERVENTION**"), true); | ||
| assert.strictEqual(isIntervention("*EVALUATION_INTERVENTION*"), true); | ||
| assert.strictEqual(isIntervention("__EVALUATION_INTERVENTION__"), true); | ||
| assert.strictEqual(isIntervention("_EVALUATION_INTERVENTION_"), true); | ||
| assert.strictEqual(isIntervention("`EVALUATION_INTERVENTION`"), true); | ||
| assert.strictEqual( | ||
| isIntervention( | ||
| "Wrong path.\n\n**EVALUATION_INTERVENTION**\n\nTry the documented one.", | ||
| ), | ||
| true, | ||
| ); | ||
| }); | ||
| test("matches EVALUATION_INTERVENTION inline", () => { | ||
| assert.strictEqual( | ||
| isIntervention("Stopping you with EVALUATION_INTERVENTION now."), | ||
| true, | ||
| ); | ||
| assert.strictEqual( | ||
| isIntervention("Note: EVALUATION_INTERVENTION. Switch to Y."), | ||
| true, | ||
| ); | ||
| }); | ||
| test("does not match empty or unrelated text", () => { | ||
| assert.strictEqual(isIntervention(""), false); | ||
| assert.strictEqual(isIntervention("Stop and think."), false); | ||
| assert.strictEqual(isIntervention("INTERVENTION"), false); | ||
| }); | ||
| test("does not match EVALUATION_COMPLETE alone", () => { | ||
| assert.strictEqual(isIntervention("EVALUATION_COMPLETE"), false); | ||
| assert.strictEqual( | ||
| isIntervention("Good work.\n\nEVALUATION_COMPLETE"), | ||
| false, | ||
| ); | ||
| }); | ||
| }); | ||
| describe("Supervisor - mid-turn intervention", () => { | ||
| test("observation without intervention does not interrupt the agent", async () => { | ||
| // Agent emits one structured assistant text block — fires onBatch once. | ||
| // Supervisor responds with "Keep going." — neither signal flag is set, | ||
| // so the agent's SDK session completes naturally and the end-of-turn | ||
| // review then emits EVALUATION_COMPLETE. | ||
| const agentMessages = [ | ||
| [ | ||
| { | ||
| type: "assistant", | ||
| message: { | ||
| content: [{ type: "text", text: "I'm working on it." }], | ||
| }, | ||
| }, | ||
| ], | ||
| ]; | ||
| const agentRunner = createMockRunner( | ||
| [{ text: "I'm working on it." }], | ||
| agentMessages, | ||
| ); | ||
| const supervisorRunner = createMockRunner([ | ||
| { text: "Welcome! Please install." }, | ||
| { text: "Keep going." }, | ||
| { text: "Good work.\n\nEVALUATION_COMPLETE" }, | ||
| ]); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| agentRunner.onLine = (line) => supervisor.emitLine(line); | ||
| supervisorRunner.onLine = (line) => supervisor.emitLine(line); | ||
| let agentResumeCalls = 0; | ||
| const origAgentResume = agentRunner.resume; | ||
| agentRunner.resume = async (prompt) => { | ||
| agentResumeCalls++; | ||
| return origAgentResume.call(agentRunner, prompt); | ||
| }; | ||
| const result = await supervisor.run("Install"); | ||
| assert.strictEqual(result.success, true); | ||
| assert.strictEqual(result.turns, 1); | ||
| assert.strictEqual( | ||
| agentResumeCalls, | ||
| 0, | ||
| "Agent should not be resumed when supervisor never intervenes", | ||
| ); | ||
| // Trace must contain a mid_turn_review marker but no intervention markers. | ||
| const data = output.read()?.toString() ?? ""; | ||
| const orchestratorEvents = data | ||
| .trim() | ||
| .split("\n") | ||
| .filter((l) => l.length > 0) | ||
| .map((l) => JSON.parse(l)) | ||
| .filter((e) => e.source === "orchestrator"); | ||
| assert.ok( | ||
| orchestratorEvents.some((e) => e.event?.type === "mid_turn_review"), | ||
| "Trace should contain mid_turn_review when onBatch fires", | ||
| ); | ||
| assert.ok( | ||
| !orchestratorEvents.some( | ||
| (e) => e.event?.type === "intervention_requested", | ||
| ), | ||
| "Trace should not contain intervention_requested when supervisor only observes", | ||
| ); | ||
| }); | ||
| test("EVALUATION_INTERVENTION from mid-turn batch interrupts and relays", async () => { | ||
| // Agent's first call fires onBatch on a structured assistant text block; | ||
| // supervisor responds with EVALUATION_INTERVENTION → abort + relay. | ||
| // Agent's second call (resume) finishes naturally; end-of-turn review | ||
| // then emits EVALUATION_COMPLETE. | ||
| const agentMessages = [ | ||
| [ | ||
| { | ||
| type: "assistant", | ||
| message: { | ||
| content: [{ type: "text", text: "I'll try the wrong path." }], | ||
| }, | ||
| }, | ||
| ], | ||
| [ | ||
| { | ||
| type: "assistant", | ||
| message: { | ||
| content: [ | ||
| { type: "text", text: "OK, switching to the documented path." }, | ||
| ], | ||
| }, | ||
| }, | ||
| ], | ||
| ]; | ||
| const agentRunner = createMockRunner( | ||
| [ | ||
| { text: "I'll try the wrong path." }, | ||
| { text: "OK, switching to the documented path." }, | ||
| ], | ||
| agentMessages, | ||
| ); | ||
| // Supervisor responses (in order): | ||
| // 0: turn 0 introduction | ||
| // 1: mid-turn 1 batch 1 — intervene | ||
| // 2: mid-turn 1 batch 1 (post-resume) — keep going | ||
| // 3: end-of-turn 1 — EVALUATION_COMPLETE | ||
| const supervisorMessages = [ | ||
| undefined, | ||
| [ | ||
| { | ||
| type: "assistant", | ||
| message: { | ||
| content: [ | ||
| { | ||
| type: "text", | ||
| text: "EVALUATION_INTERVENTION Stop and use the documented path.", | ||
| }, | ||
| ], | ||
| }, | ||
| }, | ||
| ], | ||
| undefined, | ||
| undefined, | ||
| ]; | ||
| const supervisorRunner = createMockRunner( | ||
| [ | ||
| { text: "Welcome." }, | ||
| { text: "EVALUATION_INTERVENTION Stop and use the documented path." }, | ||
| { text: "Keep going." }, | ||
| { text: "Good.\n\nEVALUATION_COMPLETE" }, | ||
| ], | ||
| supervisorMessages, | ||
| ); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| agentRunner.onLine = (line) => supervisor.emitLine(line); | ||
| supervisorRunner.onLine = (line) => supervisor.emitLine(line); | ||
| let agentResumeCalls = 0; | ||
| let firstResumePrompt = null; | ||
| const origAgentResume = agentRunner.resume; | ||
| agentRunner.resume = async (prompt) => { | ||
| agentResumeCalls++; | ||
| if (agentResumeCalls === 1) firstResumePrompt = prompt; | ||
| return origAgentResume.call(agentRunner, prompt); | ||
| }; | ||
| const result = await supervisor.run("Install"); | ||
| assert.strictEqual(result.success, true); | ||
| assert.strictEqual(result.turns, 1); | ||
| assert.strictEqual( | ||
| agentResumeCalls, | ||
| 1, | ||
| "Agent should be resumed exactly once after intervention", | ||
| ); | ||
| assert.ok( | ||
| firstResumePrompt && firstResumePrompt.includes("documented path"), | ||
| "Resume prompt should carry the supervisor's intervention text", | ||
| ); | ||
| const orchestratorEvents = (output.read()?.toString() ?? "") | ||
| .trim() | ||
| .split("\n") | ||
| .filter((l) => l.length > 0) | ||
| .map((l) => JSON.parse(l)) | ||
| .filter((e) => e.source === "orchestrator"); | ||
| assert.ok( | ||
| orchestratorEvents.some( | ||
| (e) => e.event?.type === "intervention_requested", | ||
| ), | ||
| "Trace should contain intervention_requested orchestrator event", | ||
| ); | ||
| assert.ok( | ||
| orchestratorEvents.some((e) => e.event?.type === "intervention_relayed"), | ||
| "Trace should contain intervention_relayed orchestrator event", | ||
| ); | ||
| }); | ||
| test("EVALUATION_INTERVENTION and EVALUATION_COMPLETE in the same turn", async () => { | ||
| // Batch 1: supervisor intervenes (abort + relay). | ||
| // After resume, batch 1 of resume: supervisor writes EVALUATION_COMPLETE | ||
| // (mid-turn) — the loop must exit success without running an end-of-turn | ||
| // review. | ||
| const agentMessages = [ | ||
| [ | ||
| { | ||
| type: "assistant", | ||
| message: { content: [{ type: "text", text: "Trying X." }] }, | ||
| }, | ||
| ], | ||
| [ | ||
| { | ||
| type: "assistant", | ||
| message: { content: [{ type: "text", text: "OK trying Y." }] }, | ||
| }, | ||
| ], | ||
| ]; | ||
| const agentRunner = createMockRunner( | ||
| [{ text: "Trying X." }, { text: "Trying Y." }], | ||
| agentMessages, | ||
| ); | ||
| const supervisorMessages = [ | ||
| undefined, | ||
| [ | ||
| { | ||
| type: "assistant", | ||
| message: { | ||
| content: [ | ||
| { | ||
| type: "text", | ||
| text: "EVALUATION_INTERVENTION Try Y instead.", | ||
| }, | ||
| ], | ||
| }, | ||
| }, | ||
| ], | ||
| [ | ||
| { | ||
| type: "assistant", | ||
| message: { | ||
| content: [{ type: "text", text: "Excellent. EVALUATION_COMPLETE" }], | ||
| }, | ||
| }, | ||
| ], | ||
| ]; | ||
| const supervisorRunner = createMockRunner( | ||
| [ | ||
| { text: "Welcome." }, | ||
| { text: "EVALUATION_INTERVENTION Try Y instead." }, | ||
| { text: "Excellent. EVALUATION_COMPLETE" }, | ||
| ], | ||
| supervisorMessages, | ||
| ); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| agentRunner.onLine = (line) => supervisor.emitLine(line); | ||
| supervisorRunner.onLine = (line) => supervisor.emitLine(line); | ||
| let agentResumeCalls = 0; | ||
| const origAgentResume = agentRunner.resume; | ||
| agentRunner.resume = async (prompt) => { | ||
| agentResumeCalls++; | ||
| return origAgentResume.call(agentRunner, prompt); | ||
| }; | ||
| const result = await supervisor.run("Install"); | ||
| assert.strictEqual(result.success, true); | ||
| assert.strictEqual(result.turns, 1); | ||
| assert.strictEqual( | ||
| agentResumeCalls, | ||
| 1, | ||
| "Agent.resume runs once (after intervention); EVALUATION_COMPLETE then ends the turn", | ||
| ); | ||
| const orchestratorEvents = (output.read()?.toString() ?? "") | ||
| .trim() | ||
| .split("\n") | ||
| .filter((l) => l.length > 0) | ||
| .map((l) => JSON.parse(l)) | ||
| .filter((e) => e.source === "orchestrator"); | ||
| assert.ok( | ||
| orchestratorEvents.some( | ||
| (e) => e.event?.type === "intervention_requested", | ||
| ), | ||
| "Trace should contain intervention_requested", | ||
| ); | ||
| assert.ok( | ||
| orchestratorEvents.some((e) => e.event?.type === "complete_requested"), | ||
| "Trace should contain complete_requested for mid-turn EVALUATION_COMPLETE", | ||
| ); | ||
| }); | ||
| }); |
| import { describe, test } from "node:test"; | ||
| import assert from "node:assert"; | ||
| import { PassThrough } from "node:stream"; | ||
| import { | ||
| Supervisor, | ||
| createSupervisor, | ||
| SUPERVISOR_SYSTEM_PROMPT, | ||
| AGENT_SYSTEM_PROMPT, | ||
| } from "@forwardimpact/libeval"; | ||
| import { createMockRunner } from "./mock-runner.js"; | ||
| describe("Supervisor - output and events", () => { | ||
| test("output contains tagged lines with correct source and turn", async () => { | ||
| const supervisorMessages = [ | ||
| [{ type: "assistant", content: "Go ahead" }], | ||
| [{ type: "assistant", content: "EVALUATION_COMPLETE" }], | ||
| ]; | ||
| const agentMessages = [[{ type: "assistant", content: "Working" }]]; | ||
| const supervisorRunner = createMockRunner( | ||
| [{ text: "Go ahead" }, { text: "EVALUATION_COMPLETE" }], | ||
| supervisorMessages, | ||
| ); | ||
| const agentRunner = createMockRunner([{ text: "Working" }], agentMessages); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| agentRunner.onLine = (line) => supervisor.emitLine(line); | ||
| supervisorRunner.onLine = (line) => supervisor.emitLine(line); | ||
| await supervisor.run("Task"); | ||
| const data = output.read()?.toString() ?? ""; | ||
| const lines = data | ||
| .trim() | ||
| .split("\n") | ||
| .filter((l) => l.length > 0); | ||
| // Should have: supervisor turn 0, agent turn 1, supervisor turn 1, orchestrator summary | ||
| assert.ok(lines.length >= 4); | ||
| const supervisorLine = JSON.parse(lines[0]); | ||
| assert.strictEqual(supervisorLine.source, "supervisor"); | ||
| assert.strictEqual(supervisorLine.turn, 0); | ||
| assert.ok("event" in supervisorLine); | ||
| const agentLine = JSON.parse(lines[1]); | ||
| assert.strictEqual(agentLine.source, "agent"); | ||
| assert.strictEqual(agentLine.turn, 1); | ||
| assert.ok("event" in agentLine); | ||
| const summaryLine = JSON.parse(lines[lines.length - 1]); | ||
| assert.strictEqual(summaryLine.source, "orchestrator"); | ||
| assert.strictEqual(summaryLine.type, "summary"); | ||
| assert.strictEqual(summaryLine.success, true); | ||
| }); | ||
| test("events are nested under event key (no field collisions)", async () => { | ||
| const sourceEvent = { | ||
| type: "assistant", | ||
| source: "sdk-internal", | ||
| content: "test", | ||
| }; | ||
| const supervisorRunner = createMockRunner( | ||
| [{ text: "Go" }, { text: "EVALUATION_COMPLETE" }], | ||
| [ | ||
| [{ type: "assistant", content: "Go" }], | ||
| [{ type: "assistant", content: "ok" }], | ||
| ], | ||
| ); | ||
| const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| agentRunner.onLine = (line) => supervisor.emitLine(line); | ||
| supervisorRunner.onLine = (line) => supervisor.emitLine(line); | ||
| await supervisor.run("Task"); | ||
| const data = output.read()?.toString() ?? ""; | ||
| const lines = data | ||
| .trim() | ||
| .split("\n") | ||
| .filter((l) => l.length > 0); | ||
| // First line is supervisor turn 0, second is agent turn 1 | ||
| const tagged = JSON.parse(lines[1]); | ||
| assert.strictEqual(tagged.source, "agent"); | ||
| assert.strictEqual(tagged.event.source, "sdk-internal"); | ||
| }); | ||
| test("mid-turn intervention emits orchestrator events and shares the agent's turn id", async () => { | ||
| // Agent emits one structured assistant text block on its first call — | ||
| // supervisor intervenes mid-turn. Resume then completes naturally and | ||
| // the end-of-turn review signals EVALUATION_COMPLETE. | ||
| const agentMessages = [ | ||
| [ | ||
| { | ||
| type: "assistant", | ||
| message: { | ||
| content: [{ type: "text", text: "Trying the wrong thing." }], | ||
| }, | ||
| }, | ||
| ], | ||
| [ | ||
| { | ||
| type: "assistant", | ||
| message: { | ||
| content: [{ type: "text", text: "Switching to the right thing." }], | ||
| }, | ||
| }, | ||
| ], | ||
| ]; | ||
| const supervisorMessages = [ | ||
| undefined, | ||
| [ | ||
| { | ||
| type: "assistant", | ||
| message: { | ||
| content: [ | ||
| { | ||
| type: "text", | ||
| text: "EVALUATION_INTERVENTION Switch to the right path.", | ||
| }, | ||
| ], | ||
| }, | ||
| }, | ||
| ], | ||
| undefined, | ||
| undefined, | ||
| ]; | ||
| const agentRunner = createMockRunner( | ||
| [{ text: "Trying the wrong thing." }, { text: "Switching." }], | ||
| agentMessages, | ||
| ); | ||
| const supervisorRunner = createMockRunner( | ||
| [ | ||
| { text: "Welcome." }, | ||
| { text: "EVALUATION_INTERVENTION Switch to the right path." }, | ||
| { text: "Keep going." }, | ||
| { text: "Done. EVALUATION_COMPLETE" }, | ||
| ], | ||
| supervisorMessages, | ||
| ); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| agentRunner.onLine = (line) => supervisor.emitLine(line); | ||
| supervisorRunner.onLine = (line) => supervisor.emitLine(line); | ||
| const result = await supervisor.run("Task"); | ||
| assert.strictEqual(result.success, true); | ||
| const lines = (output.read()?.toString() ?? "") | ||
| .trim() | ||
| .split("\n") | ||
| .filter((l) => l.length > 0) | ||
| .map((l) => JSON.parse(l)); | ||
| // (1) Orchestrator event with intervention_requested. | ||
| const interventionRequested = lines.find( | ||
| (l) => | ||
| l.source === "orchestrator" && | ||
| l.event?.type === "intervention_requested", | ||
| ); | ||
| assert.ok( | ||
| interventionRequested, | ||
| "Trace must contain intervention_requested orchestrator event", | ||
| ); | ||
| // (2) At least one agent line and one supervisor line share a turn id — | ||
| // mid-turn supervisor activity is tagged with the agent's turn. | ||
| const agentTurns = new Set( | ||
| lines.filter((l) => l.source === "agent").map((l) => l.turn), | ||
| ); | ||
| const supervisorTurns = new Set( | ||
| lines.filter((l) => l.source === "supervisor").map((l) => l.turn), | ||
| ); | ||
| const sharedTurns = [...agentTurns].filter((t) => supervisorTurns.has(t)); | ||
| assert.ok( | ||
| sharedTurns.length > 0, | ||
| "At least one turn id must appear on both agent and supervisor lines", | ||
| ); | ||
| // (3) Final summary line still emitted. | ||
| const summary = lines[lines.length - 1]; | ||
| assert.strictEqual(summary.source, "orchestrator"); | ||
| assert.strictEqual(summary.type, "summary"); | ||
| assert.strictEqual(summary.success, true); | ||
| }); | ||
| test("emits supervisor output and summary when supervisor errors on turn 0", async () => { | ||
| const supervisorMessages = [ | ||
| [{ type: "assistant", content: "Starting..." }], | ||
| ]; | ||
| const supervisorRunner = createMockRunner( | ||
| [{ text: "Starting...", success: false }], | ||
| supervisorMessages, | ||
| ); | ||
| const origRun = supervisorRunner.run; | ||
| supervisorRunner.run = async (task) => { | ||
| const result = await origRun.call(supervisorRunner, task); | ||
| return { ...result, error: new Error("Process exited with code 1") }; | ||
| }; | ||
| const agentRunner = createMockRunner([]); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| agentRunner.onLine = (line) => supervisor.emitLine(line); | ||
| supervisorRunner.onLine = (line) => supervisor.emitLine(line); | ||
| const result = await supervisor.run("Task"); | ||
| assert.strictEqual(result.success, false); | ||
| assert.strictEqual(result.turns, 0); | ||
| const data = output.read()?.toString() ?? ""; | ||
| const lines = data | ||
| .trim() | ||
| .split("\n") | ||
| .filter((l) => l.length > 0); | ||
| assert.ok(lines.length >= 2, "Expected at least supervisor line + summary"); | ||
| const supervisorLine = JSON.parse(lines[0]); | ||
| assert.strictEqual(supervisorLine.source, "supervisor"); | ||
| assert.strictEqual(supervisorLine.turn, 0); | ||
| const summaryLine = JSON.parse(lines[lines.length - 1]); | ||
| assert.strictEqual(summaryLine.source, "orchestrator"); | ||
| assert.strictEqual(summaryLine.success, false); | ||
| assert.strictEqual(summaryLine.turns, 0); | ||
| }); | ||
| }); | ||
| describe("Supervisor - createSupervisor factory", () => { | ||
| test("createSupervisor factory returns a Supervisor instance", () => { | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd: "/tmp/sup", | ||
| agentCwd: "/tmp/agent", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| }); | ||
| assert.ok(supervisor instanceof Supervisor); | ||
| }); | ||
| test("createSupervisor uses default supervisor tools when none specified", () => { | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd: "/tmp/sup", | ||
| agentCwd: "/tmp/agent", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| }); | ||
| assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [ | ||
| "Bash", | ||
| "Read", | ||
| "Glob", | ||
| "Grep", | ||
| "Write", | ||
| "Edit", | ||
| ]); | ||
| }); | ||
| test("createSupervisor passes custom supervisor tools", () => { | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd: "/tmp/sup", | ||
| agentCwd: "/tmp/agent", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| supervisorAllowedTools: ["Read", "Glob", "Grep"], | ||
| }); | ||
| assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [ | ||
| "Read", | ||
| "Glob", | ||
| "Grep", | ||
| ]); | ||
| }); | ||
| test("createSupervisor wires system prompts to both runners", () => { | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd: "/tmp/sup", | ||
| agentCwd: "/tmp/agent", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| }); | ||
| assert.deepStrictEqual(supervisor.agentRunner.systemPrompt, { | ||
| type: "preset", | ||
| preset: "claude_code", | ||
| append: AGENT_SYSTEM_PROMPT, | ||
| }); | ||
| assert.deepStrictEqual(supervisor.supervisorRunner.systemPrompt, { | ||
| type: "preset", | ||
| preset: "claude_code", | ||
| append: SUPERVISOR_SYSTEM_PROMPT, | ||
| }); | ||
| }); | ||
| test("createSupervisor blocks sub-agent spawn tools on supervisor by default", () => { | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd: "/tmp/sup", | ||
| agentCwd: "/tmp/agent", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| }); | ||
| assert.deepStrictEqual(supervisor.supervisorRunner.disallowedTools, [ | ||
| "Agent", | ||
| "Task", | ||
| "TaskOutput", | ||
| "TaskStop", | ||
| ]); | ||
| assert.deepStrictEqual(supervisor.agentRunner.disallowedTools, []); | ||
| }); | ||
| test("createSupervisor merges custom supervisorDisallowedTools with defaults", () => { | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd: "/tmp/sup", | ||
| agentCwd: "/tmp/agent", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| supervisorDisallowedTools: ["WebSearch", "Task"], | ||
| }); | ||
| const disallowed = supervisor.supervisorRunner.disallowedTools; | ||
| assert.ok(disallowed.includes("Agent")); | ||
| assert.ok(disallowed.includes("Task")); | ||
| assert.ok(disallowed.includes("TaskOutput")); | ||
| assert.ok(disallowed.includes("TaskStop")); | ||
| assert.ok(disallowed.includes("WebSearch")); | ||
| assert.strictEqual(disallowed.length, new Set(disallowed).size); | ||
| }); | ||
| test("system prompt constants are non-empty strings", () => { | ||
| assert.ok(typeof SUPERVISOR_SYSTEM_PROMPT === "string"); | ||
| assert.ok(typeof AGENT_SYSTEM_PROMPT === "string"); | ||
| assert.ok(SUPERVISOR_SYSTEM_PROMPT.length > 0); | ||
| assert.ok(AGENT_SYSTEM_PROMPT.length > 0); | ||
| }); | ||
| test("SUPERVISOR_SYSTEM_PROMPT explains relay mechanism", () => { | ||
| assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("relay")); | ||
| assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("EVALUATION_COMPLETE")); | ||
| }); | ||
| }); |
| import { describe, test } from "node:test"; | ||
| import assert from "node:assert"; | ||
| import { PassThrough } from "node:stream"; | ||
| import { Supervisor } from "@forwardimpact/libeval"; | ||
| import { isComplete } from "../src/supervisor.js"; | ||
| import { createMockRunner } from "./mock-runner.js"; | ||
| describe("isComplete", () => { | ||
| test("detects EVALUATION_COMPLETE on its own line", () => { | ||
| assert.strictEqual(isComplete("EVALUATION_COMPLETE"), true); | ||
| assert.strictEqual( | ||
| isComplete("Some text\nEVALUATION_COMPLETE\nMore text"), | ||
| true, | ||
| ); | ||
| assert.strictEqual(isComplete("Done.\n\nEVALUATION_COMPLETE"), true); | ||
| }); | ||
| test("tolerates markdown formatting around the signal", () => { | ||
| assert.strictEqual(isComplete("**EVALUATION_COMPLETE**"), true); | ||
| assert.strictEqual(isComplete("*EVALUATION_COMPLETE*"), true); | ||
| assert.strictEqual(isComplete("__EVALUATION_COMPLETE__"), true); | ||
| assert.strictEqual(isComplete("_EVALUATION_COMPLETE_"), true); | ||
| assert.strictEqual(isComplete("`EVALUATION_COMPLETE`"), true); | ||
| assert.strictEqual( | ||
| isComplete("Good work.\n\n**EVALUATION_COMPLETE**\n\nNow filing issues."), | ||
| true, | ||
| ); | ||
| }); | ||
| test("matches EVALUATION_COMPLETE anywhere in text", () => { | ||
| assert.strictEqual(isComplete("not EVALUATION_COMPLETE yet"), true); | ||
| assert.strictEqual( | ||
| isComplete("The agent is EVALUATION_COMPLETE done"), | ||
| true, | ||
| ); | ||
| assert.strictEqual( | ||
| isComplete("Great work! EVALUATION_COMPLETE. Now filing issues."), | ||
| true, | ||
| ); | ||
| }); | ||
| test("does not match empty or unrelated text", () => { | ||
| assert.strictEqual(isComplete(""), false); | ||
| assert.strictEqual(isComplete("All done!"), false); | ||
| assert.strictEqual(isComplete("DONE"), false); | ||
| }); | ||
| test("does not match old EVALUATION_SUCCESSFUL signal", () => { | ||
| assert.strictEqual(isComplete("EVALUATION_SUCCESSFUL"), false); | ||
| }); | ||
| }); | ||
| describe("Supervisor - run and turns", () => { | ||
| test("constructor throws on missing agentRunner", () => { | ||
| assert.throws( | ||
| () => | ||
| new Supervisor({ | ||
| supervisorRunner: createMockRunner([]), | ||
| output: new PassThrough(), | ||
| }), | ||
| /agentRunner is required/, | ||
| ); | ||
| }); | ||
| test("constructor throws on missing supervisorRunner", () => { | ||
| assert.throws( | ||
| () => | ||
| new Supervisor({ | ||
| agentRunner: createMockRunner([]), | ||
| output: new PassThrough(), | ||
| }), | ||
| /supervisorRunner is required/, | ||
| ); | ||
| }); | ||
| test("constructor throws on missing output", () => { | ||
| assert.throws( | ||
| () => | ||
| new Supervisor({ | ||
| agentRunner: createMockRunner([]), | ||
| supervisorRunner: createMockRunner([]), | ||
| }), | ||
| /output is required/, | ||
| ); | ||
| }); | ||
| test("completes on EVALUATION_COMPLETE from supervisor at turn 0", async () => { | ||
| const agentRunner = createMockRunner([]); | ||
| const supervisorRunner = createMockRunner([ | ||
| { text: "EVALUATION_COMPLETE" }, | ||
| ]); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| const result = await supervisor.run("Install stuff"); | ||
| assert.strictEqual(result.success, true); | ||
| assert.strictEqual(result.turns, 0); | ||
| }); | ||
| test("completes after one agent turn", async () => { | ||
| const agentRunner = createMockRunner([ | ||
| { text: "I installed the packages." }, | ||
| ]); | ||
| const supervisorRunner = createMockRunner([ | ||
| { text: "Welcome! Please install the packages." }, | ||
| { text: "Good work.\n\nEVALUATION_COMPLETE" }, | ||
| ]); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| const result = await supervisor.run("Install stuff"); | ||
| assert.strictEqual(result.success, true); | ||
| assert.strictEqual(result.turns, 1); | ||
| }); | ||
| test("detects EVALUATION_COMPLETE in streamed messages when result text differs", async () => { | ||
| const agentRunner = createMockRunner([ | ||
| { text: "I installed the packages." }, | ||
| ]); | ||
| const supervisorMessages = [ | ||
| undefined, | ||
| [ | ||
| { | ||
| type: "assistant", | ||
| message: { | ||
| content: [ | ||
| { | ||
| type: "text", | ||
| text: "Good work.\n\nEVALUATION_COMPLETE\n\nNow filing issues.", | ||
| }, | ||
| ], | ||
| }, | ||
| }, | ||
| { | ||
| type: "assistant", | ||
| message: { | ||
| content: [ | ||
| { type: "text", text: "## Summary\n\nAll issues filed." }, | ||
| ], | ||
| }, | ||
| }, | ||
| ], | ||
| ]; | ||
| const supervisorRunner = createMockRunner( | ||
| [ | ||
| { text: "Welcome! Please install the packages." }, | ||
| { text: "## Summary\n\nAll issues filed." }, | ||
| ], | ||
| supervisorMessages, | ||
| ); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| agentRunner.onLine = (line) => supervisor.emitLine(line); | ||
| supervisorRunner.onLine = (line) => supervisor.emitLine(line); | ||
| const result = await supervisor.run("Install stuff"); | ||
| assert.strictEqual(result.success, true); | ||
| assert.strictEqual(result.turns, 1); | ||
| }); | ||
| test("relays only the last assistant text block to the agent", async () => { | ||
| // Supervisor emits reasoning text ("Let me research...") then a tool call, | ||
| // then a final task message. Only the final message should reach the agent. | ||
| const supervisorMessages = [ | ||
| // Turn 0: multiple assistant messages with reasoning + task | ||
| [ | ||
| { | ||
| type: "assistant", | ||
| message: { | ||
| content: [ | ||
| { type: "text", text: "Let me research the product first." }, | ||
| ], | ||
| }, | ||
| }, | ||
| { | ||
| type: "assistant", | ||
| message: { | ||
| content: [ | ||
| { | ||
| type: "text", | ||
| text: "Hello! Here is your task: install the packages.", | ||
| }, | ||
| ], | ||
| }, | ||
| }, | ||
| ], | ||
| // Turn 1: evaluation | ||
| undefined, | ||
| ]; | ||
| let capturedAgentPrompt = null; | ||
| const agentRunner = createMockRunner([ | ||
| { text: "I installed the packages." }, | ||
| ]); | ||
| const origRun = agentRunner.run; | ||
| agentRunner.run = async (task) => { | ||
| capturedAgentPrompt = task; | ||
| return origRun.call(agentRunner, task); | ||
| }; | ||
| const supervisorRunner = createMockRunner( | ||
| [ | ||
| // SDK result text = last message text (but relay should use buffer) | ||
| { text: "Hello! Here is your task: install the packages." }, | ||
| { text: "EVALUATION_COMPLETE" }, | ||
| ], | ||
| supervisorMessages, | ||
| ); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| await supervisor.run("Evaluate the product"); | ||
| // Agent should receive only the final text, not the reasoning | ||
| assert.strictEqual( | ||
| capturedAgentPrompt, | ||
| "Hello! Here is your task: install the packages.", | ||
| ); | ||
| assert.ok( | ||
| !capturedAgentPrompt.includes("research"), | ||
| "Reasoning text should not leak to agent", | ||
| ); | ||
| }); | ||
| test("runs multiple turns before completion", async () => { | ||
| const agentRunner = createMockRunner([ | ||
| { text: "Started working." }, | ||
| { text: "Made progress." }, | ||
| { text: "Finished everything." }, | ||
| ]); | ||
| const supervisorRunner = createMockRunner([ | ||
| { text: "Here is your task. Do the work." }, | ||
| { text: "Keep going, you need to do more." }, | ||
| { text: "Almost there, continue." }, | ||
| { text: "EVALUATION_COMPLETE" }, | ||
| ]); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| const result = await supervisor.run("Do the work"); | ||
| assert.strictEqual(result.success, true); | ||
| assert.strictEqual(result.turns, 3); | ||
| }); | ||
| test("enforces maxTurns limit", async () => { | ||
| const agentRunner = createMockRunner([ | ||
| { text: "Turn 1" }, | ||
| { text: "Turn 2" }, | ||
| ]); | ||
| const supervisorRunner = createMockRunner([ | ||
| { text: "Start working." }, | ||
| { text: "Continue." }, | ||
| { text: "Continue." }, | ||
| ]); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 2, | ||
| }); | ||
| const result = await supervisor.run("Endless task"); | ||
| assert.strictEqual(result.success, false); | ||
| assert.strictEqual(result.turns, 2); | ||
| }); | ||
| }); |
+2
-2
@@ -32,3 +32,3 @@ #!/usr/bin/env node | ||
| --model=MODEL Claude model to use (default: opus) | ||
| --max-turns=N Maximum agentic turns (default: 50) | ||
| --max-turns=N Maximum agentic turns (default: 50, 0 = unlimited) | ||
| --output=PATH Write NDJSON trace to file (default: stdout) | ||
@@ -44,3 +44,3 @@ --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit) | ||
| --model=MODEL Claude model to use (default: opus) | ||
| --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20) | ||
| --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20, 0 = unlimited) | ||
| --output=PATH Write NDJSON trace to file (default: stdout) | ||
@@ -47,0 +47,0 @@ --allowed-tools=LIST Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit) |
+2
-0
@@ -8,3 +8,5 @@ export { TraceCollector, createTraceCollector } from "./src/trace-collector.js"; | ||
| AGENT_SYSTEM_PROMPT, | ||
| isComplete, | ||
| isIntervention, | ||
| } from "./src/supervisor.js"; | ||
| export { TeeWriter, createTeeWriter } from "./src/tee-writer.js"; |
+1
-1
| { | ||
| "name": "@forwardimpact/libeval", | ||
| "version": "0.1.6", | ||
| "version": "0.1.8", | ||
| "description": "Process Claude Code stream-json output into structured traces", | ||
@@ -5,0 +5,0 @@ "license": "Apache-2.0", |
+97
-39
@@ -20,2 +20,3 @@ /** | ||
| * @param {function} [deps.onLine] - Callback invoked with each NDJSON line as it's produced | ||
| * @param {function} [deps.onBatch] - Async callback invoked with a batch of NDJSON lines at flush boundaries (assistant text blocks and result messages). Receives `(lines, { abort })` where calling `abort()` stops the in-flight SDK session via the AbortController. Optional; assignable at runtime so the Supervisor can swap it per turn. | ||
| * @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md) | ||
@@ -35,2 +36,3 @@ * @param {string} [deps.agentProfile] - Agent profile name to pass as --agent to the Claude CLI | ||
| onLine, | ||
| onBatch, | ||
| settingSources, | ||
@@ -48,3 +50,3 @@ agentProfile, | ||
| this.model = model ?? "opus"; | ||
| this.maxTurns = maxTurns ?? 50; | ||
| this.maxTurns = maxTurns ?? 50; // 0 means unlimited (omit from SDK) | ||
| this.allowedTools = allowedTools ?? [ | ||
@@ -60,2 +62,3 @@ "Bash", | ||
| this.onLine = onLine ?? null; | ||
| this.onBatch = onBatch ?? null; | ||
| this.settingSources = settingSources ?? []; | ||
@@ -67,2 +70,4 @@ this.agentProfile = agentProfile ?? null; | ||
| this.buffer = []; | ||
| /** @type {AbortController|null} */ | ||
| this.currentAbortController = null; | ||
| } | ||
@@ -73,11 +78,9 @@ | ||
| * @param {string} task - The task prompt | ||
| * @returns {Promise<{success: boolean, text: string, sessionId: string|null}>} | ||
| * @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>} | ||
| */ | ||
| async run(task) { | ||
| let text = ""; | ||
| let stopReason = null; | ||
| let error = null; | ||
| const abortController = new AbortController(); | ||
| this.currentAbortController = abortController; | ||
| try { | ||
| for await (const message of this.query({ | ||
| const iterator = this.query({ | ||
| prompt: task, | ||
@@ -87,3 +90,3 @@ options: { | ||
| allowedTools: this.allowedTools, | ||
| maxTurns: this.maxTurns, | ||
| ...(this.maxTurns > 0 && { maxTurns: this.maxTurns }), | ||
| model: this.model, | ||
@@ -93,2 +96,3 @@ permissionMode: this.permissionMode, | ||
| settingSources: this.settingSources, | ||
| abortController, | ||
| ...(this.disallowedTools.length > 0 && { | ||
@@ -100,25 +104,7 @@ disallowedTools: this.disallowedTools, | ||
| }, | ||
| })) { | ||
| const line = JSON.stringify(message); | ||
| this.output.write(line + "\n"); | ||
| this.buffer.push(line); | ||
| if (this.onLine) this.onLine(line); | ||
| if (message.type === "system" && message.subtype === "init") { | ||
| this.sessionId = message.session_id; | ||
| } | ||
| if (message.type === "result") { | ||
| text = message.result ?? ""; | ||
| stopReason = message.subtype; | ||
| } | ||
| } | ||
| } catch (err) { | ||
| error = err; | ||
| }); | ||
| return await this.#consumeQuery(iterator); | ||
| } finally { | ||
| this.currentAbortController = null; | ||
| } | ||
| // If the SDK already emitted a successful result, honour it even when the | ||
| // stream throws afterwards (e.g. "Credit balance is too low" during | ||
| // cleanup). Only treat errors as fatal when no result was received yet. | ||
| const success = stopReason === "success"; | ||
| return { success, text, sessionId: this.sessionId, error }; | ||
| } | ||
@@ -129,11 +115,9 @@ | ||
| * @param {string} prompt - The follow-up prompt | ||
| * @returns {Promise<{success: boolean, text: string}>} | ||
| * @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>} | ||
| */ | ||
| async resume(prompt) { | ||
| let text = ""; | ||
| let stopReason = null; | ||
| let error = null; | ||
| const abortController = new AbortController(); | ||
| this.currentAbortController = abortController; | ||
| try { | ||
| for await (const message of this.query({ | ||
| const iterator = this.query({ | ||
| prompt, | ||
@@ -144,4 +128,42 @@ options: { | ||
| allowDangerouslySkipPermissions: true, | ||
| abortController, | ||
| }, | ||
| })) { | ||
| }); | ||
| return await this.#consumeQuery(iterator); | ||
| } finally { | ||
| this.currentAbortController = null; | ||
| } | ||
| } | ||
| /** | ||
| * Shared consumer for both `run()` and `resume()`. Iterates the SDK query | ||
| * iterator, mirroring every line to the output stream / buffer / onLine | ||
| * callback, and — when `onBatch` is set — flushes accumulated lines to it | ||
| * at natural boundaries (assistant messages with text blocks, and the | ||
| * terminal `result` message). | ||
| * | ||
| * INVARIANT: the `await this.onBatch(...)` call below is the ONLY | ||
| * suspension point in this loop. While it is pending, no further lines | ||
| * are pulled from the SDK generator. The Supervisor relies on this — its | ||
| * onBatch callback flips `currentSource` to "supervisor" for the duration | ||
| * of its mid-turn LLM call, and the invariant guarantees no agent line | ||
| * can arrive concurrently and be mis-tagged. | ||
| * | ||
| * If the supervisor calls `abort()` from inside the callback, the next | ||
| * iteration of the for-await loop will throw. We catch the throw, check | ||
| * `currentAbortController.signal.aborted` (avoiding fragility around | ||
| * AbortError vs DOMException shapes), and report `aborted: true` so the | ||
| * caller can distinguish "supervisor asked us to stop" from a real error. | ||
| * @param {AsyncIterable<object>} iterator | ||
| * @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>} | ||
| */ | ||
| async #consumeQuery(iterator) { | ||
| let text = ""; | ||
| let stopReason = null; | ||
| let error = null; | ||
| let aborted = false; | ||
| const pendingBatch = []; | ||
| try { | ||
| for await (const message of iterator) { | ||
| const line = JSON.stringify(message); | ||
@@ -151,3 +173,7 @@ this.output.write(line + "\n"); | ||
| if (this.onLine) this.onLine(line); | ||
| if (this.onBatch) pendingBatch.push(line); | ||
| if (message.type === "system" && message.subtype === "init") { | ||
| this.sessionId = message.session_id; | ||
| } | ||
| if (message.type === "result") { | ||
@@ -157,9 +183,24 @@ text = message.result ?? ""; | ||
| } | ||
| const shouldFlush = | ||
| this.onBatch && | ||
| (message.type === "result" || | ||
| (message.type === "assistant" && hasTextBlock(message))); | ||
| if (shouldFlush) { | ||
| const batchLines = pendingBatch.splice(0, pendingBatch.length); | ||
| await this.onBatch(batchLines, { | ||
| abort: () => this.currentAbortController?.abort(), | ||
| }); | ||
| } | ||
| } | ||
| } catch (err) { | ||
| error = err; | ||
| if (this.currentAbortController?.signal.aborted) { | ||
| aborted = true; | ||
| } else { | ||
| error = err; | ||
| } | ||
| } | ||
| const success = stopReason === "success"; | ||
| return { success, text, error }; | ||
| return { success, text, sessionId: this.sessionId, error, aborted }; | ||
| } | ||
@@ -179,2 +220,19 @@ | ||
| /** | ||
| * Whether an SDK assistant message contains at least one text block. | ||
| * Tool-only assistant messages return false so they accumulate into the | ||
| * pending batch and flush with the next text block (or with the terminal | ||
| * `result` message), keeping supervisor LLM cost bounded. | ||
| * @param {object} message | ||
| * @returns {boolean} | ||
| */ | ||
| function hasTextBlock(message) { | ||
| const content = message.message?.content ?? message.content; | ||
| if (!Array.isArray(content)) return false; | ||
| for (const block of content) { | ||
| if (block.type === "text" && block.text) return true; | ||
| } | ||
| return false; | ||
| } | ||
| /** | ||
| * Factory function — wires real dependencies. | ||
@@ -181,0 +239,0 @@ * @param {object} deps - Same as AgentRunner constructor |
+43
-18
@@ -22,2 +22,34 @@ import { readFileSync, createWriteStream } from "node:fs"; | ||
| /** | ||
| * Parse and validate run command options from args. | ||
| * @param {string[]} args | ||
| * @returns {{ taskContent: string, cwd: string, model: string, maxTurns: number, outputPath: string|undefined, agentProfile: string|undefined, allowedTools: string[] }} | ||
| */ | ||
| function parseRunOptions(args) { | ||
| const taskFile = parseFlag(args, "task-file"); | ||
| const taskText = parseFlag(args, "task-text"); | ||
| if (taskFile && taskText) | ||
| throw new Error("--task-file and --task-text are mutually exclusive"); | ||
| if (!taskFile && !taskText) | ||
| throw new Error("--task-file or --task-text is required"); | ||
| const maxTurnsRaw = parseFlag(args, "max-turns") ?? "50"; | ||
| const taskAmend = parseFlag(args, "task-amend") ?? undefined; | ||
| let taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText; | ||
| if (taskAmend) taskContent += `\n\n${taskAmend}`; | ||
| return { | ||
| taskContent, | ||
| cwd: resolve(parseFlag(args, "cwd") ?? "."), | ||
| model: parseFlag(args, "model") ?? "opus", | ||
| maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10), | ||
| outputPath: parseFlag(args, "output"), | ||
| agentProfile: parseFlag(args, "agent-profile") ?? undefined, | ||
| allowedTools: ( | ||
| parseFlag(args, "allowed-tools") ?? | ||
| "Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite" | ||
| ).split(","), | ||
| }; | ||
| } | ||
| /** | ||
| * Run command — execute a single agent via the Claude Agent SDK. | ||
@@ -32,6 +64,7 @@ * | ||
| * --model=MODEL Claude model to use (default: opus) | ||
| * --max-turns=N Maximum agentic turns (default: 50) | ||
| * --max-turns=N Maximum agentic turns (default: 50, 0 = unlimited) | ||
| * --output=PATH Write NDJSON trace to file (default: stdout) | ||
| * --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit) | ||
| * --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI) | ||
| * --task-amend=TEXT Additional text appended to the task prompt | ||
| * | ||
@@ -41,20 +74,12 @@ * @param {string[]} args - Command arguments | ||
| export async function runRunCommand(args) { | ||
| const taskFile = parseFlag(args, "task-file"); | ||
| const taskText = parseFlag(args, "task-text"); | ||
| if (taskFile && taskText) | ||
| throw new Error("--task-file and --task-text are mutually exclusive"); | ||
| if (!taskFile && !taskText) | ||
| throw new Error("--task-file or --task-text is required"); | ||
| const { | ||
| taskContent, | ||
| cwd, | ||
| model, | ||
| maxTurns, | ||
| outputPath, | ||
| agentProfile, | ||
| allowedTools, | ||
| } = parseRunOptions(args); | ||
| const cwd = resolve(parseFlag(args, "cwd") ?? "."); | ||
| const model = parseFlag(args, "model") ?? "opus"; | ||
| const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "50", 10); | ||
| const outputPath = parseFlag(args, "output"); | ||
| const agentProfile = parseFlag(args, "agent-profile") ?? undefined; | ||
| const allowedTools = ( | ||
| parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit" | ||
| ).split(","); | ||
| const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText; | ||
| // When --output is specified, stream text to stdout while writing NDJSON to file. | ||
@@ -61,0 +86,0 @@ // Otherwise, write NDJSON directly to stdout (backwards-compatible). |
@@ -23,2 +23,46 @@ import { readFileSync, createWriteStream, mkdtempSync } from "node:fs"; | ||
| /** | ||
| * Parse all supervise flags from args into an options object. | ||
| * @param {string[]} args | ||
| * @returns {object} | ||
| */ | ||
| function parseSuperviseOptions(args) { | ||
| const taskFile = parseFlag(args, "task-file"); | ||
| const taskText = parseFlag(args, "task-text"); | ||
| if (taskFile && taskText) | ||
| throw new Error("--task-file and --task-text are mutually exclusive"); | ||
| if (!taskFile && !taskText) | ||
| throw new Error("--task-file or --task-text is required"); | ||
| const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools"); | ||
| const taskAmend = parseFlag(args, "task-amend") ?? undefined; | ||
| let taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText; | ||
| if (taskAmend) taskContent += `\n\n${taskAmend}`; | ||
| return { | ||
| taskContent, | ||
| supervisorCwd: resolve(parseFlag(args, "supervisor-cwd") ?? "."), | ||
| agentCwd: resolve( | ||
| parseFlag(args, "agent-cwd") ?? | ||
| mkdtempSync(join(tmpdir(), "fit-eval-agent-")), | ||
| ), | ||
| model: parseFlag(args, "model") ?? "opus", | ||
| maxTurns: (() => { | ||
| const raw = parseFlag(args, "max-turns") ?? "20"; | ||
| return raw === "0" ? 0 : parseInt(raw, 10); | ||
| })(), | ||
| outputPath: parseFlag(args, "output"), | ||
| supervisorProfile: parseFlag(args, "supervisor-profile") ?? undefined, | ||
| agentProfile: parseFlag(args, "agent-profile") ?? undefined, | ||
| allowedTools: ( | ||
| parseFlag(args, "allowed-tools") ?? | ||
| "Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite" | ||
| ).split(","), | ||
| supervisorAllowedTools: supervisorAllowedToolsRaw | ||
| ? supervisorAllowedToolsRaw.split(",") | ||
| : undefined, | ||
| }; | ||
| } | ||
| /** | ||
| * Supervise command — run two agents in a relay loop via the Claude Agent SDK. | ||
@@ -34,3 +78,3 @@ * | ||
| * --model=MODEL Claude model to use (default: opus) | ||
| * --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20) | ||
| * --max-turns=N Maximum supervisor / agent exchanges (default: 20, 0 = unlimited) | ||
| * --output=PATH Write NDJSON trace to file (default: stdout) | ||
@@ -40,2 +84,3 @@ * --allowed-tools=LIST Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit) | ||
| * --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI) | ||
| * --task-amend=TEXT Additional text appended to the task prompt | ||
| * | ||
@@ -45,32 +90,9 @@ * @param {string[]} args - Command arguments | ||
| export async function runSuperviseCommand(args) { | ||
| const taskFile = parseFlag(args, "task-file"); | ||
| const taskText = parseFlag(args, "task-text"); | ||
| if (taskFile && taskText) | ||
| throw new Error("--task-file and --task-text are mutually exclusive"); | ||
| if (!taskFile && !taskText) | ||
| throw new Error("--task-file or --task-text is required"); | ||
| const opts = parseSuperviseOptions(args); | ||
| const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? "."); | ||
| const agentCwd = resolve( | ||
| parseFlag(args, "agent-cwd") ?? | ||
| mkdtempSync(join(tmpdir(), "fit-eval-agent-")), | ||
| ); | ||
| const model = parseFlag(args, "model") ?? "opus"; | ||
| const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "20", 10); | ||
| const outputPath = parseFlag(args, "output"); | ||
| const supervisorProfile = parseFlag(args, "supervisor-profile") ?? undefined; | ||
| const agentProfile = parseFlag(args, "agent-profile") ?? undefined; | ||
| const allowedTools = ( | ||
| parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit" | ||
| ).split(","); | ||
| const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools"); | ||
| const supervisorAllowedTools = supervisorAllowedToolsRaw | ||
| ? supervisorAllowedToolsRaw.split(",") | ||
| : undefined; | ||
| const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText; | ||
| // When --output is specified, stream text to stdout while writing NDJSON to file. | ||
| // Otherwise, write NDJSON directly to stdout (backwards-compatible). | ||
| const fileStream = outputPath ? createWriteStream(outputPath) : null; | ||
| const fileStream = opts.outputPath | ||
| ? createWriteStream(opts.outputPath) | ||
| : null; | ||
| const output = fileStream | ||
@@ -86,15 +108,15 @@ ? createTeeWriter({ | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd, | ||
| agentCwd, | ||
| supervisorCwd: opts.supervisorCwd, | ||
| agentCwd: opts.agentCwd, | ||
| query, | ||
| output, | ||
| model, | ||
| maxTurns, | ||
| allowedTools, | ||
| supervisorAllowedTools, | ||
| supervisorProfile, | ||
| agentProfile, | ||
| model: opts.model, | ||
| maxTurns: opts.maxTurns, | ||
| allowedTools: opts.allowedTools, | ||
| supervisorAllowedTools: opts.supervisorAllowedTools, | ||
| supervisorProfile: opts.supervisorProfile, | ||
| agentProfile: opts.agentProfile, | ||
| }); | ||
| const result = await supervisor.run(taskContent); | ||
| const result = await supervisor.run(opts.taskContent); | ||
@@ -101,0 +123,0 @@ if (fileStream) { |
+298
-59
@@ -16,4 +16,4 @@ /** | ||
| * Check if the supervisor's response signals evaluation success. | ||
| * Matches EVALUATION_SUCCESSFUL anywhere in the text, tolerating markdown | ||
| * formatting (e.g. **EVALUATION_SUCCESSFUL**). Uses word boundaries to | ||
| * Matches EVALUATION_COMPLETE anywhere in the text, tolerating markdown | ||
| * formatting (e.g. **EVALUATION_COMPLETE**). Uses word boundaries to | ||
| * avoid matching inside longer identifiers. | ||
@@ -23,16 +23,40 @@ * @param {string} text | ||
| */ | ||
| export function isSuccessful(text) { | ||
| return /(?:^|[\s*_~`])EVALUATION_SUCCESSFUL(?:[\s*_~`.,!?]|$)/m.test(text); | ||
| export function isComplete(text) { | ||
| return /(?:^|[\s*_~`])EVALUATION_COMPLETE(?:[\s*_~`.,!?]|$)/m.test(text); | ||
| } | ||
| /** | ||
| * Check if the supervisor's response signals a mid-turn intervention. | ||
| * Same tolerance rules as isComplete (markdown formatting, word boundaries), | ||
| * but matches the EVALUATION_INTERVENTION keyword instead. | ||
| * @param {string} text | ||
| * @returns {boolean} | ||
| */ | ||
| export function isIntervention(text) { | ||
| return /(?:^|[\s*_~`])EVALUATION_INTERVENTION(?:[\s*_~`.,!?]|$)/m.test(text); | ||
| } | ||
| /** System prompt appended for the supervisor runner in supervise mode. */ | ||
| export const SUPERVISOR_SYSTEM_PROMPT = | ||
| "You supervise another AI agent through a relay — your output becomes the agent's next input. " + | ||
| "Guide the agent, answer its questions, and write EVALUATION_SUCCESSFUL when their task is complete."; | ||
| "You relay messages to one persistent agent session — your only output " + | ||
| "channel. Spawning sub-agents or restarting the agent is blocked. Do not " + | ||
| "do the work yourself. Reply briefly to let the agent continue, write " + | ||
| "EVALUATION_INTERVENTION + instructions to interrupt mid-turn, or " + | ||
| "EVALUATION_COMPLETE when done. Only your final message each turn is " + | ||
| "relayed."; | ||
| /** System prompt appended for the agent runner in supervise mode. */ | ||
| export const AGENT_SYSTEM_PROMPT = | ||
| "You are being supervised by another AI agent. " + | ||
| "When requirements are ambiguous or you are uncertain, stop and ask a clarifying question before proceeding."; | ||
| "A supervisor watches your work and may interrupt with new instructions " + | ||
| "mid-task. Treat any new prompt as authoritative and adjust course. " + | ||
| "When uncertain, stop and ask a clarifying question."; | ||
| /** | ||
| * Maximum number of mid-turn interventions allowed within a single agent turn. | ||
| * Bounded so a looping supervisor exhausts its quota fast (observability) but | ||
| * leaves headroom for legitimate "intervene, observe, intervene again" patterns. | ||
| * The outer maxTurns budget still bounds overall runtime. | ||
| */ | ||
| const MAX_INTERVENTIONS_PER_TURN = 5; | ||
| export class Supervisor { | ||
@@ -61,3 +85,3 @@ /** | ||
| * The SDK result text only reflects the last assistant message, so when | ||
| * the supervisor writes EVALUATION_SUCCESSFUL in an early message and | ||
| * the supervisor writes EVALUATION_COMPLETE in an early message and | ||
| * then continues with follow-up work, the result text won't contain it. | ||
@@ -67,3 +91,19 @@ * This flag captures the signal from the full message stream. | ||
| */ | ||
| this.successSignalSeen = false; | ||
| this.completeSignalSeen = false; | ||
| /** | ||
| * Set to true when any supervisor message contains EVALUATION_INTERVENTION. | ||
| * Mirrors completeSignalSeen — populated by emitLine when a supervisor | ||
| * assistant text block matches isIntervention(...). The mid-turn loop | ||
| * reads this flag after each supervisor invocation to decide whether to | ||
| * abort the agent's in-flight SDK session. | ||
| * @type {boolean} | ||
| */ | ||
| this.interventionSignalSeen = false; | ||
| /** | ||
| * The most recent supervisor SDK result captured inside the mid-turn | ||
| * onBatch callback. The outer loop reads this after the agent aborts to | ||
| * build the next relay prompt without re-running the supervisor. | ||
| * @type {{success: boolean, text: string}|null} | ||
| */ | ||
| this.lastSupervisorResult = null; | ||
| } | ||
@@ -82,3 +122,5 @@ | ||
| this.currentTurn = 0; | ||
| this.successSignalSeen = false; | ||
| this.completeSignalSeen = false; | ||
| this.interventionSignalSeen = false; | ||
| this.lastSupervisorResult = null; | ||
| let supervisorResult = await this.supervisorRunner.run(task); | ||
@@ -93,6 +135,6 @@ | ||
| // streamed message content. The SDK result text only reflects the last | ||
| // assistant message, so when the supervisor writes EVALUATION_SUCCESSFUL | ||
| // assistant message, so when the supervisor writes EVALUATION_COMPLETE | ||
| // early and then continues (e.g. filing issues), we must also check the | ||
| // flag set by emitLine during streaming. | ||
| if (this.successSignalSeen || isSuccessful(supervisorResult.text)) { | ||
| if (this.completeSignalSeen || isComplete(supervisorResult.text)) { | ||
| this.emitSummary({ success: true, turns: 0 }); | ||
@@ -102,49 +144,181 @@ return { success: true, turns: 0 }; | ||
| for (let turn = 1; turn <= this.maxTurns; turn++) { | ||
| // Supervisor's output becomes the agent's input | ||
| this.currentSource = "agent"; | ||
| this.currentTurn = turn; | ||
| let agentResult; | ||
| if (turn === 1) { | ||
| agentResult = await this.agentRunner.run(supervisorResult.text); | ||
| } else { | ||
| agentResult = await this.agentRunner.resume(supervisorResult.text); | ||
| } | ||
| const turnLimit = this.maxTurns === 0 ? Infinity : this.maxTurns; | ||
| for (let turn = 1; turn <= turnLimit; turn++) { | ||
| // Only the supervisor's final message is relayed to the agent. | ||
| // Extract the last assistant text block from the buffer to avoid | ||
| // leaking intermediate reasoning (research, tool calls, notes). | ||
| const relay = this.extractLastText( | ||
| this.supervisorRunner, | ||
| supervisorResult.text, | ||
| ); | ||
| if (agentResult.error) { | ||
| this.emitSummary({ success: false, turns: turn }); | ||
| return { success: false, turns: turn }; | ||
| } | ||
| // Drive the agent through interventions until its SDK session ends | ||
| // naturally, the supervisor signals completion mid-turn, or the | ||
| // per-turn intervention budget is exhausted. | ||
| const turnOutcome = await this.#runAgentTurn(turn, relay); | ||
| if (turnOutcome.exit) return turnOutcome.exit; | ||
| // Build the full agent transcript from buffered NDJSON events so the | ||
| // supervisor sees tool calls and reasoning, not just the SDK result summary. | ||
| const agentTranscript = this.extractTranscript(this.agentRunner); | ||
| // End-of-turn review (existing behaviour). Returns either an exit | ||
| // outcome (error or completion) or the supervisor result for the | ||
| // next turn's relay. | ||
| const reviewOutcome = await this.#endOfTurnReview(turn); | ||
| if (reviewOutcome.exit) return reviewOutcome.exit; | ||
| supervisorResult = reviewOutcome.supervisorResult; | ||
| } | ||
| const supervisorPrompt = | ||
| `The agent reported:\n\n${agentTranscript}\n\n` + | ||
| `Review the agent's work and decide how to proceed.`; | ||
| this.emitSummary({ success: false, turns: this.maxTurns }); | ||
| return { success: false, turns: this.maxTurns }; | ||
| } | ||
| this.currentSource = "supervisor"; | ||
| this.currentTurn = turn; | ||
| this.successSignalSeen = false; | ||
| supervisorResult = await this.supervisorRunner.resume(supervisorPrompt); | ||
| /** | ||
| * Drive the agent through one turn, allowing the supervisor to interrupt | ||
| * mid-stream via EVALUATION_INTERVENTION. Returns either an `exit` outcome | ||
| * (the loop should return immediately) or `{exit: null}` (proceed to the | ||
| * end-of-turn review). | ||
| * @param {number} turn | ||
| * @param {string} initialRelay | ||
| * @returns {Promise<{exit: {success: boolean, turns: number}|null}>} | ||
| */ | ||
| async #runAgentTurn(turn, initialRelay) { | ||
| let relay = initialRelay; | ||
| let interventions = 0; | ||
| if (supervisorResult.error) { | ||
| this.emitSummary({ success: false, turns: turn }); | ||
| return { success: false, turns: turn }; | ||
| } | ||
| // Wire the mid-turn observation hook on the agent runner. The bound | ||
| // callback captures `turn` so the inner loop's multiple resume(...) | ||
| // calls all see the same turn id. The supervisorRunner does NOT get | ||
| // an onBatch callback — it only fires onLine, which is enough for | ||
| // emitLine to detect EVALUATION_COMPLETE / EVALUATION_INTERVENTION. | ||
| this.agentRunner.onBatch = (batchLines, ctx) => | ||
| this.#midTurnReview(turn, batchLines, ctx); | ||
| // The supervisor's turn is fully complete — check for success signal | ||
| // in either the SDK result text or streamed messages. | ||
| if (this.successSignalSeen || isSuccessful(supervisorResult.text)) { | ||
| this.emitSummary({ success: true, turns: turn }); | ||
| return { success: true, turns: turn }; | ||
| try { | ||
| while (true) { | ||
| this.currentSource = "agent"; | ||
| this.currentTurn = turn; | ||
| const isFirstAgentCall = turn === 1 && interventions === 0; | ||
| const agentResult = isFirstAgentCall | ||
| ? await this.agentRunner.run(relay) | ||
| : await this.agentRunner.resume(relay); | ||
| if (agentResult.error && !agentResult.aborted) { | ||
| this.emitSummary({ success: false, turns: turn }); | ||
| return { exit: { success: false, turns: turn } }; | ||
| } | ||
| // Mid-turn EVALUATION_COMPLETE: end the session immediately. | ||
| if (this.completeSignalSeen) { | ||
| this.emitSummary({ success: true, turns: turn }); | ||
| return { exit: { success: true, turns: turn } }; | ||
| } | ||
| if (agentResult.aborted && this.interventionSignalSeen) { | ||
| interventions++; | ||
| if (interventions >= MAX_INTERVENTIONS_PER_TURN) { | ||
| this.emitOrchestratorEvent({ type: "intervention_limit", turn }); | ||
| return { exit: null }; | ||
| } | ||
| relay = this.extractLastText( | ||
| this.supervisorRunner, | ||
| this.lastSupervisorResult?.text ?? "", | ||
| ); | ||
| this.emitOrchestratorEvent({ type: "intervention_relayed", turn }); | ||
| continue; | ||
| } | ||
| // Agent's SDK session finished naturally — proceed to end-of-turn. | ||
| return { exit: null }; | ||
| } | ||
| } finally { | ||
| // Detach onBatch before the end-of-turn review so the supervisor's | ||
| // own SDK session does not trigger nested onBatch fires. | ||
| this.agentRunner.onBatch = null; | ||
| } | ||
| } | ||
| this.emitSummary({ success: false, turns: this.maxTurns }); | ||
| return { success: false, turns: this.maxTurns }; | ||
| /** | ||
| * Mid-turn supervisor review fired from inside the agent's onBatch hook. | ||
| * Emits a `mid_turn_review` orchestrator marker, runs the supervisor's | ||
| * LLM against the batch, and aborts the agent if the supervisor signals | ||
| * EVALUATION_INTERVENTION or EVALUATION_COMPLETE. | ||
| * @param {number} turn | ||
| * @param {string[]} batchLines | ||
| * @param {{abort: () => void}} ctx | ||
| */ | ||
| async #midTurnReview(turn, batchLines, { abort }) { | ||
| const batchTranscript = this.renderBatch(batchLines); | ||
| // Order matters: emit the orchestrator marker BEFORE the supervisor | ||
| // LLM call so the trace reads | ||
| // agent line → orchestrator:mid_turn_review | ||
| // → supervisor lines (tagged turn:N) | ||
| // → orchestrator:intervention_requested|complete_requested | ||
| this.emitOrchestratorEvent({ type: "mid_turn_review", turn }); | ||
| // currentTurn stays = turn so mid-turn supervisor lines share the | ||
| // agent's turn id. They are distinguishable from end-of-turn reviews | ||
| // by the surrounding orchestrator events emitted around this call. | ||
| this.currentSource = "supervisor"; | ||
| this.completeSignalSeen = false; | ||
| this.interventionSignalSeen = false; | ||
| this.lastSupervisorResult = await this.supervisorRunner.resume( | ||
| `The agent is mid-turn. Latest batch:\n\n${batchTranscript}\n\n` + | ||
| `Respond with a brief acknowledgement to let it continue, or write ` + | ||
| `EVALUATION_INTERVENTION followed by a corrective message to stop ` + | ||
| `and relay a new instruction. Write EVALUATION_COMPLETE only when ` + | ||
| `the task is fully done.`, | ||
| ); | ||
| this.currentSource = "agent"; | ||
| if (this.interventionSignalSeen) { | ||
| this.emitOrchestratorEvent({ type: "intervention_requested", turn }); | ||
| abort(); | ||
| return; | ||
| } | ||
| if (this.completeSignalSeen) { | ||
| this.emitOrchestratorEvent({ type: "complete_requested", turn }); | ||
| abort(); | ||
| } | ||
| // Non-intervention: do nothing; the agent loop pulls the next line. | ||
| } | ||
| /** | ||
| * End-of-turn supervisor review (existing behaviour). Returns either an | ||
| * exit outcome (error or completion) or the supervisor result so the | ||
| * outer loop can build the next turn's relay. | ||
| * @param {number} turn | ||
| * @returns {Promise<{exit: {success: boolean, turns: number}|null, supervisorResult?: object}>} | ||
| */ | ||
| async #endOfTurnReview(turn) { | ||
| // Build the full agent transcript from buffered NDJSON events so the | ||
| // supervisor sees tool calls and reasoning, not just the SDK result. | ||
| const agentTranscript = this.extractTranscript(this.agentRunner); | ||
| const supervisorPrompt = | ||
| `The agent reported:\n\n${agentTranscript}\n\n` + | ||
| `Review the agent's work and decide how to proceed.`; | ||
| this.currentSource = "supervisor"; | ||
| this.currentTurn = turn; | ||
| this.completeSignalSeen = false; | ||
| this.interventionSignalSeen = false; | ||
| const supervisorResult = | ||
| await this.supervisorRunner.resume(supervisorPrompt); | ||
| if (supervisorResult.error) { | ||
| this.emitSummary({ success: false, turns: turn }); | ||
| return { exit: { success: false, turns: turn } }; | ||
| } | ||
| // The supervisor's turn is fully complete — check for success signal | ||
| // in either the SDK result text or streamed messages. | ||
| if (this.completeSignalSeen || isComplete(supervisorResult.text)) { | ||
| this.emitSummary({ success: true, turns: turn }); | ||
| return { exit: { success: true, turns: turn } }; | ||
| } | ||
| return { exit: null, supervisorResult }; | ||
| } | ||
| /** | ||
| * Extract a human-readable transcript from an AgentRunner's buffered output. | ||
@@ -165,2 +339,27 @@ * Drains the buffer and replays events through a TraceCollector. | ||
| /** | ||
| * Extract only the last assistant text block from an AgentRunner's buffer. | ||
| * Scans buffered NDJSON events in reverse to find the final assistant message | ||
| * with a text content block. This prevents intermediate reasoning (tool calls, | ||
| * research notes) from leaking to the agent. | ||
| * @param {import("./agent-runner.js").AgentRunner} runner | ||
| * @param {string} fallback - Fallback text if no assistant text block is found | ||
| * @returns {string} | ||
| */ | ||
| extractLastText(runner, fallback) { | ||
| const lines = runner.buffer; | ||
| for (let i = lines.length - 1; i >= 0; i--) { | ||
| const event = JSON.parse(lines[i]); | ||
| if (event.type !== "assistant") continue; | ||
| const content = event.message?.content ?? event.content; | ||
| if (!Array.isArray(content)) continue; | ||
| for (let j = content.length - 1; j >= 0; j--) { | ||
| if (content[j].type === "text" && content[j].text) { | ||
| return content[j].text; | ||
| } | ||
| } | ||
| } | ||
| return fallback; | ||
| } | ||
| /** | ||
| * Emit a single NDJSON line tagged with the current source and turn. | ||
@@ -170,3 +369,4 @@ * Called in real-time via the AgentRunner onLine callback. | ||
| * When the current source is the supervisor, also scans assistant text | ||
| * content for the EVALUATION_SUCCESSFUL signal and sets successSignalSeen. | ||
| * content for the EVALUATION_COMPLETE and EVALUATION_INTERVENTION signals, | ||
| * setting completeSignalSeen / interventionSignalSeen respectively. | ||
| * @param {string} line - Raw NDJSON line from the runner | ||
@@ -183,6 +383,6 @@ */ | ||
| // Scan supervisor assistant messages for the success signal in real time. | ||
| // Scan supervisor assistant messages for the signals in real time. | ||
| // The SDK result text only reflects the final assistant message, but the | ||
| // supervisor may write EVALUATION_SUCCESSFUL in an earlier message and | ||
| // then continue with follow-up tool calls. | ||
| // supervisor may write EVALUATION_COMPLETE / EVALUATION_INTERVENTION in | ||
| // an earlier message and then continue with follow-up tool calls. | ||
| if (this.currentSource === "supervisor" && event.type === "assistant") { | ||
@@ -192,5 +392,5 @@ const content = event.message?.content ?? event.content ?? []; | ||
| for (const block of content) { | ||
| if (block.type === "text" && isSuccessful(block.text)) { | ||
| this.successSignalSeen = true; | ||
| } | ||
| if (block.type !== "text" || !block.text) continue; | ||
| if (isComplete(block.text)) this.completeSignalSeen = true; | ||
| if (isIntervention(block.text)) this.interventionSignalSeen = true; | ||
| } | ||
@@ -202,2 +402,37 @@ } | ||
| /** | ||
| * Render a batch of buffered NDJSON lines as human-readable text for the | ||
| * mid-turn supervisor prompt. Reuses the TraceCollector pipeline so the | ||
| * supervisor sees tool calls and reasoning, not just raw events. | ||
| * @param {string[]} batchLines | ||
| * @returns {string} | ||
| */ | ||
| renderBatch(batchLines) { | ||
| if (batchLines.length === 0) return "[empty]"; | ||
| const collector = new TraceCollector(); | ||
| for (const line of batchLines) { | ||
| collector.addLine(line); | ||
| } | ||
| return collector.toText() || "[empty]"; | ||
| } | ||
| /** | ||
| * Emit an orchestrator-source NDJSON line. Used by the mid-turn loop to | ||
| * mark mid_turn_review / intervention_requested / intervention_relayed / | ||
| * intervention_limit / complete_requested boundaries in the trace, so the | ||
| * improvement coach can distinguish mid-turn supervisor activity from | ||
| * end-of-turn reviews. Additive to existing trace shape — the parser | ||
| * already reads `source` and ignores unknown event types. | ||
| * @param {{type: string, turn?: number}} event | ||
| */ | ||
| emitOrchestratorEvent(event) { | ||
| this.output.write( | ||
| JSON.stringify({ | ||
| source: "orchestrator", | ||
| turn: this.currentTurn, | ||
| event, | ||
| }) + "\n", | ||
| ); | ||
| } | ||
| /** | ||
| * Emit a final orchestrator summary line. | ||
@@ -268,6 +503,10 @@ * @param {{success: boolean, turns: number}} result | ||
| // Block Task/TaskOutput so the supervisor cannot spawn its own sub-agents. | ||
| // The relay loop handles agent communication — letting the supervisor use | ||
| // Task would bypass the relay and produce an empty agent trace. | ||
| const defaultDisallowed = ["Task", "TaskOutput"]; | ||
| // Block every sub-agent spawning tool so the supervisor cannot bypass the | ||
| // relay loop. The current Claude Agent SDK exposes the spawn tool to the | ||
| // model as `Agent`; older versions called it `Task`. Both are blocked | ||
| // (along with TaskOutput/TaskStop) so the supervisor sees no spawn tool | ||
| // regardless of which SDK version is installed. Letting the supervisor | ||
| // spawn its own sub-agent would bypass the relay and produce an empty | ||
| // agent trace, which is the failure mode that motivated this default. | ||
| const defaultDisallowed = ["Agent", "Task", "TaskOutput", "TaskStop"]; | ||
| const disallowedTools = supervisorDisallowedTools | ||
@@ -274,0 +513,0 @@ ? [...new Set([...defaultDisallowed, ...supervisorDisallowedTools])] |
| import { describe, test } from "node:test"; | ||
| import assert from "node:assert"; | ||
| import { PassThrough } from "node:stream"; | ||
| import { | ||
| AgentRunner, | ||
| Supervisor, | ||
| createSupervisor, | ||
| SUPERVISOR_SYSTEM_PROMPT, | ||
| AGENT_SYSTEM_PROMPT, | ||
| } from "@forwardimpact/libeval"; | ||
| import { isSuccessful } from "../src/supervisor.js"; | ||
| /** | ||
| * Create a mock AgentRunner that yields pre-scripted responses. | ||
| * Each call to run() or resume() pops the next response from the array. | ||
| * @param {object[]} responses - Array of {text, success} objects | ||
| * @param {object[]} [messages] - Messages to buffer per turn | ||
| * @returns {AgentRunner} | ||
| */ | ||
| function createMockRunner(responses, messages) { | ||
| const output = new PassThrough(); | ||
| let callIndex = 0; | ||
| const runner = new AgentRunner({ | ||
| cwd: "/tmp", | ||
| query: async function* () {}, | ||
| output, | ||
| }); | ||
| // Override run and resume to return scripted responses | ||
| runner.run = async (_task) => { | ||
| const resp = responses[callIndex++]; | ||
| const msgs = messages?.[callIndex - 1] ?? [ | ||
| { type: "assistant", content: resp.text }, | ||
| ]; | ||
| for (const m of msgs) { | ||
| const line = JSON.stringify(m); | ||
| runner.buffer.push(line); | ||
| if (runner.onLine) runner.onLine(line); | ||
| } | ||
| runner.sessionId = "mock-session"; | ||
| return { | ||
| success: resp.success ?? true, | ||
| text: resp.text, | ||
| sessionId: "mock-session", | ||
| }; | ||
| }; | ||
| runner.resume = async (_prompt) => { | ||
| const resp = responses[callIndex++]; | ||
| const msgs = messages?.[callIndex - 1] ?? [ | ||
| { type: "assistant", content: resp.text }, | ||
| ]; | ||
| for (const m of msgs) { | ||
| const line = JSON.stringify(m); | ||
| runner.buffer.push(line); | ||
| if (runner.onLine) runner.onLine(line); | ||
| } | ||
| return { success: resp.success ?? true, text: resp.text }; | ||
| }; | ||
| return runner; | ||
| } | ||
| describe("isSuccessful", () => { | ||
| test("detects EVALUATION_SUCCESSFUL on its own line", () => { | ||
| assert.strictEqual(isSuccessful("EVALUATION_SUCCESSFUL"), true); | ||
| assert.strictEqual( | ||
| isSuccessful("Some text\nEVALUATION_SUCCESSFUL\nMore text"), | ||
| true, | ||
| ); | ||
| assert.strictEqual(isSuccessful("Done.\n\nEVALUATION_SUCCESSFUL"), true); | ||
| }); | ||
| test("tolerates markdown formatting around the signal", () => { | ||
| assert.strictEqual(isSuccessful("**EVALUATION_SUCCESSFUL**"), true); | ||
| assert.strictEqual(isSuccessful("*EVALUATION_SUCCESSFUL*"), true); | ||
| assert.strictEqual(isSuccessful("__EVALUATION_SUCCESSFUL__"), true); | ||
| assert.strictEqual(isSuccessful("_EVALUATION_SUCCESSFUL_"), true); | ||
| assert.strictEqual(isSuccessful("`EVALUATION_SUCCESSFUL`"), true); | ||
| assert.strictEqual( | ||
| isSuccessful( | ||
| "Good work.\n\n**EVALUATION_SUCCESSFUL**\n\nNow filing issues.", | ||
| ), | ||
| true, | ||
| ); | ||
| }); | ||
| test("matches EVALUATION_SUCCESSFUL anywhere in text", () => { | ||
| assert.strictEqual(isSuccessful("not EVALUATION_SUCCESSFUL yet"), true); | ||
| assert.strictEqual( | ||
| isSuccessful("The agent is EVALUATION_SUCCESSFUL done"), | ||
| true, | ||
| ); | ||
| assert.strictEqual( | ||
| isSuccessful("Great work! EVALUATION_SUCCESSFUL. Now filing issues."), | ||
| true, | ||
| ); | ||
| }); | ||
| test("does not match empty or unrelated text", () => { | ||
| assert.strictEqual(isSuccessful(""), false); | ||
| assert.strictEqual(isSuccessful("All done!"), false); | ||
| assert.strictEqual(isSuccessful("DONE"), false); | ||
| }); | ||
| test("does not match old EVALUATION_COMPLETE signal", () => { | ||
| assert.strictEqual(isSuccessful("EVALUATION_COMPLETE"), false); | ||
| }); | ||
| }); | ||
| describe("Supervisor", () => { | ||
| test("constructor throws on missing agentRunner", () => { | ||
| assert.throws( | ||
| () => | ||
| new Supervisor({ | ||
| supervisorRunner: createMockRunner([]), | ||
| output: new PassThrough(), | ||
| }), | ||
| /agentRunner is required/, | ||
| ); | ||
| }); | ||
| test("constructor throws on missing supervisorRunner", () => { | ||
| assert.throws( | ||
| () => | ||
| new Supervisor({ | ||
| agentRunner: createMockRunner([]), | ||
| output: new PassThrough(), | ||
| }), | ||
| /supervisorRunner is required/, | ||
| ); | ||
| }); | ||
| test("constructor throws on missing output", () => { | ||
| assert.throws( | ||
| () => | ||
| new Supervisor({ | ||
| agentRunner: createMockRunner([]), | ||
| supervisorRunner: createMockRunner([]), | ||
| }), | ||
| /output is required/, | ||
| ); | ||
| }); | ||
| test("completes on EVALUATION_SUCCESSFUL from supervisor at turn 0", async () => { | ||
| const agentRunner = createMockRunner([]); | ||
| const supervisorRunner = createMockRunner([ | ||
| { text: "EVALUATION_SUCCESSFUL" }, | ||
| ]); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| const result = await supervisor.run("Install stuff"); | ||
| assert.strictEqual(result.success, true); | ||
| assert.strictEqual(result.turns, 0); | ||
| }); | ||
| test("completes after one agent turn", async () => { | ||
| const agentRunner = createMockRunner([ | ||
| { text: "I installed the packages." }, | ||
| ]); | ||
| const supervisorRunner = createMockRunner([ | ||
| { text: "Welcome! Please install the packages." }, | ||
| { text: "Good work.\n\nEVALUATION_SUCCESSFUL" }, | ||
| ]); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| const result = await supervisor.run("Install stuff"); | ||
| assert.strictEqual(result.success, true); | ||
| assert.strictEqual(result.turns, 1); | ||
| }); | ||
| test("detects EVALUATION_SUCCESSFUL in streamed messages when result text differs", async () => { | ||
| // Simulates the real failure: supervisor writes EVALUATION_SUCCESSFUL in | ||
| // an early message, then continues with follow-up work (e.g. filing issues). | ||
| // The SDK result text reflects only the final message, which does NOT | ||
| // contain the signal. | ||
| const agentRunner = createMockRunner([ | ||
| { text: "I installed the packages." }, | ||
| ]); | ||
| // The supervisor's result text is the Summary (no signal), but messages | ||
| // include one with EVALUATION_SUCCESSFUL. | ||
| const supervisorMessages = [ | ||
| undefined, // turn 0: use default | ||
| [ | ||
| { | ||
| type: "assistant", | ||
| message: { | ||
| content: [ | ||
| { | ||
| type: "text", | ||
| text: "Good work.\n\nEVALUATION_SUCCESSFUL\n\nNow filing issues.", | ||
| }, | ||
| ], | ||
| }, | ||
| }, | ||
| { | ||
| type: "assistant", | ||
| message: { | ||
| content: [ | ||
| { type: "text", text: "## Summary\n\nAll issues filed." }, | ||
| ], | ||
| }, | ||
| }, | ||
| ], | ||
| ]; | ||
| const supervisorRunner = createMockRunner( | ||
| [ | ||
| { text: "Welcome! Please install the packages." }, | ||
| // Result text is the final message — does NOT contain the signal | ||
| { text: "## Summary\n\nAll issues filed." }, | ||
| ], | ||
| supervisorMessages, | ||
| ); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| agentRunner.onLine = (line) => supervisor.emitLine(line); | ||
| supervisorRunner.onLine = (line) => supervisor.emitLine(line); | ||
| const result = await supervisor.run("Install stuff"); | ||
| assert.strictEqual(result.success, true); | ||
| assert.strictEqual(result.turns, 1); | ||
| }); | ||
| test("runs multiple turns before completion", async () => { | ||
| const agentRunner = createMockRunner([ | ||
| { text: "Started working." }, | ||
| { text: "Made progress." }, | ||
| { text: "Finished everything." }, | ||
| ]); | ||
| const supervisorRunner = createMockRunner([ | ||
| { text: "Here is your task. Do the work." }, | ||
| { text: "Keep going, you need to do more." }, | ||
| { text: "Almost there, continue." }, | ||
| { text: "EVALUATION_SUCCESSFUL" }, | ||
| ]); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| const result = await supervisor.run("Do the work"); | ||
| assert.strictEqual(result.success, true); | ||
| assert.strictEqual(result.turns, 3); | ||
| }); | ||
| test("enforces maxTurns limit", async () => { | ||
| // Supervisor starts, agent responds each turn, supervisor never says done | ||
| const agentRunner = createMockRunner([ | ||
| { text: "Turn 1" }, | ||
| { text: "Turn 2" }, | ||
| ]); | ||
| const supervisorRunner = createMockRunner([ | ||
| { text: "Start working." }, | ||
| { text: "Continue." }, | ||
| { text: "Continue." }, | ||
| ]); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 2, | ||
| }); | ||
| const result = await supervisor.run("Endless task"); | ||
| assert.strictEqual(result.success, false); | ||
| assert.strictEqual(result.turns, 2); | ||
| }); | ||
| test("output contains tagged lines with correct source and turn", async () => { | ||
| const supervisorMessages = [ | ||
| [{ type: "assistant", content: "Go ahead" }], | ||
| [{ type: "assistant", content: "EVALUATION_SUCCESSFUL" }], | ||
| ]; | ||
| const agentMessages = [[{ type: "assistant", content: "Working" }]]; | ||
| const supervisorRunner = createMockRunner( | ||
| [{ text: "Go ahead" }, { text: "EVALUATION_SUCCESSFUL" }], | ||
| supervisorMessages, | ||
| ); | ||
| const agentRunner = createMockRunner([{ text: "Working" }], agentMessages); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| agentRunner.onLine = (line) => supervisor.emitLine(line); | ||
| supervisorRunner.onLine = (line) => supervisor.emitLine(line); | ||
| await supervisor.run("Task"); | ||
| const data = output.read()?.toString() ?? ""; | ||
| const lines = data | ||
| .trim() | ||
| .split("\n") | ||
| .filter((l) => l.length > 0); | ||
| // Should have: supervisor turn 0, agent turn 1, supervisor turn 1, orchestrator summary | ||
| assert.ok(lines.length >= 4); | ||
| const supervisorLine = JSON.parse(lines[0]); | ||
| assert.strictEqual(supervisorLine.source, "supervisor"); | ||
| assert.strictEqual(supervisorLine.turn, 0); | ||
| assert.ok("event" in supervisorLine); | ||
| const agentLine = JSON.parse(lines[1]); | ||
| assert.strictEqual(agentLine.source, "agent"); | ||
| assert.strictEqual(agentLine.turn, 1); | ||
| assert.ok("event" in agentLine); | ||
| const summaryLine = JSON.parse(lines[lines.length - 1]); | ||
| assert.strictEqual(summaryLine.source, "orchestrator"); | ||
| assert.strictEqual(summaryLine.type, "summary"); | ||
| assert.strictEqual(summaryLine.success, true); | ||
| }); | ||
| test("events are nested under event key (no field collisions)", async () => { | ||
| const sourceEvent = { | ||
| type: "assistant", | ||
| source: "sdk-internal", | ||
| content: "test", | ||
| }; | ||
| const supervisorRunner = createMockRunner( | ||
| [{ text: "Go" }, { text: "EVALUATION_SUCCESSFUL" }], | ||
| [ | ||
| [{ type: "assistant", content: "Go" }], | ||
| [{ type: "assistant", content: "ok" }], | ||
| ], | ||
| ); | ||
| const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| agentRunner.onLine = (line) => supervisor.emitLine(line); | ||
| supervisorRunner.onLine = (line) => supervisor.emitLine(line); | ||
| await supervisor.run("Task"); | ||
| const data = output.read()?.toString() ?? ""; | ||
| const lines = data | ||
| .trim() | ||
| .split("\n") | ||
| .filter((l) => l.length > 0); | ||
| // First line is supervisor turn 0, second is agent turn 1 | ||
| const tagged = JSON.parse(lines[1]); | ||
| // The original event's `source` field is preserved inside `event` | ||
| assert.strictEqual(tagged.source, "agent"); | ||
| assert.strictEqual(tagged.event.source, "sdk-internal"); | ||
| }); | ||
| test("emits supervisor output and summary when supervisor errors on turn 0", async () => { | ||
| const supervisorMessages = [ | ||
| [{ type: "assistant", content: "Starting..." }], | ||
| ]; | ||
| const supervisorRunner = createMockRunner( | ||
| [{ text: "Starting...", success: false }], | ||
| supervisorMessages, | ||
| ); | ||
| // Override run to simulate an error return | ||
| const origRun = supervisorRunner.run; | ||
| supervisorRunner.run = async (task) => { | ||
| const result = await origRun.call(supervisorRunner, task); | ||
| return { ...result, error: new Error("Process exited with code 1") }; | ||
| }; | ||
| const agentRunner = createMockRunner([]); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| agentRunner.onLine = (line) => supervisor.emitLine(line); | ||
| supervisorRunner.onLine = (line) => supervisor.emitLine(line); | ||
| const result = await supervisor.run("Task"); | ||
| assert.strictEqual(result.success, false); | ||
| assert.strictEqual(result.turns, 0); | ||
| // Output should still contain the supervisor's buffered lines + summary | ||
| const data = output.read()?.toString() ?? ""; | ||
| const lines = data | ||
| .trim() | ||
| .split("\n") | ||
| .filter((l) => l.length > 0); | ||
| assert.ok(lines.length >= 2, "Expected at least supervisor line + summary"); | ||
| const supervisorLine = JSON.parse(lines[0]); | ||
| assert.strictEqual(supervisorLine.source, "supervisor"); | ||
| assert.strictEqual(supervisorLine.turn, 0); | ||
| const summaryLine = JSON.parse(lines[lines.length - 1]); | ||
| assert.strictEqual(summaryLine.source, "orchestrator"); | ||
| assert.strictEqual(summaryLine.success, false); | ||
| assert.strictEqual(summaryLine.turns, 0); | ||
| }); | ||
| test("createSupervisor factory returns a Supervisor instance", () => { | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd: "/tmp/sup", | ||
| agentCwd: "/tmp/agent", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| }); | ||
| assert.ok(supervisor instanceof Supervisor); | ||
| }); | ||
| test("createSupervisor uses default supervisor tools when none specified", () => { | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd: "/tmp/sup", | ||
| agentCwd: "/tmp/agent", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| }); | ||
| assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [ | ||
| "Bash", | ||
| "Read", | ||
| "Glob", | ||
| "Grep", | ||
| "Write", | ||
| "Edit", | ||
| ]); | ||
| }); | ||
| test("createSupervisor passes custom supervisor tools", () => { | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd: "/tmp/sup", | ||
| agentCwd: "/tmp/agent", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| supervisorAllowedTools: ["Read", "Glob", "Grep"], | ||
| }); | ||
| assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [ | ||
| "Read", | ||
| "Glob", | ||
| "Grep", | ||
| ]); | ||
| }); | ||
| test("createSupervisor wires system prompts to both runners", () => { | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd: "/tmp/sup", | ||
| agentCwd: "/tmp/agent", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| }); | ||
| assert.deepStrictEqual(supervisor.agentRunner.systemPrompt, { | ||
| type: "preset", | ||
| preset: "claude_code", | ||
| append: AGENT_SYSTEM_PROMPT, | ||
| }); | ||
| assert.deepStrictEqual(supervisor.supervisorRunner.systemPrompt, { | ||
| type: "preset", | ||
| preset: "claude_code", | ||
| append: SUPERVISOR_SYSTEM_PROMPT, | ||
| }); | ||
| }); | ||
| test("createSupervisor blocks Task and TaskOutput on supervisor by default", () => { | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd: "/tmp/sup", | ||
| agentCwd: "/tmp/agent", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| }); | ||
| assert.deepStrictEqual(supervisor.supervisorRunner.disallowedTools, [ | ||
| "Task", | ||
| "TaskOutput", | ||
| ]); | ||
| // Agent should not have disallowed tools | ||
| assert.deepStrictEqual(supervisor.agentRunner.disallowedTools, []); | ||
| }); | ||
| test("createSupervisor merges custom supervisorDisallowedTools with defaults", () => { | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd: "/tmp/sup", | ||
| agentCwd: "/tmp/agent", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| supervisorDisallowedTools: ["WebSearch", "Task"], | ||
| }); | ||
| const disallowed = supervisor.supervisorRunner.disallowedTools; | ||
| assert.ok(disallowed.includes("Task")); | ||
| assert.ok(disallowed.includes("TaskOutput")); | ||
| assert.ok(disallowed.includes("WebSearch")); | ||
| // No duplicates | ||
| assert.strictEqual(disallowed.length, new Set(disallowed).size); | ||
| }); | ||
| test("system prompt constants are non-empty strings", () => { | ||
| assert.ok(typeof SUPERVISOR_SYSTEM_PROMPT === "string"); | ||
| assert.ok(typeof AGENT_SYSTEM_PROMPT === "string"); | ||
| assert.ok(SUPERVISOR_SYSTEM_PROMPT.length > 0); | ||
| assert.ok(AGENT_SYSTEM_PROMPT.length > 0); | ||
| }); | ||
| test("SUPERVISOR_SYSTEM_PROMPT explains relay mechanism", () => { | ||
| assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("relay")); | ||
| assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("EVALUATION_SUCCESSFUL")); | ||
| }); | ||
| }); |
AI-detected potential code anomaly
Supply chain riskAI has identified unusual behaviors that may pose a security risk.
Found 1 instance in 1 package
Long strings
Supply chain riskContains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
Long strings
Supply chain riskContains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
137041
30.11%20
17.65%3435
33.29%4
33.33%