@forwardimpact/libeval - npm Package Compare versions

+101

test/mock-runner.js

		/**
		* Test-only mock factory for AgentRunner. Yields pre-scripted responses,
		* and (when an `onBatch` callback is set) fires it at the same boundaries
		* the real AgentRunner would: assistant messages with at least one text
		* block, and the terminal `result` message. If the callback calls
		* `abort()`, the mock stops iterating that response's messages and
		* reports `aborted: true`.
		*
		* Intentionally a regular module (not a test file) so describe/test blocks
		* here would not run. Lives under test/ to make its scope explicit.
		*/

		import { PassThrough } from "node:stream";
		import { AgentRunner } from "@forwardimpact/libeval";

		/**
		* Whether a scripted message should trigger an onBatch flush. Mirrors the
		* real AgentRunner: assistant-with-text-block or terminal `result` message.
		* Tool-only or string-content messages accumulate without flushing.
		* @param {object} message
		* @returns {boolean}
		*/
		export function shouldFlush(message) {
		if (message.type === "result") return true;
		if (message.type !== "assistant") return false;
		const content = message.message?.content ?? message.content;
		if (!Array.isArray(content)) return false;
		for (const block of content) {
		if (block.type === "text" && block.text) return true;
		}
		return false;
		}

		/**
		* Create a mock AgentRunner that yields pre-scripted responses. Each call
		* to `run()` or `resume()` pops the next response from the array.
		* @param {object[]} responses - Array of {text, success} objects
		* @param {object[]} [messages] - Messages to buffer per response
		* @returns {AgentRunner}
		*/
		export function createMockRunner(responses, messages) {
		const output = new PassThrough();
		let callIndex = 0;

		const runner = new AgentRunner({
		cwd: "/tmp",
		query: async function* () {},
		output,
		});

		const consume = async (msgs) => {
		let aborted = false;
		for (const m of msgs) {
		const line = JSON.stringify(m);
		runner.buffer.push(line);
		if (runner.onLine) runner.onLine(line);
		if (runner.onBatch && shouldFlush(m)) {
		await runner.onBatch([line], {
		abort: () => {
		aborted = true;
		},
		});
		if (aborted) break;
		}
		}
		return aborted;
		};

		runner.run = async (_task) => {
		const resp = responses[callIndex++];
		const msgs = messages?.[callIndex - 1] ?? [
		{ type: "assistant", content: resp.text },
		];
		const aborted = await consume(msgs);
		runner.sessionId = "mock-session";
		return {
		success: resp.success ?? true,
		text: resp.text,
		sessionId: "mock-session",
		aborted,
		error: null,
		};
		};

		runner.resume = async (_prompt) => {
		const resp = responses[callIndex++];
		const msgs = messages?.[callIndex - 1] ?? [
		{ type: "assistant", content: resp.text },
		];
		const aborted = await consume(msgs);
		return {
		success: resp.success ?? true,
		text: resp.text,
		sessionId: runner.sessionId,
		aborted,
		error: null,
		};
		};

		return runner;
		}

+359

test/supervisor-intervention.test.js

		import { describe, test } from "node:test";
		import assert from "node:assert";
		import { PassThrough } from "node:stream";

		import { Supervisor } from "@forwardimpact/libeval";
		import { isIntervention } from "../src/supervisor.js";
		import { createMockRunner } from "./mock-runner.js";

		describe("isIntervention", () => {
		test("detects EVALUATION_INTERVENTION on its own line", () => {
		assert.strictEqual(isIntervention("EVALUATION_INTERVENTION"), true);
		assert.strictEqual(
		isIntervention("Some text\nEVALUATION_INTERVENTION\nMore text"),
		true,
		);
		assert.strictEqual(
		isIntervention("Stop.\n\nEVALUATION_INTERVENTION"),
		true,
		);
		});

		test("tolerates markdown formatting around the signal", () => {
		assert.strictEqual(isIntervention("EVALUATION_INTERVENTION"), true);
		assert.strictEqual(isIntervention("EVALUATION_INTERVENTION"), true);
		assert.strictEqual(isIntervention("__EVALUATION_INTERVENTION__"), true);
		assert.strictEqual(isIntervention("_EVALUATION_INTERVENTION_"), true);
		assert.strictEqual(isIntervention("`EVALUATION_INTERVENTION`"), true);
		assert.strictEqual(
		isIntervention(
		"Wrong path.\n\nEVALUATION_INTERVENTION\n\nTry the documented one.",
		),
		true,
		);
		});

		test("matches EVALUATION_INTERVENTION inline", () => {
		assert.strictEqual(
		isIntervention("Stopping you with EVALUATION_INTERVENTION now."),
		true,
		);
		assert.strictEqual(
		isIntervention("Note: EVALUATION_INTERVENTION. Switch to Y."),
		true,
		);
		});

		test("does not match empty or unrelated text", () => {
		assert.strictEqual(isIntervention(""), false);
		assert.strictEqual(isIntervention("Stop and think."), false);
		assert.strictEqual(isIntervention("INTERVENTION"), false);
		});

		test("does not match EVALUATION_COMPLETE alone", () => {
		assert.strictEqual(isIntervention("EVALUATION_COMPLETE"), false);
		assert.strictEqual(
		isIntervention("Good work.\n\nEVALUATION_COMPLETE"),
		false,
		);
		});
		});

		describe("Supervisor - mid-turn intervention", () => {
		test("observation without intervention does not interrupt the agent", async () => {
		// Agent emits one structured assistant text block — fires onBatch once.
		// Supervisor responds with "Keep going." — neither signal flag is set,
		// so the agent's SDK session completes naturally and the end-of-turn
		// review then emits EVALUATION_COMPLETE.
		const agentMessages = [
		[
		{
		type: "assistant",
		message: {
		content: [{ type: "text", text: "I'm working on it." }],
		},
		},
		],
		];

		const agentRunner = createMockRunner(
		[{ text: "I'm working on it." }],
		agentMessages,
		);

		const supervisorRunner = createMockRunner([
		{ text: "Welcome! Please install." },
		{ text: "Keep going." },
		{ text: "Good work.\n\nEVALUATION_COMPLETE" },
		]);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});
		agentRunner.onLine = (line) => supervisor.emitLine(line);
		supervisorRunner.onLine = (line) => supervisor.emitLine(line);

		let agentResumeCalls = 0;
		const origAgentResume = agentRunner.resume;
		agentRunner.resume = async (prompt) => {
		agentResumeCalls++;
		return origAgentResume.call(agentRunner, prompt);
		};

		const result = await supervisor.run("Install");

		assert.strictEqual(result.success, true);
		assert.strictEqual(result.turns, 1);
		assert.strictEqual(
		agentResumeCalls,
		0,
		"Agent should not be resumed when supervisor never intervenes",
		);

		// Trace must contain a mid_turn_review marker but no intervention markers.
		const data = output.read()?.toString() ?? "";
		const orchestratorEvents = data
		.trim()
		.split("\n")
		.filter((l) => l.length > 0)
		.map((l) => JSON.parse(l))
		.filter((e) => e.source === "orchestrator");
		assert.ok(
		orchestratorEvents.some((e) => e.event?.type === "mid_turn_review"),
		"Trace should contain mid_turn_review when onBatch fires",
		);
		assert.ok(
		!orchestratorEvents.some(
		(e) => e.event?.type === "intervention_requested",
		),
		"Trace should not contain intervention_requested when supervisor only observes",
		);
		});

		test("EVALUATION_INTERVENTION from mid-turn batch interrupts and relays", async () => {
		// Agent's first call fires onBatch on a structured assistant text block;
		// supervisor responds with EVALUATION_INTERVENTION → abort + relay.
		// Agent's second call (resume) finishes naturally; end-of-turn review
		// then emits EVALUATION_COMPLETE.
		const agentMessages = [
		[
		{
		type: "assistant",
		message: {
		content: [{ type: "text", text: "I'll try the wrong path." }],
		},
		},
		],
		[
		{
		type: "assistant",
		message: {
		content: [
		{ type: "text", text: "OK, switching to the documented path." },
		],
		},
		},
		],
		];

		const agentRunner = createMockRunner(
		[
		{ text: "I'll try the wrong path." },
		{ text: "OK, switching to the documented path." },
		],
		agentMessages,
		);

		// Supervisor responses (in order):
		// 0: turn 0 introduction
		// 1: mid-turn 1 batch 1 — intervene
		// 2: mid-turn 1 batch 1 (post-resume) — keep going
		// 3: end-of-turn 1 — EVALUATION_COMPLETE
		const supervisorMessages = [
		undefined,
		[
		{
		type: "assistant",
		message: {
		content: [
		{
		type: "text",
		text: "EVALUATION_INTERVENTION Stop and use the documented path.",
		},
		],
		},
		},
		],
		undefined,
		undefined,
		];

		const supervisorRunner = createMockRunner(
		[
		{ text: "Welcome." },
		{ text: "EVALUATION_INTERVENTION Stop and use the documented path." },
		{ text: "Keep going." },
		{ text: "Good.\n\nEVALUATION_COMPLETE" },
		],
		supervisorMessages,
		);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});
		agentRunner.onLine = (line) => supervisor.emitLine(line);
		supervisorRunner.onLine = (line) => supervisor.emitLine(line);

		let agentResumeCalls = 0;
		let firstResumePrompt = null;
		const origAgentResume = agentRunner.resume;
		agentRunner.resume = async (prompt) => {
		agentResumeCalls++;
		if (agentResumeCalls === 1) firstResumePrompt = prompt;
		return origAgentResume.call(agentRunner, prompt);
		};

		const result = await supervisor.run("Install");

		assert.strictEqual(result.success, true);
		assert.strictEqual(result.turns, 1);
		assert.strictEqual(
		agentResumeCalls,
		1,
		"Agent should be resumed exactly once after intervention",
		);
		assert.ok(
		firstResumePrompt && firstResumePrompt.includes("documented path"),
		"Resume prompt should carry the supervisor's intervention text",
		);

		const orchestratorEvents = (output.read()?.toString() ?? "")
		.trim()
		.split("\n")
		.filter((l) => l.length > 0)
		.map((l) => JSON.parse(l))
		.filter((e) => e.source === "orchestrator");
		assert.ok(
		orchestratorEvents.some(
		(e) => e.event?.type === "intervention_requested",
		),
		"Trace should contain intervention_requested orchestrator event",
		);
		assert.ok(
		orchestratorEvents.some((e) => e.event?.type === "intervention_relayed"),
		"Trace should contain intervention_relayed orchestrator event",
		);
		});

		test("EVALUATION_INTERVENTION and EVALUATION_COMPLETE in the same turn", async () => {
		// Batch 1: supervisor intervenes (abort + relay).
		// After resume, batch 1 of resume: supervisor writes EVALUATION_COMPLETE
		// (mid-turn) — the loop must exit success without running an end-of-turn
		// review.
		const agentMessages = [
		[
		{
		type: "assistant",
		message: { content: [{ type: "text", text: "Trying X." }] },
		},
		],
		[
		{
		type: "assistant",
		message: { content: [{ type: "text", text: "OK trying Y." }] },
		},
		],
		];

		const agentRunner = createMockRunner(
		[{ text: "Trying X." }, { text: "Trying Y." }],
		agentMessages,
		);

		const supervisorMessages = [
		undefined,
		[
		{
		type: "assistant",
		message: {
		content: [
		{
		type: "text",
		text: "EVALUATION_INTERVENTION Try Y instead.",
		},
		],
		},
		},
		],
		[
		{
		type: "assistant",
		message: {
		content: [{ type: "text", text: "Excellent. EVALUATION_COMPLETE" }],
		},
		},
		],
		];

		const supervisorRunner = createMockRunner(
		[
		{ text: "Welcome." },
		{ text: "EVALUATION_INTERVENTION Try Y instead." },
		{ text: "Excellent. EVALUATION_COMPLETE" },
		],
		supervisorMessages,
		);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});
		agentRunner.onLine = (line) => supervisor.emitLine(line);
		supervisorRunner.onLine = (line) => supervisor.emitLine(line);

		let agentResumeCalls = 0;
		const origAgentResume = agentRunner.resume;
		agentRunner.resume = async (prompt) => {
		agentResumeCalls++;
		return origAgentResume.call(agentRunner, prompt);
		};

		const result = await supervisor.run("Install");

		assert.strictEqual(result.success, true);
		assert.strictEqual(result.turns, 1);
		assert.strictEqual(
		agentResumeCalls,
		1,
		"Agent.resume runs once (after intervention); EVALUATION_COMPLETE then ends the turn",
		);

		const orchestratorEvents = (output.read()?.toString() ?? "")
		.trim()
		.split("\n")
		.filter((l) => l.length > 0)
		.map((l) => JSON.parse(l))
		.filter((e) => e.source === "orchestrator");
		assert.ok(
		orchestratorEvents.some(
		(e) => e.event?.type === "intervention_requested",
		),
		"Trace should contain intervention_requested",
		);
		assert.ok(
		orchestratorEvents.some((e) => e.event?.type === "complete_requested"),
		"Trace should contain complete_requested for mid-turn EVALUATION_COMPLETE",
		);
		});
		});

+368

test/supervisor-output.test.js

		import { describe, test } from "node:test";
		import assert from "node:assert";
		import { PassThrough } from "node:stream";

		import {
		Supervisor,
		createSupervisor,
		SUPERVISOR_SYSTEM_PROMPT,
		AGENT_SYSTEM_PROMPT,
		} from "@forwardimpact/libeval";
		import { createMockRunner } from "./mock-runner.js";

		describe("Supervisor - output and events", () => {
		test("output contains tagged lines with correct source and turn", async () => {
		const supervisorMessages = [
		[{ type: "assistant", content: "Go ahead" }],
		[{ type: "assistant", content: "EVALUATION_COMPLETE" }],
		];
		const agentMessages = [[{ type: "assistant", content: "Working" }]];

		const supervisorRunner = createMockRunner(
		[{ text: "Go ahead" }, { text: "EVALUATION_COMPLETE" }],
		supervisorMessages,
		);
		const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});
		agentRunner.onLine = (line) => supervisor.emitLine(line);
		supervisorRunner.onLine = (line) => supervisor.emitLine(line);

		await supervisor.run("Task");

		const data = output.read()?.toString() ?? "";
		const lines = data
		.trim()
		.split("\n")
		.filter((l) => l.length > 0);

		// Should have: supervisor turn 0, agent turn 1, supervisor turn 1, orchestrator summary
		assert.ok(lines.length >= 4);

		const supervisorLine = JSON.parse(lines[0]);
		assert.strictEqual(supervisorLine.source, "supervisor");
		assert.strictEqual(supervisorLine.turn, 0);
		assert.ok("event" in supervisorLine);

		const agentLine = JSON.parse(lines[1]);
		assert.strictEqual(agentLine.source, "agent");
		assert.strictEqual(agentLine.turn, 1);
		assert.ok("event" in agentLine);

		const summaryLine = JSON.parse(lines[lines.length - 1]);
		assert.strictEqual(summaryLine.source, "orchestrator");
		assert.strictEqual(summaryLine.type, "summary");
		assert.strictEqual(summaryLine.success, true);
		});

		test("events are nested under event key (no field collisions)", async () => {
		const sourceEvent = {
		type: "assistant",
		source: "sdk-internal",
		content: "test",
		};
		const supervisorRunner = createMockRunner(
		[{ text: "Go" }, { text: "EVALUATION_COMPLETE" }],
		[
		[{ type: "assistant", content: "Go" }],
		[{ type: "assistant", content: "ok" }],
		],
		);
		const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});
		agentRunner.onLine = (line) => supervisor.emitLine(line);
		supervisorRunner.onLine = (line) => supervisor.emitLine(line);

		await supervisor.run("Task");

		const data = output.read()?.toString() ?? "";
		const lines = data
		.trim()
		.split("\n")
		.filter((l) => l.length > 0);

		// First line is supervisor turn 0, second is agent turn 1
		const tagged = JSON.parse(lines[1]);
		assert.strictEqual(tagged.source, "agent");
		assert.strictEqual(tagged.event.source, "sdk-internal");
		});

		test("mid-turn intervention emits orchestrator events and shares the agent's turn id", async () => {
		// Agent emits one structured assistant text block on its first call —
		// supervisor intervenes mid-turn. Resume then completes naturally and
		// the end-of-turn review signals EVALUATION_COMPLETE.
		const agentMessages = [
		[
		{
		type: "assistant",
		message: {
		content: [{ type: "text", text: "Trying the wrong thing." }],
		},
		},
		],
		[
		{
		type: "assistant",
		message: {
		content: [{ type: "text", text: "Switching to the right thing." }],
		},
		},
		],
		];

		const supervisorMessages = [
		undefined,
		[
		{
		type: "assistant",
		message: {
		content: [
		{
		type: "text",
		text: "EVALUATION_INTERVENTION Switch to the right path.",
		},
		],
		},
		},
		],
		undefined,
		undefined,
		];

		const agentRunner = createMockRunner(
		[{ text: "Trying the wrong thing." }, { text: "Switching." }],
		agentMessages,
		);
		const supervisorRunner = createMockRunner(
		[
		{ text: "Welcome." },
		{ text: "EVALUATION_INTERVENTION Switch to the right path." },
		{ text: "Keep going." },
		{ text: "Done. EVALUATION_COMPLETE" },
		],
		supervisorMessages,
		);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});
		agentRunner.onLine = (line) => supervisor.emitLine(line);
		supervisorRunner.onLine = (line) => supervisor.emitLine(line);

		const result = await supervisor.run("Task");
		assert.strictEqual(result.success, true);

		const lines = (output.read()?.toString() ?? "")
		.trim()
		.split("\n")
		.filter((l) => l.length > 0)
		.map((l) => JSON.parse(l));

		// (1) Orchestrator event with intervention_requested.
		const interventionRequested = lines.find(
		(l) =>
		l.source === "orchestrator" &&
		l.event?.type === "intervention_requested",
		);
		assert.ok(
		interventionRequested,
		"Trace must contain intervention_requested orchestrator event",
		);

		// (2) At least one agent line and one supervisor line share a turn id —
		// mid-turn supervisor activity is tagged with the agent's turn.
		const agentTurns = new Set(
		lines.filter((l) => l.source === "agent").map((l) => l.turn),
		);
		const supervisorTurns = new Set(
		lines.filter((l) => l.source === "supervisor").map((l) => l.turn),
		);
		const sharedTurns = [...agentTurns].filter((t) => supervisorTurns.has(t));
		assert.ok(
		sharedTurns.length > 0,
		"At least one turn id must appear on both agent and supervisor lines",
		);

		// (3) Final summary line still emitted.
		const summary = lines[lines.length - 1];
		assert.strictEqual(summary.source, "orchestrator");
		assert.strictEqual(summary.type, "summary");
		assert.strictEqual(summary.success, true);
		});

		test("emits supervisor output and summary when supervisor errors on turn 0", async () => {
		const supervisorMessages = [
		[{ type: "assistant", content: "Starting..." }],
		];
		const supervisorRunner = createMockRunner(
		[{ text: "Starting...", success: false }],
		supervisorMessages,
		);

		const origRun = supervisorRunner.run;
		supervisorRunner.run = async (task) => {
		const result = await origRun.call(supervisorRunner, task);
		return { ...result, error: new Error("Process exited with code 1") };
		};

		const agentRunner = createMockRunner([]);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});
		agentRunner.onLine = (line) => supervisor.emitLine(line);
		supervisorRunner.onLine = (line) => supervisor.emitLine(line);

		const result = await supervisor.run("Task");

		assert.strictEqual(result.success, false);
		assert.strictEqual(result.turns, 0);

		const data = output.read()?.toString() ?? "";
		const lines = data
		.trim()
		.split("\n")
		.filter((l) => l.length > 0);

		assert.ok(lines.length >= 2, "Expected at least supervisor line + summary");

		const supervisorLine = JSON.parse(lines[0]);
		assert.strictEqual(supervisorLine.source, "supervisor");
		assert.strictEqual(supervisorLine.turn, 0);

		const summaryLine = JSON.parse(lines[lines.length - 1]);
		assert.strictEqual(summaryLine.source, "orchestrator");
		assert.strictEqual(summaryLine.success, false);
		assert.strictEqual(summaryLine.turns, 0);
		});
		});

		describe("Supervisor - createSupervisor factory", () => {
		test("createSupervisor factory returns a Supervisor instance", () => {
		const supervisor = createSupervisor({
		supervisorCwd: "/tmp/sup",
		agentCwd: "/tmp/agent",
		query: async function* () {},
		output: new PassThrough(),
		});
		assert.ok(supervisor instanceof Supervisor);
		});

		test("createSupervisor uses default supervisor tools when none specified", () => {
		const supervisor = createSupervisor({
		supervisorCwd: "/tmp/sup",
		agentCwd: "/tmp/agent",
		query: async function* () {},
		output: new PassThrough(),
		});
		assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
		"Bash",
		"Read",
		"Glob",
		"Grep",
		"Write",
		"Edit",
		]);
		});

		test("createSupervisor passes custom supervisor tools", () => {
		const supervisor = createSupervisor({
		supervisorCwd: "/tmp/sup",
		agentCwd: "/tmp/agent",
		query: async function* () {},
		output: new PassThrough(),
		supervisorAllowedTools: ["Read", "Glob", "Grep"],
		});
		assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
		"Read",
		"Glob",
		"Grep",
		]);
		});

		test("createSupervisor wires system prompts to both runners", () => {
		const supervisor = createSupervisor({
		supervisorCwd: "/tmp/sup",
		agentCwd: "/tmp/agent",
		query: async function* () {},
		output: new PassThrough(),
		});

		assert.deepStrictEqual(supervisor.agentRunner.systemPrompt, {
		type: "preset",
		preset: "claude_code",
		append: AGENT_SYSTEM_PROMPT,
		});
		assert.deepStrictEqual(supervisor.supervisorRunner.systemPrompt, {
		type: "preset",
		preset: "claude_code",
		append: SUPERVISOR_SYSTEM_PROMPT,
		});
		});

		test("createSupervisor blocks sub-agent spawn tools on supervisor by default", () => {
		const supervisor = createSupervisor({
		supervisorCwd: "/tmp/sup",
		agentCwd: "/tmp/agent",
		query: async function* () {},
		output: new PassThrough(),
		});
		assert.deepStrictEqual(supervisor.supervisorRunner.disallowedTools, [
		"Agent",
		"Task",
		"TaskOutput",
		"TaskStop",
		]);
		assert.deepStrictEqual(supervisor.agentRunner.disallowedTools, []);
		});

		test("createSupervisor merges custom supervisorDisallowedTools with defaults", () => {
		const supervisor = createSupervisor({
		supervisorCwd: "/tmp/sup",
		agentCwd: "/tmp/agent",
		query: async function* () {},
		output: new PassThrough(),
		supervisorDisallowedTools: ["WebSearch", "Task"],
		});
		const disallowed = supervisor.supervisorRunner.disallowedTools;
		assert.ok(disallowed.includes("Agent"));
		assert.ok(disallowed.includes("Task"));
		assert.ok(disallowed.includes("TaskOutput"));
		assert.ok(disallowed.includes("TaskStop"));
		assert.ok(disallowed.includes("WebSearch"));
		assert.strictEqual(disallowed.length, new Set(disallowed).size);
		});

		test("system prompt constants are non-empty strings", () => {
		assert.ok(typeof SUPERVISOR_SYSTEM_PROMPT === "string");
		assert.ok(typeof AGENT_SYSTEM_PROMPT === "string");
		assert.ok(SUPERVISOR_SYSTEM_PROMPT.length > 0);
		assert.ok(AGENT_SYSTEM_PROMPT.length > 0);
		});

		test("SUPERVISOR_SYSTEM_PROMPT explains relay mechanism", () => {
		assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("relay"));
		assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("EVALUATION_COMPLETE"));
		});
		});

+310

test/supervisor-run.test.js

		import { describe, test } from "node:test";
		import assert from "node:assert";
		import { PassThrough } from "node:stream";

		import { Supervisor } from "@forwardimpact/libeval";
		import { isComplete } from "../src/supervisor.js";
		import { createMockRunner } from "./mock-runner.js";

		describe("isComplete", () => {
		test("detects EVALUATION_COMPLETE on its own line", () => {
		assert.strictEqual(isComplete("EVALUATION_COMPLETE"), true);
		assert.strictEqual(
		isComplete("Some text\nEVALUATION_COMPLETE\nMore text"),
		true,
		);
		assert.strictEqual(isComplete("Done.\n\nEVALUATION_COMPLETE"), true);
		});

		test("tolerates markdown formatting around the signal", () => {
		assert.strictEqual(isComplete("EVALUATION_COMPLETE"), true);
		assert.strictEqual(isComplete("EVALUATION_COMPLETE"), true);
		assert.strictEqual(isComplete("__EVALUATION_COMPLETE__"), true);
		assert.strictEqual(isComplete("_EVALUATION_COMPLETE_"), true);
		assert.strictEqual(isComplete("`EVALUATION_COMPLETE`"), true);
		assert.strictEqual(
		isComplete("Good work.\n\nEVALUATION_COMPLETE\n\nNow filing issues."),
		true,
		);
		});

		test("matches EVALUATION_COMPLETE anywhere in text", () => {
		assert.strictEqual(isComplete("not EVALUATION_COMPLETE yet"), true);
		assert.strictEqual(
		isComplete("The agent is EVALUATION_COMPLETE done"),
		true,
		);
		assert.strictEqual(
		isComplete("Great work! EVALUATION_COMPLETE. Now filing issues."),
		true,
		);
		});

		test("does not match empty or unrelated text", () => {
		assert.strictEqual(isComplete(""), false);
		assert.strictEqual(isComplete("All done!"), false);
		assert.strictEqual(isComplete("DONE"), false);
		});

		test("does not match old EVALUATION_SUCCESSFUL signal", () => {
		assert.strictEqual(isComplete("EVALUATION_SUCCESSFUL"), false);
		});
		});

		describe("Supervisor - run and turns", () => {
		test("constructor throws on missing agentRunner", () => {
		assert.throws(
		() =>
		new Supervisor({
		supervisorRunner: createMockRunner([]),
		output: new PassThrough(),
		}),
		/agentRunner is required/,
		);
		});

		test("constructor throws on missing supervisorRunner", () => {
		assert.throws(
		() =>
		new Supervisor({
		agentRunner: createMockRunner([]),
		output: new PassThrough(),
		}),
		/supervisorRunner is required/,
		);
		});

		test("constructor throws on missing output", () => {
		assert.throws(
		() =>
		new Supervisor({
		agentRunner: createMockRunner([]),
		supervisorRunner: createMockRunner([]),
		}),
		/output is required/,
		);
		});

		test("completes on EVALUATION_COMPLETE from supervisor at turn 0", async () => {
		const agentRunner = createMockRunner([]);

		const supervisorRunner = createMockRunner([
		{ text: "EVALUATION_COMPLETE" },
		]);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});

		const result = await supervisor.run("Install stuff");

		assert.strictEqual(result.success, true);
		assert.strictEqual(result.turns, 0);
		});

		test("completes after one agent turn", async () => {
		const agentRunner = createMockRunner([
		{ text: "I installed the packages." },
		]);

		const supervisorRunner = createMockRunner([
		{ text: "Welcome! Please install the packages." },
		{ text: "Good work.\n\nEVALUATION_COMPLETE" },
		]);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});

		const result = await supervisor.run("Install stuff");

		assert.strictEqual(result.success, true);
		assert.strictEqual(result.turns, 1);
		});

		test("detects EVALUATION_COMPLETE in streamed messages when result text differs", async () => {
		const agentRunner = createMockRunner([
		{ text: "I installed the packages." },
		]);

		const supervisorMessages = [
		undefined,
		[
		{
		type: "assistant",
		message: {
		content: [
		{
		type: "text",
		text: "Good work.\n\nEVALUATION_COMPLETE\n\nNow filing issues.",
		},
		],
		},
		},
		{
		type: "assistant",
		message: {
		content: [
		{ type: "text", text: "## Summary\n\nAll issues filed." },
		],
		},
		},
		],
		];

		const supervisorRunner = createMockRunner(
		[
		{ text: "Welcome! Please install the packages." },
		{ text: "## Summary\n\nAll issues filed." },
		],
		supervisorMessages,
		);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});
		agentRunner.onLine = (line) => supervisor.emitLine(line);
		supervisorRunner.onLine = (line) => supervisor.emitLine(line);

		const result = await supervisor.run("Install stuff");

		assert.strictEqual(result.success, true);
		assert.strictEqual(result.turns, 1);
		});

		test("relays only the last assistant text block to the agent", async () => {
		// Supervisor emits reasoning text ("Let me research...") then a tool call,
		// then a final task message. Only the final message should reach the agent.
		const supervisorMessages = [
		// Turn 0: multiple assistant messages with reasoning + task
		[
		{
		type: "assistant",
		message: {
		content: [
		{ type: "text", text: "Let me research the product first." },
		],
		},
		},
		{
		type: "assistant",
		message: {
		content: [
		{
		type: "text",
		text: "Hello! Here is your task: install the packages.",
		},
		],
		},
		},
		],
		// Turn 1: evaluation
		undefined,
		];

		let capturedAgentPrompt = null;
		const agentRunner = createMockRunner([
		{ text: "I installed the packages." },
		]);
		const origRun = agentRunner.run;
		agentRunner.run = async (task) => {
		capturedAgentPrompt = task;
		return origRun.call(agentRunner, task);
		};

		const supervisorRunner = createMockRunner(
		[
		// SDK result text = last message text (but relay should use buffer)
		{ text: "Hello! Here is your task: install the packages." },
		{ text: "EVALUATION_COMPLETE" },
		],
		supervisorMessages,
		);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});

		await supervisor.run("Evaluate the product");

		// Agent should receive only the final text, not the reasoning
		assert.strictEqual(
		capturedAgentPrompt,
		"Hello! Here is your task: install the packages.",
		);
		assert.ok(
		!capturedAgentPrompt.includes("research"),
		"Reasoning text should not leak to agent",
		);
		});

		test("runs multiple turns before completion", async () => {
		const agentRunner = createMockRunner([
		{ text: "Started working." },
		{ text: "Made progress." },
		{ text: "Finished everything." },
		]);

		const supervisorRunner = createMockRunner([
		{ text: "Here is your task. Do the work." },
		{ text: "Keep going, you need to do more." },
		{ text: "Almost there, continue." },
		{ text: "EVALUATION_COMPLETE" },
		]);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});

		const result = await supervisor.run("Do the work");

		assert.strictEqual(result.success, true);
		assert.strictEqual(result.turns, 3);
		});

		test("enforces maxTurns limit", async () => {
		const agentRunner = createMockRunner([
		{ text: "Turn 1" },
		{ text: "Turn 2" },
		]);

		const supervisorRunner = createMockRunner([
		{ text: "Start working." },
		{ text: "Continue." },
		{ text: "Continue." },
		]);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 2,
		});

		const result = await supervisor.run("Endless task");

		assert.strictEqual(result.success, false);
		assert.strictEqual(result.turns, 2);
		});
		});

+2

-2

bin/fit-eval.js

		@@ -32,3 +32,3 @@ #!/usr/bin/env node
		--model=MODEL Claude model to use (default: opus)
		--max-turns=N Maximum agentic turns (default: 50)
		--max-turns=N Maximum agentic turns (default: 50, 0 = unlimited)
		--output=PATH Write NDJSON trace to file (default: stdout)
		@@ -44,3 +44,3 @@ --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
		--model=MODEL Claude model to use (default: opus)
		--max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
		--max-turns=N Maximum supervisor ↔ agent exchanges (default: 20, 0 = unlimited)
		--output=PATH Write NDJSON trace to file (default: stdout)
		@@ -47,0 +47,0 @@ --allowed-tools=LIST Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)

+2

-0

index.js

		@@ -8,3 +8,5 @@ export { TraceCollector, createTraceCollector } from "./src/trace-collector.js";
		AGENT_SYSTEM_PROMPT,
		isComplete,
		isIntervention,
		} from "./src/supervisor.js";
		export { TeeWriter, createTeeWriter } from "./src/tee-writer.js";

+1

-1

package.json

		{
		"name": "@forwardimpact/libeval",
		"version": "0.1.6",
		"version": "0.1.8",
		"description": "Process Claude Code stream-json output into structured traces",
		@@ -5,0 +5,0 @@ "license": "Apache-2.0",

+97

-39

src/agent-runner.js

		@@ -20,2 +20,3 @@ /**
		* @param {function} [deps.onLine] - Callback invoked with each NDJSON line as it's produced
		* @param {function} [deps.onBatch] - Async callback invoked with a batch of NDJSON lines at flush boundaries (assistant text blocks and result messages). Receives `(lines, { abort })` where calling `abort()` stops the in-flight SDK session via the AbortController. Optional; assignable at runtime so the Supervisor can swap it per turn.
		* @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)
		@@ -35,2 +36,3 @@ * @param {string} [deps.agentProfile] - Agent profile name to pass as --agent to the Claude CLI
		onLine,
		onBatch,
		settingSources,
		@@ -48,3 +50,3 @@ agentProfile,
		this.model = model ?? "opus";
		this.maxTurns = maxTurns ?? 50;
		this.maxTurns = maxTurns ?? 50; // 0 means unlimited (omit from SDK)
		this.allowedTools = allowedTools ?? [
		@@ -60,2 +62,3 @@ "Bash",
		this.onLine = onLine ?? null;
		this.onBatch = onBatch ?? null;
		this.settingSources = settingSources ?? [];
		@@ -67,2 +70,4 @@ this.agentProfile = agentProfile ?? null;
		this.buffer = [];
		/** @type {AbortController\|null} */
		this.currentAbortController = null;
		}
		@@ -73,11 +78,9 @@
		* @param {string} task - The task prompt
		* @returns {Promise<{success: boolean, text: string, sessionId: string\|null}>}
		* @returns {Promise<{success: boolean, text: string, sessionId: string\|null, error: Error\|null, aborted: boolean}>}
		*/
		async run(task) {
		let text = "";
		let stopReason = null;
		let error = null;

		const abortController = new AbortController();
		this.currentAbortController = abortController;
		try {
		for await (const message of this.query({
		const iterator = this.query({
		prompt: task,
		@@ -87,3 +90,3 @@ options: {
		allowedTools: this.allowedTools,
		maxTurns: this.maxTurns,
		...(this.maxTurns > 0 && { maxTurns: this.maxTurns }),
		model: this.model,
		@@ -93,2 +96,3 @@ permissionMode: this.permissionMode,
		settingSources: this.settingSources,
		abortController,
		...(this.disallowedTools.length > 0 && {
		@@ -100,25 +104,7 @@ disallowedTools: this.disallowedTools,
		},
		})) {
		const line = JSON.stringify(message);
		this.output.write(line + "\n");
		this.buffer.push(line);
		if (this.onLine) this.onLine(line);

		if (message.type === "system" && message.subtype === "init") {
		this.sessionId = message.session_id;
		}
		if (message.type === "result") {
		text = message.result ?? "";
		stopReason = message.subtype;
		}
		}
		} catch (err) {
		error = err;
		});
		return await this.#consumeQuery(iterator);
		} finally {
		this.currentAbortController = null;
		}

		// If the SDK already emitted a successful result, honour it even when the
		// stream throws afterwards (e.g. "Credit balance is too low" during
		// cleanup). Only treat errors as fatal when no result was received yet.
		const success = stopReason === "success";
		return { success, text, sessionId: this.sessionId, error };
		}
		@@ -129,11 +115,9 @@
		* @param {string} prompt - The follow-up prompt
		* @returns {Promise<{success: boolean, text: string}>}
		* @returns {Promise<{success: boolean, text: string, sessionId: string\|null, error: Error\|null, aborted: boolean}>}
		*/
		async resume(prompt) {
		let text = "";
		let stopReason = null;
		let error = null;

		const abortController = new AbortController();
		this.currentAbortController = abortController;
		try {
		for await (const message of this.query({
		const iterator = this.query({
		prompt,
		@@ -144,4 +128,42 @@ options: {
		allowDangerouslySkipPermissions: true,
		abortController,
		},
		})) {
		});
		return await this.#consumeQuery(iterator);
		} finally {
		this.currentAbortController = null;
		}
		}

		/**
		* Shared consumer for both `run()` and `resume()`. Iterates the SDK query
		* iterator, mirroring every line to the output stream / buffer / onLine
		* callback, and — when `onBatch` is set — flushes accumulated lines to it
		* at natural boundaries (assistant messages with text blocks, and the
		* terminal `result` message).
		*
		* INVARIANT: the `await this.onBatch(...)` call below is the ONLY
		* suspension point in this loop. While it is pending, no further lines
		* are pulled from the SDK generator. The Supervisor relies on this — its
		* onBatch callback flips `currentSource` to "supervisor" for the duration
		* of its mid-turn LLM call, and the invariant guarantees no agent line
		* can arrive concurrently and be mis-tagged.
		*
		* If the supervisor calls `abort()` from inside the callback, the next
		* iteration of the for-await loop will throw. We catch the throw, check
		* `currentAbortController.signal.aborted` (avoiding fragility around
		* AbortError vs DOMException shapes), and report `aborted: true` so the
		* caller can distinguish "supervisor asked us to stop" from a real error.
		* @param {AsyncIterable<object>} iterator
		* @returns {Promise<{success: boolean, text: string, sessionId: string\|null, error: Error\|null, aborted: boolean}>}
		*/
		async #consumeQuery(iterator) {
		let text = "";
		let stopReason = null;
		let error = null;
		let aborted = false;
		const pendingBatch = [];

		try {
		for await (const message of iterator) {
		const line = JSON.stringify(message);
		@@ -151,3 +173,7 @@ this.output.write(line + "\n");
		if (this.onLine) this.onLine(line);
		if (this.onBatch) pendingBatch.push(line);

		if (message.type === "system" && message.subtype === "init") {
		this.sessionId = message.session_id;
		}
		if (message.type === "result") {
		@@ -157,9 +183,24 @@ text = message.result ?? "";
		}

		const shouldFlush =
		this.onBatch &&
		(message.type === "result" \|\|
		(message.type === "assistant" && hasTextBlock(message)));
		if (shouldFlush) {
		const batchLines = pendingBatch.splice(0, pendingBatch.length);
		await this.onBatch(batchLines, {
		abort: () => this.currentAbortController?.abort(),
		});
		}
		}
		} catch (err) {
		error = err;
		if (this.currentAbortController?.signal.aborted) {
		aborted = true;
		} else {
		error = err;
		}
		}

		const success = stopReason === "success";
		return { success, text, error };
		return { success, text, sessionId: this.sessionId, error, aborted };
		}
		@@ -179,2 +220,19 @@
		/**
		* Whether an SDK assistant message contains at least one text block.
		* Tool-only assistant messages return false so they accumulate into the
		* pending batch and flush with the next text block (or with the terminal
		* `result` message), keeping supervisor LLM cost bounded.
		* @param {object} message
		* @returns {boolean}
		*/
		function hasTextBlock(message) {
		const content = message.message?.content ?? message.content;
		if (!Array.isArray(content)) return false;
		for (const block of content) {
		if (block.type === "text" && block.text) return true;
		}
		return false;
		}

		/**
		* Factory function — wires real dependencies.
		@@ -181,0 +239,0 @@ * @param {object} deps - Same as AgentRunner constructor

+43

-18

src/commands/run.js

		@@ -22,2 +22,34 @@ import { readFileSync, createWriteStream } from "node:fs";
		/**
		* Parse and validate run command options from args.
		* @param {string[]} args
		* @returns {{ taskContent: string, cwd: string, model: string, maxTurns: number, outputPath: string\|undefined, agentProfile: string\|undefined, allowedTools: string[] }}
		*/
		function parseRunOptions(args) {
		const taskFile = parseFlag(args, "task-file");
		const taskText = parseFlag(args, "task-text");
		if (taskFile && taskText)
		throw new Error("--task-file and --task-text are mutually exclusive");
		if (!taskFile && !taskText)
		throw new Error("--task-file or --task-text is required");

		const maxTurnsRaw = parseFlag(args, "max-turns") ?? "50";
		const taskAmend = parseFlag(args, "task-amend") ?? undefined;
		let taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
		if (taskAmend) taskContent += `\n\n${taskAmend}`;

		return {
		taskContent,
		cwd: resolve(parseFlag(args, "cwd") ?? "."),
		model: parseFlag(args, "model") ?? "opus",
		maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
		outputPath: parseFlag(args, "output"),
		agentProfile: parseFlag(args, "agent-profile") ?? undefined,
		allowedTools: (
		parseFlag(args, "allowed-tools") ??
		"Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite"
		).split(","),
		};
		}

		/**
		* Run command — execute a single agent via the Claude Agent SDK.
		@@ -32,6 +64,7 @@ *
		* --model=MODEL Claude model to use (default: opus)
		* --max-turns=N Maximum agentic turns (default: 50)
		* --max-turns=N Maximum agentic turns (default: 50, 0 = unlimited)
		* --output=PATH Write NDJSON trace to file (default: stdout)
		* --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
		* --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
		* --task-amend=TEXT Additional text appended to the task prompt
		*
		@@ -41,20 +74,12 @@ * @param {string[]} args - Command arguments
		export async function runRunCommand(args) {
		const taskFile = parseFlag(args, "task-file");
		const taskText = parseFlag(args, "task-text");
		if (taskFile && taskText)
		throw new Error("--task-file and --task-text are mutually exclusive");
		if (!taskFile && !taskText)
		throw new Error("--task-file or --task-text is required");
		const {
		taskContent,
		cwd,
		model,
		maxTurns,
		outputPath,
		agentProfile,
		allowedTools,
		} = parseRunOptions(args);

		const cwd = resolve(parseFlag(args, "cwd") ?? ".");
		const model = parseFlag(args, "model") ?? "opus";
		const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "50", 10);
		const outputPath = parseFlag(args, "output");
		const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
		const allowedTools = (
		parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
		).split(",");

		const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;

		// When --output is specified, stream text to stdout while writing NDJSON to file.
		@@ -61,0 +86,0 @@ // Otherwise, write NDJSON directly to stdout (backwards-compatible).

+59

-37

src/commands/supervise.js

		@@ -23,2 +23,46 @@ import { readFileSync, createWriteStream, mkdtempSync } from "node:fs";
		/**
		* Parse all supervise flags from args into an options object.
		* @param {string[]} args
		* @returns {object}
		*/
		function parseSuperviseOptions(args) {
		const taskFile = parseFlag(args, "task-file");
		const taskText = parseFlag(args, "task-text");
		if (taskFile && taskText)
		throw new Error("--task-file and --task-text are mutually exclusive");
		if (!taskFile && !taskText)
		throw new Error("--task-file or --task-text is required");

		const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");

		const taskAmend = parseFlag(args, "task-amend") ?? undefined;
		let taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
		if (taskAmend) taskContent += `\n\n${taskAmend}`;

		return {
		taskContent,
		supervisorCwd: resolve(parseFlag(args, "supervisor-cwd") ?? "."),
		agentCwd: resolve(
		parseFlag(args, "agent-cwd") ??
		mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
		),
		model: parseFlag(args, "model") ?? "opus",
		maxTurns: (() => {
		const raw = parseFlag(args, "max-turns") ?? "20";
		return raw === "0" ? 0 : parseInt(raw, 10);
		})(),
		outputPath: parseFlag(args, "output"),
		supervisorProfile: parseFlag(args, "supervisor-profile") ?? undefined,
		agentProfile: parseFlag(args, "agent-profile") ?? undefined,
		allowedTools: (
		parseFlag(args, "allowed-tools") ??
		"Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite"
		).split(","),
		supervisorAllowedTools: supervisorAllowedToolsRaw
		? supervisorAllowedToolsRaw.split(",")
		: undefined,
		};
		}

		/**
		* Supervise command — run two agents in a relay loop via the Claude Agent SDK.
		@@ -34,3 +78,3 @@ *
		* --model=MODEL Claude model to use (default: opus)
		* --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
		* --max-turns=N Maximum supervisor / agent exchanges (default: 20, 0 = unlimited)
		* --output=PATH Write NDJSON trace to file (default: stdout)
		@@ -40,2 +84,3 @@ * --allowed-tools=LIST Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit)
		* --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
		* --task-amend=TEXT Additional text appended to the task prompt
		*
		@@ -45,32 +90,9 @@ * @param {string[]} args - Command arguments
		export async function runSuperviseCommand(args) {
		const taskFile = parseFlag(args, "task-file");
		const taskText = parseFlag(args, "task-text");
		if (taskFile && taskText)
		throw new Error("--task-file and --task-text are mutually exclusive");
		if (!taskFile && !taskText)
		throw new Error("--task-file or --task-text is required");
		const opts = parseSuperviseOptions(args);

		const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");
		const agentCwd = resolve(
		parseFlag(args, "agent-cwd") ??
		mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
		);
		const model = parseFlag(args, "model") ?? "opus";
		const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "20", 10);
		const outputPath = parseFlag(args, "output");
		const supervisorProfile = parseFlag(args, "supervisor-profile") ?? undefined;
		const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
		const allowedTools = (
		parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
		).split(",");
		const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");
		const supervisorAllowedTools = supervisorAllowedToolsRaw
		? supervisorAllowedToolsRaw.split(",")
		: undefined;

		const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;

		// When --output is specified, stream text to stdout while writing NDJSON to file.
		// Otherwise, write NDJSON directly to stdout (backwards-compatible).
		const fileStream = outputPath ? createWriteStream(outputPath) : null;
		const fileStream = opts.outputPath
		? createWriteStream(opts.outputPath)
		: null;
		const output = fileStream
		@@ -86,15 +108,15 @@ ? createTeeWriter({
		const supervisor = createSupervisor({
		supervisorCwd,
		agentCwd,
		supervisorCwd: opts.supervisorCwd,
		agentCwd: opts.agentCwd,
		query,
		output,
		model,
		maxTurns,
		allowedTools,
		supervisorAllowedTools,
		supervisorProfile,
		agentProfile,
		model: opts.model,
		maxTurns: opts.maxTurns,
		allowedTools: opts.allowedTools,
		supervisorAllowedTools: opts.supervisorAllowedTools,
		supervisorProfile: opts.supervisorProfile,
		agentProfile: opts.agentProfile,
		});

		const result = await supervisor.run(taskContent);
		const result = await supervisor.run(opts.taskContent);

		@@ -101,0 +123,0 @@ if (fileStream) {

+298

-59

src/supervisor.js

		@@ -16,4 +16,4 @@ /**
		* Check if the supervisor's response signals evaluation success.
		* Matches EVALUATION_SUCCESSFUL anywhere in the text, tolerating markdown
		* formatting (e.g. EVALUATION_SUCCESSFUL). Uses word boundaries to
		* Matches EVALUATION_COMPLETE anywhere in the text, tolerating markdown
		* formatting (e.g. EVALUATION_COMPLETE). Uses word boundaries to
		* avoid matching inside longer identifiers.
		@@ -23,16 +23,40 @@ * @param {string} text
		*/
		export function isSuccessful(text) {
		return /(?:^\|[\s_~`])EVALUATION_SUCCESSFUL(?:[\s_~`.,!?]\|$)/m.test(text);
		export function isComplete(text) {
		return /(?:^\|[\s_~`])EVALUATION_COMPLETE(?:[\s_~`.,!?]\|$)/m.test(text);
		}

		/**
		* Check if the supervisor's response signals a mid-turn intervention.
		* Same tolerance rules as isComplete (markdown formatting, word boundaries),
		* but matches the EVALUATION_INTERVENTION keyword instead.
		* @param {string} text
		* @returns {boolean}
		*/
		export function isIntervention(text) {
		return /(?:^\|[\s_~`])EVALUATION_INTERVENTION(?:[\s_~`.,!?]\|$)/m.test(text);
		}

		/** System prompt appended for the supervisor runner in supervise mode. */
		export const SUPERVISOR_SYSTEM_PROMPT =
		"You supervise another AI agent through a relay — your output becomes the agent's next input. " +
		"Guide the agent, answer its questions, and write EVALUATION_SUCCESSFUL when their task is complete.";
		"You relay messages to one persistent agent session — your only output " +
		"channel. Spawning sub-agents or restarting the agent is blocked. Do not " +
		"do the work yourself. Reply briefly to let the agent continue, write " +
		"EVALUATION_INTERVENTION + instructions to interrupt mid-turn, or " +
		"EVALUATION_COMPLETE when done. Only your final message each turn is " +
		"relayed.";

		/** System prompt appended for the agent runner in supervise mode. */
		export const AGENT_SYSTEM_PROMPT =
		"You are being supervised by another AI agent. " +
		"When requirements are ambiguous or you are uncertain, stop and ask a clarifying question before proceeding.";
		"A supervisor watches your work and may interrupt with new instructions " +
		"mid-task. Treat any new prompt as authoritative and adjust course. " +
		"When uncertain, stop and ask a clarifying question.";

		/**
		* Maximum number of mid-turn interventions allowed within a single agent turn.
		* Bounded so a looping supervisor exhausts its quota fast (observability) but
		* leaves headroom for legitimate "intervene, observe, intervene again" patterns.
		* The outer maxTurns budget still bounds overall runtime.
		*/
		const MAX_INTERVENTIONS_PER_TURN = 5;

		export class Supervisor {
		@@ -61,3 +85,3 @@ /**
		* The SDK result text only reflects the last assistant message, so when
		* the supervisor writes EVALUATION_SUCCESSFUL in an early message and
		* the supervisor writes EVALUATION_COMPLETE in an early message and
		* then continues with follow-up work, the result text won't contain it.
		@@ -67,3 +91,19 @@ * This flag captures the signal from the full message stream.
		*/
		this.successSignalSeen = false;
		this.completeSignalSeen = false;
		/**
		* Set to true when any supervisor message contains EVALUATION_INTERVENTION.
		* Mirrors completeSignalSeen — populated by emitLine when a supervisor
		* assistant text block matches isIntervention(...). The mid-turn loop
		* reads this flag after each supervisor invocation to decide whether to
		* abort the agent's in-flight SDK session.
		* @type {boolean}
		*/
		this.interventionSignalSeen = false;
		/**
		* The most recent supervisor SDK result captured inside the mid-turn
		* onBatch callback. The outer loop reads this after the agent aborts to
		* build the next relay prompt without re-running the supervisor.
		* @type {{success: boolean, text: string}\|null}
		*/
		this.lastSupervisorResult = null;
		}
		@@ -82,3 +122,5 @@
		this.currentTurn = 0;
		this.successSignalSeen = false;
		this.completeSignalSeen = false;
		this.interventionSignalSeen = false;
		this.lastSupervisorResult = null;
		let supervisorResult = await this.supervisorRunner.run(task);
		@@ -93,6 +135,6 @@
		// streamed message content. The SDK result text only reflects the last
		// assistant message, so when the supervisor writes EVALUATION_SUCCESSFUL
		// assistant message, so when the supervisor writes EVALUATION_COMPLETE
		// early and then continues (e.g. filing issues), we must also check the
		// flag set by emitLine during streaming.
		if (this.successSignalSeen \|\| isSuccessful(supervisorResult.text)) {
		if (this.completeSignalSeen \|\| isComplete(supervisorResult.text)) {
		this.emitSummary({ success: true, turns: 0 });
		@@ -102,49 +144,181 @@ return { success: true, turns: 0 };

		for (let turn = 1; turn <= this.maxTurns; turn++) {
		// Supervisor's output becomes the agent's input
		this.currentSource = "agent";
		this.currentTurn = turn;
		let agentResult;
		if (turn === 1) {
		agentResult = await this.agentRunner.run(supervisorResult.text);
		} else {
		agentResult = await this.agentRunner.resume(supervisorResult.text);
		}
		const turnLimit = this.maxTurns === 0 ? Infinity : this.maxTurns;
		for (let turn = 1; turn <= turnLimit; turn++) {
		// Only the supervisor's final message is relayed to the agent.
		// Extract the last assistant text block from the buffer to avoid
		// leaking intermediate reasoning (research, tool calls, notes).
		const relay = this.extractLastText(
		this.supervisorRunner,
		supervisorResult.text,
		);

		if (agentResult.error) {
		this.emitSummary({ success: false, turns: turn });
		return { success: false, turns: turn };
		}
		// Drive the agent through interventions until its SDK session ends
		// naturally, the supervisor signals completion mid-turn, or the
		// per-turn intervention budget is exhausted.
		const turnOutcome = await this.#runAgentTurn(turn, relay);
		if (turnOutcome.exit) return turnOutcome.exit;

		// Build the full agent transcript from buffered NDJSON events so the
		// supervisor sees tool calls and reasoning, not just the SDK result summary.
		const agentTranscript = this.extractTranscript(this.agentRunner);
		// End-of-turn review (existing behaviour). Returns either an exit
		// outcome (error or completion) or the supervisor result for the
		// next turn's relay.
		const reviewOutcome = await this.#endOfTurnReview(turn);
		if (reviewOutcome.exit) return reviewOutcome.exit;
		supervisorResult = reviewOutcome.supervisorResult;
		}

		const supervisorPrompt =
		`The agent reported:\n\n${agentTranscript}\n\n` +
		`Review the agent's work and decide how to proceed.`;
		this.emitSummary({ success: false, turns: this.maxTurns });
		return { success: false, turns: this.maxTurns };
		}

		this.currentSource = "supervisor";
		this.currentTurn = turn;
		this.successSignalSeen = false;
		supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
		/**
		* Drive the agent through one turn, allowing the supervisor to interrupt
		* mid-stream via EVALUATION_INTERVENTION. Returns either an `exit` outcome
		* (the loop should return immediately) or `{exit: null}` (proceed to the
		* end-of-turn review).
		* @param {number} turn
		* @param {string} initialRelay
		* @returns {Promise<{exit: {success: boolean, turns: number}\|null}>}
		*/
		async #runAgentTurn(turn, initialRelay) {
		let relay = initialRelay;
		let interventions = 0;

		if (supervisorResult.error) {
		this.emitSummary({ success: false, turns: turn });
		return { success: false, turns: turn };
		}
		// Wire the mid-turn observation hook on the agent runner. The bound
		// callback captures `turn` so the inner loop's multiple resume(...)
		// calls all see the same turn id. The supervisorRunner does NOT get
		// an onBatch callback — it only fires onLine, which is enough for
		// emitLine to detect EVALUATION_COMPLETE / EVALUATION_INTERVENTION.
		this.agentRunner.onBatch = (batchLines, ctx) =>
		this.#midTurnReview(turn, batchLines, ctx);

		// The supervisor's turn is fully complete — check for success signal
		// in either the SDK result text or streamed messages.
		if (this.successSignalSeen \|\| isSuccessful(supervisorResult.text)) {
		this.emitSummary({ success: true, turns: turn });
		return { success: true, turns: turn };
		try {
		while (true) {
		this.currentSource = "agent";
		this.currentTurn = turn;
		const isFirstAgentCall = turn === 1 && interventions === 0;
		const agentResult = isFirstAgentCall
		? await this.agentRunner.run(relay)
		: await this.agentRunner.resume(relay);

		if (agentResult.error && !agentResult.aborted) {
		this.emitSummary({ success: false, turns: turn });
		return { exit: { success: false, turns: turn } };
		}

		// Mid-turn EVALUATION_COMPLETE: end the session immediately.
		if (this.completeSignalSeen) {
		this.emitSummary({ success: true, turns: turn });
		return { exit: { success: true, turns: turn } };
		}

		if (agentResult.aborted && this.interventionSignalSeen) {
		interventions++;
		if (interventions >= MAX_INTERVENTIONS_PER_TURN) {
		this.emitOrchestratorEvent({ type: "intervention_limit", turn });
		return { exit: null };
		}
		relay = this.extractLastText(
		this.supervisorRunner,
		this.lastSupervisorResult?.text ?? "",
		);
		this.emitOrchestratorEvent({ type: "intervention_relayed", turn });
		continue;
		}

		// Agent's SDK session finished naturally — proceed to end-of-turn.
		return { exit: null };
		}
		} finally {
		// Detach onBatch before the end-of-turn review so the supervisor's
		// own SDK session does not trigger nested onBatch fires.
		this.agentRunner.onBatch = null;
		}
		}

		this.emitSummary({ success: false, turns: this.maxTurns });
		return { success: false, turns: this.maxTurns };
		/**
		* Mid-turn supervisor review fired from inside the agent's onBatch hook.
		* Emits a `mid_turn_review` orchestrator marker, runs the supervisor's
		* LLM against the batch, and aborts the agent if the supervisor signals
		* EVALUATION_INTERVENTION or EVALUATION_COMPLETE.
		* @param {number} turn
		* @param {string[]} batchLines
		* @param {{abort: () => void}} ctx
		*/
		async #midTurnReview(turn, batchLines, { abort }) {
		const batchTranscript = this.renderBatch(batchLines);

		// Order matters: emit the orchestrator marker BEFORE the supervisor
		// LLM call so the trace reads
		// agent line → orchestrator:mid_turn_review
		// → supervisor lines (tagged turn:N)
		// → orchestrator:intervention_requested\|complete_requested
		this.emitOrchestratorEvent({ type: "mid_turn_review", turn });

		// currentTurn stays = turn so mid-turn supervisor lines share the
		// agent's turn id. They are distinguishable from end-of-turn reviews
		// by the surrounding orchestrator events emitted around this call.
		this.currentSource = "supervisor";
		this.completeSignalSeen = false;
		this.interventionSignalSeen = false;

		this.lastSupervisorResult = await this.supervisorRunner.resume(
		`The agent is mid-turn. Latest batch:\n\n${batchTranscript}\n\n` +
		`Respond with a brief acknowledgement to let it continue, or write ` +
		`EVALUATION_INTERVENTION followed by a corrective message to stop ` +
		`and relay a new instruction. Write EVALUATION_COMPLETE only when ` +
		`the task is fully done.`,
		);
		this.currentSource = "agent";

		if (this.interventionSignalSeen) {
		this.emitOrchestratorEvent({ type: "intervention_requested", turn });
		abort();
		return;
		}
		if (this.completeSignalSeen) {
		this.emitOrchestratorEvent({ type: "complete_requested", turn });
		abort();
		}
		// Non-intervention: do nothing; the agent loop pulls the next line.
		}

		/**
		* End-of-turn supervisor review (existing behaviour). Returns either an
		* exit outcome (error or completion) or the supervisor result so the
		* outer loop can build the next turn's relay.
		* @param {number} turn
		* @returns {Promise<{exit: {success: boolean, turns: number}\|null, supervisorResult?: object}>}
		*/
		async #endOfTurnReview(turn) {
		// Build the full agent transcript from buffered NDJSON events so the
		// supervisor sees tool calls and reasoning, not just the SDK result.
		const agentTranscript = this.extractTranscript(this.agentRunner);

		const supervisorPrompt =
		`The agent reported:\n\n${agentTranscript}\n\n` +
		`Review the agent's work and decide how to proceed.`;

		this.currentSource = "supervisor";
		this.currentTurn = turn;
		this.completeSignalSeen = false;
		this.interventionSignalSeen = false;
		const supervisorResult =
		await this.supervisorRunner.resume(supervisorPrompt);

		if (supervisorResult.error) {
		this.emitSummary({ success: false, turns: turn });
		return { exit: { success: false, turns: turn } };
		}

		// The supervisor's turn is fully complete — check for success signal
		// in either the SDK result text or streamed messages.
		if (this.completeSignalSeen \|\| isComplete(supervisorResult.text)) {
		this.emitSummary({ success: true, turns: turn });
		return { exit: { success: true, turns: turn } };
		}

		return { exit: null, supervisorResult };
		}

		/**
		* Extract a human-readable transcript from an AgentRunner's buffered output.
		@@ -165,2 +339,27 @@ * Drains the buffer and replays events through a TraceCollector.
		/**
		* Extract only the last assistant text block from an AgentRunner's buffer.
		* Scans buffered NDJSON events in reverse to find the final assistant message
		* with a text content block. This prevents intermediate reasoning (tool calls,
		* research notes) from leaking to the agent.
		* @param {import("./agent-runner.js").AgentRunner} runner
		* @param {string} fallback - Fallback text if no assistant text block is found
		* @returns {string}
		*/
		extractLastText(runner, fallback) {
		const lines = runner.buffer;
		for (let i = lines.length - 1; i >= 0; i--) {
		const event = JSON.parse(lines[i]);
		if (event.type !== "assistant") continue;
		const content = event.message?.content ?? event.content;
		if (!Array.isArray(content)) continue;
		for (let j = content.length - 1; j >= 0; j--) {
		if (content[j].type === "text" && content[j].text) {
		return content[j].text;
		}
		}
		}
		return fallback;
		}

		/**
		* Emit a single NDJSON line tagged with the current source and turn.
		@@ -170,3 +369,4 @@ * Called in real-time via the AgentRunner onLine callback.
		* When the current source is the supervisor, also scans assistant text
		* content for the EVALUATION_SUCCESSFUL signal and sets successSignalSeen.
		* content for the EVALUATION_COMPLETE and EVALUATION_INTERVENTION signals,
		* setting completeSignalSeen / interventionSignalSeen respectively.
		* @param {string} line - Raw NDJSON line from the runner
		@@ -183,6 +383,6 @@ */

		// Scan supervisor assistant messages for the success signal in real time.
		// Scan supervisor assistant messages for the signals in real time.
		// The SDK result text only reflects the final assistant message, but the
		// supervisor may write EVALUATION_SUCCESSFUL in an earlier message and
		// then continue with follow-up tool calls.
		// supervisor may write EVALUATION_COMPLETE / EVALUATION_INTERVENTION in
		// an earlier message and then continue with follow-up tool calls.
		if (this.currentSource === "supervisor" && event.type === "assistant") {
		@@ -192,5 +392,5 @@ const content = event.message?.content ?? event.content ?? [];
		for (const block of content) {
		if (block.type === "text" && isSuccessful(block.text)) {
		this.successSignalSeen = true;
		}
		if (block.type !== "text" \|\| !block.text) continue;
		if (isComplete(block.text)) this.completeSignalSeen = true;
		if (isIntervention(block.text)) this.interventionSignalSeen = true;
		}
		@@ -202,2 +402,37 @@ }
		/**
		* Render a batch of buffered NDJSON lines as human-readable text for the
		* mid-turn supervisor prompt. Reuses the TraceCollector pipeline so the
		* supervisor sees tool calls and reasoning, not just raw events.
		* @param {string[]} batchLines
		* @returns {string}
		*/
		renderBatch(batchLines) {
		if (batchLines.length === 0) return "[empty]";
		const collector = new TraceCollector();
		for (const line of batchLines) {
		collector.addLine(line);
		}
		return collector.toText() \|\| "[empty]";
		}

		/**
		* Emit an orchestrator-source NDJSON line. Used by the mid-turn loop to
		* mark mid_turn_review / intervention_requested / intervention_relayed /
		* intervention_limit / complete_requested boundaries in the trace, so the
		* improvement coach can distinguish mid-turn supervisor activity from
		* end-of-turn reviews. Additive to existing trace shape — the parser
		* already reads `source` and ignores unknown event types.
		* @param {{type: string, turn?: number}} event
		*/
		emitOrchestratorEvent(event) {
		this.output.write(
		JSON.stringify({
		source: "orchestrator",
		turn: this.currentTurn,
		event,
		}) + "\n",
		);
		}

		/**
		* Emit a final orchestrator summary line.
		@@ -268,6 +503,10 @@ * @param {{success: boolean, turns: number}} result

		// Block Task/TaskOutput so the supervisor cannot spawn its own sub-agents.
		// The relay loop handles agent communication — letting the supervisor use
		// Task would bypass the relay and produce an empty agent trace.
		const defaultDisallowed = ["Task", "TaskOutput"];
		// Block every sub-agent spawning tool so the supervisor cannot bypass the
		// relay loop. The current Claude Agent SDK exposes the spawn tool to the
		// model as `Agent`; older versions called it `Task`. Both are blocked
		// (along with TaskOutput/TaskStop) so the supervisor sees no spawn tool
		// regardless of which SDK version is installed. Letting the supervisor
		// spawn its own sub-agent would bypass the relay and produce an empty
		// agent trace, which is the failure mode that motivated this default.
		const defaultDisallowed = ["Agent", "Task", "TaskOutput", "TaskStop"];
		const disallowedTools = supervisorDisallowedTools
		@@ -274,0 +513,0 @@ ? [...new Set([...defaultDisallowed, ...supervisorDisallowedTools])]

-554

test/supervisor.test.js

		import { describe, test } from "node:test";
		import assert from "node:assert";
		import { PassThrough } from "node:stream";

		import {
		AgentRunner,
		Supervisor,
		createSupervisor,
		SUPERVISOR_SYSTEM_PROMPT,
		AGENT_SYSTEM_PROMPT,
		} from "@forwardimpact/libeval";
		import { isSuccessful } from "../src/supervisor.js";

		/**
		* Create a mock AgentRunner that yields pre-scripted responses.
		* Each call to run() or resume() pops the next response from the array.
		* @param {object[]} responses - Array of {text, success} objects
		* @param {object[]} [messages] - Messages to buffer per turn
		* @returns {AgentRunner}
		*/
		function createMockRunner(responses, messages) {
		const output = new PassThrough();
		let callIndex = 0;

		const runner = new AgentRunner({
		cwd: "/tmp",
		query: async function* () {},
		output,
		});

		// Override run and resume to return scripted responses
		runner.run = async (_task) => {
		const resp = responses[callIndex++];
		const msgs = messages?.[callIndex - 1] ?? [
		{ type: "assistant", content: resp.text },
		];
		for (const m of msgs) {
		const line = JSON.stringify(m);
		runner.buffer.push(line);
		if (runner.onLine) runner.onLine(line);
		}
		runner.sessionId = "mock-session";
		return {
		success: resp.success ?? true,
		text: resp.text,
		sessionId: "mock-session",
		};
		};

		runner.resume = async (_prompt) => {
		const resp = responses[callIndex++];
		const msgs = messages?.[callIndex - 1] ?? [
		{ type: "assistant", content: resp.text },
		];
		for (const m of msgs) {
		const line = JSON.stringify(m);
		runner.buffer.push(line);
		if (runner.onLine) runner.onLine(line);
		}
		return { success: resp.success ?? true, text: resp.text };
		};

		return runner;
		}

		describe("isSuccessful", () => {
		test("detects EVALUATION_SUCCESSFUL on its own line", () => {
		assert.strictEqual(isSuccessful("EVALUATION_SUCCESSFUL"), true);
		assert.strictEqual(
		isSuccessful("Some text\nEVALUATION_SUCCESSFUL\nMore text"),
		true,
		);
		assert.strictEqual(isSuccessful("Done.\n\nEVALUATION_SUCCESSFUL"), true);
		});

		test("tolerates markdown formatting around the signal", () => {
		assert.strictEqual(isSuccessful("EVALUATION_SUCCESSFUL"), true);
		assert.strictEqual(isSuccessful("EVALUATION_SUCCESSFUL"), true);
		assert.strictEqual(isSuccessful("__EVALUATION_SUCCESSFUL__"), true);
		assert.strictEqual(isSuccessful("_EVALUATION_SUCCESSFUL_"), true);
		assert.strictEqual(isSuccessful("`EVALUATION_SUCCESSFUL`"), true);
		assert.strictEqual(
		isSuccessful(
		"Good work.\n\nEVALUATION_SUCCESSFUL\n\nNow filing issues.",
		),
		true,
		);
		});

		test("matches EVALUATION_SUCCESSFUL anywhere in text", () => {
		assert.strictEqual(isSuccessful("not EVALUATION_SUCCESSFUL yet"), true);
		assert.strictEqual(
		isSuccessful("The agent is EVALUATION_SUCCESSFUL done"),
		true,
		);
		assert.strictEqual(
		isSuccessful("Great work! EVALUATION_SUCCESSFUL. Now filing issues."),
		true,
		);
		});

		test("does not match empty or unrelated text", () => {
		assert.strictEqual(isSuccessful(""), false);
		assert.strictEqual(isSuccessful("All done!"), false);
		assert.strictEqual(isSuccessful("DONE"), false);
		});

		test("does not match old EVALUATION_COMPLETE signal", () => {
		assert.strictEqual(isSuccessful("EVALUATION_COMPLETE"), false);
		});
		});

		describe("Supervisor", () => {
		test("constructor throws on missing agentRunner", () => {
		assert.throws(
		() =>
		new Supervisor({
		supervisorRunner: createMockRunner([]),
		output: new PassThrough(),
		}),
		/agentRunner is required/,
		);
		});

		test("constructor throws on missing supervisorRunner", () => {
		assert.throws(
		() =>
		new Supervisor({
		agentRunner: createMockRunner([]),
		output: new PassThrough(),
		}),
		/supervisorRunner is required/,
		);
		});

		test("constructor throws on missing output", () => {
		assert.throws(
		() =>
		new Supervisor({
		agentRunner: createMockRunner([]),
		supervisorRunner: createMockRunner([]),
		}),
		/output is required/,
		);
		});

		test("completes on EVALUATION_SUCCESSFUL from supervisor at turn 0", async () => {
		const agentRunner = createMockRunner([]);

		const supervisorRunner = createMockRunner([
		{ text: "EVALUATION_SUCCESSFUL" },
		]);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});

		const result = await supervisor.run("Install stuff");

		assert.strictEqual(result.success, true);
		assert.strictEqual(result.turns, 0);
		});

		test("completes after one agent turn", async () => {
		const agentRunner = createMockRunner([
		{ text: "I installed the packages." },
		]);

		const supervisorRunner = createMockRunner([
		{ text: "Welcome! Please install the packages." },
		{ text: "Good work.\n\nEVALUATION_SUCCESSFUL" },
		]);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});

		const result = await supervisor.run("Install stuff");

		assert.strictEqual(result.success, true);
		assert.strictEqual(result.turns, 1);
		});

		test("detects EVALUATION_SUCCESSFUL in streamed messages when result text differs", async () => {
		// Simulates the real failure: supervisor writes EVALUATION_SUCCESSFUL in
		// an early message, then continues with follow-up work (e.g. filing issues).
		// The SDK result text reflects only the final message, which does NOT
		// contain the signal.
		const agentRunner = createMockRunner([
		{ text: "I installed the packages." },
		]);

		// The supervisor's result text is the Summary (no signal), but messages
		// include one with EVALUATION_SUCCESSFUL.
		const supervisorMessages = [
		undefined, // turn 0: use default
		[
		{
		type: "assistant",
		message: {
		content: [
		{
		type: "text",
		text: "Good work.\n\nEVALUATION_SUCCESSFUL\n\nNow filing issues.",
		},
		],
		},
		},
		{
		type: "assistant",
		message: {
		content: [
		{ type: "text", text: "## Summary\n\nAll issues filed." },
		],
		},
		},
		],
		];

		const supervisorRunner = createMockRunner(
		[
		{ text: "Welcome! Please install the packages." },
		// Result text is the final message — does NOT contain the signal
		{ text: "## Summary\n\nAll issues filed." },
		],
		supervisorMessages,
		);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});
		agentRunner.onLine = (line) => supervisor.emitLine(line);
		supervisorRunner.onLine = (line) => supervisor.emitLine(line);

		const result = await supervisor.run("Install stuff");

		assert.strictEqual(result.success, true);
		assert.strictEqual(result.turns, 1);
		});

		test("runs multiple turns before completion", async () => {
		const agentRunner = createMockRunner([
		{ text: "Started working." },
		{ text: "Made progress." },
		{ text: "Finished everything." },
		]);

		const supervisorRunner = createMockRunner([
		{ text: "Here is your task. Do the work." },
		{ text: "Keep going, you need to do more." },
		{ text: "Almost there, continue." },
		{ text: "EVALUATION_SUCCESSFUL" },
		]);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});

		const result = await supervisor.run("Do the work");

		assert.strictEqual(result.success, true);
		assert.strictEqual(result.turns, 3);
		});

		test("enforces maxTurns limit", async () => {
		// Supervisor starts, agent responds each turn, supervisor never says done
		const agentRunner = createMockRunner([
		{ text: "Turn 1" },
		{ text: "Turn 2" },
		]);

		const supervisorRunner = createMockRunner([
		{ text: "Start working." },
		{ text: "Continue." },
		{ text: "Continue." },
		]);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 2,
		});

		const result = await supervisor.run("Endless task");

		assert.strictEqual(result.success, false);
		assert.strictEqual(result.turns, 2);
		});

		test("output contains tagged lines with correct source and turn", async () => {
		const supervisorMessages = [
		[{ type: "assistant", content: "Go ahead" }],
		[{ type: "assistant", content: "EVALUATION_SUCCESSFUL" }],
		];
		const agentMessages = [[{ type: "assistant", content: "Working" }]];

		const supervisorRunner = createMockRunner(
		[{ text: "Go ahead" }, { text: "EVALUATION_SUCCESSFUL" }],
		supervisorMessages,
		);
		const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});
		agentRunner.onLine = (line) => supervisor.emitLine(line);
		supervisorRunner.onLine = (line) => supervisor.emitLine(line);

		await supervisor.run("Task");

		const data = output.read()?.toString() ?? "";
		const lines = data
		.trim()
		.split("\n")
		.filter((l) => l.length > 0);

		// Should have: supervisor turn 0, agent turn 1, supervisor turn 1, orchestrator summary
		assert.ok(lines.length >= 4);

		const supervisorLine = JSON.parse(lines[0]);
		assert.strictEqual(supervisorLine.source, "supervisor");
		assert.strictEqual(supervisorLine.turn, 0);
		assert.ok("event" in supervisorLine);

		const agentLine = JSON.parse(lines[1]);
		assert.strictEqual(agentLine.source, "agent");
		assert.strictEqual(agentLine.turn, 1);
		assert.ok("event" in agentLine);

		const summaryLine = JSON.parse(lines[lines.length - 1]);
		assert.strictEqual(summaryLine.source, "orchestrator");
		assert.strictEqual(summaryLine.type, "summary");
		assert.strictEqual(summaryLine.success, true);
		});

		test("events are nested under event key (no field collisions)", async () => {
		const sourceEvent = {
		type: "assistant",
		source: "sdk-internal",
		content: "test",
		};
		const supervisorRunner = createMockRunner(
		[{ text: "Go" }, { text: "EVALUATION_SUCCESSFUL" }],
		[
		[{ type: "assistant", content: "Go" }],
		[{ type: "assistant", content: "ok" }],
		],
		);
		const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});
		agentRunner.onLine = (line) => supervisor.emitLine(line);
		supervisorRunner.onLine = (line) => supervisor.emitLine(line);

		await supervisor.run("Task");

		const data = output.read()?.toString() ?? "";
		const lines = data
		.trim()
		.split("\n")
		.filter((l) => l.length > 0);

		// First line is supervisor turn 0, second is agent turn 1
		const tagged = JSON.parse(lines[1]);
		// The original event's `source` field is preserved inside `event`
		assert.strictEqual(tagged.source, "agent");
		assert.strictEqual(tagged.event.source, "sdk-internal");
		});

		test("emits supervisor output and summary when supervisor errors on turn 0", async () => {
		const supervisorMessages = [
		[{ type: "assistant", content: "Starting..." }],
		];
		const supervisorRunner = createMockRunner(
		[{ text: "Starting...", success: false }],
		supervisorMessages,
		);

		// Override run to simulate an error return
		const origRun = supervisorRunner.run;
		supervisorRunner.run = async (task) => {
		const result = await origRun.call(supervisorRunner, task);
		return { ...result, error: new Error("Process exited with code 1") };
		};

		const agentRunner = createMockRunner([]);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});
		agentRunner.onLine = (line) => supervisor.emitLine(line);
		supervisorRunner.onLine = (line) => supervisor.emitLine(line);

		const result = await supervisor.run("Task");

		assert.strictEqual(result.success, false);
		assert.strictEqual(result.turns, 0);

		// Output should still contain the supervisor's buffered lines + summary
		const data = output.read()?.toString() ?? "";
		const lines = data
		.trim()
		.split("\n")
		.filter((l) => l.length > 0);

		assert.ok(lines.length >= 2, "Expected at least supervisor line + summary");

		const supervisorLine = JSON.parse(lines[0]);
		assert.strictEqual(supervisorLine.source, "supervisor");
		assert.strictEqual(supervisorLine.turn, 0);

		const summaryLine = JSON.parse(lines[lines.length - 1]);
		assert.strictEqual(summaryLine.source, "orchestrator");
		assert.strictEqual(summaryLine.success, false);
		assert.strictEqual(summaryLine.turns, 0);
		});

		test("createSupervisor factory returns a Supervisor instance", () => {
		const supervisor = createSupervisor({
		supervisorCwd: "/tmp/sup",
		agentCwd: "/tmp/agent",
		query: async function* () {},
		output: new PassThrough(),
		});
		assert.ok(supervisor instanceof Supervisor);
		});

		test("createSupervisor uses default supervisor tools when none specified", () => {
		const supervisor = createSupervisor({
		supervisorCwd: "/tmp/sup",
		agentCwd: "/tmp/agent",
		query: async function* () {},
		output: new PassThrough(),
		});
		assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
		"Bash",
		"Read",
		"Glob",
		"Grep",
		"Write",
		"Edit",
		]);
		});

		test("createSupervisor passes custom supervisor tools", () => {
		const supervisor = createSupervisor({
		supervisorCwd: "/tmp/sup",
		agentCwd: "/tmp/agent",
		query: async function* () {},
		output: new PassThrough(),
		supervisorAllowedTools: ["Read", "Glob", "Grep"],
		});
		assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
		"Read",
		"Glob",
		"Grep",
		]);
		});

		test("createSupervisor wires system prompts to both runners", () => {
		const supervisor = createSupervisor({
		supervisorCwd: "/tmp/sup",
		agentCwd: "/tmp/agent",
		query: async function* () {},
		output: new PassThrough(),
		});

		assert.deepStrictEqual(supervisor.agentRunner.systemPrompt, {
		type: "preset",
		preset: "claude_code",
		append: AGENT_SYSTEM_PROMPT,
		});
		assert.deepStrictEqual(supervisor.supervisorRunner.systemPrompt, {
		type: "preset",
		preset: "claude_code",
		append: SUPERVISOR_SYSTEM_PROMPT,
		});
		});

		test("createSupervisor blocks Task and TaskOutput on supervisor by default", () => {
		const supervisor = createSupervisor({
		supervisorCwd: "/tmp/sup",
		agentCwd: "/tmp/agent",
		query: async function* () {},
		output: new PassThrough(),
		});
		assert.deepStrictEqual(supervisor.supervisorRunner.disallowedTools, [
		"Task",
		"TaskOutput",
		]);
		// Agent should not have disallowed tools
		assert.deepStrictEqual(supervisor.agentRunner.disallowedTools, []);
		});

		test("createSupervisor merges custom supervisorDisallowedTools with defaults", () => {
		const supervisor = createSupervisor({
		supervisorCwd: "/tmp/sup",
		agentCwd: "/tmp/agent",
		query: async function* () {},
		output: new PassThrough(),
		supervisorDisallowedTools: ["WebSearch", "Task"],
		});
		const disallowed = supervisor.supervisorRunner.disallowedTools;
		assert.ok(disallowed.includes("Task"));
		assert.ok(disallowed.includes("TaskOutput"));
		assert.ok(disallowed.includes("WebSearch"));
		// No duplicates
		assert.strictEqual(disallowed.length, new Set(disallowed).size);
		});

		test("system prompt constants are non-empty strings", () => {
		assert.ok(typeof SUPERVISOR_SYSTEM_PROMPT === "string");
		assert.ok(typeof AGENT_SYSTEM_PROMPT === "string");
		assert.ok(SUPERVISOR_SYSTEM_PROMPT.length > 0);
		assert.ok(AGENT_SYSTEM_PROMPT.length > 0);
		});

		test("SUPERVISOR_SYSTEM_PROMPT explains relay mechanism", () => {
		assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("relay"));
		assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("EVALUATION_SUCCESSFUL"));
		});
		});

@forwardimpact/libeval - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Worsened metrics