Big News: Socket raises $60M Series C at a $1B valuation to secure software supply chains for AI-driven development. Announcement →

@forwardimpact/libeval

Advanced tools

License

Install Socket

Detect and block malicious and high-risk dependencies

Install

@forwardimpact/libeval - npm Package Compare versions

Comparing version

0.1.3

0.1.5

+12

-4

bin/fit-eval.js

		@@ -28,3 +28,4 @@ #!/usr/bin/env node
		Run options:
		--task=PATH Path to task file (required)
		--task-file=PATH Path to task file (mutually exclusive with --task-text)
		--task-text=STRING Inline task text (mutually exclusive with --task-file)
		--cwd=DIR Agent working directory (default: .)
		@@ -35,5 +36,7 @@ --model=MODEL Claude model to use (default: opus)
		--allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
		--agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)

		Supervise options:
		--task=PATH Path to task file (required)
		--task-file=PATH Path to task file (mutually exclusive with --task-text)
		--task-text=STRING Inline task text (mutually exclusive with --task-file)
		--supervisor-cwd=DIR Supervisor working directory (default: .)
		@@ -45,2 +48,6 @@ --agent-cwd=DIR Agent working directory (default: temp directory)
		--allowed-tools=LIST Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)
		--supervisor-allowed-tools=LIST
		Comma-separated tools for supervisor (default: Bash,Read,Glob,Grep,Write,Edit)
		--supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
		--agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)

		@@ -56,4 +63,5 @@ Options:
		fit-eval tee output.ndjson < trace.ndjson
		fit-eval run --task=.github/tasks/security-audit.md --model=opus
		fit-eval supervise --task=scenarios/guide-setup/task.md --supervisor-cwd=.
		fit-eval run --task-text="Perform a security audit of the repository." --model=opus
		fit-eval run --task-file=scenarios/guide-setup/task.md --model=opus
		fit-eval supervise --task-file=scenarios/guide-setup/task.md --supervisor-cwd=.
		`.trim();
		@@ -60,0 +68,0 @@

+6

-1

index.js

		export { TraceCollector, createTraceCollector } from "./src/trace-collector.js";
		export { AgentRunner, createAgentRunner } from "./src/agent-runner.js";
		export { Supervisor, createSupervisor } from "./src/supervisor.js";
		export {
		Supervisor,
		createSupervisor,
		SUPERVISOR_SYSTEM_PROMPT,
		AGENT_SYSTEM_PROMPT,
		} from "./src/supervisor.js";
		export { TeeWriter, createTeeWriter } from "./src/tee-writer.js";

+4

-3

package.json

		{
		"name": "@forwardimpact/libeval",
		"version": "0.1.3",
		"version": "0.1.5",
		"description": "Process Claude Code stream-json output into structured traces",
		@@ -13,3 +13,4 @@ "license": "Apache-2.0",
		"engines": {
		"bun": ">=1.2.0"
		"bun": ">=1.2.0",
		"node": ">=18.0.0"
		},
		@@ -20,3 +21,3 @@ "scripts": {
		"dependencies": {
		"@anthropic-ai/claude-agent-sdk": "^0.1.0"
		"@anthropic-ai/claude-agent-sdk": "^0.2.91"
		},
		@@ -23,0 +24,0 @@ "publishConfig": {

+19

-1

src/agent-runner.js

		@@ -21,2 +21,5 @@ /**
		* @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)
		* @param {string} [deps.agentProfile] - Agent profile name to pass as --agent to the Claude CLI
		* @param {string|object} [deps.systemPrompt] - SDK system prompt (string replaces default; {type:'preset', preset:'claude_code', append} appends)
		* @param {string[]} [deps.disallowedTools] - Tools to explicitly remove from the model's context
		*/
		@@ -33,2 +36,5 @@ constructor({
		settingSources,
		agentProfile,
		systemPrompt,
		disallowedTools,
		}) {
		@@ -54,2 +60,5 @@ if (!cwd) throw new Error("cwd is required");
		this.settingSources = settingSources ?? [];
		this.agentProfile = agentProfile ?? null;
		this.systemPrompt = systemPrompt ?? null;
		this.disallowedTools = disallowedTools ?? [];
		this.sessionId = null;
		@@ -80,2 +89,7 @@ this.buffer = [];
		settingSources: this.settingSources,
		...(this.disallowedTools.length > 0 && {
		disallowedTools: this.disallowedTools,
		}),
		...(this.systemPrompt && { systemPrompt: this.systemPrompt }),
		...(this.agentProfile && { extraArgs: { agent: this.agentProfile } }),
		},
		@@ -120,3 +134,7 @@ })) {
		prompt,
		options: { resume: this.sessionId },
		options: {
		resume: this.sessionId,
		permissionMode: this.permissionMode,
		allowDangerouslySkipPermissions: true,
		},
		})) {
		@@ -123,0 +141,0 @@ const line = JSON.stringify(message);

+12

-4

src/commands/run.js

		@@ -27,3 +27,4 @@ import { readFileSync, createWriteStream } from "node:fs";
		* Options:
		* --task=PATH Path to task file (required)
		* --task-file=PATH Path to task file (mutually exclusive with --task-text)
		* --task-text=STRING Inline task text (mutually exclusive with --task-file)
		* --cwd=DIR Agent working directory (default: .)
		@@ -34,2 +35,3 @@ * --model=MODEL Claude model to use (default: opus)
		* --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
		* --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
		*
		@@ -39,4 +41,8 @@ * @param {string[]} args - Command arguments
		export async function runRunCommand(args) {
		const task = parseFlag(args, "task");
		if (!task) throw new Error("--task is required");
		const taskFile = parseFlag(args, "task-file");
		const taskText = parseFlag(args, "task-text");
		if (taskFile && taskText)
		throw new Error("--task-file and --task-text are mutually exclusive");
		if (!taskFile && !taskText)
		throw new Error("--task-file or --task-text is required");

		@@ -47,2 +53,3 @@ const cwd = resolve(parseFlag(args, "cwd") ?? ".");
		const outputPath = parseFlag(args, "output");
		const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
		const allowedTools = (
		@@ -52,3 +59,3 @@ parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"

		const taskContent = readFileSync(task, "utf8");
		const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;

		@@ -71,2 +78,3 @@ // When --output is specified, stream text to stdout while writing NDJSON to file.
		settingSources: ["project"],
		agentProfile,
		});
		@@ -73,0 +81,0 @@

+20

-4

src/commands/supervise.js

		@@ -28,3 +28,4 @@ import { readFileSync, createWriteStream, mkdtempSync } from "node:fs";
		* Options:
		* --task=PATH Path to task file (required)
		* --task-file=PATH Path to task file (mutually exclusive with --task-text)
		* --task-text=STRING Inline task text (mutually exclusive with --task-file)
		* --supervisor-cwd=DIR Supervisor working directory (default: .)
		@@ -36,2 +37,4 @@ * --agent-cwd=DIR Agent working directory (default: temp directory)
		* --allowed-tools=LIST Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit)
		* --supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
		* --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
		*
		@@ -41,4 +44,8 @@ * @param {string[]} args - Command arguments
		export async function runSuperviseCommand(args) {
		const task = parseFlag(args, "task");
		if (!task) throw new Error("--task is required");
		const taskFile = parseFlag(args, "task-file");
		const taskText = parseFlag(args, "task-text");
		if (taskFile && taskText)
		throw new Error("--task-file and --task-text are mutually exclusive");
		if (!taskFile && !taskText)
		throw new Error("--task-file or --task-text is required");

		@@ -53,7 +60,13 @@ const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");
		const outputPath = parseFlag(args, "output");
		const supervisorProfile = parseFlag(args, "supervisor-profile") ?? undefined;
		const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
		const allowedTools = (
		parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
		).split(",");
		const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");
		const supervisorAllowedTools = supervisorAllowedToolsRaw
		? supervisorAllowedToolsRaw.split(",")
		: undefined;

		const taskContent = readFileSync(task, "utf8");
		const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;

		@@ -80,2 +93,5 @@ // When --output is specified, stream text to stdout while writing NDJSON to file.
		allowedTools,
		supervisorAllowedTools,
		supervisorProfile,
		agentProfile,
		});
		@@ -82,0 +98,0 @@

+108

-31

src/supervisor.js

		/**
		* Supervisor — orchestrates a relay loop between an agent and a supervisor,
		* both running as AgentRunner instances. The agent works on a task while the
		* supervisor observes and decides when the evaluation is complete.
		* both running as AgentRunner instances. The supervisor receives the task first,
		* introduces itself, and delegates work to the agent. The loop then alternates:
		* agent → supervisor → agent.
		*
		@@ -11,14 +12,26 @@ * Follows OO+DI: constructor injection, factory function, tests bypass factory.
		import { createAgentRunner } from "./agent-runner.js";
		import { TraceCollector } from "./trace-collector.js";

		/**
		* Check if the supervisor's response signals evaluation completion.
		* Uses a structured signal — `EVALUATION_COMPLETE` on its own line —
		* to avoid false positives from natural language.
		* Check if the supervisor's response signals evaluation success.
		* Matches EVALUATION_SUCCESSFUL anywhere in the text, tolerating markdown
		* formatting (e.g. **EVALUATION_SUCCESSFUL**). Uses word boundaries to
		* avoid matching inside longer identifiers.
		* @param {string} text
		* @returns {boolean}
		*/
		export function isDone(text) {
		return /^EVALUATION_COMPLETE$/m.test(text);
		export function isSuccessful(text) {
		return /(?:^|[\s*_~`])EVALUATION_SUCCESSFUL(?:[\s*_~`.,!?]|$)/m.test(text);
		}

		/** System prompt appended for the supervisor runner in supervise mode. */
		export const SUPERVISOR_SYSTEM_PROMPT =
		"You supervise another AI agent through a relay — your output becomes the agent's next input. " +
		"Guide the agent, answer its questions, and write EVALUATION_SUCCESSFUL when their task is complete.";

		/** System prompt appended for the agent runner in supervise mode. */
		export const AGENT_SYSTEM_PROMPT =
		"You are being supervised by another AI agent. " +
		"When requirements are ambiguous or you are uncertain, stop and ask a clarifying question before proceeding.";

		export class Supervisor {
		@@ -48,12 +61,14 @@ /**
		* Run the supervisor ↔ agent relay loop.
		* @param {string} task - The initial task for the agent
		* The supervisor receives the task first, introduces itself, and delegates
		* work to the agent. The loop then alternates: agent → supervisor → agent.
		* @param {string} task - The initial task for the supervisor
		* @returns {Promise<{success: boolean, turns: number}>}
		*/
		async run(task) {
		// Turn 0: Agent receives the task and starts working
		this.currentSource = "agent";
		// Turn 0: Supervisor receives the task and introduces it to the agent
		this.currentSource = "supervisor";
		this.currentTurn = 0;
		let agentResult = await this.agentRunner.run(task);
		let supervisorResult = await this.supervisorRunner.run(task);

		if (agentResult.error) {
		if (supervisorResult.error) {
		this.emitSummary({ success: false, turns: 0 });
		@@ -63,18 +78,21 @@ return { success: false, turns: 0 };

		// The supervisor's turn is fully complete (all tool calls executed) by the
		// time we check the signal — no work is interrupted.
		if (isSuccessful(supervisorResult.text)) {
		this.emitSummary({ success: true, turns: 0 });
		return { success: true, turns: 0 };
		}

		for (let turn = 1; turn <= this.maxTurns; turn++) {
		// Supervisor observes the agent's output
		const supervisorPrompt =
		`The agent reported:\n\n${agentResult.text}\n\n` +
		`Decide: provide guidance, answer a question, or say EVALUATION_COMPLETE on its own line.`;

		this.currentSource = "supervisor";
		// Supervisor's output becomes the agent's input
		this.currentSource = "agent";
		this.currentTurn = turn;
		let supervisorResult;
		let agentResult;
		if (turn === 1) {
		supervisorResult = await this.supervisorRunner.run(supervisorPrompt);
		agentResult = await this.agentRunner.run(supervisorResult.text);
		} else {
		supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
		agentResult = await this.agentRunner.resume(supervisorResult.text);
		}

		if (supervisorResult.error) {
		if (agentResult.error) {
		this.emitSummary({ success: false, turns: turn });
		@@ -84,16 +102,24 @@ return { success: false, turns: turn };

		if (isDone(supervisorResult.text)) {
		this.emitSummary({ success: true, turns: turn });
		return { success: true, turns: turn };
		}
		// Build the full agent transcript from buffered NDJSON events so the
		// supervisor sees tool calls and reasoning, not just the SDK result summary.
		const agentTranscript = this.extractTranscript(this.agentRunner);

		// Supervisor's response becomes the agent's next input
		this.currentSource = "agent";
		const supervisorPrompt =
		`The agent reported:\n\n${agentTranscript}\n\n` +
		`Review the agent's work and decide how to proceed.`;

		this.currentSource = "supervisor";
		this.currentTurn = turn;
		agentResult = await this.agentRunner.resume(supervisorResult.text);
		supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);

		if (agentResult.error) {
		if (supervisorResult.error) {
		this.emitSummary({ success: false, turns: turn });
		return { success: false, turns: turn };
		}

		// The supervisor's turn is fully complete — check for success signal.
		if (isSuccessful(supervisorResult.text)) {
		this.emitSummary({ success: true, turns: turn });
		return { success: true, turns: turn };
		}
		}
		@@ -106,2 +132,17 @@
		/**
		* Extract a human-readable transcript from an AgentRunner's buffered output.
		* Drains the buffer and replays events through a TraceCollector.
		* @param {import("./agent-runner.js").AgentRunner} runner
		* @returns {string}
		*/
		extractTranscript(runner) {
		const lines = runner.drainOutput();
		const collector = new TraceCollector();
		for (const line of lines) {
		collector.addLine(line);
		}
		return collector.toText() || "[The agent produced no output.]";
		}

		/**
		* Emit a single NDJSON line tagged with the current source and turn.
		@@ -146,2 +187,6 @@ * Called in real-time via the AgentRunner onLine callback.
		* @param {string[]} [deps.allowedTools] - Tools the agent may use
		* @param {string[]} [deps.supervisorAllowedTools] - Tools the supervisor may use (default: Bash, Read, Glob, Grep, Write, Edit)
		* @param {string[]} [deps.supervisorDisallowedTools] - Tools to explicitly block from the supervisor
		* @param {string} [deps.supervisorProfile] - Supervisor agent profile name
		* @param {string} [deps.agentProfile] - Agent profile name
		* @returns {Supervisor}
		@@ -157,2 +202,6 @@ */
		allowedTools,
		supervisorDisallowedTools,
		supervisorAllowedTools,
		supervisorProfile,
		agentProfile,
		}) {
		@@ -173,4 +222,18 @@ // Forward-reference: onLine captures `supervisor` before construction completes.
		settingSources: ["project"],
		agentProfile,
		systemPrompt: {
		type: "preset",
		preset: "claude_code",
		append: AGENT_SYSTEM_PROMPT,
		},
		});

		// Block Task/TaskOutput so the supervisor cannot spawn its own sub-agents.
		// The relay loop handles agent communication — letting the supervisor use
		// Task would bypass the relay and produce an empty agent trace.
		const defaultDisallowed = ["Task", "TaskOutput"];
		const disallowedTools = supervisorDisallowedTools
		? [...new Set([...defaultDisallowed, ...supervisorDisallowedTools])]
		: defaultDisallowed;

		const supervisorRunner = createAgentRunner({
		@@ -182,5 +245,19 @@ cwd: supervisorCwd,
		maxTurns: 10,
		allowedTools: ["Read", "Glob", "Grep"],
		allowedTools: supervisorAllowedTools ?? [
		"Bash",
		"Read",
		"Glob",
		"Grep",
		"Write",
		"Edit",
		],
		disallowedTools,
		onLine,
		settingSources: ["project"],
		agentProfile: supervisorProfile,
		systemPrompt: {
		type: "preset",
		preset: "claude_code",
		append: SUPERVISOR_SYSTEM_PROMPT,
		},
		});
		@@ -187,0 +264,0 @@

+6

-3

src/tee-writer.js

		@@ -110,3 +110,2 @@ /**
		this.lastSource = parsed.source;
		this.textStream.write(`\n[${parsed.source}]\n`);
		}
		@@ -123,2 +122,6 @@ this.collector.addLine(JSON.stringify(parsed.event));
		const turns = this.collector.turns;
		const prefix =
		this.mode === "supervised" && this.lastSource
		? `[${this.lastSource}] `
		: "";
		while (this.turnsEmitted < turns.length) {
		@@ -129,6 +132,6 @@ const turn = turns[this.turnsEmitted++];
		if (block.type === "text") {
		this.textStream.write(block.text + "\n");
		this.textStream.write(`${prefix}${block.text}\n`);
		} else if (block.type === "tool_use") {
		const input = summarizeInput(block.input);
		this.textStream.write(`> Tool: ${block.name} ${input}\n`);
		this.textStream.write(`${prefix}> Tool: ${block.name} ${input}\n`);
		}
		@@ -135,0 +138,0 @@ }

+200

-49

test/supervisor.test.js

		@@ -9,4 +9,6 @@ import { describe, test } from "node:test";
		createSupervisor,
		SUPERVISOR_SYSTEM_PROMPT,
		AGENT_SYSTEM_PROMPT,
		} from "@forwardimpact/libeval";
		import { isDone } from "../src/supervisor.js";
		import { isSuccessful } from "../src/supervisor.js";

		@@ -65,23 +67,47 @@ /**

		describe("isDone", () => {
		test("detects EVALUATION_COMPLETE on its own line", () => {
		assert.strictEqual(isDone("EVALUATION_COMPLETE"), true);
		describe("isSuccessful", () => {
		test("detects EVALUATION_SUCCESSFUL on its own line", () => {
		assert.strictEqual(isSuccessful("EVALUATION_SUCCESSFUL"), true);
		assert.strictEqual(
		isDone("Some text\nEVALUATION_COMPLETE\nMore text"),
		isSuccessful("Some text\nEVALUATION_SUCCESSFUL\nMore text"),
		true,
		);
		assert.strictEqual(isDone("Done.\n\nEVALUATION_COMPLETE"), true);
		assert.strictEqual(isSuccessful("Done.\n\nEVALUATION_SUCCESSFUL"), true);
		});

		test("does not match EVALUATION_COMPLETE embedded in text", () => {
		assert.strictEqual(isDone("not EVALUATION_COMPLETE yet"), false);
		assert.strictEqual(isDone("The agent is EVALUATION_COMPLETE done"), false);
		assert.strictEqual(isDone("EVALUATION_COMPLETE_EXTRA"), false);
		test("tolerates markdown formatting around the signal", () => {
		assert.strictEqual(isSuccessful("**EVALUATION_SUCCESSFUL**"), true);
		assert.strictEqual(isSuccessful("*EVALUATION_SUCCESSFUL*"), true);
		assert.strictEqual(isSuccessful("__EVALUATION_SUCCESSFUL__"), true);
		assert.strictEqual(isSuccessful("_EVALUATION_SUCCESSFUL_"), true);
		assert.strictEqual(isSuccessful("`EVALUATION_SUCCESSFUL`"), true);
		assert.strictEqual(
		isSuccessful(
		"Good work.\n\n**EVALUATION_SUCCESSFUL**\n\nNow filing issues.",
		),
		true,
		);
		});

		test("matches EVALUATION_SUCCESSFUL anywhere in text", () => {
		assert.strictEqual(isSuccessful("not EVALUATION_SUCCESSFUL yet"), true);
		assert.strictEqual(
		isSuccessful("The agent is EVALUATION_SUCCESSFUL done"),
		true,
		);
		assert.strictEqual(
		isSuccessful("Great work! EVALUATION_SUCCESSFUL. Now filing issues."),
		true,
		);
		});

		test("does not match empty or unrelated text", () => {
		assert.strictEqual(isDone(""), false);
		assert.strictEqual(isDone("All done!"), false);
		assert.strictEqual(isDone("DONE"), false);
		assert.strictEqual(isSuccessful(""), false);
		assert.strictEqual(isSuccessful("All done!"), false);
		assert.strictEqual(isSuccessful("DONE"), false);
		});

		test("does not match old EVALUATION_COMPLETE signal", () => {
		assert.strictEqual(isSuccessful("EVALUATION_COMPLETE"), false);
		});
		});
		@@ -123,3 +149,24 @@

		test("completes on EVALUATION_COMPLETE from supervisor", async () => {
		test("completes on EVALUATION_SUCCESSFUL from supervisor at turn 0", async () => {
		const agentRunner = createMockRunner([]);

		const supervisorRunner = createMockRunner([
		{ text: "EVALUATION_SUCCESSFUL" },
		]);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});

		const result = await supervisor.run("Install stuff");

		assert.strictEqual(result.success, true);
		assert.strictEqual(result.turns, 0);
		});

		test("completes after one agent turn", async () => {
		const agentRunner = createMockRunner([
		@@ -130,3 +177,4 @@ { text: "I installed the packages." },
		const supervisorRunner = createMockRunner([
		{ text: "Good work.\n\nEVALUATION_COMPLETE" },
		{ text: "Welcome! Please install the packages." },
		{ text: "Good work.\n\nEVALUATION_SUCCESSFUL" },
		]);
		@@ -156,5 +204,6 @@
		const supervisorRunner = createMockRunner([
		{ text: "Here is your task. Do the work." },
		{ text: "Keep going, you need to do more." },
		{ text: "Almost there, continue." },
		{ text: "EVALUATION_COMPLETE" },
		{ text: "EVALUATION_SUCCESSFUL" },
		]);
		@@ -177,5 +226,4 @@
		test("enforces maxTurns limit", async () => {
		// Agent responds to every turn, supervisor never says done
		// Supervisor starts, agent responds each turn, supervisor never says done
		const agentRunner = createMockRunner([
		{ text: "Turn 0" },
		{ text: "Turn 1" },
		@@ -186,2 +234,3 @@ { text: "Turn 2" },
		const supervisorRunner = createMockRunner([
		{ text: "Start working." },
		{ text: "Continue." },
		@@ -206,12 +255,13 @@ { text: "Continue." },
		test("output contains tagged lines with correct source and turn", async () => {
		const agentMessages = [[{ type: "assistant", content: "Working" }]];
		const supervisorMessages = [
		[{ type: "assistant", content: "EVALUATION_COMPLETE" }],
		[{ type: "assistant", content: "Go ahead" }],
		[{ type: "assistant", content: "EVALUATION_SUCCESSFUL" }],
		];
		const agentMessages = [[{ type: "assistant", content: "Working" }]];

		const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
		const supervisorRunner = createMockRunner(
		[{ text: "EVALUATION_COMPLETE" }],
		[{ text: "Go ahead" }, { text: "EVALUATION_SUCCESSFUL" }],
		supervisorMessages,
		);
		const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);

		@@ -236,15 +286,15 @@ const output = new PassThrough();

		// Should have: agent turn 0, supervisor turn 1, orchestrator summary
		assert.ok(lines.length >= 3);
		// Should have: supervisor turn 0, agent turn 1, supervisor turn 1, orchestrator summary
		assert.ok(lines.length >= 4);

		const agentLine = JSON.parse(lines[0]);
		const supervisorLine = JSON.parse(lines[0]);
		assert.strictEqual(supervisorLine.source, "supervisor");
		assert.strictEqual(supervisorLine.turn, 0);
		assert.ok("event" in supervisorLine);

		const agentLine = JSON.parse(lines[1]);
		assert.strictEqual(agentLine.source, "agent");
		assert.strictEqual(agentLine.turn, 0);
		assert.strictEqual(agentLine.turn, 1);
		assert.ok("event" in agentLine);

		const supervisorLine = JSON.parse(lines[1]);
		assert.strictEqual(supervisorLine.source, "supervisor");
		assert.strictEqual(supervisorLine.turn, 1);
		assert.ok("event" in supervisorLine);

		const summaryLine = JSON.parse(lines[lines.length - 1]);
		@@ -262,7 +312,10 @@ assert.strictEqual(summaryLine.source, "orchestrator");
		};
		const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
		const supervisorRunner = createMockRunner(
		[{ text: "EVALUATION_COMPLETE" }],
		[[{ type: "assistant", content: "ok" }]],
		[{ text: "Go" }, { text: "EVALUATION_SUCCESSFUL" }],
		[
		[{ type: "assistant", content: "Go" }],
		[{ type: "assistant", content: "ok" }],
		],
		);
		const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);

		@@ -287,3 +340,4 @@ const output = new PassThrough();

		const tagged = JSON.parse(lines[0]);
		// First line is supervisor turn 0, second is agent turn 1
		const tagged = JSON.parse(lines[1]);
		// The original event's `source` field is preserved inside `event`
		@@ -294,17 +348,19 @@ assert.strictEqual(tagged.source, "agent");

		test("emits agent output and summary when agent errors on turn 0", async () => {
		const agentMessages = [[{ type: "assistant", content: "Partial work" }]];
		const agentRunner = createMockRunner(
		[{ text: "Partial work", success: false }],
		agentMessages,
		test("emits supervisor output and summary when supervisor errors on turn 0", async () => {
		const supervisorMessages = [
		[{ type: "assistant", content: "Starting..." }],
		];
		const supervisorRunner = createMockRunner(
		[{ text: "Starting...", success: false }],
		supervisorMessages,
		);

		// Override run to simulate an error return
		const origRun = agentRunner.run;
		agentRunner.run = async (task) => {
		const result = await origRun.call(agentRunner, task);
		const origRun = supervisorRunner.run;
		supervisorRunner.run = async (task) => {
		const result = await origRun.call(supervisorRunner, task);
		return { ...result, error: new Error("Process exited with code 1") };
		};

		const supervisorRunner = createMockRunner([]);
		const agentRunner = createMockRunner([]);

		@@ -326,3 +382,3 @@ const output = new PassThrough();

		// Output should still contain the agent's buffered lines + summary
		// Output should still contain the supervisor's buffered lines + summary
		const data = output.read()?.toString() ?? "";
		@@ -334,7 +390,7 @@ const lines = data

		assert.ok(lines.length >= 2, "Expected at least agent line + summary");
		assert.ok(lines.length >= 2, "Expected at least supervisor line + summary");

		const agentLine = JSON.parse(lines[0]);
		assert.strictEqual(agentLine.source, "agent");
		assert.strictEqual(agentLine.turn, 0);
		const supervisorLine = JSON.parse(lines[0]);
		assert.strictEqual(supervisorLine.source, "supervisor");
		assert.strictEqual(supervisorLine.turn, 0);

		@@ -356,2 +412,97 @@ const summaryLine = JSON.parse(lines[lines.length - 1]);
		});

		test("createSupervisor uses default supervisor tools when none specified", () => {
		const supervisor = createSupervisor({
		supervisorCwd: "/tmp/sup",
		agentCwd: "/tmp/agent",
		query: async function* () {},
		output: new PassThrough(),
		});
		assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
		"Bash",
		"Read",
		"Glob",
		"Grep",
		"Write",
		"Edit",
		]);
		});

		test("createSupervisor passes custom supervisor tools", () => {
		const supervisor = createSupervisor({
		supervisorCwd: "/tmp/sup",
		agentCwd: "/tmp/agent",
		query: async function* () {},
		output: new PassThrough(),
		supervisorAllowedTools: ["Read", "Glob", "Grep"],
		});
		assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
		"Read",
		"Glob",
		"Grep",
		]);
		});

		test("createSupervisor wires system prompts to both runners", () => {
		const supervisor = createSupervisor({
		supervisorCwd: "/tmp/sup",
		agentCwd: "/tmp/agent",
		query: async function* () {},
		output: new PassThrough(),
		});

		assert.deepStrictEqual(supervisor.agentRunner.systemPrompt, {
		type: "preset",
		preset: "claude_code",
		append: AGENT_SYSTEM_PROMPT,
		});
		assert.deepStrictEqual(supervisor.supervisorRunner.systemPrompt, {
		type: "preset",
		preset: "claude_code",
		append: SUPERVISOR_SYSTEM_PROMPT,
		});
		});

		test("createSupervisor blocks Task and TaskOutput on supervisor by default", () => {
		const supervisor = createSupervisor({
		supervisorCwd: "/tmp/sup",
		agentCwd: "/tmp/agent",
		query: async function* () {},
		output: new PassThrough(),
		});
		assert.deepStrictEqual(supervisor.supervisorRunner.disallowedTools, [
		"Task",
		"TaskOutput",
		]);
		// Agent should not have disallowed tools
		assert.deepStrictEqual(supervisor.agentRunner.disallowedTools, []);
		});

		test("createSupervisor merges custom supervisorDisallowedTools with defaults", () => {
		const supervisor = createSupervisor({
		supervisorCwd: "/tmp/sup",
		agentCwd: "/tmp/agent",
		query: async function* () {},
		output: new PassThrough(),
		supervisorDisallowedTools: ["WebSearch", "Task"],
		});
		const disallowed = supervisor.supervisorRunner.disallowedTools;
		assert.ok(disallowed.includes("Task"));
		assert.ok(disallowed.includes("TaskOutput"));
		assert.ok(disallowed.includes("WebSearch"));
		// No duplicates
		assert.strictEqual(disallowed.length, new Set(disallowed).size);
		});

		test("system prompt constants are non-empty strings", () => {
		assert.ok(typeof SUPERVISOR_SYSTEM_PROMPT === "string");
		assert.ok(typeof AGENT_SYSTEM_PROMPT === "string");
		assert.ok(SUPERVISOR_SYSTEM_PROMPT.length > 0);
		assert.ok(AGENT_SYSTEM_PROMPT.length > 0);
		});

		test("SUPERVISOR_SYSTEM_PROMPT explains relay mechanism", () => {
		assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("relay"));
		assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("EVALUATION_SUCCESSFUL"));
		});
		});

+6

-8

test/tee-writer.test.js

		@@ -190,7 +190,5 @@ import { describe, test } from "node:test";

		// Text should show source labels
		assert.ok(textData.includes("[agent]"));
		assert.ok(textData.includes("Working on it"));
		assert.ok(textData.includes("[supervisor]"));
		assert.ok(textData.includes("Looks good"));
		// Text should show source prefixes on content lines
		assert.ok(textData.includes("[agent] Working on it"));
		assert.ok(textData.includes("[supervisor] Looks good"));
		assert.ok(textData.includes("Evaluation completed after 1 turns"));
		@@ -258,5 +256,5 @@ });
		const textData = collect(textStream);
		// [agent] label should appear only once
		const agentLabels = textData.split("[agent]").length - 1;
		assert.strictEqual(agentLabels, 1);
		// [agent] prefix should appear on each content line
		assert.ok(textData.includes("[agent] Step 1"));
		assert.ok(textData.includes("[agent] Step 2"));
		});
		@@ -263,0 +261,0 @@

@forwardimpact/libeval - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Dependency changes