@forwardimpact/libeval - npm Package Compare versions

+142

src/agent-runner.js

		/**
		* AgentRunner — runs a single Claude Agent SDK session and emits raw NDJSON
		* events to an output stream. Building block for both `fit-eval run` and
		* `fit-eval supervise`.
		*
		* Follows OO+DI: constructor injection, factory function, tests bypass factory.
		*/

		export class AgentRunner {
		/**
		* @param {object} deps
		* @param {string} deps.cwd - Agent working directory
		* @param {function} deps.query - SDK query function (injected for testing)
		* @param {import("stream").Writable} deps.output - Stream to emit NDJSON to
		* @param {string} [deps.model] - Claude model identifier
		* @param {number} [deps.maxTurns] - Maximum agentic turns
		* @param {string[]} [deps.allowedTools] - Tools the agent may use
		* @param {string} [deps.permissionMode] - SDK permission mode
		*/
		constructor({
		cwd,
		query,
		output,
		model,
		maxTurns,
		allowedTools,
		permissionMode,
		}) {
		if (!cwd) throw new Error("cwd is required");
		if (!query) throw new Error("query is required");
		if (!output) throw new Error("output is required");
		this.cwd = cwd;
		this.query = query;
		this.output = output;
		this.model = model ?? "opus";
		this.maxTurns = maxTurns ?? 50;
		this.allowedTools = allowedTools ?? [
		"Bash",
		"Read",
		"Glob",
		"Grep",
		"Write",
		"Edit",
		];
		this.permissionMode = permissionMode ?? "bypassPermissions";
		this.sessionId = null;
		this.buffer = [];
		}

		/**
		* Run a new agent session with the given task.
		* @param {string} task - The task prompt
		* @returns {Promise<{success: boolean, text: string, sessionId: string\|null}>}
		*/
		async run(task) {
		let text = "";
		let stopReason = null;
		let error = null;

		try {
		for await (const message of this.query({
		prompt: task,
		options: {
		cwd: this.cwd,
		allowedTools: this.allowedTools,
		maxTurns: this.maxTurns,
		model: this.model,
		permissionMode: this.permissionMode,
		allowDangerouslySkipPermissions: true,
		},
		})) {
		const line = JSON.stringify(message);
		this.output.write(line + "\n");
		this.buffer.push(line);

		if (message.type === "system" && message.subtype === "init") {
		this.sessionId = message.session_id;
		}
		if (message.type === "result") {
		text = message.result ?? "";
		stopReason = message.subtype;
		}
		}
		} catch (err) {
		error = err;
		}

		const success = !error && stopReason === "success";
		return { success, text, sessionId: this.sessionId, error };
		}

		/**
		* Resume an existing session with a follow-up prompt.
		* @param {string} prompt - The follow-up prompt
		* @returns {Promise<{success: boolean, text: string}>}
		*/
		async resume(prompt) {
		let text = "";
		let stopReason = null;
		let error = null;

		try {
		for await (const message of this.query({
		prompt,
		options: { resume: this.sessionId },
		})) {
		const line = JSON.stringify(message);
		this.output.write(line + "\n");
		this.buffer.push(line);

		if (message.type === "result") {
		text = message.result ?? "";
		stopReason = message.subtype;
		}
		}
		} catch (err) {
		error = err;
		}

		const success = !error && stopReason === "success";
		return { success, text, error };
		}

		/**
		* Drain buffered output lines. Used by Supervisor to tag and re-emit lines.
		* @returns {string[]}
		*/
		drainOutput() {
		const lines = [...this.buffer];
		this.buffer = [];
		return lines;
		}
		}

		/**
		* Factory function — wires real dependencies.
		* @param {object} deps - Same as AgentRunner constructor
		* @returns {AgentRunner}
		*/
		export function createAgentRunner(deps) {
		return new AgentRunner(deps);
		}

+75

src/commands/run.js

		import { readFileSync, createWriteStream } from "node:fs";
		import { resolve } from "node:path";
		import { createAgentRunner } from "../agent-runner.js";
		import { createTeeWriter } from "../tee-writer.js";

		/**
		* Parse a --key=value or --key value flag from args.
		* @param {string[]} args
		* @param {string} name - Flag name without --
		* @returns {string\|undefined}
		*/
		function parseFlag(args, name) {
		const prefix = `--${name}=`;
		for (let i = 0; i < args.length; i++) {
		if (args[i].startsWith(prefix)) return args[i].slice(prefix.length);
		if (args[i] === `--${name}` && i + 1 < args.length) return args[i + 1];
		}
		return undefined;
		}

		/**
		* Run command — execute a single agent via the Claude Agent SDK.
		*
		* Usage: fit-eval run [options]
		*
		* Options:
		* --task=PATH Path to task file (required)
		* --cwd=DIR Agent working directory (default: .)
		* --model=MODEL Claude model to use (default: opus)
		* --max-turns=N Maximum agentic turns (default: 50)
		* --output=PATH Write NDJSON trace to file (default: stdout)
		* --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
		*
		* @param {string[]} args - Command arguments
		*/
		export async function runRunCommand(args) {
		const task = parseFlag(args, "task");
		if (!task) throw new Error("--task is required");

		const cwd = resolve(parseFlag(args, "cwd") ?? ".");
		const model = parseFlag(args, "model") ?? "opus";
		const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "50", 10);
		const outputPath = parseFlag(args, "output");
		const allowedTools = (
		parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
		).split(",");

		const taskContent = readFileSync(task, "utf8");

		// When --output is specified, stream text to stdout while writing NDJSON to file.
		// Otherwise, write NDJSON directly to stdout (backwards-compatible).
		const fileStream = outputPath ? createWriteStream(outputPath) : null;
		const output = fileStream
		? createTeeWriter({ fileStream, textStream: process.stdout, mode: "raw" })
		: process.stdout;

		const { query } = await import("@anthropic-ai/claude-agent-sdk");
		const runner = createAgentRunner({
		cwd,
		query,
		output,
		model,
		maxTurns,
		allowedTools,
		});

		const result = await runner.run(taskContent);

		if (fileStream) {
		await new Promise((r) => output.end(r));
		await new Promise((r) => fileStream.end(r));
		}

		process.exit(result.success ? 0 : 1);
		}

+86

src/commands/supervise.js

		import { readFileSync, createWriteStream, mkdtempSync } from "node:fs";
		import { resolve, join } from "node:path";
		import { tmpdir } from "node:os";
		import { createSupervisor } from "../supervisor.js";
		import { createTeeWriter } from "../tee-writer.js";

		/**
		* Parse a --key=value or --key value flag from args.
		* @param {string[]} args
		* @param {string} name - Flag name without --
		* @returns {string\|undefined}
		*/
		function parseFlag(args, name) {
		const prefix = `--${name}=`;
		for (let i = 0; i < args.length; i++) {
		if (args[i].startsWith(prefix)) return args[i].slice(prefix.length);
		if (args[i] === `--${name}` && i + 1 < args.length) return args[i + 1];
		}
		return undefined;
		}

		/**
		* Supervise command — run two agents in a relay loop via the Claude Agent SDK.
		*
		* Usage: fit-eval supervise [options]
		*
		* Options:
		* --task=PATH Path to task file (required)
		* --supervisor-cwd=DIR Supervisor working directory (default: .)
		* --agent-cwd=DIR Agent working directory (default: temp directory)
		* --model=MODEL Claude model to use (default: opus)
		* --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
		* --output=PATH Write NDJSON trace to file (default: stdout)
		* --allowed-tools=LIST Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit)
		*
		* @param {string[]} args - Command arguments
		*/
		export async function runSuperviseCommand(args) {
		const task = parseFlag(args, "task");
		if (!task) throw new Error("--task is required");

		const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");
		const agentCwd = resolve(
		parseFlag(args, "agent-cwd") ??
		mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
		);
		const model = parseFlag(args, "model") ?? "opus";
		const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "20", 10);
		const outputPath = parseFlag(args, "output");
		const allowedTools = (
		parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
		).split(",");

		const taskContent = readFileSync(task, "utf8");

		// When --output is specified, stream text to stdout while writing NDJSON to file.
		// Otherwise, write NDJSON directly to stdout (backwards-compatible).
		const fileStream = outputPath ? createWriteStream(outputPath) : null;
		const output = fileStream
		? createTeeWriter({
		fileStream,
		textStream: process.stdout,
		mode: "supervised",
		})
		: process.stdout;

		const { query } = await import("@anthropic-ai/claude-agent-sdk");
		const supervisor = createSupervisor({
		supervisorCwd,
		agentCwd,
		query,
		output,
		model,
		maxTurns,
		allowedTools,
		});

		const result = await supervisor.run(taskContent);

		if (fileStream) {
		await new Promise((r) => output.end(r));
		await new Promise((r) => fileStream.end(r));
		}

		process.exit(result.success ? 0 : 1);
		}

+165

src/supervisor.js

		/**
		* Supervisor — orchestrates a relay loop between an agent and a supervisor,
		* both running as AgentRunner instances. The agent works on a task while the
		* supervisor observes and decides when the evaluation is complete.
		*
		* Follows OO+DI: constructor injection, factory function, tests bypass factory.
		*/

		import { PassThrough } from "node:stream";
		import { createAgentRunner } from "./agent-runner.js";

		/**
		* Check if the supervisor's response signals evaluation completion.
		* Uses a structured signal — `EVALUATION_COMPLETE` on its own line —
		* to avoid false positives from natural language.
		* @param {string} text
		* @returns {boolean}
		*/
		export function isDone(text) {
		return /^EVALUATION_COMPLETE$/m.test(text);
		}

		export class Supervisor {
		/**
		* @param {object} deps
		* @param {import("./agent-runner.js").AgentRunner} deps.agentRunner - Runs the agent sessions
		* @param {import("./agent-runner.js").AgentRunner} deps.supervisorRunner - Runs the supervisor sessions
		* @param {import("stream").Writable} deps.output - Stream to emit tagged NDJSON to
		* @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges
		*/
		constructor({ agentRunner, supervisorRunner, output, maxTurns }) {
		if (!agentRunner) throw new Error("agentRunner is required");
		if (!supervisorRunner) throw new Error("supervisorRunner is required");
		if (!output) throw new Error("output is required");
		this.agentRunner = agentRunner;
		this.supervisorRunner = supervisorRunner;
		this.output = output;
		this.maxTurns = maxTurns ?? 20;
		}

		/**
		* Run the supervisor ↔ agent relay loop.
		* @param {string} task - The initial task for the agent
		* @returns {Promise<{success: boolean, turns: number}>}
		*/
		async run(task) {
		// Turn 0: Agent receives the task and starts working
		let agentResult = await this.agentRunner.run(task);
		this.emitTagged("agent", 0);

		if (agentResult.error) {
		this.emitSummary({ success: false, turns: 0 });
		return { success: false, turns: 0 };
		}

		for (let turn = 1; turn <= this.maxTurns; turn++) {
		// Supervisor observes the agent's output
		const supervisorPrompt =
		`The agent reported:\n\n${agentResult.text}\n\n` +
		`Decide: provide guidance, answer a question, or say EVALUATION_COMPLETE on its own line.`;

		let supervisorResult;
		if (turn === 1) {
		supervisorResult = await this.supervisorRunner.run(supervisorPrompt);
		} else {
		supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
		}
		this.emitTagged("supervisor", turn);

		if (supervisorResult.error) {
		this.emitSummary({ success: false, turns: turn });
		return { success: false, turns: turn };
		}

		if (isDone(supervisorResult.text)) {
		this.emitSummary({ success: true, turns: turn });
		return { success: true, turns: turn };
		}

		// Supervisor's response becomes the agent's next input
		agentResult = await this.agentRunner.resume(supervisorResult.text);
		this.emitTagged("agent", turn);

		if (agentResult.error) {
		this.emitSummary({ success: false, turns: turn });
		return { success: false, turns: turn };
		}
		}

		this.emitSummary({ success: false, turns: this.maxTurns });
		return { success: false, turns: this.maxTurns };
		}

		/**
		* Drain a runner's buffered output and re-emit each line tagged with
		* source and turn metadata.
		* @param {"agent"\|"supervisor"} source
		* @param {number} turn
		*/
		emitTagged(source, turn) {
		const runner =
		source === "agent" ? this.agentRunner : this.supervisorRunner;
		for (const line of runner.drainOutput()) {
		const event = JSON.parse(line);
		const tagged = { source, turn, event };
		this.output.write(JSON.stringify(tagged) + "\n");
		}
		}

		/**
		* Emit a final orchestrator summary line.
		* @param {{success: boolean, turns: number}} result
		*/
		emitSummary(result) {
		const summary = {
		source: "orchestrator",
		type: "summary",
		success: result.success,
		turns: result.turns,
		};
		this.output.write(JSON.stringify(summary) + "\n");
		}
		}

		/**
		* Factory function — wires both AgentRunners with their respective configs.
		* @param {object} deps
		* @param {string} deps.supervisorCwd - Supervisor working directory
		* @param {string} deps.agentCwd - Agent working directory
		* @param {function} deps.query - SDK query function
		* @param {import("stream").Writable} deps.output - Final output stream
		* @param {string} [deps.model] - Claude model identifier
		* @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges
		* @param {string[]} [deps.allowedTools] - Tools the agent may use
		* @returns {Supervisor}
		*/
		export function createSupervisor({
		supervisorCwd,
		agentCwd,
		query,
		output,
		model,
		maxTurns,
		allowedTools,
		}) {
		const agentRunner = createAgentRunner({
		cwd: agentCwd,
		query,
		output: new PassThrough(),
		model,
		maxTurns: 50,
		allowedTools,
		});

		const supervisorRunner = createAgentRunner({
		cwd: supervisorCwd,
		query,
		output: new PassThrough(),
		model,
		maxTurns: 10,
		allowedTools: ["Read", "Glob", "Grep"],
		});

		return new Supervisor({ agentRunner, supervisorRunner, output, maxTurns });
		}

+157

src/tee-writer.js

		/**
		* TeeWriter — a Writable stream that writes raw NDJSON to a file while
		* simultaneously streaming human-readable text to a separate stream (e.g.
		* process.stdout).
		*
		* Supports two modes:
		* - "raw" (default): expects standard stream-json events from AgentRunner
		* - "supervised": expects tagged events {source, turn, event} from Supervisor
		*
		* Follows OO+DI: constructor injection, factory function, tests bypass factory.
		*/

		import { Writable } from "node:stream";
		import { TraceCollector } from "./trace-collector.js";

		export class TeeWriter extends Writable {
		/**
		* @param {object} deps
		* @param {import("stream").Writable} deps.fileStream - Stream to write raw NDJSON to
		* @param {import("stream").Writable} deps.textStream - Stream to write human-readable text to
		* @param {"raw"\|"supervised"} [deps.mode] - Event format: "raw" or "supervised" (default: "raw")
		*/
		constructor({ fileStream, textStream, mode }) {
		super();
		if (!fileStream) throw new Error("fileStream is required");
		if (!textStream) throw new Error("textStream is required");
		this.fileStream = fileStream;
		this.textStream = textStream;
		this.mode = mode ?? "raw";
		this.collector = new TraceCollector();
		this.turnsEmitted = 0;
		this.lastSource = null;
		this.partial = "";
		}

		/**
		* @param {Buffer\|string} chunk
		* @param {string} encoding
		* @param {function} callback
		*/
		_write(chunk, encoding, callback) {
		const str = this.partial + chunk.toString();
		const lines = str.split("\n");
		this.partial = lines.pop() ?? "";

		for (const line of lines) {
		if (!line.trim()) continue;
		this.fileStream.write(line + "\n");
		this.processLine(line);
		}
		callback();
		}

		/**
		* @param {function} callback
		*/
		_final(callback) {
		if (this.partial.trim()) {
		this.fileStream.write(this.partial + "\n");
		this.processLine(this.partial);
		}

		if (this.mode === "raw" && this.collector.result) {
		const text = this.collector.toText();
		const idx = text.lastIndexOf("\n---");
		if (idx !== -1) {
		this.textStream.write(text.slice(idx) + "\n");
		}
		}

		callback();
		}

		/**
		* Process a single NDJSON line — feed to collector and flush text.
		* @param {string} line
		*/
		processLine(line) {
		if (this.mode === "supervised") {
		this.processSupervisedLine(line);
		} else {
		this.collector.addLine(line);
		this.flushTurns();
		}
		}

		/**
		* Handle a tagged supervisor line: unwrap event, show source labels.
		* @param {string} line
		*/
		processSupervisedLine(line) {
		let parsed;
		try {
		parsed = JSON.parse(line);
		} catch {
		return;
		}

		if (parsed.source === "orchestrator" && parsed.type === "summary") {
		const status = parsed.success ? "completed" : "incomplete";
		this.textStream.write(
		`\n--- Evaluation ${status} after ${parsed.turns} turns ---\n`,
		);
		return;
		}

		if (parsed.event) {
		if (parsed.source && parsed.source !== this.lastSource) {
		this.lastSource = parsed.source;
		this.textStream.write(`\n[${parsed.source}]\n`);
		}
		this.collector.addLine(JSON.stringify(parsed.event));
		this.flushTurns();
		}
		}

		/**
		* Emit text for any new turns accumulated by the collector.
		*/
		flushTurns() {
		const turns = this.collector.turns;
		while (this.turnsEmitted < turns.length) {
		const turn = turns[this.turnsEmitted++];
		if (turn.role === "assistant") {
		for (const block of turn.content) {
		if (block.type === "text") {
		this.textStream.write(block.text + "\n");
		} else if (block.type === "tool_use") {
		const input = summarizeInput(block.input);
		this.textStream.write(`> Tool: ${block.name} ${input}\n`);
		}
		}
		}
		}
		}
		}

		/**
		* Summarize tool input for text display, truncated to keep logs readable.
		* @param {object} input - Tool input object
		* @returns {string} Truncated summary
		*/
		function summarizeInput(input) {
		if (!input \|\| typeof input !== "object") return "";
		const json = JSON.stringify(input);
		if (json.length <= 200) return json;
		return json.slice(0, 197) + "...";
		}

		/**
		* Factory function — wires a TeeWriter with the given streams.
		* @param {object} deps - Same as TeeWriter constructor
		* @returns {TeeWriter}
		*/
		export function createTeeWriter(deps) {
		return new TeeWriter(deps);
		}

+292

test/agent-runner.test.js

		import { describe, test } from "node:test";
		import assert from "node:assert";
		import { PassThrough } from "node:stream";

		import { AgentRunner, createAgentRunner } from "@forwardimpact/libeval";

		/**
		* Create a mock query function that yields canned messages.
		* @param {object[]} messages - Messages to yield
		* @param {function} [captureOptions] - Callback to capture query options
		* @returns {function}
		*/
		function mockQuery(messages, captureOptions) {
		return async function* (params) {
		if (captureOptions) captureOptions(params);
		for (const msg of messages) {
		yield msg;
		}
		};
		}

		/**
		* Collect all NDJSON lines written to a PassThrough stream.
		* @param {PassThrough} stream
		* @returns {string[]}
		*/
		function collectLines(stream) {
		const data = stream.read();
		if (!data) return [];
		return data
		.toString()
		.trim()
		.split("\n")
		.filter((l) => l.length > 0);
		}

		describe("AgentRunner", () => {
		test("constructor throws on missing cwd", () => {
		assert.throws(
		() =>
		new AgentRunner({
		query: async function* () {},
		output: new PassThrough(),
		}),
		/cwd is required/,
		);
		});

		test("constructor throws on missing query", () => {
		assert.throws(
		() => new AgentRunner({ cwd: "/tmp", output: new PassThrough() }),
		/query is required/,
		);
		});

		test("constructor throws on missing output", () => {
		assert.throws(
		() =>
		new AgentRunner({
		cwd: "/tmp",
		query: async function* () {},
		}),
		/output is required/,
		);
		});

		test("constructor uses defaults for optional params", () => {
		const runner = new AgentRunner({
		cwd: "/tmp",
		query: async function* () {},
		output: new PassThrough(),
		});
		assert.strictEqual(runner.model, "opus");
		assert.strictEqual(runner.maxTurns, 50);
		assert.deepStrictEqual(runner.allowedTools, [
		"Bash",
		"Read",
		"Glob",
		"Grep",
		"Write",
		"Edit",
		]);
		assert.strictEqual(runner.permissionMode, "bypassPermissions");
		assert.strictEqual(runner.sessionId, null);
		});

		test("run() writes NDJSON lines to output stream", async () => {
		const messages = [
		{ type: "system", subtype: "init", session_id: "sess-1" },
		{ type: "assistant", content: "Working on it..." },
		{ type: "result", subtype: "success", result: "Done." },
		];

		const output = new PassThrough();
		const runner = new AgentRunner({
		cwd: "/tmp",
		query: mockQuery(messages),
		output,
		});

		const result = await runner.run("Test task");
		const lines = collectLines(output);

		assert.strictEqual(lines.length, 3);
		assert.deepStrictEqual(JSON.parse(lines[0]), messages[0]);
		assert.deepStrictEqual(JSON.parse(lines[1]), messages[1]);
		assert.deepStrictEqual(JSON.parse(lines[2]), messages[2]);
		assert.strictEqual(result.success, true);
		assert.strictEqual(result.text, "Done.");
		assert.strictEqual(result.sessionId, "sess-1");
		});

		test("run() captures sessionId from init event", async () => {
		const messages = [
		{ type: "system", subtype: "init", session_id: "my-session" },
		{ type: "result", subtype: "success", result: "OK" },
		];

		const output = new PassThrough();
		const runner = new AgentRunner({
		cwd: "/tmp",
		query: mockQuery(messages),
		output,
		});

		await runner.run("Task");
		assert.strictEqual(runner.sessionId, "my-session");
		});

		test("run() passes options to query", async () => {
		let captured = null;
		const query = mockQuery(
		[{ type: "result", subtype: "success", result: "OK" }],
		(params) => {
		captured = params;
		},
		);

		const output = new PassThrough();
		const runner = new AgentRunner({
		cwd: "/work",
		query,
		output,
		model: "sonnet",
		maxTurns: 10,
		allowedTools: ["Read", "Grep"],
		permissionMode: "plan",
		});

		await runner.run("My task");

		assert.strictEqual(captured.prompt, "My task");
		assert.strictEqual(captured.options.cwd, "/work");
		assert.strictEqual(captured.options.model, "sonnet");
		assert.strictEqual(captured.options.maxTurns, 10);
		assert.deepStrictEqual(captured.options.allowedTools, ["Read", "Grep"]);
		assert.strictEqual(captured.options.permissionMode, "plan");
		assert.strictEqual(captured.options.allowDangerouslySkipPermissions, true);
		});

		test("run() returns success=false on non-success subtype", async () => {
		const messages = [{ type: "result", subtype: "error", result: "Stopped" }];

		const output = new PassThrough();
		const runner = new AgentRunner({
		cwd: "/tmp",
		query: mockQuery(messages),
		output,
		});

		const result = await runner.run("Task");
		assert.strictEqual(result.success, false);
		assert.strictEqual(result.text, "Stopped");
		});

		test("resume() passes sessionId via options.resume", async () => {
		let resumeCapture = null;

		const initMessages = [
		{ type: "system", subtype: "init", session_id: "sess-42" },
		{ type: "result", subtype: "success", result: "First done" },
		];

		let callCount = 0;
		const query = async function* (params) {
		callCount++;
		if (callCount === 1) {
		for (const m of initMessages) yield m;
		} else {
		resumeCapture = params;
		yield { type: "result", subtype: "success", result: "Resumed" };
		}
		};

		const output = new PassThrough();
		const runner = new AgentRunner({ cwd: "/tmp", query, output });

		await runner.run("Initial task");
		const result = await runner.resume("Follow up");

		assert.strictEqual(resumeCapture.options.resume, "sess-42");
		assert.strictEqual(resumeCapture.prompt, "Follow up");
		assert.strictEqual(result.success, true);
		assert.strictEqual(result.text, "Resumed");
		});

		test("drainOutput() returns buffered lines and clears buffer", async () => {
		const messages = [
		{ type: "assistant", content: "Line 1" },
		{ type: "result", subtype: "success", result: "Line 2" },
		];

		const output = new PassThrough();
		const runner = new AgentRunner({
		cwd: "/tmp",
		query: mockQuery(messages),
		output,
		});

		await runner.run("Task");

		const drained = runner.drainOutput();
		assert.strictEqual(drained.length, 2);
		assert.deepStrictEqual(JSON.parse(drained[0]), messages[0]);
		assert.deepStrictEqual(JSON.parse(drained[1]), messages[1]);

		// Buffer should be empty after drain
		const secondDrain = runner.drainOutput();
		assert.strictEqual(secondDrain.length, 0);
		});

		test("run() captures error when query throws and returns buffered output", async () => {
		async function* failingQuery() {
		yield { type: "system", subtype: "init", session_id: "sess-err" };
		yield { type: "assistant", content: "Partial work" };
		throw new Error("Claude Code process exited with code 1");
		}

		const output = new PassThrough();
		const runner = new AgentRunner({
		cwd: "/tmp",
		query: () => failingQuery(),
		output,
		});

		const result = await runner.run("Task");
		assert.strictEqual(result.success, false);
		assert.ok(result.error);
		assert.match(result.error.message, /exited with code 1/);
		assert.strictEqual(result.sessionId, "sess-err");

		// Buffered output should contain the messages yielded before the error
		const drained = runner.drainOutput();
		assert.strictEqual(drained.length, 2);
		});

		test("resume() captures error when query throws", async () => {
		const initMessages = [
		{ type: "system", subtype: "init", session_id: "sess-r" },
		{ type: "result", subtype: "success", result: "OK" },
		];

		let callCount = 0;
		const query = async function* () {
		callCount++;
		if (callCount === 1) {
		for (const m of initMessages) yield m;
		} else {
		yield { type: "assistant", content: "Resuming..." };
		throw new Error("Process crashed");
		}
		};

		const output = new PassThrough();
		const runner = new AgentRunner({ cwd: "/tmp", query, output });

		await runner.run("Task");
		const result = await runner.resume("Continue");
		assert.strictEqual(result.success, false);
		assert.ok(result.error);
		assert.match(result.error.message, /Process crashed/);
		});

		test("createAgentRunner factory returns an AgentRunner instance", () => {
		const runner = createAgentRunner({
		cwd: "/tmp",
		query: async function* () {},
		output: new PassThrough(),
		});
		assert.ok(runner instanceof AgentRunner);
		});
		});

+333

test/supervisor.test.js

		import { describe, test } from "node:test";
		import assert from "node:assert";
		import { PassThrough } from "node:stream";

		import {
		AgentRunner,
		Supervisor,
		createSupervisor,
		} from "@forwardimpact/libeval";
		import { isDone } from "../src/supervisor.js";

		/**
		* Create a mock AgentRunner that yields pre-scripted responses.
		* Each call to run() or resume() pops the next response from the array.
		* @param {object[]} responses - Array of {text, success} objects
		* @param {object[]} [messages] - Messages to buffer per turn
		* @returns {AgentRunner}
		*/
		function createMockRunner(responses, messages) {
		const output = new PassThrough();
		let callIndex = 0;

		const runner = new AgentRunner({
		cwd: "/tmp",
		query: async function* () {},
		output,
		});

		// Override run and resume to return scripted responses
		runner.run = async (_task) => {
		const resp = responses[callIndex++];
		// Buffer messages for drainOutput
		const msgs = messages?.[callIndex - 1] ?? [
		{ type: "assistant", content: resp.text },
		];
		for (const m of msgs) {
		runner.buffer.push(JSON.stringify(m));
		}
		runner.sessionId = "mock-session";
		return {
		success: resp.success ?? true,
		text: resp.text,
		sessionId: "mock-session",
		};
		};

		runner.resume = async (_prompt) => {
		const resp = responses[callIndex++];
		const msgs = messages?.[callIndex - 1] ?? [
		{ type: "assistant", content: resp.text },
		];
		for (const m of msgs) {
		runner.buffer.push(JSON.stringify(m));
		}
		return { success: resp.success ?? true, text: resp.text };
		};

		return runner;
		}

		describe("isDone", () => {
		test("detects EVALUATION_COMPLETE on its own line", () => {
		assert.strictEqual(isDone("EVALUATION_COMPLETE"), true);
		assert.strictEqual(
		isDone("Some text\nEVALUATION_COMPLETE\nMore text"),
		true,
		);
		assert.strictEqual(isDone("Done.\n\nEVALUATION_COMPLETE"), true);
		});

		test("does not match EVALUATION_COMPLETE embedded in text", () => {
		assert.strictEqual(isDone("not EVALUATION_COMPLETE yet"), false);
		assert.strictEqual(isDone("The agent is EVALUATION_COMPLETE done"), false);
		assert.strictEqual(isDone("EVALUATION_COMPLETE_EXTRA"), false);
		});

		test("does not match empty or unrelated text", () => {
		assert.strictEqual(isDone(""), false);
		assert.strictEqual(isDone("All done!"), false);
		assert.strictEqual(isDone("DONE"), false);
		});
		});

		describe("Supervisor", () => {
		test("constructor throws on missing agentRunner", () => {
		assert.throws(
		() =>
		new Supervisor({
		supervisorRunner: createMockRunner([]),
		output: new PassThrough(),
		}),
		/agentRunner is required/,
		);
		});

		test("constructor throws on missing supervisorRunner", () => {
		assert.throws(
		() =>
		new Supervisor({
		agentRunner: createMockRunner([]),
		output: new PassThrough(),
		}),
		/supervisorRunner is required/,
		);
		});

		test("constructor throws on missing output", () => {
		assert.throws(
		() =>
		new Supervisor({
		agentRunner: createMockRunner([]),
		supervisorRunner: createMockRunner([]),
		}),
		/output is required/,
		);
		});

		test("completes on EVALUATION_COMPLETE from supervisor", async () => {
		const agentRunner = createMockRunner([
		{ text: "I installed the packages." },
		]);

		const supervisorRunner = createMockRunner([
		{ text: "Good work.\n\nEVALUATION_COMPLETE" },
		]);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});

		const result = await supervisor.run("Install stuff");

		assert.strictEqual(result.success, true);
		assert.strictEqual(result.turns, 1);
		});

		test("runs multiple turns before completion", async () => {
		const agentRunner = createMockRunner([
		{ text: "Started working." },
		{ text: "Made progress." },
		{ text: "Finished everything." },
		]);

		const supervisorRunner = createMockRunner([
		{ text: "Keep going, you need to do more." },
		{ text: "Almost there, continue." },
		{ text: "EVALUATION_COMPLETE" },
		]);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});

		const result = await supervisor.run("Do the work");

		assert.strictEqual(result.success, true);
		assert.strictEqual(result.turns, 3);
		});

		test("enforces maxTurns limit", async () => {
		// Agent responds to every turn, supervisor never says done
		const agentRunner = createMockRunner([
		{ text: "Turn 0" },
		{ text: "Turn 1" },
		{ text: "Turn 2" },
		]);

		const supervisorRunner = createMockRunner([
		{ text: "Continue." },
		{ text: "Continue." },
		]);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 2,
		});

		const result = await supervisor.run("Endless task");

		assert.strictEqual(result.success, false);
		assert.strictEqual(result.turns, 2);
		});

		test("output contains tagged lines with correct source and turn", async () => {
		const agentMessages = [[{ type: "assistant", content: "Working" }]];
		const supervisorMessages = [
		[{ type: "assistant", content: "EVALUATION_COMPLETE" }],
		];

		const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
		const supervisorRunner = createMockRunner(
		[{ text: "EVALUATION_COMPLETE" }],
		supervisorMessages,
		);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});

		await supervisor.run("Task");

		const data = output.read()?.toString() ?? "";
		const lines = data
		.trim()
		.split("\n")
		.filter((l) => l.length > 0);

		// Should have: agent turn 0, supervisor turn 1, orchestrator summary
		assert.ok(lines.length >= 3);

		const agentLine = JSON.parse(lines[0]);
		assert.strictEqual(agentLine.source, "agent");
		assert.strictEqual(agentLine.turn, 0);
		assert.ok("event" in agentLine);

		const supervisorLine = JSON.parse(lines[1]);
		assert.strictEqual(supervisorLine.source, "supervisor");
		assert.strictEqual(supervisorLine.turn, 1);
		assert.ok("event" in supervisorLine);

		const summaryLine = JSON.parse(lines[lines.length - 1]);
		assert.strictEqual(summaryLine.source, "orchestrator");
		assert.strictEqual(summaryLine.type, "summary");
		assert.strictEqual(summaryLine.success, true);
		});

		test("events are nested under event key (no field collisions)", async () => {
		const sourceEvent = {
		type: "assistant",
		source: "sdk-internal",
		content: "test",
		};
		const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
		const supervisorRunner = createMockRunner(
		[{ text: "EVALUATION_COMPLETE" }],
		[[{ type: "assistant", content: "ok" }]],
		);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});

		await supervisor.run("Task");

		const data = output.read()?.toString() ?? "";
		const lines = data
		.trim()
		.split("\n")
		.filter((l) => l.length > 0);

		const tagged = JSON.parse(lines[0]);
		// The original event's `source` field is preserved inside `event`
		assert.strictEqual(tagged.source, "agent");
		assert.strictEqual(tagged.event.source, "sdk-internal");
		});

		test("drains agent output and emits summary when agent errors on turn 0", async () => {
		const agentMessages = [[{ type: "assistant", content: "Partial work" }]];
		const agentRunner = createMockRunner(
		[{ text: "Partial work", success: false }],
		agentMessages,
		);

		// Override run to simulate an error return
		const origRun = agentRunner.run;
		agentRunner.run = async (task) => {
		const result = await origRun.call(agentRunner, task);
		return { ...result, error: new Error("Process exited with code 1") };
		};

		const supervisorRunner = createMockRunner([]);

		const output = new PassThrough();
		const supervisor = new Supervisor({
		agentRunner,
		supervisorRunner,
		output,
		maxTurns: 10,
		});

		const result = await supervisor.run("Task");

		assert.strictEqual(result.success, false);
		assert.strictEqual(result.turns, 0);

		// Output should still contain the agent's buffered lines + summary
		const data = output.read()?.toString() ?? "";
		const lines = data
		.trim()
		.split("\n")
		.filter((l) => l.length > 0);

		assert.ok(lines.length >= 2, "Expected at least agent line + summary");

		const agentLine = JSON.parse(lines[0]);
		assert.strictEqual(agentLine.source, "agent");
		assert.strictEqual(agentLine.turn, 0);

		const summaryLine = JSON.parse(lines[lines.length - 1]);
		assert.strictEqual(summaryLine.source, "orchestrator");
		assert.strictEqual(summaryLine.success, false);
		assert.strictEqual(summaryLine.turns, 0);
		});

		test("createSupervisor factory returns a Supervisor instance", () => {
		const supervisor = createSupervisor({
		supervisorCwd: "/tmp/sup",
		agentCwd: "/tmp/agent",
		query: async function* () {},
		output: new PassThrough(),
		});
		assert.ok(supervisor instanceof Supervisor);
		});
		});

+326

test/tee-writer.test.js

		import { describe, test } from "node:test";
		import assert from "node:assert";
		import { PassThrough } from "node:stream";

		import { TeeWriter, createTeeWriter } from "@forwardimpact/libeval";

		/**
		* Collect all data written to a PassThrough stream as a string.
		* @param {PassThrough} stream
		* @returns {string}
		*/
		function collect(stream) {
		const data = stream.read();
		return data ? data.toString() : "";
		}

		/**
		* Write lines to a TeeWriter and wait for it to finish.
		* @param {TeeWriter} writer
		* @param {string[]} lines - JSON lines to write
		*/
		async function writeLines(writer, lines) {
		for (const line of lines) {
		writer.write(line + "\n");
		}
		await new Promise((resolve) => writer.end(resolve));
		}

		describe("TeeWriter", () => {
		test("constructor throws on missing fileStream", () => {
		assert.throws(
		() => new TeeWriter({ textStream: new PassThrough() }),
		/fileStream is required/,
		);
		});

		test("constructor throws on missing textStream", () => {
		assert.throws(
		() => new TeeWriter({ fileStream: new PassThrough() }),
		/textStream is required/,
		);
		});

		test("writes NDJSON to fileStream and text to textStream in raw mode", async () => {
		const fileStream = new PassThrough();
		const textStream = new PassThrough();
		const writer = new TeeWriter({ fileStream, textStream, mode: "raw" });

		const events = [
		JSON.stringify({
		type: "system",
		subtype: "init",
		session_id: "s1",
		model: "opus",
		}),
		JSON.stringify({
		type: "assistant",
		message: {
		content: [{ type: "text", text: "Hello world" }],
		usage: { input_tokens: 10, output_tokens: 5 },
		},
		}),
		JSON.stringify({
		type: "assistant",
		message: {
		content: [
		{
		type: "tool_use",
		id: "t1",
		name: "Bash",
		input: { command: "ls" },
		},
		],
		usage: { input_tokens: 20, output_tokens: 10 },
		},
		}),
		JSON.stringify({
		type: "result",
		subtype: "success",
		duration_ms: 5000,
		num_turns: 2,
		total_cost_usd: 0.05,
		usage: { input_tokens: 30, output_tokens: 15 },
		}),
		];

		await writeLines(writer, events);

		const fileData = collect(fileStream);
		const textData = collect(textStream);

		// File should contain all NDJSON lines
		const fileLines = fileData.trim().split("\n");
		assert.strictEqual(fileLines.length, 4);
		assert.deepStrictEqual(JSON.parse(fileLines[0]).type, "system");
		assert.deepStrictEqual(JSON.parse(fileLines[3]).type, "result");

		// Text should contain human-readable output
		assert.ok(textData.includes("Hello world"));
		assert.ok(textData.includes("> Tool: Bash"));
		assert.ok(textData.includes("--- Result: success"));
		});

		test("streams text incrementally as events arrive", async () => {
		const fileStream = new PassThrough();
		const textStream = new PassThrough();
		const writer = new TeeWriter({ fileStream, textStream, mode: "raw" });

		// Write first assistant message
		writer.write(
		JSON.stringify({
		type: "assistant",
		message: {
		content: [{ type: "text", text: "First message" }],
		usage: { input_tokens: 10, output_tokens: 5 },
		},
		}) + "\n",
		);

		// Text should be available before stream ends
		const firstText = collect(textStream);
		assert.ok(firstText.includes("First message"));

		writer.write(
		JSON.stringify({
		type: "assistant",
		message: {
		content: [{ type: "text", text: "Second message" }],
		usage: { input_tokens: 20, output_tokens: 10 },
		},
		}) + "\n",
		);

		const secondText = collect(textStream);
		assert.ok(secondText.includes("Second message"));

		await new Promise((resolve) => writer.end(resolve));
		});

		test("supervised mode shows source labels and unwraps events", async () => {
		const fileStream = new PassThrough();
		const textStream = new PassThrough();
		const writer = new TeeWriter({
		fileStream,
		textStream,
		mode: "supervised",
		});

		const events = [
		JSON.stringify({
		source: "agent",
		turn: 0,
		event: {
		type: "assistant",
		message: {
		content: [{ type: "text", text: "Working on it" }],
		usage: { input_tokens: 10, output_tokens: 5 },
		},
		},
		}),
		JSON.stringify({
		source: "supervisor",
		turn: 1,
		event: {
		type: "assistant",
		message: {
		content: [{ type: "text", text: "Looks good" }],
		usage: { input_tokens: 20, output_tokens: 10 },
		},
		},
		}),
		JSON.stringify({
		source: "orchestrator",
		type: "summary",
		success: true,
		turns: 1,
		}),
		];

		await writeLines(writer, events);

		const fileData = collect(fileStream);
		const textData = collect(textStream);

		// File should contain all raw tagged NDJSON
		const fileLines = fileData.trim().split("\n");
		assert.strictEqual(fileLines.length, 3);
		assert.strictEqual(JSON.parse(fileLines[0]).source, "agent");

		// Text should show source labels
		assert.ok(textData.includes("[agent]"));
		assert.ok(textData.includes("Working on it"));
		assert.ok(textData.includes("[supervisor]"));
		assert.ok(textData.includes("Looks good"));
		assert.ok(textData.includes("Evaluation completed after 1 turns"));
		});

		test("supervised mode shows incomplete status on failure", async () => {
		const fileStream = new PassThrough();
		const textStream = new PassThrough();
		const writer = new TeeWriter({
		fileStream,
		textStream,
		mode: "supervised",
		});

		await writeLines(writer, [
		JSON.stringify({
		source: "orchestrator",
		type: "summary",
		success: false,
		turns: 5,
		}),
		]);

		const textData = collect(textStream);
		assert.ok(textData.includes("Evaluation incomplete after 5 turns"));
		});

		test("supervised mode only shows source label on change", async () => {
		const fileStream = new PassThrough();
		const textStream = new PassThrough();
		const writer = new TeeWriter({
		fileStream,
		textStream,
		mode: "supervised",
		});

		const events = [
		JSON.stringify({
		source: "agent",
		turn: 0,
		event: {
		type: "assistant",
		message: {
		content: [{ type: "text", text: "Step 1" }],
		usage: { input_tokens: 10, output_tokens: 5 },
		},
		},
		}),
		JSON.stringify({
		source: "agent",
		turn: 0,
		event: {
		type: "assistant",
		message: {
		content: [{ type: "text", text: "Step 2" }],
		usage: { input_tokens: 10, output_tokens: 5 },
		},
		},
		}),
		];

		await writeLines(writer, events);

		const textData = collect(textStream);
		// [agent] label should appear only once
		const agentLabels = textData.split("[agent]").length - 1;
		assert.strictEqual(agentLabels, 1);
		});

		test("handles partial lines across chunks", async () => {
		const fileStream = new PassThrough();
		const textStream = new PassThrough();
		const writer = new TeeWriter({ fileStream, textStream, mode: "raw" });

		const fullLine = JSON.stringify({
		type: "assistant",
		message: {
		content: [{ type: "text", text: "Split message" }],
		usage: { input_tokens: 10, output_tokens: 5 },
		},
		});

		// Split the line across two chunks
		const mid = Math.floor(fullLine.length / 2);
		writer.write(fullLine.slice(0, mid));
		writer.write(fullLine.slice(mid) + "\n");
		await new Promise((resolve) => writer.end(resolve));

		const textData = collect(textStream);
		assert.ok(textData.includes("Split message"));
		});

		test("truncates long tool input", async () => {
		const fileStream = new PassThrough();
		const textStream = new PassThrough();
		const writer = new TeeWriter({ fileStream, textStream, mode: "raw" });

		const longInput = { command: "x".repeat(300) };
		const event = JSON.stringify({
		type: "assistant",
		message: {
		content: [
		{ type: "tool_use", id: "t1", name: "Bash", input: longInput },
		],
		usage: { input_tokens: 10, output_tokens: 5 },
		},
		});

		await writeLines(writer, [event]);

		const textData = collect(textStream);
		assert.ok(textData.includes("> Tool: Bash"));
		assert.ok(textData.includes("..."));
		// Truncated to ~200 chars
		const toolLine = textData.split("\n").find((l) => l.startsWith("> Tool:"));
		assert.ok(toolLine.length < 250);
		});

		test("defaults to raw mode", () => {
		const writer = new TeeWriter({
		fileStream: new PassThrough(),
		textStream: new PassThrough(),
		});
		assert.strictEqual(writer.mode, "raw");
		});

		test("createTeeWriter factory returns a TeeWriter instance", () => {
		const writer = createTeeWriter({
		fileStream: new PassThrough(),
		textStream: new PassThrough(),
		});
		assert.ok(writer instanceof TeeWriter);
		});
		});

+26

-1

bin/fit-eval.js

		@@ -1,5 +0,7 @@
		#!/usr/bin/env node
		#!/usr/bin/env bun

		import { runOutputCommand } from "../src/commands/output.js";
		import { runTeeCommand } from "../src/commands/tee.js";
		import { runRunCommand } from "../src/commands/run.js";
		import { runSuperviseCommand } from "../src/commands/supervise.js";

		@@ -9,2 +11,4 @@ const COMMANDS = {
		tee: runTeeCommand,
		run: runRunCommand,
		supervise: runSuperviseCommand,
		};
		@@ -21,3 +25,22 @@
		tee [output.ndjson] Stream text to stdout, optionally save raw NDJSON
		run [options] Run a single agent via the Claude Agent SDK
		supervise [options] Run a supervised agent ↔ supervisor relay loop

		Run options:
		--task=PATH Path to task file (required)
		--cwd=DIR Agent working directory (default: .)
		--model=MODEL Claude model to use (default: opus)
		--max-turns=N Maximum agentic turns (default: 50)
		--output=PATH Write NDJSON trace to file (default: stdout)
		--allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)

		Supervise options:
		--task=PATH Path to task file (required)
		--supervisor-cwd=DIR Supervisor working directory (default: .)
		--agent-cwd=DIR Agent working directory (default: temp directory)
		--model=MODEL Claude model to use (default: opus)
		--max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
		--output=PATH Write NDJSON trace to file (default: stdout)
		--allowed-tools=LIST Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)

		Options:
		@@ -32,2 +55,4 @@ --help Show this help message
		fit-eval tee output.ndjson < trace.ndjson
		fit-eval run --task=.github/tasks/security-audit.md --model=opus
		fit-eval supervise --task=scenarios/guide-setup/task.md --supervisor-cwd=.
		`.trim();
		@@ -34,0 +59,0 @@

+3

-0

index.js

		export { TraceCollector, createTraceCollector } from "./src/trace-collector.js";
		export { AgentRunner, createAgentRunner } from "./src/agent-runner.js";
		export { Supervisor, createSupervisor } from "./src/supervisor.js";
		export { TeeWriter, createTeeWriter } from "./src/tee-writer.js";

+6

-3

package.json

		{
		"name": "@forwardimpact/libeval",
		"version": "0.1.0",
		"version": "0.1.1",
		"description": "Process Claude Code stream-json output into structured traces",
		@@ -13,7 +13,10 @@ "license": "Apache-2.0",
		"engines": {
		"node": ">=22.0.0"
		"bun": ">=1.2.0"
		},
		"scripts": {
		"test": "node --test test/*.test.js"
		"test": "bun run node --test test/*.test.js"
		},
		"dependencies": {
		"@anthropic-ai/claude-agent-sdk": "^0.1.0"
		},
		"publishConfig": {
		@@ -20,0 +23,0 @@ "access": "public"

@forwardimpact/libeval - npm Package Compare versions

New alerts

Improved metrics

Worsened metrics

Dependency changes