Big News: Socket raises $60M Series C at a $1B valuation to secure software supply chains for AI-driven development.Announcement
Sign In

@forwardimpact/libeval

Package Overview
Dependencies
Maintainers
1
Versions
54
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@forwardimpact/libeval - npm Package Compare versions

Comparing version
0.1.0
to
0.1.1
+142
src/agent-runner.js
/**
* AgentRunner — runs a single Claude Agent SDK session and emits raw NDJSON
* events to an output stream. Building block for both `fit-eval run` and
* `fit-eval supervise`.
*
* Follows OO+DI: constructor injection, factory function, tests bypass factory.
*/
export class AgentRunner {
/**
* @param {object} deps
* @param {string} deps.cwd - Agent working directory
* @param {function} deps.query - SDK query function (injected for testing)
* @param {import("stream").Writable} deps.output - Stream to emit NDJSON to
* @param {string} [deps.model] - Claude model identifier
* @param {number} [deps.maxTurns] - Maximum agentic turns
* @param {string[]} [deps.allowedTools] - Tools the agent may use
* @param {string} [deps.permissionMode] - SDK permission mode
*/
constructor({
cwd,
query,
output,
model,
maxTurns,
allowedTools,
permissionMode,
}) {
if (!cwd) throw new Error("cwd is required");
if (!query) throw new Error("query is required");
if (!output) throw new Error("output is required");
this.cwd = cwd;
this.query = query;
this.output = output;
this.model = model ?? "opus";
this.maxTurns = maxTurns ?? 50;
this.allowedTools = allowedTools ?? [
"Bash",
"Read",
"Glob",
"Grep",
"Write",
"Edit",
];
this.permissionMode = permissionMode ?? "bypassPermissions";
this.sessionId = null;
this.buffer = [];
}
/**
* Run a new agent session with the given task.
* @param {string} task - The task prompt
* @returns {Promise<{success: boolean, text: string, sessionId: string|null}>}
*/
async run(task) {
let text = "";
let stopReason = null;
let error = null;
try {
for await (const message of this.query({
prompt: task,
options: {
cwd: this.cwd,
allowedTools: this.allowedTools,
maxTurns: this.maxTurns,
model: this.model,
permissionMode: this.permissionMode,
allowDangerouslySkipPermissions: true,
},
})) {
const line = JSON.stringify(message);
this.output.write(line + "\n");
this.buffer.push(line);
if (message.type === "system" && message.subtype === "init") {
this.sessionId = message.session_id;
}
if (message.type === "result") {
text = message.result ?? "";
stopReason = message.subtype;
}
}
} catch (err) {
error = err;
}
const success = !error && stopReason === "success";
return { success, text, sessionId: this.sessionId, error };
}
/**
* Resume an existing session with a follow-up prompt.
* @param {string} prompt - The follow-up prompt
* @returns {Promise<{success: boolean, text: string}>}
*/
async resume(prompt) {
let text = "";
let stopReason = null;
let error = null;
try {
for await (const message of this.query({
prompt,
options: { resume: this.sessionId },
})) {
const line = JSON.stringify(message);
this.output.write(line + "\n");
this.buffer.push(line);
if (message.type === "result") {
text = message.result ?? "";
stopReason = message.subtype;
}
}
} catch (err) {
error = err;
}
const success = !error && stopReason === "success";
return { success, text, error };
}
/**
* Drain buffered output lines. Used by Supervisor to tag and re-emit lines.
* @returns {string[]}
*/
drainOutput() {
const lines = [...this.buffer];
this.buffer = [];
return lines;
}
}
/**
* Factory function — wires real dependencies.
* @param {object} deps - Same as AgentRunner constructor
* @returns {AgentRunner}
*/
export function createAgentRunner(deps) {
return new AgentRunner(deps);
}
import { readFileSync, createWriteStream } from "node:fs";
import { resolve } from "node:path";
import { createAgentRunner } from "../agent-runner.js";
import { createTeeWriter } from "../tee-writer.js";
/**
* Parse a --key=value or --key value flag from args.
* @param {string[]} args
* @param {string} name - Flag name without --
* @returns {string|undefined}
*/
function parseFlag(args, name) {
const prefix = `--${name}=`;
for (let i = 0; i < args.length; i++) {
if (args[i].startsWith(prefix)) return args[i].slice(prefix.length);
if (args[i] === `--${name}` && i + 1 < args.length) return args[i + 1];
}
return undefined;
}
/**
* Run command — execute a single agent via the Claude Agent SDK.
*
* Usage: fit-eval run [options]
*
* Options:
* --task=PATH Path to task file (required)
* --cwd=DIR Agent working directory (default: .)
* --model=MODEL Claude model to use (default: opus)
* --max-turns=N Maximum agentic turns (default: 50)
* --output=PATH Write NDJSON trace to file (default: stdout)
* --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
*
* @param {string[]} args - Command arguments
*/
export async function runRunCommand(args) {
const task = parseFlag(args, "task");
if (!task) throw new Error("--task is required");
const cwd = resolve(parseFlag(args, "cwd") ?? ".");
const model = parseFlag(args, "model") ?? "opus";
const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "50", 10);
const outputPath = parseFlag(args, "output");
const allowedTools = (
parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
).split(",");
const taskContent = readFileSync(task, "utf8");
// When --output is specified, stream text to stdout while writing NDJSON to file.
// Otherwise, write NDJSON directly to stdout (backwards-compatible).
const fileStream = outputPath ? createWriteStream(outputPath) : null;
const output = fileStream
? createTeeWriter({ fileStream, textStream: process.stdout, mode: "raw" })
: process.stdout;
const { query } = await import("@anthropic-ai/claude-agent-sdk");
const runner = createAgentRunner({
cwd,
query,
output,
model,
maxTurns,
allowedTools,
});
const result = await runner.run(taskContent);
if (fileStream) {
await new Promise((r) => output.end(r));
await new Promise((r) => fileStream.end(r));
}
process.exit(result.success ? 0 : 1);
}
import { readFileSync, createWriteStream, mkdtempSync } from "node:fs";
import { resolve, join } from "node:path";
import { tmpdir } from "node:os";
import { createSupervisor } from "../supervisor.js";
import { createTeeWriter } from "../tee-writer.js";
/**
* Parse a --key=value or --key value flag from args.
* @param {string[]} args
* @param {string} name - Flag name without --
* @returns {string|undefined}
*/
function parseFlag(args, name) {
const prefix = `--${name}=`;
for (let i = 0; i < args.length; i++) {
if (args[i].startsWith(prefix)) return args[i].slice(prefix.length);
if (args[i] === `--${name}` && i + 1 < args.length) return args[i + 1];
}
return undefined;
}
/**
* Supervise command — run two agents in a relay loop via the Claude Agent SDK.
*
* Usage: fit-eval supervise [options]
*
* Options:
* --task=PATH Path to task file (required)
* --supervisor-cwd=DIR Supervisor working directory (default: .)
* --agent-cwd=DIR Agent working directory (default: temp directory)
* --model=MODEL Claude model to use (default: opus)
* --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
* --output=PATH Write NDJSON trace to file (default: stdout)
* --allowed-tools=LIST Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit)
*
* @param {string[]} args - Command arguments
*/
export async function runSuperviseCommand(args) {
const task = parseFlag(args, "task");
if (!task) throw new Error("--task is required");
const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");
const agentCwd = resolve(
parseFlag(args, "agent-cwd") ??
mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
);
const model = parseFlag(args, "model") ?? "opus";
const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "20", 10);
const outputPath = parseFlag(args, "output");
const allowedTools = (
parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
).split(",");
const taskContent = readFileSync(task, "utf8");
// When --output is specified, stream text to stdout while writing NDJSON to file.
// Otherwise, write NDJSON directly to stdout (backwards-compatible).
const fileStream = outputPath ? createWriteStream(outputPath) : null;
const output = fileStream
? createTeeWriter({
fileStream,
textStream: process.stdout,
mode: "supervised",
})
: process.stdout;
const { query } = await import("@anthropic-ai/claude-agent-sdk");
const supervisor = createSupervisor({
supervisorCwd,
agentCwd,
query,
output,
model,
maxTurns,
allowedTools,
});
const result = await supervisor.run(taskContent);
if (fileStream) {
await new Promise((r) => output.end(r));
await new Promise((r) => fileStream.end(r));
}
process.exit(result.success ? 0 : 1);
}
/**
* Supervisor — orchestrates a relay loop between an agent and a supervisor,
* both running as AgentRunner instances. The agent works on a task while the
* supervisor observes and decides when the evaluation is complete.
*
* Follows OO+DI: constructor injection, factory function, tests bypass factory.
*/
import { PassThrough } from "node:stream";
import { createAgentRunner } from "./agent-runner.js";
/**
* Check if the supervisor's response signals evaluation completion.
* Uses a structured signal — `EVALUATION_COMPLETE` on its own line —
* to avoid false positives from natural language.
* @param {string} text
* @returns {boolean}
*/
export function isDone(text) {
return /^EVALUATION_COMPLETE$/m.test(text);
}
export class Supervisor {
/**
* @param {object} deps
* @param {import("./agent-runner.js").AgentRunner} deps.agentRunner - Runs the agent sessions
* @param {import("./agent-runner.js").AgentRunner} deps.supervisorRunner - Runs the supervisor sessions
* @param {import("stream").Writable} deps.output - Stream to emit tagged NDJSON to
* @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges
*/
constructor({ agentRunner, supervisorRunner, output, maxTurns }) {
if (!agentRunner) throw new Error("agentRunner is required");
if (!supervisorRunner) throw new Error("supervisorRunner is required");
if (!output) throw new Error("output is required");
this.agentRunner = agentRunner;
this.supervisorRunner = supervisorRunner;
this.output = output;
this.maxTurns = maxTurns ?? 20;
}
/**
* Run the supervisor ↔ agent relay loop.
* @param {string} task - The initial task for the agent
* @returns {Promise<{success: boolean, turns: number}>}
*/
async run(task) {
// Turn 0: Agent receives the task and starts working
let agentResult = await this.agentRunner.run(task);
this.emitTagged("agent", 0);
if (agentResult.error) {
this.emitSummary({ success: false, turns: 0 });
return { success: false, turns: 0 };
}
for (let turn = 1; turn <= this.maxTurns; turn++) {
// Supervisor observes the agent's output
const supervisorPrompt =
`The agent reported:\n\n${agentResult.text}\n\n` +
`Decide: provide guidance, answer a question, or say EVALUATION_COMPLETE on its own line.`;
let supervisorResult;
if (turn === 1) {
supervisorResult = await this.supervisorRunner.run(supervisorPrompt);
} else {
supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
}
this.emitTagged("supervisor", turn);
if (supervisorResult.error) {
this.emitSummary({ success: false, turns: turn });
return { success: false, turns: turn };
}
if (isDone(supervisorResult.text)) {
this.emitSummary({ success: true, turns: turn });
return { success: true, turns: turn };
}
// Supervisor's response becomes the agent's next input
agentResult = await this.agentRunner.resume(supervisorResult.text);
this.emitTagged("agent", turn);
if (agentResult.error) {
this.emitSummary({ success: false, turns: turn });
return { success: false, turns: turn };
}
}
this.emitSummary({ success: false, turns: this.maxTurns });
return { success: false, turns: this.maxTurns };
}
/**
* Drain a runner's buffered output and re-emit each line tagged with
* source and turn metadata.
* @param {"agent"|"supervisor"} source
* @param {number} turn
*/
emitTagged(source, turn) {
const runner =
source === "agent" ? this.agentRunner : this.supervisorRunner;
for (const line of runner.drainOutput()) {
const event = JSON.parse(line);
const tagged = { source, turn, event };
this.output.write(JSON.stringify(tagged) + "\n");
}
}
/**
* Emit a final orchestrator summary line.
* @param {{success: boolean, turns: number}} result
*/
emitSummary(result) {
const summary = {
source: "orchestrator",
type: "summary",
success: result.success,
turns: result.turns,
};
this.output.write(JSON.stringify(summary) + "\n");
}
}
/**
* Factory function — wires both AgentRunners with their respective configs.
* @param {object} deps
* @param {string} deps.supervisorCwd - Supervisor working directory
* @param {string} deps.agentCwd - Agent working directory
* @param {function} deps.query - SDK query function
* @param {import("stream").Writable} deps.output - Final output stream
* @param {string} [deps.model] - Claude model identifier
* @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges
* @param {string[]} [deps.allowedTools] - Tools the agent may use
* @returns {Supervisor}
*/
export function createSupervisor({
supervisorCwd,
agentCwd,
query,
output,
model,
maxTurns,
allowedTools,
}) {
const agentRunner = createAgentRunner({
cwd: agentCwd,
query,
output: new PassThrough(),
model,
maxTurns: 50,
allowedTools,
});
const supervisorRunner = createAgentRunner({
cwd: supervisorCwd,
query,
output: new PassThrough(),
model,
maxTurns: 10,
allowedTools: ["Read", "Glob", "Grep"],
});
return new Supervisor({ agentRunner, supervisorRunner, output, maxTurns });
}
/**
* TeeWriter — a Writable stream that writes raw NDJSON to a file while
* simultaneously streaming human-readable text to a separate stream (e.g.
* process.stdout).
*
* Supports two modes:
* - "raw" (default): expects standard stream-json events from AgentRunner
* - "supervised": expects tagged events {source, turn, event} from Supervisor
*
* Follows OO+DI: constructor injection, factory function, tests bypass factory.
*/
import { Writable } from "node:stream";
import { TraceCollector } from "./trace-collector.js";
export class TeeWriter extends Writable {
/**
* @param {object} deps
* @param {import("stream").Writable} deps.fileStream - Stream to write raw NDJSON to
* @param {import("stream").Writable} deps.textStream - Stream to write human-readable text to
* @param {"raw"|"supervised"} [deps.mode] - Event format: "raw" or "supervised" (default: "raw")
*/
constructor({ fileStream, textStream, mode }) {
super();
if (!fileStream) throw new Error("fileStream is required");
if (!textStream) throw new Error("textStream is required");
this.fileStream = fileStream;
this.textStream = textStream;
this.mode = mode ?? "raw";
this.collector = new TraceCollector();
this.turnsEmitted = 0;
this.lastSource = null;
this.partial = "";
}
/**
* @param {Buffer|string} chunk
* @param {string} encoding
* @param {function} callback
*/
_write(chunk, encoding, callback) {
const str = this.partial + chunk.toString();
const lines = str.split("\n");
this.partial = lines.pop() ?? "";
for (const line of lines) {
if (!line.trim()) continue;
this.fileStream.write(line + "\n");
this.processLine(line);
}
callback();
}
/**
* @param {function} callback
*/
_final(callback) {
if (this.partial.trim()) {
this.fileStream.write(this.partial + "\n");
this.processLine(this.partial);
}
if (this.mode === "raw" && this.collector.result) {
const text = this.collector.toText();
const idx = text.lastIndexOf("\n---");
if (idx !== -1) {
this.textStream.write(text.slice(idx) + "\n");
}
}
callback();
}
/**
* Process a single NDJSON line — feed to collector and flush text.
* @param {string} line
*/
processLine(line) {
if (this.mode === "supervised") {
this.processSupervisedLine(line);
} else {
this.collector.addLine(line);
this.flushTurns();
}
}
/**
* Handle a tagged supervisor line: unwrap event, show source labels.
* @param {string} line
*/
processSupervisedLine(line) {
let parsed;
try {
parsed = JSON.parse(line);
} catch {
return;
}
if (parsed.source === "orchestrator" && parsed.type === "summary") {
const status = parsed.success ? "completed" : "incomplete";
this.textStream.write(
`\n--- Evaluation ${status} after ${parsed.turns} turns ---\n`,
);
return;
}
if (parsed.event) {
if (parsed.source && parsed.source !== this.lastSource) {
this.lastSource = parsed.source;
this.textStream.write(`\n[${parsed.source}]\n`);
}
this.collector.addLine(JSON.stringify(parsed.event));
this.flushTurns();
}
}
/**
* Emit text for any new turns accumulated by the collector.
*/
flushTurns() {
const turns = this.collector.turns;
while (this.turnsEmitted < turns.length) {
const turn = turns[this.turnsEmitted++];
if (turn.role === "assistant") {
for (const block of turn.content) {
if (block.type === "text") {
this.textStream.write(block.text + "\n");
} else if (block.type === "tool_use") {
const input = summarizeInput(block.input);
this.textStream.write(`> Tool: ${block.name} ${input}\n`);
}
}
}
}
}
}
/**
* Summarize tool input for text display, truncated to keep logs readable.
* @param {object} input - Tool input object
* @returns {string} Truncated summary
*/
function summarizeInput(input) {
if (!input || typeof input !== "object") return "";
const json = JSON.stringify(input);
if (json.length <= 200) return json;
return json.slice(0, 197) + "...";
}
/**
* Factory function — wires a TeeWriter with the given streams.
* @param {object} deps - Same as TeeWriter constructor
* @returns {TeeWriter}
*/
export function createTeeWriter(deps) {
return new TeeWriter(deps);
}
import { describe, test } from "node:test";
import assert from "node:assert";
import { PassThrough } from "node:stream";
import { AgentRunner, createAgentRunner } from "@forwardimpact/libeval";
/**
* Create a mock query function that yields canned messages.
* @param {object[]} messages - Messages to yield
* @param {function} [captureOptions] - Callback to capture query options
* @returns {function}
*/
function mockQuery(messages, captureOptions) {
return async function* (params) {
if (captureOptions) captureOptions(params);
for (const msg of messages) {
yield msg;
}
};
}
/**
* Collect all NDJSON lines written to a PassThrough stream.
* @param {PassThrough} stream
* @returns {string[]}
*/
function collectLines(stream) {
const data = stream.read();
if (!data) return [];
return data
.toString()
.trim()
.split("\n")
.filter((l) => l.length > 0);
}
describe("AgentRunner", () => {
test("constructor throws on missing cwd", () => {
assert.throws(
() =>
new AgentRunner({
query: async function* () {},
output: new PassThrough(),
}),
/cwd is required/,
);
});
test("constructor throws on missing query", () => {
assert.throws(
() => new AgentRunner({ cwd: "/tmp", output: new PassThrough() }),
/query is required/,
);
});
test("constructor throws on missing output", () => {
assert.throws(
() =>
new AgentRunner({
cwd: "/tmp",
query: async function* () {},
}),
/output is required/,
);
});
test("constructor uses defaults for optional params", () => {
const runner = new AgentRunner({
cwd: "/tmp",
query: async function* () {},
output: new PassThrough(),
});
assert.strictEqual(runner.model, "opus");
assert.strictEqual(runner.maxTurns, 50);
assert.deepStrictEqual(runner.allowedTools, [
"Bash",
"Read",
"Glob",
"Grep",
"Write",
"Edit",
]);
assert.strictEqual(runner.permissionMode, "bypassPermissions");
assert.strictEqual(runner.sessionId, null);
});
test("run() writes NDJSON lines to output stream", async () => {
const messages = [
{ type: "system", subtype: "init", session_id: "sess-1" },
{ type: "assistant", content: "Working on it..." },
{ type: "result", subtype: "success", result: "Done." },
];
const output = new PassThrough();
const runner = new AgentRunner({
cwd: "/tmp",
query: mockQuery(messages),
output,
});
const result = await runner.run("Test task");
const lines = collectLines(output);
assert.strictEqual(lines.length, 3);
assert.deepStrictEqual(JSON.parse(lines[0]), messages[0]);
assert.deepStrictEqual(JSON.parse(lines[1]), messages[1]);
assert.deepStrictEqual(JSON.parse(lines[2]), messages[2]);
assert.strictEqual(result.success, true);
assert.strictEqual(result.text, "Done.");
assert.strictEqual(result.sessionId, "sess-1");
});
test("run() captures sessionId from init event", async () => {
const messages = [
{ type: "system", subtype: "init", session_id: "my-session" },
{ type: "result", subtype: "success", result: "OK" },
];
const output = new PassThrough();
const runner = new AgentRunner({
cwd: "/tmp",
query: mockQuery(messages),
output,
});
await runner.run("Task");
assert.strictEqual(runner.sessionId, "my-session");
});
test("run() passes options to query", async () => {
let captured = null;
const query = mockQuery(
[{ type: "result", subtype: "success", result: "OK" }],
(params) => {
captured = params;
},
);
const output = new PassThrough();
const runner = new AgentRunner({
cwd: "/work",
query,
output,
model: "sonnet",
maxTurns: 10,
allowedTools: ["Read", "Grep"],
permissionMode: "plan",
});
await runner.run("My task");
assert.strictEqual(captured.prompt, "My task");
assert.strictEqual(captured.options.cwd, "/work");
assert.strictEqual(captured.options.model, "sonnet");
assert.strictEqual(captured.options.maxTurns, 10);
assert.deepStrictEqual(captured.options.allowedTools, ["Read", "Grep"]);
assert.strictEqual(captured.options.permissionMode, "plan");
assert.strictEqual(captured.options.allowDangerouslySkipPermissions, true);
});
test("run() returns success=false on non-success subtype", async () => {
const messages = [{ type: "result", subtype: "error", result: "Stopped" }];
const output = new PassThrough();
const runner = new AgentRunner({
cwd: "/tmp",
query: mockQuery(messages),
output,
});
const result = await runner.run("Task");
assert.strictEqual(result.success, false);
assert.strictEqual(result.text, "Stopped");
});
test("resume() passes sessionId via options.resume", async () => {
let resumeCapture = null;
const initMessages = [
{ type: "system", subtype: "init", session_id: "sess-42" },
{ type: "result", subtype: "success", result: "First done" },
];
let callCount = 0;
const query = async function* (params) {
callCount++;
if (callCount === 1) {
for (const m of initMessages) yield m;
} else {
resumeCapture = params;
yield { type: "result", subtype: "success", result: "Resumed" };
}
};
const output = new PassThrough();
const runner = new AgentRunner({ cwd: "/tmp", query, output });
await runner.run("Initial task");
const result = await runner.resume("Follow up");
assert.strictEqual(resumeCapture.options.resume, "sess-42");
assert.strictEqual(resumeCapture.prompt, "Follow up");
assert.strictEqual(result.success, true);
assert.strictEqual(result.text, "Resumed");
});
test("drainOutput() returns buffered lines and clears buffer", async () => {
const messages = [
{ type: "assistant", content: "Line 1" },
{ type: "result", subtype: "success", result: "Line 2" },
];
const output = new PassThrough();
const runner = new AgentRunner({
cwd: "/tmp",
query: mockQuery(messages),
output,
});
await runner.run("Task");
const drained = runner.drainOutput();
assert.strictEqual(drained.length, 2);
assert.deepStrictEqual(JSON.parse(drained[0]), messages[0]);
assert.deepStrictEqual(JSON.parse(drained[1]), messages[1]);
// Buffer should be empty after drain
const secondDrain = runner.drainOutput();
assert.strictEqual(secondDrain.length, 0);
});
test("run() captures error when query throws and returns buffered output", async () => {
async function* failingQuery() {
yield { type: "system", subtype: "init", session_id: "sess-err" };
yield { type: "assistant", content: "Partial work" };
throw new Error("Claude Code process exited with code 1");
}
const output = new PassThrough();
const runner = new AgentRunner({
cwd: "/tmp",
query: () => failingQuery(),
output,
});
const result = await runner.run("Task");
assert.strictEqual(result.success, false);
assert.ok(result.error);
assert.match(result.error.message, /exited with code 1/);
assert.strictEqual(result.sessionId, "sess-err");
// Buffered output should contain the messages yielded before the error
const drained = runner.drainOutput();
assert.strictEqual(drained.length, 2);
});
test("resume() captures error when query throws", async () => {
const initMessages = [
{ type: "system", subtype: "init", session_id: "sess-r" },
{ type: "result", subtype: "success", result: "OK" },
];
let callCount = 0;
const query = async function* () {
callCount++;
if (callCount === 1) {
for (const m of initMessages) yield m;
} else {
yield { type: "assistant", content: "Resuming..." };
throw new Error("Process crashed");
}
};
const output = new PassThrough();
const runner = new AgentRunner({ cwd: "/tmp", query, output });
await runner.run("Task");
const result = await runner.resume("Continue");
assert.strictEqual(result.success, false);
assert.ok(result.error);
assert.match(result.error.message, /Process crashed/);
});
test("createAgentRunner factory returns an AgentRunner instance", () => {
const runner = createAgentRunner({
cwd: "/tmp",
query: async function* () {},
output: new PassThrough(),
});
assert.ok(runner instanceof AgentRunner);
});
});
import { describe, test } from "node:test";
import assert from "node:assert";
import { PassThrough } from "node:stream";
import {
AgentRunner,
Supervisor,
createSupervisor,
} from "@forwardimpact/libeval";
import { isDone } from "../src/supervisor.js";
/**
* Create a mock AgentRunner that yields pre-scripted responses.
* Each call to run() or resume() pops the next response from the array.
* @param {object[]} responses - Array of {text, success} objects
* @param {object[]} [messages] - Messages to buffer per turn
* @returns {AgentRunner}
*/
function createMockRunner(responses, messages) {
const output = new PassThrough();
let callIndex = 0;
const runner = new AgentRunner({
cwd: "/tmp",
query: async function* () {},
output,
});
// Override run and resume to return scripted responses
runner.run = async (_task) => {
const resp = responses[callIndex++];
// Buffer messages for drainOutput
const msgs = messages?.[callIndex - 1] ?? [
{ type: "assistant", content: resp.text },
];
for (const m of msgs) {
runner.buffer.push(JSON.stringify(m));
}
runner.sessionId = "mock-session";
return {
success: resp.success ?? true,
text: resp.text,
sessionId: "mock-session",
};
};
runner.resume = async (_prompt) => {
const resp = responses[callIndex++];
const msgs = messages?.[callIndex - 1] ?? [
{ type: "assistant", content: resp.text },
];
for (const m of msgs) {
runner.buffer.push(JSON.stringify(m));
}
return { success: resp.success ?? true, text: resp.text };
};
return runner;
}
describe("isDone", () => {
test("detects EVALUATION_COMPLETE on its own line", () => {
assert.strictEqual(isDone("EVALUATION_COMPLETE"), true);
assert.strictEqual(
isDone("Some text\nEVALUATION_COMPLETE\nMore text"),
true,
);
assert.strictEqual(isDone("Done.\n\nEVALUATION_COMPLETE"), true);
});
test("does not match EVALUATION_COMPLETE embedded in text", () => {
assert.strictEqual(isDone("not EVALUATION_COMPLETE yet"), false);
assert.strictEqual(isDone("The agent is EVALUATION_COMPLETE done"), false);
assert.strictEqual(isDone("EVALUATION_COMPLETE_EXTRA"), false);
});
test("does not match empty or unrelated text", () => {
assert.strictEqual(isDone(""), false);
assert.strictEqual(isDone("All done!"), false);
assert.strictEqual(isDone("DONE"), false);
});
});
describe("Supervisor", () => {
test("constructor throws on missing agentRunner", () => {
assert.throws(
() =>
new Supervisor({
supervisorRunner: createMockRunner([]),
output: new PassThrough(),
}),
/agentRunner is required/,
);
});
test("constructor throws on missing supervisorRunner", () => {
assert.throws(
() =>
new Supervisor({
agentRunner: createMockRunner([]),
output: new PassThrough(),
}),
/supervisorRunner is required/,
);
});
test("constructor throws on missing output", () => {
assert.throws(
() =>
new Supervisor({
agentRunner: createMockRunner([]),
supervisorRunner: createMockRunner([]),
}),
/output is required/,
);
});
test("completes on EVALUATION_COMPLETE from supervisor", async () => {
const agentRunner = createMockRunner([
{ text: "I installed the packages." },
]);
const supervisorRunner = createMockRunner([
{ text: "Good work.\n\nEVALUATION_COMPLETE" },
]);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
const result = await supervisor.run("Install stuff");
assert.strictEqual(result.success, true);
assert.strictEqual(result.turns, 1);
});
test("runs multiple turns before completion", async () => {
const agentRunner = createMockRunner([
{ text: "Started working." },
{ text: "Made progress." },
{ text: "Finished everything." },
]);
const supervisorRunner = createMockRunner([
{ text: "Keep going, you need to do more." },
{ text: "Almost there, continue." },
{ text: "EVALUATION_COMPLETE" },
]);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
const result = await supervisor.run("Do the work");
assert.strictEqual(result.success, true);
assert.strictEqual(result.turns, 3);
});
test("enforces maxTurns limit", async () => {
// Agent responds to every turn, supervisor never says done
const agentRunner = createMockRunner([
{ text: "Turn 0" },
{ text: "Turn 1" },
{ text: "Turn 2" },
]);
const supervisorRunner = createMockRunner([
{ text: "Continue." },
{ text: "Continue." },
]);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 2,
});
const result = await supervisor.run("Endless task");
assert.strictEqual(result.success, false);
assert.strictEqual(result.turns, 2);
});
test("output contains tagged lines with correct source and turn", async () => {
const agentMessages = [[{ type: "assistant", content: "Working" }]];
const supervisorMessages = [
[{ type: "assistant", content: "EVALUATION_COMPLETE" }],
];
const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
const supervisorRunner = createMockRunner(
[{ text: "EVALUATION_COMPLETE" }],
supervisorMessages,
);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
await supervisor.run("Task");
const data = output.read()?.toString() ?? "";
const lines = data
.trim()
.split("\n")
.filter((l) => l.length > 0);
// Should have: agent turn 0, supervisor turn 1, orchestrator summary
assert.ok(lines.length >= 3);
const agentLine = JSON.parse(lines[0]);
assert.strictEqual(agentLine.source, "agent");
assert.strictEqual(agentLine.turn, 0);
assert.ok("event" in agentLine);
const supervisorLine = JSON.parse(lines[1]);
assert.strictEqual(supervisorLine.source, "supervisor");
assert.strictEqual(supervisorLine.turn, 1);
assert.ok("event" in supervisorLine);
const summaryLine = JSON.parse(lines[lines.length - 1]);
assert.strictEqual(summaryLine.source, "orchestrator");
assert.strictEqual(summaryLine.type, "summary");
assert.strictEqual(summaryLine.success, true);
});
test("events are nested under event key (no field collisions)", async () => {
const sourceEvent = {
type: "assistant",
source: "sdk-internal",
content: "test",
};
const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
const supervisorRunner = createMockRunner(
[{ text: "EVALUATION_COMPLETE" }],
[[{ type: "assistant", content: "ok" }]],
);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
await supervisor.run("Task");
const data = output.read()?.toString() ?? "";
const lines = data
.trim()
.split("\n")
.filter((l) => l.length > 0);
const tagged = JSON.parse(lines[0]);
// The original event's `source` field is preserved inside `event`
assert.strictEqual(tagged.source, "agent");
assert.strictEqual(tagged.event.source, "sdk-internal");
});
test("drains agent output and emits summary when agent errors on turn 0", async () => {
const agentMessages = [[{ type: "assistant", content: "Partial work" }]];
const agentRunner = createMockRunner(
[{ text: "Partial work", success: false }],
agentMessages,
);
// Override run to simulate an error return
const origRun = agentRunner.run;
agentRunner.run = async (task) => {
const result = await origRun.call(agentRunner, task);
return { ...result, error: new Error("Process exited with code 1") };
};
const supervisorRunner = createMockRunner([]);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
const result = await supervisor.run("Task");
assert.strictEqual(result.success, false);
assert.strictEqual(result.turns, 0);
// Output should still contain the agent's buffered lines + summary
const data = output.read()?.toString() ?? "";
const lines = data
.trim()
.split("\n")
.filter((l) => l.length > 0);
assert.ok(lines.length >= 2, "Expected at least agent line + summary");
const agentLine = JSON.parse(lines[0]);
assert.strictEqual(agentLine.source, "agent");
assert.strictEqual(agentLine.turn, 0);
const summaryLine = JSON.parse(lines[lines.length - 1]);
assert.strictEqual(summaryLine.source, "orchestrator");
assert.strictEqual(summaryLine.success, false);
assert.strictEqual(summaryLine.turns, 0);
});
test("createSupervisor factory returns a Supervisor instance", () => {
const supervisor = createSupervisor({
supervisorCwd: "/tmp/sup",
agentCwd: "/tmp/agent",
query: async function* () {},
output: new PassThrough(),
});
assert.ok(supervisor instanceof Supervisor);
});
});
import { describe, test } from "node:test";
import assert from "node:assert";
import { PassThrough } from "node:stream";
import { TeeWriter, createTeeWriter } from "@forwardimpact/libeval";
/**
* Collect all data written to a PassThrough stream as a string.
* @param {PassThrough} stream
* @returns {string}
*/
function collect(stream) {
const data = stream.read();
return data ? data.toString() : "";
}
/**
* Write lines to a TeeWriter and wait for it to finish.
* @param {TeeWriter} writer
* @param {string[]} lines - JSON lines to write
*/
async function writeLines(writer, lines) {
for (const line of lines) {
writer.write(line + "\n");
}
await new Promise((resolve) => writer.end(resolve));
}
describe("TeeWriter", () => {
test("constructor throws on missing fileStream", () => {
assert.throws(
() => new TeeWriter({ textStream: new PassThrough() }),
/fileStream is required/,
);
});
test("constructor throws on missing textStream", () => {
assert.throws(
() => new TeeWriter({ fileStream: new PassThrough() }),
/textStream is required/,
);
});
test("writes NDJSON to fileStream and text to textStream in raw mode", async () => {
const fileStream = new PassThrough();
const textStream = new PassThrough();
const writer = new TeeWriter({ fileStream, textStream, mode: "raw" });
const events = [
JSON.stringify({
type: "system",
subtype: "init",
session_id: "s1",
model: "opus",
}),
JSON.stringify({
type: "assistant",
message: {
content: [{ type: "text", text: "Hello world" }],
usage: { input_tokens: 10, output_tokens: 5 },
},
}),
JSON.stringify({
type: "assistant",
message: {
content: [
{
type: "tool_use",
id: "t1",
name: "Bash",
input: { command: "ls" },
},
],
usage: { input_tokens: 20, output_tokens: 10 },
},
}),
JSON.stringify({
type: "result",
subtype: "success",
duration_ms: 5000,
num_turns: 2,
total_cost_usd: 0.05,
usage: { input_tokens: 30, output_tokens: 15 },
}),
];
await writeLines(writer, events);
const fileData = collect(fileStream);
const textData = collect(textStream);
// File should contain all NDJSON lines
const fileLines = fileData.trim().split("\n");
assert.strictEqual(fileLines.length, 4);
assert.deepStrictEqual(JSON.parse(fileLines[0]).type, "system");
assert.deepStrictEqual(JSON.parse(fileLines[3]).type, "result");
// Text should contain human-readable output
assert.ok(textData.includes("Hello world"));
assert.ok(textData.includes("> Tool: Bash"));
assert.ok(textData.includes("--- Result: success"));
});
test("streams text incrementally as events arrive", async () => {
const fileStream = new PassThrough();
const textStream = new PassThrough();
const writer = new TeeWriter({ fileStream, textStream, mode: "raw" });
// Write first assistant message
writer.write(
JSON.stringify({
type: "assistant",
message: {
content: [{ type: "text", text: "First message" }],
usage: { input_tokens: 10, output_tokens: 5 },
},
}) + "\n",
);
// Text should be available before stream ends
const firstText = collect(textStream);
assert.ok(firstText.includes("First message"));
writer.write(
JSON.stringify({
type: "assistant",
message: {
content: [{ type: "text", text: "Second message" }],
usage: { input_tokens: 20, output_tokens: 10 },
},
}) + "\n",
);
const secondText = collect(textStream);
assert.ok(secondText.includes("Second message"));
await new Promise((resolve) => writer.end(resolve));
});
test("supervised mode shows source labels and unwraps events", async () => {
const fileStream = new PassThrough();
const textStream = new PassThrough();
const writer = new TeeWriter({
fileStream,
textStream,
mode: "supervised",
});
const events = [
JSON.stringify({
source: "agent",
turn: 0,
event: {
type: "assistant",
message: {
content: [{ type: "text", text: "Working on it" }],
usage: { input_tokens: 10, output_tokens: 5 },
},
},
}),
JSON.stringify({
source: "supervisor",
turn: 1,
event: {
type: "assistant",
message: {
content: [{ type: "text", text: "Looks good" }],
usage: { input_tokens: 20, output_tokens: 10 },
},
},
}),
JSON.stringify({
source: "orchestrator",
type: "summary",
success: true,
turns: 1,
}),
];
await writeLines(writer, events);
const fileData = collect(fileStream);
const textData = collect(textStream);
// File should contain all raw tagged NDJSON
const fileLines = fileData.trim().split("\n");
assert.strictEqual(fileLines.length, 3);
assert.strictEqual(JSON.parse(fileLines[0]).source, "agent");
// Text should show source labels
assert.ok(textData.includes("[agent]"));
assert.ok(textData.includes("Working on it"));
assert.ok(textData.includes("[supervisor]"));
assert.ok(textData.includes("Looks good"));
assert.ok(textData.includes("Evaluation completed after 1 turns"));
});
test("supervised mode shows incomplete status on failure", async () => {
const fileStream = new PassThrough();
const textStream = new PassThrough();
const writer = new TeeWriter({
fileStream,
textStream,
mode: "supervised",
});
await writeLines(writer, [
JSON.stringify({
source: "orchestrator",
type: "summary",
success: false,
turns: 5,
}),
]);
const textData = collect(textStream);
assert.ok(textData.includes("Evaluation incomplete after 5 turns"));
});
test("supervised mode only shows source label on change", async () => {
const fileStream = new PassThrough();
const textStream = new PassThrough();
const writer = new TeeWriter({
fileStream,
textStream,
mode: "supervised",
});
const events = [
JSON.stringify({
source: "agent",
turn: 0,
event: {
type: "assistant",
message: {
content: [{ type: "text", text: "Step 1" }],
usage: { input_tokens: 10, output_tokens: 5 },
},
},
}),
JSON.stringify({
source: "agent",
turn: 0,
event: {
type: "assistant",
message: {
content: [{ type: "text", text: "Step 2" }],
usage: { input_tokens: 10, output_tokens: 5 },
},
},
}),
];
await writeLines(writer, events);
const textData = collect(textStream);
// [agent] label should appear only once
const agentLabels = textData.split("[agent]").length - 1;
assert.strictEqual(agentLabels, 1);
});
test("handles partial lines across chunks", async () => {
const fileStream = new PassThrough();
const textStream = new PassThrough();
const writer = new TeeWriter({ fileStream, textStream, mode: "raw" });
const fullLine = JSON.stringify({
type: "assistant",
message: {
content: [{ type: "text", text: "Split message" }],
usage: { input_tokens: 10, output_tokens: 5 },
},
});
// Split the line across two chunks
const mid = Math.floor(fullLine.length / 2);
writer.write(fullLine.slice(0, mid));
writer.write(fullLine.slice(mid) + "\n");
await new Promise((resolve) => writer.end(resolve));
const textData = collect(textStream);
assert.ok(textData.includes("Split message"));
});
test("truncates long tool input", async () => {
const fileStream = new PassThrough();
const textStream = new PassThrough();
const writer = new TeeWriter({ fileStream, textStream, mode: "raw" });
const longInput = { command: "x".repeat(300) };
const event = JSON.stringify({
type: "assistant",
message: {
content: [
{ type: "tool_use", id: "t1", name: "Bash", input: longInput },
],
usage: { input_tokens: 10, output_tokens: 5 },
},
});
await writeLines(writer, [event]);
const textData = collect(textStream);
assert.ok(textData.includes("> Tool: Bash"));
assert.ok(textData.includes("..."));
// Truncated to ~200 chars
const toolLine = textData.split("\n").find((l) => l.startsWith("> Tool:"));
assert.ok(toolLine.length < 250);
});
test("defaults to raw mode", () => {
const writer = new TeeWriter({
fileStream: new PassThrough(),
textStream: new PassThrough(),
});
assert.strictEqual(writer.mode, "raw");
});
test("createTeeWriter factory returns a TeeWriter instance", () => {
const writer = createTeeWriter({
fileStream: new PassThrough(),
textStream: new PassThrough(),
});
assert.ok(writer instanceof TeeWriter);
});
});
+26
-1

@@ -1,5 +0,7 @@

#!/usr/bin/env node
#!/usr/bin/env bun
import { runOutputCommand } from "../src/commands/output.js";
import { runTeeCommand } from "../src/commands/tee.js";
import { runRunCommand } from "../src/commands/run.js";
import { runSuperviseCommand } from "../src/commands/supervise.js";

@@ -9,2 +11,4 @@ const COMMANDS = {

tee: runTeeCommand,
run: runRunCommand,
supervise: runSuperviseCommand,
};

@@ -21,3 +25,22 @@

tee [output.ndjson] Stream text to stdout, optionally save raw NDJSON
run [options] Run a single agent via the Claude Agent SDK
supervise [options] Run a supervised agent ↔ supervisor relay loop
Run options:
--task=PATH Path to task file (required)
--cwd=DIR Agent working directory (default: .)
--model=MODEL Claude model to use (default: opus)
--max-turns=N Maximum agentic turns (default: 50)
--output=PATH Write NDJSON trace to file (default: stdout)
--allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
Supervise options:
--task=PATH Path to task file (required)
--supervisor-cwd=DIR Supervisor working directory (default: .)
--agent-cwd=DIR Agent working directory (default: temp directory)
--model=MODEL Claude model to use (default: opus)
--max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
--output=PATH Write NDJSON trace to file (default: stdout)
--allowed-tools=LIST Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)
Options:

@@ -32,2 +55,4 @@ --help Show this help message

fit-eval tee output.ndjson < trace.ndjson
fit-eval run --task=.github/tasks/security-audit.md --model=opus
fit-eval supervise --task=scenarios/guide-setup/task.md --supervisor-cwd=.
`.trim();

@@ -34,0 +59,0 @@

export { TraceCollector, createTraceCollector } from "./src/trace-collector.js";
export { AgentRunner, createAgentRunner } from "./src/agent-runner.js";
export { Supervisor, createSupervisor } from "./src/supervisor.js";
export { TeeWriter, createTeeWriter } from "./src/tee-writer.js";
+6
-3
{
"name": "@forwardimpact/libeval",
"version": "0.1.0",
"version": "0.1.1",
"description": "Process Claude Code stream-json output into structured traces",

@@ -13,7 +13,10 @@ "license": "Apache-2.0",

"engines": {
"node": ">=22.0.0"
"bun": ">=1.2.0"
},
"scripts": {
"test": "node --test test/*.test.js"
"test": "bun run node --test test/*.test.js"
},
"dependencies": {
"@anthropic-ai/claude-agent-sdk": "^0.1.0"
},
"publishConfig": {

@@ -20,0 +23,0 @@ "access": "public"