@forwardimpact/libeval
Advanced tools
| /** | ||
| * AgentRunner — runs a single Claude Agent SDK session and emits raw NDJSON | ||
| * events to an output stream. Building block for both `fit-eval run` and | ||
| * `fit-eval supervise`. | ||
| * | ||
| * Follows OO+DI: constructor injection, factory function, tests bypass factory. | ||
| */ | ||
| export class AgentRunner { | ||
| /** | ||
| * @param {object} deps | ||
| * @param {string} deps.cwd - Agent working directory | ||
| * @param {function} deps.query - SDK query function (injected for testing) | ||
| * @param {import("stream").Writable} deps.output - Stream to emit NDJSON to | ||
| * @param {string} [deps.model] - Claude model identifier | ||
| * @param {number} [deps.maxTurns] - Maximum agentic turns | ||
| * @param {string[]} [deps.allowedTools] - Tools the agent may use | ||
| * @param {string} [deps.permissionMode] - SDK permission mode | ||
| */ | ||
| constructor({ | ||
| cwd, | ||
| query, | ||
| output, | ||
| model, | ||
| maxTurns, | ||
| allowedTools, | ||
| permissionMode, | ||
| }) { | ||
| if (!cwd) throw new Error("cwd is required"); | ||
| if (!query) throw new Error("query is required"); | ||
| if (!output) throw new Error("output is required"); | ||
| this.cwd = cwd; | ||
| this.query = query; | ||
| this.output = output; | ||
| this.model = model ?? "opus"; | ||
| this.maxTurns = maxTurns ?? 50; | ||
| this.allowedTools = allowedTools ?? [ | ||
| "Bash", | ||
| "Read", | ||
| "Glob", | ||
| "Grep", | ||
| "Write", | ||
| "Edit", | ||
| ]; | ||
| this.permissionMode = permissionMode ?? "bypassPermissions"; | ||
| this.sessionId = null; | ||
| this.buffer = []; | ||
| } | ||
| /** | ||
| * Run a new agent session with the given task. | ||
| * @param {string} task - The task prompt | ||
| * @returns {Promise<{success: boolean, text: string, sessionId: string|null}>} | ||
| */ | ||
| async run(task) { | ||
| let text = ""; | ||
| let stopReason = null; | ||
| let error = null; | ||
| try { | ||
| for await (const message of this.query({ | ||
| prompt: task, | ||
| options: { | ||
| cwd: this.cwd, | ||
| allowedTools: this.allowedTools, | ||
| maxTurns: this.maxTurns, | ||
| model: this.model, | ||
| permissionMode: this.permissionMode, | ||
| allowDangerouslySkipPermissions: true, | ||
| }, | ||
| })) { | ||
| const line = JSON.stringify(message); | ||
| this.output.write(line + "\n"); | ||
| this.buffer.push(line); | ||
| if (message.type === "system" && message.subtype === "init") { | ||
| this.sessionId = message.session_id; | ||
| } | ||
| if (message.type === "result") { | ||
| text = message.result ?? ""; | ||
| stopReason = message.subtype; | ||
| } | ||
| } | ||
| } catch (err) { | ||
| error = err; | ||
| } | ||
| const success = !error && stopReason === "success"; | ||
| return { success, text, sessionId: this.sessionId, error }; | ||
| } | ||
| /** | ||
| * Resume an existing session with a follow-up prompt. | ||
| * @param {string} prompt - The follow-up prompt | ||
| * @returns {Promise<{success: boolean, text: string}>} | ||
| */ | ||
| async resume(prompt) { | ||
| let text = ""; | ||
| let stopReason = null; | ||
| let error = null; | ||
| try { | ||
| for await (const message of this.query({ | ||
| prompt, | ||
| options: { resume: this.sessionId }, | ||
| })) { | ||
| const line = JSON.stringify(message); | ||
| this.output.write(line + "\n"); | ||
| this.buffer.push(line); | ||
| if (message.type === "result") { | ||
| text = message.result ?? ""; | ||
| stopReason = message.subtype; | ||
| } | ||
| } | ||
| } catch (err) { | ||
| error = err; | ||
| } | ||
| const success = !error && stopReason === "success"; | ||
| return { success, text, error }; | ||
| } | ||
| /** | ||
| * Drain buffered output lines. Used by Supervisor to tag and re-emit lines. | ||
| * @returns {string[]} | ||
| */ | ||
| drainOutput() { | ||
| const lines = [...this.buffer]; | ||
| this.buffer = []; | ||
| return lines; | ||
| } | ||
| } | ||
| /** | ||
| * Factory function — wires real dependencies. | ||
| * @param {object} deps - Same as AgentRunner constructor | ||
| * @returns {AgentRunner} | ||
| */ | ||
| export function createAgentRunner(deps) { | ||
| return new AgentRunner(deps); | ||
| } |
| import { readFileSync, createWriteStream } from "node:fs"; | ||
| import { resolve } from "node:path"; | ||
| import { createAgentRunner } from "../agent-runner.js"; | ||
| import { createTeeWriter } from "../tee-writer.js"; | ||
| /** | ||
| * Parse a --key=value or --key value flag from args. | ||
| * @param {string[]} args | ||
| * @param {string} name - Flag name without -- | ||
| * @returns {string|undefined} | ||
| */ | ||
| function parseFlag(args, name) { | ||
| const prefix = `--${name}=`; | ||
| for (let i = 0; i < args.length; i++) { | ||
| if (args[i].startsWith(prefix)) return args[i].slice(prefix.length); | ||
| if (args[i] === `--${name}` && i + 1 < args.length) return args[i + 1]; | ||
| } | ||
| return undefined; | ||
| } | ||
| /** | ||
| * Run command — execute a single agent via the Claude Agent SDK. | ||
| * | ||
| * Usage: fit-eval run [options] | ||
| * | ||
| * Options: | ||
| * --task=PATH Path to task file (required) | ||
| * --cwd=DIR Agent working directory (default: .) | ||
| * --model=MODEL Claude model to use (default: opus) | ||
| * --max-turns=N Maximum agentic turns (default: 50) | ||
| * --output=PATH Write NDJSON trace to file (default: stdout) | ||
| * --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit) | ||
| * | ||
| * @param {string[]} args - Command arguments | ||
| */ | ||
| export async function runRunCommand(args) { | ||
| const task = parseFlag(args, "task"); | ||
| if (!task) throw new Error("--task is required"); | ||
| const cwd = resolve(parseFlag(args, "cwd") ?? "."); | ||
| const model = parseFlag(args, "model") ?? "opus"; | ||
| const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "50", 10); | ||
| const outputPath = parseFlag(args, "output"); | ||
| const allowedTools = ( | ||
| parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit" | ||
| ).split(","); | ||
| const taskContent = readFileSync(task, "utf8"); | ||
| // When --output is specified, stream text to stdout while writing NDJSON to file. | ||
| // Otherwise, write NDJSON directly to stdout (backwards-compatible). | ||
| const fileStream = outputPath ? createWriteStream(outputPath) : null; | ||
| const output = fileStream | ||
| ? createTeeWriter({ fileStream, textStream: process.stdout, mode: "raw" }) | ||
| : process.stdout; | ||
| const { query } = await import("@anthropic-ai/claude-agent-sdk"); | ||
| const runner = createAgentRunner({ | ||
| cwd, | ||
| query, | ||
| output, | ||
| model, | ||
| maxTurns, | ||
| allowedTools, | ||
| }); | ||
| const result = await runner.run(taskContent); | ||
| if (fileStream) { | ||
| await new Promise((r) => output.end(r)); | ||
| await new Promise((r) => fileStream.end(r)); | ||
| } | ||
| process.exit(result.success ? 0 : 1); | ||
| } |
| import { readFileSync, createWriteStream, mkdtempSync } from "node:fs"; | ||
| import { resolve, join } from "node:path"; | ||
| import { tmpdir } from "node:os"; | ||
| import { createSupervisor } from "../supervisor.js"; | ||
| import { createTeeWriter } from "../tee-writer.js"; | ||
| /** | ||
| * Parse a --key=value or --key value flag from args. | ||
| * @param {string[]} args | ||
| * @param {string} name - Flag name without -- | ||
| * @returns {string|undefined} | ||
| */ | ||
| function parseFlag(args, name) { | ||
| const prefix = `--${name}=`; | ||
| for (let i = 0; i < args.length; i++) { | ||
| if (args[i].startsWith(prefix)) return args[i].slice(prefix.length); | ||
| if (args[i] === `--${name}` && i + 1 < args.length) return args[i + 1]; | ||
| } | ||
| return undefined; | ||
| } | ||
| /** | ||
| * Supervise command — run two agents in a relay loop via the Claude Agent SDK. | ||
| * | ||
| * Usage: fit-eval supervise [options] | ||
| * | ||
| * Options: | ||
| * --task=PATH Path to task file (required) | ||
| * --supervisor-cwd=DIR Supervisor working directory (default: .) | ||
| * --agent-cwd=DIR Agent working directory (default: temp directory) | ||
| * --model=MODEL Claude model to use (default: opus) | ||
| * --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20) | ||
| * --output=PATH Write NDJSON trace to file (default: stdout) | ||
| * --allowed-tools=LIST Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit) | ||
| * | ||
| * @param {string[]} args - Command arguments | ||
| */ | ||
| export async function runSuperviseCommand(args) { | ||
| const task = parseFlag(args, "task"); | ||
| if (!task) throw new Error("--task is required"); | ||
| const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? "."); | ||
| const agentCwd = resolve( | ||
| parseFlag(args, "agent-cwd") ?? | ||
| mkdtempSync(join(tmpdir(), "fit-eval-agent-")), | ||
| ); | ||
| const model = parseFlag(args, "model") ?? "opus"; | ||
| const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "20", 10); | ||
| const outputPath = parseFlag(args, "output"); | ||
| const allowedTools = ( | ||
| parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit" | ||
| ).split(","); | ||
| const taskContent = readFileSync(task, "utf8"); | ||
| // When --output is specified, stream text to stdout while writing NDJSON to file. | ||
| // Otherwise, write NDJSON directly to stdout (backwards-compatible). | ||
| const fileStream = outputPath ? createWriteStream(outputPath) : null; | ||
| const output = fileStream | ||
| ? createTeeWriter({ | ||
| fileStream, | ||
| textStream: process.stdout, | ||
| mode: "supervised", | ||
| }) | ||
| : process.stdout; | ||
| const { query } = await import("@anthropic-ai/claude-agent-sdk"); | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd, | ||
| agentCwd, | ||
| query, | ||
| output, | ||
| model, | ||
| maxTurns, | ||
| allowedTools, | ||
| }); | ||
| const result = await supervisor.run(taskContent); | ||
| if (fileStream) { | ||
| await new Promise((r) => output.end(r)); | ||
| await new Promise((r) => fileStream.end(r)); | ||
| } | ||
| process.exit(result.success ? 0 : 1); | ||
| } |
| /** | ||
| * Supervisor — orchestrates a relay loop between an agent and a supervisor, | ||
| * both running as AgentRunner instances. The agent works on a task while the | ||
| * supervisor observes and decides when the evaluation is complete. | ||
| * | ||
| * Follows OO+DI: constructor injection, factory function, tests bypass factory. | ||
| */ | ||
| import { PassThrough } from "node:stream"; | ||
| import { createAgentRunner } from "./agent-runner.js"; | ||
| /** | ||
| * Check if the supervisor's response signals evaluation completion. | ||
| * Uses a structured signal — `EVALUATION_COMPLETE` on its own line — | ||
| * to avoid false positives from natural language. | ||
| * @param {string} text | ||
| * @returns {boolean} | ||
| */ | ||
| export function isDone(text) { | ||
| return /^EVALUATION_COMPLETE$/m.test(text); | ||
| } | ||
| export class Supervisor { | ||
| /** | ||
| * @param {object} deps | ||
| * @param {import("./agent-runner.js").AgentRunner} deps.agentRunner - Runs the agent sessions | ||
| * @param {import("./agent-runner.js").AgentRunner} deps.supervisorRunner - Runs the supervisor sessions | ||
| * @param {import("stream").Writable} deps.output - Stream to emit tagged NDJSON to | ||
| * @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges | ||
| */ | ||
| constructor({ agentRunner, supervisorRunner, output, maxTurns }) { | ||
| if (!agentRunner) throw new Error("agentRunner is required"); | ||
| if (!supervisorRunner) throw new Error("supervisorRunner is required"); | ||
| if (!output) throw new Error("output is required"); | ||
| this.agentRunner = agentRunner; | ||
| this.supervisorRunner = supervisorRunner; | ||
| this.output = output; | ||
| this.maxTurns = maxTurns ?? 20; | ||
| } | ||
| /** | ||
| * Run the supervisor ↔ agent relay loop. | ||
| * @param {string} task - The initial task for the agent | ||
| * @returns {Promise<{success: boolean, turns: number}>} | ||
| */ | ||
| async run(task) { | ||
| // Turn 0: Agent receives the task and starts working | ||
| let agentResult = await this.agentRunner.run(task); | ||
| this.emitTagged("agent", 0); | ||
| if (agentResult.error) { | ||
| this.emitSummary({ success: false, turns: 0 }); | ||
| return { success: false, turns: 0 }; | ||
| } | ||
| for (let turn = 1; turn <= this.maxTurns; turn++) { | ||
| // Supervisor observes the agent's output | ||
| const supervisorPrompt = | ||
| `The agent reported:\n\n${agentResult.text}\n\n` + | ||
| `Decide: provide guidance, answer a question, or say EVALUATION_COMPLETE on its own line.`; | ||
| let supervisorResult; | ||
| if (turn === 1) { | ||
| supervisorResult = await this.supervisorRunner.run(supervisorPrompt); | ||
| } else { | ||
| supervisorResult = await this.supervisorRunner.resume(supervisorPrompt); | ||
| } | ||
| this.emitTagged("supervisor", turn); | ||
| if (supervisorResult.error) { | ||
| this.emitSummary({ success: false, turns: turn }); | ||
| return { success: false, turns: turn }; | ||
| } | ||
| if (isDone(supervisorResult.text)) { | ||
| this.emitSummary({ success: true, turns: turn }); | ||
| return { success: true, turns: turn }; | ||
| } | ||
| // Supervisor's response becomes the agent's next input | ||
| agentResult = await this.agentRunner.resume(supervisorResult.text); | ||
| this.emitTagged("agent", turn); | ||
| if (agentResult.error) { | ||
| this.emitSummary({ success: false, turns: turn }); | ||
| return { success: false, turns: turn }; | ||
| } | ||
| } | ||
| this.emitSummary({ success: false, turns: this.maxTurns }); | ||
| return { success: false, turns: this.maxTurns }; | ||
| } | ||
| /** | ||
| * Drain a runner's buffered output and re-emit each line tagged with | ||
| * source and turn metadata. | ||
| * @param {"agent"|"supervisor"} source | ||
| * @param {number} turn | ||
| */ | ||
| emitTagged(source, turn) { | ||
| const runner = | ||
| source === "agent" ? this.agentRunner : this.supervisorRunner; | ||
| for (const line of runner.drainOutput()) { | ||
| const event = JSON.parse(line); | ||
| const tagged = { source, turn, event }; | ||
| this.output.write(JSON.stringify(tagged) + "\n"); | ||
| } | ||
| } | ||
| /** | ||
| * Emit a final orchestrator summary line. | ||
| * @param {{success: boolean, turns: number}} result | ||
| */ | ||
| emitSummary(result) { | ||
| const summary = { | ||
| source: "orchestrator", | ||
| type: "summary", | ||
| success: result.success, | ||
| turns: result.turns, | ||
| }; | ||
| this.output.write(JSON.stringify(summary) + "\n"); | ||
| } | ||
| } | ||
| /** | ||
| * Factory function — wires both AgentRunners with their respective configs. | ||
| * @param {object} deps | ||
| * @param {string} deps.supervisorCwd - Supervisor working directory | ||
| * @param {string} deps.agentCwd - Agent working directory | ||
| * @param {function} deps.query - SDK query function | ||
| * @param {import("stream").Writable} deps.output - Final output stream | ||
| * @param {string} [deps.model] - Claude model identifier | ||
| * @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges | ||
| * @param {string[]} [deps.allowedTools] - Tools the agent may use | ||
| * @returns {Supervisor} | ||
| */ | ||
| export function createSupervisor({ | ||
| supervisorCwd, | ||
| agentCwd, | ||
| query, | ||
| output, | ||
| model, | ||
| maxTurns, | ||
| allowedTools, | ||
| }) { | ||
| const agentRunner = createAgentRunner({ | ||
| cwd: agentCwd, | ||
| query, | ||
| output: new PassThrough(), | ||
| model, | ||
| maxTurns: 50, | ||
| allowedTools, | ||
| }); | ||
| const supervisorRunner = createAgentRunner({ | ||
| cwd: supervisorCwd, | ||
| query, | ||
| output: new PassThrough(), | ||
| model, | ||
| maxTurns: 10, | ||
| allowedTools: ["Read", "Glob", "Grep"], | ||
| }); | ||
| return new Supervisor({ agentRunner, supervisorRunner, output, maxTurns }); | ||
| } |
| /** | ||
| * TeeWriter — a Writable stream that writes raw NDJSON to a file while | ||
| * simultaneously streaming human-readable text to a separate stream (e.g. | ||
| * process.stdout). | ||
| * | ||
| * Supports two modes: | ||
| * - "raw" (default): expects standard stream-json events from AgentRunner | ||
| * - "supervised": expects tagged events {source, turn, event} from Supervisor | ||
| * | ||
| * Follows OO+DI: constructor injection, factory function, tests bypass factory. | ||
| */ | ||
| import { Writable } from "node:stream"; | ||
| import { TraceCollector } from "./trace-collector.js"; | ||
| export class TeeWriter extends Writable { | ||
| /** | ||
| * @param {object} deps | ||
| * @param {import("stream").Writable} deps.fileStream - Stream to write raw NDJSON to | ||
| * @param {import("stream").Writable} deps.textStream - Stream to write human-readable text to | ||
| * @param {"raw"|"supervised"} [deps.mode] - Event format: "raw" or "supervised" (default: "raw") | ||
| */ | ||
| constructor({ fileStream, textStream, mode }) { | ||
| super(); | ||
| if (!fileStream) throw new Error("fileStream is required"); | ||
| if (!textStream) throw new Error("textStream is required"); | ||
| this.fileStream = fileStream; | ||
| this.textStream = textStream; | ||
| this.mode = mode ?? "raw"; | ||
| this.collector = new TraceCollector(); | ||
| this.turnsEmitted = 0; | ||
| this.lastSource = null; | ||
| this.partial = ""; | ||
| } | ||
| /** | ||
| * @param {Buffer|string} chunk | ||
| * @param {string} encoding | ||
| * @param {function} callback | ||
| */ | ||
| _write(chunk, encoding, callback) { | ||
| const str = this.partial + chunk.toString(); | ||
| const lines = str.split("\n"); | ||
| this.partial = lines.pop() ?? ""; | ||
| for (const line of lines) { | ||
| if (!line.trim()) continue; | ||
| this.fileStream.write(line + "\n"); | ||
| this.processLine(line); | ||
| } | ||
| callback(); | ||
| } | ||
| /** | ||
| * @param {function} callback | ||
| */ | ||
| _final(callback) { | ||
| if (this.partial.trim()) { | ||
| this.fileStream.write(this.partial + "\n"); | ||
| this.processLine(this.partial); | ||
| } | ||
| if (this.mode === "raw" && this.collector.result) { | ||
| const text = this.collector.toText(); | ||
| const idx = text.lastIndexOf("\n---"); | ||
| if (idx !== -1) { | ||
| this.textStream.write(text.slice(idx) + "\n"); | ||
| } | ||
| } | ||
| callback(); | ||
| } | ||
| /** | ||
| * Process a single NDJSON line — feed to collector and flush text. | ||
| * @param {string} line | ||
| */ | ||
| processLine(line) { | ||
| if (this.mode === "supervised") { | ||
| this.processSupervisedLine(line); | ||
| } else { | ||
| this.collector.addLine(line); | ||
| this.flushTurns(); | ||
| } | ||
| } | ||
| /** | ||
| * Handle a tagged supervisor line: unwrap event, show source labels. | ||
| * @param {string} line | ||
| */ | ||
| processSupervisedLine(line) { | ||
| let parsed; | ||
| try { | ||
| parsed = JSON.parse(line); | ||
| } catch { | ||
| return; | ||
| } | ||
| if (parsed.source === "orchestrator" && parsed.type === "summary") { | ||
| const status = parsed.success ? "completed" : "incomplete"; | ||
| this.textStream.write( | ||
| `\n--- Evaluation ${status} after ${parsed.turns} turns ---\n`, | ||
| ); | ||
| return; | ||
| } | ||
| if (parsed.event) { | ||
| if (parsed.source && parsed.source !== this.lastSource) { | ||
| this.lastSource = parsed.source; | ||
| this.textStream.write(`\n[${parsed.source}]\n`); | ||
| } | ||
| this.collector.addLine(JSON.stringify(parsed.event)); | ||
| this.flushTurns(); | ||
| } | ||
| } | ||
| /** | ||
| * Emit text for any new turns accumulated by the collector. | ||
| */ | ||
| flushTurns() { | ||
| const turns = this.collector.turns; | ||
| while (this.turnsEmitted < turns.length) { | ||
| const turn = turns[this.turnsEmitted++]; | ||
| if (turn.role === "assistant") { | ||
| for (const block of turn.content) { | ||
| if (block.type === "text") { | ||
| this.textStream.write(block.text + "\n"); | ||
| } else if (block.type === "tool_use") { | ||
| const input = summarizeInput(block.input); | ||
| this.textStream.write(`> Tool: ${block.name} ${input}\n`); | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| /** | ||
| * Summarize tool input for text display, truncated to keep logs readable. | ||
| * @param {object} input - Tool input object | ||
| * @returns {string} Truncated summary | ||
| */ | ||
| function summarizeInput(input) { | ||
| if (!input || typeof input !== "object") return ""; | ||
| const json = JSON.stringify(input); | ||
| if (json.length <= 200) return json; | ||
| return json.slice(0, 197) + "..."; | ||
| } | ||
| /** | ||
| * Factory function — wires a TeeWriter with the given streams. | ||
| * @param {object} deps - Same as TeeWriter constructor | ||
| * @returns {TeeWriter} | ||
| */ | ||
| export function createTeeWriter(deps) { | ||
| return new TeeWriter(deps); | ||
| } |
| import { describe, test } from "node:test"; | ||
| import assert from "node:assert"; | ||
| import { PassThrough } from "node:stream"; | ||
| import { AgentRunner, createAgentRunner } from "@forwardimpact/libeval"; | ||
| /** | ||
| * Create a mock query function that yields canned messages. | ||
| * @param {object[]} messages - Messages to yield | ||
| * @param {function} [captureOptions] - Callback to capture query options | ||
| * @returns {function} | ||
| */ | ||
| function mockQuery(messages, captureOptions) { | ||
| return async function* (params) { | ||
| if (captureOptions) captureOptions(params); | ||
| for (const msg of messages) { | ||
| yield msg; | ||
| } | ||
| }; | ||
| } | ||
| /** | ||
| * Collect all NDJSON lines written to a PassThrough stream. | ||
| * @param {PassThrough} stream | ||
| * @returns {string[]} | ||
| */ | ||
| function collectLines(stream) { | ||
| const data = stream.read(); | ||
| if (!data) return []; | ||
| return data | ||
| .toString() | ||
| .trim() | ||
| .split("\n") | ||
| .filter((l) => l.length > 0); | ||
| } | ||
| describe("AgentRunner", () => { | ||
| test("constructor throws on missing cwd", () => { | ||
| assert.throws( | ||
| () => | ||
| new AgentRunner({ | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| }), | ||
| /cwd is required/, | ||
| ); | ||
| }); | ||
| test("constructor throws on missing query", () => { | ||
| assert.throws( | ||
| () => new AgentRunner({ cwd: "/tmp", output: new PassThrough() }), | ||
| /query is required/, | ||
| ); | ||
| }); | ||
| test("constructor throws on missing output", () => { | ||
| assert.throws( | ||
| () => | ||
| new AgentRunner({ | ||
| cwd: "/tmp", | ||
| query: async function* () {}, | ||
| }), | ||
| /output is required/, | ||
| ); | ||
| }); | ||
| test("constructor uses defaults for optional params", () => { | ||
| const runner = new AgentRunner({ | ||
| cwd: "/tmp", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| }); | ||
| assert.strictEqual(runner.model, "opus"); | ||
| assert.strictEqual(runner.maxTurns, 50); | ||
| assert.deepStrictEqual(runner.allowedTools, [ | ||
| "Bash", | ||
| "Read", | ||
| "Glob", | ||
| "Grep", | ||
| "Write", | ||
| "Edit", | ||
| ]); | ||
| assert.strictEqual(runner.permissionMode, "bypassPermissions"); | ||
| assert.strictEqual(runner.sessionId, null); | ||
| }); | ||
| test("run() writes NDJSON lines to output stream", async () => { | ||
| const messages = [ | ||
| { type: "system", subtype: "init", session_id: "sess-1" }, | ||
| { type: "assistant", content: "Working on it..." }, | ||
| { type: "result", subtype: "success", result: "Done." }, | ||
| ]; | ||
| const output = new PassThrough(); | ||
| const runner = new AgentRunner({ | ||
| cwd: "/tmp", | ||
| query: mockQuery(messages), | ||
| output, | ||
| }); | ||
| const result = await runner.run("Test task"); | ||
| const lines = collectLines(output); | ||
| assert.strictEqual(lines.length, 3); | ||
| assert.deepStrictEqual(JSON.parse(lines[0]), messages[0]); | ||
| assert.deepStrictEqual(JSON.parse(lines[1]), messages[1]); | ||
| assert.deepStrictEqual(JSON.parse(lines[2]), messages[2]); | ||
| assert.strictEqual(result.success, true); | ||
| assert.strictEqual(result.text, "Done."); | ||
| assert.strictEqual(result.sessionId, "sess-1"); | ||
| }); | ||
| test("run() captures sessionId from init event", async () => { | ||
| const messages = [ | ||
| { type: "system", subtype: "init", session_id: "my-session" }, | ||
| { type: "result", subtype: "success", result: "OK" }, | ||
| ]; | ||
| const output = new PassThrough(); | ||
| const runner = new AgentRunner({ | ||
| cwd: "/tmp", | ||
| query: mockQuery(messages), | ||
| output, | ||
| }); | ||
| await runner.run("Task"); | ||
| assert.strictEqual(runner.sessionId, "my-session"); | ||
| }); | ||
| test("run() passes options to query", async () => { | ||
| let captured = null; | ||
| const query = mockQuery( | ||
| [{ type: "result", subtype: "success", result: "OK" }], | ||
| (params) => { | ||
| captured = params; | ||
| }, | ||
| ); | ||
| const output = new PassThrough(); | ||
| const runner = new AgentRunner({ | ||
| cwd: "/work", | ||
| query, | ||
| output, | ||
| model: "sonnet", | ||
| maxTurns: 10, | ||
| allowedTools: ["Read", "Grep"], | ||
| permissionMode: "plan", | ||
| }); | ||
| await runner.run("My task"); | ||
| assert.strictEqual(captured.prompt, "My task"); | ||
| assert.strictEqual(captured.options.cwd, "/work"); | ||
| assert.strictEqual(captured.options.model, "sonnet"); | ||
| assert.strictEqual(captured.options.maxTurns, 10); | ||
| assert.deepStrictEqual(captured.options.allowedTools, ["Read", "Grep"]); | ||
| assert.strictEqual(captured.options.permissionMode, "plan"); | ||
| assert.strictEqual(captured.options.allowDangerouslySkipPermissions, true); | ||
| }); | ||
| test("run() returns success=false on non-success subtype", async () => { | ||
| const messages = [{ type: "result", subtype: "error", result: "Stopped" }]; | ||
| const output = new PassThrough(); | ||
| const runner = new AgentRunner({ | ||
| cwd: "/tmp", | ||
| query: mockQuery(messages), | ||
| output, | ||
| }); | ||
| const result = await runner.run("Task"); | ||
| assert.strictEqual(result.success, false); | ||
| assert.strictEqual(result.text, "Stopped"); | ||
| }); | ||
| test("resume() passes sessionId via options.resume", async () => { | ||
| let resumeCapture = null; | ||
| const initMessages = [ | ||
| { type: "system", subtype: "init", session_id: "sess-42" }, | ||
| { type: "result", subtype: "success", result: "First done" }, | ||
| ]; | ||
| let callCount = 0; | ||
| const query = async function* (params) { | ||
| callCount++; | ||
| if (callCount === 1) { | ||
| for (const m of initMessages) yield m; | ||
| } else { | ||
| resumeCapture = params; | ||
| yield { type: "result", subtype: "success", result: "Resumed" }; | ||
| } | ||
| }; | ||
| const output = new PassThrough(); | ||
| const runner = new AgentRunner({ cwd: "/tmp", query, output }); | ||
| await runner.run("Initial task"); | ||
| const result = await runner.resume("Follow up"); | ||
| assert.strictEqual(resumeCapture.options.resume, "sess-42"); | ||
| assert.strictEqual(resumeCapture.prompt, "Follow up"); | ||
| assert.strictEqual(result.success, true); | ||
| assert.strictEqual(result.text, "Resumed"); | ||
| }); | ||
| test("drainOutput() returns buffered lines and clears buffer", async () => { | ||
| const messages = [ | ||
| { type: "assistant", content: "Line 1" }, | ||
| { type: "result", subtype: "success", result: "Line 2" }, | ||
| ]; | ||
| const output = new PassThrough(); | ||
| const runner = new AgentRunner({ | ||
| cwd: "/tmp", | ||
| query: mockQuery(messages), | ||
| output, | ||
| }); | ||
| await runner.run("Task"); | ||
| const drained = runner.drainOutput(); | ||
| assert.strictEqual(drained.length, 2); | ||
| assert.deepStrictEqual(JSON.parse(drained[0]), messages[0]); | ||
| assert.deepStrictEqual(JSON.parse(drained[1]), messages[1]); | ||
| // Buffer should be empty after drain | ||
| const secondDrain = runner.drainOutput(); | ||
| assert.strictEqual(secondDrain.length, 0); | ||
| }); | ||
| test("run() captures error when query throws and returns buffered output", async () => { | ||
| async function* failingQuery() { | ||
| yield { type: "system", subtype: "init", session_id: "sess-err" }; | ||
| yield { type: "assistant", content: "Partial work" }; | ||
| throw new Error("Claude Code process exited with code 1"); | ||
| } | ||
| const output = new PassThrough(); | ||
| const runner = new AgentRunner({ | ||
| cwd: "/tmp", | ||
| query: () => failingQuery(), | ||
| output, | ||
| }); | ||
| const result = await runner.run("Task"); | ||
| assert.strictEqual(result.success, false); | ||
| assert.ok(result.error); | ||
| assert.match(result.error.message, /exited with code 1/); | ||
| assert.strictEqual(result.sessionId, "sess-err"); | ||
| // Buffered output should contain the messages yielded before the error | ||
| const drained = runner.drainOutput(); | ||
| assert.strictEqual(drained.length, 2); | ||
| }); | ||
| test("resume() captures error when query throws", async () => { | ||
| const initMessages = [ | ||
| { type: "system", subtype: "init", session_id: "sess-r" }, | ||
| { type: "result", subtype: "success", result: "OK" }, | ||
| ]; | ||
| let callCount = 0; | ||
| const query = async function* () { | ||
| callCount++; | ||
| if (callCount === 1) { | ||
| for (const m of initMessages) yield m; | ||
| } else { | ||
| yield { type: "assistant", content: "Resuming..." }; | ||
| throw new Error("Process crashed"); | ||
| } | ||
| }; | ||
| const output = new PassThrough(); | ||
| const runner = new AgentRunner({ cwd: "/tmp", query, output }); | ||
| await runner.run("Task"); | ||
| const result = await runner.resume("Continue"); | ||
| assert.strictEqual(result.success, false); | ||
| assert.ok(result.error); | ||
| assert.match(result.error.message, /Process crashed/); | ||
| }); | ||
| test("createAgentRunner factory returns an AgentRunner instance", () => { | ||
| const runner = createAgentRunner({ | ||
| cwd: "/tmp", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| }); | ||
| assert.ok(runner instanceof AgentRunner); | ||
| }); | ||
| }); |
| import { describe, test } from "node:test"; | ||
| import assert from "node:assert"; | ||
| import { PassThrough } from "node:stream"; | ||
| import { | ||
| AgentRunner, | ||
| Supervisor, | ||
| createSupervisor, | ||
| } from "@forwardimpact/libeval"; | ||
| import { isDone } from "../src/supervisor.js"; | ||
| /** | ||
| * Create a mock AgentRunner that yields pre-scripted responses. | ||
| * Each call to run() or resume() pops the next response from the array. | ||
| * @param {object[]} responses - Array of {text, success} objects | ||
| * @param {object[]} [messages] - Messages to buffer per turn | ||
| * @returns {AgentRunner} | ||
| */ | ||
| function createMockRunner(responses, messages) { | ||
| const output = new PassThrough(); | ||
| let callIndex = 0; | ||
| const runner = new AgentRunner({ | ||
| cwd: "/tmp", | ||
| query: async function* () {}, | ||
| output, | ||
| }); | ||
| // Override run and resume to return scripted responses | ||
| runner.run = async (_task) => { | ||
| const resp = responses[callIndex++]; | ||
| // Buffer messages for drainOutput | ||
| const msgs = messages?.[callIndex - 1] ?? [ | ||
| { type: "assistant", content: resp.text }, | ||
| ]; | ||
| for (const m of msgs) { | ||
| runner.buffer.push(JSON.stringify(m)); | ||
| } | ||
| runner.sessionId = "mock-session"; | ||
| return { | ||
| success: resp.success ?? true, | ||
| text: resp.text, | ||
| sessionId: "mock-session", | ||
| }; | ||
| }; | ||
| runner.resume = async (_prompt) => { | ||
| const resp = responses[callIndex++]; | ||
| const msgs = messages?.[callIndex - 1] ?? [ | ||
| { type: "assistant", content: resp.text }, | ||
| ]; | ||
| for (const m of msgs) { | ||
| runner.buffer.push(JSON.stringify(m)); | ||
| } | ||
| return { success: resp.success ?? true, text: resp.text }; | ||
| }; | ||
| return runner; | ||
| } | ||
| describe("isDone", () => { | ||
| test("detects EVALUATION_COMPLETE on its own line", () => { | ||
| assert.strictEqual(isDone("EVALUATION_COMPLETE"), true); | ||
| assert.strictEqual( | ||
| isDone("Some text\nEVALUATION_COMPLETE\nMore text"), | ||
| true, | ||
| ); | ||
| assert.strictEqual(isDone("Done.\n\nEVALUATION_COMPLETE"), true); | ||
| }); | ||
| test("does not match EVALUATION_COMPLETE embedded in text", () => { | ||
| assert.strictEqual(isDone("not EVALUATION_COMPLETE yet"), false); | ||
| assert.strictEqual(isDone("The agent is EVALUATION_COMPLETE done"), false); | ||
| assert.strictEqual(isDone("EVALUATION_COMPLETE_EXTRA"), false); | ||
| }); | ||
| test("does not match empty or unrelated text", () => { | ||
| assert.strictEqual(isDone(""), false); | ||
| assert.strictEqual(isDone("All done!"), false); | ||
| assert.strictEqual(isDone("DONE"), false); | ||
| }); | ||
| }); | ||
| describe("Supervisor", () => { | ||
| test("constructor throws on missing agentRunner", () => { | ||
| assert.throws( | ||
| () => | ||
| new Supervisor({ | ||
| supervisorRunner: createMockRunner([]), | ||
| output: new PassThrough(), | ||
| }), | ||
| /agentRunner is required/, | ||
| ); | ||
| }); | ||
| test("constructor throws on missing supervisorRunner", () => { | ||
| assert.throws( | ||
| () => | ||
| new Supervisor({ | ||
| agentRunner: createMockRunner([]), | ||
| output: new PassThrough(), | ||
| }), | ||
| /supervisorRunner is required/, | ||
| ); | ||
| }); | ||
| test("constructor throws on missing output", () => { | ||
| assert.throws( | ||
| () => | ||
| new Supervisor({ | ||
| agentRunner: createMockRunner([]), | ||
| supervisorRunner: createMockRunner([]), | ||
| }), | ||
| /output is required/, | ||
| ); | ||
| }); | ||
| test("completes on EVALUATION_COMPLETE from supervisor", async () => { | ||
| const agentRunner = createMockRunner([ | ||
| { text: "I installed the packages." }, | ||
| ]); | ||
| const supervisorRunner = createMockRunner([ | ||
| { text: "Good work.\n\nEVALUATION_COMPLETE" }, | ||
| ]); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| const result = await supervisor.run("Install stuff"); | ||
| assert.strictEqual(result.success, true); | ||
| assert.strictEqual(result.turns, 1); | ||
| }); | ||
| test("runs multiple turns before completion", async () => { | ||
| const agentRunner = createMockRunner([ | ||
| { text: "Started working." }, | ||
| { text: "Made progress." }, | ||
| { text: "Finished everything." }, | ||
| ]); | ||
| const supervisorRunner = createMockRunner([ | ||
| { text: "Keep going, you need to do more." }, | ||
| { text: "Almost there, continue." }, | ||
| { text: "EVALUATION_COMPLETE" }, | ||
| ]); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| const result = await supervisor.run("Do the work"); | ||
| assert.strictEqual(result.success, true); | ||
| assert.strictEqual(result.turns, 3); | ||
| }); | ||
| test("enforces maxTurns limit", async () => { | ||
| // Agent responds to every turn, supervisor never says done | ||
| const agentRunner = createMockRunner([ | ||
| { text: "Turn 0" }, | ||
| { text: "Turn 1" }, | ||
| { text: "Turn 2" }, | ||
| ]); | ||
| const supervisorRunner = createMockRunner([ | ||
| { text: "Continue." }, | ||
| { text: "Continue." }, | ||
| ]); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 2, | ||
| }); | ||
| const result = await supervisor.run("Endless task"); | ||
| assert.strictEqual(result.success, false); | ||
| assert.strictEqual(result.turns, 2); | ||
| }); | ||
| test("output contains tagged lines with correct source and turn", async () => { | ||
| const agentMessages = [[{ type: "assistant", content: "Working" }]]; | ||
| const supervisorMessages = [ | ||
| [{ type: "assistant", content: "EVALUATION_COMPLETE" }], | ||
| ]; | ||
| const agentRunner = createMockRunner([{ text: "Working" }], agentMessages); | ||
| const supervisorRunner = createMockRunner( | ||
| [{ text: "EVALUATION_COMPLETE" }], | ||
| supervisorMessages, | ||
| ); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| await supervisor.run("Task"); | ||
| const data = output.read()?.toString() ?? ""; | ||
| const lines = data | ||
| .trim() | ||
| .split("\n") | ||
| .filter((l) => l.length > 0); | ||
| // Should have: agent turn 0, supervisor turn 1, orchestrator summary | ||
| assert.ok(lines.length >= 3); | ||
| const agentLine = JSON.parse(lines[0]); | ||
| assert.strictEqual(agentLine.source, "agent"); | ||
| assert.strictEqual(agentLine.turn, 0); | ||
| assert.ok("event" in agentLine); | ||
| const supervisorLine = JSON.parse(lines[1]); | ||
| assert.strictEqual(supervisorLine.source, "supervisor"); | ||
| assert.strictEqual(supervisorLine.turn, 1); | ||
| assert.ok("event" in supervisorLine); | ||
| const summaryLine = JSON.parse(lines[lines.length - 1]); | ||
| assert.strictEqual(summaryLine.source, "orchestrator"); | ||
| assert.strictEqual(summaryLine.type, "summary"); | ||
| assert.strictEqual(summaryLine.success, true); | ||
| }); | ||
| test("events are nested under event key (no field collisions)", async () => { | ||
| const sourceEvent = { | ||
| type: "assistant", | ||
| source: "sdk-internal", | ||
| content: "test", | ||
| }; | ||
| const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]); | ||
| const supervisorRunner = createMockRunner( | ||
| [{ text: "EVALUATION_COMPLETE" }], | ||
| [[{ type: "assistant", content: "ok" }]], | ||
| ); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| await supervisor.run("Task"); | ||
| const data = output.read()?.toString() ?? ""; | ||
| const lines = data | ||
| .trim() | ||
| .split("\n") | ||
| .filter((l) => l.length > 0); | ||
| const tagged = JSON.parse(lines[0]); | ||
| // The original event's `source` field is preserved inside `event` | ||
| assert.strictEqual(tagged.source, "agent"); | ||
| assert.strictEqual(tagged.event.source, "sdk-internal"); | ||
| }); | ||
| test("drains agent output and emits summary when agent errors on turn 0", async () => { | ||
| const agentMessages = [[{ type: "assistant", content: "Partial work" }]]; | ||
| const agentRunner = createMockRunner( | ||
| [{ text: "Partial work", success: false }], | ||
| agentMessages, | ||
| ); | ||
| // Override run to simulate an error return | ||
| const origRun = agentRunner.run; | ||
| agentRunner.run = async (task) => { | ||
| const result = await origRun.call(agentRunner, task); | ||
| return { ...result, error: new Error("Process exited with code 1") }; | ||
| }; | ||
| const supervisorRunner = createMockRunner([]); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| const result = await supervisor.run("Task"); | ||
| assert.strictEqual(result.success, false); | ||
| assert.strictEqual(result.turns, 0); | ||
| // Output should still contain the agent's buffered lines + summary | ||
| const data = output.read()?.toString() ?? ""; | ||
| const lines = data | ||
| .trim() | ||
| .split("\n") | ||
| .filter((l) => l.length > 0); | ||
| assert.ok(lines.length >= 2, "Expected at least agent line + summary"); | ||
| const agentLine = JSON.parse(lines[0]); | ||
| assert.strictEqual(agentLine.source, "agent"); | ||
| assert.strictEqual(agentLine.turn, 0); | ||
| const summaryLine = JSON.parse(lines[lines.length - 1]); | ||
| assert.strictEqual(summaryLine.source, "orchestrator"); | ||
| assert.strictEqual(summaryLine.success, false); | ||
| assert.strictEqual(summaryLine.turns, 0); | ||
| }); | ||
| test("createSupervisor factory returns a Supervisor instance", () => { | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd: "/tmp/sup", | ||
| agentCwd: "/tmp/agent", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| }); | ||
| assert.ok(supervisor instanceof Supervisor); | ||
| }); | ||
| }); |
| import { describe, test } from "node:test"; | ||
| import assert from "node:assert"; | ||
| import { PassThrough } from "node:stream"; | ||
| import { TeeWriter, createTeeWriter } from "@forwardimpact/libeval"; | ||
| /** | ||
| * Collect all data written to a PassThrough stream as a string. | ||
| * @param {PassThrough} stream | ||
| * @returns {string} | ||
| */ | ||
| function collect(stream) { | ||
| const data = stream.read(); | ||
| return data ? data.toString() : ""; | ||
| } | ||
| /** | ||
| * Write lines to a TeeWriter and wait for it to finish. | ||
| * @param {TeeWriter} writer | ||
| * @param {string[]} lines - JSON lines to write | ||
| */ | ||
| async function writeLines(writer, lines) { | ||
| for (const line of lines) { | ||
| writer.write(line + "\n"); | ||
| } | ||
| await new Promise((resolve) => writer.end(resolve)); | ||
| } | ||
| describe("TeeWriter", () => { | ||
| test("constructor throws on missing fileStream", () => { | ||
| assert.throws( | ||
| () => new TeeWriter({ textStream: new PassThrough() }), | ||
| /fileStream is required/, | ||
| ); | ||
| }); | ||
| test("constructor throws on missing textStream", () => { | ||
| assert.throws( | ||
| () => new TeeWriter({ fileStream: new PassThrough() }), | ||
| /textStream is required/, | ||
| ); | ||
| }); | ||
| test("writes NDJSON to fileStream and text to textStream in raw mode", async () => { | ||
| const fileStream = new PassThrough(); | ||
| const textStream = new PassThrough(); | ||
| const writer = new TeeWriter({ fileStream, textStream, mode: "raw" }); | ||
| const events = [ | ||
| JSON.stringify({ | ||
| type: "system", | ||
| subtype: "init", | ||
| session_id: "s1", | ||
| model: "opus", | ||
| }), | ||
| JSON.stringify({ | ||
| type: "assistant", | ||
| message: { | ||
| content: [{ type: "text", text: "Hello world" }], | ||
| usage: { input_tokens: 10, output_tokens: 5 }, | ||
| }, | ||
| }), | ||
| JSON.stringify({ | ||
| type: "assistant", | ||
| message: { | ||
| content: [ | ||
| { | ||
| type: "tool_use", | ||
| id: "t1", | ||
| name: "Bash", | ||
| input: { command: "ls" }, | ||
| }, | ||
| ], | ||
| usage: { input_tokens: 20, output_tokens: 10 }, | ||
| }, | ||
| }), | ||
| JSON.stringify({ | ||
| type: "result", | ||
| subtype: "success", | ||
| duration_ms: 5000, | ||
| num_turns: 2, | ||
| total_cost_usd: 0.05, | ||
| usage: { input_tokens: 30, output_tokens: 15 }, | ||
| }), | ||
| ]; | ||
| await writeLines(writer, events); | ||
| const fileData = collect(fileStream); | ||
| const textData = collect(textStream); | ||
| // File should contain all NDJSON lines | ||
| const fileLines = fileData.trim().split("\n"); | ||
| assert.strictEqual(fileLines.length, 4); | ||
| assert.deepStrictEqual(JSON.parse(fileLines[0]).type, "system"); | ||
| assert.deepStrictEqual(JSON.parse(fileLines[3]).type, "result"); | ||
| // Text should contain human-readable output | ||
| assert.ok(textData.includes("Hello world")); | ||
| assert.ok(textData.includes("> Tool: Bash")); | ||
| assert.ok(textData.includes("--- Result: success")); | ||
| }); | ||
| test("streams text incrementally as events arrive", async () => { | ||
| const fileStream = new PassThrough(); | ||
| const textStream = new PassThrough(); | ||
| const writer = new TeeWriter({ fileStream, textStream, mode: "raw" }); | ||
| // Write first assistant message | ||
| writer.write( | ||
| JSON.stringify({ | ||
| type: "assistant", | ||
| message: { | ||
| content: [{ type: "text", text: "First message" }], | ||
| usage: { input_tokens: 10, output_tokens: 5 }, | ||
| }, | ||
| }) + "\n", | ||
| ); | ||
| // Text should be available before stream ends | ||
| const firstText = collect(textStream); | ||
| assert.ok(firstText.includes("First message")); | ||
| writer.write( | ||
| JSON.stringify({ | ||
| type: "assistant", | ||
| message: { | ||
| content: [{ type: "text", text: "Second message" }], | ||
| usage: { input_tokens: 20, output_tokens: 10 }, | ||
| }, | ||
| }) + "\n", | ||
| ); | ||
| const secondText = collect(textStream); | ||
| assert.ok(secondText.includes("Second message")); | ||
| await new Promise((resolve) => writer.end(resolve)); | ||
| }); | ||
| test("supervised mode shows source labels and unwraps events", async () => { | ||
| const fileStream = new PassThrough(); | ||
| const textStream = new PassThrough(); | ||
| const writer = new TeeWriter({ | ||
| fileStream, | ||
| textStream, | ||
| mode: "supervised", | ||
| }); | ||
| const events = [ | ||
| JSON.stringify({ | ||
| source: "agent", | ||
| turn: 0, | ||
| event: { | ||
| type: "assistant", | ||
| message: { | ||
| content: [{ type: "text", text: "Working on it" }], | ||
| usage: { input_tokens: 10, output_tokens: 5 }, | ||
| }, | ||
| }, | ||
| }), | ||
| JSON.stringify({ | ||
| source: "supervisor", | ||
| turn: 1, | ||
| event: { | ||
| type: "assistant", | ||
| message: { | ||
| content: [{ type: "text", text: "Looks good" }], | ||
| usage: { input_tokens: 20, output_tokens: 10 }, | ||
| }, | ||
| }, | ||
| }), | ||
| JSON.stringify({ | ||
| source: "orchestrator", | ||
| type: "summary", | ||
| success: true, | ||
| turns: 1, | ||
| }), | ||
| ]; | ||
| await writeLines(writer, events); | ||
| const fileData = collect(fileStream); | ||
| const textData = collect(textStream); | ||
| // File should contain all raw tagged NDJSON | ||
| const fileLines = fileData.trim().split("\n"); | ||
| assert.strictEqual(fileLines.length, 3); | ||
| assert.strictEqual(JSON.parse(fileLines[0]).source, "agent"); | ||
| // Text should show source labels | ||
| assert.ok(textData.includes("[agent]")); | ||
| assert.ok(textData.includes("Working on it")); | ||
| assert.ok(textData.includes("[supervisor]")); | ||
| assert.ok(textData.includes("Looks good")); | ||
| assert.ok(textData.includes("Evaluation completed after 1 turns")); | ||
| }); | ||
| test("supervised mode shows incomplete status on failure", async () => { | ||
| const fileStream = new PassThrough(); | ||
| const textStream = new PassThrough(); | ||
| const writer = new TeeWriter({ | ||
| fileStream, | ||
| textStream, | ||
| mode: "supervised", | ||
| }); | ||
| await writeLines(writer, [ | ||
| JSON.stringify({ | ||
| source: "orchestrator", | ||
| type: "summary", | ||
| success: false, | ||
| turns: 5, | ||
| }), | ||
| ]); | ||
| const textData = collect(textStream); | ||
| assert.ok(textData.includes("Evaluation incomplete after 5 turns")); | ||
| }); | ||
| test("supervised mode only shows source label on change", async () => { | ||
| const fileStream = new PassThrough(); | ||
| const textStream = new PassThrough(); | ||
| const writer = new TeeWriter({ | ||
| fileStream, | ||
| textStream, | ||
| mode: "supervised", | ||
| }); | ||
| const events = [ | ||
| JSON.stringify({ | ||
| source: "agent", | ||
| turn: 0, | ||
| event: { | ||
| type: "assistant", | ||
| message: { | ||
| content: [{ type: "text", text: "Step 1" }], | ||
| usage: { input_tokens: 10, output_tokens: 5 }, | ||
| }, | ||
| }, | ||
| }), | ||
| JSON.stringify({ | ||
| source: "agent", | ||
| turn: 0, | ||
| event: { | ||
| type: "assistant", | ||
| message: { | ||
| content: [{ type: "text", text: "Step 2" }], | ||
| usage: { input_tokens: 10, output_tokens: 5 }, | ||
| }, | ||
| }, | ||
| }), | ||
| ]; | ||
| await writeLines(writer, events); | ||
| const textData = collect(textStream); | ||
| // [agent] label should appear only once | ||
| const agentLabels = textData.split("[agent]").length - 1; | ||
| assert.strictEqual(agentLabels, 1); | ||
| }); | ||
| test("handles partial lines across chunks", async () => { | ||
| const fileStream = new PassThrough(); | ||
| const textStream = new PassThrough(); | ||
| const writer = new TeeWriter({ fileStream, textStream, mode: "raw" }); | ||
| const fullLine = JSON.stringify({ | ||
| type: "assistant", | ||
| message: { | ||
| content: [{ type: "text", text: "Split message" }], | ||
| usage: { input_tokens: 10, output_tokens: 5 }, | ||
| }, | ||
| }); | ||
| // Split the line across two chunks | ||
| const mid = Math.floor(fullLine.length / 2); | ||
| writer.write(fullLine.slice(0, mid)); | ||
| writer.write(fullLine.slice(mid) + "\n"); | ||
| await new Promise((resolve) => writer.end(resolve)); | ||
| const textData = collect(textStream); | ||
| assert.ok(textData.includes("Split message")); | ||
| }); | ||
| test("truncates long tool input", async () => { | ||
| const fileStream = new PassThrough(); | ||
| const textStream = new PassThrough(); | ||
| const writer = new TeeWriter({ fileStream, textStream, mode: "raw" }); | ||
| const longInput = { command: "x".repeat(300) }; | ||
| const event = JSON.stringify({ | ||
| type: "assistant", | ||
| message: { | ||
| content: [ | ||
| { type: "tool_use", id: "t1", name: "Bash", input: longInput }, | ||
| ], | ||
| usage: { input_tokens: 10, output_tokens: 5 }, | ||
| }, | ||
| }); | ||
| await writeLines(writer, [event]); | ||
| const textData = collect(textStream); | ||
| assert.ok(textData.includes("> Tool: Bash")); | ||
| assert.ok(textData.includes("...")); | ||
| // Truncated to ~200 chars | ||
| const toolLine = textData.split("\n").find((l) => l.startsWith("> Tool:")); | ||
| assert.ok(toolLine.length < 250); | ||
| }); | ||
| test("defaults to raw mode", () => { | ||
| const writer = new TeeWriter({ | ||
| fileStream: new PassThrough(), | ||
| textStream: new PassThrough(), | ||
| }); | ||
| assert.strictEqual(writer.mode, "raw"); | ||
| }); | ||
| test("createTeeWriter factory returns a TeeWriter instance", () => { | ||
| const writer = createTeeWriter({ | ||
| fileStream: new PassThrough(), | ||
| textStream: new PassThrough(), | ||
| }); | ||
| assert.ok(writer instanceof TeeWriter); | ||
| }); | ||
| }); |
+26
-1
@@ -1,5 +0,7 @@ | ||
| #!/usr/bin/env node | ||
| #!/usr/bin/env bun | ||
| import { runOutputCommand } from "../src/commands/output.js"; | ||
| import { runTeeCommand } from "../src/commands/tee.js"; | ||
| import { runRunCommand } from "../src/commands/run.js"; | ||
| import { runSuperviseCommand } from "../src/commands/supervise.js"; | ||
@@ -9,2 +11,4 @@ const COMMANDS = { | ||
| tee: runTeeCommand, | ||
| run: runRunCommand, | ||
| supervise: runSuperviseCommand, | ||
| }; | ||
@@ -21,3 +25,22 @@ | ||
| tee [output.ndjson] Stream text to stdout, optionally save raw NDJSON | ||
| run [options] Run a single agent via the Claude Agent SDK | ||
| supervise [options] Run a supervised agent ↔ supervisor relay loop | ||
| Run options: | ||
| --task=PATH Path to task file (required) | ||
| --cwd=DIR Agent working directory (default: .) | ||
| --model=MODEL Claude model to use (default: opus) | ||
| --max-turns=N Maximum agentic turns (default: 50) | ||
| --output=PATH Write NDJSON trace to file (default: stdout) | ||
| --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit) | ||
| Supervise options: | ||
| --task=PATH Path to task file (required) | ||
| --supervisor-cwd=DIR Supervisor working directory (default: .) | ||
| --agent-cwd=DIR Agent working directory (default: temp directory) | ||
| --model=MODEL Claude model to use (default: opus) | ||
| --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20) | ||
| --output=PATH Write NDJSON trace to file (default: stdout) | ||
| --allowed-tools=LIST Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit) | ||
| Options: | ||
@@ -32,2 +55,4 @@ --help Show this help message | ||
| fit-eval tee output.ndjson < trace.ndjson | ||
| fit-eval run --task=.github/tasks/security-audit.md --model=opus | ||
| fit-eval supervise --task=scenarios/guide-setup/task.md --supervisor-cwd=. | ||
| `.trim(); | ||
@@ -34,0 +59,0 @@ |
+3
-0
| export { TraceCollector, createTraceCollector } from "./src/trace-collector.js"; | ||
| export { AgentRunner, createAgentRunner } from "./src/agent-runner.js"; | ||
| export { Supervisor, createSupervisor } from "./src/supervisor.js"; | ||
| export { TeeWriter, createTeeWriter } from "./src/tee-writer.js"; |
+6
-3
| { | ||
| "name": "@forwardimpact/libeval", | ||
| "version": "0.1.0", | ||
| "version": "0.1.1", | ||
| "description": "Process Claude Code stream-json output into structured traces", | ||
@@ -13,7 +13,10 @@ "license": "Apache-2.0", | ||
| "engines": { | ||
| "node": ">=22.0.0" | ||
| "bun": ">=1.2.0" | ||
| }, | ||
| "scripts": { | ||
| "test": "node --test test/*.test.js" | ||
| "test": "bun run node --test test/*.test.js" | ||
| }, | ||
| "dependencies": { | ||
| "@anthropic-ai/claude-agent-sdk": "^0.1.0" | ||
| }, | ||
| "publishConfig": { | ||
@@ -20,0 +23,0 @@ "access": "public" |
Major refactor
Supply chain riskPackage has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
Long strings
Supply chain riskContains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
86338
129.36%17
88.89%2123
197.34%1
Infinity%3
50%1
Infinity%+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added