Big News: Socket raises $60M Series C at a $1B valuation to secure software supply chains for AI-driven development.Announcement
Sign In

@forwardimpact/libeval

Package Overview
Dependencies
Maintainers
1
Versions
50
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@forwardimpact/libeval - npm Package Compare versions

Comparing version
0.1.3
to
0.1.5
+12
-4
bin/fit-eval.js

@@ -28,3 +28,4 @@ #!/usr/bin/env node

Run options:
--task=PATH Path to task file (required)
--task-file=PATH Path to task file (mutually exclusive with --task-text)
--task-text=STRING Inline task text (mutually exclusive with --task-file)
--cwd=DIR Agent working directory (default: .)

@@ -35,5 +36,7 @@ --model=MODEL Claude model to use (default: opus)

--allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
--agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
Supervise options:
--task=PATH Path to task file (required)
--task-file=PATH Path to task file (mutually exclusive with --task-text)
--task-text=STRING Inline task text (mutually exclusive with --task-file)
--supervisor-cwd=DIR Supervisor working directory (default: .)

@@ -45,2 +48,6 @@ --agent-cwd=DIR Agent working directory (default: temp directory)

--allowed-tools=LIST Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)
--supervisor-allowed-tools=LIST
Comma-separated tools for supervisor (default: Bash,Read,Glob,Grep,Write,Edit)
--supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
--agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)

@@ -56,4 +63,5 @@ Options:

fit-eval tee output.ndjson < trace.ndjson
fit-eval run --task=.github/tasks/security-audit.md --model=opus
fit-eval supervise --task=scenarios/guide-setup/task.md --supervisor-cwd=.
fit-eval run --task-text="Perform a security audit of the repository." --model=opus
fit-eval run --task-file=scenarios/guide-setup/task.md --model=opus
fit-eval supervise --task-file=scenarios/guide-setup/task.md --supervisor-cwd=.
`.trim();

@@ -60,0 +68,0 @@

export { TraceCollector, createTraceCollector } from "./src/trace-collector.js";
export { AgentRunner, createAgentRunner } from "./src/agent-runner.js";
export { Supervisor, createSupervisor } from "./src/supervisor.js";
export {
Supervisor,
createSupervisor,
SUPERVISOR_SYSTEM_PROMPT,
AGENT_SYSTEM_PROMPT,
} from "./src/supervisor.js";
export { TeeWriter, createTeeWriter } from "./src/tee-writer.js";
{
"name": "@forwardimpact/libeval",
"version": "0.1.3",
"version": "0.1.5",
"description": "Process Claude Code stream-json output into structured traces",

@@ -13,3 +13,4 @@ "license": "Apache-2.0",

"engines": {
"bun": ">=1.2.0"
"bun": ">=1.2.0",
"node": ">=18.0.0"
},

@@ -20,3 +21,3 @@ "scripts": {

"dependencies": {
"@anthropic-ai/claude-agent-sdk": "^0.1.0"
"@anthropic-ai/claude-agent-sdk": "^0.2.91"
},

@@ -23,0 +24,0 @@ "publishConfig": {

@@ -21,2 +21,5 @@ /**

* @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)
* @param {string} [deps.agentProfile] - Agent profile name to pass as --agent to the Claude CLI
* @param {string|object} [deps.systemPrompt] - SDK system prompt (string replaces default; {type:'preset', preset:'claude_code', append} appends)
* @param {string[]} [deps.disallowedTools] - Tools to explicitly remove from the model's context
*/

@@ -33,2 +36,5 @@ constructor({

settingSources,
agentProfile,
systemPrompt,
disallowedTools,
}) {

@@ -54,2 +60,5 @@ if (!cwd) throw new Error("cwd is required");

this.settingSources = settingSources ?? [];
this.agentProfile = agentProfile ?? null;
this.systemPrompt = systemPrompt ?? null;
this.disallowedTools = disallowedTools ?? [];
this.sessionId = null;

@@ -80,2 +89,7 @@ this.buffer = [];

settingSources: this.settingSources,
...(this.disallowedTools.length > 0 && {
disallowedTools: this.disallowedTools,
}),
...(this.systemPrompt && { systemPrompt: this.systemPrompt }),
...(this.agentProfile && { extraArgs: { agent: this.agentProfile } }),
},

@@ -120,3 +134,7 @@ })) {

prompt,
options: { resume: this.sessionId },
options: {
resume: this.sessionId,
permissionMode: this.permissionMode,
allowDangerouslySkipPermissions: true,
},
})) {

@@ -123,0 +141,0 @@ const line = JSON.stringify(message);

@@ -27,3 +27,4 @@ import { readFileSync, createWriteStream } from "node:fs";

* Options:
* --task=PATH Path to task file (required)
* --task-file=PATH Path to task file (mutually exclusive with --task-text)
* --task-text=STRING Inline task text (mutually exclusive with --task-file)
* --cwd=DIR Agent working directory (default: .)

@@ -34,2 +35,3 @@ * --model=MODEL Claude model to use (default: opus)

* --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
* --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
*

@@ -39,4 +41,8 @@ * @param {string[]} args - Command arguments

export async function runRunCommand(args) {
const task = parseFlag(args, "task");
if (!task) throw new Error("--task is required");
const taskFile = parseFlag(args, "task-file");
const taskText = parseFlag(args, "task-text");
if (taskFile && taskText)
throw new Error("--task-file and --task-text are mutually exclusive");
if (!taskFile && !taskText)
throw new Error("--task-file or --task-text is required");

@@ -47,2 +53,3 @@ const cwd = resolve(parseFlag(args, "cwd") ?? ".");

const outputPath = parseFlag(args, "output");
const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
const allowedTools = (

@@ -52,3 +59,3 @@ parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"

const taskContent = readFileSync(task, "utf8");
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;

@@ -71,2 +78,3 @@ // When --output is specified, stream text to stdout while writing NDJSON to file.

settingSources: ["project"],
agentProfile,
});

@@ -73,0 +81,0 @@

@@ -28,3 +28,4 @@ import { readFileSync, createWriteStream, mkdtempSync } from "node:fs";

* Options:
* --task=PATH Path to task file (required)
* --task-file=PATH Path to task file (mutually exclusive with --task-text)
* --task-text=STRING Inline task text (mutually exclusive with --task-file)
* --supervisor-cwd=DIR Supervisor working directory (default: .)

@@ -36,2 +37,4 @@ * --agent-cwd=DIR Agent working directory (default: temp directory)

* --allowed-tools=LIST Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit)
* --supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
* --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
*

@@ -41,4 +44,8 @@ * @param {string[]} args - Command arguments

export async function runSuperviseCommand(args) {
const task = parseFlag(args, "task");
if (!task) throw new Error("--task is required");
const taskFile = parseFlag(args, "task-file");
const taskText = parseFlag(args, "task-text");
if (taskFile && taskText)
throw new Error("--task-file and --task-text are mutually exclusive");
if (!taskFile && !taskText)
throw new Error("--task-file or --task-text is required");

@@ -53,7 +60,13 @@ const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");

const outputPath = parseFlag(args, "output");
const supervisorProfile = parseFlag(args, "supervisor-profile") ?? undefined;
const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
const allowedTools = (
parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
).split(",");
const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");
const supervisorAllowedTools = supervisorAllowedToolsRaw
? supervisorAllowedToolsRaw.split(",")
: undefined;
const taskContent = readFileSync(task, "utf8");
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;

@@ -80,2 +93,5 @@ // When --output is specified, stream text to stdout while writing NDJSON to file.

allowedTools,
supervisorAllowedTools,
supervisorProfile,
agentProfile,
});

@@ -82,0 +98,0 @@

/**
* Supervisor — orchestrates a relay loop between an agent and a supervisor,
* both running as AgentRunner instances. The agent works on a task while the
* supervisor observes and decides when the evaluation is complete.
* both running as AgentRunner instances. The supervisor receives the task first,
* introduces itself, and delegates work to the agent. The loop then alternates:
* agent → supervisor → agent.
*

@@ -11,14 +12,26 @@ * Follows OO+DI: constructor injection, factory function, tests bypass factory.

import { createAgentRunner } from "./agent-runner.js";
import { TraceCollector } from "./trace-collector.js";
/**
* Check if the supervisor's response signals evaluation completion.
* Uses a structured signal — `EVALUATION_COMPLETE` on its own line —
* to avoid false positives from natural language.
* Check if the supervisor's response signals evaluation success.
* Matches EVALUATION_SUCCESSFUL anywhere in the text, tolerating markdown
* formatting (e.g. **EVALUATION_SUCCESSFUL**). Uses word boundaries to
* avoid matching inside longer identifiers.
* @param {string} text
* @returns {boolean}
*/
export function isDone(text) {
return /^EVALUATION_COMPLETE$/m.test(text);
export function isSuccessful(text) {
  // The signal may sit at the start/end of a line, be surrounded by
  // whitespace, be wrapped in markdown emphasis (*, **, _, __, ~, `), or be
  // followed by sentence punctuation.
  //
  // Underscores need special care because they are both a markdown emphasis
  // marker and an identifier character. A run of underscores next to the
  // signal is accepted only when the far side of that run is not a letter,
  // digit, or underscore. That keeps `__EVALUATION_SUCCESSFUL__` matching
  // while rejecting longer identifiers such as `EVALUATION_SUCCESSFUL_EXTRA`
  // or `NOT_EVALUATION_SUCCESSFUL`.
  const signal =
    /(?:^|[\s*~`]|(?<![A-Za-z0-9_])_+)EVALUATION_SUCCESSFUL(?:[\s*~`.,!?]|$|_+(?![A-Za-z0-9_]))/m;
  return signal.test(text);
}
/**
 * Text appended to the supervisor runner's system prompt in supervise mode.
 * Explains the relay and names the success signal the supervisor must write.
 */
export const SUPERVISOR_SYSTEM_PROMPT = [
  "You supervise another AI agent through a relay — your output becomes the agent's next input.",
  "Guide the agent, answer its questions, and write EVALUATION_SUCCESSFUL when their task is complete.",
].join(" ");

/**
 * Text appended to the agent runner's system prompt in supervise mode.
 * Tells the agent it is supervised and should ask when uncertain.
 */
export const AGENT_SYSTEM_PROMPT = [
  "You are being supervised by another AI agent.",
  "When requirements are ambiguous or you are uncertain, stop and ask a clarifying question before proceeding.",
].join(" ");
export class Supervisor {

@@ -48,12 +61,14 @@ /**

* Run the supervisor ↔ agent relay loop.
* @param {string} task - The initial task for the agent
* The supervisor receives the task first, introduces itself, and delegates
* work to the agent. The loop then alternates: agent → supervisor → agent.
* @param {string} task - The initial task for the supervisor
* @returns {Promise<{success: boolean, turns: number}>}
*/
async run(task) {
// Turn 0: Agent receives the task and starts working
this.currentSource = "agent";
// Turn 0: Supervisor receives the task and introduces it to the agent
this.currentSource = "supervisor";
this.currentTurn = 0;
let agentResult = await this.agentRunner.run(task);
let supervisorResult = await this.supervisorRunner.run(task);
if (agentResult.error) {
if (supervisorResult.error) {
this.emitSummary({ success: false, turns: 0 });

@@ -63,18 +78,21 @@ return { success: false, turns: 0 };

// The supervisor's turn is fully complete (all tool calls executed) by the
// time we check the signal — no work is interrupted.
if (isSuccessful(supervisorResult.text)) {
this.emitSummary({ success: true, turns: 0 });
return { success: true, turns: 0 };
}
for (let turn = 1; turn <= this.maxTurns; turn++) {
// Supervisor observes the agent's output
const supervisorPrompt =
`The agent reported:\n\n${agentResult.text}\n\n` +
`Decide: provide guidance, answer a question, or say EVALUATION_COMPLETE on its own line.`;
this.currentSource = "supervisor";
// Supervisor's output becomes the agent's input
this.currentSource = "agent";
this.currentTurn = turn;
let supervisorResult;
let agentResult;
if (turn === 1) {
supervisorResult = await this.supervisorRunner.run(supervisorPrompt);
agentResult = await this.agentRunner.run(supervisorResult.text);
} else {
supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
agentResult = await this.agentRunner.resume(supervisorResult.text);
}
if (supervisorResult.error) {
if (agentResult.error) {
this.emitSummary({ success: false, turns: turn });

@@ -84,16 +102,24 @@ return { success: false, turns: turn };

if (isDone(supervisorResult.text)) {
this.emitSummary({ success: true, turns: turn });
return { success: true, turns: turn };
}
// Build the full agent transcript from buffered NDJSON events so the
// supervisor sees tool calls and reasoning, not just the SDK result summary.
const agentTranscript = this.extractTranscript(this.agentRunner);
// Supervisor's response becomes the agent's next input
this.currentSource = "agent";
const supervisorPrompt =
`The agent reported:\n\n${agentTranscript}\n\n` +
`Review the agent's work and decide how to proceed.`;
this.currentSource = "supervisor";
this.currentTurn = turn;
agentResult = await this.agentRunner.resume(supervisorResult.text);
supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
if (agentResult.error) {
if (supervisorResult.error) {
this.emitSummary({ success: false, turns: turn });
return { success: false, turns: turn };
}
// The supervisor's turn is fully complete — check for success signal.
if (isSuccessful(supervisorResult.text)) {
this.emitSummary({ success: true, turns: turn });
return { success: true, turns: turn };
}
}

@@ -106,2 +132,17 @@

/**
* Extract a human-readable transcript from an AgentRunner's buffered output.
* Drains the buffer and replays events through a TraceCollector.
* @param {import("./agent-runner.js").AgentRunner} runner
* @returns {string}
*/
extractTranscript(runner) {
const lines = runner.drainOutput();
const collector = new TraceCollector();
for (const line of lines) {
collector.addLine(line);
}
return collector.toText() || "[The agent produced no output.]";
}
/**
* Emit a single NDJSON line tagged with the current source and turn.

@@ -146,2 +187,6 @@ * Called in real-time via the AgentRunner onLine callback.

* @param {string[]} [deps.allowedTools] - Tools the agent may use
* @param {string[]} [deps.supervisorAllowedTools] - Tools the supervisor may use (default: Bash, Read, Glob, Grep, Write, Edit)
* @param {string[]} [deps.supervisorDisallowedTools] - Tools to explicitly block from the supervisor
* @param {string} [deps.supervisorProfile] - Supervisor agent profile name
* @param {string} [deps.agentProfile] - Agent profile name
* @returns {Supervisor}

@@ -157,2 +202,6 @@ */

allowedTools,
supervisorDisallowedTools,
supervisorAllowedTools,
supervisorProfile,
agentProfile,
}) {

@@ -173,4 +222,18 @@ // Forward-reference: onLine captures `supervisor` before construction completes.

settingSources: ["project"],
agentProfile,
systemPrompt: {
type: "preset",
preset: "claude_code",
append: AGENT_SYSTEM_PROMPT,
},
});
// Block Task/TaskOutput so the supervisor cannot spawn its own sub-agents.
// The relay loop handles agent communication — letting the supervisor use
// Task would bypass the relay and produce an empty agent trace.
const defaultDisallowed = ["Task", "TaskOutput"];
const disallowedTools = supervisorDisallowedTools
? [...new Set([...defaultDisallowed, ...supervisorDisallowedTools])]
: defaultDisallowed;
const supervisorRunner = createAgentRunner({

@@ -182,5 +245,19 @@ cwd: supervisorCwd,

maxTurns: 10,
allowedTools: ["Read", "Glob", "Grep"],
allowedTools: supervisorAllowedTools ?? [
"Bash",
"Read",
"Glob",
"Grep",
"Write",
"Edit",
],
disallowedTools,
onLine,
settingSources: ["project"],
agentProfile: supervisorProfile,
systemPrompt: {
type: "preset",
preset: "claude_code",
append: SUPERVISOR_SYSTEM_PROMPT,
},
});

@@ -187,0 +264,0 @@

@@ -110,3 +110,2 @@ /**

this.lastSource = parsed.source;
this.textStream.write(`\n[${parsed.source}]\n`);
}

@@ -123,2 +122,6 @@ this.collector.addLine(JSON.stringify(parsed.event));

const turns = this.collector.turns;
const prefix =
this.mode === "supervised" && this.lastSource
? `[${this.lastSource}] `
: "";
while (this.turnsEmitted < turns.length) {

@@ -129,6 +132,6 @@ const turn = turns[this.turnsEmitted++];

if (block.type === "text") {
this.textStream.write(block.text + "\n");
this.textStream.write(`${prefix}${block.text}\n`);
} else if (block.type === "tool_use") {
const input = summarizeInput(block.input);
this.textStream.write(`> Tool: ${block.name} ${input}\n`);
this.textStream.write(`${prefix}> Tool: ${block.name} ${input}\n`);
}

@@ -135,0 +138,0 @@ }

@@ -9,4 +9,6 @@ import { describe, test } from "node:test";

createSupervisor,
SUPERVISOR_SYSTEM_PROMPT,
AGENT_SYSTEM_PROMPT,
} from "@forwardimpact/libeval";
import { isDone } from "../src/supervisor.js";
import { isSuccessful } from "../src/supervisor.js";

@@ -65,23 +67,47 @@ /**

describe("isDone", () => {
test("detects EVALUATION_COMPLETE on its own line", () => {
assert.strictEqual(isDone("EVALUATION_COMPLETE"), true);
describe("isSuccessful", () => {
test("detects EVALUATION_SUCCESSFUL on its own line", () => {
assert.strictEqual(isSuccessful("EVALUATION_SUCCESSFUL"), true);
assert.strictEqual(
isDone("Some text\nEVALUATION_COMPLETE\nMore text"),
isSuccessful("Some text\nEVALUATION_SUCCESSFUL\nMore text"),
true,
);
assert.strictEqual(isDone("Done.\n\nEVALUATION_COMPLETE"), true);
assert.strictEqual(isSuccessful("Done.\n\nEVALUATION_SUCCESSFUL"), true);
});
test("does not match EVALUATION_COMPLETE embedded in text", () => {
assert.strictEqual(isDone("not EVALUATION_COMPLETE yet"), false);
assert.strictEqual(isDone("The agent is EVALUATION_COMPLETE done"), false);
assert.strictEqual(isDone("EVALUATION_COMPLETE_EXTRA"), false);
test("tolerates markdown formatting around the signal", () => {
assert.strictEqual(isSuccessful("**EVALUATION_SUCCESSFUL**"), true);
assert.strictEqual(isSuccessful("*EVALUATION_SUCCESSFUL*"), true);
assert.strictEqual(isSuccessful("__EVALUATION_SUCCESSFUL__"), true);
assert.strictEqual(isSuccessful("_EVALUATION_SUCCESSFUL_"), true);
assert.strictEqual(isSuccessful("`EVALUATION_SUCCESSFUL`"), true);
assert.strictEqual(
isSuccessful(
"Good work.\n\n**EVALUATION_SUCCESSFUL**\n\nNow filing issues.",
),
true,
);
});
test("matches EVALUATION_SUCCESSFUL anywhere in text", () => {
assert.strictEqual(isSuccessful("not EVALUATION_SUCCESSFUL yet"), true);
assert.strictEqual(
isSuccessful("The agent is EVALUATION_SUCCESSFUL done"),
true,
);
assert.strictEqual(
isSuccessful("Great work! EVALUATION_SUCCESSFUL. Now filing issues."),
true,
);
});
test("does not match empty or unrelated text", () => {
assert.strictEqual(isDone(""), false);
assert.strictEqual(isDone("All done!"), false);
assert.strictEqual(isDone("DONE"), false);
assert.strictEqual(isSuccessful(""), false);
assert.strictEqual(isSuccessful("All done!"), false);
assert.strictEqual(isSuccessful("DONE"), false);
});
test("does not match old EVALUATION_COMPLETE signal", () => {
assert.strictEqual(isSuccessful("EVALUATION_COMPLETE"), false);
});
});

@@ -123,3 +149,24 @@

test("completes on EVALUATION_COMPLETE from supervisor", async () => {
test("completes on EVALUATION_SUCCESSFUL from supervisor at turn 0", async () => {
  // The supervisor signals success in its very first reply, so the agent
  // runner is given no scripted responses at all.
  const supervisor = new Supervisor({
    agentRunner: createMockRunner([]),
    supervisorRunner: createMockRunner([{ text: "EVALUATION_SUCCESSFUL" }]),
    output: new PassThrough(),
    maxTurns: 10,
  });

  const outcome = await supervisor.run("Install stuff");

  assert.strictEqual(outcome.success, true);
  assert.strictEqual(outcome.turns, 0);
});
test("completes after one agent turn", async () => {
const agentRunner = createMockRunner([

@@ -130,3 +177,4 @@ { text: "I installed the packages." },

const supervisorRunner = createMockRunner([
{ text: "Good work.\n\nEVALUATION_COMPLETE" },
{ text: "Welcome! Please install the packages." },
{ text: "Good work.\n\nEVALUATION_SUCCESSFUL" },
]);

@@ -156,5 +204,6 @@

const supervisorRunner = createMockRunner([
{ text: "Here is your task. Do the work." },
{ text: "Keep going, you need to do more." },
{ text: "Almost there, continue." },
{ text: "EVALUATION_COMPLETE" },
{ text: "EVALUATION_SUCCESSFUL" },
]);

@@ -177,5 +226,4 @@

test("enforces maxTurns limit", async () => {
// Agent responds to every turn, supervisor never says done
// Supervisor starts, agent responds each turn, supervisor never says done
const agentRunner = createMockRunner([
{ text: "Turn 0" },
{ text: "Turn 1" },

@@ -186,2 +234,3 @@ { text: "Turn 2" },

const supervisorRunner = createMockRunner([
{ text: "Start working." },
{ text: "Continue." },

@@ -206,12 +255,13 @@ { text: "Continue." },

test("output contains tagged lines with correct source and turn", async () => {
const agentMessages = [[{ type: "assistant", content: "Working" }]];
const supervisorMessages = [
[{ type: "assistant", content: "EVALUATION_COMPLETE" }],
[{ type: "assistant", content: "Go ahead" }],
[{ type: "assistant", content: "EVALUATION_SUCCESSFUL" }],
];
const agentMessages = [[{ type: "assistant", content: "Working" }]];
const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
const supervisorRunner = createMockRunner(
[{ text: "EVALUATION_COMPLETE" }],
[{ text: "Go ahead" }, { text: "EVALUATION_SUCCESSFUL" }],
supervisorMessages,
);
const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);

@@ -236,15 +286,15 @@ const output = new PassThrough();

// Should have: agent turn 0, supervisor turn 1, orchestrator summary
assert.ok(lines.length >= 3);
// Should have: supervisor turn 0, agent turn 1, supervisor turn 1, orchestrator summary
assert.ok(lines.length >= 4);
const agentLine = JSON.parse(lines[0]);
const supervisorLine = JSON.parse(lines[0]);
assert.strictEqual(supervisorLine.source, "supervisor");
assert.strictEqual(supervisorLine.turn, 0);
assert.ok("event" in supervisorLine);
const agentLine = JSON.parse(lines[1]);
assert.strictEqual(agentLine.source, "agent");
assert.strictEqual(agentLine.turn, 0);
assert.strictEqual(agentLine.turn, 1);
assert.ok("event" in agentLine);
const supervisorLine = JSON.parse(lines[1]);
assert.strictEqual(supervisorLine.source, "supervisor");
assert.strictEqual(supervisorLine.turn, 1);
assert.ok("event" in supervisorLine);
const summaryLine = JSON.parse(lines[lines.length - 1]);

@@ -262,7 +312,10 @@ assert.strictEqual(summaryLine.source, "orchestrator");

};
const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
const supervisorRunner = createMockRunner(
[{ text: "EVALUATION_COMPLETE" }],
[[{ type: "assistant", content: "ok" }]],
[{ text: "Go" }, { text: "EVALUATION_SUCCESSFUL" }],
[
[{ type: "assistant", content: "Go" }],
[{ type: "assistant", content: "ok" }],
],
);
const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);

@@ -287,3 +340,4 @@ const output = new PassThrough();

const tagged = JSON.parse(lines[0]);
// First line is supervisor turn 0, second is agent turn 1
const tagged = JSON.parse(lines[1]);
// The original event's `source` field is preserved inside `event`

@@ -294,17 +348,19 @@ assert.strictEqual(tagged.source, "agent");

test("emits agent output and summary when agent errors on turn 0", async () => {
const agentMessages = [[{ type: "assistant", content: "Partial work" }]];
const agentRunner = createMockRunner(
[{ text: "Partial work", success: false }],
agentMessages,
test("emits supervisor output and summary when supervisor errors on turn 0", async () => {
const supervisorMessages = [
[{ type: "assistant", content: "Starting..." }],
];
const supervisorRunner = createMockRunner(
[{ text: "Starting...", success: false }],
supervisorMessages,
);
// Override run to simulate an error return
const origRun = agentRunner.run;
agentRunner.run = async (task) => {
const result = await origRun.call(agentRunner, task);
const origRun = supervisorRunner.run;
supervisorRunner.run = async (task) => {
const result = await origRun.call(supervisorRunner, task);
return { ...result, error: new Error("Process exited with code 1") };
};
const supervisorRunner = createMockRunner([]);
const agentRunner = createMockRunner([]);

@@ -326,3 +382,3 @@ const output = new PassThrough();

// Output should still contain the agent's buffered lines + summary
// Output should still contain the supervisor's buffered lines + summary
const data = output.read()?.toString() ?? "";

@@ -334,7 +390,7 @@ const lines = data

assert.ok(lines.length >= 2, "Expected at least agent line + summary");
assert.ok(lines.length >= 2, "Expected at least supervisor line + summary");
const agentLine = JSON.parse(lines[0]);
assert.strictEqual(agentLine.source, "agent");
assert.strictEqual(agentLine.turn, 0);
const supervisorLine = JSON.parse(lines[0]);
assert.strictEqual(supervisorLine.source, "supervisor");
assert.strictEqual(supervisorLine.turn, 0);

@@ -356,2 +412,97 @@ const summaryLine = JSON.parse(lines[lines.length - 1]);

});
test("createSupervisor uses default supervisor tools when none specified", () => {
  // No supervisorAllowedTools option is passed; the factory's fallback list
  // is what the supervisor runner should end up with.
  const expectedTools = ["Bash", "Read", "Glob", "Grep", "Write", "Edit"];

  const { supervisorRunner } = createSupervisor({
    supervisorCwd: "/tmp/sup",
    agentCwd: "/tmp/agent",
    query: async function* () {},
    output: new PassThrough(),
  });

  assert.deepStrictEqual(supervisorRunner.allowedTools, expectedTools);
});
test("createSupervisor passes custom supervisor tools", () => {
  // An explicit supervisorAllowedTools list should reach the supervisor
  // runner unchanged.
  const { supervisorRunner } = createSupervisor({
    supervisorCwd: "/tmp/sup",
    agentCwd: "/tmp/agent",
    query: async function* () {},
    output: new PassThrough(),
    supervisorAllowedTools: ["Read", "Glob", "Grep"],
  });

  const expectedTools = ["Read", "Glob", "Grep"];
  assert.deepStrictEqual(supervisorRunner.allowedTools, expectedTools);
});
test("createSupervisor wires system prompts to both runners", () => {
  // Both runners are expected to carry a claude_code preset whose `append`
  // field is the role-specific prompt constant.
  const presetAppending = (append) => ({
    type: "preset",
    preset: "claude_code",
    append,
  });

  const supervisor = createSupervisor({
    supervisorCwd: "/tmp/sup",
    agentCwd: "/tmp/agent",
    query: async function* () {},
    output: new PassThrough(),
  });

  assert.deepStrictEqual(
    supervisor.agentRunner.systemPrompt,
    presetAppending(AGENT_SYSTEM_PROMPT),
  );
  assert.deepStrictEqual(
    supervisor.supervisorRunner.systemPrompt,
    presetAppending(SUPERVISOR_SYSTEM_PROMPT),
  );
});
test("createSupervisor blocks Task and TaskOutput on supervisor by default", () => {
  const supervisor = createSupervisor({
    supervisorCwd: "/tmp/sup",
    agentCwd: "/tmp/agent",
    query: async function* () {},
    output: new PassThrough(),
  });

  // The supervisor gets the default block list ...
  const supervisorBlocked = supervisor.supervisorRunner.disallowedTools;
  assert.deepStrictEqual(supervisorBlocked, ["Task", "TaskOutput"]);

  // ... while the agent has nothing blocked.
  const agentBlocked = supervisor.agentRunner.disallowedTools;
  assert.deepStrictEqual(agentBlocked, []);
});
test("createSupervisor merges custom supervisorDisallowedTools with defaults", () => {
  // "Task" is passed on purpose even though it is already a default, so the
  // duplicate check below is meaningful.
  const supervisor = createSupervisor({
    supervisorCwd: "/tmp/sup",
    agentCwd: "/tmp/agent",
    query: async function* () {},
    output: new PassThrough(),
    supervisorDisallowedTools: ["WebSearch", "Task"],
  });

  const disallowed = supervisor.supervisorRunner.disallowedTools;
  for (const toolName of ["Task", "TaskOutput", "WebSearch"]) {
    assert.ok(disallowed.includes(toolName));
  }

  // Every entry appears exactly once.
  const uniqueCount = new Set(disallowed).size;
  assert.strictEqual(disallowed.length, uniqueCount);
});
test("system prompt constants are non-empty strings", () => {
  const prompts = [SUPERVISOR_SYSTEM_PROMPT, AGENT_SYSTEM_PROMPT];

  // Type checks first, then length checks, for both constants.
  for (const prompt of prompts) {
    assert.ok(typeof prompt === "string");
  }
  for (const prompt of prompts) {
    assert.ok(prompt.length > 0);
  }
});
test("SUPERVISOR_SYSTEM_PROMPT explains relay mechanism", () => {
  // The prompt must mention both the relay and the success signal.
  for (const keyword of ["relay", "EVALUATION_SUCCESSFUL"]) {
    assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes(keyword));
  }
});
});

@@ -190,7 +190,5 @@ import { describe, test } from "node:test";

// Text should show source labels
assert.ok(textData.includes("[agent]"));
assert.ok(textData.includes("Working on it"));
assert.ok(textData.includes("[supervisor]"));
assert.ok(textData.includes("Looks good"));
// Text should show source prefixes on content lines
assert.ok(textData.includes("[agent] Working on it"));
assert.ok(textData.includes("[supervisor] Looks good"));
assert.ok(textData.includes("Evaluation completed after 1 turns"));

@@ -258,5 +256,5 @@ });

const textData = collect(textStream);
// [agent] label should appear only once
const agentLabels = textData.split("[agent]").length - 1;
assert.strictEqual(agentLabels, 1);
// [agent] prefix should appear on each content line
assert.ok(textData.includes("[agent] Step 1"));
assert.ok(textData.includes("[agent] Step 2"));
});

@@ -263,0 +261,0 @@