Big News: Socket raises $60M Series C at a $1B valuation to secure software supply chains for AI-driven development.Announcement
Sign In

@forwardimpact/libeval

Package Overview
Dependencies
Maintainers
1
Versions
50
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@forwardimpact/libeval - npm Package Compare versions

Comparing version
0.1.6
to
0.1.8
+101
test/mock-runner.js
/**
* Test-only mock factory for AgentRunner. Yields pre-scripted responses,
* and (when an `onBatch` callback is set) fires it at the same boundaries
* the real AgentRunner would: assistant messages with at least one text
* block, and the terminal `result` message. If the callback calls
* `abort()`, the mock stops iterating that response's messages and
* reports `aborted: true`.
*
* Intentionally a regular module (not a test file) so describe/test blocks
* here would not run. Lives under test/ to make its scope explicit.
*/
import { PassThrough } from "node:stream";
import { AgentRunner } from "@forwardimpact/libeval";
/**
* Whether a scripted message should trigger an onBatch flush. Mirrors the
* real AgentRunner: assistant-with-text-block or terminal `result` message.
* Tool-only or string-content messages accumulate without flushing.
* @param {object} message
* @returns {boolean}
*/
export function shouldFlush(message) {
if (message.type === "result") return true;
if (message.type !== "assistant") return false;
const content = message.message?.content ?? message.content;
if (!Array.isArray(content)) return false;
for (const block of content) {
if (block.type === "text" && block.text) return true;
}
return false;
}
/**
* Create a mock AgentRunner that yields pre-scripted responses. Each call
* to `run()` or `resume()` pops the next response from the array.
* @param {object[]} responses - Array of {text, success} objects
* @param {object[]} [messages] - Messages to buffer per response
* @returns {AgentRunner}
*/
export function createMockRunner(responses, messages) {
const output = new PassThrough();
let callIndex = 0;
const runner = new AgentRunner({
cwd: "/tmp",
query: async function* () {},
output,
});
const consume = async (msgs) => {
let aborted = false;
for (const m of msgs) {
const line = JSON.stringify(m);
runner.buffer.push(line);
if (runner.onLine) runner.onLine(line);
if (runner.onBatch && shouldFlush(m)) {
await runner.onBatch([line], {
abort: () => {
aborted = true;
},
});
if (aborted) break;
}
}
return aborted;
};
runner.run = async (_task) => {
const resp = responses[callIndex++];
const msgs = messages?.[callIndex - 1] ?? [
{ type: "assistant", content: resp.text },
];
const aborted = await consume(msgs);
runner.sessionId = "mock-session";
return {
success: resp.success ?? true,
text: resp.text,
sessionId: "mock-session",
aborted,
error: null,
};
};
runner.resume = async (_prompt) => {
const resp = responses[callIndex++];
const msgs = messages?.[callIndex - 1] ?? [
{ type: "assistant", content: resp.text },
];
const aborted = await consume(msgs);
return {
success: resp.success ?? true,
text: resp.text,
sessionId: runner.sessionId,
aborted,
error: null,
};
};
return runner;
}
import { describe, test } from "node:test";
import assert from "node:assert";
import { PassThrough } from "node:stream";
import { Supervisor } from "@forwardimpact/libeval";
import { isIntervention } from "../src/supervisor.js";
import { createMockRunner } from "./mock-runner.js";
describe("isIntervention", () => {
test("detects EVALUATION_INTERVENTION on its own line", () => {
assert.strictEqual(isIntervention("EVALUATION_INTERVENTION"), true);
assert.strictEqual(
isIntervention("Some text\nEVALUATION_INTERVENTION\nMore text"),
true,
);
assert.strictEqual(
isIntervention("Stop.\n\nEVALUATION_INTERVENTION"),
true,
);
});
test("tolerates markdown formatting around the signal", () => {
assert.strictEqual(isIntervention("**EVALUATION_INTERVENTION**"), true);
assert.strictEqual(isIntervention("*EVALUATION_INTERVENTION*"), true);
assert.strictEqual(isIntervention("__EVALUATION_INTERVENTION__"), true);
assert.strictEqual(isIntervention("_EVALUATION_INTERVENTION_"), true);
assert.strictEqual(isIntervention("`EVALUATION_INTERVENTION`"), true);
assert.strictEqual(
isIntervention(
"Wrong path.\n\n**EVALUATION_INTERVENTION**\n\nTry the documented one.",
),
true,
);
});
test("matches EVALUATION_INTERVENTION inline", () => {
assert.strictEqual(
isIntervention("Stopping you with EVALUATION_INTERVENTION now."),
true,
);
assert.strictEqual(
isIntervention("Note: EVALUATION_INTERVENTION. Switch to Y."),
true,
);
});
test("does not match empty or unrelated text", () => {
assert.strictEqual(isIntervention(""), false);
assert.strictEqual(isIntervention("Stop and think."), false);
assert.strictEqual(isIntervention("INTERVENTION"), false);
});
test("does not match EVALUATION_COMPLETE alone", () => {
assert.strictEqual(isIntervention("EVALUATION_COMPLETE"), false);
assert.strictEqual(
isIntervention("Good work.\n\nEVALUATION_COMPLETE"),
false,
);
});
});
describe("Supervisor - mid-turn intervention", () => {
test("observation without intervention does not interrupt the agent", async () => {
// Agent emits one structured assistant text block — fires onBatch once.
// Supervisor responds with "Keep going." — neither signal flag is set,
// so the agent's SDK session completes naturally and the end-of-turn
// review then emits EVALUATION_COMPLETE.
const agentMessages = [
[
{
type: "assistant",
message: {
content: [{ type: "text", text: "I'm working on it." }],
},
},
],
];
const agentRunner = createMockRunner(
[{ text: "I'm working on it." }],
agentMessages,
);
const supervisorRunner = createMockRunner([
{ text: "Welcome! Please install." },
{ text: "Keep going." },
{ text: "Good work.\n\nEVALUATION_COMPLETE" },
]);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
agentRunner.onLine = (line) => supervisor.emitLine(line);
supervisorRunner.onLine = (line) => supervisor.emitLine(line);
let agentResumeCalls = 0;
const origAgentResume = agentRunner.resume;
agentRunner.resume = async (prompt) => {
agentResumeCalls++;
return origAgentResume.call(agentRunner, prompt);
};
const result = await supervisor.run("Install");
assert.strictEqual(result.success, true);
assert.strictEqual(result.turns, 1);
assert.strictEqual(
agentResumeCalls,
0,
"Agent should not be resumed when supervisor never intervenes",
);
// Trace must contain a mid_turn_review marker but no intervention markers.
const data = output.read()?.toString() ?? "";
const orchestratorEvents = data
.trim()
.split("\n")
.filter((l) => l.length > 0)
.map((l) => JSON.parse(l))
.filter((e) => e.source === "orchestrator");
assert.ok(
orchestratorEvents.some((e) => e.event?.type === "mid_turn_review"),
"Trace should contain mid_turn_review when onBatch fires",
);
assert.ok(
!orchestratorEvents.some(
(e) => e.event?.type === "intervention_requested",
),
"Trace should not contain intervention_requested when supervisor only observes",
);
});
test("EVALUATION_INTERVENTION from mid-turn batch interrupts and relays", async () => {
// Agent's first call fires onBatch on a structured assistant text block;
// supervisor responds with EVALUATION_INTERVENTION → abort + relay.
// Agent's second call (resume) finishes naturally; end-of-turn review
// then emits EVALUATION_COMPLETE.
const agentMessages = [
[
{
type: "assistant",
message: {
content: [{ type: "text", text: "I'll try the wrong path." }],
},
},
],
[
{
type: "assistant",
message: {
content: [
{ type: "text", text: "OK, switching to the documented path." },
],
},
},
],
];
const agentRunner = createMockRunner(
[
{ text: "I'll try the wrong path." },
{ text: "OK, switching to the documented path." },
],
agentMessages,
);
// Supervisor responses (in order):
// 0: turn 0 introduction
// 1: mid-turn 1 batch 1 — intervene
// 2: mid-turn 1 batch 1 (post-resume) — keep going
// 3: end-of-turn 1 — EVALUATION_COMPLETE
const supervisorMessages = [
undefined,
[
{
type: "assistant",
message: {
content: [
{
type: "text",
text: "EVALUATION_INTERVENTION Stop and use the documented path.",
},
],
},
},
],
undefined,
undefined,
];
const supervisorRunner = createMockRunner(
[
{ text: "Welcome." },
{ text: "EVALUATION_INTERVENTION Stop and use the documented path." },
{ text: "Keep going." },
{ text: "Good.\n\nEVALUATION_COMPLETE" },
],
supervisorMessages,
);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
agentRunner.onLine = (line) => supervisor.emitLine(line);
supervisorRunner.onLine = (line) => supervisor.emitLine(line);
let agentResumeCalls = 0;
let firstResumePrompt = null;
const origAgentResume = agentRunner.resume;
agentRunner.resume = async (prompt) => {
agentResumeCalls++;
if (agentResumeCalls === 1) firstResumePrompt = prompt;
return origAgentResume.call(agentRunner, prompt);
};
const result = await supervisor.run("Install");
assert.strictEqual(result.success, true);
assert.strictEqual(result.turns, 1);
assert.strictEqual(
agentResumeCalls,
1,
"Agent should be resumed exactly once after intervention",
);
assert.ok(
firstResumePrompt && firstResumePrompt.includes("documented path"),
"Resume prompt should carry the supervisor's intervention text",
);
const orchestratorEvents = (output.read()?.toString() ?? "")
.trim()
.split("\n")
.filter((l) => l.length > 0)
.map((l) => JSON.parse(l))
.filter((e) => e.source === "orchestrator");
assert.ok(
orchestratorEvents.some(
(e) => e.event?.type === "intervention_requested",
),
"Trace should contain intervention_requested orchestrator event",
);
assert.ok(
orchestratorEvents.some((e) => e.event?.type === "intervention_relayed"),
"Trace should contain intervention_relayed orchestrator event",
);
});
test("EVALUATION_INTERVENTION and EVALUATION_COMPLETE in the same turn", async () => {
// Batch 1: supervisor intervenes (abort + relay).
// After resume, batch 1 of resume: supervisor writes EVALUATION_COMPLETE
// (mid-turn) — the loop must exit success without running an end-of-turn
// review.
const agentMessages = [
[
{
type: "assistant",
message: { content: [{ type: "text", text: "Trying X." }] },
},
],
[
{
type: "assistant",
message: { content: [{ type: "text", text: "OK trying Y." }] },
},
],
];
const agentRunner = createMockRunner(
[{ text: "Trying X." }, { text: "Trying Y." }],
agentMessages,
);
const supervisorMessages = [
undefined,
[
{
type: "assistant",
message: {
content: [
{
type: "text",
text: "EVALUATION_INTERVENTION Try Y instead.",
},
],
},
},
],
[
{
type: "assistant",
message: {
content: [{ type: "text", text: "Excellent. EVALUATION_COMPLETE" }],
},
},
],
];
const supervisorRunner = createMockRunner(
[
{ text: "Welcome." },
{ text: "EVALUATION_INTERVENTION Try Y instead." },
{ text: "Excellent. EVALUATION_COMPLETE" },
],
supervisorMessages,
);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
agentRunner.onLine = (line) => supervisor.emitLine(line);
supervisorRunner.onLine = (line) => supervisor.emitLine(line);
let agentResumeCalls = 0;
const origAgentResume = agentRunner.resume;
agentRunner.resume = async (prompt) => {
agentResumeCalls++;
return origAgentResume.call(agentRunner, prompt);
};
const result = await supervisor.run("Install");
assert.strictEqual(result.success, true);
assert.strictEqual(result.turns, 1);
assert.strictEqual(
agentResumeCalls,
1,
"Agent.resume runs once (after intervention); EVALUATION_COMPLETE then ends the turn",
);
const orchestratorEvents = (output.read()?.toString() ?? "")
.trim()
.split("\n")
.filter((l) => l.length > 0)
.map((l) => JSON.parse(l))
.filter((e) => e.source === "orchestrator");
assert.ok(
orchestratorEvents.some(
(e) => e.event?.type === "intervention_requested",
),
"Trace should contain intervention_requested",
);
assert.ok(
orchestratorEvents.some((e) => e.event?.type === "complete_requested"),
"Trace should contain complete_requested for mid-turn EVALUATION_COMPLETE",
);
});
});
import { describe, test } from "node:test";
import assert from "node:assert";
import { PassThrough } from "node:stream";
import {
Supervisor,
createSupervisor,
SUPERVISOR_SYSTEM_PROMPT,
AGENT_SYSTEM_PROMPT,
} from "@forwardimpact/libeval";
import { createMockRunner } from "./mock-runner.js";
describe("Supervisor - output and events", () => {
test("output contains tagged lines with correct source and turn", async () => {
const supervisorMessages = [
[{ type: "assistant", content: "Go ahead" }],
[{ type: "assistant", content: "EVALUATION_COMPLETE" }],
];
const agentMessages = [[{ type: "assistant", content: "Working" }]];
const supervisorRunner = createMockRunner(
[{ text: "Go ahead" }, { text: "EVALUATION_COMPLETE" }],
supervisorMessages,
);
const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
agentRunner.onLine = (line) => supervisor.emitLine(line);
supervisorRunner.onLine = (line) => supervisor.emitLine(line);
await supervisor.run("Task");
const data = output.read()?.toString() ?? "";
const lines = data
.trim()
.split("\n")
.filter((l) => l.length > 0);
// Should have: supervisor turn 0, agent turn 1, supervisor turn 1, orchestrator summary
assert.ok(lines.length >= 4);
const supervisorLine = JSON.parse(lines[0]);
assert.strictEqual(supervisorLine.source, "supervisor");
assert.strictEqual(supervisorLine.turn, 0);
assert.ok("event" in supervisorLine);
const agentLine = JSON.parse(lines[1]);
assert.strictEqual(agentLine.source, "agent");
assert.strictEqual(agentLine.turn, 1);
assert.ok("event" in agentLine);
const summaryLine = JSON.parse(lines[lines.length - 1]);
assert.strictEqual(summaryLine.source, "orchestrator");
assert.strictEqual(summaryLine.type, "summary");
assert.strictEqual(summaryLine.success, true);
});
test("events are nested under event key (no field collisions)", async () => {
const sourceEvent = {
type: "assistant",
source: "sdk-internal",
content: "test",
};
const supervisorRunner = createMockRunner(
[{ text: "Go" }, { text: "EVALUATION_COMPLETE" }],
[
[{ type: "assistant", content: "Go" }],
[{ type: "assistant", content: "ok" }],
],
);
const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
agentRunner.onLine = (line) => supervisor.emitLine(line);
supervisorRunner.onLine = (line) => supervisor.emitLine(line);
await supervisor.run("Task");
const data = output.read()?.toString() ?? "";
const lines = data
.trim()
.split("\n")
.filter((l) => l.length > 0);
// First line is supervisor turn 0, second is agent turn 1
const tagged = JSON.parse(lines[1]);
assert.strictEqual(tagged.source, "agent");
assert.strictEqual(tagged.event.source, "sdk-internal");
});
test("mid-turn intervention emits orchestrator events and shares the agent's turn id", async () => {
// Agent emits one structured assistant text block on its first call —
// supervisor intervenes mid-turn. Resume then completes naturally and
// the end-of-turn review signals EVALUATION_COMPLETE.
const agentMessages = [
[
{
type: "assistant",
message: {
content: [{ type: "text", text: "Trying the wrong thing." }],
},
},
],
[
{
type: "assistant",
message: {
content: [{ type: "text", text: "Switching to the right thing." }],
},
},
],
];
const supervisorMessages = [
undefined,
[
{
type: "assistant",
message: {
content: [
{
type: "text",
text: "EVALUATION_INTERVENTION Switch to the right path.",
},
],
},
},
],
undefined,
undefined,
];
const agentRunner = createMockRunner(
[{ text: "Trying the wrong thing." }, { text: "Switching." }],
agentMessages,
);
const supervisorRunner = createMockRunner(
[
{ text: "Welcome." },
{ text: "EVALUATION_INTERVENTION Switch to the right path." },
{ text: "Keep going." },
{ text: "Done. EVALUATION_COMPLETE" },
],
supervisorMessages,
);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
agentRunner.onLine = (line) => supervisor.emitLine(line);
supervisorRunner.onLine = (line) => supervisor.emitLine(line);
const result = await supervisor.run("Task");
assert.strictEqual(result.success, true);
const lines = (output.read()?.toString() ?? "")
.trim()
.split("\n")
.filter((l) => l.length > 0)
.map((l) => JSON.parse(l));
// (1) Orchestrator event with intervention_requested.
const interventionRequested = lines.find(
(l) =>
l.source === "orchestrator" &&
l.event?.type === "intervention_requested",
);
assert.ok(
interventionRequested,
"Trace must contain intervention_requested orchestrator event",
);
// (2) At least one agent line and one supervisor line share a turn id —
// mid-turn supervisor activity is tagged with the agent's turn.
const agentTurns = new Set(
lines.filter((l) => l.source === "agent").map((l) => l.turn),
);
const supervisorTurns = new Set(
lines.filter((l) => l.source === "supervisor").map((l) => l.turn),
);
const sharedTurns = [...agentTurns].filter((t) => supervisorTurns.has(t));
assert.ok(
sharedTurns.length > 0,
"At least one turn id must appear on both agent and supervisor lines",
);
// (3) Final summary line still emitted.
const summary = lines[lines.length - 1];
assert.strictEqual(summary.source, "orchestrator");
assert.strictEqual(summary.type, "summary");
assert.strictEqual(summary.success, true);
});
test("emits supervisor output and summary when supervisor errors on turn 0", async () => {
const supervisorMessages = [
[{ type: "assistant", content: "Starting..." }],
];
const supervisorRunner = createMockRunner(
[{ text: "Starting...", success: false }],
supervisorMessages,
);
const origRun = supervisorRunner.run;
supervisorRunner.run = async (task) => {
const result = await origRun.call(supervisorRunner, task);
return { ...result, error: new Error("Process exited with code 1") };
};
const agentRunner = createMockRunner([]);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
agentRunner.onLine = (line) => supervisor.emitLine(line);
supervisorRunner.onLine = (line) => supervisor.emitLine(line);
const result = await supervisor.run("Task");
assert.strictEqual(result.success, false);
assert.strictEqual(result.turns, 0);
const data = output.read()?.toString() ?? "";
const lines = data
.trim()
.split("\n")
.filter((l) => l.length > 0);
assert.ok(lines.length >= 2, "Expected at least supervisor line + summary");
const supervisorLine = JSON.parse(lines[0]);
assert.strictEqual(supervisorLine.source, "supervisor");
assert.strictEqual(supervisorLine.turn, 0);
const summaryLine = JSON.parse(lines[lines.length - 1]);
assert.strictEqual(summaryLine.source, "orchestrator");
assert.strictEqual(summaryLine.success, false);
assert.strictEqual(summaryLine.turns, 0);
});
});
describe("Supervisor - createSupervisor factory", () => {
test("createSupervisor factory returns a Supervisor instance", () => {
const supervisor = createSupervisor({
supervisorCwd: "/tmp/sup",
agentCwd: "/tmp/agent",
query: async function* () {},
output: new PassThrough(),
});
assert.ok(supervisor instanceof Supervisor);
});
test("createSupervisor uses default supervisor tools when none specified", () => {
const supervisor = createSupervisor({
supervisorCwd: "/tmp/sup",
agentCwd: "/tmp/agent",
query: async function* () {},
output: new PassThrough(),
});
assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
"Bash",
"Read",
"Glob",
"Grep",
"Write",
"Edit",
]);
});
test("createSupervisor passes custom supervisor tools", () => {
const supervisor = createSupervisor({
supervisorCwd: "/tmp/sup",
agentCwd: "/tmp/agent",
query: async function* () {},
output: new PassThrough(),
supervisorAllowedTools: ["Read", "Glob", "Grep"],
});
assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
"Read",
"Glob",
"Grep",
]);
});
test("createSupervisor wires system prompts to both runners", () => {
const supervisor = createSupervisor({
supervisorCwd: "/tmp/sup",
agentCwd: "/tmp/agent",
query: async function* () {},
output: new PassThrough(),
});
assert.deepStrictEqual(supervisor.agentRunner.systemPrompt, {
type: "preset",
preset: "claude_code",
append: AGENT_SYSTEM_PROMPT,
});
assert.deepStrictEqual(supervisor.supervisorRunner.systemPrompt, {
type: "preset",
preset: "claude_code",
append: SUPERVISOR_SYSTEM_PROMPT,
});
});
test("createSupervisor blocks sub-agent spawn tools on supervisor by default", () => {
const supervisor = createSupervisor({
supervisorCwd: "/tmp/sup",
agentCwd: "/tmp/agent",
query: async function* () {},
output: new PassThrough(),
});
assert.deepStrictEqual(supervisor.supervisorRunner.disallowedTools, [
"Agent",
"Task",
"TaskOutput",
"TaskStop",
]);
assert.deepStrictEqual(supervisor.agentRunner.disallowedTools, []);
});
test("createSupervisor merges custom supervisorDisallowedTools with defaults", () => {
const supervisor = createSupervisor({
supervisorCwd: "/tmp/sup",
agentCwd: "/tmp/agent",
query: async function* () {},
output: new PassThrough(),
supervisorDisallowedTools: ["WebSearch", "Task"],
});
const disallowed = supervisor.supervisorRunner.disallowedTools;
assert.ok(disallowed.includes("Agent"));
assert.ok(disallowed.includes("Task"));
assert.ok(disallowed.includes("TaskOutput"));
assert.ok(disallowed.includes("TaskStop"));
assert.ok(disallowed.includes("WebSearch"));
assert.strictEqual(disallowed.length, new Set(disallowed).size);
});
test("system prompt constants are non-empty strings", () => {
assert.ok(typeof SUPERVISOR_SYSTEM_PROMPT === "string");
assert.ok(typeof AGENT_SYSTEM_PROMPT === "string");
assert.ok(SUPERVISOR_SYSTEM_PROMPT.length > 0);
assert.ok(AGENT_SYSTEM_PROMPT.length > 0);
});
test("SUPERVISOR_SYSTEM_PROMPT explains relay mechanism", () => {
assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("relay"));
assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("EVALUATION_COMPLETE"));
});
});
import { describe, test } from "node:test";
import assert from "node:assert";
import { PassThrough } from "node:stream";
import { Supervisor } from "@forwardimpact/libeval";
import { isComplete } from "../src/supervisor.js";
import { createMockRunner } from "./mock-runner.js";
describe("isComplete", () => {
test("detects EVALUATION_COMPLETE on its own line", () => {
assert.strictEqual(isComplete("EVALUATION_COMPLETE"), true);
assert.strictEqual(
isComplete("Some text\nEVALUATION_COMPLETE\nMore text"),
true,
);
assert.strictEqual(isComplete("Done.\n\nEVALUATION_COMPLETE"), true);
});
test("tolerates markdown formatting around the signal", () => {
assert.strictEqual(isComplete("**EVALUATION_COMPLETE**"), true);
assert.strictEqual(isComplete("*EVALUATION_COMPLETE*"), true);
assert.strictEqual(isComplete("__EVALUATION_COMPLETE__"), true);
assert.strictEqual(isComplete("_EVALUATION_COMPLETE_"), true);
assert.strictEqual(isComplete("`EVALUATION_COMPLETE`"), true);
assert.strictEqual(
isComplete("Good work.\n\n**EVALUATION_COMPLETE**\n\nNow filing issues."),
true,
);
});
test("matches EVALUATION_COMPLETE anywhere in text", () => {
assert.strictEqual(isComplete("not EVALUATION_COMPLETE yet"), true);
assert.strictEqual(
isComplete("The agent is EVALUATION_COMPLETE done"),
true,
);
assert.strictEqual(
isComplete("Great work! EVALUATION_COMPLETE. Now filing issues."),
true,
);
});
test("does not match empty or unrelated text", () => {
assert.strictEqual(isComplete(""), false);
assert.strictEqual(isComplete("All done!"), false);
assert.strictEqual(isComplete("DONE"), false);
});
test("does not match old EVALUATION_SUCCESSFUL signal", () => {
assert.strictEqual(isComplete("EVALUATION_SUCCESSFUL"), false);
});
});
describe("Supervisor - run and turns", () => {
test("constructor throws on missing agentRunner", () => {
assert.throws(
() =>
new Supervisor({
supervisorRunner: createMockRunner([]),
output: new PassThrough(),
}),
/agentRunner is required/,
);
});
test("constructor throws on missing supervisorRunner", () => {
assert.throws(
() =>
new Supervisor({
agentRunner: createMockRunner([]),
output: new PassThrough(),
}),
/supervisorRunner is required/,
);
});
test("constructor throws on missing output", () => {
assert.throws(
() =>
new Supervisor({
agentRunner: createMockRunner([]),
supervisorRunner: createMockRunner([]),
}),
/output is required/,
);
});
test("completes on EVALUATION_COMPLETE from supervisor at turn 0", async () => {
const agentRunner = createMockRunner([]);
const supervisorRunner = createMockRunner([
{ text: "EVALUATION_COMPLETE" },
]);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
const result = await supervisor.run("Install stuff");
assert.strictEqual(result.success, true);
assert.strictEqual(result.turns, 0);
});
test("completes after one agent turn", async () => {
const agentRunner = createMockRunner([
{ text: "I installed the packages." },
]);
const supervisorRunner = createMockRunner([
{ text: "Welcome! Please install the packages." },
{ text: "Good work.\n\nEVALUATION_COMPLETE" },
]);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
const result = await supervisor.run("Install stuff");
assert.strictEqual(result.success, true);
assert.strictEqual(result.turns, 1);
});
test("detects EVALUATION_COMPLETE in streamed messages when result text differs", async () => {
const agentRunner = createMockRunner([
{ text: "I installed the packages." },
]);
const supervisorMessages = [
undefined,
[
{
type: "assistant",
message: {
content: [
{
type: "text",
text: "Good work.\n\nEVALUATION_COMPLETE\n\nNow filing issues.",
},
],
},
},
{
type: "assistant",
message: {
content: [
{ type: "text", text: "## Summary\n\nAll issues filed." },
],
},
},
],
];
const supervisorRunner = createMockRunner(
[
{ text: "Welcome! Please install the packages." },
{ text: "## Summary\n\nAll issues filed." },
],
supervisorMessages,
);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
agentRunner.onLine = (line) => supervisor.emitLine(line);
supervisorRunner.onLine = (line) => supervisor.emitLine(line);
const result = await supervisor.run("Install stuff");
assert.strictEqual(result.success, true);
assert.strictEqual(result.turns, 1);
});
test("relays only the last assistant text block to the agent", async () => {
// Supervisor emits reasoning text ("Let me research...") then a tool call,
// then a final task message. Only the final message should reach the agent.
const supervisorMessages = [
// Turn 0: multiple assistant messages with reasoning + task
[
{
type: "assistant",
message: {
content: [
{ type: "text", text: "Let me research the product first." },
],
},
},
{
type: "assistant",
message: {
content: [
{
type: "text",
text: "Hello! Here is your task: install the packages.",
},
],
},
},
],
// Turn 1: evaluation
undefined,
];
let capturedAgentPrompt = null;
const agentRunner = createMockRunner([
{ text: "I installed the packages." },
]);
const origRun = agentRunner.run;
agentRunner.run = async (task) => {
capturedAgentPrompt = task;
return origRun.call(agentRunner, task);
};
const supervisorRunner = createMockRunner(
[
// SDK result text = last message text (but relay should use buffer)
{ text: "Hello! Here is your task: install the packages." },
{ text: "EVALUATION_COMPLETE" },
],
supervisorMessages,
);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
await supervisor.run("Evaluate the product");
// Agent should receive only the final text, not the reasoning
assert.strictEqual(
capturedAgentPrompt,
"Hello! Here is your task: install the packages.",
);
assert.ok(
!capturedAgentPrompt.includes("research"),
"Reasoning text should not leak to agent",
);
});
test("runs multiple turns before completion", async () => {
const agentRunner = createMockRunner([
{ text: "Started working." },
{ text: "Made progress." },
{ text: "Finished everything." },
]);
const supervisorRunner = createMockRunner([
{ text: "Here is your task. Do the work." },
{ text: "Keep going, you need to do more." },
{ text: "Almost there, continue." },
{ text: "EVALUATION_COMPLETE" },
]);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
const result = await supervisor.run("Do the work");
assert.strictEqual(result.success, true);
assert.strictEqual(result.turns, 3);
});
test("enforces maxTurns limit", async () => {
const agentRunner = createMockRunner([
{ text: "Turn 1" },
{ text: "Turn 2" },
]);
const supervisorRunner = createMockRunner([
{ text: "Start working." },
{ text: "Continue." },
{ text: "Continue." },
]);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 2,
});
const result = await supervisor.run("Endless task");
assert.strictEqual(result.success, false);
assert.strictEqual(result.turns, 2);
});
});
+2
-2

@@ -32,3 +32,3 @@ #!/usr/bin/env node

--model=MODEL Claude model to use (default: opus)
--max-turns=N Maximum agentic turns (default: 50)
--max-turns=N Maximum agentic turns (default: 50, 0 = unlimited)
--output=PATH Write NDJSON trace to file (default: stdout)

@@ -44,3 +44,3 @@ --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)

--model=MODEL Claude model to use (default: opus)
--max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
--max-turns=N Maximum supervisor ↔ agent exchanges (default: 20, 0 = unlimited)
--output=PATH Write NDJSON trace to file (default: stdout)

@@ -47,0 +47,0 @@ --allowed-tools=LIST Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)

@@ -8,3 +8,5 @@ export { TraceCollector, createTraceCollector } from "./src/trace-collector.js";

AGENT_SYSTEM_PROMPT,
isComplete,
isIntervention,
} from "./src/supervisor.js";
export { TeeWriter, createTeeWriter } from "./src/tee-writer.js";
{
"name": "@forwardimpact/libeval",
"version": "0.1.6",
"version": "0.1.8",
"description": "Process Claude Code stream-json output into structured traces",

@@ -5,0 +5,0 @@ "license": "Apache-2.0",

@@ -20,2 +20,3 @@ /**

* @param {function} [deps.onLine] - Callback invoked with each NDJSON line as it's produced
* @param {function} [deps.onBatch] - Async callback invoked with a batch of NDJSON lines at flush boundaries (assistant text blocks and result messages). Receives `(lines, { abort })` where calling `abort()` stops the in-flight SDK session via the AbortController. Optional; assignable at runtime so the Supervisor can swap it per turn.
* @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)

@@ -35,2 +36,3 @@ * @param {string} [deps.agentProfile] - Agent profile name to pass as --agent to the Claude CLI

onLine,
onBatch,
settingSources,

@@ -48,3 +50,3 @@ agentProfile,

this.model = model ?? "opus";
this.maxTurns = maxTurns ?? 50;
this.maxTurns = maxTurns ?? 50; // 0 means unlimited (omit from SDK)
this.allowedTools = allowedTools ?? [

@@ -60,2 +62,3 @@ "Bash",

this.onLine = onLine ?? null;
this.onBatch = onBatch ?? null;
this.settingSources = settingSources ?? [];

@@ -67,2 +70,4 @@ this.agentProfile = agentProfile ?? null;

this.buffer = [];
/** @type {AbortController|null} */
this.currentAbortController = null;
}

@@ -73,11 +78,9 @@

* @param {string} task - The task prompt
* @returns {Promise<{success: boolean, text: string, sessionId: string|null}>}
* @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
*/
async run(task) {
let text = "";
let stopReason = null;
let error = null;
const abortController = new AbortController();
this.currentAbortController = abortController;
try {
for await (const message of this.query({
const iterator = this.query({
prompt: task,

@@ -87,3 +90,3 @@ options: {

allowedTools: this.allowedTools,
maxTurns: this.maxTurns,
...(this.maxTurns > 0 && { maxTurns: this.maxTurns }),
model: this.model,

@@ -93,2 +96,3 @@ permissionMode: this.permissionMode,

settingSources: this.settingSources,
abortController,
...(this.disallowedTools.length > 0 && {

@@ -100,25 +104,7 @@ disallowedTools: this.disallowedTools,

},
})) {
const line = JSON.stringify(message);
this.output.write(line + "\n");
this.buffer.push(line);
if (this.onLine) this.onLine(line);
if (message.type === "system" && message.subtype === "init") {
this.sessionId = message.session_id;
}
if (message.type === "result") {
text = message.result ?? "";
stopReason = message.subtype;
}
}
} catch (err) {
error = err;
});
return await this.#consumeQuery(iterator);
} finally {
this.currentAbortController = null;
}
// If the SDK already emitted a successful result, honour it even when the
// stream throws afterwards (e.g. "Credit balance is too low" during
// cleanup). Only treat errors as fatal when no result was received yet.
const success = stopReason === "success";
return { success, text, sessionId: this.sessionId, error };
}

@@ -129,11 +115,9 @@

* @param {string} prompt - The follow-up prompt
* @returns {Promise<{success: boolean, text: string}>}
* @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
*/
async resume(prompt) {
let text = "";
let stopReason = null;
let error = null;
const abortController = new AbortController();
this.currentAbortController = abortController;
try {
for await (const message of this.query({
const iterator = this.query({
prompt,

@@ -144,4 +128,42 @@ options: {

allowDangerouslySkipPermissions: true,
abortController,
},
})) {
});
return await this.#consumeQuery(iterator);
} finally {
this.currentAbortController = null;
}
}
/**
* Shared consumer for both `run()` and `resume()`. Iterates the SDK query
* iterator, mirroring every line to the output stream / buffer / onLine
* callback, and — when `onBatch` is set — flushes accumulated lines to it
* at natural boundaries (assistant messages with text blocks, and the
* terminal `result` message).
*
* INVARIANT: the `await this.onBatch(...)` call below is the ONLY
* suspension point in this loop. While it is pending, no further lines
* are pulled from the SDK generator. The Supervisor relies on this — its
* onBatch callback flips `currentSource` to "supervisor" for the duration
* of its mid-turn LLM call, and the invariant guarantees no agent line
* can arrive concurrently and be mis-tagged.
*
* If the supervisor calls `abort()` from inside the callback, the next
* iteration of the for-await loop will throw. We catch the throw, check
* `currentAbortController.signal.aborted` (avoiding fragility around
* AbortError vs DOMException shapes), and report `aborted: true` so the
* caller can distinguish "supervisor asked us to stop" from a real error.
* @param {AsyncIterable<object>} iterator
* @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
*/
async #consumeQuery(iterator) {
let text = "";
let stopReason = null;
let error = null;
let aborted = false;
const pendingBatch = [];
try {
for await (const message of iterator) {
const line = JSON.stringify(message);

@@ -151,3 +173,7 @@ this.output.write(line + "\n");

if (this.onLine) this.onLine(line);
if (this.onBatch) pendingBatch.push(line);
if (message.type === "system" && message.subtype === "init") {
this.sessionId = message.session_id;
}
if (message.type === "result") {

@@ -157,9 +183,24 @@ text = message.result ?? "";

}
const shouldFlush =
this.onBatch &&
(message.type === "result" ||
(message.type === "assistant" && hasTextBlock(message)));
if (shouldFlush) {
const batchLines = pendingBatch.splice(0, pendingBatch.length);
await this.onBatch(batchLines, {
abort: () => this.currentAbortController?.abort(),
});
}
}
} catch (err) {
error = err;
if (this.currentAbortController?.signal.aborted) {
aborted = true;
} else {
error = err;
}
}
const success = stopReason === "success";
return { success, text, error };
return { success, text, sessionId: this.sessionId, error, aborted };
}

@@ -179,2 +220,19 @@

/**
* Whether an SDK assistant message contains at least one text block.
* Tool-only assistant messages return false so they accumulate into the
* pending batch and flush with the next text block (or with the terminal
* `result` message), keeping supervisor LLM cost bounded.
* @param {object} message
* @returns {boolean}
*/
function hasTextBlock(message) {
const content = message.message?.content ?? message.content;
if (!Array.isArray(content)) return false;
for (const block of content) {
if (block.type === "text" && block.text) return true;
}
return false;
}
/**
* Factory function — wires real dependencies.

@@ -181,0 +239,0 @@ * @param {object} deps - Same as AgentRunner constructor

@@ -22,2 +22,34 @@ import { readFileSync, createWriteStream } from "node:fs";

/**
* Parse and validate run command options from args.
* @param {string[]} args
* @returns {{ taskContent: string, cwd: string, model: string, maxTurns: number, outputPath: string|undefined, agentProfile: string|undefined, allowedTools: string[] }}
*/
function parseRunOptions(args) {
const taskFile = parseFlag(args, "task-file");
const taskText = parseFlag(args, "task-text");
if (taskFile && taskText)
throw new Error("--task-file and --task-text are mutually exclusive");
if (!taskFile && !taskText)
throw new Error("--task-file or --task-text is required");
const maxTurnsRaw = parseFlag(args, "max-turns") ?? "50";
const taskAmend = parseFlag(args, "task-amend") ?? undefined;
let taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
if (taskAmend) taskContent += `\n\n${taskAmend}`;
return {
taskContent,
cwd: resolve(parseFlag(args, "cwd") ?? "."),
model: parseFlag(args, "model") ?? "opus",
maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
outputPath: parseFlag(args, "output"),
agentProfile: parseFlag(args, "agent-profile") ?? undefined,
allowedTools: (
parseFlag(args, "allowed-tools") ??
"Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite"
).split(","),
};
}
/**
* Run command — execute a single agent via the Claude Agent SDK.

@@ -32,6 +64,7 @@ *

* --model=MODEL Claude model to use (default: opus)
* --max-turns=N Maximum agentic turns (default: 50)
* --max-turns=N Maximum agentic turns (default: 50, 0 = unlimited)
* --output=PATH Write NDJSON trace to file (default: stdout)
* --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
* --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
* --task-amend=TEXT Additional text appended to the task prompt
*

@@ -41,20 +74,12 @@ * @param {string[]} args - Command arguments

export async function runRunCommand(args) {
const taskFile = parseFlag(args, "task-file");
const taskText = parseFlag(args, "task-text");
if (taskFile && taskText)
throw new Error("--task-file and --task-text are mutually exclusive");
if (!taskFile && !taskText)
throw new Error("--task-file or --task-text is required");
const {
taskContent,
cwd,
model,
maxTurns,
outputPath,
agentProfile,
allowedTools,
} = parseRunOptions(args);
const cwd = resolve(parseFlag(args, "cwd") ?? ".");
const model = parseFlag(args, "model") ?? "opus";
const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "50", 10);
const outputPath = parseFlag(args, "output");
const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
const allowedTools = (
parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
).split(",");
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
// When --output is specified, stream text to stdout while writing NDJSON to file.

@@ -61,0 +86,0 @@ // Otherwise, write NDJSON directly to stdout (backwards-compatible).

@@ -23,2 +23,46 @@ import { readFileSync, createWriteStream, mkdtempSync } from "node:fs";

/**
* Parse all supervise flags from args into an options object.
* @param {string[]} args
* @returns {object}
*/
function parseSuperviseOptions(args) {
const taskFile = parseFlag(args, "task-file");
const taskText = parseFlag(args, "task-text");
if (taskFile && taskText)
throw new Error("--task-file and --task-text are mutually exclusive");
if (!taskFile && !taskText)
throw new Error("--task-file or --task-text is required");
const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");
const taskAmend = parseFlag(args, "task-amend") ?? undefined;
let taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
if (taskAmend) taskContent += `\n\n${taskAmend}`;
return {
taskContent,
supervisorCwd: resolve(parseFlag(args, "supervisor-cwd") ?? "."),
agentCwd: resolve(
parseFlag(args, "agent-cwd") ??
mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
),
model: parseFlag(args, "model") ?? "opus",
maxTurns: (() => {
const raw = parseFlag(args, "max-turns") ?? "20";
return raw === "0" ? 0 : parseInt(raw, 10);
})(),
outputPath: parseFlag(args, "output"),
supervisorProfile: parseFlag(args, "supervisor-profile") ?? undefined,
agentProfile: parseFlag(args, "agent-profile") ?? undefined,
allowedTools: (
parseFlag(args, "allowed-tools") ??
"Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite"
).split(","),
supervisorAllowedTools: supervisorAllowedToolsRaw
? supervisorAllowedToolsRaw.split(",")
: undefined,
};
}
/**
* Supervise command — run two agents in a relay loop via the Claude Agent SDK.

@@ -34,3 +78,3 @@ *

* --model=MODEL Claude model to use (default: opus)
* --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
* --max-turns=N Maximum supervisor / agent exchanges (default: 20, 0 = unlimited)
* --output=PATH Write NDJSON trace to file (default: stdout)

@@ -40,2 +84,3 @@ * --allowed-tools=LIST Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit)

* --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
* --task-amend=TEXT Additional text appended to the task prompt
*

@@ -45,32 +90,9 @@ * @param {string[]} args - Command arguments

export async function runSuperviseCommand(args) {
const taskFile = parseFlag(args, "task-file");
const taskText = parseFlag(args, "task-text");
if (taskFile && taskText)
throw new Error("--task-file and --task-text are mutually exclusive");
if (!taskFile && !taskText)
throw new Error("--task-file or --task-text is required");
const opts = parseSuperviseOptions(args);
const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");
const agentCwd = resolve(
parseFlag(args, "agent-cwd") ??
mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
);
const model = parseFlag(args, "model") ?? "opus";
const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "20", 10);
const outputPath = parseFlag(args, "output");
const supervisorProfile = parseFlag(args, "supervisor-profile") ?? undefined;
const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
const allowedTools = (
parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
).split(",");
const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");
const supervisorAllowedTools = supervisorAllowedToolsRaw
? supervisorAllowedToolsRaw.split(",")
: undefined;
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
// When --output is specified, stream text to stdout while writing NDJSON to file.
// Otherwise, write NDJSON directly to stdout (backwards-compatible).
const fileStream = outputPath ? createWriteStream(outputPath) : null;
const fileStream = opts.outputPath
? createWriteStream(opts.outputPath)
: null;
const output = fileStream

@@ -86,15 +108,15 @@ ? createTeeWriter({

const supervisor = createSupervisor({
supervisorCwd,
agentCwd,
supervisorCwd: opts.supervisorCwd,
agentCwd: opts.agentCwd,
query,
output,
model,
maxTurns,
allowedTools,
supervisorAllowedTools,
supervisorProfile,
agentProfile,
model: opts.model,
maxTurns: opts.maxTurns,
allowedTools: opts.allowedTools,
supervisorAllowedTools: opts.supervisorAllowedTools,
supervisorProfile: opts.supervisorProfile,
agentProfile: opts.agentProfile,
});
const result = await supervisor.run(taskContent);
const result = await supervisor.run(opts.taskContent);

@@ -101,0 +123,0 @@ if (fileStream) {

@@ -16,4 +16,4 @@ /**

* Check if the supervisor's response signals evaluation success.
* Matches EVALUATION_SUCCESSFUL anywhere in the text, tolerating markdown
* formatting (e.g. **EVALUATION_SUCCESSFUL**). Uses word boundaries to
* Matches EVALUATION_COMPLETE anywhere in the text, tolerating markdown
* formatting (e.g. **EVALUATION_COMPLETE**). Uses word boundaries to
* avoid matching inside longer identifiers.

@@ -23,16 +23,40 @@ * @param {string} text

*/
export function isSuccessful(text) {
return /(?:^|[\s*_~`])EVALUATION_SUCCESSFUL(?:[\s*_~`.,!?]|$)/m.test(text);
export function isComplete(text) {
return /(?:^|[\s*_~`])EVALUATION_COMPLETE(?:[\s*_~`.,!?]|$)/m.test(text);
}
/**
* Check if the supervisor's response signals a mid-turn intervention.
* Same tolerance rules as isComplete (markdown formatting, word boundaries),
* but matches the EVALUATION_INTERVENTION keyword instead.
* @param {string} text
* @returns {boolean}
*/
export function isIntervention(text) {
return /(?:^|[\s*_~`])EVALUATION_INTERVENTION(?:[\s*_~`.,!?]|$)/m.test(text);
}
/** System prompt appended for the supervisor runner in supervise mode. */
export const SUPERVISOR_SYSTEM_PROMPT =
"You supervise another AI agent through a relay — your output becomes the agent's next input. " +
"Guide the agent, answer its questions, and write EVALUATION_SUCCESSFUL when their task is complete.";
"You relay messages to one persistent agent session — your only output " +
"channel. Spawning sub-agents or restarting the agent is blocked. Do not " +
"do the work yourself. Reply briefly to let the agent continue, write " +
"EVALUATION_INTERVENTION + instructions to interrupt mid-turn, or " +
"EVALUATION_COMPLETE when done. Only your final message each turn is " +
"relayed.";
/** System prompt appended for the agent runner in supervise mode. */
export const AGENT_SYSTEM_PROMPT =
"You are being supervised by another AI agent. " +
"When requirements are ambiguous or you are uncertain, stop and ask a clarifying question before proceeding.";
"A supervisor watches your work and may interrupt with new instructions " +
"mid-task. Treat any new prompt as authoritative and adjust course. " +
"When uncertain, stop and ask a clarifying question.";
/**
* Maximum number of mid-turn interventions allowed within a single agent turn.
* Bounded so a looping supervisor exhausts its quota fast (observability) but
* leaves headroom for legitimate "intervene, observe, intervene again" patterns.
* The outer maxTurns budget still bounds overall runtime.
*/
const MAX_INTERVENTIONS_PER_TURN = 5;
export class Supervisor {

@@ -61,3 +85,3 @@ /**

* The SDK result text only reflects the last assistant message, so when
* the supervisor writes EVALUATION_SUCCESSFUL in an early message and
* the supervisor writes EVALUATION_COMPLETE in an early message and
* then continues with follow-up work, the result text won't contain it.

@@ -67,3 +91,19 @@ * This flag captures the signal from the full message stream.

*/
this.successSignalSeen = false;
this.completeSignalSeen = false;
/**
* Set to true when any supervisor message contains EVALUATION_INTERVENTION.
* Mirrors completeSignalSeen — populated by emitLine when a supervisor
* assistant text block matches isIntervention(...). The mid-turn loop
* reads this flag after each supervisor invocation to decide whether to
* abort the agent's in-flight SDK session.
* @type {boolean}
*/
this.interventionSignalSeen = false;
/**
* The most recent supervisor SDK result captured inside the mid-turn
* onBatch callback. The outer loop reads this after the agent aborts to
* build the next relay prompt without re-running the supervisor.
* @type {{success: boolean, text: string}|null}
*/
this.lastSupervisorResult = null;
}

@@ -82,3 +122,5 @@

this.currentTurn = 0;
this.successSignalSeen = false;
this.completeSignalSeen = false;
this.interventionSignalSeen = false;
this.lastSupervisorResult = null;
let supervisorResult = await this.supervisorRunner.run(task);

@@ -93,6 +135,6 @@

// streamed message content. The SDK result text only reflects the last
// assistant message, so when the supervisor writes EVALUATION_SUCCESSFUL
// assistant message, so when the supervisor writes EVALUATION_COMPLETE
// early and then continues (e.g. filing issues), we must also check the
// flag set by emitLine during streaming.
if (this.successSignalSeen || isSuccessful(supervisorResult.text)) {
if (this.completeSignalSeen || isComplete(supervisorResult.text)) {
this.emitSummary({ success: true, turns: 0 });

@@ -102,49 +144,181 @@ return { success: true, turns: 0 };

for (let turn = 1; turn <= this.maxTurns; turn++) {
// Supervisor's output becomes the agent's input
this.currentSource = "agent";
this.currentTurn = turn;
let agentResult;
if (turn === 1) {
agentResult = await this.agentRunner.run(supervisorResult.text);
} else {
agentResult = await this.agentRunner.resume(supervisorResult.text);
}
const turnLimit = this.maxTurns === 0 ? Infinity : this.maxTurns;
for (let turn = 1; turn <= turnLimit; turn++) {
// Only the supervisor's final message is relayed to the agent.
// Extract the last assistant text block from the buffer to avoid
// leaking intermediate reasoning (research, tool calls, notes).
const relay = this.extractLastText(
this.supervisorRunner,
supervisorResult.text,
);
if (agentResult.error) {
this.emitSummary({ success: false, turns: turn });
return { success: false, turns: turn };
}
// Drive the agent through interventions until its SDK session ends
// naturally, the supervisor signals completion mid-turn, or the
// per-turn intervention budget is exhausted.
const turnOutcome = await this.#runAgentTurn(turn, relay);
if (turnOutcome.exit) return turnOutcome.exit;
// Build the full agent transcript from buffered NDJSON events so the
// supervisor sees tool calls and reasoning, not just the SDK result summary.
const agentTranscript = this.extractTranscript(this.agentRunner);
// End-of-turn review (existing behaviour). Returns either an exit
// outcome (error or completion) or the supervisor result for the
// next turn's relay.
const reviewOutcome = await this.#endOfTurnReview(turn);
if (reviewOutcome.exit) return reviewOutcome.exit;
supervisorResult = reviewOutcome.supervisorResult;
}
const supervisorPrompt =
`The agent reported:\n\n${agentTranscript}\n\n` +
`Review the agent's work and decide how to proceed.`;
this.emitSummary({ success: false, turns: this.maxTurns });
return { success: false, turns: this.maxTurns };
}
this.currentSource = "supervisor";
this.currentTurn = turn;
this.successSignalSeen = false;
supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
/**
* Drive the agent through one turn, allowing the supervisor to interrupt
* mid-stream via EVALUATION_INTERVENTION. Returns either an `exit` outcome
* (the loop should return immediately) or `{exit: null}` (proceed to the
* end-of-turn review).
* @param {number} turn
* @param {string} initialRelay
* @returns {Promise<{exit: {success: boolean, turns: number}|null}>}
*/
async #runAgentTurn(turn, initialRelay) {
let relay = initialRelay;
let interventions = 0;
if (supervisorResult.error) {
this.emitSummary({ success: false, turns: turn });
return { success: false, turns: turn };
}
// Wire the mid-turn observation hook on the agent runner. The bound
// callback captures `turn` so the inner loop's multiple resume(...)
// calls all see the same turn id. The supervisorRunner does NOT get
// an onBatch callback — it only fires onLine, which is enough for
// emitLine to detect EVALUATION_COMPLETE / EVALUATION_INTERVENTION.
this.agentRunner.onBatch = (batchLines, ctx) =>
this.#midTurnReview(turn, batchLines, ctx);
// The supervisor's turn is fully complete — check for success signal
// in either the SDK result text or streamed messages.
if (this.successSignalSeen || isSuccessful(supervisorResult.text)) {
this.emitSummary({ success: true, turns: turn });
return { success: true, turns: turn };
try {
while (true) {
this.currentSource = "agent";
this.currentTurn = turn;
const isFirstAgentCall = turn === 1 && interventions === 0;
const agentResult = isFirstAgentCall
? await this.agentRunner.run(relay)
: await this.agentRunner.resume(relay);
if (agentResult.error && !agentResult.aborted) {
this.emitSummary({ success: false, turns: turn });
return { exit: { success: false, turns: turn } };
}
// Mid-turn EVALUATION_COMPLETE: end the session immediately.
if (this.completeSignalSeen) {
this.emitSummary({ success: true, turns: turn });
return { exit: { success: true, turns: turn } };
}
if (agentResult.aborted && this.interventionSignalSeen) {
interventions++;
if (interventions >= MAX_INTERVENTIONS_PER_TURN) {
this.emitOrchestratorEvent({ type: "intervention_limit", turn });
return { exit: null };
}
relay = this.extractLastText(
this.supervisorRunner,
this.lastSupervisorResult?.text ?? "",
);
this.emitOrchestratorEvent({ type: "intervention_relayed", turn });
continue;
}
// Agent's SDK session finished naturally — proceed to end-of-turn.
return { exit: null };
}
} finally {
// Detach onBatch before the end-of-turn review so the supervisor's
// own SDK session does not trigger nested onBatch fires.
this.agentRunner.onBatch = null;
}
}
this.emitSummary({ success: false, turns: this.maxTurns });
return { success: false, turns: this.maxTurns };
/**
* Mid-turn supervisor review fired from inside the agent's onBatch hook.
* Emits a `mid_turn_review` orchestrator marker, runs the supervisor's
* LLM against the batch, and aborts the agent if the supervisor signals
* EVALUATION_INTERVENTION or EVALUATION_COMPLETE.
* @param {number} turn
* @param {string[]} batchLines
* @param {{abort: () => void}} ctx
*/
async #midTurnReview(turn, batchLines, { abort }) {
const batchTranscript = this.renderBatch(batchLines);
// Order matters: emit the orchestrator marker BEFORE the supervisor
// LLM call so the trace reads
// agent line → orchestrator:mid_turn_review
// → supervisor lines (tagged turn:N)
// → orchestrator:intervention_requested|complete_requested
this.emitOrchestratorEvent({ type: "mid_turn_review", turn });
// currentTurn stays = turn so mid-turn supervisor lines share the
// agent's turn id. They are distinguishable from end-of-turn reviews
// by the surrounding orchestrator events emitted around this call.
this.currentSource = "supervisor";
this.completeSignalSeen = false;
this.interventionSignalSeen = false;
this.lastSupervisorResult = await this.supervisorRunner.resume(
`The agent is mid-turn. Latest batch:\n\n${batchTranscript}\n\n` +
`Respond with a brief acknowledgement to let it continue, or write ` +
`EVALUATION_INTERVENTION followed by a corrective message to stop ` +
`and relay a new instruction. Write EVALUATION_COMPLETE only when ` +
`the task is fully done.`,
);
this.currentSource = "agent";
if (this.interventionSignalSeen) {
this.emitOrchestratorEvent({ type: "intervention_requested", turn });
abort();
return;
}
if (this.completeSignalSeen) {
this.emitOrchestratorEvent({ type: "complete_requested", turn });
abort();
}
// Non-intervention: do nothing; the agent loop pulls the next line.
}
/**
* End-of-turn supervisor review (existing behaviour). Returns either an
* exit outcome (error or completion) or the supervisor result so the
* outer loop can build the next turn's relay.
* @param {number} turn
* @returns {Promise<{exit: {success: boolean, turns: number}|null, supervisorResult?: object}>}
*/
async #endOfTurnReview(turn) {
// Build the full agent transcript from buffered NDJSON events so the
// supervisor sees tool calls and reasoning, not just the SDK result.
const agentTranscript = this.extractTranscript(this.agentRunner);
const supervisorPrompt =
`The agent reported:\n\n${agentTranscript}\n\n` +
`Review the agent's work and decide how to proceed.`;
this.currentSource = "supervisor";
this.currentTurn = turn;
this.completeSignalSeen = false;
this.interventionSignalSeen = false;
const supervisorResult =
await this.supervisorRunner.resume(supervisorPrompt);
if (supervisorResult.error) {
this.emitSummary({ success: false, turns: turn });
return { exit: { success: false, turns: turn } };
}
// The supervisor's turn is fully complete — check for success signal
// in either the SDK result text or streamed messages.
if (this.completeSignalSeen || isComplete(supervisorResult.text)) {
this.emitSummary({ success: true, turns: turn });
return { exit: { success: true, turns: turn } };
}
return { exit: null, supervisorResult };
}
/**
* Extract a human-readable transcript from an AgentRunner's buffered output.

@@ -165,2 +339,27 @@ * Drains the buffer and replays events through a TraceCollector.

/**
* Extract only the last assistant text block from an AgentRunner's buffer.
* Scans buffered NDJSON events in reverse to find the final assistant message
* with a text content block. This prevents intermediate reasoning (tool calls,
* research notes) from leaking to the agent.
* @param {import("./agent-runner.js").AgentRunner} runner
* @param {string} fallback - Fallback text if no assistant text block is found
* @returns {string}
*/
extractLastText(runner, fallback) {
const lines = runner.buffer;
for (let i = lines.length - 1; i >= 0; i--) {
const event = JSON.parse(lines[i]);
if (event.type !== "assistant") continue;
const content = event.message?.content ?? event.content;
if (!Array.isArray(content)) continue;
for (let j = content.length - 1; j >= 0; j--) {
if (content[j].type === "text" && content[j].text) {
return content[j].text;
}
}
}
return fallback;
}
/**
* Emit a single NDJSON line tagged with the current source and turn.

@@ -170,3 +369,4 @@ * Called in real-time via the AgentRunner onLine callback.

* When the current source is the supervisor, also scans assistant text
* content for the EVALUATION_SUCCESSFUL signal and sets successSignalSeen.
* content for the EVALUATION_COMPLETE and EVALUATION_INTERVENTION signals,
* setting completeSignalSeen / interventionSignalSeen respectively.
* @param {string} line - Raw NDJSON line from the runner

@@ -183,6 +383,6 @@ */

// Scan supervisor assistant messages for the success signal in real time.
// Scan supervisor assistant messages for the signals in real time.
// The SDK result text only reflects the final assistant message, but the
// supervisor may write EVALUATION_SUCCESSFUL in an earlier message and
// then continue with follow-up tool calls.
// supervisor may write EVALUATION_COMPLETE / EVALUATION_INTERVENTION in
// an earlier message and then continue with follow-up tool calls.
if (this.currentSource === "supervisor" && event.type === "assistant") {

@@ -192,5 +392,5 @@ const content = event.message?.content ?? event.content ?? [];

for (const block of content) {
if (block.type === "text" && isSuccessful(block.text)) {
this.successSignalSeen = true;
}
if (block.type !== "text" || !block.text) continue;
if (isComplete(block.text)) this.completeSignalSeen = true;
if (isIntervention(block.text)) this.interventionSignalSeen = true;
}

@@ -202,2 +402,37 @@ }

/**
* Render a batch of buffered NDJSON lines as human-readable text for the
* mid-turn supervisor prompt. Reuses the TraceCollector pipeline so the
* supervisor sees tool calls and reasoning, not just raw events.
* @param {string[]} batchLines
* @returns {string}
*/
renderBatch(batchLines) {
if (batchLines.length === 0) return "[empty]";
const collector = new TraceCollector();
for (const line of batchLines) {
collector.addLine(line);
}
return collector.toText() || "[empty]";
}
/**
* Emit an orchestrator-source NDJSON line. Used by the mid-turn loop to
* mark mid_turn_review / intervention_requested / intervention_relayed /
* intervention_limit / complete_requested boundaries in the trace, so the
* improvement coach can distinguish mid-turn supervisor activity from
* end-of-turn reviews. Additive to existing trace shape — the parser
* already reads `source` and ignores unknown event types.
* @param {{type: string, turn?: number}} event
*/
emitOrchestratorEvent(event) {
this.output.write(
JSON.stringify({
source: "orchestrator",
turn: this.currentTurn,
event,
}) + "\n",
);
}
/**
* Emit a final orchestrator summary line.

@@ -268,6 +503,10 @@ * @param {{success: boolean, turns: number}} result

// Block Task/TaskOutput so the supervisor cannot spawn its own sub-agents.
// The relay loop handles agent communication — letting the supervisor use
// Task would bypass the relay and produce an empty agent trace.
const defaultDisallowed = ["Task", "TaskOutput"];
// Block every sub-agent spawning tool so the supervisor cannot bypass the
// relay loop. The current Claude Agent SDK exposes the spawn tool to the
// model as `Agent`; older versions called it `Task`. Both are blocked
// (along with TaskOutput/TaskStop) so the supervisor sees no spawn tool
// regardless of which SDK version is installed. Letting the supervisor
// spawn its own sub-agent would bypass the relay and produce an empty
// agent trace, which is the failure mode that motivated this default.
const defaultDisallowed = ["Agent", "Task", "TaskOutput", "TaskStop"];
const disallowedTools = supervisorDisallowedTools

@@ -274,0 +513,0 @@ ? [...new Set([...defaultDisallowed, ...supervisorDisallowedTools])]

import { describe, test } from "node:test";
import assert from "node:assert";
import { PassThrough } from "node:stream";
import {
AgentRunner,
Supervisor,
createSupervisor,
SUPERVISOR_SYSTEM_PROMPT,
AGENT_SYSTEM_PROMPT,
} from "@forwardimpact/libeval";
import { isSuccessful } from "../src/supervisor.js";
/**
* Create a mock AgentRunner that yields pre-scripted responses.
* Each call to run() or resume() pops the next response from the array.
* @param {object[]} responses - Array of {text, success} objects
* @param {object[]} [messages] - Messages to buffer per turn
* @returns {AgentRunner}
*/
function createMockRunner(responses, messages) {
const output = new PassThrough();
let callIndex = 0;
const runner = new AgentRunner({
cwd: "/tmp",
query: async function* () {},
output,
});
// Override run and resume to return scripted responses
runner.run = async (_task) => {
const resp = responses[callIndex++];
const msgs = messages?.[callIndex - 1] ?? [
{ type: "assistant", content: resp.text },
];
for (const m of msgs) {
const line = JSON.stringify(m);
runner.buffer.push(line);
if (runner.onLine) runner.onLine(line);
}
runner.sessionId = "mock-session";
return {
success: resp.success ?? true,
text: resp.text,
sessionId: "mock-session",
};
};
runner.resume = async (_prompt) => {
const resp = responses[callIndex++];
const msgs = messages?.[callIndex - 1] ?? [
{ type: "assistant", content: resp.text },
];
for (const m of msgs) {
const line = JSON.stringify(m);
runner.buffer.push(line);
if (runner.onLine) runner.onLine(line);
}
return { success: resp.success ?? true, text: resp.text };
};
return runner;
}
describe("isSuccessful", () => {
test("detects EVALUATION_SUCCESSFUL on its own line", () => {
assert.strictEqual(isSuccessful("EVALUATION_SUCCESSFUL"), true);
assert.strictEqual(
isSuccessful("Some text\nEVALUATION_SUCCESSFUL\nMore text"),
true,
);
assert.strictEqual(isSuccessful("Done.\n\nEVALUATION_SUCCESSFUL"), true);
});
test("tolerates markdown formatting around the signal", () => {
assert.strictEqual(isSuccessful("**EVALUATION_SUCCESSFUL**"), true);
assert.strictEqual(isSuccessful("*EVALUATION_SUCCESSFUL*"), true);
assert.strictEqual(isSuccessful("__EVALUATION_SUCCESSFUL__"), true);
assert.strictEqual(isSuccessful("_EVALUATION_SUCCESSFUL_"), true);
assert.strictEqual(isSuccessful("`EVALUATION_SUCCESSFUL`"), true);
assert.strictEqual(
isSuccessful(
"Good work.\n\n**EVALUATION_SUCCESSFUL**\n\nNow filing issues.",
),
true,
);
});
test("matches EVALUATION_SUCCESSFUL anywhere in text", () => {
assert.strictEqual(isSuccessful("not EVALUATION_SUCCESSFUL yet"), true);
assert.strictEqual(
isSuccessful("The agent is EVALUATION_SUCCESSFUL done"),
true,
);
assert.strictEqual(
isSuccessful("Great work! EVALUATION_SUCCESSFUL. Now filing issues."),
true,
);
});
test("does not match empty or unrelated text", () => {
assert.strictEqual(isSuccessful(""), false);
assert.strictEqual(isSuccessful("All done!"), false);
assert.strictEqual(isSuccessful("DONE"), false);
});
test("does not match old EVALUATION_COMPLETE signal", () => {
assert.strictEqual(isSuccessful("EVALUATION_COMPLETE"), false);
});
});
describe("Supervisor", () => {
test("constructor throws on missing agentRunner", () => {
assert.throws(
() =>
new Supervisor({
supervisorRunner: createMockRunner([]),
output: new PassThrough(),
}),
/agentRunner is required/,
);
});
test("constructor throws on missing supervisorRunner", () => {
assert.throws(
() =>
new Supervisor({
agentRunner: createMockRunner([]),
output: new PassThrough(),
}),
/supervisorRunner is required/,
);
});
test("constructor throws on missing output", () => {
assert.throws(
() =>
new Supervisor({
agentRunner: createMockRunner([]),
supervisorRunner: createMockRunner([]),
}),
/output is required/,
);
});
test("completes on EVALUATION_SUCCESSFUL from supervisor at turn 0", async () => {
const agentRunner = createMockRunner([]);
const supervisorRunner = createMockRunner([
{ text: "EVALUATION_SUCCESSFUL" },
]);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
const result = await supervisor.run("Install stuff");
assert.strictEqual(result.success, true);
assert.strictEqual(result.turns, 0);
});
test("completes after one agent turn", async () => {
const agentRunner = createMockRunner([
{ text: "I installed the packages." },
]);
const supervisorRunner = createMockRunner([
{ text: "Welcome! Please install the packages." },
{ text: "Good work.\n\nEVALUATION_SUCCESSFUL" },
]);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
const result = await supervisor.run("Install stuff");
assert.strictEqual(result.success, true);
assert.strictEqual(result.turns, 1);
});
test("detects EVALUATION_SUCCESSFUL in streamed messages when result text differs", async () => {
// Simulates the real failure: supervisor writes EVALUATION_SUCCESSFUL in
// an early message, then continues with follow-up work (e.g. filing issues).
// The SDK result text reflects only the final message, which does NOT
// contain the signal.
const agentRunner = createMockRunner([
{ text: "I installed the packages." },
]);
// The supervisor's result text is the Summary (no signal), but messages
// include one with EVALUATION_SUCCESSFUL.
const supervisorMessages = [
undefined, // turn 0: use default
[
{
type: "assistant",
message: {
content: [
{
type: "text",
text: "Good work.\n\nEVALUATION_SUCCESSFUL\n\nNow filing issues.",
},
],
},
},
{
type: "assistant",
message: {
content: [
{ type: "text", text: "## Summary\n\nAll issues filed." },
],
},
},
],
];
const supervisorRunner = createMockRunner(
[
{ text: "Welcome! Please install the packages." },
// Result text is the final message — does NOT contain the signal
{ text: "## Summary\n\nAll issues filed." },
],
supervisorMessages,
);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
agentRunner.onLine = (line) => supervisor.emitLine(line);
supervisorRunner.onLine = (line) => supervisor.emitLine(line);
const result = await supervisor.run("Install stuff");
assert.strictEqual(result.success, true);
assert.strictEqual(result.turns, 1);
});
test("runs multiple turns before completion", async () => {
const agentRunner = createMockRunner([
{ text: "Started working." },
{ text: "Made progress." },
{ text: "Finished everything." },
]);
const supervisorRunner = createMockRunner([
{ text: "Here is your task. Do the work." },
{ text: "Keep going, you need to do more." },
{ text: "Almost there, continue." },
{ text: "EVALUATION_SUCCESSFUL" },
]);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
const result = await supervisor.run("Do the work");
assert.strictEqual(result.success, true);
assert.strictEqual(result.turns, 3);
});
test("enforces maxTurns limit", async () => {
// Supervisor starts, agent responds each turn, supervisor never says done
const agentRunner = createMockRunner([
{ text: "Turn 1" },
{ text: "Turn 2" },
]);
const supervisorRunner = createMockRunner([
{ text: "Start working." },
{ text: "Continue." },
{ text: "Continue." },
]);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 2,
});
const result = await supervisor.run("Endless task");
assert.strictEqual(result.success, false);
assert.strictEqual(result.turns, 2);
});
test("output contains tagged lines with correct source and turn", async () => {
const supervisorMessages = [
[{ type: "assistant", content: "Go ahead" }],
[{ type: "assistant", content: "EVALUATION_SUCCESSFUL" }],
];
const agentMessages = [[{ type: "assistant", content: "Working" }]];
const supervisorRunner = createMockRunner(
[{ text: "Go ahead" }, { text: "EVALUATION_SUCCESSFUL" }],
supervisorMessages,
);
const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
agentRunner.onLine = (line) => supervisor.emitLine(line);
supervisorRunner.onLine = (line) => supervisor.emitLine(line);
await supervisor.run("Task");
const data = output.read()?.toString() ?? "";
const lines = data
.trim()
.split("\n")
.filter((l) => l.length > 0);
// Should have: supervisor turn 0, agent turn 1, supervisor turn 1, orchestrator summary
assert.ok(lines.length >= 4);
const supervisorLine = JSON.parse(lines[0]);
assert.strictEqual(supervisorLine.source, "supervisor");
assert.strictEqual(supervisorLine.turn, 0);
assert.ok("event" in supervisorLine);
const agentLine = JSON.parse(lines[1]);
assert.strictEqual(agentLine.source, "agent");
assert.strictEqual(agentLine.turn, 1);
assert.ok("event" in agentLine);
const summaryLine = JSON.parse(lines[lines.length - 1]);
assert.strictEqual(summaryLine.source, "orchestrator");
assert.strictEqual(summaryLine.type, "summary");
assert.strictEqual(summaryLine.success, true);
});
test("events are nested under event key (no field collisions)", async () => {
const sourceEvent = {
type: "assistant",
source: "sdk-internal",
content: "test",
};
const supervisorRunner = createMockRunner(
[{ text: "Go" }, { text: "EVALUATION_SUCCESSFUL" }],
[
[{ type: "assistant", content: "Go" }],
[{ type: "assistant", content: "ok" }],
],
);
const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
agentRunner.onLine = (line) => supervisor.emitLine(line);
supervisorRunner.onLine = (line) => supervisor.emitLine(line);
await supervisor.run("Task");
const data = output.read()?.toString() ?? "";
const lines = data
.trim()
.split("\n")
.filter((l) => l.length > 0);
// First line is supervisor turn 0, second is agent turn 1
const tagged = JSON.parse(lines[1]);
// The original event's `source` field is preserved inside `event`
assert.strictEqual(tagged.source, "agent");
assert.strictEqual(tagged.event.source, "sdk-internal");
});
test("emits supervisor output and summary when supervisor errors on turn 0", async () => {
const supervisorMessages = [
[{ type: "assistant", content: "Starting..." }],
];
const supervisorRunner = createMockRunner(
[{ text: "Starting...", success: false }],
supervisorMessages,
);
// Override run to simulate an error return
const origRun = supervisorRunner.run;
supervisorRunner.run = async (task) => {
const result = await origRun.call(supervisorRunner, task);
return { ...result, error: new Error("Process exited with code 1") };
};
const agentRunner = createMockRunner([]);
const output = new PassThrough();
const supervisor = new Supervisor({
agentRunner,
supervisorRunner,
output,
maxTurns: 10,
});
agentRunner.onLine = (line) => supervisor.emitLine(line);
supervisorRunner.onLine = (line) => supervisor.emitLine(line);
const result = await supervisor.run("Task");
assert.strictEqual(result.success, false);
assert.strictEqual(result.turns, 0);
// Output should still contain the supervisor's buffered lines + summary
const data = output.read()?.toString() ?? "";
const lines = data
.trim()
.split("\n")
.filter((l) => l.length > 0);
assert.ok(lines.length >= 2, "Expected at least supervisor line + summary");
const supervisorLine = JSON.parse(lines[0]);
assert.strictEqual(supervisorLine.source, "supervisor");
assert.strictEqual(supervisorLine.turn, 0);
const summaryLine = JSON.parse(lines[lines.length - 1]);
assert.strictEqual(summaryLine.source, "orchestrator");
assert.strictEqual(summaryLine.success, false);
assert.strictEqual(summaryLine.turns, 0);
});
test("createSupervisor factory returns a Supervisor instance", () => {
const supervisor = createSupervisor({
supervisorCwd: "/tmp/sup",
agentCwd: "/tmp/agent",
query: async function* () {},
output: new PassThrough(),
});
assert.ok(supervisor instanceof Supervisor);
});
test("createSupervisor uses default supervisor tools when none specified", () => {
const supervisor = createSupervisor({
supervisorCwd: "/tmp/sup",
agentCwd: "/tmp/agent",
query: async function* () {},
output: new PassThrough(),
});
assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
"Bash",
"Read",
"Glob",
"Grep",
"Write",
"Edit",
]);
});
test("createSupervisor passes custom supervisor tools", () => {
const supervisor = createSupervisor({
supervisorCwd: "/tmp/sup",
agentCwd: "/tmp/agent",
query: async function* () {},
output: new PassThrough(),
supervisorAllowedTools: ["Read", "Glob", "Grep"],
});
assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
"Read",
"Glob",
"Grep",
]);
});
test("createSupervisor wires system prompts to both runners", () => {
const supervisor = createSupervisor({
supervisorCwd: "/tmp/sup",
agentCwd: "/tmp/agent",
query: async function* () {},
output: new PassThrough(),
});
assert.deepStrictEqual(supervisor.agentRunner.systemPrompt, {
type: "preset",
preset: "claude_code",
append: AGENT_SYSTEM_PROMPT,
});
assert.deepStrictEqual(supervisor.supervisorRunner.systemPrompt, {
type: "preset",
preset: "claude_code",
append: SUPERVISOR_SYSTEM_PROMPT,
});
});
test("createSupervisor blocks Task and TaskOutput on supervisor by default", () => {
const supervisor = createSupervisor({
supervisorCwd: "/tmp/sup",
agentCwd: "/tmp/agent",
query: async function* () {},
output: new PassThrough(),
});
assert.deepStrictEqual(supervisor.supervisorRunner.disallowedTools, [
"Task",
"TaskOutput",
]);
// Agent should not have disallowed tools
assert.deepStrictEqual(supervisor.agentRunner.disallowedTools, []);
});
test("createSupervisor merges custom supervisorDisallowedTools with defaults", () => {
const supervisor = createSupervisor({
supervisorCwd: "/tmp/sup",
agentCwd: "/tmp/agent",
query: async function* () {},
output: new PassThrough(),
supervisorDisallowedTools: ["WebSearch", "Task"],
});
const disallowed = supervisor.supervisorRunner.disallowedTools;
assert.ok(disallowed.includes("Task"));
assert.ok(disallowed.includes("TaskOutput"));
assert.ok(disallowed.includes("WebSearch"));
// No duplicates
assert.strictEqual(disallowed.length, new Set(disallowed).size);
});
test("system prompt constants are non-empty strings", () => {
assert.ok(typeof SUPERVISOR_SYSTEM_PROMPT === "string");
assert.ok(typeof AGENT_SYSTEM_PROMPT === "string");
assert.ok(SUPERVISOR_SYSTEM_PROMPT.length > 0);
assert.ok(AGENT_SYSTEM_PROMPT.length > 0);
});
test("SUPERVISOR_SYSTEM_PROMPT explains relay mechanism", () => {
assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("relay"));
assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("EVALUATION_SUCCESSFUL"));
});
});