@forwardimpact/libeval
Advanced tools
+12
-4
@@ -28,3 +28,4 @@ #!/usr/bin/env node | ||
| Run options: | ||
| --task=PATH Path to task file (required) | ||
| --task-file=PATH Path to task file (mutually exclusive with --task-text) | ||
| --task-text=STRING Inline task text (mutually exclusive with --task-file) | ||
| --cwd=DIR Agent working directory (default: .) | ||
@@ -35,5 +36,7 @@ --model=MODEL Claude model to use (default: opus) | ||
| --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit) | ||
| --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI) | ||
| Supervise options: | ||
| --task=PATH Path to task file (required) | ||
| --task-file=PATH Path to task file (mutually exclusive with --task-text) | ||
| --task-text=STRING Inline task text (mutually exclusive with --task-file) | ||
| --supervisor-cwd=DIR Supervisor working directory (default: .) | ||
@@ -45,2 +48,6 @@ --agent-cwd=DIR Agent working directory (default: temp directory) | ||
| --allowed-tools=LIST Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit) | ||
| --supervisor-allowed-tools=LIST | ||
| Comma-separated tools for supervisor (default: Bash,Read,Glob,Grep,Write,Edit) | ||
| --supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI) | ||
| --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI) | ||
@@ -56,4 +63,5 @@ Options: | ||
| fit-eval tee output.ndjson < trace.ndjson | ||
| fit-eval run --task=.github/tasks/security-audit.md --model=opus | ||
| fit-eval supervise --task=scenarios/guide-setup/task.md --supervisor-cwd=. | ||
| fit-eval run --task-text="Perform a security audit of the repository." --model=opus | ||
| fit-eval run --task-file=scenarios/guide-setup/task.md --model=opus | ||
| fit-eval supervise --task-file=scenarios/guide-setup/task.md --supervisor-cwd=. | ||
| `.trim(); | ||
@@ -60,0 +68,0 @@ |
+6
-1
| export { TraceCollector, createTraceCollector } from "./src/trace-collector.js"; | ||
| export { AgentRunner, createAgentRunner } from "./src/agent-runner.js"; | ||
| export { Supervisor, createSupervisor } from "./src/supervisor.js"; | ||
| export { | ||
| Supervisor, | ||
| createSupervisor, | ||
| SUPERVISOR_SYSTEM_PROMPT, | ||
| AGENT_SYSTEM_PROMPT, | ||
| } from "./src/supervisor.js"; | ||
| export { TeeWriter, createTeeWriter } from "./src/tee-writer.js"; |
+4
-3
| { | ||
| "name": "@forwardimpact/libeval", | ||
| "version": "0.1.3", | ||
| "version": "0.1.5", | ||
| "description": "Process Claude Code stream-json output into structured traces", | ||
@@ -13,3 +13,4 @@ "license": "Apache-2.0", | ||
| "engines": { | ||
| "bun": ">=1.2.0" | ||
| "bun": ">=1.2.0", | ||
| "node": ">=18.0.0" | ||
| }, | ||
@@ -20,3 +21,3 @@ "scripts": { | ||
| "dependencies": { | ||
| "@anthropic-ai/claude-agent-sdk": "^0.1.0" | ||
| "@anthropic-ai/claude-agent-sdk": "^0.2.91" | ||
| }, | ||
@@ -23,0 +24,0 @@ "publishConfig": { |
+19
-1
@@ -21,2 +21,5 @@ /** | ||
| * @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md) | ||
| * @param {string} [deps.agentProfile] - Agent profile name to pass as --agent to the Claude CLI | ||
| * @param {string|object} [deps.systemPrompt] - SDK system prompt (string replaces default; {type:'preset', preset:'claude_code', append} appends) | ||
| * @param {string[]} [deps.disallowedTools] - Tools to explicitly remove from the model's context | ||
| */ | ||
@@ -33,2 +36,5 @@ constructor({ | ||
| settingSources, | ||
| agentProfile, | ||
| systemPrompt, | ||
| disallowedTools, | ||
| }) { | ||
@@ -54,2 +60,5 @@ if (!cwd) throw new Error("cwd is required"); | ||
| this.settingSources = settingSources ?? []; | ||
| this.agentProfile = agentProfile ?? null; | ||
| this.systemPrompt = systemPrompt ?? null; | ||
| this.disallowedTools = disallowedTools ?? []; | ||
| this.sessionId = null; | ||
@@ -80,2 +89,7 @@ this.buffer = []; | ||
| settingSources: this.settingSources, | ||
| ...(this.disallowedTools.length > 0 && { | ||
| disallowedTools: this.disallowedTools, | ||
| }), | ||
| ...(this.systemPrompt && { systemPrompt: this.systemPrompt }), | ||
| ...(this.agentProfile && { extraArgs: { agent: this.agentProfile } }), | ||
| }, | ||
@@ -120,3 +134,7 @@ })) { | ||
| prompt, | ||
| options: { resume: this.sessionId }, | ||
| options: { | ||
| resume: this.sessionId, | ||
| permissionMode: this.permissionMode, | ||
| allowDangerouslySkipPermissions: true, | ||
| }, | ||
| })) { | ||
@@ -123,0 +141,0 @@ const line = JSON.stringify(message); |
+12
-4
@@ -27,3 +27,4 @@ import { readFileSync, createWriteStream } from "node:fs"; | ||
| * Options: | ||
| * --task=PATH Path to task file (required) | ||
| * --task-file=PATH Path to task file (mutually exclusive with --task-text) | ||
| * --task-text=STRING Inline task text (mutually exclusive with --task-file) | ||
| * --cwd=DIR Agent working directory (default: .) | ||
@@ -34,2 +35,3 @@ * --model=MODEL Claude model to use (default: opus) | ||
| * --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit) | ||
| * --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI) | ||
| * | ||
@@ -39,4 +41,8 @@ * @param {string[]} args - Command arguments | ||
| export async function runRunCommand(args) { | ||
| const task = parseFlag(args, "task"); | ||
| if (!task) throw new Error("--task is required"); | ||
| const taskFile = parseFlag(args, "task-file"); | ||
| const taskText = parseFlag(args, "task-text"); | ||
| if (taskFile && taskText) | ||
| throw new Error("--task-file and --task-text are mutually exclusive"); | ||
| if (!taskFile && !taskText) | ||
| throw new Error("--task-file or --task-text is required"); | ||
@@ -47,2 +53,3 @@ const cwd = resolve(parseFlag(args, "cwd") ?? "."); | ||
| const outputPath = parseFlag(args, "output"); | ||
| const agentProfile = parseFlag(args, "agent-profile") ?? undefined; | ||
| const allowedTools = ( | ||
@@ -52,3 +59,3 @@ parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit" | ||
| const taskContent = readFileSync(task, "utf8"); | ||
| const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText; | ||
@@ -71,2 +78,3 @@ // When --output is specified, stream text to stdout while writing NDJSON to file. | ||
| settingSources: ["project"], | ||
| agentProfile, | ||
| }); | ||
@@ -73,0 +81,0 @@ |
@@ -28,3 +28,4 @@ import { readFileSync, createWriteStream, mkdtempSync } from "node:fs"; | ||
| * Options: | ||
| * --task=PATH Path to task file (required) | ||
| * --task-file=PATH Path to task file (mutually exclusive with --task-text) | ||
| * --task-text=STRING Inline task text (mutually exclusive with --task-file) | ||
| * --supervisor-cwd=DIR Supervisor working directory (default: .) | ||
@@ -36,2 +37,4 @@ * --agent-cwd=DIR Agent working directory (default: temp directory) | ||
| * --allowed-tools=LIST Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit) | ||
| * --supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI) | ||
| * --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI) | ||
| * | ||
@@ -41,4 +44,8 @@ * @param {string[]} args - Command arguments | ||
| export async function runSuperviseCommand(args) { | ||
| const task = parseFlag(args, "task"); | ||
| if (!task) throw new Error("--task is required"); | ||
| const taskFile = parseFlag(args, "task-file"); | ||
| const taskText = parseFlag(args, "task-text"); | ||
| if (taskFile && taskText) | ||
| throw new Error("--task-file and --task-text are mutually exclusive"); | ||
| if (!taskFile && !taskText) | ||
| throw new Error("--task-file or --task-text is required"); | ||
@@ -53,7 +60,13 @@ const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? "."); | ||
| const outputPath = parseFlag(args, "output"); | ||
| const supervisorProfile = parseFlag(args, "supervisor-profile") ?? undefined; | ||
| const agentProfile = parseFlag(args, "agent-profile") ?? undefined; | ||
| const allowedTools = ( | ||
| parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit" | ||
| ).split(","); | ||
| const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools"); | ||
| const supervisorAllowedTools = supervisorAllowedToolsRaw | ||
| ? supervisorAllowedToolsRaw.split(",") | ||
| : undefined; | ||
| const taskContent = readFileSync(task, "utf8"); | ||
| const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText; | ||
@@ -80,2 +93,5 @@ // When --output is specified, stream text to stdout while writing NDJSON to file. | ||
| allowedTools, | ||
| supervisorAllowedTools, | ||
| supervisorProfile, | ||
| agentProfile, | ||
| }); | ||
@@ -82,0 +98,0 @@ |
+108
-31
| /** | ||
| * Supervisor — orchestrates a relay loop between an agent and a supervisor, | ||
| * both running as AgentRunner instances. The agent works on a task while the | ||
| * supervisor observes and decides when the evaluation is complete. | ||
| * both running as AgentRunner instances. The supervisor receives the task first, | ||
| * introduces itself, and delegates work to the agent. The loop then alternates: | ||
| * agent → supervisor → agent. | ||
| * | ||
@@ -11,14 +12,26 @@ * Follows OO+DI: constructor injection, factory function, tests bypass factory. | ||
| import { createAgentRunner } from "./agent-runner.js"; | ||
| import { TraceCollector } from "./trace-collector.js"; | ||
| /** | ||
| * Check if the supervisor's response signals evaluation completion. | ||
| * Uses a structured signal — `EVALUATION_COMPLETE` on its own line — | ||
| * to avoid false positives from natural language. | ||
| * Check if the supervisor's response signals evaluation success. | ||
| * Matches EVALUATION_SUCCESSFUL anywhere in the text, tolerating markdown | ||
| * formatting (e.g. **EVALUATION_SUCCESSFUL**). Uses word boundaries to | ||
| * avoid matching inside longer identifiers. | ||
| * @param {string} text | ||
| * @returns {boolean} | ||
| */ | ||
| export function isDone(text) { | ||
| return /^EVALUATION_COMPLETE$/m.test(text); | ||
| export function isSuccessful(text) { | ||
| return /(?:^|[\s*_~`])EVALUATION_SUCCESSFUL(?:[\s*_~`.,!?]|$)/m.test(text); | ||
| } | ||
| /** System prompt appended for the supervisor runner in supervise mode. */ | ||
| export const SUPERVISOR_SYSTEM_PROMPT = | ||
| "You supervise another AI agent through a relay — your output becomes the agent's next input. " + | ||
| "Guide the agent, answer its questions, and write EVALUATION_SUCCESSFUL when their task is complete."; | ||
| /** System prompt appended for the agent runner in supervise mode. */ | ||
| export const AGENT_SYSTEM_PROMPT = | ||
| "You are being supervised by another AI agent. " + | ||
| "When requirements are ambiguous or you are uncertain, stop and ask a clarifying question before proceeding."; | ||
| export class Supervisor { | ||
@@ -48,12 +61,14 @@ /** | ||
| * Run the supervisor ↔ agent relay loop. | ||
| * @param {string} task - The initial task for the agent | ||
| * The supervisor receives the task first, introduces itself, and delegates | ||
| * work to the agent. The loop then alternates: agent → supervisor → agent. | ||
| * @param {string} task - The initial task for the supervisor | ||
| * @returns {Promise<{success: boolean, turns: number}>} | ||
| */ | ||
| async run(task) { | ||
| // Turn 0: Agent receives the task and starts working | ||
| this.currentSource = "agent"; | ||
| // Turn 0: Supervisor receives the task and introduces it to the agent | ||
| this.currentSource = "supervisor"; | ||
| this.currentTurn = 0; | ||
| let agentResult = await this.agentRunner.run(task); | ||
| let supervisorResult = await this.supervisorRunner.run(task); | ||
| if (agentResult.error) { | ||
| if (supervisorResult.error) { | ||
| this.emitSummary({ success: false, turns: 0 }); | ||
@@ -63,18 +78,21 @@ return { success: false, turns: 0 }; | ||
| // The supervisor's turn is fully complete (all tool calls executed) by the | ||
| // time we check the signal — no work is interrupted. | ||
| if (isSuccessful(supervisorResult.text)) { | ||
| this.emitSummary({ success: true, turns: 0 }); | ||
| return { success: true, turns: 0 }; | ||
| } | ||
| for (let turn = 1; turn <= this.maxTurns; turn++) { | ||
| // Supervisor observes the agent's output | ||
| const supervisorPrompt = | ||
| `The agent reported:\n\n${agentResult.text}\n\n` + | ||
| `Decide: provide guidance, answer a question, or say EVALUATION_COMPLETE on its own line.`; | ||
| this.currentSource = "supervisor"; | ||
| // Supervisor's output becomes the agent's input | ||
| this.currentSource = "agent"; | ||
| this.currentTurn = turn; | ||
| let supervisorResult; | ||
| let agentResult; | ||
| if (turn === 1) { | ||
| supervisorResult = await this.supervisorRunner.run(supervisorPrompt); | ||
| agentResult = await this.agentRunner.run(supervisorResult.text); | ||
| } else { | ||
| supervisorResult = await this.supervisorRunner.resume(supervisorPrompt); | ||
| agentResult = await this.agentRunner.resume(supervisorResult.text); | ||
| } | ||
| if (supervisorResult.error) { | ||
| if (agentResult.error) { | ||
| this.emitSummary({ success: false, turns: turn }); | ||
@@ -84,16 +102,24 @@ return { success: false, turns: turn }; | ||
| if (isDone(supervisorResult.text)) { | ||
| this.emitSummary({ success: true, turns: turn }); | ||
| return { success: true, turns: turn }; | ||
| } | ||
| // Build the full agent transcript from buffered NDJSON events so the | ||
| // supervisor sees tool calls and reasoning, not just the SDK result summary. | ||
| const agentTranscript = this.extractTranscript(this.agentRunner); | ||
| // Supervisor's response becomes the agent's next input | ||
| this.currentSource = "agent"; | ||
| const supervisorPrompt = | ||
| `The agent reported:\n\n${agentTranscript}\n\n` + | ||
| `Review the agent's work and decide how to proceed.`; | ||
| this.currentSource = "supervisor"; | ||
| this.currentTurn = turn; | ||
| agentResult = await this.agentRunner.resume(supervisorResult.text); | ||
| supervisorResult = await this.supervisorRunner.resume(supervisorPrompt); | ||
| if (agentResult.error) { | ||
| if (supervisorResult.error) { | ||
| this.emitSummary({ success: false, turns: turn }); | ||
| return { success: false, turns: turn }; | ||
| } | ||
| // The supervisor's turn is fully complete — check for success signal. | ||
| if (isSuccessful(supervisorResult.text)) { | ||
| this.emitSummary({ success: true, turns: turn }); | ||
| return { success: true, turns: turn }; | ||
| } | ||
| } | ||
@@ -106,2 +132,17 @@ | ||
| /** | ||
| * Extract a human-readable transcript from an AgentRunner's buffered output. | ||
| * Drains the buffer and replays events through a TraceCollector. | ||
| * @param {import("./agent-runner.js").AgentRunner} runner | ||
| * @returns {string} | ||
| */ | ||
| extractTranscript(runner) { | ||
| const lines = runner.drainOutput(); | ||
| const collector = new TraceCollector(); | ||
| for (const line of lines) { | ||
| collector.addLine(line); | ||
| } | ||
| return collector.toText() || "[The agent produced no output.]"; | ||
| } | ||
| /** | ||
| * Emit a single NDJSON line tagged with the current source and turn. | ||
@@ -146,2 +187,6 @@ * Called in real-time via the AgentRunner onLine callback. | ||
| * @param {string[]} [deps.allowedTools] - Tools the agent may use | ||
| * @param {string[]} [deps.supervisorAllowedTools] - Tools the supervisor may use (default: Bash, Read, Glob, Grep, Write, Edit) | ||
| * @param {string[]} [deps.supervisorDisallowedTools] - Tools to explicitly block from the supervisor | ||
| * @param {string} [deps.supervisorProfile] - Supervisor agent profile name | ||
| * @param {string} [deps.agentProfile] - Agent profile name | ||
| * @returns {Supervisor} | ||
@@ -157,2 +202,6 @@ */ | ||
| allowedTools, | ||
| supervisorDisallowedTools, | ||
| supervisorAllowedTools, | ||
| supervisorProfile, | ||
| agentProfile, | ||
| }) { | ||
@@ -173,4 +222,18 @@ // Forward-reference: onLine captures `supervisor` before construction completes. | ||
| settingSources: ["project"], | ||
| agentProfile, | ||
| systemPrompt: { | ||
| type: "preset", | ||
| preset: "claude_code", | ||
| append: AGENT_SYSTEM_PROMPT, | ||
| }, | ||
| }); | ||
| // Block Task/TaskOutput so the supervisor cannot spawn its own sub-agents. | ||
| // The relay loop handles agent communication — letting the supervisor use | ||
| // Task would bypass the relay and produce an empty agent trace. | ||
| const defaultDisallowed = ["Task", "TaskOutput"]; | ||
| const disallowedTools = supervisorDisallowedTools | ||
| ? [...new Set([...defaultDisallowed, ...supervisorDisallowedTools])] | ||
| : defaultDisallowed; | ||
| const supervisorRunner = createAgentRunner({ | ||
@@ -182,5 +245,19 @@ cwd: supervisorCwd, | ||
| maxTurns: 10, | ||
| allowedTools: ["Read", "Glob", "Grep"], | ||
| allowedTools: supervisorAllowedTools ?? [ | ||
| "Bash", | ||
| "Read", | ||
| "Glob", | ||
| "Grep", | ||
| "Write", | ||
| "Edit", | ||
| ], | ||
| disallowedTools, | ||
| onLine, | ||
| settingSources: ["project"], | ||
| agentProfile: supervisorProfile, | ||
| systemPrompt: { | ||
| type: "preset", | ||
| preset: "claude_code", | ||
| append: SUPERVISOR_SYSTEM_PROMPT, | ||
| }, | ||
| }); | ||
@@ -187,0 +264,0 @@ |
@@ -110,3 +110,2 @@ /** | ||
| this.lastSource = parsed.source; | ||
| this.textStream.write(`\n[${parsed.source}]\n`); | ||
| } | ||
@@ -123,2 +122,6 @@ this.collector.addLine(JSON.stringify(parsed.event)); | ||
| const turns = this.collector.turns; | ||
| const prefix = | ||
| this.mode === "supervised" && this.lastSource | ||
| ? `[${this.lastSource}] ` | ||
| : ""; | ||
| while (this.turnsEmitted < turns.length) { | ||
@@ -129,6 +132,6 @@ const turn = turns[this.turnsEmitted++]; | ||
| if (block.type === "text") { | ||
| this.textStream.write(block.text + "\n"); | ||
| this.textStream.write(`${prefix}${block.text}\n`); | ||
| } else if (block.type === "tool_use") { | ||
| const input = summarizeInput(block.input); | ||
| this.textStream.write(`> Tool: ${block.name} ${input}\n`); | ||
| this.textStream.write(`${prefix}> Tool: ${block.name} ${input}\n`); | ||
| } | ||
@@ -135,0 +138,0 @@ } |
+200
-49
@@ -9,4 +9,6 @@ import { describe, test } from "node:test"; | ||
| createSupervisor, | ||
| SUPERVISOR_SYSTEM_PROMPT, | ||
| AGENT_SYSTEM_PROMPT, | ||
| } from "@forwardimpact/libeval"; | ||
| import { isDone } from "../src/supervisor.js"; | ||
| import { isSuccessful } from "../src/supervisor.js"; | ||
@@ -65,23 +67,47 @@ /** | ||
| describe("isDone", () => { | ||
| test("detects EVALUATION_COMPLETE on its own line", () => { | ||
| assert.strictEqual(isDone("EVALUATION_COMPLETE"), true); | ||
| describe("isSuccessful", () => { | ||
| test("detects EVALUATION_SUCCESSFUL on its own line", () => { | ||
| assert.strictEqual(isSuccessful("EVALUATION_SUCCESSFUL"), true); | ||
| assert.strictEqual( | ||
| isDone("Some text\nEVALUATION_COMPLETE\nMore text"), | ||
| isSuccessful("Some text\nEVALUATION_SUCCESSFUL\nMore text"), | ||
| true, | ||
| ); | ||
| assert.strictEqual(isDone("Done.\n\nEVALUATION_COMPLETE"), true); | ||
| assert.strictEqual(isSuccessful("Done.\n\nEVALUATION_SUCCESSFUL"), true); | ||
| }); | ||
| test("does not match EVALUATION_COMPLETE embedded in text", () => { | ||
| assert.strictEqual(isDone("not EVALUATION_COMPLETE yet"), false); | ||
| assert.strictEqual(isDone("The agent is EVALUATION_COMPLETE done"), false); | ||
| assert.strictEqual(isDone("EVALUATION_COMPLETE_EXTRA"), false); | ||
| test("tolerates markdown formatting around the signal", () => { | ||
| assert.strictEqual(isSuccessful("**EVALUATION_SUCCESSFUL**"), true); | ||
| assert.strictEqual(isSuccessful("*EVALUATION_SUCCESSFUL*"), true); | ||
| assert.strictEqual(isSuccessful("__EVALUATION_SUCCESSFUL__"), true); | ||
| assert.strictEqual(isSuccessful("_EVALUATION_SUCCESSFUL_"), true); | ||
| assert.strictEqual(isSuccessful("`EVALUATION_SUCCESSFUL`"), true); | ||
| assert.strictEqual( | ||
| isSuccessful( | ||
| "Good work.\n\n**EVALUATION_SUCCESSFUL**\n\nNow filing issues.", | ||
| ), | ||
| true, | ||
| ); | ||
| }); | ||
| test("matches EVALUATION_SUCCESSFUL anywhere in text", () => { | ||
| assert.strictEqual(isSuccessful("not EVALUATION_SUCCESSFUL yet"), true); | ||
| assert.strictEqual( | ||
| isSuccessful("The agent is EVALUATION_SUCCESSFUL done"), | ||
| true, | ||
| ); | ||
| assert.strictEqual( | ||
| isSuccessful("Great work! EVALUATION_SUCCESSFUL. Now filing issues."), | ||
| true, | ||
| ); | ||
| }); | ||
| test("does not match empty or unrelated text", () => { | ||
| assert.strictEqual(isDone(""), false); | ||
| assert.strictEqual(isDone("All done!"), false); | ||
| assert.strictEqual(isDone("DONE"), false); | ||
| assert.strictEqual(isSuccessful(""), false); | ||
| assert.strictEqual(isSuccessful("All done!"), false); | ||
| assert.strictEqual(isSuccessful("DONE"), false); | ||
| }); | ||
| test("does not match old EVALUATION_COMPLETE signal", () => { | ||
| assert.strictEqual(isSuccessful("EVALUATION_COMPLETE"), false); | ||
| }); | ||
| }); | ||
@@ -123,3 +149,24 @@ | ||
| test("completes on EVALUATION_COMPLETE from supervisor", async () => { | ||
| test("completes on EVALUATION_SUCCESSFUL from supervisor at turn 0", async () => { | ||
| const agentRunner = createMockRunner([]); | ||
| const supervisorRunner = createMockRunner([ | ||
| { text: "EVALUATION_SUCCESSFUL" }, | ||
| ]); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| const result = await supervisor.run("Install stuff"); | ||
| assert.strictEqual(result.success, true); | ||
| assert.strictEqual(result.turns, 0); | ||
| }); | ||
| test("completes after one agent turn", async () => { | ||
| const agentRunner = createMockRunner([ | ||
@@ -130,3 +177,4 @@ { text: "I installed the packages." }, | ||
| const supervisorRunner = createMockRunner([ | ||
| { text: "Good work.\n\nEVALUATION_COMPLETE" }, | ||
| { text: "Welcome! Please install the packages." }, | ||
| { text: "Good work.\n\nEVALUATION_SUCCESSFUL" }, | ||
| ]); | ||
@@ -156,5 +204,6 @@ | ||
| const supervisorRunner = createMockRunner([ | ||
| { text: "Here is your task. Do the work." }, | ||
| { text: "Keep going, you need to do more." }, | ||
| { text: "Almost there, continue." }, | ||
| { text: "EVALUATION_COMPLETE" }, | ||
| { text: "EVALUATION_SUCCESSFUL" }, | ||
| ]); | ||
@@ -177,5 +226,4 @@ | ||
| test("enforces maxTurns limit", async () => { | ||
| // Agent responds to every turn, supervisor never says done | ||
| // Supervisor starts, agent responds each turn, supervisor never says done | ||
| const agentRunner = createMockRunner([ | ||
| { text: "Turn 0" }, | ||
| { text: "Turn 1" }, | ||
@@ -186,2 +234,3 @@ { text: "Turn 2" }, | ||
| const supervisorRunner = createMockRunner([ | ||
| { text: "Start working." }, | ||
| { text: "Continue." }, | ||
@@ -206,12 +255,13 @@ { text: "Continue." }, | ||
| test("output contains tagged lines with correct source and turn", async () => { | ||
| const agentMessages = [[{ type: "assistant", content: "Working" }]]; | ||
| const supervisorMessages = [ | ||
| [{ type: "assistant", content: "EVALUATION_COMPLETE" }], | ||
| [{ type: "assistant", content: "Go ahead" }], | ||
| [{ type: "assistant", content: "EVALUATION_SUCCESSFUL" }], | ||
| ]; | ||
| const agentMessages = [[{ type: "assistant", content: "Working" }]]; | ||
| const agentRunner = createMockRunner([{ text: "Working" }], agentMessages); | ||
| const supervisorRunner = createMockRunner( | ||
| [{ text: "EVALUATION_COMPLETE" }], | ||
| [{ text: "Go ahead" }, { text: "EVALUATION_SUCCESSFUL" }], | ||
| supervisorMessages, | ||
| ); | ||
| const agentRunner = createMockRunner([{ text: "Working" }], agentMessages); | ||
@@ -236,15 +286,15 @@ const output = new PassThrough(); | ||
| // Should have: agent turn 0, supervisor turn 1, orchestrator summary | ||
| assert.ok(lines.length >= 3); | ||
| // Should have: supervisor turn 0, agent turn 1, supervisor turn 1, orchestrator summary | ||
| assert.ok(lines.length >= 4); | ||
| const agentLine = JSON.parse(lines[0]); | ||
| const supervisorLine = JSON.parse(lines[0]); | ||
| assert.strictEqual(supervisorLine.source, "supervisor"); | ||
| assert.strictEqual(supervisorLine.turn, 0); | ||
| assert.ok("event" in supervisorLine); | ||
| const agentLine = JSON.parse(lines[1]); | ||
| assert.strictEqual(agentLine.source, "agent"); | ||
| assert.strictEqual(agentLine.turn, 0); | ||
| assert.strictEqual(agentLine.turn, 1); | ||
| assert.ok("event" in agentLine); | ||
| const supervisorLine = JSON.parse(lines[1]); | ||
| assert.strictEqual(supervisorLine.source, "supervisor"); | ||
| assert.strictEqual(supervisorLine.turn, 1); | ||
| assert.ok("event" in supervisorLine); | ||
| const summaryLine = JSON.parse(lines[lines.length - 1]); | ||
@@ -262,7 +312,10 @@ assert.strictEqual(summaryLine.source, "orchestrator"); | ||
| }; | ||
| const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]); | ||
| const supervisorRunner = createMockRunner( | ||
| [{ text: "EVALUATION_COMPLETE" }], | ||
| [[{ type: "assistant", content: "ok" }]], | ||
| [{ text: "Go" }, { text: "EVALUATION_SUCCESSFUL" }], | ||
| [ | ||
| [{ type: "assistant", content: "Go" }], | ||
| [{ type: "assistant", content: "ok" }], | ||
| ], | ||
| ); | ||
| const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]); | ||
@@ -287,3 +340,4 @@ const output = new PassThrough(); | ||
| const tagged = JSON.parse(lines[0]); | ||
| // First line is supervisor turn 0, second is agent turn 1 | ||
| const tagged = JSON.parse(lines[1]); | ||
| // The original event's `source` field is preserved inside `event` | ||
@@ -294,17 +348,19 @@ assert.strictEqual(tagged.source, "agent"); | ||
| test("emits agent output and summary when agent errors on turn 0", async () => { | ||
| const agentMessages = [[{ type: "assistant", content: "Partial work" }]]; | ||
| const agentRunner = createMockRunner( | ||
| [{ text: "Partial work", success: false }], | ||
| agentMessages, | ||
| test("emits supervisor output and summary when supervisor errors on turn 0", async () => { | ||
| const supervisorMessages = [ | ||
| [{ type: "assistant", content: "Starting..." }], | ||
| ]; | ||
| const supervisorRunner = createMockRunner( | ||
| [{ text: "Starting...", success: false }], | ||
| supervisorMessages, | ||
| ); | ||
| // Override run to simulate an error return | ||
| const origRun = agentRunner.run; | ||
| agentRunner.run = async (task) => { | ||
| const result = await origRun.call(agentRunner, task); | ||
| const origRun = supervisorRunner.run; | ||
| supervisorRunner.run = async (task) => { | ||
| const result = await origRun.call(supervisorRunner, task); | ||
| return { ...result, error: new Error("Process exited with code 1") }; | ||
| }; | ||
| const supervisorRunner = createMockRunner([]); | ||
| const agentRunner = createMockRunner([]); | ||
@@ -326,3 +382,3 @@ const output = new PassThrough(); | ||
| // Output should still contain the agent's buffered lines + summary | ||
| // Output should still contain the supervisor's buffered lines + summary | ||
| const data = output.read()?.toString() ?? ""; | ||
@@ -334,7 +390,7 @@ const lines = data | ||
| assert.ok(lines.length >= 2, "Expected at least agent line + summary"); | ||
| assert.ok(lines.length >= 2, "Expected at least supervisor line + summary"); | ||
| const agentLine = JSON.parse(lines[0]); | ||
| assert.strictEqual(agentLine.source, "agent"); | ||
| assert.strictEqual(agentLine.turn, 0); | ||
| const supervisorLine = JSON.parse(lines[0]); | ||
| assert.strictEqual(supervisorLine.source, "supervisor"); | ||
| assert.strictEqual(supervisorLine.turn, 0); | ||
@@ -356,2 +412,97 @@ const summaryLine = JSON.parse(lines[lines.length - 1]); | ||
| }); | ||
| test("createSupervisor uses default supervisor tools when none specified", () => { | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd: "/tmp/sup", | ||
| agentCwd: "/tmp/agent", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| }); | ||
| assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [ | ||
| "Bash", | ||
| "Read", | ||
| "Glob", | ||
| "Grep", | ||
| "Write", | ||
| "Edit", | ||
| ]); | ||
| }); | ||
| test("createSupervisor passes custom supervisor tools", () => { | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd: "/tmp/sup", | ||
| agentCwd: "/tmp/agent", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| supervisorAllowedTools: ["Read", "Glob", "Grep"], | ||
| }); | ||
| assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [ | ||
| "Read", | ||
| "Glob", | ||
| "Grep", | ||
| ]); | ||
| }); | ||
| test("createSupervisor wires system prompts to both runners", () => { | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd: "/tmp/sup", | ||
| agentCwd: "/tmp/agent", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| }); | ||
| assert.deepStrictEqual(supervisor.agentRunner.systemPrompt, { | ||
| type: "preset", | ||
| preset: "claude_code", | ||
| append: AGENT_SYSTEM_PROMPT, | ||
| }); | ||
| assert.deepStrictEqual(supervisor.supervisorRunner.systemPrompt, { | ||
| type: "preset", | ||
| preset: "claude_code", | ||
| append: SUPERVISOR_SYSTEM_PROMPT, | ||
| }); | ||
| }); | ||
| test("createSupervisor blocks Task and TaskOutput on supervisor by default", () => { | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd: "/tmp/sup", | ||
| agentCwd: "/tmp/agent", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| }); | ||
| assert.deepStrictEqual(supervisor.supervisorRunner.disallowedTools, [ | ||
| "Task", | ||
| "TaskOutput", | ||
| ]); | ||
| // Agent should not have disallowed tools | ||
| assert.deepStrictEqual(supervisor.agentRunner.disallowedTools, []); | ||
| }); | ||
| test("createSupervisor merges custom supervisorDisallowedTools with defaults", () => { | ||
| const supervisor = createSupervisor({ | ||
| supervisorCwd: "/tmp/sup", | ||
| agentCwd: "/tmp/agent", | ||
| query: async function* () {}, | ||
| output: new PassThrough(), | ||
| supervisorDisallowedTools: ["WebSearch", "Task"], | ||
| }); | ||
| const disallowed = supervisor.supervisorRunner.disallowedTools; | ||
| assert.ok(disallowed.includes("Task")); | ||
| assert.ok(disallowed.includes("TaskOutput")); | ||
| assert.ok(disallowed.includes("WebSearch")); | ||
| // No duplicates | ||
| assert.strictEqual(disallowed.length, new Set(disallowed).size); | ||
| }); | ||
| test("system prompt constants are non-empty strings", () => { | ||
| assert.ok(typeof SUPERVISOR_SYSTEM_PROMPT === "string"); | ||
| assert.ok(typeof AGENT_SYSTEM_PROMPT === "string"); | ||
| assert.ok(SUPERVISOR_SYSTEM_PROMPT.length > 0); | ||
| assert.ok(AGENT_SYSTEM_PROMPT.length > 0); | ||
| }); | ||
| test("SUPERVISOR_SYSTEM_PROMPT explains relay mechanism", () => { | ||
| assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("relay")); | ||
| assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("EVALUATION_SUCCESSFUL")); | ||
| }); | ||
| }); |
@@ -190,7 +190,5 @@ import { describe, test } from "node:test"; | ||
| // Text should show source labels | ||
| assert.ok(textData.includes("[agent]")); | ||
| assert.ok(textData.includes("Working on it")); | ||
| assert.ok(textData.includes("[supervisor]")); | ||
| assert.ok(textData.includes("Looks good")); | ||
| // Text should show source prefixes on content lines | ||
| assert.ok(textData.includes("[agent] Working on it")); | ||
| assert.ok(textData.includes("[supervisor] Looks good")); | ||
| assert.ok(textData.includes("Evaluation completed after 1 turns")); | ||
@@ -258,5 +256,5 @@ }); | ||
| const textData = collect(textStream); | ||
| // [agent] label should appear only once | ||
| const agentLabels = textData.split("[agent]").length - 1; | ||
| assert.strictEqual(agentLabels, 1); | ||
| // [agent] prefix should appear on each content line | ||
| assert.ok(textData.includes("[agent] Step 1")); | ||
| assert.ok(textData.includes("[agent] Step 2")); | ||
| }); | ||
@@ -263,0 +261,0 @@ |
Long strings
Supply chain risk: Contains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
Long strings
Supply chain risk: Contains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
98679
13.16%2396
12.33%+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
- Removed
- Removed
- Removed
- Removed
- Removed
- Removed
- Removed
- Removed
- Removed
- Removed
- Removed
- Removed
- Removed
- Removed
- Removed
- Removed