@forwardimpact/libeval
Advanced tools
+1
-1
| { | ||
| "name": "@forwardimpact/libeval", | ||
| "version": "0.1.5", | ||
| "version": "0.1.6", | ||
| "description": "Process Claude Code stream-json output into structured traces", | ||
@@ -5,0 +5,0 @@ "license": "Apache-2.0", |
+38
-5
@@ -56,2 +56,11 @@ /** | ||
| this.currentTurn = 0; | ||
| /** | ||
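| * Set to true when any supervisor assistant message streamed during the | ||
| * current supervisor turn contains the success signal; it is reset to | ||
| * false before each supervisor run/resume call. | ||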
| * The SDK result text only reflects the last assistant message, so when | ||
| * the supervisor writes EVALUATION_SUCCESSFUL in an early message and | ||
| * then continues with follow-up work, the result text won't contain it. | ||
| * This flag captures the signal from the full message stream. | ||
| * @type {boolean} | ||
| */ | ||
| this.successSignalSeen = false; | ||
| } | ||
@@ -70,2 +79,3 @@ | ||
| this.currentTurn = 0; | ||
| this.successSignalSeen = false; | ||
| let supervisorResult = await this.supervisorRunner.run(task); | ||
@@ -78,5 +88,8 @@ | ||
| // The supervisor's turn is fully complete (all tool calls executed) by the | ||
| // time we check the signal — no work is interrupted. | ||
| if (isSuccessful(supervisorResult.text)) { | ||
| // Check for the success signal in either the SDK result text or the | ||
| // streamed message content. The SDK result text only reflects the last | ||
| // assistant message, so when the supervisor writes EVALUATION_SUCCESSFUL | ||
| // early and then continues (e.g. filing issues), we must also check the | ||
| // flag set by emitLine during streaming. | ||
| if (this.successSignalSeen || isSuccessful(supervisorResult.text)) { | ||
| this.emitSummary({ success: true, turns: 0 }); | ||
@@ -112,2 +125,3 @@ return { success: true, turns: 0 }; | ||
| this.currentTurn = turn; | ||
| this.successSignalSeen = false; | ||
| supervisorResult = await this.supervisorRunner.resume(supervisorPrompt); | ||
@@ -120,4 +134,5 @@ | ||
| // The supervisor's turn is fully complete — check for success signal. | ||
| if (isSuccessful(supervisorResult.text)) { | ||
| // The supervisor's turn is fully complete — check for success signal | ||
| // in either the SDK result text or streamed messages. | ||
| if (this.successSignalSeen || isSuccessful(supervisorResult.text)) { | ||
| this.emitSummary({ success: true, turns: turn }); | ||
@@ -150,2 +165,5 @@ return { success: true, turns: turn }; | ||
| * Called in real-time via the AgentRunner onLine callback. | ||
| * | ||
| * When the current source is the supervisor, also scans assistant text | ||
| * content for the EVALUATION_SUCCESSFUL signal and sets successSignalSeen. | ||
| * @param {string} line - Raw NDJSON line from the runner | ||
@@ -161,2 +179,17 @@ */ | ||
| this.output.write(JSON.stringify(tagged) + "\n"); | ||
| // Scan supervisor assistant messages for the success signal in real time. | ||
| // The SDK result text only reflects the final assistant message, but the | ||
| // supervisor may write EVALUATION_SUCCESSFUL in an earlier message and | ||
| // then continue with follow-up tool calls. | ||
| if (this.currentSource === "supervisor" && event.type === "assistant") { | ||
| const content = event.message?.content ?? event.content ?? []; | ||
| if (Array.isArray(content)) { | ||
| for (const block of content) { | ||
| if (block.type === "text" && isSuccessful(block.text)) { | ||
| this.successSignalSeen = true; | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
@@ -163,0 +196,0 @@ |
@@ -41,2 +41,9 @@ /** | ||
| // Unwrap combined supervised trace format {source, turn, event}. | ||
| // The Supervisor emits this wrapper; when replayed through addLine the | ||
| // inner event is the one we need. | ||
| if (event.event && !event.type && typeof event.source === "string") { | ||
| event = event.event; | ||
| } | ||
| switch (event.type) { | ||
@@ -43,0 +50,0 @@ case "system": |
@@ -192,2 +192,63 @@ import { describe, test } from "node:test"; | ||
| test("detects EVALUATION_SUCCESSFUL in streamed messages when result text differs", async () => { | ||
| // Simulates the real failure: supervisor writes EVALUATION_SUCCESSFUL in | ||
| // an early message, then continues with follow-up work (e.g. filing issues). | ||
| // The SDK result text reflects only the final message, which does NOT | ||
| // contain the signal. | ||
| const agentRunner = createMockRunner([ | ||
| { text: "I installed the packages." }, | ||
| ]); | ||
| // The supervisor's result text is the Summary (no signal), but messages | ||
| // include one with EVALUATION_SUCCESSFUL. | ||
| const supervisorMessages = [ | ||
| undefined, // turn 0: use default | ||
| [ | ||
| { | ||
| type: "assistant", | ||
| message: { | ||
| content: [ | ||
| { | ||
| type: "text", | ||
| text: "Good work.\n\nEVALUATION_SUCCESSFUL\n\nNow filing issues.", | ||
| }, | ||
| ], | ||
| }, | ||
| }, | ||
| { | ||
| type: "assistant", | ||
| message: { | ||
| content: [ | ||
| { type: "text", text: "## Summary\n\nAll issues filed." }, | ||
| ], | ||
| }, | ||
| }, | ||
| ], | ||
| ]; | ||
| const supervisorRunner = createMockRunner( | ||
| [ | ||
| { text: "Welcome! Please install the packages." }, | ||
| // Result text is the final message — does NOT contain the signal | ||
| { text: "## Summary\n\nAll issues filed." }, | ||
| ], | ||
| supervisorMessages, | ||
| ); | ||
| const output = new PassThrough(); | ||
| const supervisor = new Supervisor({ | ||
| agentRunner, | ||
| supervisorRunner, | ||
| output, | ||
| maxTurns: 10, | ||
| }); | ||
| agentRunner.onLine = (line) => supervisor.emitLine(line); | ||
| supervisorRunner.onLine = (line) => supervisor.emitLine(line); | ||
| const result = await supervisor.run("Install stuff"); | ||
| assert.strictEqual(result.success, true); | ||
| assert.strictEqual(result.turns, 1); | ||
| }); | ||
| test("runs multiple turns before completion", async () => { | ||
@@ -194,0 +255,0 @@ const agentRunner = createMockRunner([ |
@@ -152,2 +152,98 @@ import { describe, test } from "node:test"; | ||
| test("unwraps combined supervised trace format {source, turn, event}", () => { | ||
| const collector = new TraceCollector(); | ||
| // System init wrapped in supervisor envelope | ||
| collector.addLine( | ||
| JSON.stringify({ | ||
| source: "agent", | ||
| turn: 0, | ||
| event: { | ||
| type: "system", | ||
| subtype: "init", | ||
| session_id: "sess-supervised", | ||
| model: "claude-opus-4-6", | ||
| tools: ["Bash"], | ||
| }, | ||
| }), | ||
| ); | ||
| // Assistant message wrapped in supervisor envelope | ||
| collector.addLine( | ||
| JSON.stringify({ | ||
| source: "agent", | ||
| turn: 1, | ||
| event: { | ||
| type: "assistant", | ||
| message: { | ||
| content: [{ type: "text", text: "I ran the tests." }], | ||
| usage: { input_tokens: 100, output_tokens: 50 }, | ||
| }, | ||
| }, | ||
| }), | ||
| ); | ||
| // Tool result wrapped in supervisor envelope | ||
| collector.addLine( | ||
| JSON.stringify({ | ||
| source: "agent", | ||
| turn: 1, | ||
| event: { | ||
| type: "user", | ||
| message: { | ||
| role: "user", | ||
| content: [ | ||
| { | ||
| type: "tool_result", | ||
| tool_use_id: "toolu_sup", | ||
| content: "All tests passed", | ||
| }, | ||
| ], | ||
| }, | ||
| }, | ||
| }), | ||
| ); | ||
| // Result event wrapped in supervisor envelope | ||
| collector.addLine( | ||
| JSON.stringify({ | ||
| source: "supervisor", | ||
| turn: 1, | ||
| event: { | ||
| type: "result", | ||
| subtype: "success", | ||
| total_cost_usd: 0.44, | ||
| duration_ms: 30000, | ||
| num_turns: 2, | ||
| }, | ||
| }), | ||
| ); | ||
| const trace = collector.toJSON(); | ||
| assert.strictEqual(trace.metadata.sessionId, "sess-supervised"); | ||
| assert.strictEqual(trace.turns.length, 2); | ||
| assert.strictEqual(trace.turns[0].role, "assistant"); | ||
| assert.strictEqual(trace.turns[0].content[0].text, "I ran the tests."); | ||
| assert.strictEqual(trace.turns[1].role, "tool_result"); | ||
| assert.strictEqual(trace.turns[1].content, "All tests passed"); | ||
| assert.strictEqual(trace.summary.result, "success"); | ||
| assert.strictEqual(trace.summary.totalCostUsd, 0.44); | ||
| }); | ||
| test("skips orchestrator summary lines from supervised traces", () => { | ||
| const collector = new TraceCollector(); | ||
| collector.addLine( | ||
| JSON.stringify({ | ||
| source: "orchestrator", | ||
| type: "summary", | ||
| success: true, | ||
| turns: 3, | ||
| }), | ||
| ); | ||
| // Orchestrator summaries carry no inner `event`, so they are never | ||
| // unwrapped, and their own type ("summary") is not one the collector | ||
| // records as a turn — they should be silently skipped. | ||
| assert.strictEqual(collector.toJSON().turns.length, 0); | ||
| }); | ||
| test("skips rate_limit_event and unknown types", () => { | ||
@@ -154,0 +250,0 @@ const collector = new TraceCollector(); |
105331
6.74%2577
7.55%