@forwardimpact/libeval - npm Package Compare versions

+1

-1

package.json

		{
		"name": "@forwardimpact/libeval",
		"version": "0.1.5",
		"version": "0.1.6",
		"description": "Process Claude Code stream-json output into structured traces",
		@@ -5,0 +5,0 @@ "license": "Apache-2.0",

+38

-5

src/supervisor.js

		@@ -56,2 +56,11 @@ /**
		this.currentTurn = 0;
		/**
		* Set to true when any supervisor message contains the success signal.
		* The SDK result text only reflects the last assistant message, so when
		* the supervisor writes EVALUATION_SUCCESSFUL in an early message and
		* then continues with follow-up work, the result text won't contain it.
		* This flag captures the signal from the full message stream.
		* @type {boolean}
		*/
		this.successSignalSeen = false;
		}
		@@ -70,2 +79,3 @@
		this.currentTurn = 0;
		this.successSignalSeen = false;
		let supervisorResult = await this.supervisorRunner.run(task);
		@@ -78,5 +88,8 @@

		// The supervisor's turn is fully complete (all tool calls executed) by the
		// time we check the signal — no work is interrupted.
		if (isSuccessful(supervisorResult.text)) {
		// Check for the success signal in either the SDK result text or the
		// streamed message content. The SDK result text only reflects the last
		// assistant message, so when the supervisor writes EVALUATION_SUCCESSFUL
		// early and then continues (e.g. filing issues), we must also check the
		// flag set by emitLine during streaming.
		if (this.successSignalSeen || isSuccessful(supervisorResult.text)) {
		this.emitSummary({ success: true, turns: 0 });
		@@ -112,2 +125,3 @@ return { success: true, turns: 0 };
		this.currentTurn = turn;
		this.successSignalSeen = false;
		supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
		@@ -120,4 +134,5 @@

		// The supervisor's turn is fully complete — check for success signal.
		if (isSuccessful(supervisorResult.text)) {
		// The supervisor's turn is fully complete — check for success signal
		// in either the SDK result text or streamed messages.
		if (this.successSignalSeen || isSuccessful(supervisorResult.text)) {
		this.emitSummary({ success: true, turns: turn });
		@@ -150,2 +165,5 @@ return { success: true, turns: turn };
		* Called in real-time via the AgentRunner onLine callback.
		*
		* When the current source is the supervisor, also scans assistant text
		* content for the EVALUATION_SUCCESSFUL signal and sets successSignalSeen.
		* @param {string} line - Raw NDJSON line from the runner
		@@ -161,2 +179,17 @@ */
		this.output.write(JSON.stringify(tagged) + "\n");

		// Scan supervisor assistant messages for the success signal in real time.
		// The SDK result text only reflects the final assistant message, but the
		// supervisor may write EVALUATION_SUCCESSFUL in an earlier message and
		// then continue with follow-up tool calls.
		if (this.currentSource === "supervisor" && event.type === "assistant") {
		const content = event.message?.content ?? event.content ?? [];
		if (Array.isArray(content)) {
		for (const block of content) {
		if (block.type === "text" && isSuccessful(block.text)) {
		this.successSignalSeen = true;
		}
		}
		}
		}
		}
		@@ -163,0 +196,0 @@

+7

-0

src/trace-collector.js

		@@ -41,2 +41,9 @@ /**

		// Unwrap combined supervised trace format {source, turn, event}.
		// The Supervisor emits this wrapper; when replayed through addLine the
		// inner event is the one we need.
		if (event.event && !event.type && typeof event.source === "string") {
		event = event.event;
		}

		switch (event.type) {
		@@ -43,0 +50,0 @@ case "system":

+61

-0

test/supervisor.test.js

		@@ -192,2 +192,63 @@ import { describe, test } from "node:test";

		test("detects EVALUATION_SUCCESSFUL in streamed messages when result text differs", async () => {
			// Regression scenario: the supervisor announces EVALUATION_SUCCESSFUL in
			// an early assistant message and then keeps working (e.g. filing issues).
			// The runner's result text mirrors only the last message, so the signal
			// is absent from it and must be picked up from the streamed messages.
			const signalText = "Good work.\n\nEVALUATION_SUCCESSFUL\n\nNow filing issues.";
			const summaryText = "## Summary\n\nAll issues filed.";

			// Builds one streamed assistant event carrying a single text block.
			const assistantEvent = (text) => ({
				type: "assistant",
				message: {
					content: [{ type: "text", text }],
				},
			});

			const agentRunner = createMockRunner([
				{ text: "I installed the packages." },
			]);

			// Per-call result texts handed to the mock supervisor runner; the second
			// one is the closing summary and deliberately lacks the signal.
			const supervisorResults = [
				{ text: "Welcome! Please install the packages." },
				{ text: summaryText },
			];

			// Per-call streamed messages: the first slot is left undefined so the
			// mock falls back to its default, the second carries the early signal
			// followed by the summary message.
			const supervisorMessages = [
				undefined,
				[assistantEvent(signalText), assistantEvent(summaryText)],
			];

			const supervisorRunner = createMockRunner(
				supervisorResults,
				supervisorMessages,
			);

			const output = new PassThrough();
			const supervisor = new Supervisor({
				agentRunner,
				supervisorRunner,
				output,
				maxTurns: 10,
			});

			// Route every line from both runners through the supervisor so it can
			// inspect the streamed content.
			agentRunner.onLine = (line) => supervisor.emitLine(line);
			supervisorRunner.onLine = (line) => supervisor.emitLine(line);

			const outcome = await supervisor.run("Install stuff");

			assert.strictEqual(outcome.success, true);
			assert.strictEqual(outcome.turns, 1);
		});

		test("runs multiple turns before completion", async () => {
		@@ -194,0 +255,0 @@ const agentRunner = createMockRunner([

+96

-0

test/trace-collector.test.js

		@@ -152,2 +152,98 @@ import { describe, test } from "node:test";

		test("unwraps combined supervised trace format {source, turn, event}", () => {
			const collector = new TraceCollector();

			// Serialises an event inside the {source, turn, event} envelope and
			// feeds it to the collector as a single line.
			const addWrapped = (source, turn, event) => {
				collector.addLine(JSON.stringify({ source, turn, event }));
			};

			// Enveloped system init event.
			addWrapped("agent", 0, {
				type: "system",
				subtype: "init",
				session_id: "sess-supervised",
				model: "claude-opus-4-6",
				tools: ["Bash"],
			});

			// Enveloped assistant message.
			addWrapped("agent", 1, {
				type: "assistant",
				message: {
					content: [{ type: "text", text: "I ran the tests." }],
					usage: { input_tokens: 100, output_tokens: 50 },
				},
			});

			// Enveloped tool result (delivered as a user message).
			addWrapped("agent", 1, {
				type: "user",
				message: {
					role: "user",
					content: [
						{
							type: "tool_result",
							tool_use_id: "toolu_sup",
							content: "All tests passed",
						},
					],
				},
			});

			// Enveloped result event, this one attributed to the supervisor.
			addWrapped("supervisor", 1, {
				type: "result",
				subtype: "success",
				total_cost_usd: 0.44,
				duration_ms: 30000,
				num_turns: 2,
			});

			// The collected trace must look as if the inner events had been fed in
			// directly, without the envelope.
			const { metadata, turns, summary } = collector.toJSON();
			assert.strictEqual(metadata.sessionId, "sess-supervised");
			assert.strictEqual(turns.length, 2);

			const assistantTurn = turns[0];
			assert.strictEqual(assistantTurn.role, "assistant");
			assert.strictEqual(assistantTurn.content[0].text, "I ran the tests.");

			const toolResultTurn = turns[1];
			assert.strictEqual(toolResultTurn.role, "tool_result");
			assert.strictEqual(toolResultTurn.content, "All tests passed");

			assert.strictEqual(summary.result, "success");
			assert.strictEqual(summary.totalCostUsd, 0.44);
		});

		test("skips orchestrator summary lines from supervised traces", () => {
			// An orchestrator summary carries no inner `event` to unwrap, and its
			// own type is not expected to produce a turn.
			const orchestratorSummary = {
				source: "orchestrator",
				type: "summary",
				success: true,
				turns: 3,
			};

			const collector = new TraceCollector();
			collector.addLine(JSON.stringify(orchestratorSummary));

			// The line should be ignored without error, leaving the trace empty.
			const recordedTurns = collector.toJSON().turns;
			assert.strictEqual(recordedTurns.length, 0);
		});

		test("skips rate_limit_event and unknown types", () => {
		@@ -154,0 +250,0 @@ const collector = new TraceCollector();

@forwardimpact/libeval - npm Package Compare versions

Improved metrics