🚀 Socket Launch Week Day 2: Introducing Manifest Alerts. Learn more
Sign In

@fre4x/benchmark

Package Overview
Dependencies
Maintainers
1
Versions
7
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@fre4x/benchmark - npm Package Compare versions

Comparing version
1.1.0-beta.1
to
1.1.0-beta.3
+613
dist/fallback-catalog.json
[
{
"challenge_id": "code_ci_repair_drill",
"benchmark_id": "deterministic_mix",
"family": "code",
"runner_kind": "code_runner",
"title": "Code CI Repair Drill",
"description": "Read incident assets and answer with deterministic, machine-checkable outputs.",
"version": "builtin-v2",
"source": "builtin",
"tasks": [
{
"task_id": "code-ci-1",
"title": "Classify the CI failure",
"prompt": "Read the attached incident report and failing test summary.\nReturn compact JSON with exactly these fields:\n{\"root_cause\":\"...\",\"failing_test_count\":number}\nThe root_cause must be one of: cache_key, timeout_config, path_join.",
"response_format": "json",
"difficulty": 1,
"assets": [
{
"asset_id": "code-ci-incident",
"name": "incident_report.md",
"mime_type": "text/markdown",
"transport": "inline",
"materialization": "inline",
"encoding": "utf8",
"description": "Postmortem summary for the broken CI run.",
"content": "# Incident report\n\n- Regression introduced after lowering `timeout_ms` from 1500 to 500.\n- Remote artifact fetch now fails on slower runners.\n- No cache key or path joining regressions were found."
},
{
"asset_id": "code-ci-tests",
"name": "failing_tests.txt",
"mime_type": "text/plain",
"transport": "inline",
"materialization": "inline",
"encoding": "utf8",
"description": "Names of the failing tests from CI.",
"content": "FAIL retry fetches artifact after timeout\nFAIL increases timeout for cold runner\nFAIL preserves timeout override in deploy step"
}
],
"checkers": [
{
"checker_id": "code-ci-root-cause",
"kind": "json_field_equals",
"field": "root_cause",
"expected": "timeout_config"
},
{
"checker_id": "code-ci-failing-count",
"kind": "json_field_equals",
"field": "failing_test_count",
"expected": 3
},
{
"checker_id": "code-ci-materialized-assets",
"kind": "runner_fact_equals",
"fact": "materialized_asset_count",
"expected": 0
},
{
"checker_id": "code-ci-log",
"kind": "runner_log_contains_text",
"terms": ["prepared code workspace inputs"]
}
]
},
{
"task_id": "code-ci-2",
"title": "Name the config key to patch",
"prompt": "Read the patch note and return only the configuration key that must be raised.\nDo not return prose.",
"response_format": "text",
"difficulty": 1,
"assets": [
{
"asset_id": "code-ci-patch-note",
"name": "patch_note.txt",
"mime_type": "text/plain",
"transport": "inline",
"materialization": "inline",
"encoding": "utf8",
"description": "Patch hint for the CI timeout regression.",
"content": "Increase timeout_ms to 1500 so slow runners can finish the artifact fetch."
}
],
"checkers": [
{
"checker_id": "code-ci-config-key",
"kind": "exact_text",
"expected": "timeout_ms",
"case_sensitive": true
}
]
}
]
},
{
"challenge_id": "web_ops_order_reconciliation",
"benchmark_id": "deterministic_mix",
"family": "web",
"runner_kind": "browser_runner",
"title": "Web Ops Order Reconciliation",
"description": "Parse a simulated operations dashboard and return deterministic JSON fields.",
"version": "builtin-v2",
"source": "builtin",
"tasks": [
{
"task_id": "web-ops-1",
"title": "Extract dashboard status",
"prompt": "Inspect the attached HTML snapshot.\nReturn compact JSON with exactly these fields:\n{\"order_id\":\"...\",\"status\":\"...\",\"eta\":\"...\"}\nThe eta must be the date only in YYYY-MM-DD format, without the \"ETA:\" label.",
"response_format": "json",
"difficulty": 2,
"assets": [
{
"asset_id": "web-ops-dashboard",
"name": "dashboard.html",
"mime_type": "text/html",
"transport": "inline",
"materialization": "workspace_file",
"encoding": "utf8",
"description": "Static DOM snapshot of the fulfillment dashboard.",
"content": "<section id=\"orders\">\n <article data-order-id=\"SO-2048\">\n <h2>Shipment SO-2048</h2>\n <p class=\"status\">Delayed</p>\n <p class=\"eta\">ETA: 2026-05-14</p>\n </article>\n</section>"
}
],
"checkers": [
{
"checker_id": "web-order-id",
"kind": "json_field_equals",
"field": "order_id",
"expected": "SO-2048"
},
{
"checker_id": "web-status",
"kind": "json_field_equals",
"field": "status",
"expected": "Delayed"
},
{
"checker_id": "web-eta",
"kind": "json_field_equals",
"field": "eta",
"expected": "2026-05-14"
},
{
"checker_id": "web-materialized-assets",
"kind": "runner_fact_equals",
"fact": "materialized_asset_count",
"expected": 1
},
{
"checker_id": "web-log",
"kind": "runner_log_contains_text",
"terms": ["prepared browser snapshot workspace"]
}
]
}
]
},
{
"challenge_id": "os_release_gate_review",
"benchmark_id": "deterministic_mix",
"family": "os",
"runner_kind": "os_runner",
"title": "OS Release Gate Review",
"description": "Read filesystem-style assets and answer with deterministic release decisions.",
"version": "builtin-v2",
"source": "builtin",
"tasks": [
{
"task_id": "os-gate-1",
"title": "Decide the release action",
"prompt": "Read the attached release notes and audit log.\nReturn only the final action in lowercase.\nExpected format: two words.",
"response_format": "text",
"difficulty": 2,
"assets": [
{
"asset_id": "os-release-notes",
"name": "release_notes.txt",
"mime_type": "text/plain",
"transport": "inline",
"materialization": "workspace_file",
"encoding": "utf8",
"description": "Release checklist summary.",
"content": "release=2026.05.14\ndatabase_migration=pending\nsecurity_scan=pass\nrollback_bundle=ready"
},
{
"asset_id": "os-audit-log",
"name": "audit.log",
"mime_type": "text/plain",
"transport": "inline",
"materialization": "workspace_file",
"encoding": "utf8",
"description": "Audit events from the release gate.",
"content": "[warn] migration step not executed\n[info] rollback artifacts verified\n[decision] rollback release"
}
],
"checkers": [
{
"checker_id": "os-release-action",
"kind": "normalized_text",
"expected": "rollback release"
},
{
"checker_id": "os-materialized-assets",
"kind": "runner_fact_equals",
"fact": "materialized_asset_count",
"expected": 2
},
{
"checker_id": "os-log",
"kind": "runner_log_contains_text",
"terms": ["prepared os workspace inventory"]
}
]
}
]
},
{
"challenge_id": "code_repository_incident_pack",
"benchmark_id": "external_mix",
"family": "code",
"runner_kind": "code_runner",
"title": "Code Repository Incident Pack",
"description": "Repository incident triage tasks for deterministic code evaluation.",
"version": "external-v1",
"source": "external-example",
"tasks": [
{
"task_id": "code-repo-1",
"title": "Identify regression owner",
"prompt": "Read the incident brief and return compact JSON with exactly these fields: {\"repository\":\"...\",\"root_cause\":\"...\"}. The root_cause must be one of: retry_budget, timeout_config, cache_key.",
"response_format": "json",
"difficulty": 1,
"assets": [
{
"asset_id": "code-repo-1-brief",
"name": "incident.md",
"mime_type": "text/markdown",
"transport": "inline",
"materialization": "inline",
"encoding": "utf8",
"description": "Repository incident summary.",
"content": "# Repo incident\n\nRepository: api-gateway\nRegression traced to timeout_config drift after a config cleanup.\nNo retry_budget or cache_key regressions were found."
}
],
"checkers": [
{
"checker_id": "code-repo-1-repository",
"kind": "json_field_equals",
"field": "repository",
"expected": "api-gateway"
},
{
"checker_id": "code-repo-1-cause",
"kind": "json_field_equals",
"field": "root_cause",
"expected": "timeout_config"
},
{
"checker_id": "code-repo-1-log",
"kind": "runner_log_contains_text",
"terms": ["prepared code workspace inputs"]
}
]
},
{
"task_id": "code-repo-2",
"title": "Return the hotfix branch",
"prompt": "Read the handoff note and return only the hotfix branch name.",
"response_format": "text",
"difficulty": 1,
"assets": [
{
"asset_id": "code-repo-2-note",
"name": "handoff.txt",
"mime_type": "text/plain",
"transport": "inline",
"materialization": "inline",
"encoding": "utf8",
"description": "Hotfix branch handoff.",
"content": "Create hotfix/timeout-rollback before restarting deployment."
}
],
"checkers": [
{
"checker_id": "code-repo-2-branch",
"kind": "exact_text",
"expected": "hotfix/timeout-rollback",
"case_sensitive": true
}
]
},
{
"task_id": "code-repo-3",
"title": "Extract severity metadata",
"prompt": "Inspect the release memo and return compact JSON with exactly these fields: {\"owner\":\"...\",\"severity\":\"...\"}.",
"response_format": "json",
"difficulty": 2,
"assets": [
{
"asset_id": "code-repo-3-memo",
"name": "release_memo.txt",
"mime_type": "text/plain",
"transport": "inline",
"materialization": "workspace_file",
"encoding": "utf8",
"description": "Release severity memo.",
"content": "Owner: platform-release\nSeverity: high\nRollback status: ready"
}
],
"checkers": [
{
"checker_id": "code-repo-3-owner",
"kind": "json_field_equals",
"field": "owner",
"expected": "platform-release"
},
{
"checker_id": "code-repo-3-severity",
"kind": "json_field_equals",
"field": "severity",
"expected": "high"
},
{
"checker_id": "code-repo-3-assets",
"kind": "runner_fact_equals",
"fact": "materialized_asset_count",
"expected": 1
}
]
}
]
},
{
"challenge_id": "web_customer_portal_pack",
"benchmark_id": "external_mix",
"family": "web",
"runner_kind": "browser_runner",
"title": "Web Customer Portal Pack",
"description": "Portal extraction tasks using deterministic DOM snapshots.",
"version": "external-v1",
"source": "external-example",
"tasks": [
{
"task_id": "web-portal-1",
"title": "Extract queue ownership",
"prompt": "Read the HTML snapshot and return compact JSON with exactly these fields: {\"ticket_id\":\"...\",\"priority\":\"...\",\"owner\":\"...\"}.",
"response_format": "json",
"difficulty": 2,
"assets": [
{
"asset_id": "web-portal-1-html",
"name": "queue.html",
"mime_type": "text/html",
"transport": "inline",
"materialization": "workspace_file",
"encoding": "utf8",
"description": "Support queue HTML snapshot.",
"content": "<div class=\"ticket\" data-ticket-id=\"T-4401\"><span class=\"priority\">urgent</span><span class=\"owner\">mina.choi</span></div>"
}
],
"checkers": [
{
"checker_id": "web-portal-1-ticket",
"kind": "json_field_equals",
"field": "ticket_id",
"expected": "T-4401"
},
{
"checker_id": "web-portal-1-priority",
"kind": "json_field_equals",
"field": "priority",
"expected": "urgent"
},
{
"checker_id": "web-portal-1-owner",
"kind": "json_field_equals",
"field": "owner",
"expected": "mina.choi"
},
{
"checker_id": "web-portal-1-log",
"kind": "runner_log_contains_text",
"terms": ["prepared browser snapshot workspace"]
}
]
},
{
"task_id": "web-portal-2",
"title": "Return the account state",
"prompt": "Inspect the account banner and return only the account state phrase in lowercase.",
"response_format": "text",
"difficulty": 1,
"assets": [
{
"asset_id": "web-portal-2-banner",
"name": "banner.html",
"mime_type": "text/html",
"transport": "inline",
"materialization": "inline",
"encoding": "utf8",
"description": "Account banner snapshot.",
"content": "<aside class=\"banner\">Account state: billing hold</aside>"
}
],
"checkers": [
{
"checker_id": "web-portal-2-state",
"kind": "normalized_text",
"expected": "billing hold"
}
]
},
{
"task_id": "web-portal-3",
"title": "Extract invoice status",
"prompt": "Return compact JSON with exactly these fields after reading the invoice card: {\"invoice_id\":\"...\",\"status\":\"...\"}.",
"response_format": "json",
"difficulty": 1,
"assets": [
{
"asset_id": "web-portal-3-card",
"name": "invoice.html",
"mime_type": "text/html",
"transport": "inline",
"materialization": "workspace_file",
"encoding": "utf8",
"description": "Invoice card HTML.",
"content": "<article class=\"invoice\" data-invoice-id=\"INV-991\"><span class=\"status\">Escalated</span></article>"
}
],
"checkers": [
{
"checker_id": "web-portal-3-id",
"kind": "json_field_equals",
"field": "invoice_id",
"expected": "INV-991"
},
{
"checker_id": "web-portal-3-status",
"kind": "json_field_equals",
"field": "status",
"expected": "Escalated"
},
{
"checker_id": "web-portal-3-assets",
"kind": "runner_fact_equals",
"fact": "materialized_asset_count",
"expected": 1
}
]
}
]
},
{
"challenge_id": "os_release_ops_pack",
"benchmark_id": "external_mix",
"family": "os",
"runner_kind": "os_runner",
"title": "OS Release Ops Pack",
"description": "Filesystem and audit-log based deterministic operations tasks.",
"version": "external-v1",
"source": "external-example",
"tasks": [
{
"task_id": "os-ops-1",
"title": "Decide deployment action",
"prompt": "Read the release note and return only the final deployment action in lowercase.",
"response_format": "text",
"difficulty": 2,
"assets": [
{
"asset_id": "os-ops-1-release",
"name": "release.txt",
"mime_type": "text/plain",
"transport": "inline",
"materialization": "workspace_file",
"encoding": "utf8",
"description": "Release action summary.",
"content": "deploy_window=open\nmigration=blocked\naction=pause deploy"
}
],
"checkers": [
{
"checker_id": "os-ops-1-action",
"kind": "normalized_text",
"expected": "pause deploy"
},
{
"checker_id": "os-ops-1-assets",
"kind": "runner_fact_equals",
"fact": "materialized_asset_count",
"expected": 1
},
{
"checker_id": "os-ops-1-log",
"kind": "runner_log_contains_text",
"terms": ["prepared os workspace inventory"]
}
]
},
{
"task_id": "os-ops-2",
"title": "Extract cleanup order",
"prompt": "Return compact JSON with exactly these fields after reading the session audit: {\"cleanup_action\":\"...\",\"stale_sessions\":number}.",
"response_format": "json",
"difficulty": 2,
"assets": [
{
"asset_id": "os-ops-2-audit",
"name": "session_audit.log",
"mime_type": "text/plain",
"transport": "inline",
"materialization": "workspace_file",
"encoding": "utf8",
"description": "Session cleanup audit log.",
"content": "cleanup_action=rotate tokens\nstale_sessions=4\nrollback_ready=yes"
}
],
"checkers": [
{
"checker_id": "os-ops-2-action",
"kind": "json_field_equals",
"field": "cleanup_action",
"expected": "rotate tokens"
},
{
"checker_id": "os-ops-2-count",
"kind": "json_field_equals",
"field": "stale_sessions",
"expected": 4
}
]
}
]
},
{
"challenge_id": "code_data_quality_pack",
"benchmark_id": "external_mix",
"family": "code",
"runner_kind": "code_runner",
"title": "Code Data Quality Pack",
"description": "Data pipeline and schema audit tasks for deterministic code-oriented evaluation.",
"version": "external-v1",
"source": "external-example",
"tasks": [
{
"task_id": "code-data-1",
"title": "Extract failed row count",
"prompt": "Read the pipeline report and return compact JSON with exactly these fields: {\"pipeline\":\"...\",\"failed_rows\":number}.",
"response_format": "json",
"difficulty": 1,
"assets": [
{
"asset_id": "code-data-1-report",
"name": "pipeline_report.txt",
"mime_type": "text/plain",
"transport": "inline",
"materialization": "inline",
"encoding": "utf8",
"description": "Pipeline quality report.",
"content": "pipeline=nightly-orders\nfailed_rows=17\nschema_state=compatible"
}
],
"checkers": [
{
"checker_id": "code-data-1-pipeline",
"kind": "json_field_equals",
"field": "pipeline",
"expected": "nightly-orders"
},
{
"checker_id": "code-data-1-rows",
"kind": "json_field_equals",
"field": "failed_rows",
"expected": 17
}
]
},
{
"task_id": "code-data-2",
"title": "Return schema key",
"prompt": "Inspect the migration note and return only the schema key that must be pinned.",
"response_format": "text",
"difficulty": 1,
"assets": [
{
"asset_id": "code-data-2-note",
"name": "migration_note.md",
"mime_type": "text/markdown",
"transport": "inline",
"materialization": "workspace_file",
"encoding": "utf8",
"description": "Migration note for schema pinning.",
"content": "Pin schema_version before replaying the backlog."
}
],
"checkers": [
{
"checker_id": "code-data-2-key",
"kind": "exact_text",
"expected": "schema_version",
"case_sensitive": true
},
{
"checker_id": "code-data-2-assets",
"kind": "runner_fact_equals",
"fact": "materialized_asset_count",
"expected": 1
}
]
}
]
}
]
+3
-3
{
"name": "@fre4x/benchmark",
"version": "1.1.0-beta.1",
"description": "A benchmark MCP server for agent evaluation workflows.",
"version": "1.1.0-beta.3",
"description": "A deterministic benchmark MCP server for agent evaluation workflows.",
"type": "module",

@@ -14,3 +14,3 @@ "main": "dist/index.js",

"scripts": {
"build": "node ../scripts/build-package.mjs",
"build": "node ../scripts/build-package.mjs && node -e \"const fs=require('fs');fs.copyFileSync('catalogs/expanded-catalog.json','dist/fallback-catalog.json');\"",
"typecheck": "cross-env NODE_OPTIONS=--max-old-space-size=4096 tsc --noEmit",

@@ -17,0 +17,0 @@ "start": "node dist/index.js",

+122
-18

@@ -1,6 +0,6 @@

# benchmark — Unified Agent Evaluation
# benchmark — Deterministic Agent Evaluation
This package exposes a consistent MCP workflow for benchmark-driven agent evaluation.
This package exposes a consistent MCP workflow for deterministic benchmark-driven agent evaluation.
GAIA is the first built-in adapter, but the tool surface is generic so other benchmarks can plug in later without changing client behavior.
The rebuilt core is organized around challenge catalogs, typed task assets, and explicit checker kinds so coding, web, and OS-style tasks can share one MCP surface without relying on LLM judges.

@@ -11,7 +11,9 @@ ## Tools

|------|---------|
| `benchmark_list_challenges` | List available benchmark suites with version and asset metadata |
| `benchmark_start_challenge` | Start an attempt and return the first question |
| `benchmark_submit_solution` | Grade one answer and return the next question or final score |
| `benchmark_list_challenges` | List deterministic benchmark suites with family, runner, and checker metadata |
| `benchmark_get_catalog_status` | Inspect catalog source configuration, cache state, and availability |
| `benchmark_sync_catalog` | Fetch and cache the remote benchmark catalog when a URL source is configured |
| `benchmark_start_challenge` | Start an attempt and return the first task |
| `benchmark_submit_solution` | Grade one task and return checker evidence plus the next task or final score |
| `benchmark_get_asset` | Read an attached benchmark asset by `asset_id` |
| `benchmark_get_attempt` | Inspect attempt status and the current question |
| `benchmark_get_attempt` | Inspect attempt status, current task, and paginated evaluation history |
| `benchmark_cancel_attempt` | Cancel an active attempt |

@@ -24,3 +26,3 @@

3. Call `benchmark_start_challenge`
4. If the question has assets, call `benchmark_get_asset`
4. If the task has assets, call `benchmark_get_asset`
5. Call `benchmark_submit_solution`

@@ -31,5 +33,26 @@ 6. Repeat until `done: true`

## Fallback benchmark families
- **Code** — deterministic JSON/text answers backed by explicit checkers
- **Web** — DOM snapshot extraction tasks with JSON field assertions
- **OS** — filesystem/log review tasks with deterministic text grading
## Zero-config run
Run with the bundled fallback catalog and no extra configuration:
```bash
npx @fre4x/benchmark
```
Or from this repo:
```bash
cd /absolute/path/to/b1te
npm run inspector -w @fre4x/benchmark
```
## Mock Mode
Run without any external dataset file:
Run with the same bundled fallback catalog in mock mode:

@@ -40,6 +63,9 @@ ```bash

## Optional Environment
## Optional environment
```bash
BENCHMARK_GAIA_DATA_FILE=/absolute/path/to/gaia-challenges.json
BENCHMARK_CATALOG_FILE=/absolute/path/to/benchmark-catalog.json
BENCHMARK_CATALOG_URL=https://example.com/benchmark-catalog.json
BENCHMARK_CACHE_DIR=/absolute/path/to/catalog-cache
BENCHMARK_CACHE_TTL_SECONDS=3600
BENCHMARK_STATE_DIR=/absolute/path/to/store-attempt-json

@@ -49,6 +75,83 @@ BENCHMARK_MOCK=true

- `BENCHMARK_GAIA_DATA_FILE`: Optional JSON file with GAIA-compatible normalized challenge definitions
- `BENCHMARK_STATE_DIR`: Where attempt state is persisted
- `BENCHMARK_CATALOG_FILE`: Optional JSON file with challenge definitions in the rebuilt deterministic catalog format
- `BENCHMARK_CATALOG_URL`: Optional remote JSON catalog URL for fetch/cache based ingestion
- `BENCHMARK_CACHE_DIR`: Optional cache directory for remote catalog snapshots
- `BENCHMARK_CACHE_TTL_SECONDS`: Freshness window for remote catalog cache reuse
- `BENCHMARK_STATE_DIR`: Where attempt files and lock directories are persisted
- `BENCHMARK_MOCK`: Alternate mock-mode flag; set to `true` to run in mock mode
`BENCHMARK_GAIA_DATA_FILE` is still accepted as a backward-compatible alias, but the rebuilt package is no longer GAIA-first.
When `BENCHMARK_CATALOG_URL` is set, the package will reuse a fresh cached copy when available and can be explicitly refreshed with `benchmark_sync_catalog`.
## Catalog shape
External catalogs must be a JSON array of challenge definitions shaped like:
```json
[
{
"challenge_id": "custom_suite",
"benchmark_id": "custom",
"family": "code",
"runner_kind": "code_runner",
"title": "Custom Challenge",
"description": "Deterministic single-task suite",
"version": "v1",
"source": "external",
"tasks": [
{
"task_id": "custom-1",
"title": "Return yes",
"prompt": "Return only yes.",
"response_format": "text",
"difficulty": 1,
"assets": [],
"checkers": [
{
"checker_id": "custom-yes",
"kind": "exact_text",
"expected": "yes"
}
]
}
]
}
]
```
Supported checker kinds today:
- `exact_text`
- `normalized_text`
- `regex_match`
- `contains_all_text`
- `json_field_equals`
- `runner_fact_equals`
- `runner_log_contains_text`
Each submission now also records runner execution metadata:
- workspace directory
- materialized asset and submission artifacts
- runner facts
- runner logs
## Example external catalog
The repo includes the bundled fallback catalog source at:
```bash
benchmark/catalogs/expanded-catalog.json
```
Use it like this:
```bash
cd /absolute/path/to/b1te
BENCHMARK_CATALOG_FILE=/absolute/path/to/b1te/benchmark/catalogs/expanded-catalog.json npm run inspector -w @fre4x/benchmark
```
If no catalog env is provided at runtime, the published package falls back to the bundled copy of this catalog automatically.
## Claude Desktop

@@ -60,9 +163,10 @@

"benchmark": {
"command": "npx",
"args": ["-y", "@fre4x/benchmark"],
"env": {
"BENCHMARK_GAIA_DATA_FILE": "/absolute/path/to/gaia-challenges.json"
"command": "npx",
"args": ["-y", "@fre4x/benchmark"],
"env": {
"BENCHMARK_CATALOG_URL": "https://example.com/benchmark-catalog.json",
"BENCHMARK_CACHE_DIR": "/absolute/path/to/benchmark-cache"
}
}
}
}
}

@@ -69,0 +173,0 @@ ```

Sorry, the diff of this file is too big to display