@fre4x/benchmark
Advanced tools
| [ | ||
| { | ||
| "challenge_id": "code_ci_repair_drill", | ||
| "benchmark_id": "deterministic_mix", | ||
| "family": "code", | ||
| "runner_kind": "code_runner", | ||
| "title": "Code CI Repair Drill", | ||
| "description": "Read incident assets and answer with deterministic, machine-checkable outputs.", | ||
| "version": "builtin-v2", | ||
| "source": "builtin", | ||
| "tasks": [ | ||
| { | ||
| "task_id": "code-ci-1", | ||
| "title": "Classify the CI failure", | ||
| "prompt": "Read the attached incident report and failing test summary.\nReturn compact JSON with exactly these fields:\n{\"root_cause\":\"...\",\"failing_test_count\":number}\nThe root_cause must be one of: cache_key, timeout_config, path_join.", | ||
| "response_format": "json", | ||
| "difficulty": 1, | ||
| "assets": [ | ||
| { | ||
| "asset_id": "code-ci-incident", | ||
| "name": "incident_report.md", | ||
| "mime_type": "text/markdown", | ||
| "transport": "inline", | ||
| "materialization": "inline", | ||
| "encoding": "utf8", | ||
| "description": "Postmortem summary for the broken CI run.", | ||
| "content": "# Incident report\n\n- Regression introduced after lowering `timeout_ms` from 1500 to 500.\n- Remote artifact fetch now fails on slower runners.\n- No cache key or path joining regressions were found." | ||
| }, | ||
| { | ||
| "asset_id": "code-ci-tests", | ||
| "name": "failing_tests.txt", | ||
| "mime_type": "text/plain", | ||
| "transport": "inline", | ||
| "materialization": "inline", | ||
| "encoding": "utf8", | ||
| "description": "Names of the failing tests from CI.", | ||
| "content": "FAIL retry fetches artifact after timeout\nFAIL increases timeout for cold runner\nFAIL preserves timeout override in deploy step" | ||
| } | ||
| ], | ||
| "checkers": [ | ||
| { | ||
| "checker_id": "code-ci-root-cause", | ||
| "kind": "json_field_equals", | ||
| "field": "root_cause", | ||
| "expected": "timeout_config" | ||
| }, | ||
| { | ||
| "checker_id": "code-ci-failing-count", | ||
| "kind": "json_field_equals", | ||
| "field": "failing_test_count", | ||
| "expected": 3 | ||
| }, | ||
| { | ||
| "checker_id": "code-ci-materialized-assets", | ||
| "kind": "runner_fact_equals", | ||
| "fact": "materialized_asset_count", | ||
| "expected": 0 | ||
| }, | ||
| { | ||
| "checker_id": "code-ci-log", | ||
| "kind": "runner_log_contains_text", | ||
| "terms": ["prepared code workspace inputs"] | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "task_id": "code-ci-2", | ||
| "title": "Name the config key to patch", | ||
| "prompt": "Read the patch note and return only the configuration key that must be raised.\nDo not return prose.", | ||
| "response_format": "text", | ||
| "difficulty": 1, | ||
| "assets": [ | ||
| { | ||
| "asset_id": "code-ci-patch-note", | ||
| "name": "patch_note.txt", | ||
| "mime_type": "text/plain", | ||
| "transport": "inline", | ||
| "materialization": "inline", | ||
| "encoding": "utf8", | ||
| "description": "Patch hint for the CI timeout regression.", | ||
| "content": "Increase timeout_ms to 1500 so slow runners can finish the artifact fetch." | ||
| } | ||
| ], | ||
| "checkers": [ | ||
| { | ||
| "checker_id": "code-ci-config-key", | ||
| "kind": "exact_text", | ||
| "expected": "timeout_ms", | ||
| "case_sensitive": true | ||
| } | ||
| ] | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "challenge_id": "web_ops_order_reconciliation", | ||
| "benchmark_id": "deterministic_mix", | ||
| "family": "web", | ||
| "runner_kind": "browser_runner", | ||
| "title": "Web Ops Order Reconciliation", | ||
| "description": "Parse a simulated operations dashboard and return deterministic JSON fields.", | ||
| "version": "builtin-v2", | ||
| "source": "builtin", | ||
| "tasks": [ | ||
| { | ||
| "task_id": "web-ops-1", | ||
| "title": "Extract dashboard status", | ||
| "prompt": "Inspect the attached HTML snapshot.\nReturn compact JSON with exactly these fields:\n{\"order_id\":\"...\",\"status\":\"...\",\"eta\":\"...\"}", | ||
| "response_format": "json", | ||
| "difficulty": 2, | ||
| "assets": [ | ||
| { | ||
| "asset_id": "web-ops-dashboard", | ||
| "name": "dashboard.html", | ||
| "mime_type": "text/html", | ||
| "transport": "inline", | ||
| "materialization": "workspace_file", | ||
| "encoding": "utf8", | ||
| "description": "Static DOM snapshot of the fulfillment dashboard.", | ||
| "content": "<section id=\"orders\">\n <article data-order-id=\"SO-2048\">\n <h2>Shipment SO-2048</h2>\n <p class=\"status\">Delayed</p>\n <p class=\"eta\">ETA: 2026-05-14</p>\n </article>\n</section>" | ||
| } | ||
| ], | ||
| "checkers": [ | ||
| { | ||
| "checker_id": "web-order-id", | ||
| "kind": "json_field_equals", | ||
| "field": "order_id", | ||
| "expected": "SO-2048" | ||
| }, | ||
| { | ||
| "checker_id": "web-status", | ||
| "kind": "json_field_equals", | ||
| "field": "status", | ||
| "expected": "Delayed" | ||
| }, | ||
| { | ||
| "checker_id": "web-eta", | ||
| "kind": "json_field_equals", | ||
| "field": "eta", | ||
| "expected": "2026-05-14" | ||
| }, | ||
| { | ||
| "checker_id": "web-materialized-assets", | ||
| "kind": "runner_fact_equals", | ||
| "fact": "materialized_asset_count", | ||
| "expected": 1 | ||
| }, | ||
| { | ||
| "checker_id": "web-log", | ||
| "kind": "runner_log_contains_text", | ||
| "terms": ["prepared browser snapshot workspace"] | ||
| } | ||
| ] | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "challenge_id": "os_release_gate_review", | ||
| "benchmark_id": "deterministic_mix", | ||
| "family": "os", | ||
| "runner_kind": "os_runner", | ||
| "title": "OS Release Gate Review", | ||
| "description": "Read filesystem-style assets and answer with deterministic release decisions.", | ||
| "version": "builtin-v2", | ||
| "source": "builtin", | ||
| "tasks": [ | ||
| { | ||
| "task_id": "os-gate-1", | ||
| "title": "Decide the release action", | ||
| "prompt": "Read the attached release notes and audit log.\nReturn only the final action in lowercase.\nExpected format: two words.", | ||
| "response_format": "text", | ||
| "difficulty": 2, | ||
| "assets": [ | ||
| { | ||
| "asset_id": "os-release-notes", | ||
| "name": "release_notes.txt", | ||
| "mime_type": "text/plain", | ||
| "transport": "inline", | ||
| "materialization": "workspace_file", | ||
| "encoding": "utf8", | ||
| "description": "Release checklist summary.", | ||
| "content": "release=2026.05.14\ndatabase_migration=pending\nsecurity_scan=pass\nrollback_bundle=ready" | ||
| }, | ||
| { | ||
| "asset_id": "os-audit-log", | ||
| "name": "audit.log", | ||
| "mime_type": "text/plain", | ||
| "transport": "inline", | ||
| "materialization": "workspace_file", | ||
| "encoding": "utf8", | ||
| "description": "Audit events from the release gate.", | ||
| "content": "[warn] migration step not executed\n[info] rollback artifacts verified\n[decision] rollback release" | ||
| } | ||
| ], | ||
| "checkers": [ | ||
| { | ||
| "checker_id": "os-release-action", | ||
| "kind": "normalized_text", | ||
| "expected": "rollback release" | ||
| }, | ||
| { | ||
| "checker_id": "os-materialized-assets", | ||
| "kind": "runner_fact_equals", | ||
| "fact": "materialized_asset_count", | ||
| "expected": 2 | ||
| }, | ||
| { | ||
| "checker_id": "os-log", | ||
| "kind": "runner_log_contains_text", | ||
| "terms": ["prepared os workspace inventory"] | ||
| } | ||
| ] | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "challenge_id": "code_repository_incident_pack", | ||
| "benchmark_id": "external_mix", | ||
| "family": "code", | ||
| "runner_kind": "code_runner", | ||
| "title": "Code Repository Incident Pack", | ||
| "description": "Repository incident triage tasks for deterministic code evaluation.", | ||
| "version": "external-v1", | ||
| "source": "external-example", | ||
| "tasks": [ | ||
| { | ||
| "task_id": "code-repo-1", | ||
| "title": "Identify regression owner", | ||
| "prompt": "Read the incident brief and return compact JSON with exactly these fields: {\"repository\":\"...\",\"root_cause\":\"...\"}. The root_cause must be one of: retry_budget, timeout_config, cache_key.", | ||
| "response_format": "json", | ||
| "difficulty": 1, | ||
| "assets": [ | ||
| { | ||
| "asset_id": "code-repo-1-brief", | ||
| "name": "incident.md", | ||
| "mime_type": "text/markdown", | ||
| "transport": "inline", | ||
| "materialization": "inline", | ||
| "encoding": "utf8", | ||
| "description": "Repository incident summary.", | ||
| "content": "# Repo incident\n\nRepository: api-gateway\nRegression traced to timeout_config drift after a config cleanup.\nNo retry_budget or cache_key regressions were found." | ||
| } | ||
| ], | ||
| "checkers": [ | ||
| { | ||
| "checker_id": "code-repo-1-repository", | ||
| "kind": "json_field_equals", | ||
| "field": "repository", | ||
| "expected": "api-gateway" | ||
| }, | ||
| { | ||
| "checker_id": "code-repo-1-cause", | ||
| "kind": "json_field_equals", | ||
| "field": "root_cause", | ||
| "expected": "timeout_config" | ||
| }, | ||
| { | ||
| "checker_id": "code-repo-1-log", | ||
| "kind": "runner_log_contains_text", | ||
| "terms": ["prepared code workspace inputs"] | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "task_id": "code-repo-2", | ||
| "title": "Return the hotfix branch", | ||
| "prompt": "Read the handoff note and return only the hotfix branch name.", | ||
| "response_format": "text", | ||
| "difficulty": 1, | ||
| "assets": [ | ||
| { | ||
| "asset_id": "code-repo-2-note", | ||
| "name": "handoff.txt", | ||
| "mime_type": "text/plain", | ||
| "transport": "inline", | ||
| "materialization": "inline", | ||
| "encoding": "utf8", | ||
| "description": "Hotfix branch handoff.", | ||
| "content": "Create hotfix/timeout-rollback before restarting deployment." | ||
| } | ||
| ], | ||
| "checkers": [ | ||
| { | ||
| "checker_id": "code-repo-2-branch", | ||
| "kind": "exact_text", | ||
| "expected": "hotfix/timeout-rollback", | ||
| "case_sensitive": true | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "task_id": "code-repo-3", | ||
| "title": "Extract severity metadata", | ||
| "prompt": "Inspect the release memo and return compact JSON with exactly these fields: {\"owner\":\"...\",\"severity\":\"...\"}.", | ||
| "response_format": "json", | ||
| "difficulty": 2, | ||
| "assets": [ | ||
| { | ||
| "asset_id": "code-repo-3-memo", | ||
| "name": "release_memo.txt", | ||
| "mime_type": "text/plain", | ||
| "transport": "inline", | ||
| "materialization": "workspace_file", | ||
| "encoding": "utf8", | ||
| "description": "Release severity memo.", | ||
| "content": "Owner: platform-release\nSeverity: high\nRollback status: ready" | ||
| } | ||
| ], | ||
| "checkers": [ | ||
| { | ||
| "checker_id": "code-repo-3-owner", | ||
| "kind": "json_field_equals", | ||
| "field": "owner", | ||
| "expected": "platform-release" | ||
| }, | ||
| { | ||
| "checker_id": "code-repo-3-severity", | ||
| "kind": "json_field_equals", | ||
| "field": "severity", | ||
| "expected": "high" | ||
| }, | ||
| { | ||
| "checker_id": "code-repo-3-assets", | ||
| "kind": "runner_fact_equals", | ||
| "fact": "materialized_asset_count", | ||
| "expected": 1 | ||
| } | ||
| ] | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "challenge_id": "web_customer_portal_pack", | ||
| "benchmark_id": "external_mix", | ||
| "family": "web", | ||
| "runner_kind": "browser_runner", | ||
| "title": "Web Customer Portal Pack", | ||
| "description": "Portal extraction tasks using deterministic DOM snapshots.", | ||
| "version": "external-v1", | ||
| "source": "external-example", | ||
| "tasks": [ | ||
| { | ||
| "task_id": "web-portal-1", | ||
| "title": "Extract queue ownership", | ||
| "prompt": "Read the HTML snapshot and return compact JSON with exactly these fields: {\"ticket_id\":\"...\",\"priority\":\"...\",\"owner\":\"...\"}.", | ||
| "response_format": "json", | ||
| "difficulty": 2, | ||
| "assets": [ | ||
| { | ||
| "asset_id": "web-portal-1-html", | ||
| "name": "queue.html", | ||
| "mime_type": "text/html", | ||
| "transport": "inline", | ||
| "materialization": "workspace_file", | ||
| "encoding": "utf8", | ||
| "description": "Support queue HTML snapshot.", | ||
| "content": "<div class=\"ticket\" data-ticket-id=\"T-4401\"><span class=\"priority\">urgent</span><span class=\"owner\">mina.choi</span></div>" | ||
| } | ||
| ], | ||
| "checkers": [ | ||
| { | ||
| "checker_id": "web-portal-1-ticket", | ||
| "kind": "json_field_equals", | ||
| "field": "ticket_id", | ||
| "expected": "T-4401" | ||
| }, | ||
| { | ||
| "checker_id": "web-portal-1-priority", | ||
| "kind": "json_field_equals", | ||
| "field": "priority", | ||
| "expected": "urgent" | ||
| }, | ||
| { | ||
| "checker_id": "web-portal-1-owner", | ||
| "kind": "json_field_equals", | ||
| "field": "owner", | ||
| "expected": "mina.choi" | ||
| }, | ||
| { | ||
| "checker_id": "web-portal-1-log", | ||
| "kind": "runner_log_contains_text", | ||
| "terms": ["prepared browser snapshot workspace"] | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "task_id": "web-portal-2", | ||
| "title": "Return the account state", | ||
| "prompt": "Inspect the account banner and return only the account state phrase in lowercase.", | ||
| "response_format": "text", | ||
| "difficulty": 1, | ||
| "assets": [ | ||
| { | ||
| "asset_id": "web-portal-2-banner", | ||
| "name": "banner.html", | ||
| "mime_type": "text/html", | ||
| "transport": "inline", | ||
| "materialization": "inline", | ||
| "encoding": "utf8", | ||
| "description": "Account banner snapshot.", | ||
| "content": "<aside class=\"banner\">Account state: billing hold</aside>" | ||
| } | ||
| ], | ||
| "checkers": [ | ||
| { | ||
| "checker_id": "web-portal-2-state", | ||
| "kind": "normalized_text", | ||
| "expected": "billing hold" | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "task_id": "web-portal-3", | ||
| "title": "Extract invoice status", | ||
| "prompt": "Return compact JSON with exactly these fields after reading the invoice card: {\"invoice_id\":\"...\",\"status\":\"...\"}.", | ||
| "response_format": "json", | ||
| "difficulty": 1, | ||
| "assets": [ | ||
| { | ||
| "asset_id": "web-portal-3-card", | ||
| "name": "invoice.html", | ||
| "mime_type": "text/html", | ||
| "transport": "inline", | ||
| "materialization": "workspace_file", | ||
| "encoding": "utf8", | ||
| "description": "Invoice card HTML.", | ||
| "content": "<article class=\"invoice\" data-invoice-id=\"INV-991\"><span class=\"status\">Escalated</span></article>" | ||
| } | ||
| ], | ||
| "checkers": [ | ||
| { | ||
| "checker_id": "web-portal-3-id", | ||
| "kind": "json_field_equals", | ||
| "field": "invoice_id", | ||
| "expected": "INV-991" | ||
| }, | ||
| { | ||
| "checker_id": "web-portal-3-status", | ||
| "kind": "json_field_equals", | ||
| "field": "status", | ||
| "expected": "Escalated" | ||
| }, | ||
| { | ||
| "checker_id": "web-portal-3-assets", | ||
| "kind": "runner_fact_equals", | ||
| "fact": "materialized_asset_count", | ||
| "expected": 1 | ||
| } | ||
| ] | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "challenge_id": "os_release_ops_pack", | ||
| "benchmark_id": "external_mix", | ||
| "family": "os", | ||
| "runner_kind": "os_runner", | ||
| "title": "OS Release Ops Pack", | ||
| "description": "Filesystem and audit-log based deterministic operations tasks.", | ||
| "version": "external-v1", | ||
| "source": "external-example", | ||
| "tasks": [ | ||
| { | ||
| "task_id": "os-ops-1", | ||
| "title": "Decide deployment action", | ||
| "prompt": "Read the release note and return only the final deployment action in lowercase.", | ||
| "response_format": "text", | ||
| "difficulty": 2, | ||
| "assets": [ | ||
| { | ||
| "asset_id": "os-ops-1-release", | ||
| "name": "release.txt", | ||
| "mime_type": "text/plain", | ||
| "transport": "inline", | ||
| "materialization": "workspace_file", | ||
| "encoding": "utf8", | ||
| "description": "Release action summary.", | ||
| "content": "deploy_window=open\nmigration=blocked\naction=pause deploy" | ||
| } | ||
| ], | ||
| "checkers": [ | ||
| { | ||
| "checker_id": "os-ops-1-action", | ||
| "kind": "normalized_text", | ||
| "expected": "pause deploy" | ||
| }, | ||
| { | ||
| "checker_id": "os-ops-1-assets", | ||
| "kind": "runner_fact_equals", | ||
| "fact": "materialized_asset_count", | ||
| "expected": 1 | ||
| }, | ||
| { | ||
| "checker_id": "os-ops-1-log", | ||
| "kind": "runner_log_contains_text", | ||
| "terms": ["prepared os workspace inventory"] | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "task_id": "os-ops-2", | ||
| "title": "Extract cleanup order", | ||
| "prompt": "Return compact JSON with exactly these fields after reading the session audit: {\"cleanup_action\":\"...\",\"stale_sessions\":number}.", | ||
| "response_format": "json", | ||
| "difficulty": 2, | ||
| "assets": [ | ||
| { | ||
| "asset_id": "os-ops-2-audit", | ||
| "name": "session_audit.log", | ||
| "mime_type": "text/plain", | ||
| "transport": "inline", | ||
| "materialization": "workspace_file", | ||
| "encoding": "utf8", | ||
| "description": "Session cleanup audit log.", | ||
| "content": "cleanup_action=rotate tokens\nstale_sessions=4\nrollback_ready=yes" | ||
| } | ||
| ], | ||
| "checkers": [ | ||
| { | ||
| "checker_id": "os-ops-2-action", | ||
| "kind": "json_field_equals", | ||
| "field": "cleanup_action", | ||
| "expected": "rotate tokens" | ||
| }, | ||
| { | ||
| "checker_id": "os-ops-2-count", | ||
| "kind": "json_field_equals", | ||
| "field": "stale_sessions", | ||
| "expected": 4 | ||
| } | ||
| ] | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "challenge_id": "code_data_quality_pack", | ||
| "benchmark_id": "external_mix", | ||
| "family": "code", | ||
| "runner_kind": "code_runner", | ||
| "title": "Code Data Quality Pack", | ||
| "description": "Data pipeline and schema audit tasks for deterministic code-oriented evaluation.", | ||
| "version": "external-v1", | ||
| "source": "external-example", | ||
| "tasks": [ | ||
| { | ||
| "task_id": "code-data-1", | ||
| "title": "Extract failed row count", | ||
| "prompt": "Read the pipeline report and return compact JSON with exactly these fields: {\"pipeline\":\"...\",\"failed_rows\":number}.", | ||
| "response_format": "json", | ||
| "difficulty": 1, | ||
| "assets": [ | ||
| { | ||
| "asset_id": "code-data-1-report", | ||
| "name": "pipeline_report.txt", | ||
| "mime_type": "text/plain", | ||
| "transport": "inline", | ||
| "materialization": "inline", | ||
| "encoding": "utf8", | ||
| "description": "Pipeline quality report.", | ||
| "content": "pipeline=nightly-orders\nfailed_rows=17\nschema_state=compatible" | ||
| } | ||
| ], | ||
| "checkers": [ | ||
| { | ||
| "checker_id": "code-data-1-pipeline", | ||
| "kind": "json_field_equals", | ||
| "field": "pipeline", | ||
| "expected": "nightly-orders" | ||
| }, | ||
| { | ||
| "checker_id": "code-data-1-rows", | ||
| "kind": "json_field_equals", | ||
| "field": "failed_rows", | ||
| "expected": 17 | ||
| } | ||
| ] | ||
| }, | ||
| { | ||
| "task_id": "code-data-2", | ||
| "title": "Return schema key", | ||
| "prompt": "Inspect the migration note and return only the schema key that must be pinned.", | ||
| "response_format": "text", | ||
| "difficulty": 1, | ||
| "assets": [ | ||
| { | ||
| "asset_id": "code-data-2-note", | ||
| "name": "migration_note.md", | ||
| "mime_type": "text/markdown", | ||
| "transport": "inline", | ||
| "materialization": "workspace_file", | ||
| "encoding": "utf8", | ||
| "description": "Migration note for schema pinning.", | ||
| "content": "Pin schema_version before replaying the backlog." | ||
| } | ||
| ], | ||
| "checkers": [ | ||
| { | ||
| "checker_id": "code-data-2-key", | ||
| "kind": "exact_text", | ||
| "expected": "schema_version", | ||
| "case_sensitive": true | ||
| }, | ||
| { | ||
| "checker_id": "code-data-2-assets", | ||
| "kind": "runner_fact_equals", | ||
| "fact": "materialized_asset_count", | ||
| "expected": 1 | ||
| } | ||
| ] | ||
| } | ||
| ] | ||
| } | ||
| ] |
+3
-3
| { | ||
| "name": "@fre4x/benchmark", | ||
| "version": "1.1.0-beta.1", | ||
| "description": "A benchmark MCP server for agent evaluation workflows.", | ||
| "version": "1.1.0-beta.3", | ||
| "description": "A deterministic benchmark MCP server for agent evaluation workflows.", | ||
| "type": "module", | ||
@@ -14,3 +14,3 @@ "main": "dist/index.js", | ||
| "scripts": { | ||
| "build": "node ../scripts/build-package.mjs", | ||
| "build": "node ../scripts/build-package.mjs && node -e \"const fs=require('fs');fs.copyFileSync('catalogs/expanded-catalog.json','dist/fallback-catalog.json');\"", | ||
| "typecheck": "cross-env NODE_OPTIONS=--max-old-space-size=4096 tsc --noEmit", | ||
@@ -17,0 +17,0 @@ "start": "node dist/index.js", |
+122
-18
@@ -1,6 +0,6 @@ | ||
| # benchmark — Unified Agent Evaluation | ||
| # benchmark — Deterministic Agent Evaluation | ||
| This package exposes a consistent MCP workflow for benchmark-driven agent evaluation. | ||
| This package exposes a consistent MCP workflow for deterministic benchmark-driven agent evaluation. | ||
| GAIA is the first built-in adapter, but the tool surface is generic so other benchmarks can plug in later without changing client behavior. | ||
| The rebuilt core is organized around challenge catalogs, typed task assets, and explicit checker kinds so coding, web, and OS-style tasks can share one MCP surface without relying on LLM judges. | ||
@@ -11,7 +11,9 @@ ## Tools | ||
| |------|---------| | ||
| | `benchmark_list_challenges` | List available benchmark suites with version and asset metadata | | ||
| | `benchmark_start_challenge` | Start an attempt and return the first question | | ||
| | `benchmark_submit_solution` | Grade one answer and return the next question or final score | | ||
| | `benchmark_list_challenges` | List deterministic benchmark suites with family, runner, and checker metadata | | ||
| | `benchmark_get_catalog_status` | Inspect catalog source configuration, cache state, and availability | | ||
| | `benchmark_sync_catalog` | Fetch and cache the remote benchmark catalog when a URL source is configured | | ||
| | `benchmark_start_challenge` | Start an attempt and return the first task | | ||
| | `benchmark_submit_solution` | Grade one task and return checker evidence plus the next task or final score | | ||
| | `benchmark_get_asset` | Read an attached benchmark asset by `asset_id` | | ||
| | `benchmark_get_attempt` | Inspect attempt status and the current question | | ||
| | `benchmark_get_attempt` | Inspect attempt status, current task, and paginated evaluation history | | ||
| | `benchmark_cancel_attempt` | Cancel an active attempt | | ||
@@ -24,3 +26,3 @@ | ||
| 3. Call `benchmark_start_challenge` | ||
| 4. If the question has assets, call `benchmark_get_asset` | ||
| 4. If the task has assets, call `benchmark_get_asset` | ||
| 5. Call `benchmark_submit_solution` | ||
@@ -31,5 +33,26 @@ 6. Repeat until `done: true` | ||
| ## Fallback benchmark families | ||
| - **Code** — deterministic JSON/text answers backed by explicit checkers | ||
| - **Web** — DOM snapshot extraction tasks with JSON field assertions | ||
| - **OS** — filesystem/log review tasks with deterministic text grading | ||
| ## Zero-config run | ||
| Run with the bundled fallback catalog and no extra configuration: | ||
| ```bash | ||
| npx @fre4x/benchmark | ||
| ``` | ||
| Or from this repo: | ||
| ```bash | ||
| cd /absolute/path/to/b1te | ||
| npm run inspector -w @fre4x/benchmark | ||
| ``` | ||
| ## Mock Mode | ||
| Run without any external dataset file: | ||
| Run with the same bundled fallback catalog in mock mode: | ||
@@ -40,6 +63,9 @@ ```bash | ||
| ## Optional Environment | ||
| ## Optional environment | ||
| ```bash | ||
| BENCHMARK_GAIA_DATA_FILE=/absolute/path/to/gaia-challenges.json | ||
| BENCHMARK_CATALOG_FILE=/absolute/path/to/benchmark-catalog.json | ||
| BENCHMARK_CATALOG_URL=https://example.com/benchmark-catalog.json | ||
| BENCHMARK_CACHE_DIR=/absolute/path/to/catalog-cache | ||
| BENCHMARK_CACHE_TTL_SECONDS=3600 | ||
| BENCHMARK_STATE_DIR=/absolute/path/to/store-attempt-json | ||
@@ -49,6 +75,83 @@ BENCHMARK_MOCK=true | ||
| - `BENCHMARK_GAIA_DATA_FILE`: Optional JSON file with GAIA-compatible normalized challenge definitions | ||
| - `BENCHMARK_STATE_DIR`: Where attempt state is persisted | ||
| - `BENCHMARK_CATALOG_FILE`: Optional JSON file with challenge definitions in the rebuilt deterministic catalog format | ||
| - `BENCHMARK_CATALOG_URL`: Optional remote JSON catalog URL for fetch/cache based ingestion | ||
| - `BENCHMARK_CACHE_DIR`: Optional cache directory for remote catalog snapshots | ||
| - `BENCHMARK_CACHE_TTL_SECONDS`: Freshness window for remote catalog cache reuse | ||
| - `BENCHMARK_STATE_DIR`: Where attempt files and lock directories are persisted | ||
| - `BENCHMARK_MOCK`: Alternate mock-mode flag | ||
| `BENCHMARK_GAIA_DATA_FILE` is still accepted as a backward-compatible alias, but the rebuilt package is no longer GAIA-first. | ||
| When `BENCHMARK_CATALOG_URL` is set, the package will reuse a fresh cached copy when available and can be explicitly refreshed with `benchmark_sync_catalog`. | ||
| ## Catalog shape | ||
| External catalogs must be a JSON array of challenge definitions shaped like: | ||
| ```json | ||
| [ | ||
| { | ||
| "challenge_id": "custom_suite", | ||
| "benchmark_id": "custom", | ||
| "family": "code", | ||
| "runner_kind": "code_runner", | ||
| "title": "Custom Challenge", | ||
| "description": "Deterministic single-task suite", | ||
| "version": "v1", | ||
| "source": "external", | ||
| "tasks": [ | ||
| { | ||
| "task_id": "custom-1", | ||
| "title": "Return yes", | ||
| "prompt": "Return only yes.", | ||
| "response_format": "text", | ||
| "difficulty": 1, | ||
| "assets": [], | ||
| "checkers": [ | ||
| { | ||
| "checker_id": "custom-yes", | ||
| "kind": "exact_text", | ||
| "expected": "yes" | ||
| } | ||
| ] | ||
| } | ||
| ] | ||
| } | ||
| ] | ||
| ``` | ||
| Supported checker kinds today: | ||
| - `exact_text` | ||
| - `normalized_text` | ||
| - `regex_match` | ||
| - `contains_all_text` | ||
| - `json_field_equals` | ||
| - `runner_fact_equals` | ||
| - `runner_log_contains_text` | ||
| Each submission now also records runner execution metadata: | ||
| - workspace directory | ||
| - materialized asset and submission artifacts | ||
| - runner facts | ||
| - runner logs | ||
| ## Example external catalog | ||
| The repo includes the bundled fallback catalog source at: | ||
| ```bash | ||
| benchmark/catalogs/expanded-catalog.json | ||
| ``` | ||
| Use it like this: | ||
| ```bash | ||
| cd /absolute/path/to/b1te | ||
| BENCHMARK_CATALOG_FILE=/absolute/path/to/b1te/benchmark/catalogs/expanded-catalog.json npm run inspector -w @fre4x/benchmark | ||
| ``` | ||
| If no catalog env is provided at runtime, the published package falls back to the bundled copy of this catalog automatically. | ||
| ## Claude Desktop | ||
@@ -60,9 +163,10 @@ | ||
| "benchmark": { | ||
| "command": "npx", | ||
| "args": ["-y", "@fre4x/benchmark"], | ||
| "env": { | ||
| "BENCHMARK_GAIA_DATA_FILE": "/absolute/path/to/gaia-challenges.json" | ||
| "command": "npx", | ||
| "args": ["-y", "@fre4x/benchmark"], | ||
| "env": { | ||
| "BENCHMARK_CATALOG_URL": "https://example.com/benchmark-catalog.json", | ||
| "BENCHMARK_CACHE_DIR": "/absolute/path/to/benchmark-cache" | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
@@ -69,0 +173,0 @@ ``` |
Sorry, the diff of this file is too big to display
1194339
5.18%4
33.33%32682
4.76%178
140.54%