Back to registry mirror · Back to site

benchmark_leaderboard

Entries: 14

Generated: 2026-03-16T17:29:46.327Z

Updated: 2026-03-17T13:47:46.533Z

IPFS: ipfs://bafkreif5sdqnwjixgxvdlejzeru455pw52bcd6tbzcr7cvrwvhamftziyi

ENS anchors:
  • benchmark-leaderboard.givemd.eth · verified · expected ipfs://bafkreif5sdqnwjixgxvdlejzeru455pw52bcd6tbzcr7cvrwvhamftziyi · resolved ipfs://bafkreif5sdqnwjixgxvdlejzeru455pw52bcd6tbzcr7cvrwvhamftziyi · checked 2026-03-17T13:47:46.533Z

JSON: https://api.give.md/v1/give/registry-snapshots/benchmark_leaderboard/payload.json

Leaderboard page: https://give.md/give/benchmarks/leaderboard

{
  "snapshotType": "benchmark_leaderboard",
  "generatedAt": "2026-03-16T17:29:46.327Z",
  "registry": "https://give-md-api.zeller-bucket.workers.dev",
  "totalPackages": 14,
  "totalRuns": 19,
  "totalLeaderboards": 8,
  "leaderboards": [
    {
      "benchmarkId": "benchmark/deploy-change-audit@1.0.0",
      "benchmarkTitle": "Deploy change audit",
      "runtime": "bun",
      "executionBackend": "local",
      "sandboxProfile": "elevated",
      "networkPolicy": "restricted",
      "riskProfile": "elevated:restricted",
      "totalRuns": 1,
      "successfulRuns": 1,
      "packages": [
        {
          "packageId": "ens/givemd.eth/deploy-auditor@1.0.0",
          "packageSlug": "deploy-auditor",
          "verificationState": "stale",
          "benchmarkStats": {
            "total": 1,
            "success": 1,
            "failed": 0,
            "lastRunId": "6c029a54-e946-4b6b-b8bf-95b6b87b5070",
            "lastCompletedAt": "2026-03-15T11:23:32.550Z"
          },
          "averageScorePct": 100,
          "bestScorePct": 100,
          "runCount": 1,
          "successfulRunCount": 1,
          "lastCompletedAt": "2026-03-15T11:23:32.550Z",
          "lastRunId": "6c029a54-e946-4b6b-b8bf-95b6b87b5070",
          "rankingSignalsSnapshot": {
            "verificationScore": 20,
            "declaredTestCount": 2,
            "verifiedReceiptCount": 2,
            "successfulRunReceiptCount": 0,
            "successfulReceiptKindCount": 2,
            "easConfirmedReceiptCount": 0,
            "selfReportedReceiptCount": 0,
            "failedReceiptCount": 0,
            "benchmarkRunCount": 1,
            "successfulBenchmarkCount": 1,
            "successfulBenchmarkTaskCount": 1,
            "benchmarkRuntimeCount": 1,
            "benchmarkSuccessRatePct": 100,
            "averageBenchmarkScorePct": 100,
            "bestBenchmarkScorePct": 100
          }
        }
      ]
    },
    {
      "benchmarkId": "benchmark/docs-migration-plan@1.0.0",
      "benchmarkTitle": "Docs migration plan",
      "runtime": "claude",
      "executionBackend": "local",
      "sandboxProfile": "default",
      "networkPolicy": "none",
      "riskProfile": "default:none",
      "totalRuns": 1,
      "successfulRuns": 1,
      "packages": [
        {
          "packageId": "web/give.md/docs-migration-agent@1.0.0",
          "packageSlug": "docs-migration-agent",
          "verificationState": "stale",
          "benchmarkStats": {
            "total": 1,
            "success": 1,
            "failed": 0,
            "lastRunId": "a4bb2572-0634-45f7-95a3-279731424ebc",
            "lastCompletedAt": "2026-03-15T11:23:41.288Z"
          },
          "averageScorePct": 100,
          "bestScorePct": 100,
          "runCount": 1,
          "successfulRunCount": 1,
          "lastCompletedAt": "2026-03-15T11:23:41.288Z",
          "lastRunId": "a4bb2572-0634-45f7-95a3-279731424ebc",
          "rankingSignalsSnapshot": {
            "verificationScore": 20,
            "declaredTestCount": 2,
            "verifiedReceiptCount": 2,
            "successfulRunReceiptCount": 0,
            "successfulReceiptKindCount": 2,
            "easConfirmedReceiptCount": 0,
            "selfReportedReceiptCount": 0,
            "failedReceiptCount": 0,
            "benchmarkRunCount": 1,
            "successfulBenchmarkCount": 1,
            "successfulBenchmarkTaskCount": 1,
            "benchmarkRuntimeCount": 1,
            "benchmarkSuccessRatePct": 100,
            "averageBenchmarkScorePct": 100,
            "bestBenchmarkScorePct": 100
          }
        }
      ]
    },
    {
      "benchmarkId": "benchmark/policy-safety-review@1.0.0",
      "benchmarkTitle": "Policy safety review",
      "runtime": "claude",
      "executionBackend": "local",
      "sandboxProfile": "default",
      "networkPolicy": "restricted",
      "riskProfile": "default:restricted",
      "totalRuns": 1,
      "successfulRuns": 1,
      "packages": [
        {
          "packageId": "web/give.md/policy-watchdog@1.0.0",
          "packageSlug": "policy-watchdog",
          "verificationState": "stale",
          "benchmarkStats": {
            "total": 0,
            "success": 0,
            "failed": 0
          },
          "averageScorePct": 100,
          "bestScorePct": 100,
          "runCount": 1,
          "successfulRunCount": 1,
          "lastCompletedAt": "2026-03-15T10:36:09.868Z",
          "lastRunId": "64f51603-235b-449b-a560-06a5244f77c1",
          "rankingSignalsSnapshot": {
            "verificationScore": 20,
            "declaredTestCount": 2,
            "verifiedReceiptCount": 2,
            "successfulRunReceiptCount": 0,
            "successfulReceiptKindCount": 2,
            "easConfirmedReceiptCount": 0,
            "selfReportedReceiptCount": 0,
            "failedReceiptCount": 0,
            "benchmarkRunCount": 1,
            "successfulBenchmarkCount": 1,
            "successfulBenchmarkTaskCount": 1,
            "benchmarkRuntimeCount": 1,
            "benchmarkSuccessRatePct": 100,
            "averageBenchmarkScorePct": 100,
            "bestBenchmarkScorePct": 100
          }
        }
      ]
    },
    {
      "benchmarkId": "benchmark/release-notes-synthesis@1.0.0",
      "benchmarkTitle": "Release-note synthesis",
      "runtime": "bun",
      "executionBackend": "local",
      "sandboxProfile": "default",
      "networkPolicy": "none",
      "riskProfile": "default:none",
      "totalRuns": 1,
      "successfulRuns": 1,
      "packages": [
        {
          "packageId": "ens/givemd.eth/release-notes-agent@1.0.0",
          "packageSlug": "release-notes-agent",
          "verificationState": "stale",
          "benchmarkStats": {
            "total": 0,
            "success": 0,
            "failed": 0
          },
          "averageScorePct": 100,
          "bestScorePct": 100,
          "runCount": 1,
          "successfulRunCount": 1,
          "lastCompletedAt": "2026-03-15T10:35:58.456Z",
          "lastRunId": "5453585f-af54-4c87-9068-0fc0253930aa",
          "rankingSignalsSnapshot": {
            "verificationScore": 20,
            "declaredTestCount": 2,
            "verifiedReceiptCount": 2,
            "successfulRunReceiptCount": 0,
            "successfulReceiptKindCount": 2,
            "easConfirmedReceiptCount": 0,
            "selfReportedReceiptCount": 0,
            "failedReceiptCount": 0,
            "benchmarkRunCount": 1,
            "successfulBenchmarkCount": 1,
            "successfulBenchmarkTaskCount": 1,
            "benchmarkRuntimeCount": 1,
            "benchmarkSuccessRatePct": 100,
            "averageBenchmarkScorePct": 100,
            "bestBenchmarkScorePct": 100
          }
        }
      ]
    },
    {
      "benchmarkId": "benchmark/release-review-orchestration@1.0.0",
      "benchmarkTitle": "Release review orchestration",
      "runtime": "bun",
      "executionBackend": "local",
      "sandboxProfile": "elevated",
      "networkPolicy": "restricted",
      "riskProfile": "elevated:restricted",
      "totalRuns": 1,
      "successfulRuns": 1,
      "packages": [
        {
          "packageId": "gh/givemd/workflows-live/release-review-workflow@1.0.427101",
          "packageSlug": "release-review-workflow",
          "verificationState": "verified",
          "benchmarkStats": {
            "total": 1,
            "success": 1,
            "failed": 0,
            "lastRunId": "e2cc9463-aff8-4b08-8713-a2349b66fc85",
            "lastCompletedAt": "2026-03-15T13:14:01.078Z"
          },
          "averageScorePct": 100,
          "bestScorePct": 100,
          "runCount": 1,
          "successfulRunCount": 1,
          "lastCompletedAt": "2026-03-15T13:14:01.078Z",
          "lastRunId": "e2cc9463-aff8-4b08-8713-a2349b66fc85",
          "rankingSignalsSnapshot": {
            "verificationScore": 50,
            "declaredTestCount": 2,
            "verifiedReceiptCount": 1,
            "successfulRunReceiptCount": 1,
            "successfulReceiptKindCount": 1,
            "easConfirmedReceiptCount": 0,
            "selfReportedReceiptCount": 0,
            "failedReceiptCount": 0,
            "benchmarkRunCount": 1,
            "successfulBenchmarkCount": 1,
            "successfulBenchmarkTaskCount": 1,
            "benchmarkRuntimeCount": 1,
            "benchmarkSuccessRatePct": 100,
            "averageBenchmarkScorePct": 100,
            "bestBenchmarkScorePct": 100
          }
        }
      ]
    },
    {
      "benchmarkId": "benchmark/research-brief-orchestration@1.0.0",
      "benchmarkTitle": "Research brief orchestration",
      "runtime": "codex",
      "executionBackend": "local",
      "sandboxProfile": "default",
      "networkPolicy": "none",
      "riskProfile": "default:none",
      "totalRuns": 1,
      "successfulRuns": 1,
      "packages": [
        {
          "packageId": "web/recipes-live.example/research-brief-recipe@1.0.427100",
          "packageSlug": "research-brief-recipe",
          "verificationState": "verified",
          "benchmarkStats": {
            "total": 1,
            "success": 1,
            "failed": 0,
            "lastRunId": "6d1c44f4-ab7a-4474-87d0-ee4342b448a9",
            "lastCompletedAt": "2026-03-15T13:13:58.470Z"
          },
          "averageScorePct": 100,
          "bestScorePct": 100,
          "runCount": 1,
          "successfulRunCount": 1,
          "lastCompletedAt": "2026-03-15T13:13:58.470Z",
          "lastRunId": "6d1c44f4-ab7a-4474-87d0-ee4342b448a9",
          "rankingSignalsSnapshot": {
            "verificationScore": 50,
            "declaredTestCount": 2,
            "verifiedReceiptCount": 3,
            "successfulRunReceiptCount": 3,
            "successfulReceiptKindCount": 1,
            "easConfirmedReceiptCount": 0,
            "selfReportedReceiptCount": 0,
            "failedReceiptCount": 0,
            "benchmarkRunCount": 1,
            "successfulBenchmarkCount": 1,
            "successfulBenchmarkTaskCount": 1,
            "benchmarkRuntimeCount": 1,
            "benchmarkSuccessRatePct": 100,
            "averageBenchmarkScorePct": 100,
            "bestBenchmarkScorePct": 100
          }
        }
      ]
    },
    {
      "benchmarkId": "benchmark/source-backed-research@1.0.0",
      "benchmarkTitle": "Source-backed research brief",
      "runtime": "codex",
      "executionBackend": "local",
      "sandboxProfile": "default",
      "networkPolicy": "none",
      "riskProfile": "default:none",
      "totalRuns": 12,
      "successfulRuns": 12,
      "packages": [
        {
          "packageId": "addr/0xafcA095F740e18f69ea7bEA7EF3f9231a1E6E495/research-agent@1.0.0",
          "packageSlug": "research-agent",
          "verificationState": "verified",
          "benchmarkStats": {
            "total": 2,
            "success": 2,
            "failed": 0,
            "lastRunId": "108d1e04-5e74-4ed3-8782-c8a92e6f4e8b",
            "lastCompletedAt": "2026-03-14T16:14:38.623Z"
          },
          "averageScorePct": 100,
          "bestScorePct": 100,
          "runCount": 2,
          "successfulRunCount": 2,
          "lastCompletedAt": "2026-03-14T16:14:38.623Z",
          "lastRunId": "108d1e04-5e74-4ed3-8782-c8a92e6f4e8b",
          "rankingSignalsSnapshot": {
            "verificationScore": 50,
            "declaredTestCount": 2,
            "verifiedReceiptCount": 5,
            "successfulRunReceiptCount": 1,
            "successfulReceiptKindCount": 3,
            "easConfirmedReceiptCount": 0,
            "selfReportedReceiptCount": 0,
            "failedReceiptCount": 0,
            "benchmarkRunCount": 2,
            "successfulBenchmarkCount": 2,
            "successfulBenchmarkTaskCount": 1,
            "benchmarkRuntimeCount": 1,
            "benchmarkSuccessRatePct": 100,
            "averageBenchmarkScorePct": 100,
            "bestBenchmarkScorePct": 100
          }
        },
        {
          "packageId": "addr/0xbdebceF0c5a231b216a4214A74DDA9B7260BFDf0/research-agent@1.0.0",
          "packageSlug": "research-agent",
          "verificationState": "stale",
          "benchmarkStats": {
            "total": 2,
            "success": 2,
            "failed": 0,
            "lastRunId": "6f11c272-fbb9-4828-92f7-7a8d73ac3ca4",
            "lastCompletedAt": "2026-03-15T04:23:14.535Z"
          },
          "averageScorePct": 100,
          "bestScorePct": 100,
          "runCount": 2,
          "successfulRunCount": 2,
          "lastCompletedAt": "2026-03-15T04:23:14.535Z",
          "lastRunId": "6f11c272-fbb9-4828-92f7-7a8d73ac3ca4",
          "rankingSignalsSnapshot": {
            "verificationScore": 20,
            "declaredTestCount": 2,
            "verifiedReceiptCount": 3,
            "successfulRunReceiptCount": 0,
            "successfulReceiptKindCount": 1,
            "easConfirmedReceiptCount": 0,
            "selfReportedReceiptCount": 0,
            "failedReceiptCount": 0,
            "benchmarkRunCount": 2,
            "successfulBenchmarkCount": 2,
            "successfulBenchmarkTaskCount": 1,
            "benchmarkRuntimeCount": 1,
            "benchmarkSuccessRatePct": 100,
            "averageBenchmarkScorePct": 100,
            "bestBenchmarkScorePct": 100
          }
        },
        {
          "packageId": "addr/0xE4fb168AFd4f1C79E259a8db3D6442283b782A67/research-agent@1.0.0",
          "packageSlug": "research-agent",
          "verificationState": "verified",
          "benchmarkStats": {
            "total": 2,
            "success": 2,
            "failed": 0,
            "lastRunId": "06eaec7b-1edc-42a8-8c82-1fd6db956077",
            "lastCompletedAt": "2026-03-14T16:02:36.623Z"
          },
          "averageScorePct": 100,
          "bestScorePct": 100,
          "runCount": 2,
          "successfulRunCount": 2,
          "lastCompletedAt": "2026-03-14T16:02:36.623Z",
          "lastRunId": "06eaec7b-1edc-42a8-8c82-1fd6db956077",
          "rankingSignalsSnapshot": {
            "verificationScore": 50,
            "declaredTestCount": 2,
            "verifiedReceiptCount": 3,
            "successfulRunReceiptCount": 0,
            "successfulReceiptKindCount": 1,
            "easConfirmedReceiptCount": 0,
            "selfReportedReceiptCount": 0,
            "failedReceiptCount": 0,
            "benchmarkRunCount": 2,
            "successfulBenchmarkCount": 2,
            "successfulBenchmarkTaskCount": 1,
            "benchmarkRuntimeCount": 1,
            "benchmarkSuccessRatePct": 100,
            "averageBenchmarkScorePct": 100,
            "bestBenchmarkScorePct": 100
          }
        },
        {
          "packageId": "addr/0xfacf8e59A9740E9a8d8fFf66287bFe254B2c9Adb/research-agent@1.0.0",
          "packageSlug": "research-agent",
          "verificationState": "stale",
          "benchmarkStats": {
            "total": 2,
            "success": 2,
            "failed": 0,
            "lastRunId": "ee0fcead-bec8-499a-8fac-033a364ed995",
            "lastCompletedAt": "2026-03-15T04:19:15.447Z"
          },
          "averageScorePct": 100,
          "bestScorePct": 100,
          "runCount": 2,
          "successfulRunCount": 2,
          "lastCompletedAt": "2026-03-15T04:19:15.447Z",
          "lastRunId": "ee0fcead-bec8-499a-8fac-033a364ed995",
          "rankingSignalsSnapshot": {
            "verificationScore": 20,
            "declaredTestCount": 2,
            "verifiedReceiptCount": 3,
            "successfulRunReceiptCount": 0,
            "successfulReceiptKindCount": 1,
            "easConfirmedReceiptCount": 0,
            "selfReportedReceiptCount": 0,
            "failedReceiptCount": 0,
            "benchmarkRunCount": 2,
            "successfulBenchmarkCount": 2,
            "successfulBenchmarkTaskCount": 1,
            "benchmarkRuntimeCount": 1,
            "benchmarkSuccessRatePct": 100,
            "averageBenchmarkScorePct": 100,
            "bestBenchmarkScorePct": 100
          }
        },
        {
          "packageId": "ens/alice.eth/research-agent@1.0.0",
          "packageSlug": "research-agent",
          "verificationState": "stale",
          "benchmarkStats": {
            "total": 0,
            "success": 0,
            "failed": 0
          },
          "averageScorePct": 100,
          "bestScorePct": 100,
          "runCount": 2,
          "successfulRunCount": 2,
          "lastCompletedAt": "2026-03-15T10:35:47.316Z",
          "lastRunId": "5d2e5ef8-5b2b-4047-900b-d800d6e0b10c",
          "rankingSignalsSnapshot": {
            "verificationScore": 20,
            "declaredTestCount": 2,
            "verifiedReceiptCount": 3,
            "successfulRunReceiptCount": 0,
            "successfulReceiptKindCount": 2,
            "easConfirmedReceiptCount": 0,
            "selfReportedReceiptCount": 0,
            "failedReceiptCount": 0,
            "benchmarkRunCount": 2,
            "successfulBenchmarkCount": 2,
            "successfulBenchmarkTaskCount": 1,
            "benchmarkRuntimeCount": 1,
            "benchmarkSuccessRatePct": 100,
            "averageBenchmarkScorePct": 100,
            "bestBenchmarkScorePct": 100
          }
        },
        {
          "packageId": "web/dynamic-credit-live-1773681992489.example/research-agent@1.0.1773681992489",
          "packageSlug": "research-agent",
          "verificationState": "verified",
          "benchmarkStats": {
            "total": 1,
            "success": 1,
            "failed": 0,
            "lastRunId": "93e60ede-8c0c-4e3b-b470-a5ba3c629260",
            "lastCompletedAt": "2026-03-16T17:26:38.404Z"
          },
          "averageScorePct": 100,
          "bestScorePct": 100,
          "runCount": 1,
          "successfulRunCount": 1,
          "lastCompletedAt": "2026-03-16T17:26:38.404Z",
          "lastRunId": "93e60ede-8c0c-4e3b-b470-a5ba3c629260",
          "rankingSignalsSnapshot": {
            "verificationScore": 50,
            "declaredTestCount": 2,
            "verifiedReceiptCount": 0,
            "successfulRunReceiptCount": 0,
            "successfulReceiptKindCount": 1,
            "easConfirmedReceiptCount": 0,
            "selfReportedReceiptCount": 2,
            "failedReceiptCount": 0,
            "benchmarkRunCount": 1,
            "successfulBenchmarkCount": 1,
            "successfulBenchmarkTaskCount": 1,
            "benchmarkRuntimeCount": 1,
            "benchmarkSuccessRatePct": 100,
            "averageBenchmarkScorePct": 100,
            "bestBenchmarkScorePct": 100
          }
        },
        {
          "packageId": "web/dynamic-credit-live-1773682046250.example/research-agent@1.0.1773682046250",
          "packageSlug": "research-agent",
          "verificationState": "verified",
          "benchmarkStats": {
            "total": 1,
            "success": 1,
            "failed": 0,
            "lastRunId": "995be675-1430-4f8d-a3d2-36bbbbd5309a",
            "lastCompletedAt": "2026-03-16T17:27:31.156Z"
          },
          "averageScorePct": 100,
          "bestScorePct": 100,
          "runCount": 1,
          "successfulRunCount": 1,
          "lastCompletedAt": "2026-03-16T17:27:31.156Z",
          "lastRunId": "995be675-1430-4f8d-a3d2-36bbbbd5309a"
        }
      ]
    },
    {
      "benchmarkId": "benchmark/treasury-briefing@1.0.0",
      "benchmarkTitle": "Treasury briefing",
      "runtime": "codex",
      "executionBackend": "local",
      "sandboxProfile": "default",
      "networkPolicy": "restricted",
      "riskProfile": "default:restricted",
      "totalRuns": 1,
      "successfulRuns": 1,
      "packages": [
        {
          "packageId": "gh/givemd-labs/finance/treasury-brief-agent@1.0.0",
          "packageSlug": "treasury-brief-agent",
          "verificationState": "stale",
          "benchmarkStats": {
            "total": 1,
            "success": 1,
            "failed": 0,
            "lastRunId": "a18b5fd3-3f47-4e26-8c54-6b4b454545a9",
            "lastCompletedAt": "2026-03-15T11:23:23.358Z"
          },
          "averageScorePct": 100,
          "bestScorePct": 100,
          "runCount": 1,
          "successfulRunCount": 1,
          "lastCompletedAt": "2026-03-15T11:23:23.358Z",
          "lastRunId": "a18b5fd3-3f47-4e26-8c54-6b4b454545a9",
          "rankingSignalsSnapshot": {
            "verificationScore": 20,
            "declaredTestCount": 2,
            "verifiedReceiptCount": 2,
            "successfulRunReceiptCount": 0,
            "successfulReceiptKindCount": 2,
            "easConfirmedReceiptCount": 0,
            "selfReportedReceiptCount": 0,
            "failedReceiptCount": 0,
            "benchmarkRunCount": 1,
            "successfulBenchmarkCount": 1,
            "successfulBenchmarkTaskCount": 1,
            "benchmarkRuntimeCount": 1,
            "benchmarkSuccessRatePct": 100,
            "averageBenchmarkScorePct": 100,
            "bestBenchmarkScorePct": 100
          }
        }
      ]
    }
  ]
}

Deploy change audit

Benchmark: benchmark/deploy-change-audit@1.0.0

Runtime: bun · Risk: high · Risk profile: elevated:restricted · Env: local · Backend: local

Sandbox profile: elevated · Network policy: restricted

Runs: 1 · Successes: 1

  1. #1 ens/givemd.eth/deploy-auditor@1.0.0 · avg 100.0% · best 100.0% · runs 1 · successes 1 · latest run

Docs migration plan

Benchmark: benchmark/docs-migration-plan@1.0.0

Runtime: claude · Risk: low · Risk profile: default:none · Env: local · Backend: local

Sandbox profile: default · Network policy: none

Runs: 1 · Successes: 1

  1. #1 web/give.md/docs-migration-agent@1.0.0 · avg 100.0% · best 100.0% · runs 1 · successes 1 · latest run

Policy safety review

Benchmark: benchmark/policy-safety-review@1.0.0

Runtime: claude · Risk: medium · Risk profile: default:restricted · Env: local · Backend: local

Sandbox profile: default · Network policy: restricted

Runs: 1 · Successes: 1

  1. #1 web/give.md/policy-watchdog@1.0.0 · avg 100.0% · best 100.0% · runs 1 · successes 1 · latest run

Release-note synthesis

Benchmark: benchmark/release-notes-synthesis@1.0.0

Runtime: bun · Risk: low · Risk profile: default:none · Env: local · Backend: local

Sandbox profile: default · Network policy: none

Runs: 1 · Successes: 1

  1. #1 ens/givemd.eth/release-notes-agent@1.0.0 · avg 100.0% · best 100.0% · runs 1 · successes 1 · latest run

Release review orchestration

Benchmark: benchmark/release-review-orchestration@1.0.0

Runtime: bun · Risk: high · Risk profile: elevated:restricted · Env: local · Backend: local

Sandbox profile: elevated · Network policy: restricted

Runs: 1 · Successes: 1

  1. #1 gh/givemd/workflows-live/release-review-workflow@1.0.427101 · avg 100.0% · best 100.0% · runs 1 · successes 1 · latest run

Research brief orchestration

Benchmark: benchmark/research-brief-orchestration@1.0.0

Runtime: codex · Risk: low · Risk profile: default:none · Env: local · Backend: local

Sandbox profile: default · Network policy: none

Runs: 1 · Successes: 1

  1. #1 web/recipes-live.example/research-brief-recipe@1.0.427100 · avg 100.0% · best 100.0% · runs 1 · successes 1 · latest run

Source-backed research brief

Benchmark: benchmark/source-backed-research@1.0.0

Runtime: codex · Risk: low · Risk profile: default:none · Env: local · Backend: local

Sandbox profile: default · Network policy: none

Runs: 12 · Successes: 12

  1. #1 addr/0xafcA095F740e18f69ea7bEA7EF3f9231a1E6E495/research-agent@1.0.0 · avg 100.0% · best 100.0% · runs 2 · successes 2 · latest run
  2. #2 addr/0xbdebceF0c5a231b216a4214A74DDA9B7260BFDf0/research-agent@1.0.0 · avg 100.0% · best 100.0% · runs 2 · successes 2 · latest run
  3. #3 addr/0xE4fb168AFd4f1C79E259a8db3D6442283b782A67/research-agent@1.0.0 · avg 100.0% · best 100.0% · runs 2 · successes 2 · latest run
  4. #4 addr/0xfacf8e59A9740E9a8d8fFf66287bFe254B2c9Adb/research-agent@1.0.0 · avg 100.0% · best 100.0% · runs 2 · successes 2 · latest run
  5. #5 ens/alice.eth/research-agent@1.0.0 · avg 100.0% · best 100.0% · runs 2 · successes 2 · latest run
  6. #6 web/dynamic-credit-live-1773681992489.example/research-agent@1.0.1773681992489 · avg 100.0% · best 100.0% · runs 1 · successes 1 · latest run
  7. #7 web/dynamic-credit-live-1773682046250.example/research-agent@1.0.1773682046250 · avg 100.0% · best 100.0% · runs 1 · successes 1 · latest run

Treasury briefing

Benchmark: benchmark/treasury-briefing@1.0.0

Runtime: codex · Risk: medium · Risk profile: default:restricted · Env: local · Backend: local

Sandbox profile: default · Network policy: restricted

Runs: 1 · Successes: 1

  1. #1 gh/givemd-labs/finance/treasury-brief-agent@1.0.0 · avg 100.0% · best 100.0% · runs 1 · successes 1 · latest run