From 3c840259feb12a0f241d642769cb60e15a820209 Mon Sep 17 00:00:00 2001
From: Johann Schopplich <mail@johannschopplich.com>
Date: Mon, 27 Oct 2025 11:48:33 +0100
Subject: [PATCH] test: add LLM retrieval accuracy tests

---
 .gitignore                                    |     1 +
 README.md                                     |   301 +-
 benchmarks/.env.example                       |     3 +
 benchmarks/data/github-repos.json             |  1302 ++
 benchmarks/package.json                       |    26 +
 benchmarks/results/accuracy/accuracy.md       |    96 +
 benchmarks/results/accuracy/raw-results.json  | 17492 ++++++++++++++++
 benchmarks/results/accuracy/report.md         |    96 +
 benchmarks/results/accuracy/summary.json      |    95 +
 benchmarks/results/token-efficiency.md        |   141 +
 benchmarks/scripts/accuracy-benchmark.ts      |   140 +
 benchmarks/scripts/fetch-github-data.ts       |    78 +
 .../scripts/token-efficiency-benchmark.ts     |   228 +
 benchmarks/src/constants.ts                   |    39 +
 benchmarks/src/datasets.ts                    |   146 +
 benchmarks/src/evaluate.ts                    |   133 +
 benchmarks/src/formatters.ts                  |    90 +
 benchmarks/src/questions.ts                   |   398 +
 benchmarks/src/report.ts                      |   288 +
 benchmarks/src/types.ts                       |    35 +
 docs/benchmarks.md                            |   158 -
 package.json                                  |     8 +-
 pnpm-lock.yaml                                |   618 +-
 pnpm-workspace.yaml                           |     2 +
 scripts/generate-bench.ts                     |   213 -
 25 files changed, 21404 insertions(+), 723 deletions(-)
 create mode 100644 benchmarks/.env.example
 create mode 100644 benchmarks/data/github-repos.json
 create mode 100644 benchmarks/package.json
 create mode 100644 benchmarks/results/accuracy/accuracy.md
 create mode 100644 benchmarks/results/accuracy/raw-results.json
 create mode 100644 benchmarks/results/accuracy/report.md
 create mode 100644 benchmarks/results/accuracy/summary.json
 create mode 100644 benchmarks/results/token-efficiency.md
 create mode 100644 benchmarks/scripts/accuracy-benchmark.ts
 create mode 100644 benchmarks/scripts/fetch-github-data.ts
 create mode 100644 benchmarks/scripts/token-efficiency-benchmark.ts
 create mode 100644 benchmarks/src/constants.ts
 create mode 100644 benchmarks/src/datasets.ts
 create mode 100644 benchmarks/src/evaluate.ts
 create mode 100644 benchmarks/src/formatters.ts
 create mode 100644 benchmarks/src/questions.ts
 create mode 100644 benchmarks/src/report.ts
 create mode 100644 benchmarks/src/types.ts
 delete mode 100644 docs/benchmarks.md
 create mode 100644 pnpm-workspace.yaml
 delete mode 100644 scripts/generate-bench.ts

diff --git a/.gitignore b/.gitignore
index b186605..f73f2b4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 dist
 node_modules
 .DS_Store
+.env
diff --git a/README.md b/README.md
index 546493c..271be53 100644
--- a/README.md
+++ b/README.md
@@ -42,168 +42,148 @@ users[2]{id,name,role}:
 - 📐 **Indentation-based structure:** replaces braces with whitespace for better readability
 - 🧺 **Tabular arrays:** declare keys once, then stream rows without repetition
 
-## Token Benchmarks
+## Benchmarks
 
-> [!NOTE]
-> Benchmarks for LLM accuracy and retrieval are currently in development.
+<!-- automd:file src="./benchmarks/results/token-efficiency.md" -->
 
-<!-- automd:file src="./docs/benchmarks.md" -->
+### Token Efficiency
 
-| Example | JSON | TOON | Tokens Saved | Reduction |
-| ------- | ---- | ---- | ------------ | --------- |
-| 👤 Simple user object | 31 | 18 | 13 | **41.9%** |
-| 🏷️ User with tags | 48 | 28 | 20 | **41.7%** |
-| 📦 Small product catalog | 117 | 49 | 68 | **58.1%** |
-| 👥 API response with users | 123 | 53 | 70 | **56.9%** |
-| ⚙️ Nested configuration | 68 | 42 | 26 | **38.2%** |
-| 🛒 E-commerce order | 163 | 94 | 69 | **42.3%** |
-| 📊 Analytics data | 209 | 94 | 115 | **55.0%** |
-| 📈 Large dataset (50 records) | 2159 | 762 | 1397 | **64.7%** |
-| **Total** | **2918** | **1140** | **1778** | **60.9%** |
+```
+⭐ GitHub Repositories       ██████████████░░░░░░░░░░░   8,745 tokens  (JSON: 15,145)  💰 42.3% saved
+📈 Analytics Time Series     ██████████░░░░░░░░░░░░░░░   3,631 tokens  (JSON:  9,024)  💰 59.8% saved
+👥 API Response              ██████████████░░░░░░░░░░░   2,593 tokens  (JSON:  4,589)  💰 43.5% saved
+🛒 E-commerce Order          ███████████████░░░░░░░░░░     203 tokens  (JSON:    338)  💰 39.9% saved
+```
+
+**Total:** 15,172 tokens (TOON) vs 29,096 tokens (JSON) → 47.9% savings
 
 <details>
-<summary><strong>View detailed results</strong></summary>
+<summary><strong>View detailed examples</strong></summary>
 
-### 📦 Small product catalog
+#### ⭐ GitHub Repositories
 
-**Savings: 68 tokens (58.1% reduction)**
+**Configuration:** Top 100 GitHub repositories with stars, forks, and metadata
 
-**JSON** (117 tokens):
+**Savings:** 6,400 tokens (42.3% reduction)
+
+**JSON** (15,145 tokens):
 
 ```json
 {
-  "items": [
+  "repositories": [
     {
-      "sku": "A1",
-      "name": "Widget",
-      "qty": 2,
-      "price": 9.99
+      "id": 28457823,
+      "name": "freeCodeCamp",
+      "repo": "freeCodeCamp/freeCodeCamp",
+      "description": "freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,...",
+      "createdAt": "2014-12-24T17:49:19Z",
+      "updatedAt": "2025-10-27T07:40:58Z",
+      "pushedAt": "2025-10-26T11:31:08Z",
+      "stars": 430828,
+      "watchers": 8582,
+      "forks": 42136,
+      "defaultBranch": "main"
     },
     {
-      "sku": "B2",
-      "name": "Gadget",
-      "qty": 1,
-      "price": 14.5
+      "id": 132750724,
+      "name": "build-your-own-x",
+      "repo": "codecrafters-io/build-your-own-x",
+      "description": "Master programming by recreating your favorite technologies from scratch.",
+      "createdAt": "2018-05-09T12:03:18Z",
+      "updatedAt": "2025-10-27T07:43:25Z",
+      "pushedAt": "2025-10-10T18:45:01Z",
+      "stars": 430102,
+      "watchers": 6322,
+      "forks": 40388,
+      "defaultBranch": "master"
     },
     {
-      "sku": "C3",
-      "name": "Doohickey",
-      "qty": 5,
-      "price": 7.25
+      "id": 21737465,
+      "name": "awesome",
+      "repo": "sindresorhus/awesome",
+      "description": "😎 Awesome lists about all kinds of interesting topics",
+      "createdAt": "2014-07-11T13:42:37Z",
+      "updatedAt": "2025-10-27T07:44:27Z",
+      "pushedAt": "2025-10-23T17:26:53Z",
+      "stars": 409760,
+      "watchers": 8016,
+      "forks": 32015,
+      "defaultBranch": "main"
     }
   ]
 }
 ```
 
-**TOON** (49 tokens):
+**TOON** (8,745 tokens):
 
 ```
-items[3]{sku,name,qty,price}:
-  A1,Widget,2,9.99
-  B2,Gadget,1,14.5
-  C3,Doohickey,5,7.25
+repositories[3]{id,name,repo,description,createdAt,updatedAt,pushedAt,stars,watchers,forks,defaultBranch}:
+  28457823,freeCodeCamp,freeCodeCamp/freeCodeCamp,"freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,...","2014-12-24T17:49:19Z","2025-10-27T07:40:58Z","2025-10-26T11:31:08Z",430828,8582,42136,main
+  132750724,build-your-own-x,codecrafters-io/build-your-own-x,Master programming by recreating your favorite technologies from scratch.,"2018-05-09T12:03:18Z","2025-10-27T07:43:25Z","2025-10-10T18:45:01Z",430102,6322,40388,master
+  21737465,awesome,sindresorhus/awesome,😎 Awesome lists about all kinds of interesting topics,"2014-07-11T13:42:37Z","2025-10-27T07:44:27Z","2025-10-23T17:26:53Z",409760,8016,32015,main
 ```
 
 ---
 
-### 👥 API response with users
+#### 📈 Analytics Time Series
 
-**Savings: 70 tokens (56.9% reduction)**
+**Configuration:** 180 days of web metrics (views, clicks, conversions, revenue)
 
-**JSON** (123 tokens):
+**Savings:** 5,393 tokens (59.8% reduction)
 
-```json
-{
-  "users": [
-    {
-      "id": 1,
-      "name": "Alice",
-      "email": "alice@example.com",
-      "active": true
-    },
-    {
-      "id": 2,
-      "name": "Bob",
-      "email": "bob@example.com",
-      "active": true
-    },
-    {
-      "id": 3,
-      "name": "Charlie",
-      "email": "charlie@example.com",
-      "active": false
-    }
-  ],
-  "total": 3,
-  "page": 1
-}
-```
-
-**TOON** (53 tokens):
-
-```
-users[3]{id,name,email,active}:
-  1,Alice,alice@example.com,true
-  2,Bob,bob@example.com,true
-  3,Charlie,charlie@example.com,false
-total: 3
-page: 1
-```
-
----
-
-### 📊 Analytics data
-
-**Savings: 115 tokens (55.0% reduction)**
-
-**JSON** (209 tokens):
+**JSON** (9,024 tokens):
 
 ```json
 {
   "metrics": [
+    {
+      "date": "2024-12-31",
+      "views": 3769,
+      "clicks": 400,
+      "conversions": 59,
+      "revenue": 198.98
+    },
     {
       "date": "2025-01-01",
-      "views": 1234,
-      "clicks": 89,
-      "conversions": 12
+      "views": 5742,
+      "clicks": 463,
+      "conversions": 28,
+      "revenue": 295.77
     },
     {
       "date": "2025-01-02",
-      "views": 2345,
-      "clicks": 156,
-      "conversions": 23
+      "views": 3669,
+      "clicks": 336,
+      "conversions": 102,
+      "revenue": 624.23
     },
     {
       "date": "2025-01-03",
-      "views": 1890,
-      "clicks": 123,
-      "conversions": 18
+      "views": 1332,
+      "clicks": 304,
+      "conversions": 99,
+      "revenue": 113.06
     },
     {
       "date": "2025-01-04",
-      "views": 3456,
-      "clicks": 234,
-      "conversions": 34
-    },
-    {
-      "date": "2025-01-05",
-      "views": 2789,
-      "clicks": 178,
-      "conversions": 27
+      "views": 1444,
+      "clicks": 222,
+      "conversions": 88,
+      "revenue": 986.69
     }
   ]
 }
 ```
 
-**TOON** (94 tokens):
+**TOON** (3,631 tokens):
 
 ```
-metrics[5]{date,views,clicks,conversions}:
-  2025-01-01,1234,89,12
-  2025-01-02,2345,156,23
-  2025-01-03,1890,123,18
-  2025-01-04,3456,234,34
-  2025-01-05,2789,178,27
+metrics[5]{date,views,clicks,conversions,revenue}:
+  2024-12-31,3769,400,59,198.98
+  2025-01-01,5742,463,28,295.77
+  2025-01-02,3669,336,102,624.23
+  2025-01-03,1332,304,99,113.06
+  2025-01-04,1444,222,88,986.69
 ```
 
 </details>
@@ -213,6 +193,107 @@ metrics[5]{date,views,clicks,conversions}:
 > [!NOTE]
 > Measured with [`gpt-tokenizer`](https://github.com/niieani/gpt-tokenizer) using `o200k_base` encoding (used by GPT-5 and other modern models). Savings will vary across models and tokenizers.
 
+<!-- automd:file src="./benchmarks/results/accuracy/report.md" -->
+
+### Retrieval Accuracy
+
+Tested across **2 LLMs** with data retrieval tasks:
+
+```
+gpt-4o-mini          ██████████████░░░░░░ 72.3% accuracy
+claude-haiku-4-5     ███████████████░░░░░ 76.7% accuracy
+```
+
+**TOON achieves 73.9% accuracy (vs JSON's 73.6%) while using 46.3% fewer tokens.**
+
+| Format | Accuracy | Average Tokens |
+| ------ | -------- | -------------- |
+| `toon` | 73.9% | 4,678 |
+| `json` | 73.6% | 8,713 |
+| `markdown-kv` | 73.6% | 8,649 |
+| `csv` | 72.3% | 4,745 |
+| `yaml` | 71.7% | 7,091 |
+
+<details>
+<summary><strong>View detailed breakdown by dataset and model</strong></summary>
+
+#### Performance by Dataset
+
+##### Uniform employee records (TOON optimal format)
+
+| Format | Accuracy | Tokens | Correct/Total |
+|--------|----------|--------|---------------|
+| `toon` | 72.4% | 2,483 | 84/116 |
+| `csv` | 69.0% | 2,337 | 80/116 |
+| `yaml` | 68.1% | 4,969 | 79/116 |
+| `markdown-kv` | 68.1% | 6,270 | 79/116 |
+| `json` | 68.1% | 6,347 | 79/116 |
+
+##### E-commerce orders with nested structures
+
+| Format | Accuracy | Tokens | Correct/Total |
+|--------|----------|--------|---------------|
+| `toon` | 84.1% | 5,967 | 74/88 |
+| `csv` | 83.0% | 6,735 | 73/88 |
+| `yaml` | 81.8% | 7,328 | 72/88 |
+| `markdown-kv` | 86.4% | 9,110 | 76/88 |
+| `json` | 84.1% | 9,694 | 74/88 |
+
+##### Time-series analytics data
+
+| Format | Accuracy | Tokens | Correct/Total |
+|--------|----------|--------|---------------|
+| `csv` | 72.4% | 1,393 | 42/58 |
+| `toon` | 70.7% | 1,515 | 41/58 |
+| `yaml` | 72.4% | 2,938 | 42/58 |
+| `json` | 74.1% | 3,665 | 43/58 |
+| `markdown-kv` | 70.7% | 3,779 | 41/58 |
+
+##### Popular GitHub repositories
+
+| Format | Accuracy | Tokens | Correct/Total |
+|--------|----------|--------|---------------|
+| `toon` | 64.3% | 8,745 | 36/56 |
+| `csv` | 62.5% | 8,513 | 35/56 |
+| `json` | 67.9% | 15,145 | 38/56 |
+| `markdown-kv` | 67.9% | 15,436 | 38/56 |
+| `yaml` | 62.5% | 13,129 | 35/56 |
+
+
+#### Performance by Model
+
+##### gpt-4o-mini
+
+| Format | Accuracy | Correct/Total |
+|--------|----------|---------------|
+| `toon` | 72.3% | 115/159 |
+| `json` | 71.7% | 114/159 |
+| `markdown-kv` | 70.4% | 112/159 |
+| `csv` | 69.2% | 110/159 |
+| `yaml` | 68.6% | 109/159 |
+
+##### claude-haiku-4-5
+
+| Format | Accuracy | Correct/Total |
+|--------|----------|---------------|
+| `markdown-kv` | 76.7% | 122/159 |
+| `toon` | 75.5% | 120/159 |
+| `json` | 75.5% | 120/159 |
+| `csv` | 75.5% | 120/159 |
+| `yaml` | 74.8% | 119/159 |
+
+
+#### Methodology
+
+- **Semantic validation**: LLM-as-judge validates responses semantically (not exact string matching).
+- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding.
+- **Question types**: Field retrieval, aggregation, and filtering tasks.
+- **Datasets**: Synthetic Faker.js-generated datasets plus real GitHub repository data.
+
+</details>
+
+<!-- /automd -->
+
 ## Installation
 
 ```bash
diff --git a/benchmarks/.env.example b/benchmarks/.env.example
new file mode 100644
index 0000000..df70883
--- /dev/null
+++ b/benchmarks/.env.example
@@ -0,0 +1,3 @@
+OPENAI_API_KEY=
+ANTHROPIC_API_KEY=
+GOOGLE_GENERATIVE_AI_API_KEY=
diff --git a/benchmarks/data/github-repos.json b/benchmarks/data/github-repos.json
new file mode 100644
index 0000000..b7ed072
--- /dev/null
+++ b/benchmarks/data/github-repos.json
@@ -0,0 +1,1302 @@
+[
+  {
+    "id": 28457823,
+    "name": "freeCodeCamp",
+    "repo": "freeCodeCamp/freeCodeCamp",
+    "description": "freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming, and computer science for free.",
+    "createdAt": "2014-12-24T17:49:19Z",
+    "updatedAt": "2025-10-27T07:40:58Z",
+    "pushedAt": "2025-10-26T11:31:08Z",
+    "stars": 430828,
+    "watchers": 8582,
+    "forks": 42136,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 132750724,
+    "name": "build-your-own-x",
+    "repo": "codecrafters-io/build-your-own-x",
+    "description": "Master programming by recreating your favorite technologies from scratch.",
+    "createdAt": "2018-05-09T12:03:18Z",
+    "updatedAt": "2025-10-27T07:43:25Z",
+    "pushedAt": "2025-10-10T18:45:01Z",
+    "stars": 430102,
+    "watchers": 6322,
+    "forks": 40388,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 21737465,
+    "name": "awesome",
+    "repo": "sindresorhus/awesome",
+    "description": "😎 Awesome lists about all kinds of interesting topics",
+    "createdAt": "2014-07-11T13:42:37Z",
+    "updatedAt": "2025-10-27T07:44:27Z",
+    "pushedAt": "2025-10-23T17:26:53Z",
+    "stars": 409760,
+    "watchers": 8016,
+    "forks": 32015,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 13491895,
+    "name": "free-programming-books",
+    "repo": "EbookFoundation/free-programming-books",
+    "description": ":books: Freely available programming books",
+    "createdAt": "2013-10-11T06:50:37Z",
+    "updatedAt": "2025-10-27T07:36:14Z",
+    "pushedAt": "2025-10-26T23:24:34Z",
+    "stars": 375134,
+    "watchers": 9788,
+    "forks": 65149,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 54346799,
+    "name": "public-apis",
+    "repo": "public-apis/public-apis",
+    "description": "A collective list of free APIs",
+    "createdAt": "2016-03-20T23:49:42Z",
+    "updatedAt": "2025-10-27T07:45:41Z",
+    "pushedAt": "2025-05-20T15:56:34Z",
+    "stars": 373288,
+    "watchers": 4392,
+    "forks": 39386,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 85077558,
+    "name": "developer-roadmap",
+    "repo": "kamranahmedse/developer-roadmap",
+    "description": "Interactive roadmaps, guides and other educational content to help developers grow in their careers.",
+    "createdAt": "2017-03-15T13:45:52Z",
+    "updatedAt": "2025-10-27T07:26:36Z",
+    "pushedAt": "2025-10-24T10:20:46Z",
+    "stars": 342038,
+    "watchers": 6887,
+    "forks": 43222,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 60493101,
+    "name": "coding-interview-university",
+    "repo": "jwasham/coding-interview-university",
+    "description": "A complete computer science study plan to become a software engineer.",
+    "createdAt": "2016-06-06T02:34:12Z",
+    "updatedAt": "2025-10-27T07:46:31Z",
+    "pushedAt": "2025-08-28T14:42:47Z",
+    "stars": 331885,
+    "watchers": 8512,
+    "forks": 81046,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 83222441,
+    "name": "system-design-primer",
+    "repo": "donnemartin/system-design-primer",
+    "description": "Learn how to design large-scale systems. Prep for the system design interview.  Includes Anki flashcards.",
+    "createdAt": "2017-02-26T16:15:28Z",
+    "updatedAt": "2025-10-27T07:38:55Z",
+    "pushedAt": "2025-05-21T11:13:33Z",
+    "stars": 324162,
+    "watchers": 6818,
+    "forks": 52866,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 177736533,
+    "name": "996.ICU",
+    "repo": "996icu/996.ICU",
+    "description": "Repo for counting stars and contributing. Press F to pay respect to glorious developers.",
+    "createdAt": "2019-03-26T07:31:14Z",
+    "updatedAt": "2025-10-27T07:35:11Z",
+    "pushedAt": "2025-08-22T06:01:29Z",
+    "stars": 274700,
+    "watchers": 4217,
+    "forks": 21033,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 21289110,
+    "name": "awesome-python",
+    "repo": "vinta/awesome-python",
+    "description": "An opinionated list of awesome Python frameworks, libraries, software and resources.",
+    "createdAt": "2014-06-27T21:00:06Z",
+    "updatedAt": "2025-10-27T07:40:04Z",
+    "pushedAt": "2025-10-16T13:40:58Z",
+    "stars": 266460,
+    "watchers": 6127,
+    "forks": 26579,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 36633370,
+    "name": "awesome-selfhosted",
+    "repo": "awesome-selfhosted/awesome-selfhosted",
+    "description": "A list of Free Software network services and web applications which can be hosted on your own servers",
+    "createdAt": "2015-06-01T02:33:17Z",
+    "updatedAt": "2025-10-27T07:43:02Z",
+    "pushedAt": "2025-10-23T10:47:33Z",
+    "stars": 254916,
+    "watchers": 2995,
+    "forks": 11798,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 88011908,
+    "name": "project-based-learning",
+    "repo": "practical-tutorials/project-based-learning",
+    "description": "Curated list of project-based tutorials",
+    "createdAt": "2017-04-12T05:07:46Z",
+    "updatedAt": "2025-10-27T07:45:41Z",
+    "pushedAt": "2024-08-15T05:33:54Z",
+    "stars": 247930,
+    "watchers": 3445,
+    "forks": 32413,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 10270250,
+    "name": "react",
+    "repo": "facebook/react",
+    "description": "The library for web and native user interfaces.",
+    "createdAt": "2013-05-24T16:15:54Z",
+    "updatedAt": "2025-10-27T06:47:16Z",
+    "pushedAt": "2025-10-24T22:08:43Z",
+    "stars": 240059,
+    "watchers": 6687,
+    "forks": 49664,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 63476337,
+    "name": "Python",
+    "repo": "TheAlgorithms/Python",
+    "description": "All Algorithms implemented in Python",
+    "createdAt": "2016-07-16T09:44:01Z",
+    "updatedAt": "2025-10-27T07:26:23Z",
+    "pushedAt": "2025-10-20T00:59:36Z",
+    "stars": 212044,
+    "watchers": 5975,
+    "forks": 48986,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 11730342,
+    "name": "vue",
+    "repo": "vuejs/vue",
+    "description": "This is the repo for Vue 2. For Vue 3, go to https://github.com/vuejs/core",
+    "createdAt": "2013-07-29T03:24:51Z",
+    "updatedAt": "2025-10-27T05:37:40Z",
+    "pushedAt": "2024-10-10T07:24:15Z",
+    "stars": 209624,
+    "watchers": 5787,
+    "forks": 33796,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 2325298,
+    "name": "linux",
+    "repo": "torvalds/linux",
+    "description": "Linux kernel source tree",
+    "createdAt": "2011-09-04T22:48:12Z",
+    "updatedAt": "2025-10-27T07:25:34Z",
+    "pushedAt": "2025-10-26T23:00:24Z",
+    "stars": 205761,
+    "watchers": 7739,
+    "forks": 58023,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 19415064,
+    "name": "computer-science",
+    "repo": "ossu/computer-science",
+    "description": "🎓 Path to a free self-taught education in Computer Science!",
+    "createdAt": "2014-05-04T00:18:39Z",
+    "updatedAt": "2025-10-27T07:25:53Z",
+    "pushedAt": "2025-08-23T18:48:52Z",
+    "stars": 196024,
+    "watchers": 5935,
+    "forks": 24465,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 126577260,
+    "name": "javascript-algorithms",
+    "repo": "trekhleb/javascript-algorithms",
+    "description": "📝 Algorithms and data structures implemented in JavaScript with explanations and links to further readings",
+    "createdAt": "2018-03-24T07:47:04Z",
+    "updatedAt": "2025-10-27T07:26:50Z",
+    "pushedAt": "2025-10-22T15:03:29Z",
+    "stars": 193648,
+    "watchers": 4267,
+    "forks": 30919,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 45717250,
+    "name": "tensorflow",
+    "repo": "tensorflow/tensorflow",
+    "description": "An Open Source Machine Learning Framework for Everyone",
+    "createdAt": "2015-11-07T01:19:20Z",
+    "updatedAt": "2025-10-27T07:33:01Z",
+    "pushedAt": "2025-10-27T06:15:29Z",
+    "stars": 192220,
+    "watchers": 7431,
+    "forks": 74928,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 138393139,
+    "name": "the-book-of-secret-knowledge",
+    "repo": "trimstray/the-book-of-secret-knowledge",
+    "description": "A collection of inspiring lists, manuals, cheatsheets, blogs, hacks, one-liners, cli/web tools and more.",
+    "createdAt": "2018-06-23T10:43:14Z",
+    "updatedAt": "2025-10-27T07:43:08Z",
+    "pushedAt": "2024-11-19T14:00:38Z",
+    "stars": 191315,
+    "watchers": 2679,
+    "forks": 11763,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 14440270,
+    "name": "You-Dont-Know-JS",
+    "repo": "getify/You-Dont-Know-JS",
+    "description": "A book series (2 published editions) on the JS language.",
+    "createdAt": "2013-11-16T02:37:24Z",
+    "updatedAt": "2025-10-27T07:25:47Z",
+    "pushedAt": "2025-05-20T14:22:36Z",
+    "stars": 183631,
+    "watchers": 5802,
+    "forks": 33668,
+    "defaultBranch": "2nd-ed"
+  },
+  {
+    "id": 121395510,
+    "name": "CS-Notes",
+    "repo": "CyC2018/CS-Notes",
+    "description": ":books: 技术面试必备基础知识、Leetcode、计算机操作系统、计算机网络、系统设计",
+    "createdAt": "2018-02-13T14:56:24Z",
+    "updatedAt": "2025-10-27T07:19:57Z",
+    "pushedAt": "2024-08-21T09:40:10Z",
+    "stars": 182646,
+    "watchers": 5252,
+    "forks": 51251,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 291137,
+    "name": "ohmyzsh",
+    "repo": "ohmyzsh/ohmyzsh",
+    "description": "🙃   A delightful community-driven (with 2,400+ contributors) framework for managing your zsh configuration. Includes 300+ optional plugins (rails, git, macOS, hub, docker, homebrew, node, php, python, etc), 140+ themes to spice up your morning, and an auto-update tool that makes it easy to keep up with the latest updates from the community.",
+    "createdAt": "2009-08-28T18:15:37Z",
+    "updatedAt": "2025-10-27T07:25:29Z",
+    "pushedAt": "2025-10-26T13:17:47Z",
+    "stars": 182297,
+    "watchers": 2618,
+    "forks": 26259,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 614765452,
+    "name": "AutoGPT",
+    "repo": "Significant-Gravitas/AutoGPT",
+    "description": "AutoGPT is the vision of accessible AI for everyone, to use and to build on. Our mission is to provide the tools, so that you can focus on what matters.",
+    "createdAt": "2023-03-16T09:21:07Z",
+    "updatedAt": "2025-10-27T07:34:44Z",
+    "pushedAt": "2025-10-27T00:10:36Z",
+    "stars": 179292,
+    "watchers": 1547,
+    "forks": 46077,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 41881900,
+    "name": "vscode",
+    "repo": "microsoft/vscode",
+    "description": "Visual Studio Code",
+    "createdAt": "2015-09-03T20:23:38Z",
+    "updatedAt": "2025-10-27T07:26:11Z",
+    "pushedAt": "2025-10-27T07:29:25Z",
+    "stars": 177925,
+    "watchers": 3364,
+    "forks": 35788,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 123458551,
+    "name": "Python-100-Days",
+    "repo": "jackfrued/Python-100-Days",
+    "description": "Python - 100天从新手到大师",
+    "createdAt": "2018-03-01T16:05:52Z",
+    "updatedAt": "2025-10-27T07:26:50Z",
+    "pushedAt": "2025-03-28T10:29:23Z",
+    "stars": 173752,
+    "watchers": 6098,
+    "forks": 54771,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 2126244,
+    "name": "bootstrap",
+    "repo": "twbs/bootstrap",
+    "description": "The most popular HTML, CSS, and JavaScript framework for developing responsive, mobile first projects on the web.",
+    "createdAt": "2011-07-29T21:19:00Z",
+    "updatedAt": "2025-10-27T07:25:34Z",
+    "pushedAt": "2025-10-26T18:41:31Z",
+    "stars": 173599,
+    "watchers": 6681,
+    "forks": 79156,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 31792824,
+    "name": "flutter",
+    "repo": "flutter/flutter",
+    "description": "Flutter makes it easy and fast to build beautiful apps for mobile and beyond",
+    "createdAt": "2015-03-06T22:54:58Z",
+    "updatedAt": "2025-10-27T07:31:00Z",
+    "pushedAt": "2025-10-27T05:33:32Z",
+    "stars": 173546,
+    "watchers": 3481,
+    "forks": 29414,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 1062897,
+    "name": "gitignore",
+    "repo": "github/gitignore",
+    "description": "A collection of useful .gitignore templates",
+    "createdAt": "2010-11-08T20:17:14Z",
+    "updatedAt": "2025-10-27T07:34:35Z",
+    "pushedAt": "2025-09-10T18:42:03Z",
+    "stars": 170298,
+    "watchers": 3366,
+    "forks": 82998,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 35955666,
+    "name": "the-art-of-command-line",
+    "repo": "jlevy/the-art-of-command-line",
+    "description": "Master the command line, in one page",
+    "createdAt": "2015-05-20T15:11:03Z",
+    "updatedAt": "2025-10-27T07:26:07Z",
+    "pushedAt": "2024-06-25T18:13:44Z",
+    "stars": 158582,
+    "watchers": 2812,
+    "forks": 14754,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 527591471,
+    "name": "stable-diffusion-webui",
+    "repo": "AUTOMATIC1111/stable-diffusion-webui",
+    "description": "Stable Diffusion web UI",
+    "createdAt": "2022-08-22T14:05:26Z",
+    "updatedAt": "2025-10-27T07:49:02Z",
+    "pushedAt": "2025-10-07T20:06:10Z",
+    "stars": 157565,
+    "watchers": 1154,
+    "forks": 29246,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 21540759,
+    "name": "awesome-go",
+    "repo": "avelino/awesome-go",
+    "description": "A curated list of awesome Go frameworks, libraries and software",
+    "createdAt": "2014-07-06T13:42:15Z",
+    "updatedAt": "2025-10-27T07:49:36Z",
+    "pushedAt": "2025-10-22T12:15:14Z",
+    "stars": 155801,
+    "watchers": 2818,
+    "forks": 12706,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 658928958,
+    "name": "ollama",
+    "repo": "ollama/ollama",
+    "description": "Get up and running with OpenAI gpt-oss, DeepSeek-R1, Gemma 3 and other models.",
+    "createdAt": "2023-06-26T19:39:32Z",
+    "updatedAt": "2025-10-27T07:43:05Z",
+    "pushedAt": "2025-10-27T01:25:05Z",
+    "stars": 154808,
+    "watchers": 877,
+    "forks": 13467,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 233472199,
+    "name": "Microsoft-Activation-Scripts",
+    "repo": "massgravel/Microsoft-Activation-Scripts",
+    "description": "Open-source Windows and Office activator featuring HWID, Ohook, TSforge, KMS38, and Online KMS activation methods, along with advanced troubleshooting.",
+    "createdAt": "2020-01-12T23:03:34Z",
+    "updatedAt": "2025-10-27T07:44:35Z",
+    "pushedAt": "2025-09-30T22:22:59Z",
+    "stars": 153864,
+    "watchers": 1319,
+    "forks": 14861,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 132464395,
+    "name": "JavaGuide",
+    "repo": "Snailclimb/JavaGuide",
+    "description": "「Java学习+面试指南」一份涵盖大部分 Java 程序员所需要掌握的核心知识。准备 Java 面试，首选 JavaGuide！",
+    "createdAt": "2018-05-07T13:27:00Z",
+    "updatedAt": "2025-10-27T07:25:13Z",
+    "pushedAt": "2025-10-20T08:53:33Z",
+    "stars": 152308,
+    "watchers": 4470,
+    "forks": 46020,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 193215554,
+    "name": "n8n",
+    "repo": "n8n-io/n8n",
+    "description": "Fair-code workflow automation platform with native AI capabilities. Combine visual building with custom code, self-host or cloud, 400+ integrations.",
+    "createdAt": "2019-06-22T09:24:21Z",
+    "updatedAt": "2025-10-27T07:48:50Z",
+    "pushedAt": "2025-10-27T07:12:52Z",
+    "stars": 151975,
+    "watchers": 880,
+    "forks": 48459,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 155220641,
+    "name": "transformers",
+    "repo": "huggingface/transformers",
+    "description": "🤗 Transformers: the model-definition framework for state-of-the-art machine learning models in text, vision, audio, and multimodal models, for both inference and training. ",
+    "createdAt": "2018-10-29T13:56:00Z",
+    "updatedAt": "2025-10-27T07:45:24Z",
+    "pushedAt": "2025-10-25T16:31:22Z",
+    "stars": 151659,
+    "watchers": 1166,
+    "forks": 30955,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 6498492,
+    "name": "javascript",
+    "repo": "airbnb/javascript",
+    "description": "JavaScript Style Guide",
+    "createdAt": "2012-11-01T23:13:50Z",
+    "updatedAt": "2025-10-27T06:50:33Z",
+    "pushedAt": "2025-09-17T18:12:44Z",
+    "stars": 147687,
+    "watchers": 3705,
+    "forks": 26797,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 1039520,
+    "name": "youtube-dl",
+    "repo": "ytdl-org/youtube-dl",
+    "description": "Command-line program to download videos from YouTube.com and other video sites",
+    "createdAt": "2010-10-31T14:35:07Z",
+    "updatedAt": "2025-10-27T07:30:15Z",
+    "pushedAt": "2025-10-18T10:02:28Z",
+    "stars": 138545,
+    "watchers": 2160,
+    "forks": 10527,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 574523116,
+    "name": "awesome-chatgpt-prompts",
+    "repo": "f/awesome-chatgpt-prompts",
+    "description": "This repo includes ChatGPT prompt curation to use ChatGPT and other LLM tools better.",
+    "createdAt": "2022-12-05T13:54:13Z",
+    "updatedAt": "2025-10-27T07:42:24Z",
+    "pushedAt": "2025-10-14T17:23:13Z",
+    "stars": 135794,
+    "watchers": 1562,
+    "forks": 18073,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 70107786,
+    "name": "next.js",
+    "repo": "vercel/next.js",
+    "description": "The React Framework",
+    "createdAt": "2016-10-05T23:32:51Z",
+    "updatedAt": "2025-10-27T07:38:47Z",
+    "pushedAt": "2025-10-27T07:02:37Z",
+    "stars": 135306,
+    "watchers": 1497,
+    "forks": 29680,
+    "defaultBranch": "canary"
+  },
+  {
+    "id": 599320067,
+    "name": "langflow",
+    "repo": "langflow-ai/langflow",
+    "description": "Langflow is a powerful tool for building and deploying AI-powered agents and workflows.",
+    "createdAt": "2023-02-08T22:28:03Z",
+    "updatedAt": "2025-10-27T07:22:05Z",
+    "pushedAt": "2025-10-27T00:28:51Z",
+    "stars": 134904,
+    "watchers": 453,
+    "forks": 7853,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 307260205,
+    "name": "yt-dlp",
+    "repo": "yt-dlp/yt-dlp",
+    "description": "A feature-rich command-line audio/video downloader",
+    "createdAt": "2020-10-26T04:22:55Z",
+    "updatedAt": "2025-10-27T07:35:17Z",
+    "pushedAt": "2025-10-25T22:47:00Z",
+    "stars": 132793,
+    "watchers": 675,
+    "forks": 10659,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 58028038,
+    "name": "HelloGitHub",
+    "repo": "521xueweihan/HelloGitHub",
+    "description": ":octocat: 分享 GitHub 上有趣、入门级的开源项目。Share interesting, entry-level open source projects on GitHub.",
+    "createdAt": "2016-05-04T06:24:11Z",
+    "updatedAt": "2025-10-27T07:49:37Z",
+    "pushedAt": "2025-09-28T02:00:22Z",
+    "stars": 132228,
+    "watchers": 4182,
+    "forks": 10822,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 62607227,
+    "name": "tech-interview-handbook",
+    "repo": "yangshun/tech-interview-handbook",
+    "description": "💯 Curated coding interview preparation materials for busy software engineers",
+    "createdAt": "2016-07-05T05:00:48Z",
+    "updatedAt": "2025-10-27T07:26:22Z",
+    "pushedAt": "2025-08-27T00:17:33Z",
+    "stars": 131399,
+    "watchers": 2182,
+    "forks": 15942,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 23096959,
+    "name": "go",
+    "repo": "golang/go",
+    "description": "The Go programming language",
+    "createdAt": "2014-08-19T04:33:40Z",
+    "updatedAt": "2025-10-27T07:25:58Z",
+    "pushedAt": "2025-10-27T04:49:52Z",
+    "stars": 130538,
+    "watchers": 3346,
+    "forks": 18415,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 111583593,
+    "name": "scrcpy",
+    "repo": "Genymobile/scrcpy",
+    "description": "Display and control your Android device",
+    "createdAt": "2017-11-21T18:00:27Z",
+    "updatedAt": "2025-10-27T07:30:24Z",
+    "pushedAt": "2025-10-26T10:52:03Z",
+    "stars": 130238,
+    "watchers": 1321,
+    "forks": 12191,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 241576270,
+    "name": "fucking-algorithm",
+    "repo": "labuladong/fucking-algorithm",
+    "description": "刷算法全靠套路，认准 labuladong 就够了！English version supported! Crack LeetCode, not only how, but also why. ",
+    "createdAt": "2020-02-19T09:01:23Z",
+    "updatedAt": "2025-10-27T07:27:20Z",
+    "pushedAt": "2025-10-08T04:06:00Z",
+    "stars": 129651,
+    "watchers": 2283,
+    "forks": 23450,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 112507086,
+    "name": "30-seconds-of-code",
+    "repo": "Chalarangelo/30-seconds-of-code",
+    "description": "Coding articles to level up your development skills",
+    "createdAt": "2017-11-29T17:35:03Z",
+    "updatedAt": "2025-10-27T07:26:47Z",
+    "pushedAt": "2025-10-22T12:51:11Z",
+    "stars": 125630,
+    "watchers": 2594,
+    "forks": 12358,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 184456251,
+    "name": "PowerToys",
+    "repo": "microsoft/PowerToys",
+    "description": "Microsoft PowerToys is a collection of utilities that help you customize Windows and streamline everyday tasks",
+    "createdAt": "2019-05-01T17:44:02Z",
+    "updatedAt": "2025-10-27T07:50:46Z",
+    "pushedAt": "2025-10-27T02:44:52Z",
+    "stars": 125223,
+    "watchers": 1164,
+    "forks": 7451,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 29028775,
+    "name": "react-native",
+    "repo": "facebook/react-native",
+    "description": "A framework for building native applications using React",
+    "createdAt": "2015-01-09T18:10:16Z",
+    "updatedAt": "2025-10-27T07:20:37Z",
+    "pushedAt": "2025-10-27T06:53:57Z",
+    "stars": 124320,
+    "watchers": 3563,
+    "forks": 24914,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 9384267,
+    "name": "electron",
+    "repo": "electron/electron",
+    "description": ":electron: Build cross-platform desktop apps with JavaScript, HTML, and CSS",
+    "createdAt": "2013-04-12T01:47:36Z",
+    "updatedAt": "2025-10-27T07:25:42Z",
+    "pushedAt": "2025-10-27T06:46:57Z",
+    "stars": 118841,
+    "watchers": 2801,
+    "forks": 16578,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 20580498,
+    "name": "kubernetes",
+    "repo": "kubernetes/kubernetes",
+    "description": "Production-Grade Container Scheduling and Management",
+    "createdAt": "2014-06-06T22:56:04Z",
+    "updatedAt": "2025-10-27T07:31:13Z",
+    "pushedAt": "2025-10-26T22:21:34Z",
+    "stars": 118226,
+    "watchers": 3189,
+    "forks": 41578,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 552661142,
+    "name": "langchain",
+    "repo": "langchain-ai/langchain",
+    "description": "🦜🔗 Build context-aware reasoning applications",
+    "createdAt": "2022-10-17T02:58:36Z",
+    "updatedAt": "2025-10-27T07:37:09Z",
+    "pushedAt": "2025-10-27T07:39:14Z",
+    "stars": 118140,
+    "watchers": 775,
+    "forks": 19453,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 561730219,
+    "name": "hello-algo",
+    "repo": "krahets/hello-algo",
+    "description": "《Hello 算法》：动画图解、一键运行的数据结构与算法教程。支持 Python, Java, C++, C, C#, JS, Go, Swift, Rust, Ruby, Kotlin, TS, Dart 代码。简体版和繁体版同步更新，English version in translation",
+    "createdAt": "2022-11-04T11:08:34Z",
+    "updatedAt": "2025-10-27T07:28:05Z",
+    "pushedAt": "2025-10-16T21:33:36Z",
+    "stars": 118081,
+    "watchers": 582,
+    "forks": 14500,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 626805178,
+    "name": "dify",
+    "repo": "langgenius/dify",
+    "description": "Production-ready platform for agentic workflow development.",
+    "createdAt": "2023-04-12T07:40:24Z",
+    "updatedAt": "2025-10-27T07:45:31Z",
+    "pushedAt": "2025-10-27T07:48:43Z",
+    "stars": 117359,
+    "watchers": 697,
+    "forks": 18125,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 14098069,
+    "name": "free-programming-books-zh_CN",
+    "repo": "justjavac/free-programming-books-zh_CN",
+    "description": ":books: 免费的计算机编程类中文书籍，欢迎投稿",
+    "createdAt": "2013-11-04T01:59:19Z",
+    "updatedAt": "2025-10-27T07:25:46Z",
+    "pushedAt": "2024-07-15T08:55:20Z",
+    "stars": 115537,
+    "watchers": 5860,
+    "forks": 28362,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 32484381,
+    "name": "free-for-dev",
+    "repo": "ripienaar/free-for-dev",
+    "description": "A list of SaaS, PaaS and IaaS offerings that have free tiers of interest to devops and infradev",
+    "createdAt": "2015-03-18T21:06:26Z",
+    "updatedAt": "2025-10-27T07:26:05Z",
+    "pushedAt": "2025-10-23T04:49:00Z",
+    "stars": 114093,
+    "watchers": 1734,
+    "forks": 11683,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 27193779,
+    "name": "node",
+    "repo": "nodejs/node",
+    "description": "Node.js JavaScript runtime ✨🐢🚀✨",
+    "createdAt": "2014-11-26T19:57:11Z",
+    "updatedAt": "2025-10-27T07:38:07Z",
+    "pushedAt": "2025-10-27T01:02:07Z",
+    "stars": 113974,
+    "watchers": 2964,
+    "forks": 33571,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 701547123,
+    "name": "open-webui",
+    "repo": "open-webui/open-webui",
+    "description": "User-friendly AI Interface (Supports Ollama, OpenAI API, ...)",
+    "createdAt": "2023-10-06T22:08:27Z",
+    "updatedAt": "2025-10-27T07:32:58Z",
+    "pushedAt": "2025-10-27T05:20:59Z",
+    "stars": 113474,
+    "watchers": 516,
+    "forks": 15764,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 943149,
+    "name": "d3",
+    "repo": "d3/d3",
+    "description": "Bring data to life with SVG, Canvas and HTML. :bar_chart::chart_with_upwards_trend::tada:",
+    "createdAt": "2010-09-27T17:22:42Z",
+    "updatedAt": "2025-10-27T07:25:31Z",
+    "pushedAt": "2025-07-27T11:30:40Z",
+    "stars": 111683,
+    "watchers": 3558,
+    "forks": 22851,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 808144141,
+    "name": "FreeDomain",
+    "repo": "DigitalPlatDev/FreeDomain",
+    "description": "DigitalPlat FreeDomain: Free Domain For Everyone",
+    "createdAt": "2024-05-30T13:23:00Z",
+    "updatedAt": "2025-10-27T07:49:47Z",
+    "pushedAt": "2025-09-25T12:12:01Z",
+    "stars": 111350,
+    "watchers": 120,
+    "forks": 2066,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 231283452,
+    "name": "excalidraw",
+    "repo": "excalidraw/excalidraw",
+    "description": "Virtual whiteboard for sketching hand-drawn like diagrams",
+    "createdAt": "2020-01-02T01:04:43Z",
+    "updatedAt": "2025-10-27T07:49:00Z",
+    "pushedAt": "2025-10-27T06:42:25Z",
+    "stars": 109225,
+    "watchers": 467,
+    "forks": 11332,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 576201,
+    "name": "three.js",
+    "repo": "mrdoob/three.js",
+    "description": "JavaScript 3D Library.",
+    "createdAt": "2010-03-23T18:58:01Z",
+    "updatedAt": "2025-10-27T07:25:30Z",
+    "pushedAt": "2025-10-26T17:25:47Z",
+    "stars": 109123,
+    "watchers": 2517,
+    "forks": 36051,
+    "defaultBranch": "dev"
+  },
+  {
+    "id": 23088740,
+    "name": "axios",
+    "repo": "axios/axios",
+    "description": "Promise based HTTP client for the browser and node.js",
+    "createdAt": "2014-08-18T22:30:27Z",
+    "updatedAt": "2025-10-27T05:22:18Z",
+    "pushedAt": "2025-10-26T22:46:40Z",
+    "stars": 108017,
+    "watchers": 1169,
+    "forks": 11366,
+    "defaultBranch": "v1.x"
+  },
+  {
+    "id": 724712,
+    "name": "rust",
+    "repo": "rust-lang/rust",
+    "description": "Empowering everyone to build reliable and efficient software.",
+    "createdAt": "2010-06-16T20:39:03Z",
+    "updatedAt": "2025-10-27T06:39:34Z",
+    "pushedAt": "2025-10-27T07:25:41Z",
+    "stars": 107453,
+    "watchers": 1467,
+    "forks": 13897,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 20929025,
+    "name": "TypeScript",
+    "repo": "microsoft/TypeScript",
+    "description": "TypeScript is a superset of JavaScript that compiles to clean JavaScript output.",
+    "createdAt": "2014-06-17T15:28:39Z",
+    "updatedAt": "2025-10-27T07:20:39Z",
+    "pushedAt": "2025-10-27T00:06:54Z",
+    "stars": 106530,
+    "watchers": 2148,
+    "forks": 13084,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 133442384,
+    "name": "deno",
+    "repo": "denoland/deno",
+    "description": "A modern runtime for JavaScript and TypeScript.",
+    "createdAt": "2018-05-15T01:34:26Z",
+    "updatedAt": "2025-10-27T07:14:57Z",
+    "pushedAt": "2025-10-24T23:41:20Z",
+    "stars": 104915,
+    "watchers": 1398,
+    "forks": 5753,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 103633984,
+    "name": "nodebestpractices",
+    "repo": "goldbergyoni/nodebestpractices",
+    "description": ":white_check_mark:  The Node.js best practices list (July 2024)",
+    "createdAt": "2017-09-15T08:33:19Z",
+    "updatedAt": "2025-10-27T07:26:43Z",
+    "pushedAt": "2025-04-15T21:52:42Z",
+    "stars": 104439,
+    "watchers": 1944,
+    "forks": 10627,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 63537249,
+    "name": "create-react-app",
+    "repo": "facebook/create-react-app",
+    "description": "Set up a modern web app by running one command.",
+    "createdAt": "2016-07-17T14:55:11Z",
+    "updatedAt": "2025-10-27T07:26:24Z",
+    "pushedAt": "2025-02-15T01:32:11Z",
+    "stars": 103811,
+    "watchers": 1892,
+    "forks": 27146,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 206462776,
+    "name": "GitHub-Chinese-Top-Charts",
+    "repo": "GrowingGit/GitHub-Chinese-Top-Charts",
+    "description": ":cn: GitHub中文排行榜，各语言分设「软件 | 资料」榜单，精准定位中文好项目。各取所需，高效学习。",
+    "createdAt": "2019-09-05T03:01:56Z",
+    "updatedAt": "2025-10-27T06:04:01Z",
+    "pushedAt": "2024-10-12T06:51:36Z",
+    "stars": 103336,
+    "watchers": 2607,
+    "forks": 13364,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 15634981,
+    "name": "godot",
+    "repo": "godotengine/godot",
+    "description": "Godot Engine – Multi-platform 2D and 3D game engine",
+    "createdAt": "2014-01-04T16:05:36Z",
+    "updatedAt": "2025-10-27T07:16:51Z",
+    "pushedAt": "2025-10-25T20:48:20Z",
+    "stars": 102604,
+    "watchers": 1493,
+    "forks": 23450,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 299354207,
+    "name": "rustdesk",
+    "repo": "rustdesk/rustdesk",
+    "description": "An open-source remote desktop application designed for self-hosting, as an alternative to TeamViewer.",
+    "createdAt": "2020-09-28T15:36:08Z",
+    "updatedAt": "2025-10-27T07:42:29Z",
+    "pushedAt": "2025-10-26T13:28:57Z",
+    "stars": 101456,
+    "watchers": 548,
+    "forks": 14837,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 655806940,
+    "name": "generative-ai-for-beginners",
+    "repo": "microsoft/generative-ai-for-beginners",
+    "description": "21 Lessons, Get Started Building with Generative AI ",
+    "createdAt": "2023-06-19T16:28:59Z",
+    "updatedAt": "2025-10-27T07:38:12Z",
+    "pushedAt": "2025-10-27T03:19:39Z",
+    "stars": 100935,
+    "watchers": 889,
+    "forks": 53478,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 100060912,
+    "name": "terminal",
+    "repo": "microsoft/terminal",
+    "description": "The new Windows Terminal and the original Windows console host, all in the same place!",
+    "createdAt": "2017-08-11T18:38:22Z",
+    "updatedAt": "2025-10-27T05:40:24Z",
+    "pushedAt": "2025-10-22T01:31:33Z",
+    "stars": 100726,
+    "watchers": 1334,
+    "forks": 8879,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 48378947,
+    "name": "frp",
+    "repo": "fatedier/frp",
+    "description": "A fast reverse proxy to help you expose a local server behind a NAT or firewall to the internet.",
+    "createdAt": "2015-12-21T15:24:59Z",
+    "updatedAt": "2025-10-27T07:00:25Z",
+    "pushedAt": "2025-10-17T02:53:43Z",
+    "stars": 100015,
+    "watchers": 1563,
+    "forks": 14562,
+    "defaultBranch": "dev"
+  },
+  {
+    "id": 908531752,
+    "name": "DeepSeek-V3",
+    "repo": "deepseek-ai/DeepSeek-V3",
+    "description": null,
+    "createdAt": "2024-12-26T09:52:40Z",
+    "updatedAt": "2025-10-27T07:28:30Z",
+    "pushedAt": "2025-08-28T03:24:37Z",
+    "stars": 99981,
+    "watchers": 750,
+    "forks": 16309,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 55076063,
+    "name": "Awesome-Hacking",
+    "repo": "Hack-with-Github/Awesome-Hacking",
+    "description": "A collection of various awesome lists for hackers, pentesters and security researchers",
+    "createdAt": "2016-03-30T15:47:10Z",
+    "updatedAt": "2025-10-27T07:49:40Z",
+    "pushedAt": "2025-01-18T01:48:02Z",
+    "stars": 99684,
+    "watchers": 3931,
+    "forks": 9634,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 15204860,
+    "name": "papers-we-love",
+    "repo": "papers-we-love/papers-we-love",
+    "description": "Papers from the computer science community to read and discuss.",
+    "createdAt": "2013-12-15T14:31:41Z",
+    "updatedAt": "2025-10-27T07:49:42Z",
+    "pushedAt": "2025-10-10T15:35:14Z",
+    "stars": 99626,
+    "watchers": 3159,
+    "forks": 6144,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 24195339,
+    "name": "angular",
+    "repo": "angular/angular",
+    "description": "Deliver web apps with confidence 🚀",
+    "createdAt": "2014-09-18T16:12:01Z",
+    "updatedAt": "2025-10-27T07:05:22Z",
+    "pushedAt": "2025-10-24T19:28:33Z",
+    "stars": 99167,
+    "watchers": 2980,
+    "forks": 26724,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 585146387,
+    "name": "ui",
+    "repo": "shadcn-ui/ui",
+    "description": "A set of beautifully-designed, accessible components and a code distribution platform. Works with your favorite frameworks. Open Source. Open Code.",
+    "createdAt": "2023-01-04T12:43:27Z",
+    "updatedAt": "2025-10-27T07:34:00Z",
+    "pushedAt": "2025-10-27T07:18:39Z",
+    "stars": 98464,
+    "watchers": 306,
+    "forks": 7031,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 196701619,
+    "name": "tauri",
+    "repo": "tauri-apps/tauri",
+    "description": "Build smaller, faster, and more secure desktop and mobile applications with a web frontend.",
+    "createdAt": "2019-07-13T09:09:37Z",
+    "updatedAt": "2025-10-27T07:27:10Z",
+    "pushedAt": "2025-10-26T13:55:16Z",
+    "stars": 98199,
+    "watchers": 530,
+    "forks": 3133,
+    "defaultBranch": "dev"
+  },
+  {
+    "id": 157616880,
+    "name": "iptv",
+    "repo": "iptv-org/iptv",
+    "description": "Collection of publicly available IPTV channels from all over the world",
+    "createdAt": "2018-11-14T22:00:57Z",
+    "updatedAt": "2025-10-27T07:13:48Z",
+    "pushedAt": "2025-10-27T00:13:17Z",
+    "stars": 98051,
+    "watchers": 1950,
+    "forks": 4195,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 23083156,
+    "name": "material-ui",
+    "repo": "mui/material-ui",
+    "description": "Material UI: Comprehensive React component library that implements Google's Material Design. Free forever.",
+    "createdAt": "2014-08-18T19:11:54Z",
+    "updatedAt": "2025-10-27T07:25:58Z",
+    "pushedAt": "2025-10-27T07:11:45Z",
+    "stars": 96875,
+    "watchers": 1312,
+    "forks": 32696,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 34526884,
+    "name": "ant-design",
+    "repo": "ant-design/ant-design",
+    "description": "An enterprise-class UI design language and React UI library",
+    "createdAt": "2015-04-24T15:37:24Z",
+    "updatedAt": "2025-10-27T07:19:39Z",
+    "pushedAt": "2025-10-27T07:44:37Z",
+    "stars": 96467,
+    "watchers": 236,
+    "forks": 53873,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 243950408,
+    "name": "HowToCook",
+    "repo": "Anduin2017/HowToCook",
+    "description": "程序员在家做饭方法指南。Programmer's guide about how to cook at home (Simplified Chinese only).",
+    "createdAt": "2020-02-29T10:43:49Z",
+    "updatedAt": "2025-10-27T07:31:17Z",
+    "pushedAt": "2025-10-23T12:40:47Z",
+    "stars": 95393,
+    "watchers": 488,
+    "forks": 10650,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 33614304,
+    "name": "thefuck",
+    "repo": "nvbn/thefuck",
+    "description": "Magnificent app which corrects your previous console command.",
+    "createdAt": "2015-04-08T15:08:04Z",
+    "updatedAt": "2025-10-27T07:26:06Z",
+    "pushedAt": "2024-07-19T14:56:13Z",
+    "stars": 94482,
+    "watchers": 825,
+    "forks": 3792,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 65600975,
+    "name": "pytorch",
+    "repo": "pytorch/pytorch",
+    "description": "Tensors and Dynamic neural networks in Python with strong GPU acceleration",
+    "createdAt": "2016-08-13T05:26:41Z",
+    "updatedAt": "2025-10-27T07:51:08Z",
+    "pushedAt": "2025-10-27T07:51:03Z",
+    "stars": 94273,
+    "watchers": 1771,
+    "forks": 25671,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 74791366,
+    "name": "clean-code-javascript",
+    "repo": "ryanmcdermott/clean-code-javascript",
+    "description": "Clean Code concepts adapted for JavaScript",
+    "createdAt": "2016-11-25T22:25:41Z",
+    "updatedAt": "2025-10-27T03:36:56Z",
+    "pushedAt": "2024-07-29T07:24:37Z",
+    "stars": 93960,
+    "watchers": 1744,
+    "forks": 12496,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 101296881,
+    "name": "every-programmer-should-know",
+    "repo": "mtdvio/every-programmer-should-know",
+    "description": "A collection of (mostly) technical things every software developer should know about",
+    "createdAt": "2017-08-24T13:18:26Z",
+    "updatedAt": "2025-10-27T07:26:42Z",
+    "pushedAt": "2025-10-22T15:21:18Z",
+    "stars": 93814,
+    "watchers": 2011,
+    "forks": 8436,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 16408992,
+    "name": "neovim",
+    "repo": "neovim/neovim",
+    "description": "Vim-fork focused on extensibility and usability",
+    "createdAt": "2014-01-31T13:39:22Z",
+    "updatedAt": "2025-10-27T07:30:43Z",
+    "pushedAt": "2025-10-27T05:15:23Z",
+    "stars": 93731,
+    "watchers": 971,
+    "forks": 6378,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 943398999,
+    "name": "system-prompts-and-models-of-ai-tools",
+    "repo": "x1xhlol/system-prompts-and-models-of-ai-tools",
+    "description": "FULL Augment Code, Claude Code, Cluely, CodeBuddy, Comet, Cursor, Devin AI, Junie, Kiro, Leap.new, Lovable, Manus Agent Tools, NotionAI, Orchids.app, Perplexity, Poke, Qoder, Replit, Same.dev, Trae, Traycer AI, VSCode Agent, Warp.dev, Windsurf, Xcode, Z.ai Code, dia & v0. (And other Open Sourced) System Prompts, Internal Tools & AI Models",
+    "createdAt": "2025-03-05T16:38:29Z",
+    "updatedAt": "2025-10-27T07:37:40Z",
+    "pushedAt": "2025-10-19T18:44:24Z",
+    "stars": 93282,
+    "watchers": 1183,
+    "forks": 25228,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 22790488,
+    "name": "java-design-patterns",
+    "repo": "iluwatar/java-design-patterns",
+    "description": "Design patterns implemented in Java",
+    "createdAt": "2014-08-09T16:45:18Z",
+    "updatedAt": "2025-10-27T07:35:54Z",
+    "pushedAt": "2025-10-21T21:30:34Z",
+    "stars": 93215,
+    "watchers": 3717,
+    "forks": 27309,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 90796663,
+    "name": "puppeteer",
+    "repo": "puppeteer/puppeteer",
+    "description": "JavaScript API for Chrome and Firefox",
+    "createdAt": "2017-05-09T22:16:13Z",
+    "updatedAt": "2025-10-27T07:31:12Z",
+    "pushedAt": "2025-10-26T04:03:55Z",
+    "stars": 92724,
+    "watchers": 1184,
+    "forks": 9314,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 311525798,
+    "name": "Web-Dev-For-Beginners",
+    "repo": "microsoft/Web-Dev-For-Beginners",
+    "description": "24 Lessons, 12 Weeks, Get Started as a Web Developer",
+    "createdAt": "2020-11-10T02:44:00Z",
+    "updatedAt": "2025-10-27T07:27:35Z",
+    "pushedAt": "2025-10-25T00:47:36Z",
+    "stars": 92476,
+    "watchers": 2690,
+    "forks": 14330,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 589831718,
+    "name": "ComfyUI",
+    "repo": "comfyanonymous/ComfyUI",
+    "description": "The most powerful and modular diffusion model GUI, api and backend with a graph/nodes interface.",
+    "createdAt": "2023-01-17T03:15:56Z",
+    "updatedAt": "2025-10-27T07:46:53Z",
+    "pushedAt": "2025-10-27T00:23:05Z",
+    "stars": 92036,
+    "watchers": 614,
+    "forks": 10341,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 63539055,
+    "name": "awesome-mac",
+    "repo": "jaywcjlove/awesome-mac",
+    "description": " Now we have become very big, Different from the original idea. Collect premium software in various categories.",
+    "createdAt": "2016-07-17T15:33:47Z",
+    "updatedAt": "2025-10-27T07:50:40Z",
+    "pushedAt": "2025-10-25T04:02:03Z",
+    "stars": 91815,
+    "watchers": 1517,
+    "forks": 6947,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 919443098,
+    "name": "DeepSeek-R1",
+    "repo": "deepseek-ai/DeepSeek-R1",
+    "description": null,
+    "createdAt": "2025-01-20T11:57:28Z",
+    "updatedAt": "2025-10-27T06:56:07Z",
+    "pushedAt": "2025-06-27T08:35:54Z",
+    "stars": 91380,
+    "watchers": 607,
+    "forks": 11766,
+    "defaultBranch": "main"
+  },
+  {
+    "id": 160919119,
+    "name": "fastapi",
+    "repo": "fastapi/fastapi",
+    "description": "FastAPI framework, high performance, easy to learn, fast to code, ready for production",
+    "createdAt": "2018-12-08T08:21:47Z",
+    "updatedAt": "2025-10-27T07:49:54Z",
+    "pushedAt": "2025-10-23T20:55:59Z",
+    "stars": 91203,
+    "watchers": 723,
+    "forks": 8123,
+    "defaultBranch": "master"
+  },
+  {
+    "id": 106017343,
+    "name": "tailwindcss",
+    "repo": "tailwindlabs/tailwindcss",
+    "description": "A utility-first CSS framework for rapid UI development.",
+    "createdAt": "2017-10-06T14:59:14Z",
+    "updatedAt": "2025-10-27T07:48:03Z",
+    "pushedAt": "2025-10-24T11:53:16Z",
+    "stars": 90800,
+    "watchers": 615,
+    "forks": 4771,
+    "defaultBranch": "main"
+  }
+]
diff --git a/benchmarks/package.json b/benchmarks/package.json
new file mode 100644
index 0000000..b3c7f71
--- /dev/null
+++ b/benchmarks/package.json
@@ -0,0 +1,26 @@
+{
+  "name": "@toon/benchmarks",
+  "type": "module",
+  "private": true,
+  "scripts": {
+    "benchmark:token-efficiency": "tsx scripts/token-efficiency-benchmark.ts",
+    "benchmark:accuracy": "tsx --env-file=.env scripts/accuracy-benchmark.ts",
+    "fetch-github-data": "tsx scripts/fetch-github-data.ts",
+    "test": "vitest"
+  },
+  "devDependencies": {
+    "@ai-sdk/anthropic": "^2.0.37",
+    "@ai-sdk/google": "^2.0.23",
+    "@ai-sdk/openai": "^2.0.53",
+    "@ai-sdk/provider": "^2.0.0",
+    "@antfu/eslint-config": "^6.1.0",
+    "@faker-js/faker": "^10.1.0",
+    "ai": "^5.0.80",
+    "consola": "^3.4.2",
+    "csv-stringify": "^6.6.0",
+    "gpt-tokenizer": "^3.2.0",
+    "ofetch": "^1.4.1",
+    "p-map": "^7.0.3",
+    "yaml": "^2.8.1"
+  }
+}
diff --git a/benchmarks/results/accuracy/accuracy.md b/benchmarks/results/accuracy/accuracy.md
new file mode 100644
index 0000000..e435df6
--- /dev/null
+++ b/benchmarks/results/accuracy/accuracy.md
@@ -0,0 +1,96 @@
+### Retrieval Accuracy
+
+Tested across **2 LLMs** with data retrieval tasks:
+
+```
+gpt-4o-mini          ██████████████░░░░░░ 72.3% accuracy
+claude-haiku-4-5     ███████████████░░░░░ 76.7% accuracy
+```
+
+**TOON achieves 73.9% accuracy (vs JSON's 73.6%) while using 46.3% fewer tokens.**
+
+| Format | Accuracy | Average Tokens |
+| ------ | -------- | -------------- |
+| `toon` | 73.9% | 4,678 |
+| `json` | 73.6% | 8,713 |
+| `markdown-kv` | 73.6% | 8,649 |
+| `csv` | 72.3% | 4,745 |
+| `yaml` | 71.7% | 7,091 |
+
+<details>
+<summary><strong>View detailed breakdown by dataset and model</strong></summary>
+
+#### Performance by Dataset
+
+##### Uniform employee records (TOON optimal format)
+
+| Format | Accuracy | Tokens | Correct/Total |
+|--------|----------|--------|---------------|
+| `toon` | 72.4% | 2,483 | 84/116 |
+| `csv` | 69.0% | 2,337 | 80/116 |
+| `yaml` | 68.1% | 4,969 | 79/116 |
+| `markdown-kv` | 68.1% | 6,270 | 79/116 |
+| `json` | 68.1% | 6,347 | 79/116 |
+
+##### E-commerce orders with nested structures
+
+| Format | Accuracy | Tokens | Correct/Total |
+|--------|----------|--------|---------------|
+| `toon` | 84.1% | 5,967 | 74/88 |
+| `csv` | 83.0% | 6,735 | 73/88 |
+| `yaml` | 81.8% | 7,328 | 72/88 |
+| `markdown-kv` | 86.4% | 9,110 | 76/88 |
+| `json` | 84.1% | 9,694 | 74/88 |
+
+##### Time-series analytics data
+
+| Format | Accuracy | Tokens | Correct/Total |
+|--------|----------|--------|---------------|
+| `csv` | 72.4% | 1,393 | 42/58 |
+| `toon` | 70.7% | 1,515 | 41/58 |
+| `yaml` | 72.4% | 2,938 | 42/58 |
+| `json` | 74.1% | 3,665 | 43/58 |
+| `markdown-kv` | 70.7% | 3,779 | 41/58 |
+
+##### Popular GitHub repositories
+
+| Format | Accuracy | Tokens | Correct/Total |
+|--------|----------|--------|---------------|
+| `toon` | 64.3% | 8,745 | 36/56 |
+| `csv` | 62.5% | 8,513 | 35/56 |
+| `json` | 67.9% | 15,145 | 38/56 |
+| `markdown-kv` | 67.9% | 15,436 | 38/56 |
+| `yaml` | 62.5% | 13,129 | 35/56 |
+
+
+#### Performance by Model
+
+##### gpt-4o-mini
+
+| Format | Accuracy | Correct/Total |
+|--------|----------|---------------|
+| `toon` | 72.3% | 115/159 |
+| `json` | 71.7% | 114/159 |
+| `markdown-kv` | 70.4% | 112/159 |
+| `csv` | 69.2% | 110/159 |
+| `yaml` | 68.6% | 109/159 |
+
+##### claude-haiku-4-5
+
+| Format | Accuracy | Correct/Total |
+|--------|----------|---------------|
+| `markdown-kv` | 76.7% | 122/159 |
+| `toon` | 75.5% | 120/159 |
+| `json` | 75.5% | 120/159 |
+| `csv` | 75.5% | 120/159 |
+| `yaml` | 74.8% | 119/159 |
+
+
+#### Methodology
+
+- **Semantic validation**: LLM-as-judge validates responses semantically (not exact string matching)
+- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding
+- **Question types**: Field retrieval, aggregation, and filtering tasks
+- **Datasets**: Synthetic datasets generated with faker.js, plus real GitHub repository data
+
+</details>
diff --git a/benchmarks/results/accuracy/raw-results.json b/benchmarks/results/accuracy/raw-results.json
new file mode 100644
index 0000000..a5a21a0
--- /dev/null
+++ b/benchmarks/results/accuracy/raw-results.json
@@ -0,0 +1,17492 @@
+[
+  {
+    "questionId": "q1",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "56176",
+    "actual": "56176",
+    "correct": true,
+    "inputTokens": 6391,
+    "outputTokens": 3,
+    "latencyMs": 1313
+  },
+  {
+    "questionId": "q1",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "56176",
+    "actual": "56176",
+    "correct": true,
+    "inputTokens": 7870,
+    "outputTokens": 6,
+    "latencyMs": 1346
+  },
+  {
+    "questionId": "q1",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "56176",
+    "actual": "56176",
+    "correct": true,
+    "inputTokens": 2528,
+    "outputTokens": 3,
+    "latencyMs": 1191
+  },
+  {
+    "questionId": "q1",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "56176",
+    "actual": "56176",
+    "correct": true,
+    "inputTokens": 2982,
+    "outputTokens": 6,
+    "latencyMs": 1399
+  },
+  {
+    "questionId": "q1",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "56176",
+    "actual": "56176",
+    "correct": true,
+    "inputTokens": 2382,
+    "outputTokens": 3,
+    "latencyMs": 5010
+  },
+  {
+    "questionId": "q1",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "56176",
+    "actual": "56176",
+    "correct": true,
+    "inputTokens": 2856,
+    "outputTokens": 6,
+    "latencyMs": 1472
+  },
+  {
+    "questionId": "q1",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "56176",
+    "actual": "56176",
+    "correct": true,
+    "inputTokens": 6317,
+    "outputTokens": 3,
+    "latencyMs": 1667
+  },
+  {
+    "questionId": "q1",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "56176",
+    "actual": "56176",
+    "correct": true,
+    "inputTokens": 6365,
+    "outputTokens": 6,
+    "latencyMs": 1507
+  },
+  {
+    "questionId": "q1",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "56176",
+    "actual": "56176",
+    "correct": true,
+    "inputTokens": 5013,
+    "outputTokens": 3,
+    "latencyMs": 1325
+  },
+  {
+    "questionId": "q1",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "56176",
+    "actual": "56176",
+    "correct": true,
+    "inputTokens": 5760,
+    "outputTokens": 6,
+    "latencyMs": 2280
+  },
+  {
+    "questionId": "q2",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6391,
+    "outputTokens": 2,
+    "latencyMs": 3167
+  },
+  {
+    "questionId": "q2",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 7869,
+    "outputTokens": 4,
+    "latencyMs": 1267
+  },
+  {
+    "questionId": "q2",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2528,
+    "outputTokens": 2,
+    "latencyMs": 1402
+  },
+  {
+    "questionId": "q2",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2981,
+    "outputTokens": 4,
+    "latencyMs": 1290
+  },
+  {
+    "questionId": "q2",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2382,
+    "outputTokens": 2,
+    "latencyMs": 5070
+  },
+  {
+    "questionId": "q2",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2855,
+    "outputTokens": 4,
+    "latencyMs": 1320
+  },
+  {
+    "questionId": "q2",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6317,
+    "outputTokens": 2,
+    "latencyMs": 1745
+  },
+  {
+    "questionId": "q2",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6364,
+    "outputTokens": 4,
+    "latencyMs": 1191
+  },
+  {
+    "questionId": "q2",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5013,
+    "outputTokens": 2,
+    "latencyMs": 2713
+  },
+  {
+    "questionId": "q2",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5759,
+    "outputTokens": 4,
+    "latencyMs": 1309
+  },
+  {
+    "questionId": "q3",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "lorenza.kunze@yahoo.com",
+    "actual": "lorenza.kunze@yahoo.com",
+    "correct": true,
+    "inputTokens": 6393,
+    "outputTokens": 7,
+    "latencyMs": 1160
+  },
+  {
+    "questionId": "q3",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "lorenza.kunze@yahoo.com",
+    "actual": "lorenza.kunze@yahoo.com",
+    "correct": true,
+    "inputTokens": 7874,
+    "outputTokens": 12,
+    "latencyMs": 1338
+  },
+  {
+    "questionId": "q3",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "lorenza.kunze@yahoo.com",
+    "actual": "lorenza.kunze@yahoo.com",
+    "correct": true,
+    "inputTokens": 2530,
+    "outputTokens": 7,
+    "latencyMs": 1478
+  },
+  {
+    "questionId": "q3",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "lorenza.kunze@yahoo.com",
+    "actual": "lorenza.kunze@yahoo.com",
+    "correct": true,
+    "inputTokens": 2986,
+    "outputTokens": 12,
+    "latencyMs": 1563
+  },
+  {
+    "questionId": "q3",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "lorenza.kunze@yahoo.com",
+    "actual": "lorenza.kunze@yahoo.com",
+    "correct": true,
+    "inputTokens": 2384,
+    "outputTokens": 7,
+    "latencyMs": 1310
+  },
+  {
+    "questionId": "q3",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "lorenza.kunze@yahoo.com",
+    "actual": "lorenza.kunze@yahoo.com",
+    "correct": true,
+    "inputTokens": 2860,
+    "outputTokens": 12,
+    "latencyMs": 1236
+  },
+  {
+    "questionId": "q3",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "lorenza.kunze@yahoo.com",
+    "actual": "lorenza.kunze@yahoo.com",
+    "correct": true,
+    "inputTokens": 6319,
+    "outputTokens": 7,
+    "latencyMs": 2236
+  },
+  {
+    "questionId": "q3",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "lorenza.kunze@yahoo.com",
+    "actual": "lorenza.kunze@yahoo.com",
+    "correct": true,
+    "inputTokens": 6369,
+    "outputTokens": 12,
+    "latencyMs": 1253
+  },
+  {
+    "questionId": "q3",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "lorenza.kunze@yahoo.com",
+    "actual": "lorenza.kunze@yahoo.com",
+    "correct": true,
+    "inputTokens": 5015,
+    "outputTokens": 7,
+    "latencyMs": 1917
+  },
+  {
+    "questionId": "q3",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "lorenza.kunze@yahoo.com",
+    "actual": "lorenza.kunze@yahoo.com",
+    "correct": true,
+    "inputTokens": 5764,
+    "outputTokens": 12,
+    "latencyMs": 1332
+  },
+  {
+    "questionId": "q4",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "117381",
+    "actual": "117381",
+    "correct": true,
+    "inputTokens": 6391,
+    "outputTokens": 3,
+    "latencyMs": 2945
+  },
+  {
+    "questionId": "q4",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "117381",
+    "actual": "117381",
+    "correct": true,
+    "inputTokens": 7870,
+    "outputTokens": 6,
+    "latencyMs": 1773
+  },
+  {
+    "questionId": "q4",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "117381",
+    "actual": "117381",
+    "correct": true,
+    "inputTokens": 2528,
+    "outputTokens": 3,
+    "latencyMs": 1294
+  },
+  {
+    "questionId": "q4",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "117381",
+    "actual": "117381",
+    "correct": true,
+    "inputTokens": 2982,
+    "outputTokens": 6,
+    "latencyMs": 980
+  },
+  {
+    "questionId": "q4",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "117381",
+    "actual": "117381",
+    "correct": true,
+    "inputTokens": 2382,
+    "outputTokens": 3,
+    "latencyMs": 1747
+  },
+  {
+    "questionId": "q4",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "117381",
+    "actual": "117381",
+    "correct": true,
+    "inputTokens": 2856,
+    "outputTokens": 6,
+    "latencyMs": 1197
+  },
+  {
+    "questionId": "q4",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "117381",
+    "actual": "117381",
+    "correct": true,
+    "inputTokens": 6317,
+    "outputTokens": 3,
+    "latencyMs": 1039
+  },
+  {
+    "questionId": "q4",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "117381",
+    "actual": "117381",
+    "correct": true,
+    "inputTokens": 6365,
+    "outputTokens": 6,
+    "latencyMs": 1453
+  },
+  {
+    "questionId": "q4",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "117381",
+    "actual": "117381",
+    "correct": true,
+    "inputTokens": 5013,
+    "outputTokens": 3,
+    "latencyMs": 1056
+  },
+  {
+    "questionId": "q4",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "117381",
+    "actual": "117381",
+    "correct": true,
+    "inputTokens": 5760,
+    "outputTokens": 6,
+    "latencyMs": 1564
+  },
+  {
+    "questionId": "q5",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6390,
+    "outputTokens": 2,
+    "latencyMs": 1263
+  },
+  {
+    "questionId": "q5",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 7868,
+    "outputTokens": 4,
+    "latencyMs": 1097
+  },
+  {
+    "questionId": "q5",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2527,
+    "outputTokens": 2,
+    "latencyMs": 1248
+  },
+  {
+    "questionId": "q5",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2980,
+    "outputTokens": 4,
+    "latencyMs": 1486
+  },
+  {
+    "questionId": "q5",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2381,
+    "outputTokens": 2,
+    "latencyMs": 1311
+  },
+  {
+    "questionId": "q5",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2854,
+    "outputTokens": 4,
+    "latencyMs": 1019
+  },
+  {
+    "questionId": "q5",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6316,
+    "outputTokens": 2,
+    "latencyMs": 1287
+  },
+  {
+    "questionId": "q5",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6363,
+    "outputTokens": 4,
+    "latencyMs": 1243
+  },
+  {
+    "questionId": "q5",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5012,
+    "outputTokens": 2,
+    "latencyMs": 1339
+  },
+  {
+    "questionId": "q5",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5758,
+    "outputTokens": 4,
+    "latencyMs": 1621
+  },
+  {
+    "questionId": "q6",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "jayda60@hotmail.com",
+    "actual": "jayda60@hotmail.com",
+    "correct": true,
+    "inputTokens": 6391,
+    "outputTokens": 6,
+    "latencyMs": 1625
+  },
+  {
+    "questionId": "q6",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "jayda60@hotmail.com",
+    "actual": "jayda60@hotmail.com",
+    "correct": true,
+    "inputTokens": 7871,
+    "outputTokens": 11,
+    "latencyMs": 1328
+  },
+  {
+    "questionId": "q6",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "jayda60@hotmail.com",
+    "actual": "jayda60@hotmail.com",
+    "correct": true,
+    "inputTokens": 2528,
+    "outputTokens": 6,
+    "latencyMs": 1463
+  },
+  {
+    "questionId": "q6",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "jayda60@hotmail.com",
+    "actual": "jayda60@hotmail.com",
+    "correct": true,
+    "inputTokens": 2983,
+    "outputTokens": 11,
+    "latencyMs": 1149
+  },
+  {
+    "questionId": "q6",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "jayda60@hotmail.com",
+    "actual": "jayda60@hotmail.com",
+    "correct": true,
+    "inputTokens": 2382,
+    "outputTokens": 6,
+    "latencyMs": 1474
+  },
+  {
+    "questionId": "q6",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "jayda60@hotmail.com",
+    "actual": "jayda60@hotmail.com",
+    "correct": true,
+    "inputTokens": 2857,
+    "outputTokens": 11,
+    "latencyMs": 977
+  },
+  {
+    "questionId": "q6",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "jayda60@hotmail.com",
+    "actual": "jayda60@hotmail.com",
+    "correct": true,
+    "inputTokens": 6317,
+    "outputTokens": 6,
+    "latencyMs": 2079
+  },
+  {
+    "questionId": "q6",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "jayda60@hotmail.com",
+    "actual": "jayda60@hotmail.com",
+    "correct": true,
+    "inputTokens": 6366,
+    "outputTokens": 11,
+    "latencyMs": 1134
+  },
+  {
+    "questionId": "q6",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "jayda60@hotmail.com",
+    "actual": "jayda60@hotmail.com",
+    "correct": true,
+    "inputTokens": 5013,
+    "outputTokens": 6,
+    "latencyMs": 1124
+  },
+  {
+    "questionId": "q6",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "jayda60@hotmail.com",
+    "actual": "jayda60@hotmail.com",
+    "correct": true,
+    "inputTokens": 5761,
+    "outputTokens": 11,
+    "latencyMs": 1053
+  },
+  {
+    "questionId": "q7",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "92971",
+    "actual": "92971",
+    "correct": true,
+    "inputTokens": 6391,
+    "outputTokens": 3,
+    "latencyMs": 1427
+  },
+  {
+    "questionId": "q7",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "92971",
+    "actual": "92971",
+    "correct": true,
+    "inputTokens": 7870,
+    "outputTokens": 6,
+    "latencyMs": 1246
+  },
+  {
+    "questionId": "q7",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "92971",
+    "actual": "92971",
+    "correct": true,
+    "inputTokens": 2528,
+    "outputTokens": 3,
+    "latencyMs": 1171
+  },
+  {
+    "questionId": "q7",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "92971",
+    "actual": "92971",
+    "correct": true,
+    "inputTokens": 2982,
+    "outputTokens": 6,
+    "latencyMs": 1547
+  },
+  {
+    "questionId": "q7",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "92971",
+    "actual": "92971",
+    "correct": true,
+    "inputTokens": 2382,
+    "outputTokens": 3,
+    "latencyMs": 1523
+  },
+  {
+    "questionId": "q7",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "92971",
+    "actual": "92971",
+    "correct": true,
+    "inputTokens": 2856,
+    "outputTokens": 6,
+    "latencyMs": 1148
+  },
+  {
+    "questionId": "q7",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "92971",
+    "actual": "92971",
+    "correct": true,
+    "inputTokens": 6317,
+    "outputTokens": 3,
+    "latencyMs": 1360
+  },
+  {
+    "questionId": "q7",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "92971",
+    "actual": "92971",
+    "correct": true,
+    "inputTokens": 6365,
+    "outputTokens": 6,
+    "latencyMs": 1100
+  },
+  {
+    "questionId": "q7",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "92971",
+    "actual": "92971",
+    "correct": true,
+    "inputTokens": 5013,
+    "outputTokens": 3,
+    "latencyMs": 1116
+  },
+  {
+    "questionId": "q7",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "92971",
+    "actual": "92971",
+    "correct": true,
+    "inputTokens": 5760,
+    "outputTokens": 6,
+    "latencyMs": 1202
+  },
+  {
+    "questionId": "q8",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Operations",
+    "correct": false,
+    "inputTokens": 6391,
+    "outputTokens": 2,
+    "latencyMs": 974
+  },
+  {
+    "questionId": "q8",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 7871,
+    "outputTokens": 4,
+    "latencyMs": 1357
+  },
+  {
+    "questionId": "q8",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2528,
+    "outputTokens": 2,
+    "latencyMs": 1107
+  },
+  {
+    "questionId": "q8",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2983,
+    "outputTokens": 4,
+    "latencyMs": 1126
+  },
+  {
+    "questionId": "q8",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2382,
+    "outputTokens": 2,
+    "latencyMs": 1124
+  },
+  {
+    "questionId": "q8",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2857,
+    "outputTokens": 4,
+    "latencyMs": 1208
+  },
+  {
+    "questionId": "q8",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Operations",
+    "correct": false,
+    "inputTokens": 6317,
+    "outputTokens": 2,
+    "latencyMs": 1463
+  },
+  {
+    "questionId": "q8",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6366,
+    "outputTokens": 4,
+    "latencyMs": 1175
+  },
+  {
+    "questionId": "q8",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5013,
+    "outputTokens": 2,
+    "latencyMs": 1952
+  },
+  {
+    "questionId": "q8",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5761,
+    "outputTokens": 4,
+    "latencyMs": 1271
+  },
+  {
+    "questionId": "q9",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "terrance.hansen@yahoo.com",
+    "actual": "terrance.hansen@yahoo.com",
+    "correct": true,
+    "inputTokens": 6393,
+    "outputTokens": 7,
+    "latencyMs": 1301
+  },
+  {
+    "questionId": "q9",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "terrance.hansen@yahoo.com",
+    "actual": "terrance.hansen@yahoo.com",
+    "correct": true,
+    "inputTokens": 7871,
+    "outputTokens": 11,
+    "latencyMs": 1371
+  },
+  {
+    "questionId": "q9",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "terrance.hansen@yahoo.com",
+    "actual": "terrance.hansen@yahoo.com",
+    "correct": true,
+    "inputTokens": 2530,
+    "outputTokens": 7,
+    "latencyMs": 1197
+  },
+  {
+    "questionId": "q9",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "terrance.hansen@yahoo.com",
+    "actual": "terrance.hansen@yahoo.com",
+    "correct": true,
+    "inputTokens": 2983,
+    "outputTokens": 11,
+    "latencyMs": 1088
+  },
+  {
+    "questionId": "q9",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "terrance.hansen@yahoo.com",
+    "actual": "terrance.hansen@yahoo.com",
+    "correct": true,
+    "inputTokens": 2384,
+    "outputTokens": 7,
+    "latencyMs": 1310
+  },
+  {
+    "questionId": "q9",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "terrance.hansen@yahoo.com",
+    "actual": "terrance.hansen@yahoo.com",
+    "correct": true,
+    "inputTokens": 2857,
+    "outputTokens": 11,
+    "latencyMs": 1300
+  },
+  {
+    "questionId": "q9",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "terrance.hansen@yahoo.com",
+    "actual": "terrance.hansen@yahoo.com",
+    "correct": true,
+    "inputTokens": 6319,
+    "outputTokens": 7,
+    "latencyMs": 1531
+  },
+  {
+    "questionId": "q9",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "terrance.hansen@yahoo.com",
+    "actual": "terrance.hansen@yahoo.com",
+    "correct": true,
+    "inputTokens": 6366,
+    "outputTokens": 11,
+    "latencyMs": 1275
+  },
+  {
+    "questionId": "q9",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "terrance.hansen@yahoo.com",
+    "actual": "terrence.hansen@yahoo.com",
+    "correct": false,
+    "inputTokens": 5015,
+    "outputTokens": 7,
+    "latencyMs": 1245
+  },
+  {
+    "questionId": "q9",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "terrance.hansen@yahoo.com",
+    "actual": "terrance.hansen@yahoo.com",
+    "correct": true,
+    "inputTokens": 5761,
+    "outputTokens": 11,
+    "latencyMs": 1215
+  },
+  {
+    "questionId": "q10",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "107744",
+    "actual": "107744",
+    "correct": true,
+    "inputTokens": 6392,
+    "outputTokens": 3,
+    "latencyMs": 4959
+  },
+  {
+    "questionId": "q10",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "107744",
+    "actual": "107744",
+    "correct": true,
+    "inputTokens": 7870,
+    "outputTokens": 6,
+    "latencyMs": 1269
+  },
+  {
+    "questionId": "q10",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "107744",
+    "actual": "107744",
+    "correct": true,
+    "inputTokens": 2529,
+    "outputTokens": 3,
+    "latencyMs": 1111
+  },
+  {
+    "questionId": "q10",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "107744",
+    "actual": "107744",
+    "correct": true,
+    "inputTokens": 2982,
+    "outputTokens": 6,
+    "latencyMs": 1254
+  },
+  {
+    "questionId": "q10",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "107744",
+    "actual": "107744",
+    "correct": true,
+    "inputTokens": 2383,
+    "outputTokens": 3,
+    "latencyMs": 1616
+  },
+  {
+    "questionId": "q10",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "107744",
+    "actual": "107744",
+    "correct": true,
+    "inputTokens": 2856,
+    "outputTokens": 6,
+    "latencyMs": 1123
+  },
+  {
+    "questionId": "q10",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "107744",
+    "actual": "107744",
+    "correct": true,
+    "inputTokens": 6318,
+    "outputTokens": 3,
+    "latencyMs": 1201
+  },
+  {
+    "questionId": "q10",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "107744",
+    "actual": "107744",
+    "correct": true,
+    "inputTokens": 6365,
+    "outputTokens": 6,
+    "latencyMs": 1371
+  },
+  {
+    "questionId": "q10",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "107744",
+    "actual": "107744",
+    "correct": true,
+    "inputTokens": 5014,
+    "outputTokens": 3,
+    "latencyMs": 1503
+  },
+  {
+    "questionId": "q10",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "107744",
+    "actual": "107744",
+    "correct": true,
+    "inputTokens": 5760,
+    "outputTokens": 6,
+    "latencyMs": 1249
+  },
+  {
+    "questionId": "q11",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6391,
+    "outputTokens": 2,
+    "latencyMs": 1383
+  },
+  {
+    "questionId": "q11",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 7869,
+    "outputTokens": 4,
+    "latencyMs": 1081
+  },
+  {
+    "questionId": "q11",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2528,
+    "outputTokens": 2,
+    "latencyMs": 1677
+  },
+  {
+    "questionId": "q11",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2981,
+    "outputTokens": 4,
+    "latencyMs": 1072
+  },
+  {
+    "questionId": "q11",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2382,
+    "outputTokens": 2,
+    "latencyMs": 1142
+  },
+  {
+    "questionId": "q11",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2855,
+    "outputTokens": 4,
+    "latencyMs": 991
+  },
+  {
+    "questionId": "q11",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6317,
+    "outputTokens": 2,
+    "latencyMs": 1339
+  },
+  {
+    "questionId": "q11",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6364,
+    "outputTokens": 4,
+    "latencyMs": 1117
+  },
+  {
+    "questionId": "q11",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5013,
+    "outputTokens": 2,
+    "latencyMs": 2483
+  },
+  {
+    "questionId": "q11",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5759,
+    "outputTokens": 4,
+    "latencyMs": 1187
+  },
+  {
+    "questionId": "q12",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "allan21@gmail.com",
+    "actual": "allan21@gmail.com",
+    "correct": true,
+    "inputTokens": 6390,
+    "outputTokens": 5,
+    "latencyMs": 1827
+  },
+  {
+    "questionId": "q12",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "allan21@gmail.com",
+    "actual": "allan21@gmail.com",
+    "correct": true,
+    "inputTokens": 7867,
+    "outputTokens": 9,
+    "latencyMs": 1121
+  },
+  {
+    "questionId": "q12",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "allan21@gmail.com",
+    "actual": "allan21@gmail.com",
+    "correct": true,
+    "inputTokens": 2527,
+    "outputTokens": 5,
+    "latencyMs": 1373
+  },
+  {
+    "questionId": "q12",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "allan21@gmail.com",
+    "actual": "allan21@gmail.com",
+    "correct": true,
+    "inputTokens": 2979,
+    "outputTokens": 9,
+    "latencyMs": 1284
+  },
+  {
+    "questionId": "q12",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "allan21@gmail.com",
+    "actual": "allan21@gmail.com",
+    "correct": true,
+    "inputTokens": 2381,
+    "outputTokens": 5,
+    "latencyMs": 1751
+  },
+  {
+    "questionId": "q12",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "allan21@gmail.com",
+    "actual": "allan21@gmail.com",
+    "correct": true,
+    "inputTokens": 2853,
+    "outputTokens": 9,
+    "latencyMs": 1140
+  },
+  {
+    "questionId": "q12",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "allan21@gmail.com",
+    "actual": "allan21@gmail.com",
+    "correct": true,
+    "inputTokens": 6316,
+    "outputTokens": 5,
+    "latencyMs": 1624
+  },
+  {
+    "questionId": "q12",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "allan21@gmail.com",
+    "actual": "allan21@gmail.com",
+    "correct": true,
+    "inputTokens": 6362,
+    "outputTokens": 9,
+    "latencyMs": 1071
+  },
+  {
+    "questionId": "q12",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "allan21@gmail.com",
+    "actual": "allan21@gmail.com",
+    "correct": true,
+    "inputTokens": 5012,
+    "outputTokens": 5,
+    "latencyMs": 1970
+  },
+  {
+    "questionId": "q12",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "allan21@gmail.com",
+    "actual": "allan21@gmail.com",
+    "correct": true,
+    "inputTokens": 5757,
+    "outputTokens": 9,
+    "latencyMs": 1437
+  },
+  {
+    "questionId": "q13",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "145843",
+    "actual": "145843",
+    "correct": true,
+    "inputTokens": 6389,
+    "outputTokens": 3,
+    "latencyMs": 1263
+  },
+  {
+    "questionId": "q13",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "145843",
+    "actual": "145843",
+    "correct": true,
+    "inputTokens": 7868,
+    "outputTokens": 6,
+    "latencyMs": 1277
+  },
+  {
+    "questionId": "q13",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "145843",
+    "actual": "145843",
+    "correct": true,
+    "inputTokens": 2526,
+    "outputTokens": 3,
+    "latencyMs": 1151
+  },
+  {
+    "questionId": "q13",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "145843",
+    "actual": "145843",
+    "correct": true,
+    "inputTokens": 2980,
+    "outputTokens": 6,
+    "latencyMs": 1260
+  },
+  {
+    "questionId": "q13",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "145843",
+    "actual": "145843",
+    "correct": true,
+    "inputTokens": 2380,
+    "outputTokens": 3,
+    "latencyMs": 1071
+  },
+  {
+    "questionId": "q13",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "145843",
+    "actual": "145843",
+    "correct": true,
+    "inputTokens": 2854,
+    "outputTokens": 6,
+    "latencyMs": 891
+  },
+  {
+    "questionId": "q13",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "145843",
+    "actual": "145843",
+    "correct": true,
+    "inputTokens": 6315,
+    "outputTokens": 3,
+    "latencyMs": 1548
+  },
+  {
+    "questionId": "q13",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "145843",
+    "actual": "145843",
+    "correct": true,
+    "inputTokens": 6363,
+    "outputTokens": 6,
+    "latencyMs": 1456
+  },
+  {
+    "questionId": "q13",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "145843",
+    "actual": "145843",
+    "correct": true,
+    "inputTokens": 5011,
+    "outputTokens": 3,
+    "latencyMs": 1268
+  },
+  {
+    "questionId": "q13",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "145843",
+    "actual": "145843",
+    "correct": true,
+    "inputTokens": 5758,
+    "outputTokens": 6,
+    "latencyMs": 1205
+  },
+  {
+    "questionId": "q14",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6390,
+    "outputTokens": 2,
+    "latencyMs": 1310
+  },
+  {
+    "questionId": "q14",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 7868,
+    "outputTokens": 4,
+    "latencyMs": 1071
+  },
+  {
+    "questionId": "q14",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2527,
+    "outputTokens": 2,
+    "latencyMs": 895
+  },
+  {
+    "questionId": "q14",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2980,
+    "outputTokens": 4,
+    "latencyMs": 1020
+  },
+  {
+    "questionId": "q14",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2381,
+    "outputTokens": 2,
+    "latencyMs": 1168
+  },
+  {
+    "questionId": "q14",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2854,
+    "outputTokens": 4,
+    "latencyMs": 977
+  },
+  {
+    "questionId": "q14",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Operations",
+    "correct": false,
+    "inputTokens": 6316,
+    "outputTokens": 2,
+    "latencyMs": 1370
+  },
+  {
+    "questionId": "q14",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6363,
+    "outputTokens": 4,
+    "latencyMs": 1508
+  },
+  {
+    "questionId": "q14",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5012,
+    "outputTokens": 2,
+    "latencyMs": 3622
+  },
+  {
+    "questionId": "q14",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5758,
+    "outputTokens": 4,
+    "latencyMs": 1249
+  },
+  {
+    "questionId": "q15",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "alexandria61@gmail.com",
+    "actual": "alexandria61@gmail.com",
+    "correct": true,
+    "inputTokens": 6391,
+    "outputTokens": 7,
+    "latencyMs": 3269
+  },
+  {
+    "questionId": "q15",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "alexandria61@gmail.com",
+    "actual": "alexandria61@gmail.com",
+    "correct": true,
+    "inputTokens": 7869,
+    "outputTokens": 9,
+    "latencyMs": 1538
+  },
+  {
+    "questionId": "q15",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "alexandria61@gmail.com",
+    "actual": "alexandria61@gmail.com",
+    "correct": true,
+    "inputTokens": 2528,
+    "outputTokens": 7,
+    "latencyMs": 1413
+  },
+  {
+    "questionId": "q15",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "alexandria61@gmail.com",
+    "actual": "alexandria61@gmail.com",
+    "correct": true,
+    "inputTokens": 2981,
+    "outputTokens": 9,
+    "latencyMs": 1027
+  },
+  {
+    "questionId": "q15",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "alexandria61@gmail.com",
+    "actual": "alexandria61@gmail.com",
+    "correct": true,
+    "inputTokens": 2382,
+    "outputTokens": 7,
+    "latencyMs": 1257
+  },
+  {
+    "questionId": "q15",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "alexandria61@gmail.com",
+    "actual": "alexandria61@gmail.com",
+    "correct": true,
+    "inputTokens": 2855,
+    "outputTokens": 9,
+    "latencyMs": 1169
+  },
+  {
+    "questionId": "q15",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "alexandria61@gmail.com",
+    "actual": "alexandria61@gmail.com",
+    "correct": true,
+    "inputTokens": 6317,
+    "outputTokens": 7,
+    "latencyMs": 1464
+  },
+  {
+    "questionId": "q15",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "alexandria61@gmail.com",
+    "actual": "alexandria61@gmail.com",
+    "correct": true,
+    "inputTokens": 6364,
+    "outputTokens": 9,
+    "latencyMs": 1799
+  },
+  {
+    "questionId": "q15",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "alexandria61@gmail.com",
+    "actual": "alexandria61@gmail.com",
+    "correct": true,
+    "inputTokens": 5013,
+    "outputTokens": 7,
+    "latencyMs": 1616
+  },
+  {
+    "questionId": "q15",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "alexandria61@gmail.com",
+    "actual": "alexandria61@gmail.com",
+    "correct": true,
+    "inputTokens": 5759,
+    "outputTokens": 9,
+    "latencyMs": 1349
+  },
+  {
+    "questionId": "q16",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "89436",
+    "actual": "89436",
+    "correct": true,
+    "inputTokens": 6390,
+    "outputTokens": 3,
+    "latencyMs": 1298
+  },
+  {
+    "questionId": "q16",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "89436",
+    "actual": "89436",
+    "correct": true,
+    "inputTokens": 7870,
+    "outputTokens": 6,
+    "latencyMs": 1115
+  },
+  {
+    "questionId": "q16",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "89436",
+    "actual": "89436",
+    "correct": true,
+    "inputTokens": 2527,
+    "outputTokens": 3,
+    "latencyMs": 1180
+  },
+  {
+    "questionId": "q16",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "89436",
+    "actual": "89436",
+    "correct": true,
+    "inputTokens": 2982,
+    "outputTokens": 6,
+    "latencyMs": 1110
+  },
+  {
+    "questionId": "q16",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "89436",
+    "actual": "89436",
+    "correct": true,
+    "inputTokens": 2381,
+    "outputTokens": 3,
+    "latencyMs": 1235
+  },
+  {
+    "questionId": "q16",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "89436",
+    "actual": "89436",
+    "correct": true,
+    "inputTokens": 2856,
+    "outputTokens": 6,
+    "latencyMs": 1228
+  },
+  {
+    "questionId": "q16",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "89436",
+    "actual": "89436",
+    "correct": true,
+    "inputTokens": 6316,
+    "outputTokens": 3,
+    "latencyMs": 1832
+  },
+  {
+    "questionId": "q16",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "89436",
+    "actual": "89436",
+    "correct": true,
+    "inputTokens": 6365,
+    "outputTokens": 6,
+    "latencyMs": 1401
+  },
+  {
+    "questionId": "q16",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "89436",
+    "actual": "89436",
+    "correct": true,
+    "inputTokens": 5012,
+    "outputTokens": 3,
+    "latencyMs": 933
+  },
+  {
+    "questionId": "q16",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "89436",
+    "actual": "89436",
+    "correct": true,
+    "inputTokens": 5760,
+    "outputTokens": 6,
+    "latencyMs": 1570
+  },
+  {
+    "questionId": "q17",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6393,
+    "outputTokens": 2,
+    "latencyMs": 1221
+  },
+  {
+    "questionId": "q17",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 7872,
+    "outputTokens": 4,
+    "latencyMs": 1293
+  },
+  {
+    "questionId": "q17",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2530,
+    "outputTokens": 2,
+    "latencyMs": 1147
+  },
+  {
+    "questionId": "q17",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2984,
+    "outputTokens": 4,
+    "latencyMs": 923
+  },
+  {
+    "questionId": "q17",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2384,
+    "outputTokens": 2,
+    "latencyMs": 1180
+  },
+  {
+    "questionId": "q17",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2858,
+    "outputTokens": 4,
+    "latencyMs": 1025
+  },
+  {
+    "questionId": "q17",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6319,
+    "outputTokens": 2,
+    "latencyMs": 1748
+  },
+  {
+    "questionId": "q17",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6367,
+    "outputTokens": 4,
+    "latencyMs": 1188
+  },
+  {
+    "questionId": "q17",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5015,
+    "outputTokens": 2,
+    "latencyMs": 1452
+  },
+  {
+    "questionId": "q17",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5762,
+    "outputTokens": 4,
+    "latencyMs": 1329
+  },
+  {
+    "questionId": "q18",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "kelvin54@yahoo.com",
+    "actual": "kelvin54@yahoo.com",
+    "correct": true,
+    "inputTokens": 6391,
+    "outputTokens": 6,
+    "latencyMs": 768
+  },
+  {
+    "questionId": "q18",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "kelvin54@yahoo.com",
+    "actual": "kelvin54@yahoo.com",
+    "correct": true,
+    "inputTokens": 7871,
+    "outputTokens": 10,
+    "latencyMs": 1150
+  },
+  {
+    "questionId": "q18",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "kelvin54@yahoo.com",
+    "actual": "kelvin54@yahoo.com",
+    "correct": true,
+    "inputTokens": 2528,
+    "outputTokens": 6,
+    "latencyMs": 1501
+  },
+  {
+    "questionId": "q18",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "kelvin54@yahoo.com",
+    "actual": "kelvin54@yahoo.com",
+    "correct": true,
+    "inputTokens": 2983,
+    "outputTokens": 10,
+    "latencyMs": 1201
+  },
+  {
+    "questionId": "q18",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "kelvin54@yahoo.com",
+    "actual": "kelvin54@yahoo.com",
+    "correct": true,
+    "inputTokens": 2382,
+    "outputTokens": 6,
+    "latencyMs": 1604
+  },
+  {
+    "questionId": "q18",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "kelvin54@yahoo.com",
+    "actual": "kelvin54@yahoo.com",
+    "correct": true,
+    "inputTokens": 2857,
+    "outputTokens": 10,
+    "latencyMs": 1060
+  },
+  {
+    "questionId": "q18",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "kelvin54@yahoo.com",
+    "actual": "kelvin54@yahoo.com",
+    "correct": true,
+    "inputTokens": 6317,
+    "outputTokens": 6,
+    "latencyMs": 1350
+  },
+  {
+    "questionId": "q18",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "kelvin54@yahoo.com",
+    "actual": "kelvin54@yahoo.com",
+    "correct": true,
+    "inputTokens": 6366,
+    "outputTokens": 10,
+    "latencyMs": 1154
+  },
+  {
+    "questionId": "q18",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "kelvin54@yahoo.com",
+    "actual": "kelvin54@yahoo.com",
+    "correct": true,
+    "inputTokens": 5013,
+    "outputTokens": 6,
+    "latencyMs": 1199
+  },
+  {
+    "questionId": "q18",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "kelvin54@yahoo.com",
+    "actual": "kelvin54@yahoo.com",
+    "correct": true,
+    "inputTokens": 5761,
+    "outputTokens": 10,
+    "latencyMs": 1216
+  },
+  {
+    "questionId": "q19",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "143365",
+    "actual": "143365",
+    "correct": true,
+    "inputTokens": 6391,
+    "outputTokens": 3,
+    "latencyMs": 1412
+  },
+  {
+    "questionId": "q19",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "143365",
+    "actual": "143365",
+    "correct": true,
+    "inputTokens": 7872,
+    "outputTokens": 6,
+    "latencyMs": 1908
+  },
+  {
+    "questionId": "q19",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "143365",
+    "actual": "143365",
+    "correct": true,
+    "inputTokens": 2528,
+    "outputTokens": 3,
+    "latencyMs": 1366
+  },
+  {
+    "questionId": "q19",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "143365",
+    "actual": "143365",
+    "correct": true,
+    "inputTokens": 2984,
+    "outputTokens": 6,
+    "latencyMs": 1054
+  },
+  {
+    "questionId": "q19",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "143365",
+    "actual": "143365",
+    "correct": true,
+    "inputTokens": 2382,
+    "outputTokens": 3,
+    "latencyMs": 1121
+  },
+  {
+    "questionId": "q19",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "143365",
+    "actual": "143365",
+    "correct": true,
+    "inputTokens": 2858,
+    "outputTokens": 6,
+    "latencyMs": 1262
+  },
+  {
+    "questionId": "q19",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "143365",
+    "actual": "143365",
+    "correct": true,
+    "inputTokens": 6317,
+    "outputTokens": 3,
+    "latencyMs": 4632
+  },
+  {
+    "questionId": "q19",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "143365",
+    "actual": "143365",
+    "correct": true,
+    "inputTokens": 6367,
+    "outputTokens": 6,
+    "latencyMs": 1118
+  },
+  {
+    "questionId": "q19",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "143365",
+    "actual": "143365",
+    "correct": true,
+    "inputTokens": 5013,
+    "outputTokens": 3,
+    "latencyMs": 928
+  },
+  {
+    "questionId": "q19",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "143365",
+    "actual": "143365",
+    "correct": true,
+    "inputTokens": 5762,
+    "outputTokens": 6,
+    "latencyMs": 1191
+  },
+  {
+    "questionId": "q20",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6390,
+    "outputTokens": 2,
+    "latencyMs": 1053
+  },
+  {
+    "questionId": "q20",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 7868,
+    "outputTokens": 4,
+    "latencyMs": 1096
+  },
+  {
+    "questionId": "q20",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2527,
+    "outputTokens": 2,
+    "latencyMs": 1784
+  },
+  {
+    "questionId": "q20",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2980,
+    "outputTokens": 4,
+    "latencyMs": 1093
+  },
+  {
+    "questionId": "q20",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2381,
+    "outputTokens": 2,
+    "latencyMs": 1335
+  },
+  {
+    "questionId": "q20",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2854,
+    "outputTokens": 4,
+    "latencyMs": 1546
+  },
+  {
+    "questionId": "q20",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6316,
+    "outputTokens": 2,
+    "latencyMs": 1293
+  },
+  {
+    "questionId": "q20",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6363,
+    "outputTokens": 4,
+    "latencyMs": 1230
+  },
+  {
+    "questionId": "q20",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5012,
+    "outputTokens": 2,
+    "latencyMs": 1467
+  },
+  {
+    "questionId": "q20",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5758,
+    "outputTokens": 4,
+    "latencyMs": 1370
+  },
+  {
+    "questionId": "q21",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "dean19@gmail.com",
+    "actual": "dean19@gmail.com",
+    "correct": true,
+    "inputTokens": 6394,
+    "outputTokens": 6,
+    "latencyMs": 5026
+  },
+  {
+    "questionId": "q21",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "dean19@gmail.com",
+    "actual": "dean19@gmail.com",
+    "correct": true,
+    "inputTokens": 7876,
+    "outputTokens": 9,
+    "latencyMs": 1786
+  },
+  {
+    "questionId": "q21",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "dean19@gmail.com",
+    "actual": "dean19@gmail.com",
+    "correct": true,
+    "inputTokens": 2531,
+    "outputTokens": 6,
+    "latencyMs": 826
+  },
+  {
+    "questionId": "q21",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "dean19@gmail.com",
+    "actual": "dean19@gmail.com",
+    "correct": true,
+    "inputTokens": 2988,
+    "outputTokens": 9,
+    "latencyMs": 909
+  },
+  {
+    "questionId": "q21",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "dean19@gmail.com",
+    "actual": "dean19@gmail.com",
+    "correct": true,
+    "inputTokens": 2385,
+    "outputTokens": 6,
+    "latencyMs": 1120
+  },
+  {
+    "questionId": "q21",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "dean19@gmail.com",
+    "actual": "dean19@gmail.com",
+    "correct": true,
+    "inputTokens": 2862,
+    "outputTokens": 9,
+    "latencyMs": 996
+  },
+  {
+    "questionId": "q21",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "dean19@gmail.com",
+    "actual": "dean19@gmail.com",
+    "correct": true,
+    "inputTokens": 6320,
+    "outputTokens": 6,
+    "latencyMs": 1639
+  },
+  {
+    "questionId": "q21",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "dean19@gmail.com",
+    "actual": "dean19@gmail.com",
+    "correct": true,
+    "inputTokens": 6371,
+    "outputTokens": 9,
+    "latencyMs": 1299
+  },
+  {
+    "questionId": "q21",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "dean19@gmail.com",
+    "actual": "dean19@gmail.com",
+    "correct": true,
+    "inputTokens": 5016,
+    "outputTokens": 6,
+    "latencyMs": 1151
+  },
+  {
+    "questionId": "q21",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "dean19@gmail.com",
+    "actual": "dean19@gmail.com",
+    "correct": true,
+    "inputTokens": 5766,
+    "outputTokens": 9,
+    "latencyMs": 1246
+  },
+  {
+    "questionId": "q22",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "111314",
+    "actual": "111314",
+    "correct": true,
+    "inputTokens": 6392,
+    "outputTokens": 3,
+    "latencyMs": 1838
+  },
+  {
+    "questionId": "q22",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "111314",
+    "actual": "111314",
+    "correct": true,
+    "inputTokens": 7871,
+    "outputTokens": 6,
+    "latencyMs": 1191
+  },
+  {
+    "questionId": "q22",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "111314",
+    "actual": "111314",
+    "correct": true,
+    "inputTokens": 2529,
+    "outputTokens": 3,
+    "latencyMs": 980
+  },
+  {
+    "questionId": "q22",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "111314",
+    "actual": "111314",
+    "correct": true,
+    "inputTokens": 2983,
+    "outputTokens": 6,
+    "latencyMs": 1299
+  },
+  {
+    "questionId": "q22",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "111314",
+    "actual": "111314",
+    "correct": true,
+    "inputTokens": 2383,
+    "outputTokens": 3,
+    "latencyMs": 1027
+  },
+  {
+    "questionId": "q22",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "111314",
+    "actual": "111314",
+    "correct": true,
+    "inputTokens": 2857,
+    "outputTokens": 6,
+    "latencyMs": 1433
+  },
+  {
+    "questionId": "q22",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "111314",
+    "actual": "111314",
+    "correct": true,
+    "inputTokens": 6318,
+    "outputTokens": 3,
+    "latencyMs": 2256
+  },
+  {
+    "questionId": "q22",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "111314",
+    "actual": "111314",
+    "correct": true,
+    "inputTokens": 6366,
+    "outputTokens": 6,
+    "latencyMs": 1091
+  },
+  {
+    "questionId": "q22",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "111314",
+    "actual": "111314",
+    "correct": true,
+    "inputTokens": 5014,
+    "outputTokens": 3,
+    "latencyMs": 1288
+  },
+  {
+    "questionId": "q22",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "111314",
+    "actual": "111314",
+    "correct": true,
+    "inputTokens": 5761,
+    "outputTokens": 6,
+    "latencyMs": 1306
+  },
+  {
+    "questionId": "q23",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6389,
+    "outputTokens": 2,
+    "latencyMs": 1951
+  },
+  {
+    "questionId": "q23",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 7868,
+    "outputTokens": 4,
+    "latencyMs": 1440
+  },
+  {
+    "questionId": "q23",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2526,
+    "outputTokens": 2,
+    "latencyMs": 978
+  },
+  {
+    "questionId": "q23",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2980,
+    "outputTokens": 4,
+    "latencyMs": 1385
+  },
+  {
+    "questionId": "q23",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2380,
+    "outputTokens": 2,
+    "latencyMs": 2311
+  },
+  {
+    "questionId": "q23",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2854,
+    "outputTokens": 4,
+    "latencyMs": 1066
+  },
+  {
+    "questionId": "q23",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6315,
+    "outputTokens": 2,
+    "latencyMs": 1914
+  },
+  {
+    "questionId": "q23",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6363,
+    "outputTokens": 4,
+    "latencyMs": 1596
+  },
+  {
+    "questionId": "q23",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5011,
+    "outputTokens": 2,
+    "latencyMs": 1820
+  },
+  {
+    "questionId": "q23",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5758,
+    "outputTokens": 4,
+    "latencyMs": 1067
+  },
+  {
+    "questionId": "q24",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "laurel54@yahoo.com",
+    "actual": "laurel54@yahoo.com",
+    "correct": true,
+    "inputTokens": 6391,
+    "outputTokens": 6,
+    "latencyMs": 2594
+  },
+  {
+    "questionId": "q24",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "laurel54@yahoo.com",
+    "actual": "laurel54@yahoo.com",
+    "correct": true,
+    "inputTokens": 7869,
+    "outputTokens": 10,
+    "latencyMs": 1139
+  },
+  {
+    "questionId": "q24",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "laurel54@yahoo.com",
+    "actual": "laurel54@yahoo.com",
+    "correct": true,
+    "inputTokens": 2528,
+    "outputTokens": 6,
+    "latencyMs": 1225
+  },
+  {
+    "questionId": "q24",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "laurel54@yahoo.com",
+    "actual": "laurel54@yahoo.com",
+    "correct": true,
+    "inputTokens": 2981,
+    "outputTokens": 10,
+    "latencyMs": 1082
+  },
+  {
+    "questionId": "q24",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "laurel54@yahoo.com",
+    "actual": "laurel54@yahoo.com",
+    "correct": true,
+    "inputTokens": 2382,
+    "outputTokens": 6,
+    "latencyMs": 4857
+  },
+  {
+    "questionId": "q24",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "laurel54@yahoo.com",
+    "actual": "laurel54@yahoo.com",
+    "correct": true,
+    "inputTokens": 2855,
+    "outputTokens": 10,
+    "latencyMs": 1082
+  },
+  {
+    "questionId": "q24",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "laurel54@yahoo.com",
+    "actual": "laurel54@yahoo.com",
+    "correct": true,
+    "inputTokens": 6317,
+    "outputTokens": 6,
+    "latencyMs": 1272
+  },
+  {
+    "questionId": "q24",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "laurel54@yahoo.com",
+    "actual": "laurel54@yahoo.com",
+    "correct": true,
+    "inputTokens": 6364,
+    "outputTokens": 10,
+    "latencyMs": 1201
+  },
+  {
+    "questionId": "q24",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "laurel54@yahoo.com",
+    "actual": "laurel54@yahoo.com",
+    "correct": true,
+    "inputTokens": 5013,
+    "outputTokens": 6,
+    "latencyMs": 1197
+  },
+  {
+    "questionId": "q24",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "laurel54@yahoo.com",
+    "actual": "laurel54@yahoo.com",
+    "correct": true,
+    "inputTokens": 5759,
+    "outputTokens": 10,
+    "latencyMs": 1198
+  },
+  {
+    "questionId": "q25",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "89553",
+    "actual": "89553",
+    "correct": true,
+    "inputTokens": 6392,
+    "outputTokens": 3,
+    "latencyMs": 1085
+  },
+  {
+    "questionId": "q25",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "89553",
+    "actual": "89553",
+    "correct": true,
+    "inputTokens": 7873,
+    "outputTokens": 6,
+    "latencyMs": 1102
+  },
+  {
+    "questionId": "q25",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "89553",
+    "actual": "89553",
+    "correct": true,
+    "inputTokens": 2529,
+    "outputTokens": 3,
+    "latencyMs": 1350
+  },
+  {
+    "questionId": "q25",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "89553",
+    "actual": "89553",
+    "correct": true,
+    "inputTokens": 2985,
+    "outputTokens": 6,
+    "latencyMs": 1300
+  },
+  {
+    "questionId": "q25",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "89553",
+    "actual": "89553",
+    "correct": true,
+    "inputTokens": 2383,
+    "outputTokens": 3,
+    "latencyMs": 998
+  },
+  {
+    "questionId": "q25",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "89553",
+    "actual": "89553",
+    "correct": true,
+    "inputTokens": 2859,
+    "outputTokens": 6,
+    "latencyMs": 972
+  },
+  {
+    "questionId": "q25",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "89553",
+    "actual": "89553",
+    "correct": true,
+    "inputTokens": 6318,
+    "outputTokens": 3,
+    "latencyMs": 1331
+  },
+  {
+    "questionId": "q25",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "89553",
+    "actual": "89553",
+    "correct": true,
+    "inputTokens": 6368,
+    "outputTokens": 6,
+    "latencyMs": 1027
+  },
+  {
+    "questionId": "q25",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "89553",
+    "actual": "89553",
+    "correct": true,
+    "inputTokens": 5014,
+    "outputTokens": 3,
+    "latencyMs": 1170
+  },
+  {
+    "questionId": "q25",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "89553",
+    "actual": "89553",
+    "correct": true,
+    "inputTokens": 5763,
+    "outputTokens": 6,
+    "latencyMs": 1074
+  },
+  {
+    "questionId": "q26",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6389,
+    "outputTokens": 2,
+    "latencyMs": 1862
+  },
+  {
+    "questionId": "q26",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 7866,
+    "outputTokens": 4,
+    "latencyMs": 1435
+  },
+  {
+    "questionId": "q26",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2526,
+    "outputTokens": 2,
+    "latencyMs": 989
+  },
+  {
+    "questionId": "q26",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2978,
+    "outputTokens": 4,
+    "latencyMs": 1035
+  },
+  {
+    "questionId": "q26",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2380,
+    "outputTokens": 2,
+    "latencyMs": 2157
+  },
+  {
+    "questionId": "q26",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2852,
+    "outputTokens": 4,
+    "latencyMs": 1094
+  },
+  {
+    "questionId": "q26",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6315,
+    "outputTokens": 2,
+    "latencyMs": 1912
+  },
+  {
+    "questionId": "q26",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6361,
+    "outputTokens": 4,
+    "latencyMs": 1364
+  },
+  {
+    "questionId": "q26",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5011,
+    "outputTokens": 2,
+    "latencyMs": 1435
+  },
+  {
+    "questionId": "q26",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5756,
+    "outputTokens": 4,
+    "latencyMs": 1082
+  },
+  {
+    "questionId": "q27",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "jayme.kertzmann77@gmail.com",
+    "actual": "jayme.kertzmann77@gmail.com",
+    "correct": true,
+    "inputTokens": 6392,
+    "outputTokens": 9,
+    "latencyMs": 1274
+  },
+  {
+    "questionId": "q27",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "jayme.kertzmann77@gmail.com",
+    "actual": "jayme.kertzmann77@gmail.com",
+    "correct": true,
+    "inputTokens": 7871,
+    "outputTokens": 14,
+    "latencyMs": 1130
+  },
+  {
+    "questionId": "q27",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "jayme.kertzmann77@gmail.com",
+    "actual": "jayme.kertzmann77@gmail.com",
+    "correct": true,
+    "inputTokens": 2529,
+    "outputTokens": 9,
+    "latencyMs": 1795
+  },
+  {
+    "questionId": "q27",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "jayme.kertzmann77@gmail.com",
+    "actual": "jayme.kertzmann77@gmail.com",
+    "correct": true,
+    "inputTokens": 2983,
+    "outputTokens": 14,
+    "latencyMs": 1309
+  },
+  {
+    "questionId": "q27",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "jayme.kertzmann77@gmail.com",
+    "actual": "jayme.kertzmann77@gmail.com",
+    "correct": true,
+    "inputTokens": 2383,
+    "outputTokens": 9,
+    "latencyMs": 1406
+  },
+  {
+    "questionId": "q27",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "jayme.kertzmann77@gmail.com",
+    "actual": "jayme.kertzmann77@gmail.com",
+    "correct": true,
+    "inputTokens": 2857,
+    "outputTokens": 14,
+    "latencyMs": 1398
+  },
+  {
+    "questionId": "q27",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "jayme.kertzmann77@gmail.com",
+    "actual": "jayme.kertzmann77@gmail.com",
+    "correct": true,
+    "inputTokens": 6318,
+    "outputTokens": 9,
+    "latencyMs": 1114
+  },
+  {
+    "questionId": "q27",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "jayme.kertzmann77@gmail.com",
+    "actual": "jayme.kertzmann77@gmail.com",
+    "correct": true,
+    "inputTokens": 6366,
+    "outputTokens": 14,
+    "latencyMs": 1251
+  },
+  {
+    "questionId": "q27",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "jayme.kertzmann77@gmail.com",
+    "actual": "jayme.kertzmann77@gmail.com",
+    "correct": true,
+    "inputTokens": 5014,
+    "outputTokens": 9,
+    "latencyMs": 1941
+  },
+  {
+    "questionId": "q27",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "jayme.kertzmann77@gmail.com",
+    "actual": "jayme.kertzmann77@gmail.com",
+    "correct": true,
+    "inputTokens": 5761,
+    "outputTokens": 14,
+    "latencyMs": 1218
+  },
+  {
+    "questionId": "q28",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "104053",
+    "actual": "104053",
+    "correct": true,
+    "inputTokens": 6391,
+    "outputTokens": 3,
+    "latencyMs": 1395
+  },
+  {
+    "questionId": "q28",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "104053",
+    "actual": "104053",
+    "correct": true,
+    "inputTokens": 7871,
+    "outputTokens": 6,
+    "latencyMs": 1342
+  },
+  {
+    "questionId": "q28",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "104053",
+    "actual": "104053",
+    "correct": true,
+    "inputTokens": 2528,
+    "outputTokens": 3,
+    "latencyMs": 919
+  },
+  {
+    "questionId": "q28",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "104053",
+    "actual": "104053",
+    "correct": true,
+    "inputTokens": 2983,
+    "outputTokens": 6,
+    "latencyMs": 1187
+  },
+  {
+    "questionId": "q28",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "104053",
+    "actual": "104053",
+    "correct": true,
+    "inputTokens": 2382,
+    "outputTokens": 3,
+    "latencyMs": 1131
+  },
+  {
+    "questionId": "q28",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "104053",
+    "actual": "104053",
+    "correct": true,
+    "inputTokens": 2857,
+    "outputTokens": 6,
+    "latencyMs": 1191
+  },
+  {
+    "questionId": "q28",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "104053",
+    "actual": "104053",
+    "correct": true,
+    "inputTokens": 6317,
+    "outputTokens": 3,
+    "latencyMs": 1435
+  },
+  {
+    "questionId": "q28",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "104053",
+    "actual": "104053",
+    "correct": true,
+    "inputTokens": 6366,
+    "outputTokens": 6,
+    "latencyMs": 1095
+  },
+  {
+    "questionId": "q28",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "104053",
+    "actual": "104053",
+    "correct": true,
+    "inputTokens": 5013,
+    "outputTokens": 3,
+    "latencyMs": 4588
+  },
+  {
+    "questionId": "q28",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "104053",
+    "actual": "104053",
+    "correct": true,
+    "inputTokens": 5761,
+    "outputTokens": 6,
+    "latencyMs": 1291
+  },
+  {
+    "questionId": "q29",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6392,
+    "outputTokens": 2,
+    "latencyMs": 1688
+  },
+  {
+    "questionId": "q29",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 7872,
+    "outputTokens": 4,
+    "latencyMs": 1301
+  },
+  {
+    "questionId": "q29",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2529,
+    "outputTokens": 2,
+    "latencyMs": 1914
+  },
+  {
+    "questionId": "q29",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2984,
+    "outputTokens": 4,
+    "latencyMs": 1447
+  },
+  {
+    "questionId": "q29",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2383,
+    "outputTokens": 2,
+    "latencyMs": 1725
+  },
+  {
+    "questionId": "q29",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2858,
+    "outputTokens": 4,
+    "latencyMs": 923
+  },
+  {
+    "questionId": "q29",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6318,
+    "outputTokens": 2,
+    "latencyMs": 879
+  },
+  {
+    "questionId": "q29",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6367,
+    "outputTokens": 4,
+    "latencyMs": 1322
+  },
+  {
+    "questionId": "q29",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5014,
+    "outputTokens": 2,
+    "latencyMs": 1394
+  },
+  {
+    "questionId": "q29",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5762,
+    "outputTokens": 4,
+    "latencyMs": 1008
+  },
+  {
+    "questionId": "q30",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "carley.bauch@yahoo.com",
+    "actual": "carley.bauch@yahoo.com",
+    "correct": true,
+    "inputTokens": 6391,
+    "outputTokens": 7,
+    "latencyMs": 894
+  },
+  {
+    "questionId": "q30",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "carley.bauch@yahoo.com",
+    "actual": "carley.bauch@yahoo.com",
+    "correct": true,
+    "inputTokens": 7869,
+    "outputTokens": 12,
+    "latencyMs": 1220
+  },
+  {
+    "questionId": "q30",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "carley.bauch@yahoo.com",
+    "actual": "carley.bauch@yahoo.com",
+    "correct": true,
+    "inputTokens": 2528,
+    "outputTokens": 7,
+    "latencyMs": 2225
+  },
+  {
+    "questionId": "q30",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "carley.bauch@yahoo.com",
+    "actual": "carley.bauch@yahoo.com",
+    "correct": true,
+    "inputTokens": 2981,
+    "outputTokens": 12,
+    "latencyMs": 1282
+  },
+  {
+    "questionId": "q30",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "carley.bauch@yahoo.com",
+    "actual": "carley.bauch@yahoo.com",
+    "correct": true,
+    "inputTokens": 2382,
+    "outputTokens": 7,
+    "latencyMs": 1414
+  },
+  {
+    "questionId": "q30",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "carley.bauch@yahoo.com",
+    "actual": "carley.bauch@yahoo.com",
+    "correct": true,
+    "inputTokens": 2855,
+    "outputTokens": 12,
+    "latencyMs": 1686
+  },
+  {
+    "questionId": "q30",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "carley.bauch@yahoo.com",
+    "actual": "carley.bauch@yahoo.com",
+    "correct": true,
+    "inputTokens": 6317,
+    "outputTokens": 7,
+    "latencyMs": 1113
+  },
+  {
+    "questionId": "q30",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "carley.bauch@yahoo.com",
+    "actual": "carley.bauch@yahoo.com",
+    "correct": true,
+    "inputTokens": 6364,
+    "outputTokens": 12,
+    "latencyMs": 1089
+  },
+  {
+    "questionId": "q30",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "carley.bauch@yahoo.com",
+    "actual": "carley.bauch@yahoo.com",
+    "correct": true,
+    "inputTokens": 5013,
+    "outputTokens": 7,
+    "latencyMs": 949
+  },
+  {
+    "questionId": "q30",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "carley.bauch@yahoo.com",
+    "actual": "carley.bauch@yahoo.com",
+    "correct": true,
+    "inputTokens": 5759,
+    "outputTokens": 12,
+    "latencyMs": 1273
+  },
+  {
+    "questionId": "q31",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "142029",
+    "actual": "142029",
+    "correct": true,
+    "inputTokens": 6394,
+    "outputTokens": 3,
+    "latencyMs": 4741
+  },
+  {
+    "questionId": "q31",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "142029",
+    "actual": "142029",
+    "correct": true,
+    "inputTokens": 7874,
+    "outputTokens": 6,
+    "latencyMs": 1132
+  },
+  {
+    "questionId": "q31",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "142029",
+    "actual": "142029",
+    "correct": true,
+    "inputTokens": 2531,
+    "outputTokens": 3,
+    "latencyMs": 1184
+  },
+  {
+    "questionId": "q31",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "142029",
+    "actual": "142029",
+    "correct": true,
+    "inputTokens": 2986,
+    "outputTokens": 6,
+    "latencyMs": 1137
+  },
+  {
+    "questionId": "q31",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "142029",
+    "actual": "142029",
+    "correct": true,
+    "inputTokens": 2385,
+    "outputTokens": 3,
+    "latencyMs": 963
+  },
+  {
+    "questionId": "q31",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "142029",
+    "actual": "142029",
+    "correct": true,
+    "inputTokens": 2860,
+    "outputTokens": 6,
+    "latencyMs": 1096
+  },
+  {
+    "questionId": "q31",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "142029",
+    "actual": "142029",
+    "correct": true,
+    "inputTokens": 6320,
+    "outputTokens": 3,
+    "latencyMs": 1399
+  },
+  {
+    "questionId": "q31",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "142029",
+    "actual": "142029",
+    "correct": true,
+    "inputTokens": 6369,
+    "outputTokens": 6,
+    "latencyMs": 1594
+  },
+  {
+    "questionId": "q31",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "142029",
+    "actual": "142029",
+    "correct": true,
+    "inputTokens": 5016,
+    "outputTokens": 3,
+    "latencyMs": 1900
+  },
+  {
+    "questionId": "q31",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "142029",
+    "actual": "142029",
+    "correct": true,
+    "inputTokens": 5764,
+    "outputTokens": 6,
+    "latencyMs": 1274
+  },
+  {
+    "questionId": "q32",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Sales",
+    "correct": false,
+    "inputTokens": 6390,
+    "outputTokens": 2,
+    "latencyMs": 5224
+  },
+  {
+    "questionId": "q32",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 7869,
+    "outputTokens": 4,
+    "latencyMs": 1038
+  },
+  {
+    "questionId": "q32",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2527,
+    "outputTokens": 2,
+    "latencyMs": 1902
+  },
+  {
+    "questionId": "q32",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2981,
+    "outputTokens": 4,
+    "latencyMs": 1010
+  },
+  {
+    "questionId": "q32",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2381,
+    "outputTokens": 2,
+    "latencyMs": 3263
+  },
+  {
+    "questionId": "q32",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2855,
+    "outputTokens": 4,
+    "latencyMs": 871
+  },
+  {
+    "questionId": "q32",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Sales",
+    "correct": false,
+    "inputTokens": 6316,
+    "outputTokens": 2,
+    "latencyMs": 1278
+  },
+  {
+    "questionId": "q32",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6364,
+    "outputTokens": 4,
+    "latencyMs": 1048
+  },
+  {
+    "questionId": "q32",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Sales",
+    "correct": false,
+    "inputTokens": 5012,
+    "outputTokens": 2,
+    "latencyMs": 1271
+  },
+  {
+    "questionId": "q32",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5759,
+    "outputTokens": 4,
+    "latencyMs": 1075
+  },
+  {
+    "questionId": "q33",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "cheyenne_skiles@hotmail.com",
+    "actual": "cheyenne_skiles@hotmail.com",
+    "correct": true,
+    "inputTokens": 6394,
+    "outputTokens": 7,
+    "latencyMs": 1139
+  },
+  {
+    "questionId": "q33",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "cheyenne_skiles@hotmail.com",
+    "actual": "cheyenne_skiles@hotmail.com",
+    "correct": true,
+    "inputTokens": 7872,
+    "outputTokens": 14,
+    "latencyMs": 1319
+  },
+  {
+    "questionId": "q33",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "cheyenne_skiles@hotmail.com",
+    "actual": "cheyenne_skiles@hotmail.com",
+    "correct": true,
+    "inputTokens": 2531,
+    "outputTokens": 7,
+    "latencyMs": 1856
+  },
+  {
+    "questionId": "q33",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "cheyenne_skiles@hotmail.com",
+    "actual": "cheyenne_skiles@hotmail.com",
+    "correct": true,
+    "inputTokens": 2984,
+    "outputTokens": 14,
+    "latencyMs": 1393
+  },
+  {
+    "questionId": "q33",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "cheyenne_skiles@hotmail.com",
+    "actual": "cheyenne_skiles@hotmail.com",
+    "correct": true,
+    "inputTokens": 2385,
+    "outputTokens": 7,
+    "latencyMs": 1766
+  },
+  {
+    "questionId": "q33",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "cheyenne_skiles@hotmail.com",
+    "actual": "cheyenne_skiles@hotmail.com",
+    "correct": true,
+    "inputTokens": 2858,
+    "outputTokens": 14,
+    "latencyMs": 1609
+  },
+  {
+    "questionId": "q33",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "cheyenne_skiles@hotmail.com",
+    "actual": "cheyenne_skiles@hotmail.com",
+    "correct": true,
+    "inputTokens": 6320,
+    "outputTokens": 7,
+    "latencyMs": 1329
+  },
+  {
+    "questionId": "q33",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "cheyenne_skiles@hotmail.com",
+    "actual": "cheyenne_skiles@hotmail.com",
+    "correct": true,
+    "inputTokens": 6367,
+    "outputTokens": 14,
+    "latencyMs": 1178
+  },
+  {
+    "questionId": "q33",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "cheyenne_skiles@hotmail.com",
+    "actual": "cheyenne_skiles@hotmail.com",
+    "correct": true,
+    "inputTokens": 5016,
+    "outputTokens": 7,
+    "latencyMs": 1890
+  },
+  {
+    "questionId": "q33",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "cheyenne_skiles@hotmail.com",
+    "actual": "cheyenne_skiles@hotmail.com",
+    "correct": true,
+    "inputTokens": 5762,
+    "outputTokens": 14,
+    "latencyMs": 1326
+  },
+  {
+    "questionId": "q34",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "84650",
+    "actual": "84650",
+    "correct": true,
+    "inputTokens": 6392,
+    "outputTokens": 3,
+    "latencyMs": 1898
+  },
+  {
+    "questionId": "q34",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "84650",
+    "actual": "84650",
+    "correct": true,
+    "inputTokens": 7871,
+    "outputTokens": 6,
+    "latencyMs": 1074
+  },
+  {
+    "questionId": "q34",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "84650",
+    "actual": "84650",
+    "correct": true,
+    "inputTokens": 2529,
+    "outputTokens": 3,
+    "latencyMs": 1382
+  },
+  {
+    "questionId": "q34",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "84650",
+    "actual": "84650",
+    "correct": true,
+    "inputTokens": 2983,
+    "outputTokens": 6,
+    "latencyMs": 1060
+  },
+  {
+    "questionId": "q34",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "84650",
+    "actual": "84650",
+    "correct": true,
+    "inputTokens": 2383,
+    "outputTokens": 3,
+    "latencyMs": 1286
+  },
+  {
+    "questionId": "q34",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "84650",
+    "actual": "84650",
+    "correct": true,
+    "inputTokens": 2857,
+    "outputTokens": 6,
+    "latencyMs": 1591
+  },
+  {
+    "questionId": "q34",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "84650",
+    "actual": "84650",
+    "correct": true,
+    "inputTokens": 6318,
+    "outputTokens": 3,
+    "latencyMs": 2158
+  },
+  {
+    "questionId": "q34",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "84650",
+    "actual": "84650",
+    "correct": true,
+    "inputTokens": 6366,
+    "outputTokens": 6,
+    "latencyMs": 1532
+  },
+  {
+    "questionId": "q34",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "84650",
+    "actual": "84650",
+    "correct": true,
+    "inputTokens": 5014,
+    "outputTokens": 3,
+    "latencyMs": 1381
+  },
+  {
+    "questionId": "q34",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "84650",
+    "actual": "84650",
+    "correct": true,
+    "inputTokens": 5761,
+    "outputTokens": 6,
+    "latencyMs": 2262
+  },
+  {
+    "questionId": "q35",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6391,
+    "outputTokens": 2,
+    "latencyMs": 2664
+  },
+  {
+    "questionId": "q35",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 7871,
+    "outputTokens": 4,
+    "latencyMs": 1260
+  },
+  {
+    "questionId": "q35",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2528,
+    "outputTokens": 2,
+    "latencyMs": 1563
+  },
+  {
+    "questionId": "q35",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2983,
+    "outputTokens": 4,
+    "latencyMs": 1415
+  },
+  {
+    "questionId": "q35",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2382,
+    "outputTokens": 2,
+    "latencyMs": 1038
+  },
+  {
+    "questionId": "q35",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2857,
+    "outputTokens": 4,
+    "latencyMs": 1021
+  },
+  {
+    "questionId": "q35",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6317,
+    "outputTokens": 2,
+    "latencyMs": 4276
+  },
+  {
+    "questionId": "q35",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6366,
+    "outputTokens": 4,
+    "latencyMs": 1301
+  },
+  {
+    "questionId": "q35",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5013,
+    "outputTokens": 2,
+    "latencyMs": 1399
+  },
+  {
+    "questionId": "q35",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5761,
+    "outputTokens": 4,
+    "latencyMs": 1197
+  },
+  {
+    "questionId": "q36",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "macey.gottlieb5@yahoo.com",
+    "actual": "macey.gottlieb5@yahoo.com",
+    "correct": true,
+    "inputTokens": 6390,
+    "outputTokens": 9,
+    "latencyMs": 1390
+  },
+  {
+    "questionId": "q36",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "macey.gottlieb5@yahoo.com",
+    "actual": "macey.gottlieb5@yahoo.com",
+    "correct": true,
+    "inputTokens": 7869,
+    "outputTokens": 14,
+    "latencyMs": 1482
+  },
+  {
+    "questionId": "q36",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "macey.gottlieb5@yahoo.com",
+    "actual": "macey.gottlieb5@yahoo.com",
+    "correct": true,
+    "inputTokens": 2527,
+    "outputTokens": 9,
+    "latencyMs": 1754
+  },
+  {
+    "questionId": "q36",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "macey.gottlieb5@yahoo.com",
+    "actual": "macey.gottlieb5@yahoo.com",
+    "correct": true,
+    "inputTokens": 2981,
+    "outputTokens": 14,
+    "latencyMs": 1100
+  },
+  {
+    "questionId": "q36",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "macey.gottlieb5@yahoo.com",
+    "actual": "macey.gottlieb5@yahoo.com",
+    "correct": true,
+    "inputTokens": 2381,
+    "outputTokens": 9,
+    "latencyMs": 1421
+  },
+  {
+    "questionId": "q36",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "macey.gottlieb5@yahoo.com",
+    "actual": "macey.gottlieb5@yahoo.com",
+    "correct": true,
+    "inputTokens": 2855,
+    "outputTokens": 14,
+    "latencyMs": 2173
+  },
+  {
+    "questionId": "q36",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "macey.gottlieb5@yahoo.com",
+    "actual": "macey.gottlieb5@yahoo.com",
+    "correct": true,
+    "inputTokens": 6316,
+    "outputTokens": 9,
+    "latencyMs": 2911
+  },
+  {
+    "questionId": "q36",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "macey.gottlieb5@yahoo.com",
+    "actual": "macey.gottlieb5@yahoo.com",
+    "correct": true,
+    "inputTokens": 6364,
+    "outputTokens": 14,
+    "latencyMs": 1235
+  },
+  {
+    "questionId": "q36",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "macey.gottlieb5@yahoo.com",
+    "actual": "macey.gottlieb5@yahoo.com",
+    "correct": true,
+    "inputTokens": 5012,
+    "outputTokens": 9,
+    "latencyMs": 1303
+  },
+  {
+    "questionId": "q36",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "macey.gottlieb5@yahoo.com",
+    "actual": "macey.gottlieb5@yahoo.com",
+    "correct": true,
+    "inputTokens": 5759,
+    "outputTokens": 14,
+    "latencyMs": 1148
+  },
+  {
+    "questionId": "q37",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "89773",
+    "actual": "89773",
+    "correct": true,
+    "inputTokens": 6390,
+    "outputTokens": 3,
+    "latencyMs": 1430
+  },
+  {
+    "questionId": "q37",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "89773",
+    "actual": "89773",
+    "correct": true,
+    "inputTokens": 7868,
+    "outputTokens": 6,
+    "latencyMs": 1089
+  },
+  {
+    "questionId": "q37",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "89773",
+    "actual": "89773",
+    "correct": true,
+    "inputTokens": 2527,
+    "outputTokens": 3,
+    "latencyMs": 1059
+  },
+  {
+    "questionId": "q37",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "89773",
+    "actual": "89773",
+    "correct": true,
+    "inputTokens": 2980,
+    "outputTokens": 6,
+    "latencyMs": 1057
+  },
+  {
+    "questionId": "q37",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "89773",
+    "actual": "89773",
+    "correct": true,
+    "inputTokens": 2381,
+    "outputTokens": 3,
+    "latencyMs": 1716
+  },
+  {
+    "questionId": "q37",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "89773",
+    "actual": "89773",
+    "correct": true,
+    "inputTokens": 2854,
+    "outputTokens": 6,
+    "latencyMs": 904
+  },
+  {
+    "questionId": "q37",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "89773",
+    "actual": "89773",
+    "correct": true,
+    "inputTokens": 6316,
+    "outputTokens": 3,
+    "latencyMs": 2950
+  },
+  {
+    "questionId": "q37",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "89773",
+    "actual": "89773",
+    "correct": true,
+    "inputTokens": 6363,
+    "outputTokens": 6,
+    "latencyMs": 1189
+  },
+  {
+    "questionId": "q37",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "89773",
+    "actual": "89773",
+    "correct": true,
+    "inputTokens": 5012,
+    "outputTokens": 3,
+    "latencyMs": 1050
+  },
+  {
+    "questionId": "q37",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "89773",
+    "actual": "89773",
+    "correct": true,
+    "inputTokens": 5758,
+    "outputTokens": 6,
+    "latencyMs": 1329
+  },
+  {
+    "questionId": "q38",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6390,
+    "outputTokens": 2,
+    "latencyMs": 3410
+  },
+  {
+    "questionId": "q38",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 7868,
+    "outputTokens": 4,
+    "latencyMs": 1891
+  },
+  {
+    "questionId": "q38",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2527,
+    "outputTokens": 2,
+    "latencyMs": 1010
+  },
+  {
+    "questionId": "q38",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2980,
+    "outputTokens": 4,
+    "latencyMs": 988
+  },
+  {
+    "questionId": "q38",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2381,
+    "outputTokens": 2,
+    "latencyMs": 1364
+  },
+  {
+    "questionId": "q38",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 2854,
+    "outputTokens": 4,
+    "latencyMs": 1395
+  },
+  {
+    "questionId": "q38",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6316,
+    "outputTokens": 2,
+    "latencyMs": 2293
+  },
+  {
+    "questionId": "q38",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 6363,
+    "outputTokens": 4,
+    "latencyMs": 1137
+  },
+  {
+    "questionId": "q38",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5012,
+    "outputTokens": 2,
+    "latencyMs": 1451
+  },
+  {
+    "questionId": "q38",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Marketing",
+    "actual": "Marketing",
+    "correct": true,
+    "inputTokens": 5758,
+    "outputTokens": 4,
+    "latencyMs": 1100
+  },
+  {
+    "questionId": "q39",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "georgianna_renner@yahoo.com",
+    "actual": "georgianna_renner@yahoo.com",
+    "correct": true,
+    "inputTokens": 6390,
+    "outputTokens": 10,
+    "latencyMs": 1674
+  },
+  {
+    "questionId": "q39",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "georgianna_renner@yahoo.com",
+    "actual": "georgianna_renner@yahoo.com",
+    "correct": true,
+    "inputTokens": 7869,
+    "outputTokens": 13,
+    "latencyMs": 1403
+  },
+  {
+    "questionId": "q39",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "georgianna_renner@yahoo.com",
+    "actual": "georgianna_renner@yahoo.com",
+    "correct": true,
+    "inputTokens": 2527,
+    "outputTokens": 10,
+    "latencyMs": 1413
+  },
+  {
+    "questionId": "q39",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "georgianna_renner@yahoo.com",
+    "actual": "georgianna_renner@yahoo.com",
+    "correct": true,
+    "inputTokens": 2981,
+    "outputTokens": 13,
+    "latencyMs": 1200
+  },
+  {
+    "questionId": "q39",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "georgianna_renner@yahoo.com",
+    "actual": "georgianna_renner@yahoo.com",
+    "correct": true,
+    "inputTokens": 2381,
+    "outputTokens": 10,
+    "latencyMs": 1730
+  },
+  {
+    "questionId": "q39",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "georgianna_renner@yahoo.com",
+    "actual": "georgianna_renner@yahoo.com",
+    "correct": true,
+    "inputTokens": 2855,
+    "outputTokens": 13,
+    "latencyMs": 1226
+  },
+  {
+    "questionId": "q39",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "georgianna_renner@yahoo.com",
+    "actual": "georgianna_renner@yahoo.com",
+    "correct": true,
+    "inputTokens": 6316,
+    "outputTokens": 10,
+    "latencyMs": 1251
+  },
+  {
+    "questionId": "q39",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "georgianna_renner@yahoo.com",
+    "actual": "georgianna_renner@yahoo.com",
+    "correct": true,
+    "inputTokens": 6364,
+    "outputTokens": 13,
+    "latencyMs": 1337
+  },
+  {
+    "questionId": "q39",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "georgianna_renner@yahoo.com",
+    "actual": "georgianna_renner@yahoo.com",
+    "correct": true,
+    "inputTokens": 5012,
+    "outputTokens": 10,
+    "latencyMs": 2368
+  },
+  {
+    "questionId": "q39",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "georgianna_renner@yahoo.com",
+    "actual": "georgianna_renner@yahoo.com",
+    "correct": true,
+    "inputTokens": 5759,
+    "outputTokens": 13,
+    "latencyMs": 1251
+  },
+  {
+    "questionId": "q40",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "49741",
+    "actual": "49741",
+    "correct": true,
+    "inputTokens": 6391,
+    "outputTokens": 3,
+    "latencyMs": 3815
+  },
+  {
+    "questionId": "q40",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "49741",
+    "actual": "49741",
+    "correct": true,
+    "inputTokens": 7871,
+    "outputTokens": 6,
+    "latencyMs": 1169
+  },
+  {
+    "questionId": "q40",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "49741",
+    "actual": "49741",
+    "correct": true,
+    "inputTokens": 2528,
+    "outputTokens": 3,
+    "latencyMs": 1070
+  },
+  {
+    "questionId": "q40",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "49741",
+    "actual": "49741",
+    "correct": true,
+    "inputTokens": 2983,
+    "outputTokens": 6,
+    "latencyMs": 1162
+  },
+  {
+    "questionId": "q40",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "49741",
+    "actual": "49741",
+    "correct": true,
+    "inputTokens": 2382,
+    "outputTokens": 3,
+    "latencyMs": 1115
+  },
+  {
+    "questionId": "q40",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "49741",
+    "actual": "144426",
+    "correct": false,
+    "inputTokens": 2857,
+    "outputTokens": 6,
+    "latencyMs": 1365
+  },
+  {
+    "questionId": "q40",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "49741",
+    "actual": "49741",
+    "correct": true,
+    "inputTokens": 6317,
+    "outputTokens": 3,
+    "latencyMs": 2004
+  },
+  {
+    "questionId": "q40",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "49741",
+    "actual": "49741",
+    "correct": true,
+    "inputTokens": 6366,
+    "outputTokens": 6,
+    "latencyMs": 1113
+  },
+  {
+    "questionId": "q40",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "49741",
+    "actual": "49741",
+    "correct": true,
+    "inputTokens": 5013,
+    "outputTokens": 3,
+    "latencyMs": 3055
+  },
+  {
+    "questionId": "q40",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "49741",
+    "actual": "49741",
+    "correct": true,
+    "inputTokens": 5761,
+    "outputTokens": 6,
+    "latencyMs": 1392
+  },
+  {
+    "questionId": "q41",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 6388,
+    "outputTokens": 2,
+    "latencyMs": 3877
+  },
+  {
+    "questionId": "q41",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 7865,
+    "outputTokens": 5,
+    "latencyMs": 1128
+  },
+  {
+    "questionId": "q41",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 2525,
+    "outputTokens": 2,
+    "latencyMs": 966
+  },
+  {
+    "questionId": "q41",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 2977,
+    "outputTokens": 5,
+    "latencyMs": 1070
+  },
+  {
+    "questionId": "q41",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 2379,
+    "outputTokens": 2,
+    "latencyMs": 2411
+  },
+  {
+    "questionId": "q41",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 2851,
+    "outputTokens": 5,
+    "latencyMs": 1286
+  },
+  {
+    "questionId": "q41",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 6314,
+    "outputTokens": 2,
+    "latencyMs": 2082
+  },
+  {
+    "questionId": "q41",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 6360,
+    "outputTokens": 5,
+    "latencyMs": 1107
+  },
+  {
+    "questionId": "q41",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 5010,
+    "outputTokens": 2,
+    "latencyMs": 1216
+  },
+  {
+    "questionId": "q41",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 5755,
+    "outputTokens": 5,
+    "latencyMs": 1052
+  },
+  {
+    "questionId": "q42",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 6388,
+    "outputTokens": 2,
+    "latencyMs": 1572
+  },
+  {
+    "questionId": "q42",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 7865,
+    "outputTokens": 5,
+    "latencyMs": 1084
+  },
+  {
+    "questionId": "q42",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 2525,
+    "outputTokens": 2,
+    "latencyMs": 1377
+  },
+  {
+    "questionId": "q42",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "14",
+    "correct": false,
+    "inputTokens": 2977,
+    "outputTokens": 5,
+    "latencyMs": 1197
+  },
+  {
+    "questionId": "q42",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 2379,
+    "outputTokens": 2,
+    "latencyMs": 2705
+  },
+  {
+    "questionId": "q42",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 2851,
+    "outputTokens": 5,
+    "latencyMs": 1020
+  },
+  {
+    "questionId": "q42",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 6314,
+    "outputTokens": 2,
+    "latencyMs": 5345
+  },
+  {
+    "questionId": "q42",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "14",
+    "correct": false,
+    "inputTokens": 6360,
+    "outputTokens": 5,
+    "latencyMs": 1207
+  },
+  {
+    "questionId": "q42",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 5010,
+    "outputTokens": 2,
+    "latencyMs": 921
+  },
+  {
+    "questionId": "q42",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 5755,
+    "outputTokens": 5,
+    "latencyMs": 1289
+  },
+  {
+    "questionId": "q43",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 6388,
+    "outputTokens": 2,
+    "latencyMs": 2423
+  },
+  {
+    "questionId": "q43",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 7865,
+    "outputTokens": 5,
+    "latencyMs": 1273
+  },
+  {
+    "questionId": "q43",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 2525,
+    "outputTokens": 2,
+    "latencyMs": 975
+  },
+  {
+    "questionId": "q43",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 2977,
+    "outputTokens": 5,
+    "latencyMs": 1301
+  },
+  {
+    "questionId": "q43",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 2379,
+    "outputTokens": 2,
+    "latencyMs": 1423
+  },
+  {
+    "questionId": "q43",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 2851,
+    "outputTokens": 5,
+    "latencyMs": 927
+  },
+  {
+    "questionId": "q43",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 6314,
+    "outputTokens": 2,
+    "latencyMs": 1258
+  },
+  {
+    "questionId": "q43",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 6360,
+    "outputTokens": 5,
+    "latencyMs": 1250
+  },
+  {
+    "questionId": "q43",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 5010,
+    "outputTokens": 2,
+    "latencyMs": 872
+  },
+  {
+    "questionId": "q43",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 5755,
+    "outputTokens": 5,
+    "latencyMs": 1385
+  },
+  {
+    "questionId": "q44",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 6388,
+    "outputTokens": 2,
+    "latencyMs": 1201
+  },
+  {
+    "questionId": "q44",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 7865,
+    "outputTokens": 5,
+    "latencyMs": 1149
+  },
+  {
+    "questionId": "q44",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 2525,
+    "outputTokens": 2,
+    "latencyMs": 1498
+  },
+  {
+    "questionId": "q44",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 2977,
+    "outputTokens": 5,
+    "latencyMs": 1149
+  },
+  {
+    "questionId": "q44",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 2379,
+    "outputTokens": 2,
+    "latencyMs": 1098
+  },
+  {
+    "questionId": "q44",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 2851,
+    "outputTokens": 5,
+    "latencyMs": 1121
+  },
+  {
+    "questionId": "q44",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 6314,
+    "outputTokens": 2,
+    "latencyMs": 2522
+  },
+  {
+    "questionId": "q44",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "10",
+    "correct": false,
+    "inputTokens": 6360,
+    "outputTokens": 5,
+    "latencyMs": 1532
+  },
+  {
+    "questionId": "q44",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "17",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 5010,
+    "outputTokens": 2,
+    "latencyMs": 4914
+  },
+  {
+    "questionId": "q44",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "17",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 5755,
+    "outputTokens": 5,
+    "latencyMs": 1324
+  },
+  {
+    "questionId": "q45",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "16",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 6388,
+    "outputTokens": 2,
+    "latencyMs": 1446
+  },
+  {
+    "questionId": "q45",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "16",
+    "actual": "12",
+    "correct": false,
+    "inputTokens": 7865,
+    "outputTokens": 5,
+    "latencyMs": 1105
+  },
+  {
+    "questionId": "q45",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "16",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 2525,
+    "outputTokens": 2,
+    "latencyMs": 1297
+  },
+  {
+    "questionId": "q45",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "16",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 2977,
+    "outputTokens": 5,
+    "latencyMs": 1251
+  },
+  {
+    "questionId": "q45",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "16",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 2379,
+    "outputTokens": 2,
+    "latencyMs": 1561
+  },
+  {
+    "questionId": "q45",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "16",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 2851,
+    "outputTokens": 5,
+    "latencyMs": 1292
+  },
+  {
+    "questionId": "q45",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "16",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 6314,
+    "outputTokens": 2,
+    "latencyMs": 1127
+  },
+  {
+    "questionId": "q45",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "16",
+    "actual": "12",
+    "correct": false,
+    "inputTokens": 6360,
+    "outputTokens": 5,
+    "latencyMs": 1207
+  },
+  {
+    "questionId": "q45",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "16",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 5010,
+    "outputTokens": 2,
+    "latencyMs": 1582
+  },
+  {
+    "questionId": "q45",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "16",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 5755,
+    "outputTokens": 5,
+    "latencyMs": 1278
+  },
+  {
+    "questionId": "q46",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "16",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 6388,
+    "outputTokens": 2,
+    "latencyMs": 1278
+  },
+  {
+    "questionId": "q46",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "16",
+    "actual": "10",
+    "correct": false,
+    "inputTokens": 7865,
+    "outputTokens": 5,
+    "latencyMs": 3084
+  },
+  {
+    "questionId": "q46",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "16",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 2525,
+    "outputTokens": 2,
+    "latencyMs": 1289
+  },
+  {
+    "questionId": "q46",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "16",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 2977,
+    "outputTokens": 5,
+    "latencyMs": 1591
+  },
+  {
+    "questionId": "q46",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "16",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 2379,
+    "outputTokens": 2,
+    "latencyMs": 3038
+  },
+  {
+    "questionId": "q46",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "16",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 2851,
+    "outputTokens": 5,
+    "latencyMs": 1447
+  },
+  {
+    "questionId": "q46",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "16",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 6314,
+    "outputTokens": 2,
+    "latencyMs": 1224
+  },
+  {
+    "questionId": "q46",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "16",
+    "actual": "10",
+    "correct": false,
+    "inputTokens": 6360,
+    "outputTokens": 5,
+    "latencyMs": 1250
+  },
+  {
+    "questionId": "q46",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "16",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 5010,
+    "outputTokens": 2,
+    "latencyMs": 1364
+  },
+  {
+    "questionId": "q46",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "16",
+    "actual": "12",
+    "correct": false,
+    "inputTokens": 5755,
+    "outputTokens": 5,
+    "latencyMs": 1560
+  },
+  {
+    "questionId": "q47",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "91",
+    "actual": "66",
+    "correct": false,
+    "inputTokens": 6393,
+    "outputTokens": 2,
+    "latencyMs": 989
+  },
+  {
+    "questionId": "q47",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "91",
+    "actual": "89",
+    "correct": false,
+    "inputTokens": 7870,
+    "outputTokens": 5,
+    "latencyMs": 1358
+  },
+  {
+    "questionId": "q47",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "91",
+    "actual": "66",
+    "correct": false,
+    "inputTokens": 2530,
+    "outputTokens": 2,
+    "latencyMs": 1406
+  },
+  {
+    "questionId": "q47",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "91",
+    "actual": "85",
+    "correct": false,
+    "inputTokens": 2982,
+    "outputTokens": 5,
+    "latencyMs": 1123
+  },
+  {
+    "questionId": "q47",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "91",
+    "actual": "66",
+    "correct": false,
+    "inputTokens": 2384,
+    "outputTokens": 2,
+    "latencyMs": 4883
+  },
+  {
+    "questionId": "q47",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "91",
+    "actual": "85",
+    "correct": false,
+    "inputTokens": 2856,
+    "outputTokens": 5,
+    "latencyMs": 1402
+  },
+  {
+    "questionId": "q47",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "91",
+    "actual": "66",
+    "correct": false,
+    "inputTokens": 6319,
+    "outputTokens": 2,
+    "latencyMs": 1915
+  },
+  {
+    "questionId": "q47",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "91",
+    "actual": "89",
+    "correct": false,
+    "inputTokens": 6365,
+    "outputTokens": 5,
+    "latencyMs": 1263
+  },
+  {
+    "questionId": "q47",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "91",
+    "actual": "66",
+    "correct": false,
+    "inputTokens": 5015,
+    "outputTokens": 2,
+    "latencyMs": 1448
+  },
+  {
+    "questionId": "q47",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "91",
+    "actual": "89",
+    "correct": false,
+    "inputTokens": 5760,
+    "outputTokens": 5,
+    "latencyMs": 1243
+  },
+  {
+    "questionId": "q48",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "67",
+    "actual": "54",
+    "correct": false,
+    "inputTokens": 6393,
+    "outputTokens": 2,
+    "latencyMs": 1456
+  },
+  {
+    "questionId": "q48",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "67",
+    "actual": "57",
+    "correct": false,
+    "inputTokens": 7870,
+    "outputTokens": 5,
+    "latencyMs": 1186
+  },
+  {
+    "questionId": "q48",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "67",
+    "actual": "54",
+    "correct": false,
+    "inputTokens": 2530,
+    "outputTokens": 2,
+    "latencyMs": 1076
+  },
+  {
+    "questionId": "q48",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "67",
+    "actual": "47",
+    "correct": false,
+    "inputTokens": 2982,
+    "outputTokens": 5,
+    "latencyMs": 1168
+  },
+  {
+    "questionId": "q48",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "67",
+    "actual": "56",
+    "correct": false,
+    "inputTokens": 2384,
+    "outputTokens": 2,
+    "latencyMs": 3105
+  },
+  {
+    "questionId": "q48",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "67",
+    "actual": "47",
+    "correct": false,
+    "inputTokens": 2856,
+    "outputTokens": 5,
+    "latencyMs": 1375
+  },
+  {
+    "questionId": "q48",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "67",
+    "actual": "66",
+    "correct": false,
+    "inputTokens": 6319,
+    "outputTokens": 2,
+    "latencyMs": 1618
+  },
+  {
+    "questionId": "q48",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "67",
+    "actual": "47",
+    "correct": false,
+    "inputTokens": 6365,
+    "outputTokens": 5,
+    "latencyMs": 1454
+  },
+  {
+    "questionId": "q48",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "67",
+    "actual": "54",
+    "correct": false,
+    "inputTokens": 5015,
+    "outputTokens": 2,
+    "latencyMs": 1244
+  },
+  {
+    "questionId": "q48",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "67",
+    "actual": "57",
+    "correct": false,
+    "inputTokens": 5760,
+    "outputTokens": 5,
+    "latencyMs": 1113
+  },
+  {
+    "questionId": "q49",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "41",
+    "actual": "30",
+    "correct": false,
+    "inputTokens": 6393,
+    "outputTokens": 2,
+    "latencyMs": 1267
+  },
+  {
+    "questionId": "q49",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "41",
+    "actual": "31",
+    "correct": false,
+    "inputTokens": 7870,
+    "outputTokens": 5,
+    "latencyMs": 1227
+  },
+  {
+    "questionId": "q49",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "41",
+    "actual": "30",
+    "correct": false,
+    "inputTokens": 2530,
+    "outputTokens": 2,
+    "latencyMs": 1246
+  },
+  {
+    "questionId": "q49",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "41",
+    "actual": "27",
+    "correct": false,
+    "inputTokens": 2982,
+    "outputTokens": 5,
+    "latencyMs": 1127
+  },
+  {
+    "questionId": "q49",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "41",
+    "actual": "34",
+    "correct": false,
+    "inputTokens": 2384,
+    "outputTokens": 2,
+    "latencyMs": 1260
+  },
+  {
+    "questionId": "q49",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "41",
+    "actual": "31",
+    "correct": false,
+    "inputTokens": 2856,
+    "outputTokens": 5,
+    "latencyMs": 1293
+  },
+  {
+    "questionId": "q49",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "41",
+    "actual": "24",
+    "correct": false,
+    "inputTokens": 6319,
+    "outputTokens": 2,
+    "latencyMs": 1246
+  },
+  {
+    "questionId": "q49",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "41",
+    "actual": "27",
+    "correct": false,
+    "inputTokens": 6365,
+    "outputTokens": 5,
+    "latencyMs": 1598
+  },
+  {
+    "questionId": "q49",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "41",
+    "actual": "24",
+    "correct": false,
+    "inputTokens": 5015,
+    "outputTokens": 2,
+    "latencyMs": 1471
+  },
+  {
+    "questionId": "q49",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "41",
+    "actual": "31",
+    "correct": false,
+    "inputTokens": 5760,
+    "outputTokens": 5,
+    "latencyMs": 1311
+  },
+  {
+    "questionId": "q50",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "26",
+    "actual": "22",
+    "correct": false,
+    "inputTokens": 6393,
+    "outputTokens": 2,
+    "latencyMs": 3950
+  },
+  {
+    "questionId": "q50",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "26",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 7870,
+    "outputTokens": 5,
+    "latencyMs": 1075
+  },
+  {
+    "questionId": "q50",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "26",
+    "actual": "22",
+    "correct": false,
+    "inputTokens": 2530,
+    "outputTokens": 2,
+    "latencyMs": 1868
+  },
+  {
+    "questionId": "q50",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "26",
+    "actual": "16",
+    "correct": false,
+    "inputTokens": 2982,
+    "outputTokens": 5,
+    "latencyMs": 1075
+  },
+  {
+    "questionId": "q50",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "26",
+    "actual": "24",
+    "correct": false,
+    "inputTokens": 2384,
+    "outputTokens": 2,
+    "latencyMs": 1973
+  },
+  {
+    "questionId": "q50",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "26",
+    "actual": "16",
+    "correct": false,
+    "inputTokens": 2856,
+    "outputTokens": 5,
+    "latencyMs": 947
+  },
+  {
+    "questionId": "q50",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "26",
+    "actual": "22",
+    "correct": false,
+    "inputTokens": 6319,
+    "outputTokens": 2,
+    "latencyMs": 1414
+  },
+  {
+    "questionId": "q50",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "26",
+    "actual": "16",
+    "correct": false,
+    "inputTokens": 6365,
+    "outputTokens": 5,
+    "latencyMs": 1221
+  },
+  {
+    "questionId": "q50",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "26",
+    "actual": "18",
+    "correct": false,
+    "inputTokens": 5015,
+    "outputTokens": 2,
+    "latencyMs": 1148
+  },
+  {
+    "questionId": "q50",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "26",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 5760,
+    "outputTokens": 5,
+    "latencyMs": 1286
+  },
+  {
+    "questionId": "q51",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "78",
+    "actual": "66",
+    "correct": false,
+    "inputTokens": 6387,
+    "outputTokens": 2,
+    "latencyMs": 2525
+  },
+  {
+    "questionId": "q51",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "78",
+    "actual": "81",
+    "correct": false,
+    "inputTokens": 7864,
+    "outputTokens": 5,
+    "latencyMs": 1613
+  },
+  {
+    "questionId": "q51",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "78",
+    "actual": "66",
+    "correct": false,
+    "inputTokens": 2524,
+    "outputTokens": 2,
+    "latencyMs": 1132
+  },
+  {
+    "questionId": "q51",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "78",
+    "actual": "78",
+    "correct": true,
+    "inputTokens": 2976,
+    "outputTokens": 5,
+    "latencyMs": 1104
+  },
+  {
+    "questionId": "q51",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "78",
+    "actual": "77",
+    "correct": false,
+    "inputTokens": 2378,
+    "outputTokens": 2,
+    "latencyMs": 1069
+  },
+  {
+    "questionId": "q51",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "78",
+    "actual": "73",
+    "correct": false,
+    "inputTokens": 2850,
+    "outputTokens": 5,
+    "latencyMs": 1113
+  },
+  {
+    "questionId": "q51",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "78",
+    "actual": "66",
+    "correct": false,
+    "inputTokens": 6313,
+    "outputTokens": 2,
+    "latencyMs": 1999
+  },
+  {
+    "questionId": "q51",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "78",
+    "actual": "78",
+    "correct": true,
+    "inputTokens": 6359,
+    "outputTokens": 5,
+    "latencyMs": 1214
+  },
+  {
+    "questionId": "q51",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "78",
+    "actual": "66",
+    "correct": false,
+    "inputTokens": 5009,
+    "outputTokens": 2,
+    "latencyMs": 1613
+  },
+  {
+    "questionId": "q51",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "78",
+    "actual": "77",
+    "correct": false,
+    "inputTokens": 5754,
+    "outputTokens": 5,
+    "latencyMs": 1012
+  },
+  {
+    "questionId": "q52",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "22",
+    "actual": "30",
+    "correct": false,
+    "inputTokens": 6387,
+    "outputTokens": 2,
+    "latencyMs": 1580
+  },
+  {
+    "questionId": "q52",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "22",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 7864,
+    "outputTokens": 5,
+    "latencyMs": 1688
+  },
+  {
+    "questionId": "q52",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "22",
+    "actual": "22",
+    "correct": true,
+    "inputTokens": 2524,
+    "outputTokens": 2,
+    "latencyMs": 1290
+  },
+  {
+    "questionId": "q52",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "22",
+    "actual": "16",
+    "correct": false,
+    "inputTokens": 2976,
+    "outputTokens": 5,
+    "latencyMs": 1121
+  },
+  {
+    "questionId": "q52",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "22",
+    "actual": "10",
+    "correct": false,
+    "inputTokens": 2378,
+    "outputTokens": 2,
+    "latencyMs": 1544
+  },
+  {
+    "questionId": "q52",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "22",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 2850,
+    "outputTokens": 5,
+    "latencyMs": 822
+  },
+  {
+    "questionId": "q52",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "22",
+    "actual": "34",
+    "correct": false,
+    "inputTokens": 6313,
+    "outputTokens": 2,
+    "latencyMs": 2718
+  },
+  {
+    "questionId": "q52",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "22",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 6359,
+    "outputTokens": 5,
+    "latencyMs": 1211
+  },
+  {
+    "questionId": "q52",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "22",
+    "actual": "34",
+    "correct": false,
+    "inputTokens": 5009,
+    "outputTokens": 2,
+    "latencyMs": 1162
+  },
+  {
+    "questionId": "q52",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "22",
+    "actual": "16",
+    "correct": false,
+    "inputTokens": 5754,
+    "outputTokens": 5,
+    "latencyMs": 1156
+  },
+  {
+    "questionId": "q53",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "12",
+    "actual": "24",
+    "correct": false,
+    "inputTokens": 6395,
+    "outputTokens": 2,
+    "latencyMs": 1089
+  },
+  {
+    "questionId": "q53",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "12",
+    "actual": "9",
+    "correct": false,
+    "inputTokens": 7872,
+    "outputTokens": 5,
+    "latencyMs": 1368
+  },
+  {
+    "questionId": "q53",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "12",
+    "actual": "24",
+    "correct": false,
+    "inputTokens": 2532,
+    "outputTokens": 2,
+    "latencyMs": 1850
+  },
+  {
+    "questionId": "q53",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "12",
+    "actual": "9",
+    "correct": false,
+    "inputTokens": 2984,
+    "outputTokens": 5,
+    "latencyMs": 914
+  },
+  {
+    "questionId": "q53",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "12",
+    "actual": "34",
+    "correct": false,
+    "inputTokens": 2386,
+    "outputTokens": 2,
+    "latencyMs": 1156
+  },
+  {
+    "questionId": "q53",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "12",
+    "actual": "10",
+    "correct": false,
+    "inputTokens": 2858,
+    "outputTokens": 5,
+    "latencyMs": 1118
+  },
+  {
+    "questionId": "q53",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "12",
+    "actual": "22",
+    "correct": false,
+    "inputTokens": 6321,
+    "outputTokens": 2,
+    "latencyMs": 1020
+  },
+  {
+    "questionId": "q53",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "12",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 6367,
+    "outputTokens": 5,
+    "latencyMs": 1021
+  },
+  {
+    "questionId": "q53",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "12",
+    "actual": "18",
+    "correct": false,
+    "inputTokens": 5017,
+    "outputTokens": 2,
+    "latencyMs": 1236
+  },
+  {
+    "questionId": "q53",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "12",
+    "actual": "10",
+    "correct": false,
+    "inputTokens": 5762,
+    "outputTokens": 5,
+    "latencyMs": 1574
+  },
+  {
+    "questionId": "q54",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "11",
+    "actual": "24",
+    "correct": false,
+    "inputTokens": 6395,
+    "outputTokens": 2,
+    "latencyMs": 1437
+  },
+  {
+    "questionId": "q54",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "11",
+    "actual": "7",
+    "correct": false,
+    "inputTokens": 7872,
+    "outputTokens": 5,
+    "latencyMs": 1091
+  },
+  {
+    "questionId": "q54",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "11",
+    "actual": "24",
+    "correct": false,
+    "inputTokens": 2532,
+    "outputTokens": 2,
+    "latencyMs": 1917
+  },
+  {
+    "questionId": "q54",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "11",
+    "actual": "6",
+    "correct": false,
+    "inputTokens": 2984,
+    "outputTokens": 5,
+    "latencyMs": 1095
+  },
+  {
+    "questionId": "q54",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "11",
+    "actual": "34",
+    "correct": false,
+    "inputTokens": 2386,
+    "outputTokens": 2,
+    "latencyMs": 4230
+  },
+  {
+    "questionId": "q54",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "11",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 2858,
+    "outputTokens": 5,
+    "latencyMs": 1187
+  },
+  {
+    "questionId": "q54",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "11",
+    "actual": "24",
+    "correct": false,
+    "inputTokens": 6321,
+    "outputTokens": 2,
+    "latencyMs": 1197
+  },
+  {
+    "questionId": "q54",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "11",
+    "actual": "6",
+    "correct": false,
+    "inputTokens": 6367,
+    "outputTokens": 5,
+    "latencyMs": 1176
+  },
+  {
+    "questionId": "q54",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "11",
+    "actual": "18",
+    "correct": false,
+    "inputTokens": 5017,
+    "outputTokens": 2,
+    "latencyMs": 1249
+  },
+  {
+    "questionId": "q54",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "11",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 5762,
+    "outputTokens": 5,
+    "latencyMs": 1383
+  },
+  {
+    "questionId": "q55",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "11",
+    "actual": "30",
+    "correct": false,
+    "inputTokens": 6395,
+    "outputTokens": 2,
+    "latencyMs": 1149
+  },
+  {
+    "questionId": "q55",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "11",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 7872,
+    "outputTokens": 5,
+    "latencyMs": 1072
+  },
+  {
+    "questionId": "q55",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "11",
+    "actual": "18",
+    "correct": false,
+    "inputTokens": 2532,
+    "outputTokens": 2,
+    "latencyMs": 1213
+  },
+  {
+    "questionId": "q55",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "11",
+    "actual": "7",
+    "correct": false,
+    "inputTokens": 2984,
+    "outputTokens": 5,
+    "latencyMs": 1507
+  },
+  {
+    "questionId": "q55",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "11",
+    "actual": "34",
+    "correct": false,
+    "inputTokens": 2386,
+    "outputTokens": 2,
+    "latencyMs": 1826
+  },
+  {
+    "questionId": "q55",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "11",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 2858,
+    "outputTokens": 5,
+    "latencyMs": 1162
+  },
+  {
+    "questionId": "q55",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "11",
+    "actual": "24",
+    "correct": false,
+    "inputTokens": 6321,
+    "outputTokens": 2,
+    "latencyMs": 1008
+  },
+  {
+    "questionId": "q55",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "11",
+    "actual": "7",
+    "correct": false,
+    "inputTokens": 6367,
+    "outputTokens": 5,
+    "latencyMs": 1285
+  },
+  {
+    "questionId": "q55",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "11",
+    "actual": "22",
+    "correct": false,
+    "inputTokens": 5017,
+    "outputTokens": 2,
+    "latencyMs": 1124
+  },
+  {
+    "questionId": "q55",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "11",
+    "actual": "9",
+    "correct": false,
+    "inputTokens": 5762,
+    "outputTokens": 5,
+    "latencyMs": 1212
+  },
+  {
+    "questionId": "q56",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "12",
+    "actual": "22",
+    "correct": false,
+    "inputTokens": 6395,
+    "outputTokens": 2,
+    "latencyMs": 1232
+  },
+  {
+    "questionId": "q56",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "12",
+    "actual": "7",
+    "correct": false,
+    "inputTokens": 7872,
+    "outputTokens": 5,
+    "latencyMs": 1792
+  },
+  {
+    "questionId": "q56",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "12",
+    "actual": "12",
+    "correct": true,
+    "inputTokens": 2532,
+    "outputTokens": 2,
+    "latencyMs": 1357
+  },
+  {
+    "questionId": "q56",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "12",
+    "actual": "6",
+    "correct": false,
+    "inputTokens": 2984,
+    "outputTokens": 5,
+    "latencyMs": 1247
+  },
+  {
+    "questionId": "q56",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "12",
+    "actual": "22",
+    "correct": false,
+    "inputTokens": 2386,
+    "outputTokens": 2,
+    "latencyMs": 1043
+  },
+  {
+    "questionId": "q56",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "12",
+    "actual": "7",
+    "correct": false,
+    "inputTokens": 2858,
+    "outputTokens": 5,
+    "latencyMs": 1065
+  },
+  {
+    "questionId": "q56",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "12",
+    "actual": "10",
+    "correct": false,
+    "inputTokens": 6321,
+    "outputTokens": 2,
+    "latencyMs": 1298
+  },
+  {
+    "questionId": "q56",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "12",
+    "actual": "7",
+    "correct": false,
+    "inputTokens": 6367,
+    "outputTokens": 5,
+    "latencyMs": 1767
+  },
+  {
+    "questionId": "q56",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "12",
+    "actual": "10",
+    "correct": false,
+    "inputTokens": 5017,
+    "outputTokens": 2,
+    "latencyMs": 3525
+  },
+  {
+    "questionId": "q56",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "12",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 5762,
+    "outputTokens": 5,
+    "latencyMs": 1355
+  },
+  {
+    "questionId": "q57",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "62",
+    "actual": "54",
+    "correct": false,
+    "inputTokens": 6394,
+    "outputTokens": 2,
+    "latencyMs": 1359
+  },
+  {
+    "questionId": "q57",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "62",
+    "actual": "62",
+    "correct": true,
+    "inputTokens": 7872,
+    "outputTokens": 5,
+    "latencyMs": 1447
+  },
+  {
+    "questionId": "q57",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "62",
+    "actual": "54",
+    "correct": false,
+    "inputTokens": 2531,
+    "outputTokens": 2,
+    "latencyMs": 3832
+  },
+  {
+    "questionId": "q57",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "62",
+    "actual": "62",
+    "correct": true,
+    "inputTokens": 2984,
+    "outputTokens": 5,
+    "latencyMs": 1143
+  },
+  {
+    "questionId": "q57",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "62",
+    "actual": "66",
+    "correct": false,
+    "inputTokens": 2385,
+    "outputTokens": 2,
+    "latencyMs": 1370
+  },
+  {
+    "questionId": "q57",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "62",
+    "actual": "62",
+    "correct": true,
+    "inputTokens": 2858,
+    "outputTokens": 5,
+    "latencyMs": 1042
+  },
+  {
+    "questionId": "q57",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "62",
+    "actual": "54",
+    "correct": false,
+    "inputTokens": 6320,
+    "outputTokens": 2,
+    "latencyMs": 1015
+  },
+  {
+    "questionId": "q57",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "62",
+    "actual": "62",
+    "correct": true,
+    "inputTokens": 6367,
+    "outputTokens": 5,
+    "latencyMs": 1395
+  },
+  {
+    "questionId": "q57",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "62",
+    "actual": "54",
+    "correct": false,
+    "inputTokens": 5016,
+    "outputTokens": 2,
+    "latencyMs": 1008
+  },
+  {
+    "questionId": "q57",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "62",
+    "actual": "62",
+    "correct": true,
+    "inputTokens": 5762,
+    "outputTokens": 5,
+    "latencyMs": 1191
+  },
+  {
+    "questionId": "q58",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "45",
+    "actual": "38",
+    "correct": false,
+    "inputTokens": 6394,
+    "outputTokens": 2,
+    "latencyMs": 1304
+  },
+  {
+    "questionId": "q58",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "45",
+    "actual": "42",
+    "correct": false,
+    "inputTokens": 7872,
+    "outputTokens": 5,
+    "latencyMs": 1386
+  },
+  {
+    "questionId": "q58",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "45",
+    "actual": "38",
+    "correct": false,
+    "inputTokens": 2531,
+    "outputTokens": 2,
+    "latencyMs": 1433
+  },
+  {
+    "questionId": "q58",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "45",
+    "actual": "42",
+    "correct": false,
+    "inputTokens": 2984,
+    "outputTokens": 5,
+    "latencyMs": 967
+  },
+  {
+    "questionId": "q58",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "45",
+    "actual": "42",
+    "correct": false,
+    "inputTokens": 2385,
+    "outputTokens": 2,
+    "latencyMs": 2469
+  },
+  {
+    "questionId": "q58",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "45",
+    "actual": "42",
+    "correct": false,
+    "inputTokens": 2858,
+    "outputTokens": 5,
+    "latencyMs": 1382
+  },
+  {
+    "questionId": "q58",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "45",
+    "actual": "38",
+    "correct": false,
+    "inputTokens": 6320,
+    "outputTokens": 2,
+    "latencyMs": 1658
+  },
+  {
+    "questionId": "q58",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "45",
+    "actual": "42",
+    "correct": false,
+    "inputTokens": 6367,
+    "outputTokens": 5,
+    "latencyMs": 1450
+  },
+  {
+    "questionId": "q58",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "45",
+    "actual": "38",
+    "correct": false,
+    "inputTokens": 5016,
+    "outputTokens": 2,
+    "latencyMs": 1428
+  },
+  {
+    "questionId": "q58",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "45",
+    "actual": "38",
+    "correct": false,
+    "inputTokens": 5762,
+    "outputTokens": 5,
+    "latencyMs": 1144
+  },
+  {
+    "questionId": "q59",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "96.17",
+    "actual": "96.17",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 4,
+    "latencyMs": 1577
+  },
+  {
+    "questionId": "q59",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "96.17",
+    "actual": "96.17",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 7,
+    "latencyMs": 1181
+  },
+  {
+    "questionId": "q59",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "96.17",
+    "actual": "96.17",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 4,
+    "latencyMs": 1231
+  },
+  {
+    "questionId": "q59",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "96.17",
+    "actual": "96.17",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 7,
+    "latencyMs": 1407
+  },
+  {
+    "questionId": "q59",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "96.17",
+    "actual": "96.17",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 4,
+    "latencyMs": 1393
+  },
+  {
+    "questionId": "q59",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "96.17",
+    "actual": "96.17",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 7,
+    "latencyMs": 1534
+  },
+  {
+    "questionId": "q59",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "96.17",
+    "actual": "96.17",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 4,
+    "latencyMs": 1456
+  },
+  {
+    "questionId": "q59",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "96.17",
+    "actual": "96.17",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 7,
+    "latencyMs": 1933
+  },
+  {
+    "questionId": "q59",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "96.17",
+    "actual": "96.17",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 4,
+    "latencyMs": 1472
+  },
+  {
+    "questionId": "q59",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "96.17",
+    "actual": "96.17",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 7,
+    "latencyMs": 1224
+  },
+  {
+    "questionId": "q60",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 9739,
+    "outputTokens": 3,
+    "latencyMs": 2069
+  },
+  {
+    "questionId": "q60",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 11906,
+    "outputTokens": 4,
+    "latencyMs": 1172
+  },
+  {
+    "questionId": "q60",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 6013,
+    "outputTokens": 3,
+    "latencyMs": 1236
+  },
+  {
+    "questionId": "q60",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 6992,
+    "outputTokens": 4,
+    "latencyMs": 1157
+  },
+  {
+    "questionId": "q60",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 6781,
+    "outputTokens": 3,
+    "latencyMs": 1364
+  },
+  {
+    "questionId": "q60",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 8413,
+    "outputTokens": 4,
+    "latencyMs": 1041
+  },
+  {
+    "questionId": "q60",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 9158,
+    "outputTokens": 3,
+    "latencyMs": 1478
+  },
+  {
+    "questionId": "q60",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 9288,
+    "outputTokens": 4,
+    "latencyMs": 1266
+  },
+  {
+    "questionId": "q60",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 7373,
+    "outputTokens": 3,
+    "latencyMs": 3477
+  },
+  {
+    "questionId": "q60",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 8384,
+    "outputTokens": 4,
+    "latencyMs": 2630
+  },
+  {
+    "questionId": "q61",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "599.39",
+    "actual": "599.39",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 4,
+    "latencyMs": 1479
+  },
+  {
+    "questionId": "q61",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "599.39",
+    "actual": "599.39",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 7,
+    "latencyMs": 1270
+  },
+  {
+    "questionId": "q61",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "599.39",
+    "actual": "599.39",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 4,
+    "latencyMs": 1270
+  },
+  {
+    "questionId": "q61",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "599.39",
+    "actual": "599.39",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 7,
+    "latencyMs": 1342
+  },
+  {
+    "questionId": "q61",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "599.39",
+    "actual": "599.39",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 4,
+    "latencyMs": 1350
+  },
+  {
+    "questionId": "q61",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "599.39",
+    "actual": "599.39",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 7,
+    "latencyMs": 1205
+  },
+  {
+    "questionId": "q61",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "599.39",
+    "actual": "599.39",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 4,
+    "latencyMs": 1502
+  },
+  {
+    "questionId": "q61",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "599.39",
+    "actual": "599.39",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 7,
+    "latencyMs": 1571
+  },
+  {
+    "questionId": "q61",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "599.39",
+    "actual": "599.39",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 4,
+    "latencyMs": 2013
+  },
+  {
+    "questionId": "q61",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "599.39",
+    "actual": "599.39",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 7,
+    "latencyMs": 1428
+  },
+  {
+    "questionId": "q62",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 9739,
+    "outputTokens": 2,
+    "latencyMs": 1666
+  },
+  {
+    "questionId": "q62",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 11906,
+    "outputTokens": 4,
+    "latencyMs": 1549
+  },
+  {
+    "questionId": "q62",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 6013,
+    "outputTokens": 2,
+    "latencyMs": 1033
+  },
+  {
+    "questionId": "q62",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 6992,
+    "outputTokens": 4,
+    "latencyMs": 1061
+  },
+  {
+    "questionId": "q62",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 6781,
+    "outputTokens": 2,
+    "latencyMs": 2008
+  },
+  {
+    "questionId": "q62",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 8413,
+    "outputTokens": 4,
+    "latencyMs": 1214
+  },
+  {
+    "questionId": "q62",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 9158,
+    "outputTokens": 2,
+    "latencyMs": 1321
+  },
+  {
+    "questionId": "q62",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 9288,
+    "outputTokens": 4,
+    "latencyMs": 1311
+  },
+  {
+    "questionId": "q62",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 7373,
+    "outputTokens": 2,
+    "latencyMs": 1769
+  },
+  {
+    "questionId": "q62",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 8384,
+    "outputTokens": 4,
+    "latencyMs": 1157
+  },
+  {
+    "questionId": "q63",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "528.71",
+    "actual": "528.71",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 4,
+    "latencyMs": 1213
+  },
+  {
+    "questionId": "q63",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "528.71",
+    "actual": "528.71",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 7,
+    "latencyMs": 1332
+  },
+  {
+    "questionId": "q63",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "528.71",
+    "actual": "528.71",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 4,
+    "latencyMs": 3749
+  },
+  {
+    "questionId": "q63",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "528.71",
+    "actual": "528.71",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 7,
+    "latencyMs": 1326
+  },
+  {
+    "questionId": "q63",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "528.71",
+    "actual": "528.71",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 4,
+    "latencyMs": 947
+  },
+  {
+    "questionId": "q63",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "528.71",
+    "actual": "528.71",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 7,
+    "latencyMs": 1251
+  },
+  {
+    "questionId": "q63",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "528.71",
+    "actual": "528.71",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 4,
+    "latencyMs": 1428
+  },
+  {
+    "questionId": "q63",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "528.71",
+    "actual": "528.71",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 7,
+    "latencyMs": 1659
+  },
+  {
+    "questionId": "q63",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "528.71",
+    "actual": "528.71",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 4,
+    "latencyMs": 5584
+  },
+  {
+    "questionId": "q63",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "528.71",
+    "actual": "528.71",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 7,
+    "latencyMs": 1251
+  },
+  {
+    "questionId": "q64",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 9739,
+    "outputTokens": 2,
+    "latencyMs": 2425
+  },
+  {
+    "questionId": "q64",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 11906,
+    "outputTokens": 4,
+    "latencyMs": 1481
+  },
+  {
+    "questionId": "q64",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 6013,
+    "outputTokens": 2,
+    "latencyMs": 1109
+  },
+  {
+    "questionId": "q64",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 6992,
+    "outputTokens": 4,
+    "latencyMs": 1048
+  },
+  {
+    "questionId": "q64",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 6781,
+    "outputTokens": 2,
+    "latencyMs": 1256
+  },
+  {
+    "questionId": "q64",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 8413,
+    "outputTokens": 4,
+    "latencyMs": 1117
+  },
+  {
+    "questionId": "q64",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 9158,
+    "outputTokens": 2,
+    "latencyMs": 1168
+  },
+  {
+    "questionId": "q64",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 9288,
+    "outputTokens": 4,
+    "latencyMs": 1504
+  },
+  {
+    "questionId": "q64",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 7373,
+    "outputTokens": 2,
+    "latencyMs": 1134
+  },
+  {
+    "questionId": "q64",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 8384,
+    "outputTokens": 4,
+    "latencyMs": 1059
+  },
+  {
+    "questionId": "q65",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "1687.82",
+    "actual": "1687.82",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 5,
+    "latencyMs": 2361
+  },
+  {
+    "questionId": "q65",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "1687.82",
+    "actual": "1687.82",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 8,
+    "latencyMs": 1158
+  },
+  {
+    "questionId": "q65",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "1687.82",
+    "actual": "1687.82",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 5,
+    "latencyMs": 1493
+  },
+  {
+    "questionId": "q65",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "1687.82",
+    "actual": "1687.82",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 8,
+    "latencyMs": 1068
+  },
+  {
+    "questionId": "q65",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "1687.82",
+    "actual": "1687.82",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 5,
+    "latencyMs": 1490
+  },
+  {
+    "questionId": "q65",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "1687.82",
+    "actual": "1687.82",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 8,
+    "latencyMs": 1386
+  },
+  {
+    "questionId": "q65",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "1687.82",
+    "actual": "1687.82",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 5,
+    "latencyMs": 1470
+  },
+  {
+    "questionId": "q65",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "1687.82",
+    "actual": "1687.82",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 8,
+    "latencyMs": 1189
+  },
+  {
+    "questionId": "q65",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "1687.82",
+    "actual": "1687.82",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 5,
+    "latencyMs": 2824
+  },
+  {
+    "questionId": "q65",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "1687.82",
+    "actual": "1687.82",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 8,
+    "latencyMs": 1565
+  },
+  {
+    "questionId": "q66",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 9739,
+    "outputTokens": 3,
+    "latencyMs": 1480
+  },
+  {
+    "questionId": "q66",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 11906,
+    "outputTokens": 4,
+    "latencyMs": 1354
+  },
+  {
+    "questionId": "q66",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 6013,
+    "outputTokens": 3,
+    "latencyMs": 5334
+  },
+  {
+    "questionId": "q66",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 6992,
+    "outputTokens": 4,
+    "latencyMs": 1158
+  },
+  {
+    "questionId": "q66",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 6781,
+    "outputTokens": 3,
+    "latencyMs": 2043
+  },
+  {
+    "questionId": "q66",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 8413,
+    "outputTokens": 4,
+    "latencyMs": 1302
+  },
+  {
+    "questionId": "q66",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 9158,
+    "outputTokens": 3,
+    "latencyMs": 1006
+  },
+  {
+    "questionId": "q66",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 9288,
+    "outputTokens": 4,
+    "latencyMs": 1106
+  },
+  {
+    "questionId": "q66",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 7373,
+    "outputTokens": 3,
+    "latencyMs": 1801
+  },
+  {
+    "questionId": "q66",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 8384,
+    "outputTokens": 4,
+    "latencyMs": 1626
+  },
+  {
+    "questionId": "q67",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "423.6",
+    "actual": "423.6",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 4,
+    "latencyMs": 2107
+  },
+  {
+    "questionId": "q67",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "423.6",
+    "actual": "423.6",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 7,
+    "latencyMs": 1183
+  },
+  {
+    "questionId": "q67",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "423.6",
+    "actual": "423.6",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 4,
+    "latencyMs": 7091
+  },
+  {
+    "questionId": "q67",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "423.6",
+    "actual": "423.6",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 7,
+    "latencyMs": 1730
+  },
+  {
+    "questionId": "q67",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "423.6",
+    "actual": "423.6",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 4,
+    "latencyMs": 1222
+  },
+  {
+    "questionId": "q67",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "423.6",
+    "actual": "423.6",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 7,
+    "latencyMs": 1447
+  },
+  {
+    "questionId": "q67",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "423.6",
+    "actual": "423.6",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 4,
+    "latencyMs": 10295
+  },
+  {
+    "questionId": "q67",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "423.6",
+    "actual": "423.6",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 7,
+    "latencyMs": 1228
+  },
+  {
+    "questionId": "q67",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "423.6",
+    "actual": "423.6",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 4,
+    "latencyMs": 1748
+  },
+  {
+    "questionId": "q67",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "423.6",
+    "actual": "423.6",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 7,
+    "latencyMs": 1373
+  },
+  {
+    "questionId": "q68",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 9739,
+    "outputTokens": 3,
+    "latencyMs": 3836
+  },
+  {
+    "questionId": "q68",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 11906,
+    "outputTokens": 4,
+    "latencyMs": 1297
+  },
+  {
+    "questionId": "q68",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 6013,
+    "outputTokens": 3,
+    "latencyMs": 1927
+  },
+  {
+    "questionId": "q68",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 6992,
+    "outputTokens": 4,
+    "latencyMs": 1171
+  },
+  {
+    "questionId": "q68",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 6781,
+    "outputTokens": 3,
+    "latencyMs": 1551
+  },
+  {
+    "questionId": "q68",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 8413,
+    "outputTokens": 4,
+    "latencyMs": 1273
+  },
+  {
+    "questionId": "q68",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 9158,
+    "outputTokens": 3,
+    "latencyMs": 1387
+  },
+  {
+    "questionId": "q68",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 9288,
+    "outputTokens": 4,
+    "latencyMs": 1237
+  },
+  {
+    "questionId": "q68",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 7373,
+    "outputTokens": 3,
+    "latencyMs": 1934
+  },
+  {
+    "questionId": "q68",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 8384,
+    "outputTokens": 4,
+    "latencyMs": 1132
+  },
+  {
+    "questionId": "q69",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "784.03",
+    "actual": "784.03",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 4,
+    "latencyMs": 2267
+  },
+  {
+    "questionId": "q69",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "784.03",
+    "actual": "784.03",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 7,
+    "latencyMs": 1772
+  },
+  {
+    "questionId": "q69",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "784.03",
+    "actual": "784.03",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 4,
+    "latencyMs": 1315
+  },
+  {
+    "questionId": "q69",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "784.03",
+    "actual": "784.03",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 7,
+    "latencyMs": 1165
+  },
+  {
+    "questionId": "q69",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "784.03",
+    "actual": "784.03",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 4,
+    "latencyMs": 1097
+  },
+  {
+    "questionId": "q69",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "784.03",
+    "actual": "784.03",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 7,
+    "latencyMs": 1299
+  },
+  {
+    "questionId": "q69",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "784.03",
+    "actual": "784.03",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 4,
+    "latencyMs": 1779
+  },
+  {
+    "questionId": "q69",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "784.03",
+    "actual": "784.03",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 7,
+    "latencyMs": 3153
+  },
+  {
+    "questionId": "q69",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "784.03",
+    "actual": "784.03",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 4,
+    "latencyMs": 1813
+  },
+  {
+    "questionId": "q69",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "784.03",
+    "actual": "784.03",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 7,
+    "latencyMs": 1867
+  },
+  {
+    "questionId": "q70",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 9739,
+    "outputTokens": 3,
+    "latencyMs": 1611
+  },
+  {
+    "questionId": "q70",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 11906,
+    "outputTokens": 4,
+    "latencyMs": 1173
+  },
+  {
+    "questionId": "q70",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 6013,
+    "outputTokens": 3,
+    "latencyMs": 1977
+  },
+  {
+    "questionId": "q70",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 6992,
+    "outputTokens": 4,
+    "latencyMs": 1108
+  },
+  {
+    "questionId": "q70",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 6781,
+    "outputTokens": 3,
+    "latencyMs": 1324
+  },
+  {
+    "questionId": "q70",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 8413,
+    "outputTokens": 4,
+    "latencyMs": 1225
+  },
+  {
+    "questionId": "q70",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 9158,
+    "outputTokens": 3,
+    "latencyMs": 1416
+  },
+  {
+    "questionId": "q70",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 9288,
+    "outputTokens": 4,
+    "latencyMs": 1200
+  },
+  {
+    "questionId": "q70",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 7373,
+    "outputTokens": 3,
+    "latencyMs": 1259
+  },
+  {
+    "questionId": "q70",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "shipped",
+    "actual": "shipped",
+    "correct": true,
+    "inputTokens": 8384,
+    "outputTokens": 4,
+    "latencyMs": 1433
+  },
+  {
+    "questionId": "q71",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "645.88",
+    "actual": "645.88",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 4,
+    "latencyMs": 1729
+  },
+  {
+    "questionId": "q71",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "645.88",
+    "actual": "645.88",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 7,
+    "latencyMs": 1143
+  },
+  {
+    "questionId": "q71",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "645.88",
+    "actual": "645.88",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 4,
+    "latencyMs": 1837
+  },
+  {
+    "questionId": "q71",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "645.88",
+    "actual": "645.88",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 7,
+    "latencyMs": 1147
+  },
+  {
+    "questionId": "q71",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "645.88",
+    "actual": "645.88",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 4,
+    "latencyMs": 1777
+  },
+  {
+    "questionId": "q71",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "645.88",
+    "actual": "645.88",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 7,
+    "latencyMs": 1295
+  },
+  {
+    "questionId": "q71",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "645.88",
+    "actual": "645.88",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 4,
+    "latencyMs": 1081
+  },
+  {
+    "questionId": "q71",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "645.88",
+    "actual": "645.88",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 7,
+    "latencyMs": 1692
+  },
+  {
+    "questionId": "q71",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "645.88",
+    "actual": "645.88",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 4,
+    "latencyMs": 1661
+  },
+  {
+    "questionId": "q71",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "645.88",
+    "actual": "645.88",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 7,
+    "latencyMs": 1475
+  },
+  {
+    "questionId": "q72",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 9739,
+    "outputTokens": 2,
+    "latencyMs": 2979
+  },
+  {
+    "questionId": "q72",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 11906,
+    "outputTokens": 4,
+    "latencyMs": 1187
+  },
+  {
+    "questionId": "q72",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 6013,
+    "outputTokens": 2,
+    "latencyMs": 1620
+  },
+  {
+    "questionId": "q72",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 6992,
+    "outputTokens": 4,
+    "latencyMs": 1532
+  },
+  {
+    "questionId": "q72",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 6781,
+    "outputTokens": 2,
+    "latencyMs": 1616
+  },
+  {
+    "questionId": "q72",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 8413,
+    "outputTokens": 4,
+    "latencyMs": 1435
+  },
+  {
+    "questionId": "q72",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 9158,
+    "outputTokens": 2,
+    "latencyMs": 1190
+  },
+  {
+    "questionId": "q72",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 9288,
+    "outputTokens": 4,
+    "latencyMs": 1414
+  },
+  {
+    "questionId": "q72",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 7373,
+    "outputTokens": 2,
+    "latencyMs": 2335
+  },
+  {
+    "questionId": "q72",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "processing",
+    "actual": "processing",
+    "correct": true,
+    "inputTokens": 8384,
+    "outputTokens": 4,
+    "latencyMs": 1308
+  },
+  {
+    "questionId": "q73",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "371.91",
+    "actual": "371.91",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 4,
+    "latencyMs": 3359
+  },
+  {
+    "questionId": "q73",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "371.91",
+    "actual": "371.91",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 7,
+    "latencyMs": 1227
+  },
+  {
+    "questionId": "q73",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "371.91",
+    "actual": "371.91",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 4,
+    "latencyMs": 1439
+  },
+  {
+    "questionId": "q73",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "371.91",
+    "actual": "371.91",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 7,
+    "latencyMs": 1179
+  },
+  {
+    "questionId": "q73",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "371.91",
+    "actual": "371.91",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 4,
+    "latencyMs": 1064
+  },
+  {
+    "questionId": "q73",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "371.91",
+    "actual": "371.91",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 7,
+    "latencyMs": 1144
+  },
+  {
+    "questionId": "q73",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "371.91",
+    "actual": "371.91",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 4,
+    "latencyMs": 1873
+  },
+  {
+    "questionId": "q73",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "371.91",
+    "actual": "371.91",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 7,
+    "latencyMs": 1302
+  },
+  {
+    "questionId": "q73",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "371.91",
+    "actual": "371.91",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 4,
+    "latencyMs": 1956
+  },
+  {
+    "questionId": "q73",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "371.91",
+    "actual": "371.91",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 7,
+    "latencyMs": 1281
+  },
+  {
+    "questionId": "q74",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 9739,
+    "outputTokens": 2,
+    "latencyMs": 1591
+  },
+  {
+    "questionId": "q74",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 11906,
+    "outputTokens": 4,
+    "latencyMs": 1279
+  },
+  {
+    "questionId": "q74",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 6013,
+    "outputTokens": 2,
+    "latencyMs": 3152
+  },
+  {
+    "questionId": "q74",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 6992,
+    "outputTokens": 4,
+    "latencyMs": 1061
+  },
+  {
+    "questionId": "q74",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 6781,
+    "outputTokens": 2,
+    "latencyMs": 1557
+  },
+  {
+    "questionId": "q74",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 8413,
+    "outputTokens": 4,
+    "latencyMs": 1313
+  },
+  {
+    "questionId": "q74",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 9158,
+    "outputTokens": 2,
+    "latencyMs": 1433
+  },
+  {
+    "questionId": "q74",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 9288,
+    "outputTokens": 4,
+    "latencyMs": 1812
+  },
+  {
+    "questionId": "q74",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 7373,
+    "outputTokens": 2,
+    "latencyMs": 1024
+  },
+  {
+    "questionId": "q74",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "pending",
+    "actual": "pending",
+    "correct": true,
+    "inputTokens": 8384,
+    "outputTokens": 4,
+    "latencyMs": 1243
+  },
+  {
+    "questionId": "q75",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "1066",
+    "actual": "1066",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 3,
+    "latencyMs": 1500
+  },
+  {
+    "questionId": "q75",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "1066",
+    "actual": "1066",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 6,
+    "latencyMs": 1275
+  },
+  {
+    "questionId": "q75",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "1066",
+    "actual": "1066",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 3,
+    "latencyMs": 1841
+  },
+  {
+    "questionId": "q75",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "1066",
+    "actual": "1066",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 6,
+    "latencyMs": 1080
+  },
+  {
+    "questionId": "q75",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "1066",
+    "actual": "1066",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 3,
+    "latencyMs": 1209
+  },
+  {
+    "questionId": "q75",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "1066",
+    "actual": "1066",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 6,
+    "latencyMs": 1308
+  },
+  {
+    "questionId": "q75",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "1066",
+    "actual": "1066",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 3,
+    "latencyMs": 1556
+  },
+  {
+    "questionId": "q75",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "1066",
+    "actual": "1066",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 6,
+    "latencyMs": 1240
+  },
+  {
+    "questionId": "q75",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "1066",
+    "actual": "1066",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 3,
+    "latencyMs": 1254
+  },
+  {
+    "questionId": "q75",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "1066",
+    "actual": "1066",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 6,
+    "latencyMs": 1305
+  },
+  {
+    "questionId": "q76",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 9739,
+    "outputTokens": 3,
+    "latencyMs": 2606
+  },
+  {
+    "questionId": "q76",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 11906,
+    "outputTokens": 4,
+    "latencyMs": 1422
+  },
+  {
+    "questionId": "q76",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 6013,
+    "outputTokens": 3,
+    "latencyMs": 2688
+  },
+  {
+    "questionId": "q76",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 6992,
+    "outputTokens": 4,
+    "latencyMs": 1041
+  },
+  {
+    "questionId": "q76",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 6781,
+    "outputTokens": 3,
+    "latencyMs": 3070
+  },
+  {
+    "questionId": "q76",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 8413,
+    "outputTokens": 4,
+    "latencyMs": 1167
+  },
+  {
+    "questionId": "q76",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 9158,
+    "outputTokens": 3,
+    "latencyMs": 1702
+  },
+  {
+    "questionId": "q76",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 9288,
+    "outputTokens": 4,
+    "latencyMs": 1182
+  },
+  {
+    "questionId": "q76",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 7373,
+    "outputTokens": 3,
+    "latencyMs": 1740
+  },
+  {
+    "questionId": "q76",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "cancelled",
+    "actual": "cancelled",
+    "correct": true,
+    "inputTokens": 8384,
+    "outputTokens": 4,
+    "latencyMs": 1404
+  },
+  {
+    "questionId": "q77",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "1697.4",
+    "actual": "1697.4",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 5,
+    "latencyMs": 1596
+  },
+  {
+    "questionId": "q77",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "1697.4",
+    "actual": "1697.4",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 8,
+    "latencyMs": 2314
+  },
+  {
+    "questionId": "q77",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "1697.4",
+    "actual": "1697.4",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 5,
+    "latencyMs": 1114
+  },
+  {
+    "questionId": "q77",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "1697.4",
+    "actual": "1697.4",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 8,
+    "latencyMs": 1289
+  },
+  {
+    "questionId": "q77",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "1697.4",
+    "actual": "1697.4",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 5,
+    "latencyMs": 2428
+  },
+  {
+    "questionId": "q77",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "1697.4",
+    "actual": "1697.4",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 8,
+    "latencyMs": 1325
+  },
+  {
+    "questionId": "q77",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "1697.4",
+    "actual": "1697.4",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 5,
+    "latencyMs": 1343
+  },
+  {
+    "questionId": "q77",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "1697.4",
+    "actual": "1697.4",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 8,
+    "latencyMs": 1783
+  },
+  {
+    "questionId": "q77",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "1697.4",
+    "actual": "1697.4",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 5,
+    "latencyMs": 918
+  },
+  {
+    "questionId": "q77",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "1697.4",
+    "actual": "1697.4",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 8,
+    "latencyMs": 1308
+  },
+  {
+    "questionId": "q78",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 9739,
+    "outputTokens": 3,
+    "latencyMs": 1396
+  },
+  {
+    "questionId": "q78",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 11906,
+    "outputTokens": 4,
+    "latencyMs": 1225
+  },
+  {
+    "questionId": "q78",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 6013,
+    "outputTokens": 3,
+    "latencyMs": 2294
+  },
+  {
+    "questionId": "q78",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 6992,
+    "outputTokens": 4,
+    "latencyMs": 1418
+  },
+  {
+    "questionId": "q78",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 6781,
+    "outputTokens": 3,
+    "latencyMs": 1613
+  },
+  {
+    "questionId": "q78",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 8413,
+    "outputTokens": 4,
+    "latencyMs": 1374
+  },
+  {
+    "questionId": "q78",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 9158,
+    "outputTokens": 3,
+    "latencyMs": 1341
+  },
+  {
+    "questionId": "q78",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 9288,
+    "outputTokens": 4,
+    "latencyMs": 1223
+  },
+  {
+    "questionId": "q78",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 7373,
+    "outputTokens": 3,
+    "latencyMs": 2230
+  },
+  {
+    "questionId": "q78",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "delivered",
+    "actual": "delivered",
+    "correct": true,
+    "inputTokens": 8384,
+    "outputTokens": 4,
+    "latencyMs": 1425
+  },
+  {
+    "questionId": "q79",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Valerie Braun",
+    "actual": "Valerie Braun",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 4,
+    "latencyMs": 1377
+  },
+  {
+    "questionId": "q79",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Valerie Braun",
+    "actual": "Valerie Braun",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 9,
+    "latencyMs": 1550
+  },
+  {
+    "questionId": "q79",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Valerie Braun",
+    "actual": "Valerie Braun",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 4,
+    "latencyMs": 1394
+  },
+  {
+    "questionId": "q79",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Valerie Braun",
+    "actual": "Valerie Braun",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 9,
+    "latencyMs": 1202
+  },
+  {
+    "questionId": "q79",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Valerie Braun",
+    "actual": "Valerie Braun",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 4,
+    "latencyMs": 1435
+  },
+  {
+    "questionId": "q79",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Valerie Braun",
+    "actual": "Valerie Braun",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 9,
+    "latencyMs": 1277
+  },
+  {
+    "questionId": "q79",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Valerie Braun",
+    "actual": "Valerie Braun",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 4,
+    "latencyMs": 1564
+  },
+  {
+    "questionId": "q79",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Valerie Braun",
+    "actual": "Valerie Braun",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 9,
+    "latencyMs": 1200
+  },
+  {
+    "questionId": "q79",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Valerie Braun",
+    "actual": "Valerie Braun",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 4,
+    "latencyMs": 1596
+  },
+  {
+    "questionId": "q79",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Valerie Braun",
+    "actual": "Valerie Braun",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 9,
+    "latencyMs": 1151
+  },
+  {
+    "questionId": "q80",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Anita Kozey",
+    "actual": "Anita Kozey",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 5,
+    "latencyMs": 1458
+  },
+  {
+    "questionId": "q80",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Anita Kozey",
+    "actual": "Anita Kozey",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 9,
+    "latencyMs": 1283
+  },
+  {
+    "questionId": "q80",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Anita Kozey",
+    "actual": "Anita Kozey",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 5,
+    "latencyMs": 4702
+  },
+  {
+    "questionId": "q80",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Anita Kozey",
+    "actual": "Anita Kozey",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 9,
+    "latencyMs": 1360
+  },
+  {
+    "questionId": "q80",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Anita Kozey",
+    "actual": "Anita Kozey",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 5,
+    "latencyMs": 6167
+  },
+  {
+    "questionId": "q80",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Anita Kozey",
+    "actual": "Anita Kozey",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 9,
+    "latencyMs": 1449
+  },
+  {
+    "questionId": "q80",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Anita Kozey",
+    "actual": "Anita Kozey",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 5,
+    "latencyMs": 6096
+  },
+  {
+    "questionId": "q80",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Anita Kozey",
+    "actual": "Anita Kozey",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 9,
+    "latencyMs": 1194
+  },
+  {
+    "questionId": "q80",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Anita Kozey",
+    "actual": "Anita Kozey",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 5,
+    "latencyMs": 7357
+  },
+  {
+    "questionId": "q80",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Anita Kozey",
+    "actual": "Anita Kozey",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 9,
+    "latencyMs": 1213
+  },
+  {
+    "questionId": "q81",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Elmer Kub PhD",
+    "actual": "Elmer Kub PhD",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 6,
+    "latencyMs": 2539
+  },
+  {
+    "questionId": "q81",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Elmer Kub PhD",
+    "actual": "Elmer Kub PhD",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 10,
+    "latencyMs": 1532
+  },
+  {
+    "questionId": "q81",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Elmer Kub PhD",
+    "actual": "Elmer Kub PhD",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 6,
+    "latencyMs": 2960
+  },
+  {
+    "questionId": "q81",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Elmer Kub PhD",
+    "actual": "Elmer Kub PhD",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 10,
+    "latencyMs": 1547
+  },
+  {
+    "questionId": "q81",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Elmer Kub PhD",
+    "actual": "Elmer Kub PhD",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 6,
+    "latencyMs": 1358
+  },
+  {
+    "questionId": "q81",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Elmer Kub PhD",
+    "actual": "Elmer Kub PhD",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 10,
+    "latencyMs": 1424
+  },
+  {
+    "questionId": "q81",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Elmer Kub PhD",
+    "actual": "Elmer Kub PhD",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 6,
+    "latencyMs": 958
+  },
+  {
+    "questionId": "q81",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Elmer Kub PhD",
+    "actual": "Elmer Kub PhD",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 10,
+    "latencyMs": 1381
+  },
+  {
+    "questionId": "q81",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Elmer Kub PhD",
+    "actual": "Elmer Kub PhD",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 6,
+    "latencyMs": 1372
+  },
+  {
+    "questionId": "q81",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Elmer Kub PhD",
+    "actual": "Elmer Kub PhD",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 10,
+    "latencyMs": 1715
+  },
+  {
+    "questionId": "q82",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Maxine Zemlak",
+    "actual": "Maxine Zemlak",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 5,
+    "latencyMs": 1972
+  },
+  {
+    "questionId": "q82",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Maxine Zemlak",
+    "actual": "Maxine Zemlak",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 10,
+    "latencyMs": 1315
+  },
+  {
+    "questionId": "q82",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Maxine Zemlak",
+    "actual": "Maxine Zemlak",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 5,
+    "latencyMs": 1634
+  },
+  {
+    "questionId": "q82",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Maxine Zemlak",
+    "actual": "Maxine Zemlak",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 10,
+    "latencyMs": 1264
+  },
+  {
+    "questionId": "q82",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Maxine Zemlak",
+    "actual": "Maxine Zemlak",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 5,
+    "latencyMs": 1153
+  },
+  {
+    "questionId": "q82",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Maxine Zemlak",
+    "actual": "Maxine Zemlak",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 10,
+    "latencyMs": 1252
+  },
+  {
+    "questionId": "q82",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Maxine Zemlak",
+    "actual": "Maxine Zemlak",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 5,
+    "latencyMs": 1697
+  },
+  {
+    "questionId": "q82",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Maxine Zemlak",
+    "actual": "Maxine Zemlak",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 10,
+    "latencyMs": 1198
+  },
+  {
+    "questionId": "q82",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Maxine Zemlak",
+    "actual": "Maxine Zemlak",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 5,
+    "latencyMs": 1854
+  },
+  {
+    "questionId": "q82",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Maxine Zemlak",
+    "actual": "Maxine Zemlak",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 10,
+    "latencyMs": 1752
+  },
+  {
+    "questionId": "q83",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Emanuel Littel",
+    "actual": "Emanuel Littel",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 5,
+    "latencyMs": 2076
+  },
+  {
+    "questionId": "q83",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Emanuel Littel",
+    "actual": "Emanuel Littel",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 7,
+    "latencyMs": 1398
+  },
+  {
+    "questionId": "q83",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Emanuel Littel",
+    "actual": "Emanuel Littel",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 5,
+    "latencyMs": 2263
+  },
+  {
+    "questionId": "q83",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Emanuel Littel",
+    "actual": "Emanuel Littel",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 7,
+    "latencyMs": 3101
+  },
+  {
+    "questionId": "q83",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Emanuel Littel",
+    "actual": "Emanuel Littel",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 5,
+    "latencyMs": 1453
+  },
+  {
+    "questionId": "q83",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Emanuel Littel",
+    "actual": "Emanuel Littel",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 7,
+    "latencyMs": 1265
+  },
+  {
+    "questionId": "q83",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Emanuel Littel",
+    "actual": "Emanuel Littel",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 5,
+    "latencyMs": 8807
+  },
+  {
+    "questionId": "q83",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Emanuel Littel",
+    "actual": "Emanuel Littel",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 7,
+    "latencyMs": 1097
+  },
+  {
+    "questionId": "q83",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Emanuel Littel",
+    "actual": "Emanuel Littel",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 5,
+    "latencyMs": 1667
+  },
+  {
+    "questionId": "q83",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Emanuel Littel",
+    "actual": "Emanuel Littel",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 7,
+    "latencyMs": 1198
+  },
+  {
+    "questionId": "q84",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Andrew Kling",
+    "actual": "Andrew Kling",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 3,
+    "latencyMs": 2292
+  },
+  {
+    "questionId": "q84",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Andrew Kling",
+    "actual": "Andrew Kling",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 7,
+    "latencyMs": 1202
+  },
+  {
+    "questionId": "q84",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Andrew Kling",
+    "actual": "Andrew Kling",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 3,
+    "latencyMs": 1801
+  },
+  {
+    "questionId": "q84",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Andrew Kling",
+    "actual": "Andrew Kling",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 7,
+    "latencyMs": 1287
+  },
+  {
+    "questionId": "q84",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Andrew Kling",
+    "actual": "Andrew Kling",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 3,
+    "latencyMs": 1340
+  },
+  {
+    "questionId": "q84",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Andrew Kling",
+    "actual": "Andrew Kling",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 7,
+    "latencyMs": 1163
+  },
+  {
+    "questionId": "q84",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Andrew Kling",
+    "actual": "Andrew Kling",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 3,
+    "latencyMs": 2685
+  },
+  {
+    "questionId": "q84",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Andrew Kling",
+    "actual": "Andrew Kling",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 7,
+    "latencyMs": 1397
+  },
+  {
+    "questionId": "q84",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Andrew Kling",
+    "actual": "Andrew Kling",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 3,
+    "latencyMs": 1289
+  },
+  {
+    "questionId": "q84",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Andrew Kling",
+    "actual": "Andrew Kling",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 7,
+    "latencyMs": 1155
+  },
+  {
+    "questionId": "q85",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Morris O'Hara",
+    "actual": "Morris O'Hara",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 6,
+    "latencyMs": 1601
+  },
+  {
+    "questionId": "q85",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Morris O'Hara",
+    "actual": "Morris O'Hara",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 9,
+    "latencyMs": 1340
+  },
+  {
+    "questionId": "q85",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Morris O'Hara",
+    "actual": "Morris O'Hara",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 6,
+    "latencyMs": 3525
+  },
+  {
+    "questionId": "q85",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Morris O'Hara",
+    "actual": "Morris O'Hara",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 9,
+    "latencyMs": 1710
+  },
+  {
+    "questionId": "q85",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Morris O'Hara",
+    "actual": "Morris O'Hara",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 6,
+    "latencyMs": 2333
+  },
+  {
+    "questionId": "q85",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Morris O'Hara",
+    "actual": "Morris O'Hara",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 9,
+    "latencyMs": 1168
+  },
+  {
+    "questionId": "q85",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Morris O'Hara",
+    "actual": "Morris O'Hara",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 6,
+    "latencyMs": 1781
+  },
+  {
+    "questionId": "q85",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Morris O'Hara",
+    "actual": "Morris O'Hara",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 9,
+    "latencyMs": 1552
+  },
+  {
+    "questionId": "q85",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Morris O'Hara",
+    "actual": "Morris O'Hara",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 6,
+    "latencyMs": 1584
+  },
+  {
+    "questionId": "q85",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Morris O'Hara",
+    "actual": "Morris O'Hara",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 9,
+    "latencyMs": 1548
+  },
+  {
+    "questionId": "q86",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Elijah Franecki",
+    "actual": "Elijah Franecki",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 6,
+    "latencyMs": 7230
+  },
+  {
+    "questionId": "q86",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Elijah Franecki",
+    "actual": "Elijah Franecki",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 9,
+    "latencyMs": 1933
+  },
+  {
+    "questionId": "q86",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Elijah Franecki",
+    "actual": "Elijah Franecki",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 6,
+    "latencyMs": 1067
+  },
+  {
+    "questionId": "q86",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Elijah Franecki",
+    "actual": "Elijah Franecki",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 9,
+    "latencyMs": 1288
+  },
+  {
+    "questionId": "q86",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Elijah Franecki",
+    "actual": "Elijah Franecki",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 6,
+    "latencyMs": 3954
+  },
+  {
+    "questionId": "q86",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Elijah Franecki",
+    "actual": "Elijah Franecki",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 9,
+    "latencyMs": 1314
+  },
+  {
+    "questionId": "q86",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Elijah Franecki",
+    "actual": "Elijah Franecki",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 6,
+    "latencyMs": 1334
+  },
+  {
+    "questionId": "q86",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Elijah Franecki",
+    "actual": "Elijah Franecki",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 9,
+    "latencyMs": 2441
+  },
+  {
+    "questionId": "q86",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Elijah Franecki",
+    "actual": "Elijah Franecki",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 6,
+    "latencyMs": 1650
+  },
+  {
+    "questionId": "q86",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Elijah Franecki",
+    "actual": "Elijah Franecki",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 9,
+    "latencyMs": 1495
+  },
+  {
+    "questionId": "q87",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Malcolm Erdman",
+    "actual": "Malcolm Erdman",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 5,
+    "latencyMs": 1262
+  },
+  {
+    "questionId": "q87",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Malcolm Erdman",
+    "actual": "Malcolm Erdman",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 7,
+    "latencyMs": 1367
+  },
+  {
+    "questionId": "q87",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Malcolm Erdman",
+    "actual": "Malcolm Erdman",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 5,
+    "latencyMs": 1385
+  },
+  {
+    "questionId": "q87",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Malcolm Erdman",
+    "actual": "Malcolm Erdman",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 7,
+    "latencyMs": 1313
+  },
+  {
+    "questionId": "q87",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Malcolm Erdman",
+    "actual": "Malcolm Erdman",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 5,
+    "latencyMs": 1141
+  },
+  {
+    "questionId": "q87",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Malcolm Erdman",
+    "actual": "Malcolm Erdman",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 7,
+    "latencyMs": 1300
+  },
+  {
+    "questionId": "q87",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Malcolm Erdman",
+    "actual": "Malcolm Erdman",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 5,
+    "latencyMs": 3347
+  },
+  {
+    "questionId": "q87",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Malcolm Erdman",
+    "actual": "Malcolm Erdman",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 7,
+    "latencyMs": 1457
+  },
+  {
+    "questionId": "q87",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Malcolm Erdman",
+    "actual": "Malcolm Erdman",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 5,
+    "latencyMs": 1276
+  },
+  {
+    "questionId": "q87",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Malcolm Erdman",
+    "actual": "Malcolm Erdman",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 7,
+    "latencyMs": 1211
+  },
+  {
+    "questionId": "q88",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Fannie Skiles",
+    "actual": "Fannie Skiles",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 5,
+    "latencyMs": 1635
+  },
+  {
+    "questionId": "q88",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Fannie Skiles",
+    "actual": "Fannie Skiles",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 9,
+    "latencyMs": 1582
+  },
+  {
+    "questionId": "q88",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Fannie Skiles",
+    "actual": "Fannie Skiles",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 5,
+    "latencyMs": 1695
+  },
+  {
+    "questionId": "q88",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Fannie Skiles",
+    "actual": "Fannie Skiles",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 9,
+    "latencyMs": 1318
+  },
+  {
+    "questionId": "q88",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Fannie Skiles",
+    "actual": "Fannie Skiles",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 5,
+    "latencyMs": 936
+  },
+  {
+    "questionId": "q88",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Fannie Skiles",
+    "actual": "Fannie Skiles",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 9,
+    "latencyMs": 1204
+  },
+  {
+    "questionId": "q88",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Fannie Skiles",
+    "actual": "Fannie Skiles",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 5,
+    "latencyMs": 996
+  },
+  {
+    "questionId": "q88",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Fannie Skiles",
+    "actual": "Fannie Skiles",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 9,
+    "latencyMs": 1261
+  },
+  {
+    "questionId": "q88",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Fannie Skiles",
+    "actual": "Fannie Skiles",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 5,
+    "latencyMs": 2276
+  },
+  {
+    "questionId": "q88",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Fannie Skiles",
+    "actual": "Fannie Skiles",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 9,
+    "latencyMs": 1380
+  },
+  {
+    "questionId": "q89",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Sonja Emmerich",
+    "actual": "Sonja Emmerich",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 6,
+    "latencyMs": 1451
+  },
+  {
+    "questionId": "q89",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Sonja Emmerich",
+    "actual": "Sonja Emmerich",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 10,
+    "latencyMs": 1977
+  },
+  {
+    "questionId": "q89",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Sonja Emmerich",
+    "actual": "Sonja Emmerich",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 6,
+    "latencyMs": 1376
+  },
+  {
+    "questionId": "q89",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Sonja Emmerich",
+    "actual": "Sonja Emmerich",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 10,
+    "latencyMs": 1250
+  },
+  {
+    "questionId": "q89",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Sonja Emmerich",
+    "actual": "Sonja Emmerich",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 6,
+    "latencyMs": 1273
+  },
+  {
+    "questionId": "q89",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Sonja Emmerich",
+    "actual": "Sonja Emmerich",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 10,
+    "latencyMs": 1359
+  },
+  {
+    "questionId": "q89",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Sonja Emmerich",
+    "actual": "Sonja Emmerich",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 6,
+    "latencyMs": 1791
+  },
+  {
+    "questionId": "q89",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Sonja Emmerich",
+    "actual": "Sonja Emmerich",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 10,
+    "latencyMs": 1273
+  },
+  {
+    "questionId": "q89",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Sonja Emmerich",
+    "actual": "Sonja Emmerich",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 6,
+    "latencyMs": 2832
+  },
+  {
+    "questionId": "q89",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Sonja Emmerich",
+    "actual": "Sonja Emmerich",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 10,
+    "latencyMs": 1172
+  },
+  {
+    "questionId": "q90",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Frank Emmerich DVM",
+    "actual": "Frank Emmerich DVM",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 7,
+    "latencyMs": 1491
+  },
+  {
+    "questionId": "q90",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Frank Emmerich DVM",
+    "actual": "Frank Emmerich DVM",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 10,
+    "latencyMs": 1414
+  },
+  {
+    "questionId": "q90",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Frank Emmerich DVM",
+    "actual": "Frank Emmerich DVM",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 7,
+    "latencyMs": 1396
+  },
+  {
+    "questionId": "q90",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Frank Emmerich DVM",
+    "actual": "Frank Emmerich DVM",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 10,
+    "latencyMs": 1514
+  },
+  {
+    "questionId": "q90",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Frank Emmerich DVM",
+    "actual": "Frank Emmerich DVM",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 7,
+    "latencyMs": 1573
+  },
+  {
+    "questionId": "q90",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Frank Emmerich DVM",
+    "actual": "Frank Emmerich DVM",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 10,
+    "latencyMs": 1284
+  },
+  {
+    "questionId": "q90",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Frank Emmerich DVM",
+    "actual": "Frank Emmerich DVM",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 7,
+    "latencyMs": 5400
+  },
+  {
+    "questionId": "q90",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Frank Emmerich DVM",
+    "actual": "Frank Emmerich DVM",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 10,
+    "latencyMs": 1486
+  },
+  {
+    "questionId": "q90",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Frank Emmerich DVM",
+    "actual": "Frank Emmerich DVM",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 7,
+    "latencyMs": 1420
+  },
+  {
+    "questionId": "q90",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Frank Emmerich DVM",
+    "actual": "Frank Emmerich DVM",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 10,
+    "latencyMs": 1410
+  },
+  {
+    "questionId": "q91",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Ronald Collins",
+    "actual": "Ronald Collins",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 4,
+    "latencyMs": 1248
+  },
+  {
+    "questionId": "q91",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Ronald Collins",
+    "actual": "Ronald Collins",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 5,
+    "latencyMs": 1177
+  },
+  {
+    "questionId": "q91",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Ronald Collins",
+    "actual": "Ronald Collins",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 4,
+    "latencyMs": 1601
+  },
+  {
+    "questionId": "q91",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Ronald Collins",
+    "actual": "Ronald Collins",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 5,
+    "latencyMs": 1822
+  },
+  {
+    "questionId": "q91",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Ronald Collins",
+    "actual": "Ronald Collins",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 4,
+    "latencyMs": 1103
+  },
+  {
+    "questionId": "q91",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Ronald Collins",
+    "actual": "Ronald Collins",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 5,
+    "latencyMs": 1247
+  },
+  {
+    "questionId": "q91",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Ronald Collins",
+    "actual": "Ronald Collins",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 4,
+    "latencyMs": 1184
+  },
+  {
+    "questionId": "q91",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Ronald Collins",
+    "actual": "Ronald Collins",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 5,
+    "latencyMs": 1137
+  },
+  {
+    "questionId": "q91",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Ronald Collins",
+    "actual": "Ronald Collins",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 4,
+    "latencyMs": 949
+  },
+  {
+    "questionId": "q91",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Ronald Collins",
+    "actual": "Ronald Collins",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 5,
+    "latencyMs": 1143
+  },
+  {
+    "questionId": "q92",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Jeannie Klein",
+    "actual": "Jeannie Klein",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 4,
+    "latencyMs": 1021
+  },
+  {
+    "questionId": "q92",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Jeannie Klein",
+    "actual": "Jeannie Klein",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 8,
+    "latencyMs": 1301
+  },
+  {
+    "questionId": "q92",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Jeannie Klein",
+    "actual": "Jeannie Klein",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 4,
+    "latencyMs": 1254
+  },
+  {
+    "questionId": "q92",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Jeannie Klein",
+    "actual": "Jeannie Klein",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 8,
+    "latencyMs": 1375
+  },
+  {
+    "questionId": "q92",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Jeannie Klein",
+    "actual": "Jeannie Klein",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 4,
+    "latencyMs": 1316
+  },
+  {
+    "questionId": "q92",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Jeannie Klein",
+    "actual": "Jeannie Klein",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 8,
+    "latencyMs": 2681
+  },
+  {
+    "questionId": "q92",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Jeannie Klein",
+    "actual": "Jeannie Klein",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 4,
+    "latencyMs": 2427
+  },
+  {
+    "questionId": "q92",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Jeannie Klein",
+    "actual": "Jeannie Klein",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 8,
+    "latencyMs": 1526
+  },
+  {
+    "questionId": "q92",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Jeannie Klein",
+    "actual": "Jeannie Klein",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 4,
+    "latencyMs": 1252
+  },
+  {
+    "questionId": "q92",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Jeannie Klein",
+    "actual": "Jeannie Klein",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 8,
+    "latencyMs": 1324
+  },
+  {
+    "questionId": "q93",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "Joshua Watsica",
+    "actual": "Joshua Watsica",
+    "correct": true,
+    "inputTokens": 9740,
+    "outputTokens": 5,
+    "latencyMs": 1606
+  },
+  {
+    "questionId": "q93",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "Joshua Watsica",
+    "actual": "Joshua Watsica",
+    "correct": true,
+    "inputTokens": 11907,
+    "outputTokens": 8,
+    "latencyMs": 1223
+  },
+  {
+    "questionId": "q93",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "Joshua Watsica",
+    "actual": "Joshua Watsica",
+    "correct": true,
+    "inputTokens": 6014,
+    "outputTokens": 5,
+    "latencyMs": 1965
+  },
+  {
+    "questionId": "q93",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "Joshua Watsica",
+    "actual": "Joshua Watsica",
+    "correct": true,
+    "inputTokens": 6993,
+    "outputTokens": 8,
+    "latencyMs": 1300
+  },
+  {
+    "questionId": "q93",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "Joshua Watsica",
+    "actual": "Joshua Watsica",
+    "correct": true,
+    "inputTokens": 6782,
+    "outputTokens": 5,
+    "latencyMs": 1110
+  },
+  {
+    "questionId": "q93",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "Joshua Watsica",
+    "actual": "Joshua Watsica",
+    "correct": true,
+    "inputTokens": 8414,
+    "outputTokens": 8,
+    "latencyMs": 1819
+  },
+  {
+    "questionId": "q93",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "Joshua Watsica",
+    "actual": "Joshua Watsica",
+    "correct": true,
+    "inputTokens": 9159,
+    "outputTokens": 5,
+    "latencyMs": 1010
+  },
+  {
+    "questionId": "q93",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "Joshua Watsica",
+    "actual": "Joshua Watsica",
+    "correct": true,
+    "inputTokens": 9289,
+    "outputTokens": 8,
+    "latencyMs": 1224
+  },
+  {
+    "questionId": "q93",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "Joshua Watsica",
+    "actual": "Joshua Watsica",
+    "correct": true,
+    "inputTokens": 7374,
+    "outputTokens": 5,
+    "latencyMs": 1430
+  },
+  {
+    "questionId": "q93",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "Joshua Watsica",
+    "actual": "Joshua Watsica",
+    "correct": true,
+    "inputTokens": 8385,
+    "outputTokens": 8,
+    "latencyMs": 1158
+  },
+  {
+    "questionId": "q94",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 9736,
+    "outputTokens": 2,
+    "latencyMs": 1352
+  },
+  {
+    "questionId": "q94",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 11902,
+    "outputTokens": 5,
+    "latencyMs": 1498
+  },
+  {
+    "questionId": "q94",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "12",
+    "correct": false,
+    "inputTokens": 6010,
+    "outputTokens": 2,
+    "latencyMs": 1249
+  },
+  {
+    "questionId": "q94",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 6988,
+    "outputTokens": 5,
+    "latencyMs": 1080
+  },
+  {
+    "questionId": "q94",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "12",
+    "correct": false,
+    "inputTokens": 6778,
+    "outputTokens": 2,
+    "latencyMs": 1760
+  },
+  {
+    "questionId": "q94",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 8409,
+    "outputTokens": 5,
+    "latencyMs": 1156
+  },
+  {
+    "questionId": "q94",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 9155,
+    "outputTokens": 2,
+    "latencyMs": 9923
+  },
+  {
+    "questionId": "q94",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 9284,
+    "outputTokens": 5,
+    "latencyMs": 1138
+  },
+  {
+    "questionId": "q94",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "12",
+    "correct": false,
+    "inputTokens": 7370,
+    "outputTokens": 2,
+    "latencyMs": 1070
+  },
+  {
+    "questionId": "q94",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 8380,
+    "outputTokens": 5,
+    "latencyMs": 1114
+  },
+  {
+    "questionId": "q95",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 9736,
+    "outputTokens": 2,
+    "latencyMs": 830
+  },
+  {
+    "questionId": "q95",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 11902,
+    "outputTokens": 5,
+    "latencyMs": 1085
+  },
+  {
+    "questionId": "q95",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 6010,
+    "outputTokens": 2,
+    "latencyMs": 2362
+  },
+  {
+    "questionId": "q95",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "7",
+    "correct": false,
+    "inputTokens": 6988,
+    "outputTokens": 5,
+    "latencyMs": 1198
+  },
+  {
+    "questionId": "q95",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 6778,
+    "outputTokens": 2,
+    "latencyMs": 1630
+  },
+  {
+    "questionId": "q95",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 8409,
+    "outputTokens": 5,
+    "latencyMs": 1219
+  },
+  {
+    "questionId": "q95",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 9155,
+    "outputTokens": 2,
+    "latencyMs": 2666
+  },
+  {
+    "questionId": "q95",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 9284,
+    "outputTokens": 5,
+    "latencyMs": 1044
+  },
+  {
+    "questionId": "q95",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "12",
+    "correct": false,
+    "inputTokens": 7370,
+    "outputTokens": 2,
+    "latencyMs": 2187
+  },
+  {
+    "questionId": "q95",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 8380,
+    "outputTokens": 5,
+    "latencyMs": 1313
+  },
+  {
+    "questionId": "q96",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 9737,
+    "outputTokens": 2,
+    "latencyMs": 1087
+  },
+  {
+    "questionId": "q96",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 11902,
+    "outputTokens": 5,
+    "latencyMs": 1292
+  },
+  {
+    "questionId": "q96",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 6011,
+    "outputTokens": 2,
+    "latencyMs": 1979
+  },
+  {
+    "questionId": "q96",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "7",
+    "correct": false,
+    "inputTokens": 6988,
+    "outputTokens": 5,
+    "latencyMs": 1095
+  },
+  {
+    "questionId": "q96",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 6779,
+    "outputTokens": 2,
+    "latencyMs": 1385
+  },
+  {
+    "questionId": "q96",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 8409,
+    "outputTokens": 5,
+    "latencyMs": 1507
+  },
+  {
+    "questionId": "q96",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 9156,
+    "outputTokens": 2,
+    "latencyMs": 1579
+  },
+  {
+    "questionId": "q96",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 9284,
+    "outputTokens": 5,
+    "latencyMs": 1365
+  },
+  {
+    "questionId": "q96",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 7371,
+    "outputTokens": 2,
+    "latencyMs": 1661
+  },
+  {
+    "questionId": "q96",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "7",
+    "correct": false,
+    "inputTokens": 8380,
+    "outputTokens": 5,
+    "latencyMs": 1423
+  },
+  {
+    "questionId": "q97",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 9737,
+    "outputTokens": 2,
+    "latencyMs": 1815
+  },
+  {
+    "questionId": "q97",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 11902,
+    "outputTokens": 5,
+    "latencyMs": 1345
+  },
+  {
+    "questionId": "q97",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 6011,
+    "outputTokens": 2,
+    "latencyMs": 2193
+  },
+  {
+    "questionId": "q97",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 6988,
+    "outputTokens": 5,
+    "latencyMs": 1417
+  },
+  {
+    "questionId": "q97",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 6779,
+    "outputTokens": 2,
+    "latencyMs": 1721
+  },
+  {
+    "questionId": "q97",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 8409,
+    "outputTokens": 5,
+    "latencyMs": 1114
+  },
+  {
+    "questionId": "q97",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 9156,
+    "outputTokens": 2,
+    "latencyMs": 2208
+  },
+  {
+    "questionId": "q97",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 9284,
+    "outputTokens": 5,
+    "latencyMs": 1895
+  },
+  {
+    "questionId": "q97",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 7371,
+    "outputTokens": 2,
+    "latencyMs": 1287
+  },
+  {
+    "questionId": "q97",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 8380,
+    "outputTokens": 5,
+    "latencyMs": 1281
+  },
+  {
+    "questionId": "q98",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 9737,
+    "outputTokens": 2,
+    "latencyMs": 1387
+  },
+  {
+    "questionId": "q98",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 11902,
+    "outputTokens": 5,
+    "latencyMs": 1243
+  },
+  {
+    "questionId": "q98",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 6011,
+    "outputTokens": 2,
+    "latencyMs": 1284
+  },
+  {
+    "questionId": "q98",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 6988,
+    "outputTokens": 5,
+    "latencyMs": 1161
+  },
+  {
+    "questionId": "q98",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 6779,
+    "outputTokens": 2,
+    "latencyMs": 10406
+  },
+  {
+    "questionId": "q98",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 8409,
+    "outputTokens": 5,
+    "latencyMs": 1335
+  },
+  {
+    "questionId": "q98",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 9156,
+    "outputTokens": 2,
+    "latencyMs": 1517
+  },
+  {
+    "questionId": "q98",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 9284,
+    "outputTokens": 5,
+    "latencyMs": 1702
+  },
+  {
+    "questionId": "q98",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "10",
+    "actual": "10",
+    "correct": true,
+    "inputTokens": 7371,
+    "outputTokens": 2,
+    "latencyMs": 1676
+  },
+  {
+    "questionId": "q98",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "10",
+    "actual": "8",
+    "correct": false,
+    "inputTokens": 8380,
+    "outputTokens": 5,
+    "latencyMs": 1218
+  },
+  {
+    "questionId": "q99",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "42342.25",
+    "actual": "$50,000.00",
+    "correct": false,
+    "inputTokens": 9737,
+    "outputTokens": 7,
+    "latencyMs": 1407
+  },
+  {
+    "questionId": "q99",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "42342.25",
+    "actual": "50,847.47",
+    "correct": false,
+    "inputTokens": 11902,
+    "outputTokens": 9,
+    "latencyMs": 1443
+  },
+  {
+    "questionId": "q99",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "42342.25",
+    "actual": "Total revenue across all orders is 42,195.36.",
+    "correct": false,
+    "inputTokens": 6011,
+    "outputTokens": 14,
+    "latencyMs": 1150
+  },
+  {
+    "questionId": "q99",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "42342.25",
+    "actual": "41,847.47",
+    "correct": false,
+    "inputTokens": 6988,
+    "outputTokens": 9,
+    "latencyMs": 1774
+  },
+  {
+    "questionId": "q99",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "42342.25",
+    "actual": "$32,186.73",
+    "correct": false,
+    "inputTokens": 6779,
+    "outputTokens": 7,
+    "latencyMs": 2654
+  },
+  {
+    "questionId": "q99",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "42342.25",
+    "actual": "48,847.47",
+    "correct": false,
+    "inputTokens": 8409,
+    "outputTokens": 9,
+    "latencyMs": 1386
+  },
+  {
+    "questionId": "q99",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "42342.25",
+    "actual": "$34,186.73",
+    "correct": false,
+    "inputTokens": 9156,
+    "outputTokens": 7,
+    "latencyMs": 1506
+  },
+  {
+    "questionId": "q99",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "42342.25",
+    "actual": "48,847.47",
+    "correct": false,
+    "inputTokens": 9284,
+    "outputTokens": 9,
+    "latencyMs": 1509
+  },
+  {
+    "questionId": "q99",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "42342.25",
+    "actual": "Total revenue across all orders is 48780.73.",
+    "correct": false,
+    "inputTokens": 7371,
+    "outputTokens": 13,
+    "latencyMs": 1700
+  },
+  {
+    "questionId": "q99",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "42342.25",
+    "actual": "47,847.47",
+    "correct": false,
+    "inputTokens": 8380,
+    "outputTokens": 9,
+    "latencyMs": 1230
+  },
+  {
+    "questionId": "q100",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "44",
+    "actual": "36",
+    "correct": false,
+    "inputTokens": 9739,
+    "outputTokens": 2,
+    "latencyMs": 1725
+  },
+  {
+    "questionId": "q100",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "44",
+    "actual": "48",
+    "correct": false,
+    "inputTokens": 11904,
+    "outputTokens": 5,
+    "latencyMs": 1377
+  },
+  {
+    "questionId": "q100",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "44",
+    "actual": "34",
+    "correct": false,
+    "inputTokens": 6013,
+    "outputTokens": 2,
+    "latencyMs": 1399
+  },
+  {
+    "questionId": "q100",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "44",
+    "actual": "47",
+    "correct": false,
+    "inputTokens": 6990,
+    "outputTokens": 5,
+    "latencyMs": 1094
+  },
+  {
+    "questionId": "q100",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "44",
+    "actual": "34",
+    "correct": false,
+    "inputTokens": 6781,
+    "outputTokens": 2,
+    "latencyMs": 1617
+  },
+  {
+    "questionId": "q100",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "44",
+    "actual": "47",
+    "correct": false,
+    "inputTokens": 8411,
+    "outputTokens": 5,
+    "latencyMs": 1344
+  },
+  {
+    "questionId": "q100",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "44",
+    "actual": "36",
+    "correct": false,
+    "inputTokens": 9158,
+    "outputTokens": 2,
+    "latencyMs": 2396
+  },
+  {
+    "questionId": "q100",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "44",
+    "actual": "48",
+    "correct": false,
+    "inputTokens": 9286,
+    "outputTokens": 5,
+    "latencyMs": 1145
+  },
+  {
+    "questionId": "q100",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "44",
+    "actual": "36",
+    "correct": false,
+    "inputTokens": 7373,
+    "outputTokens": 2,
+    "latencyMs": 951
+  },
+  {
+    "questionId": "q100",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "44",
+    "actual": "45",
+    "correct": false,
+    "inputTokens": 8382,
+    "outputTokens": 5,
+    "latencyMs": 1311
+  },
+  {
+    "questionId": "q101",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "39",
+    "actual": "34",
+    "correct": false,
+    "inputTokens": 9739,
+    "outputTokens": 2,
+    "latencyMs": 866
+  },
+  {
+    "questionId": "q101",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "39",
+    "actual": "38",
+    "correct": false,
+    "inputTokens": 11904,
+    "outputTokens": 5,
+    "latencyMs": 1964
+  },
+  {
+    "questionId": "q101",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "39",
+    "actual": "30",
+    "correct": false,
+    "inputTokens": 6013,
+    "outputTokens": 2,
+    "latencyMs": 1994
+  },
+  {
+    "questionId": "q101",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "39",
+    "actual": "38",
+    "correct": false,
+    "inputTokens": 6990,
+    "outputTokens": 5,
+    "latencyMs": 1277
+  },
+  {
+    "questionId": "q101",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "39",
+    "actual": "32",
+    "correct": false,
+    "inputTokens": 6781,
+    "outputTokens": 2,
+    "latencyMs": 1884
+  },
+  {
+    "questionId": "q101",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "39",
+    "actual": "38",
+    "correct": false,
+    "inputTokens": 8411,
+    "outputTokens": 5,
+    "latencyMs": 1282
+  },
+  {
+    "questionId": "q101",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "39",
+    "actual": "32",
+    "correct": false,
+    "inputTokens": 9158,
+    "outputTokens": 2,
+    "latencyMs": 1761
+  },
+  {
+    "questionId": "q101",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "39",
+    "actual": "38",
+    "correct": false,
+    "inputTokens": 9286,
+    "outputTokens": 5,
+    "latencyMs": 1250
+  },
+  {
+    "questionId": "q101",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "39",
+    "actual": "32",
+    "correct": false,
+    "inputTokens": 7373,
+    "outputTokens": 2,
+    "latencyMs": 1316
+  },
+  {
+    "questionId": "q101",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "39",
+    "actual": "38",
+    "correct": false,
+    "inputTokens": 8382,
+    "outputTokens": 5,
+    "latencyMs": 1373
+  },
+  {
+    "questionId": "q102",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "32",
+    "actual": "27",
+    "correct": false,
+    "inputTokens": 9739,
+    "outputTokens": 2,
+    "latencyMs": 1389
+  },
+  {
+    "questionId": "q102",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "32",
+    "actual": "28",
+    "correct": false,
+    "inputTokens": 11904,
+    "outputTokens": 5,
+    "latencyMs": 1215
+  },
+  {
+    "questionId": "q102",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "32",
+    "actual": "24",
+    "correct": false,
+    "inputTokens": 6013,
+    "outputTokens": 2,
+    "latencyMs": 1034
+  },
+  {
+    "questionId": "q102",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "32",
+    "actual": "26",
+    "correct": false,
+    "inputTokens": 6990,
+    "outputTokens": 5,
+    "latencyMs": 1063
+  },
+  {
+    "questionId": "q102",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "32",
+    "actual": "25",
+    "correct": false,
+    "inputTokens": 6781,
+    "outputTokens": 2,
+    "latencyMs": 7312
+  },
+  {
+    "questionId": "q102",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "32",
+    "actual": "28",
+    "correct": false,
+    "inputTokens": 8411,
+    "outputTokens": 5,
+    "latencyMs": 1387
+  },
+  {
+    "questionId": "q102",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "32",
+    "actual": "27",
+    "correct": false,
+    "inputTokens": 9158,
+    "outputTokens": 2,
+    "latencyMs": 1488
+  },
+  {
+    "questionId": "q102",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "32",
+    "actual": "28",
+    "correct": false,
+    "inputTokens": 9286,
+    "outputTokens": 5,
+    "latencyMs": 1268
+  },
+  {
+    "questionId": "q102",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "32",
+    "actual": "27",
+    "correct": false,
+    "inputTokens": 7373,
+    "outputTokens": 2,
+    "latencyMs": 1274
+  },
+  {
+    "questionId": "q102",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "32",
+    "actual": "26",
+    "correct": false,
+    "inputTokens": 8382,
+    "outputTokens": 5,
+    "latencyMs": 1354
+  },
+  {
+    "questionId": "q103",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "6975",
+    "actual": "6975",
+    "correct": true,
+    "inputTokens": 3713,
+    "outputTokens": 3,
+    "latencyMs": 1330
+  },
+  {
+    "questionId": "q103",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "6975",
+    "actual": "6975",
+    "correct": true,
+    "inputTokens": 4080,
+    "outputTokens": 6,
+    "latencyMs": 1437
+  },
+  {
+    "questionId": "q103",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "6975",
+    "actual": "6975",
+    "correct": true,
+    "inputTokens": 1564,
+    "outputTokens": 3,
+    "latencyMs": 1341
+  },
+  {
+    "questionId": "q103",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "6975",
+    "actual": "6975",
+    "correct": true,
+    "inputTokens": 1509,
+    "outputTokens": 6,
+    "latencyMs": 1231
+  },
+  {
+    "questionId": "q103",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "6975",
+    "actual": "6975",
+    "correct": true,
+    "inputTokens": 1442,
+    "outputTokens": 3,
+    "latencyMs": 2515
+  },
+  {
+    "questionId": "q103",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "6975",
+    "actual": "6975",
+    "correct": true,
+    "inputTokens": 1445,
+    "outputTokens": 6,
+    "latencyMs": 1162
+  },
+  {
+    "questionId": "q103",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "6975",
+    "actual": "6975",
+    "correct": true,
+    "inputTokens": 3830,
+    "outputTokens": 3,
+    "latencyMs": 868
+  },
+  {
+    "questionId": "q103",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "6975",
+    "actual": "6975",
+    "correct": true,
+    "inputTokens": 3415,
+    "outputTokens": 6,
+    "latencyMs": 1149
+  },
+  {
+    "questionId": "q103",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "6975",
+    "actual": "6975",
+    "correct": true,
+    "inputTokens": 2986,
+    "outputTokens": 3,
+    "latencyMs": 1183
+  },
+  {
+    "questionId": "q103",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "6975",
+    "actual": "6975",
+    "correct": true,
+    "inputTokens": 3110,
+    "outputTokens": 6,
+    "latencyMs": 1119
+  },
+  {
+    "questionId": "q104",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "6686.23",
+    "actual": "6686.23",
+    "correct": true,
+    "inputTokens": 3712,
+    "outputTokens": 5,
+    "latencyMs": 1273
+  },
+  {
+    "questionId": "q104",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "6686.23",
+    "actual": "6686.23",
+    "correct": true,
+    "inputTokens": 4079,
+    "outputTokens": 8,
+    "latencyMs": 1371
+  },
+  {
+    "questionId": "q104",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "6686.23",
+    "actual": "6686.23",
+    "correct": true,
+    "inputTokens": 1563,
+    "outputTokens": 5,
+    "latencyMs": 2052
+  },
+  {
+    "questionId": "q104",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "6686.23",
+    "actual": "6686.23",
+    "correct": true,
+    "inputTokens": 1508,
+    "outputTokens": 8,
+    "latencyMs": 997
+  },
+  {
+    "questionId": "q104",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "6686.23",
+    "actual": "6686.23",
+    "correct": true,
+    "inputTokens": 1441,
+    "outputTokens": 5,
+    "latencyMs": 1152
+  },
+  {
+    "questionId": "q104",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "6686.23",
+    "actual": "6686.23",
+    "correct": true,
+    "inputTokens": 1444,
+    "outputTokens": 8,
+    "latencyMs": 1188
+  },
+  {
+    "questionId": "q104",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "6686.23",
+    "actual": "6686.23",
+    "correct": true,
+    "inputTokens": 3829,
+    "outputTokens": 5,
+    "latencyMs": 1259
+  },
+  {
+    "questionId": "q104",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "6686.23",
+    "actual": "6686.23",
+    "correct": true,
+    "inputTokens": 3414,
+    "outputTokens": 8,
+    "latencyMs": 1239
+  },
+  {
+    "questionId": "q104",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "6686.23",
+    "actual": "6686.23",
+    "correct": true,
+    "inputTokens": 2985,
+    "outputTokens": 5,
+    "latencyMs": 1096
+  },
+  {
+    "questionId": "q104",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "6686.23",
+    "actual": "6686.23",
+    "correct": true,
+    "inputTokens": 3109,
+    "outputTokens": 8,
+    "latencyMs": 1247
+  },
+  {
+    "questionId": "q105",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "7500",
+    "actual": "7500",
+    "correct": true,
+    "inputTokens": 3713,
+    "outputTokens": 3,
+    "latencyMs": 1354
+  },
+  {
+    "questionId": "q105",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "7500",
+    "actual": "7500",
+    "correct": true,
+    "inputTokens": 4080,
+    "outputTokens": 6,
+    "latencyMs": 1083
+  },
+  {
+    "questionId": "q105",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "7500",
+    "actual": "7500",
+    "correct": true,
+    "inputTokens": 1564,
+    "outputTokens": 3,
+    "latencyMs": 869
+  },
+  {
+    "questionId": "q105",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "7500",
+    "actual": "7500",
+    "correct": true,
+    "inputTokens": 1509,
+    "outputTokens": 6,
+    "latencyMs": 1051
+  },
+  {
+    "questionId": "q105",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "7500",
+    "actual": "7500",
+    "correct": true,
+    "inputTokens": 1442,
+    "outputTokens": 3,
+    "latencyMs": 1528
+  },
+  {
+    "questionId": "q105",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "7500",
+    "actual": "7500",
+    "correct": true,
+    "inputTokens": 1445,
+    "outputTokens": 6,
+    "latencyMs": 1126
+  },
+  {
+    "questionId": "q105",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "7500",
+    "actual": "7500",
+    "correct": true,
+    "inputTokens": 3830,
+    "outputTokens": 3,
+    "latencyMs": 1136
+  },
+  {
+    "questionId": "q105",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "7500",
+    "actual": "7500",
+    "correct": true,
+    "inputTokens": 3415,
+    "outputTokens": 6,
+    "latencyMs": 1121
+  },
+  {
+    "questionId": "q105",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "7500",
+    "actual": "7500",
+    "correct": true,
+    "inputTokens": 2986,
+    "outputTokens": 3,
+    "latencyMs": 1217
+  },
+  {
+    "questionId": "q105",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "7500",
+    "actual": "7500",
+    "correct": true,
+    "inputTokens": 3110,
+    "outputTokens": 6,
+    "latencyMs": 1099
+  },
+  {
+    "questionId": "q106",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "14297.05",
+    "actual": "14297.05",
+    "correct": true,
+    "inputTokens": 3712,
+    "outputTokens": 5,
+    "latencyMs": 1416
+  },
+  {
+    "questionId": "q106",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "14297.05",
+    "actual": "14297.05",
+    "correct": true,
+    "inputTokens": 4079,
+    "outputTokens": 8,
+    "latencyMs": 1526
+  },
+  {
+    "questionId": "q106",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "14297.05",
+    "actual": "14297.05",
+    "correct": true,
+    "inputTokens": 1563,
+    "outputTokens": 5,
+    "latencyMs": 1350
+  },
+  {
+    "questionId": "q106",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "14297.05",
+    "actual": "14297.05",
+    "correct": true,
+    "inputTokens": 1508,
+    "outputTokens": 8,
+    "latencyMs": 1330
+  },
+  {
+    "questionId": "q106",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "14297.05",
+    "actual": "14297.05",
+    "correct": true,
+    "inputTokens": 1441,
+    "outputTokens": 5,
+    "latencyMs": 2337
+  },
+  {
+    "questionId": "q106",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "14297.05",
+    "actual": "14297.05",
+    "correct": true,
+    "inputTokens": 1444,
+    "outputTokens": 8,
+    "latencyMs": 1171
+  },
+  {
+    "questionId": "q106",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "14297.05",
+    "actual": "14297.05",
+    "correct": true,
+    "inputTokens": 3829,
+    "outputTokens": 5,
+    "latencyMs": 3128
+  },
+  {
+    "questionId": "q106",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "14297.05",
+    "actual": "14297.05",
+    "correct": true,
+    "inputTokens": 3414,
+    "outputTokens": 8,
+    "latencyMs": 1151
+  },
+  {
+    "questionId": "q106",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "14297.05",
+    "actual": "14297.05",
+    "correct": true,
+    "inputTokens": 2985,
+    "outputTokens": 5,
+    "latencyMs": 1988
+  },
+  {
+    "questionId": "q106",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "14297.05",
+    "actual": "14297.05",
+    "correct": true,
+    "inputTokens": 3109,
+    "outputTokens": 8,
+    "latencyMs": 1166
+  },
+  {
+    "questionId": "q107",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "6692",
+    "actual": "6692",
+    "correct": true,
+    "inputTokens": 3713,
+    "outputTokens": 3,
+    "latencyMs": 2217
+  },
+  {
+    "questionId": "q107",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "6692",
+    "actual": "6692",
+    "correct": true,
+    "inputTokens": 4080,
+    "outputTokens": 6,
+    "latencyMs": 1114
+  },
+  {
+    "questionId": "q107",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "6692",
+    "actual": "6692",
+    "correct": true,
+    "inputTokens": 1564,
+    "outputTokens": 3,
+    "latencyMs": 1360
+  },
+  {
+    "questionId": "q107",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "6692",
+    "actual": "6692",
+    "correct": true,
+    "inputTokens": 1509,
+    "outputTokens": 6,
+    "latencyMs": 1079
+  },
+  {
+    "questionId": "q107",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "6692",
+    "actual": "6692",
+    "correct": true,
+    "inputTokens": 1442,
+    "outputTokens": 3,
+    "latencyMs": 1951
+  },
+  {
+    "questionId": "q107",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "6692",
+    "actual": "6692",
+    "correct": true,
+    "inputTokens": 1445,
+    "outputTokens": 6,
+    "latencyMs": 1173
+  },
+  {
+    "questionId": "q107",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "6692",
+    "actual": "6692",
+    "correct": true,
+    "inputTokens": 3830,
+    "outputTokens": 3,
+    "latencyMs": 1076
+  },
+  {
+    "questionId": "q107",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "6692",
+    "actual": "6692",
+    "correct": true,
+    "inputTokens": 3415,
+    "outputTokens": 6,
+    "latencyMs": 1098
+  },
+  {
+    "questionId": "q107",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "6692",
+    "actual": "6692",
+    "correct": true,
+    "inputTokens": 2986,
+    "outputTokens": 3,
+    "latencyMs": 1101
+  },
+  {
+    "questionId": "q107",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "6692",
+    "actual": "6692",
+    "correct": true,
+    "inputTokens": 3110,
+    "outputTokens": 6,
+    "latencyMs": 1254
+  },
+  {
+    "questionId": "q108",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "9302.76",
+    "actual": "9302.76",
+    "correct": true,
+    "inputTokens": 3712,
+    "outputTokens": 5,
+    "latencyMs": 2041
+  },
+  {
+    "questionId": "q108",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "9302.76",
+    "actual": "9302.76",
+    "correct": true,
+    "inputTokens": 4079,
+    "outputTokens": 8,
+    "latencyMs": 1405
+  },
+  {
+    "questionId": "q108",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "9302.76",
+    "actual": "9302.76",
+    "correct": true,
+    "inputTokens": 1563,
+    "outputTokens": 5,
+    "latencyMs": 1170
+  },
+  {
+    "questionId": "q108",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "9302.76",
+    "actual": "9302.76",
+    "correct": true,
+    "inputTokens": 1508,
+    "outputTokens": 8,
+    "latencyMs": 1161
+  },
+  {
+    "questionId": "q108",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "9302.76",
+    "actual": "9302.76",
+    "correct": true,
+    "inputTokens": 1441,
+    "outputTokens": 5,
+    "latencyMs": 1326
+  },
+  {
+    "questionId": "q108",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "9302.76",
+    "actual": "9302.76",
+    "correct": true,
+    "inputTokens": 1444,
+    "outputTokens": 8,
+    "latencyMs": 1259
+  },
+  {
+    "questionId": "q108",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "9302.76",
+    "actual": "9302.76",
+    "correct": true,
+    "inputTokens": 3829,
+    "outputTokens": 5,
+    "latencyMs": 3006
+  },
+  {
+    "questionId": "q108",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "9302.76",
+    "actual": "9302.76",
+    "correct": true,
+    "inputTokens": 3414,
+    "outputTokens": 8,
+    "latencyMs": 1461
+  },
+  {
+    "questionId": "q108",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "9302.76",
+    "actual": "9302.76",
+    "correct": true,
+    "inputTokens": 2985,
+    "outputTokens": 5,
+    "latencyMs": 3824
+  },
+  {
+    "questionId": "q108",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "9302.76",
+    "actual": "9302.76",
+    "correct": true,
+    "inputTokens": 3109,
+    "outputTokens": 8,
+    "latencyMs": 1391
+  },
+  {
+    "questionId": "q109",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "3285",
+    "actual": "3285",
+    "correct": true,
+    "inputTokens": 3713,
+    "outputTokens": 3,
+    "latencyMs": 1091
+  },
+  {
+    "questionId": "q109",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "3285",
+    "actual": "3285",
+    "correct": true,
+    "inputTokens": 4080,
+    "outputTokens": 6,
+    "latencyMs": 1188
+  },
+  {
+    "questionId": "q109",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "3285",
+    "actual": "3285",
+    "correct": true,
+    "inputTokens": 1564,
+    "outputTokens": 3,
+    "latencyMs": 1450
+  },
+  {
+    "questionId": "q109",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "3285",
+    "actual": "3285",
+    "correct": true,
+    "inputTokens": 1509,
+    "outputTokens": 6,
+    "latencyMs": 1614
+  },
+  {
+    "questionId": "q109",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "3285",
+    "actual": "3285",
+    "correct": true,
+    "inputTokens": 1442,
+    "outputTokens": 3,
+    "latencyMs": 1642
+  },
+  {
+    "questionId": "q109",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "3285",
+    "actual": "3285",
+    "correct": true,
+    "inputTokens": 1445,
+    "outputTokens": 6,
+    "latencyMs": 1311
+  },
+  {
+    "questionId": "q109",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "3285",
+    "actual": "3285",
+    "correct": true,
+    "inputTokens": 3830,
+    "outputTokens": 3,
+    "latencyMs": 1201
+  },
+  {
+    "questionId": "q109",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "3285",
+    "actual": "3285",
+    "correct": true,
+    "inputTokens": 3415,
+    "outputTokens": 6,
+    "latencyMs": 1261
+  },
+  {
+    "questionId": "q109",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "3285",
+    "actual": "3285",
+    "correct": true,
+    "inputTokens": 2986,
+    "outputTokens": 3,
+    "latencyMs": 856
+  },
+  {
+    "questionId": "q109",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "3285",
+    "actual": "3285",
+    "correct": true,
+    "inputTokens": 3110,
+    "outputTokens": 6,
+    "latencyMs": 980
+  },
+  {
+    "questionId": "q110",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "3826.93",
+    "actual": "3826.93",
+    "correct": true,
+    "inputTokens": 3712,
+    "outputTokens": 5,
+    "latencyMs": 3090
+  },
+  {
+    "questionId": "q110",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "3826.93",
+    "actual": "3826.93",
+    "correct": true,
+    "inputTokens": 4079,
+    "outputTokens": 8,
+    "latencyMs": 1123
+  },
+  {
+    "questionId": "q110",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "3826.93",
+    "actual": "3826.93",
+    "correct": true,
+    "inputTokens": 1563,
+    "outputTokens": 5,
+    "latencyMs": 2911
+  },
+  {
+    "questionId": "q110",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "3826.93",
+    "actual": "3826.93",
+    "correct": true,
+    "inputTokens": 1508,
+    "outputTokens": 8,
+    "latencyMs": 979
+  },
+  {
+    "questionId": "q110",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "3826.93",
+    "actual": "3826.93",
+    "correct": true,
+    "inputTokens": 1441,
+    "outputTokens": 5,
+    "latencyMs": 1118
+  },
+  {
+    "questionId": "q110",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "3826.93",
+    "actual": "3826.93",
+    "correct": true,
+    "inputTokens": 1444,
+    "outputTokens": 8,
+    "latencyMs": 943
+  },
+  {
+    "questionId": "q110",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "3826.93",
+    "actual": "3826.93",
+    "correct": true,
+    "inputTokens": 3829,
+    "outputTokens": 5,
+    "latencyMs": 2639
+  },
+  {
+    "questionId": "q110",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "3826.93",
+    "actual": "3826.93",
+    "correct": true,
+    "inputTokens": 3414,
+    "outputTokens": 8,
+    "latencyMs": 1187
+  },
+  {
+    "questionId": "q110",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "3826.93",
+    "actual": "3826.93",
+    "correct": true,
+    "inputTokens": 2985,
+    "outputTokens": 5,
+    "latencyMs": 2402
+  },
+  {
+    "questionId": "q110",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "3826.93",
+    "actual": "3826.93",
+    "correct": true,
+    "inputTokens": 3109,
+    "outputTokens": 8,
+    "latencyMs": 1723
+  },
+  {
+    "questionId": "q111",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "6191",
+    "actual": "6191",
+    "correct": true,
+    "inputTokens": 3713,
+    "outputTokens": 3,
+    "latencyMs": 2401
+  },
+  {
+    "questionId": "q111",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "6191",
+    "actual": "6191",
+    "correct": true,
+    "inputTokens": 4080,
+    "outputTokens": 6,
+    "latencyMs": 1117
+  },
+  {
+    "questionId": "q111",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "6191",
+    "actual": "6191",
+    "correct": true,
+    "inputTokens": 1564,
+    "outputTokens": 3,
+    "latencyMs": 1568
+  },
+  {
+    "questionId": "q111",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "6191",
+    "actual": "6191",
+    "correct": true,
+    "inputTokens": 1509,
+    "outputTokens": 6,
+    "latencyMs": 1132
+  },
+  {
+    "questionId": "q111",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "6191",
+    "actual": "6191",
+    "correct": true,
+    "inputTokens": 1442,
+    "outputTokens": 3,
+    "latencyMs": 1478
+  },
+  {
+    "questionId": "q111",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "6191",
+    "actual": "6191",
+    "correct": true,
+    "inputTokens": 1445,
+    "outputTokens": 6,
+    "latencyMs": 1831
+  },
+  {
+    "questionId": "q111",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "6191",
+    "actual": "6191",
+    "correct": true,
+    "inputTokens": 3830,
+    "outputTokens": 3,
+    "latencyMs": 1631
+  },
+  {
+    "questionId": "q111",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "6191",
+    "actual": "6191",
+    "correct": true,
+    "inputTokens": 3415,
+    "outputTokens": 6,
+    "latencyMs": 1371
+  },
+  {
+    "questionId": "q111",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "6191",
+    "actual": "6191",
+    "correct": true,
+    "inputTokens": 2986,
+    "outputTokens": 3,
+    "latencyMs": 1209
+  },
+  {
+    "questionId": "q111",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "6191",
+    "actual": "6191",
+    "correct": true,
+    "inputTokens": 3110,
+    "outputTokens": 6,
+    "latencyMs": 1411
+  },
+  {
+    "questionId": "q112",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "1854.66",
+    "actual": "1854.66",
+    "correct": true,
+    "inputTokens": 3712,
+    "outputTokens": 5,
+    "latencyMs": 1773
+  },
+  {
+    "questionId": "q112",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "1854.66",
+    "actual": "1854.66",
+    "correct": true,
+    "inputTokens": 4079,
+    "outputTokens": 8,
+    "latencyMs": 1090
+  },
+  {
+    "questionId": "q112",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "1854.66",
+    "actual": "1854.66",
+    "correct": true,
+    "inputTokens": 1563,
+    "outputTokens": 5,
+    "latencyMs": 1354
+  },
+  {
+    "questionId": "q112",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "1854.66",
+    "actual": "1854.66",
+    "correct": true,
+    "inputTokens": 1508,
+    "outputTokens": 8,
+    "latencyMs": 1095
+  },
+  {
+    "questionId": "q112",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "1854.66",
+    "actual": "1854.66",
+    "correct": true,
+    "inputTokens": 1441,
+    "outputTokens": 5,
+    "latencyMs": 1135
+  },
+  {
+    "questionId": "q112",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "1854.66",
+    "actual": "1854.66",
+    "correct": true,
+    "inputTokens": 1444,
+    "outputTokens": 8,
+    "latencyMs": 976
+  },
+  {
+    "questionId": "q112",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "1854.66",
+    "actual": "1854.66",
+    "correct": true,
+    "inputTokens": 3829,
+    "outputTokens": 5,
+    "latencyMs": 1311
+  },
+  {
+    "questionId": "q112",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "1854.66",
+    "actual": "1854.66",
+    "correct": true,
+    "inputTokens": 3414,
+    "outputTokens": 8,
+    "latencyMs": 1287
+  },
+  {
+    "questionId": "q112",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "1854.66",
+    "actual": "1854.66",
+    "correct": true,
+    "inputTokens": 2985,
+    "outputTokens": 5,
+    "latencyMs": 1288
+  },
+  {
+    "questionId": "q112",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "1854.66",
+    "actual": "1854.66",
+    "correct": true,
+    "inputTokens": 3109,
+    "outputTokens": 8,
+    "latencyMs": 1157
+  },
+  {
+    "questionId": "q113",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "4696",
+    "actual": "4696",
+    "correct": true,
+    "inputTokens": 3713,
+    "outputTokens": 3,
+    "latencyMs": 1328
+  },
+  {
+    "questionId": "q113",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "4696",
+    "actual": "4696",
+    "correct": true,
+    "inputTokens": 4080,
+    "outputTokens": 6,
+    "latencyMs": 1068
+  },
+  {
+    "questionId": "q113",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "4696",
+    "actual": "4696",
+    "correct": true,
+    "inputTokens": 1564,
+    "outputTokens": 3,
+    "latencyMs": 1020
+  },
+  {
+    "questionId": "q113",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "4696",
+    "actual": "4696",
+    "correct": true,
+    "inputTokens": 1509,
+    "outputTokens": 6,
+    "latencyMs": 1069
+  },
+  {
+    "questionId": "q113",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "4696",
+    "actual": "4696",
+    "correct": true,
+    "inputTokens": 1442,
+    "outputTokens": 3,
+    "latencyMs": 968
+  },
+  {
+    "questionId": "q113",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "4696",
+    "actual": "4696",
+    "correct": true,
+    "inputTokens": 1445,
+    "outputTokens": 6,
+    "latencyMs": 1436
+  },
+  {
+    "questionId": "q113",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "4696",
+    "actual": "4696",
+    "correct": true,
+    "inputTokens": 3830,
+    "outputTokens": 3,
+    "latencyMs": 1171
+  },
+  {
+    "questionId": "q113",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "4696",
+    "actual": "4696",
+    "correct": true,
+    "inputTokens": 3415,
+    "outputTokens": 6,
+    "latencyMs": 1273
+  },
+  {
+    "questionId": "q113",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "4696",
+    "actual": "4696",
+    "correct": true,
+    "inputTokens": 2986,
+    "outputTokens": 3,
+    "latencyMs": 1788
+  },
+  {
+    "questionId": "q113",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "4696",
+    "actual": "4696",
+    "correct": true,
+    "inputTokens": 3110,
+    "outputTokens": 6,
+    "latencyMs": 1050
+  },
+  {
+    "questionId": "q114",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "4211.6",
+    "actual": "4211.6",
+    "correct": true,
+    "inputTokens": 3712,
+    "outputTokens": 5,
+    "latencyMs": 1414
+  },
+  {
+    "questionId": "q114",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "4211.6",
+    "actual": "4211.6",
+    "correct": true,
+    "inputTokens": 4079,
+    "outputTokens": 8,
+    "latencyMs": 1192
+  },
+  {
+    "questionId": "q114",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "4211.6",
+    "actual": "4211.6",
+    "correct": true,
+    "inputTokens": 1563,
+    "outputTokens": 5,
+    "latencyMs": 893
+  },
+  {
+    "questionId": "q114",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "4211.6",
+    "actual": "4211.6",
+    "correct": true,
+    "inputTokens": 1508,
+    "outputTokens": 8,
+    "latencyMs": 1065
+  },
+  {
+    "questionId": "q114",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "4211.6",
+    "actual": "4211.6",
+    "correct": true,
+    "inputTokens": 1441,
+    "outputTokens": 5,
+    "latencyMs": 1155
+  },
+  {
+    "questionId": "q114",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "4211.6",
+    "actual": "4211.6",
+    "correct": true,
+    "inputTokens": 1444,
+    "outputTokens": 8,
+    "latencyMs": 1842
+  },
+  {
+    "questionId": "q114",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "4211.6",
+    "actual": "4211.6",
+    "correct": true,
+    "inputTokens": 3829,
+    "outputTokens": 5,
+    "latencyMs": 2740
+  },
+  {
+    "questionId": "q114",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "4211.6",
+    "actual": "4211.6",
+    "correct": true,
+    "inputTokens": 3414,
+    "outputTokens": 8,
+    "latencyMs": 1295
+  },
+  {
+    "questionId": "q114",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "4211.6",
+    "actual": "4211.6",
+    "correct": true,
+    "inputTokens": 2985,
+    "outputTokens": 5,
+    "latencyMs": 1053
+  },
+  {
+    "questionId": "q114",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "4211.6",
+    "actual": "4211.6",
+    "correct": true,
+    "inputTokens": 3109,
+    "outputTokens": 8,
+    "latencyMs": 1118
+  },
+  {
+    "questionId": "q115",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "6196",
+    "actual": "6196",
+    "correct": true,
+    "inputTokens": 3713,
+    "outputTokens": 3,
+    "latencyMs": 1452
+  },
+  {
+    "questionId": "q115",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "6196",
+    "actual": "6196",
+    "correct": true,
+    "inputTokens": 4080,
+    "outputTokens": 6,
+    "latencyMs": 1272
+  },
+  {
+    "questionId": "q115",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "6196",
+    "actual": "6196",
+    "correct": true,
+    "inputTokens": 1564,
+    "outputTokens": 3,
+    "latencyMs": 1039
+  },
+  {
+    "questionId": "q115",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "6196",
+    "actual": "6196",
+    "correct": true,
+    "inputTokens": 1509,
+    "outputTokens": 6,
+    "latencyMs": 1155
+  },
+  {
+    "questionId": "q115",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "6196",
+    "actual": "6196",
+    "correct": true,
+    "inputTokens": 1442,
+    "outputTokens": 3,
+    "latencyMs": 796
+  },
+  {
+    "questionId": "q115",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "6196",
+    "actual": "6196",
+    "correct": true,
+    "inputTokens": 1445,
+    "outputTokens": 6,
+    "latencyMs": 1048
+  },
+  {
+    "questionId": "q115",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "6196",
+    "actual": "6196",
+    "correct": true,
+    "inputTokens": 3830,
+    "outputTokens": 3,
+    "latencyMs": 2282
+  },
+  {
+    "questionId": "q115",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "6196",
+    "actual": "6196",
+    "correct": true,
+    "inputTokens": 3415,
+    "outputTokens": 6,
+    "latencyMs": 1592
+  },
+  {
+    "questionId": "q115",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "6196",
+    "actual": "6196",
+    "correct": true,
+    "inputTokens": 2986,
+    "outputTokens": 3,
+    "latencyMs": 2691
+  },
+  {
+    "questionId": "q115",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "6196",
+    "actual": "6196",
+    "correct": true,
+    "inputTokens": 3110,
+    "outputTokens": 6,
+    "latencyMs": 1126
+  },
+  {
+    "questionId": "q116",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "6105.3",
+    "actual": "6105.3",
+    "correct": true,
+    "inputTokens": 3712,
+    "outputTokens": 5,
+    "latencyMs": 1288
+  },
+  {
+    "questionId": "q116",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "6105.3",
+    "actual": "6105.30",
+    "correct": true,
+    "inputTokens": 4079,
+    "outputTokens": 8,
+    "latencyMs": 991
+  },
+  {
+    "questionId": "q116",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "6105.3",
+    "actual": "6105.3",
+    "correct": true,
+    "inputTokens": 1563,
+    "outputTokens": 5,
+    "latencyMs": 1257
+  },
+  {
+    "questionId": "q116",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "6105.3",
+    "actual": "6105.3",
+    "correct": true,
+    "inputTokens": 1508,
+    "outputTokens": 8,
+    "latencyMs": 1004
+  },
+  {
+    "questionId": "q116",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "6105.3",
+    "actual": "6105.3",
+    "correct": true,
+    "inputTokens": 1441,
+    "outputTokens": 5,
+    "latencyMs": 1620
+  },
+  {
+    "questionId": "q116",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "6105.3",
+    "actual": "6105.3",
+    "correct": true,
+    "inputTokens": 1444,
+    "outputTokens": 8,
+    "latencyMs": 991
+  },
+  {
+    "questionId": "q116",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "6105.3",
+    "actual": "6105.3",
+    "correct": true,
+    "inputTokens": 3829,
+    "outputTokens": 5,
+    "latencyMs": 1048
+  },
+  {
+    "questionId": "q116",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "6105.3",
+    "actual": "6105.3",
+    "correct": true,
+    "inputTokens": 3414,
+    "outputTokens": 8,
+    "latencyMs": 1189
+  },
+  {
+    "questionId": "q116",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "6105.3",
+    "actual": "6105.3",
+    "correct": true,
+    "inputTokens": 2985,
+    "outputTokens": 5,
+    "latencyMs": 3282
+  },
+  {
+    "questionId": "q116",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "6105.3",
+    "actual": "6105.3",
+    "correct": true,
+    "inputTokens": 3109,
+    "outputTokens": 8,
+    "latencyMs": 985
+  },
+  {
+    "questionId": "q117",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "6528",
+    "actual": "6528",
+    "correct": true,
+    "inputTokens": 3713,
+    "outputTokens": 3,
+    "latencyMs": 871
+  },
+  {
+    "questionId": "q117",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "6528",
+    "actual": "6528",
+    "correct": true,
+    "inputTokens": 4080,
+    "outputTokens": 6,
+    "latencyMs": 1042
+  },
+  {
+    "questionId": "q117",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "6528",
+    "actual": "6528",
+    "correct": true,
+    "inputTokens": 1564,
+    "outputTokens": 3,
+    "latencyMs": 999
+  },
+  {
+    "questionId": "q117",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "6528",
+    "actual": "6528",
+    "correct": true,
+    "inputTokens": 1509,
+    "outputTokens": 6,
+    "latencyMs": 1111
+  },
+  {
+    "questionId": "q117",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "6528",
+    "actual": "6528",
+    "correct": true,
+    "inputTokens": 1442,
+    "outputTokens": 3,
+    "latencyMs": 1132
+  },
+  {
+    "questionId": "q117",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "6528",
+    "actual": "6528",
+    "correct": true,
+    "inputTokens": 1445,
+    "outputTokens": 6,
+    "latencyMs": 1004
+  },
+  {
+    "questionId": "q117",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "6528",
+    "actual": "6528",
+    "correct": true,
+    "inputTokens": 3830,
+    "outputTokens": 3,
+    "latencyMs": 1162
+  },
+  {
+    "questionId": "q117",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "6528",
+    "actual": "6528",
+    "correct": true,
+    "inputTokens": 3415,
+    "outputTokens": 6,
+    "latencyMs": 1271
+  },
+  {
+    "questionId": "q117",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "6528",
+    "actual": "6528",
+    "correct": true,
+    "inputTokens": 2986,
+    "outputTokens": 3,
+    "latencyMs": 961
+  },
+  {
+    "questionId": "q117",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "6528",
+    "actual": "6528",
+    "correct": true,
+    "inputTokens": 3110,
+    "outputTokens": 6,
+    "latencyMs": 1289
+  },
+  {
+    "questionId": "q118",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "1136.09",
+    "actual": "1136.09",
+    "correct": true,
+    "inputTokens": 3712,
+    "outputTokens": 5,
+    "latencyMs": 1634
+  },
+  {
+    "questionId": "q118",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "1136.09",
+    "actual": "1136.09",
+    "correct": true,
+    "inputTokens": 4079,
+    "outputTokens": 8,
+    "latencyMs": 1198
+  },
+  {
+    "questionId": "q118",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "1136.09",
+    "actual": "1136.09",
+    "correct": true,
+    "inputTokens": 1563,
+    "outputTokens": 5,
+    "latencyMs": 2678
+  },
+  {
+    "questionId": "q118",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "1136.09",
+    "actual": "1136.09",
+    "correct": true,
+    "inputTokens": 1508,
+    "outputTokens": 8,
+    "latencyMs": 1155
+  },
+  {
+    "questionId": "q118",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "1136.09",
+    "actual": "1136.09",
+    "correct": true,
+    "inputTokens": 1441,
+    "outputTokens": 5,
+    "latencyMs": 1104
+  },
+  {
+    "questionId": "q118",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "1136.09",
+    "actual": "1136.09",
+    "correct": true,
+    "inputTokens": 1444,
+    "outputTokens": 8,
+    "latencyMs": 1109
+  },
+  {
+    "questionId": "q118",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "1136.09",
+    "actual": "1136.09",
+    "correct": true,
+    "inputTokens": 3829,
+    "outputTokens": 5,
+    "latencyMs": 3756
+  },
+  {
+    "questionId": "q118",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "1136.09",
+    "actual": "1136.09",
+    "correct": true,
+    "inputTokens": 3414,
+    "outputTokens": 8,
+    "latencyMs": 1082
+  },
+  {
+    "questionId": "q118",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "1136.09",
+    "actual": "1136.09",
+    "correct": true,
+    "inputTokens": 2985,
+    "outputTokens": 5,
+    "latencyMs": 1451
+  },
+  {
+    "questionId": "q118",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "1136.09",
+    "actual": "1136.09",
+    "correct": true,
+    "inputTokens": 3109,
+    "outputTokens": 8,
+    "latencyMs": 1730
+  },
+  {
+    "questionId": "q119",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "4689",
+    "actual": "4689",
+    "correct": true,
+    "inputTokens": 3713,
+    "outputTokens": 3,
+    "latencyMs": 1327
+  },
+  {
+    "questionId": "q119",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "4689",
+    "actual": "4689",
+    "correct": true,
+    "inputTokens": 4080,
+    "outputTokens": 6,
+    "latencyMs": 1282
+  },
+  {
+    "questionId": "q119",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "4689",
+    "actual": "4689",
+    "correct": true,
+    "inputTokens": 1564,
+    "outputTokens": 3,
+    "latencyMs": 1368
+  },
+  {
+    "questionId": "q119",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "4689",
+    "actual": "4689",
+    "correct": true,
+    "inputTokens": 1509,
+    "outputTokens": 6,
+    "latencyMs": 1487
+  },
+  {
+    "questionId": "q119",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "4689",
+    "actual": "4689",
+    "correct": true,
+    "inputTokens": 1442,
+    "outputTokens": 3,
+    "latencyMs": 2752
+  },
+  {
+    "questionId": "q119",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "4689",
+    "actual": "4689",
+    "correct": true,
+    "inputTokens": 1445,
+    "outputTokens": 6,
+    "latencyMs": 909
+  },
+  {
+    "questionId": "q119",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "4689",
+    "actual": "4689",
+    "correct": true,
+    "inputTokens": 3830,
+    "outputTokens": 3,
+    "latencyMs": 3502
+  },
+  {
+    "questionId": "q119",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "4689",
+    "actual": "4689",
+    "correct": true,
+    "inputTokens": 3415,
+    "outputTokens": 6,
+    "latencyMs": 1212
+  },
+  {
+    "questionId": "q119",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "4689",
+    "actual": "4689",
+    "correct": true,
+    "inputTokens": 2986,
+    "outputTokens": 3,
+    "latencyMs": 1218
+  },
+  {
+    "questionId": "q119",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "4689",
+    "actual": "4689",
+    "correct": true,
+    "inputTokens": 3110,
+    "outputTokens": 6,
+    "latencyMs": 1064
+  },
+  {
+    "questionId": "q120",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "2637.73",
+    "actual": "2637.73",
+    "correct": true,
+    "inputTokens": 3712,
+    "outputTokens": 5,
+    "latencyMs": 2777
+  },
+  {
+    "questionId": "q120",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "2637.73",
+    "actual": "2637.73",
+    "correct": true,
+    "inputTokens": 4079,
+    "outputTokens": 8,
+    "latencyMs": 1246
+  },
+  {
+    "questionId": "q120",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "2637.73",
+    "actual": "2637.73",
+    "correct": true,
+    "inputTokens": 1563,
+    "outputTokens": 5,
+    "latencyMs": 1424
+  },
+  {
+    "questionId": "q120",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "2637.73",
+    "actual": "2637.73",
+    "correct": true,
+    "inputTokens": 1508,
+    "outputTokens": 8,
+    "latencyMs": 1074
+  },
+  {
+    "questionId": "q120",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "2637.73",
+    "actual": "2637.73",
+    "correct": true,
+    "inputTokens": 1441,
+    "outputTokens": 5,
+    "latencyMs": 2803
+  },
+  {
+    "questionId": "q120",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "2637.73",
+    "actual": "2637.73",
+    "correct": true,
+    "inputTokens": 1444,
+    "outputTokens": 8,
+    "latencyMs": 1107
+  },
+  {
+    "questionId": "q120",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "2637.73",
+    "actual": "2637.73",
+    "correct": true,
+    "inputTokens": 3829,
+    "outputTokens": 5,
+    "latencyMs": 1066
+  },
+  {
+    "questionId": "q120",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "2637.73",
+    "actual": "2637.73",
+    "correct": true,
+    "inputTokens": 3414,
+    "outputTokens": 8,
+    "latencyMs": 1325
+  },
+  {
+    "questionId": "q120",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "2637.73",
+    "actual": "2637.73",
+    "correct": true,
+    "inputTokens": 2985,
+    "outputTokens": 5,
+    "latencyMs": 1330
+  },
+  {
+    "questionId": "q120",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "2637.73",
+    "actual": "2637.73",
+    "correct": true,
+    "inputTokens": 3109,
+    "outputTokens": 8,
+    "latencyMs": 1192
+  },
+  {
+    "questionId": "q121",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "5685",
+    "actual": "5685",
+    "correct": true,
+    "inputTokens": 3713,
+    "outputTokens": 3,
+    "latencyMs": 1139
+  },
+  {
+    "questionId": "q121",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "5685",
+    "actual": "5685",
+    "correct": true,
+    "inputTokens": 4080,
+    "outputTokens": 6,
+    "latencyMs": 994
+  },
+  {
+    "questionId": "q121",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "5685",
+    "actual": "5685",
+    "correct": true,
+    "inputTokens": 1564,
+    "outputTokens": 3,
+    "latencyMs": 1309
+  },
+  {
+    "questionId": "q121",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "5685",
+    "actual": "5685",
+    "correct": true,
+    "inputTokens": 1509,
+    "outputTokens": 6,
+    "latencyMs": 1184
+  },
+  {
+    "questionId": "q121",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "5685",
+    "actual": "5685",
+    "correct": true,
+    "inputTokens": 1442,
+    "outputTokens": 3,
+    "latencyMs": 1182
+  },
+  {
+    "questionId": "q121",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "5685",
+    "actual": "5685",
+    "correct": true,
+    "inputTokens": 1445,
+    "outputTokens": 6,
+    "latencyMs": 1381
+  },
+  {
+    "questionId": "q121",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "5685",
+    "actual": "5685",
+    "correct": true,
+    "inputTokens": 3830,
+    "outputTokens": 3,
+    "latencyMs": 1103
+  },
+  {
+    "questionId": "q121",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "5685",
+    "actual": "5685",
+    "correct": true,
+    "inputTokens": 3415,
+    "outputTokens": 6,
+    "latencyMs": 1220
+  },
+  {
+    "questionId": "q121",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "5685",
+    "actual": "5685",
+    "correct": true,
+    "inputTokens": 2986,
+    "outputTokens": 3,
+    "latencyMs": 1169
+  },
+  {
+    "questionId": "q121",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "5685",
+    "actual": "5685",
+    "correct": true,
+    "inputTokens": 3110,
+    "outputTokens": 6,
+    "latencyMs": 1208
+  },
+  {
+    "questionId": "q122",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "3421.06",
+    "actual": "3421.06",
+    "correct": true,
+    "inputTokens": 3712,
+    "outputTokens": 5,
+    "latencyMs": 1037
+  },
+  {
+    "questionId": "q122",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "3421.06",
+    "actual": "3421.06",
+    "correct": true,
+    "inputTokens": 4079,
+    "outputTokens": 8,
+    "latencyMs": 1278
+  },
+  {
+    "questionId": "q122",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "3421.06",
+    "actual": "3421.06",
+    "correct": true,
+    "inputTokens": 1563,
+    "outputTokens": 5,
+    "latencyMs": 1441
+  },
+  {
+    "questionId": "q122",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "3421.06",
+    "actual": "3421.06",
+    "correct": true,
+    "inputTokens": 1508,
+    "outputTokens": 8,
+    "latencyMs": 1204
+  },
+  {
+    "questionId": "q122",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "3421.06",
+    "actual": "3421.06",
+    "correct": true,
+    "inputTokens": 1441,
+    "outputTokens": 5,
+    "latencyMs": 1782
+  },
+  {
+    "questionId": "q122",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "3421.06",
+    "actual": "3421.06",
+    "correct": true,
+    "inputTokens": 1444,
+    "outputTokens": 8,
+    "latencyMs": 1088
+  },
+  {
+    "questionId": "q122",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "3421.06",
+    "actual": "3421.06",
+    "correct": true,
+    "inputTokens": 3829,
+    "outputTokens": 5,
+    "latencyMs": 1447
+  },
+  {
+    "questionId": "q122",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "3421.06",
+    "actual": "3421.06",
+    "correct": true,
+    "inputTokens": 3414,
+    "outputTokens": 8,
+    "latencyMs": 1356
+  },
+  {
+    "questionId": "q122",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "3421.06",
+    "actual": "3421.06",
+    "correct": true,
+    "inputTokens": 2985,
+    "outputTokens": 5,
+    "latencyMs": 1309
+  },
+  {
+    "questionId": "q122",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "3421.06",
+    "actual": "3421.06",
+    "correct": true,
+    "inputTokens": 3109,
+    "outputTokens": 8,
+    "latencyMs": 995
+  },
+  {
+    "questionId": "q123",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "344498",
+    "actual": "188,000",
+    "correct": false,
+    "inputTokens": 3710,
+    "outputTokens": 4,
+    "latencyMs": 1405
+  },
+  {
+    "questionId": "q123",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "344498",
+    "actual": "188,945",
+    "correct": false,
+    "inputTokens": 4077,
+    "outputTokens": 7,
+    "latencyMs": 1110
+  },
+  {
+    "questionId": "q123",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "344498",
+    "actual": "186,000",
+    "correct": false,
+    "inputTokens": 1561,
+    "outputTokens": 4,
+    "latencyMs": 1306
+  },
+  {
+    "questionId": "q123",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "344498",
+    "actual": "337,045",
+    "correct": false,
+    "inputTokens": 1506,
+    "outputTokens": 7,
+    "latencyMs": 1292
+  },
+  {
+    "questionId": "q123",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "344498",
+    "actual": "188,000",
+    "correct": false,
+    "inputTokens": 1439,
+    "outputTokens": 4,
+    "latencyMs": 2659
+  },
+  {
+    "questionId": "q123",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "344498",
+    "actual": "372,915",
+    "correct": false,
+    "inputTokens": 1442,
+    "outputTokens": 7,
+    "latencyMs": 966
+  },
+  {
+    "questionId": "q123",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "344498",
+    "actual": "174,000",
+    "correct": false,
+    "inputTokens": 3827,
+    "outputTokens": 4,
+    "latencyMs": 1177
+  },
+  {
+    "questionId": "q123",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "344498",
+    "actual": "188,647",
+    "correct": false,
+    "inputTokens": 3412,
+    "outputTokens": 7,
+    "latencyMs": 1018
+  },
+  {
+    "questionId": "q123",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "344498",
+    "actual": "188,000",
+    "correct": false,
+    "inputTokens": 2983,
+    "outputTokens": 4,
+    "latencyMs": 1659
+  },
+  {
+    "questionId": "q123",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "344498",
+    "actual": "181,854",
+    "correct": false,
+    "inputTokens": 3107,
+    "outputTokens": 7,
+    "latencyMs": 1894
+  },
+  {
+    "questionId": "q124",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "312818.50",
+    "actual": "188,174.36",
+    "correct": false,
+    "inputTokens": 3708,
+    "outputTokens": 6,
+    "latencyMs": 2900
+  },
+  {
+    "questionId": "q124",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "312818.50",
+    "actual": "287,745.89",
+    "correct": false,
+    "inputTokens": 4075,
+    "outputTokens": 9,
+    "latencyMs": 1196
+  },
+  {
+    "questionId": "q124",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "312818.50",
+    "actual": "Total revenue across all dates is 139,155.36.",
+    "correct": false,
+    "inputTokens": 1559,
+    "outputTokens": 14,
+    "latencyMs": 1401
+  },
+  {
+    "questionId": "q124",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "312818.50",
+    "actual": "487,891.45",
+    "correct": false,
+    "inputTokens": 1504,
+    "outputTokens": 9,
+    "latencyMs": 1118
+  },
+  {
+    "questionId": "q124",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "312818.50",
+    "actual": "Total revenue across all dates is 155,000.00.",
+    "correct": false,
+    "inputTokens": 1437,
+    "outputTokens": 14,
+    "latencyMs": 1308
+  },
+  {
+    "questionId": "q124",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "312818.50",
+    "actual": "487,891.89",
+    "correct": false,
+    "inputTokens": 1440,
+    "outputTokens": 9,
+    "latencyMs": 1120
+  },
+  {
+    "questionId": "q124",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "312818.50",
+    "actual": "Total revenue across all dates is 155,155.36.",
+    "correct": false,
+    "inputTokens": 3825,
+    "outputTokens": 14,
+    "latencyMs": 1143
+  },
+  {
+    "questionId": "q124",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "312818.50",
+    "actual": "381,968.89",
+    "correct": false,
+    "inputTokens": 3410,
+    "outputTokens": 9,
+    "latencyMs": 1172
+  },
+  {
+    "questionId": "q124",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "312818.50",
+    "actual": "Total revenue across all dates is 155,155.36.",
+    "correct": false,
+    "inputTokens": 2981,
+    "outputTokens": 14,
+    "latencyMs": 1179
+  },
+  {
+    "questionId": "q124",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "312818.50",
+    "actual": "381,847.89",
+    "correct": false,
+    "inputTokens": 3105,
+    "outputTokens": 9,
+    "latencyMs": 1073
+  },
+  {
+    "questionId": "q125",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "1811",
+    "actual": "1030",
+    "correct": false,
+    "inputTokens": 3710,
+    "outputTokens": 3,
+    "latencyMs": 3823
+  },
+  {
+    "questionId": "q125",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "1811",
+    "actual": "1,234",
+    "correct": false,
+    "inputTokens": 4078,
+    "outputTokens": 7,
+    "latencyMs": 1153
+  },
+  {
+    "questionId": "q125",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "1811",
+    "actual": "1040",
+    "correct": false,
+    "inputTokens": 1561,
+    "outputTokens": 3,
+    "latencyMs": 1472
+  },
+  {
+    "questionId": "q125",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "1811",
+    "actual": "1,945",
+    "correct": false,
+    "inputTokens": 1507,
+    "outputTokens": 7,
+    "latencyMs": 940
+  },
+  {
+    "questionId": "q125",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "1811",
+    "actual": "1030",
+    "correct": false,
+    "inputTokens": 1439,
+    "outputTokens": 3,
+    "latencyMs": 1067
+  },
+  {
+    "questionId": "q125",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "1811",
+    "actual": "1,945",
+    "correct": false,
+    "inputTokens": 1443,
+    "outputTokens": 7,
+    "latencyMs": 1183
+  },
+  {
+    "questionId": "q125",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "1811",
+    "actual": "Total conversions: 1030",
+    "correct": false,
+    "inputTokens": 3827,
+    "outputTokens": 7,
+    "latencyMs": 1103
+  },
+  {
+    "questionId": "q125",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "1811",
+    "actual": "1,454",
+    "correct": false,
+    "inputTokens": 3413,
+    "outputTokens": 7,
+    "latencyMs": 1067
+  },
+  {
+    "questionId": "q125",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "1811",
+    "actual": "1040",
+    "correct": false,
+    "inputTokens": 2983,
+    "outputTokens": 3,
+    "latencyMs": 932
+  },
+  {
+    "questionId": "q125",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "1811",
+    "actual": "1,454",
+    "correct": false,
+    "inputTokens": 3108,
+    "outputTokens": 7,
+    "latencyMs": 1530
+  },
+  {
+    "questionId": "q126",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "42",
+    "actual": "42",
+    "correct": true,
+    "inputTokens": 3710,
+    "outputTokens": 2,
+    "latencyMs": 1016
+  },
+  {
+    "questionId": "q126",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "42",
+    "actual": "42",
+    "correct": true,
+    "inputTokens": 4078,
+    "outputTokens": 5,
+    "latencyMs": 1440
+  },
+  {
+    "questionId": "q126",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "42",
+    "actual": "24",
+    "correct": false,
+    "inputTokens": 1561,
+    "outputTokens": 2,
+    "latencyMs": 1206
+  },
+  {
+    "questionId": "q126",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "42",
+    "actual": "42",
+    "correct": true,
+    "inputTokens": 1507,
+    "outputTokens": 5,
+    "latencyMs": 1452
+  },
+  {
+    "questionId": "q126",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "42",
+    "actual": "22",
+    "correct": false,
+    "inputTokens": 1439,
+    "outputTokens": 2,
+    "latencyMs": 1249
+  },
+  {
+    "questionId": "q126",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "42",
+    "actual": "42",
+    "correct": true,
+    "inputTokens": 1443,
+    "outputTokens": 5,
+    "latencyMs": 1248
+  },
+  {
+    "questionId": "q126",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "42",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 3827,
+    "outputTokens": 2,
+    "latencyMs": 1420
+  },
+  {
+    "questionId": "q126",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "42",
+    "actual": "47",
+    "correct": false,
+    "inputTokens": 3413,
+    "outputTokens": 5,
+    "latencyMs": 900
+  },
+  {
+    "questionId": "q126",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "42",
+    "actual": "42",
+    "correct": true,
+    "inputTokens": 2983,
+    "outputTokens": 2,
+    "latencyMs": 1309
+  },
+  {
+    "questionId": "q126",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "42",
+    "actual": "47",
+    "correct": false,
+    "inputTokens": 3108,
+    "outputTokens": 5,
+    "latencyMs": 1216
+  },
+  {
+    "questionId": "q127",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "28",
+    "actual": "38",
+    "correct": false,
+    "inputTokens": 3710,
+    "outputTokens": 2,
+    "latencyMs": 3911
+  },
+  {
+    "questionId": "q127",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "28",
+    "actual": "24",
+    "correct": false,
+    "inputTokens": 4078,
+    "outputTokens": 5,
+    "latencyMs": 1056
+  },
+  {
+    "questionId": "q127",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "28",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 1561,
+    "outputTokens": 2,
+    "latencyMs": 839
+  },
+  {
+    "questionId": "q127",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "28",
+    "actual": "26",
+    "correct": false,
+    "inputTokens": 1507,
+    "outputTokens": 5,
+    "latencyMs": 965
+  },
+  {
+    "questionId": "q127",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "28",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 1439,
+    "outputTokens": 2,
+    "latencyMs": 2163
+  },
+  {
+    "questionId": "q127",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "28",
+    "actual": "23",
+    "correct": false,
+    "inputTokens": 1443,
+    "outputTokens": 5,
+    "latencyMs": 1006
+  },
+  {
+    "questionId": "q127",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "28",
+    "actual": "18",
+    "correct": false,
+    "inputTokens": 3827,
+    "outputTokens": 2,
+    "latencyMs": 2619
+  },
+  {
+    "questionId": "q127",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "28",
+    "actual": "24",
+    "correct": false,
+    "inputTokens": 3413,
+    "outputTokens": 5,
+    "latencyMs": 989
+  },
+  {
+    "questionId": "q127",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "28",
+    "actual": "22",
+    "correct": false,
+    "inputTokens": 2983,
+    "outputTokens": 2,
+    "latencyMs": 1830
+  },
+  {
+    "questionId": "q127",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "28",
+    "actual": "23",
+    "correct": false,
+    "inputTokens": 3108,
+    "outputTokens": 5,
+    "latencyMs": 1001
+  },
+  {
+    "questionId": "q128",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "11",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 3710,
+    "outputTokens": 2,
+    "latencyMs": 1217
+  },
+  {
+    "questionId": "q128",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "11",
+    "actual": "11",
+    "correct": true,
+    "inputTokens": 4078,
+    "outputTokens": 5,
+    "latencyMs": 3180
+  },
+  {
+    "questionId": "q128",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "11",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 1561,
+    "outputTokens": 2,
+    "latencyMs": 1076
+  },
+  {
+    "questionId": "q128",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "11",
+    "actual": "12",
+    "correct": false,
+    "inputTokens": 1507,
+    "outputTokens": 5,
+    "latencyMs": 912
+  },
+  {
+    "questionId": "q128",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "11",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 1439,
+    "outputTokens": 2,
+    "latencyMs": 2900
+  },
+  {
+    "questionId": "q128",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "11",
+    "actual": "11",
+    "correct": true,
+    "inputTokens": 1443,
+    "outputTokens": 5,
+    "latencyMs": 1389
+  },
+  {
+    "questionId": "q128",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "11",
+    "actual": "12",
+    "correct": false,
+    "inputTokens": 3827,
+    "outputTokens": 2,
+    "latencyMs": 1107
+  },
+  {
+    "questionId": "q128",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "11",
+    "actual": "11",
+    "correct": true,
+    "inputTokens": 3413,
+    "outputTokens": 5,
+    "latencyMs": 1150
+  },
+  {
+    "questionId": "q128",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "11",
+    "actual": "18",
+    "correct": false,
+    "inputTokens": 2983,
+    "outputTokens": 2,
+    "latencyMs": 1047
+  },
+  {
+    "questionId": "q128",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "11",
+    "actual": "11",
+    "correct": true,
+    "inputTokens": 3108,
+    "outputTokens": 5,
+    "latencyMs": 1169
+  },
+  {
+    "questionId": "q129",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "58",
+    "actual": "36",
+    "correct": false,
+    "inputTokens": 3709,
+    "outputTokens": 2,
+    "latencyMs": 1007
+  },
+  {
+    "questionId": "q129",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "58",
+    "actual": "50",
+    "correct": false,
+    "inputTokens": 4078,
+    "outputTokens": 5,
+    "latencyMs": 1342
+  },
+  {
+    "questionId": "q129",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "58",
+    "actual": "24",
+    "correct": false,
+    "inputTokens": 1560,
+    "outputTokens": 2,
+    "latencyMs": 828
+  },
+  {
+    "questionId": "q129",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "58",
+    "actual": "47",
+    "correct": false,
+    "inputTokens": 1507,
+    "outputTokens": 5,
+    "latencyMs": 1305
+  },
+  {
+    "questionId": "q129",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "58",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 1438,
+    "outputTokens": 2,
+    "latencyMs": 1305
+  },
+  {
+    "questionId": "q129",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "58",
+    "actual": "54",
+    "correct": false,
+    "inputTokens": 1443,
+    "outputTokens": 5,
+    "latencyMs": 1406
+  },
+  {
+    "questionId": "q129",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "58",
+    "actual": "18",
+    "correct": false,
+    "inputTokens": 3826,
+    "outputTokens": 2,
+    "latencyMs": 1513
+  },
+  {
+    "questionId": "q129",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "58",
+    "actual": "47",
+    "correct": false,
+    "inputTokens": 3413,
+    "outputTokens": 5,
+    "latencyMs": 1026
+  },
+  {
+    "questionId": "q129",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "58",
+    "actual": "42",
+    "correct": false,
+    "inputTokens": 2982,
+    "outputTokens": 2,
+    "latencyMs": 1373
+  },
+  {
+    "questionId": "q129",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "58",
+    "actual": "54",
+    "correct": false,
+    "inputTokens": 3108,
+    "outputTokens": 5,
+    "latencyMs": 1112
+  },
+  {
+    "questionId": "q130",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "41",
+    "actual": "34",
+    "correct": false,
+    "inputTokens": 3709,
+    "outputTokens": 2,
+    "latencyMs": 1248
+  },
+  {
+    "questionId": "q130",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "41",
+    "actual": "31",
+    "correct": false,
+    "inputTokens": 4078,
+    "outputTokens": 5,
+    "latencyMs": 1083
+  },
+  {
+    "questionId": "q130",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "41",
+    "actual": "24",
+    "correct": false,
+    "inputTokens": 1560,
+    "outputTokens": 2,
+    "latencyMs": 895
+  },
+  {
+    "questionId": "q130",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "41",
+    "actual": "38",
+    "correct": false,
+    "inputTokens": 1507,
+    "outputTokens": 5,
+    "latencyMs": 1087
+  },
+  {
+    "questionId": "q130",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "41",
+    "actual": "18",
+    "correct": false,
+    "inputTokens": 1438,
+    "outputTokens": 2,
+    "latencyMs": 1157
+  },
+  {
+    "questionId": "q130",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "41",
+    "actual": "38",
+    "correct": false,
+    "inputTokens": 1443,
+    "outputTokens": 5,
+    "latencyMs": 1155
+  },
+  {
+    "questionId": "q130",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "41",
+    "actual": "18",
+    "correct": false,
+    "inputTokens": 3826,
+    "outputTokens": 2,
+    "latencyMs": 1959
+  },
+  {
+    "questionId": "q130",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "41",
+    "actual": "31",
+    "correct": false,
+    "inputTokens": 3413,
+    "outputTokens": 5,
+    "latencyMs": 1110
+  },
+  {
+    "questionId": "q130",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "41",
+    "actual": "34",
+    "correct": false,
+    "inputTokens": 2982,
+    "outputTokens": 2,
+    "latencyMs": 4540
+  },
+  {
+    "questionId": "q130",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "41",
+    "actual": "31",
+    "correct": false,
+    "inputTokens": 3108,
+    "outputTokens": 5,
+    "latencyMs": 1286
+  },
+  {
+    "questionId": "q131",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "23",
+    "actual": "18",
+    "correct": false,
+    "inputTokens": 3709,
+    "outputTokens": 2,
+    "latencyMs": 1059
+  },
+  {
+    "questionId": "q131",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "23",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 4078,
+    "outputTokens": 5,
+    "latencyMs": 1302
+  },
+  {
+    "questionId": "q131",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "23",
+    "actual": "18",
+    "correct": false,
+    "inputTokens": 1560,
+    "outputTokens": 2,
+    "latencyMs": 1019
+  },
+  {
+    "questionId": "q131",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "23",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 1507,
+    "outputTokens": 5,
+    "latencyMs": 975
+  },
+  {
+    "questionId": "q131",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "23",
+    "actual": "18",
+    "correct": false,
+    "inputTokens": 1438,
+    "outputTokens": 2,
+    "latencyMs": 1056
+  },
+  {
+    "questionId": "q131",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "23",
+    "actual": "20",
+    "correct": false,
+    "inputTokens": 1443,
+    "outputTokens": 5,
+    "latencyMs": 984
+  },
+  {
+    "questionId": "q131",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "23",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 3826,
+    "outputTokens": 2,
+    "latencyMs": 1420
+  },
+  {
+    "questionId": "q131",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "23",
+    "actual": "21",
+    "correct": false,
+    "inputTokens": 3413,
+    "outputTokens": 5,
+    "latencyMs": 1139
+  },
+  {
+    "questionId": "q131",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "23",
+    "actual": "18",
+    "correct": false,
+    "inputTokens": 2982,
+    "outputTokens": 2,
+    "latencyMs": 1097
+  },
+  {
+    "questionId": "q131",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "23",
+    "actual": "21",
+    "correct": false,
+    "inputTokens": 3108,
+    "outputTokens": 5,
+    "latencyMs": 1203
+  },
+  {
+    "questionId": "q132",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "430828",
+    "actual": "430828",
+    "correct": true,
+    "inputTokens": 15188,
+    "outputTokens": 3,
+    "latencyMs": 2257
+  },
+  {
+    "questionId": "q132",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "430828",
+    "actual": "430828",
+    "correct": true,
+    "inputTokens": 17409,
+    "outputTokens": 6,
+    "latencyMs": 1292
+  },
+  {
+    "questionId": "q132",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "430828",
+    "actual": "430828",
+    "correct": true,
+    "inputTokens": 8789,
+    "outputTokens": 3,
+    "latencyMs": 1877
+  },
+  {
+    "questionId": "q132",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "430828",
+    "actual": "430828",
+    "correct": true,
+    "inputTokens": 9279,
+    "outputTokens": 6,
+    "latencyMs": 1118
+  },
+  {
+    "questionId": "q132",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "430828",
+    "actual": "430828",
+    "correct": true,
+    "inputTokens": 8557,
+    "outputTokens": 3,
+    "latencyMs": 4023
+  },
+  {
+    "questionId": "q132",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "430828",
+    "actual": "430828",
+    "correct": true,
+    "inputTokens": 9125,
+    "outputTokens": 6,
+    "latencyMs": 1134
+  },
+  {
+    "questionId": "q132",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "430828",
+    "actual": "430828",
+    "correct": true,
+    "inputTokens": 15482,
+    "outputTokens": 3,
+    "latencyMs": 5304
+  },
+  {
+    "questionId": "q132",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "430828",
+    "actual": "430828",
+    "correct": true,
+    "inputTokens": 15367,
+    "outputTokens": 6,
+    "latencyMs": 1442
+  },
+  {
+    "questionId": "q132",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "430828",
+    "actual": "430828",
+    "correct": true,
+    "inputTokens": 13172,
+    "outputTokens": 3,
+    "latencyMs": 2157
+  },
+  {
+    "questionId": "q132",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "430828",
+    "actual": "430828",
+    "correct": true,
+    "inputTokens": 14483,
+    "outputTokens": 6,
+    "latencyMs": 1483
+  },
+  {
+    "questionId": "q133",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "11798",
+    "actual": "11798",
+    "correct": true,
+    "inputTokens": 15190,
+    "outputTokens": 3,
+    "latencyMs": 2084
+  },
+  {
+    "questionId": "q133",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "11798",
+    "actual": "11798",
+    "correct": true,
+    "inputTokens": 17410,
+    "outputTokens": 6,
+    "latencyMs": 2592
+  },
+  {
+    "questionId": "q133",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "11798",
+    "actual": "11798",
+    "correct": true,
+    "inputTokens": 8791,
+    "outputTokens": 3,
+    "latencyMs": 1208
+  },
+  {
+    "questionId": "q133",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "11798",
+    "actual": "11798",
+    "correct": true,
+    "inputTokens": 9280,
+    "outputTokens": 6,
+    "latencyMs": 1261
+  },
+  {
+    "questionId": "q133",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "11798",
+    "actual": "11798",
+    "correct": true,
+    "inputTokens": 8559,
+    "outputTokens": 3,
+    "latencyMs": 1697
+  },
+  {
+    "questionId": "q133",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "11798",
+    "actual": "11798",
+    "correct": true,
+    "inputTokens": 9126,
+    "outputTokens": 6,
+    "latencyMs": 1171
+  },
+  {
+    "questionId": "q133",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "11798",
+    "actual": "11798",
+    "correct": true,
+    "inputTokens": 15484,
+    "outputTokens": 3,
+    "latencyMs": 1704
+  },
+  {
+    "questionId": "q133",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "11798",
+    "actual": "11798",
+    "correct": true,
+    "inputTokens": 15368,
+    "outputTokens": 6,
+    "latencyMs": 1637
+  },
+  {
+    "questionId": "q133",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "11798",
+    "actual": "11798",
+    "correct": true,
+    "inputTokens": 13174,
+    "outputTokens": 3,
+    "latencyMs": 1599
+  },
+  {
+    "questionId": "q133",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "11798",
+    "actual": "11798",
+    "correct": true,
+    "inputTokens": 14484,
+    "outputTokens": 6,
+    "latencyMs": 1505
+  },
+  {
+    "questionId": "q134",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "183631",
+    "actual": "183631",
+    "correct": true,
+    "inputTokens": 15193,
+    "outputTokens": 3,
+    "latencyMs": 2340
+  },
+  {
+    "questionId": "q134",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "183631",
+    "actual": "183631",
+    "correct": true,
+    "inputTokens": 17412,
+    "outputTokens": 6,
+    "latencyMs": 1380
+  },
+  {
+    "questionId": "q134",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "183631",
+    "actual": "183631",
+    "correct": true,
+    "inputTokens": 8794,
+    "outputTokens": 3,
+    "latencyMs": 1631
+  },
+  {
+    "questionId": "q134",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "183631",
+    "actual": "183631",
+    "correct": true,
+    "inputTokens": 9282,
+    "outputTokens": 6,
+    "latencyMs": 1271
+  },
+  {
+    "questionId": "q134",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "183631",
+    "actual": "183631",
+    "correct": true,
+    "inputTokens": 8562,
+    "outputTokens": 3,
+    "latencyMs": 1620
+  },
+  {
+    "questionId": "q134",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "183631",
+    "actual": "183631",
+    "correct": true,
+    "inputTokens": 9128,
+    "outputTokens": 6,
+    "latencyMs": 1279
+  },
+  {
+    "questionId": "q134",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "183631",
+    "actual": "183631",
+    "correct": true,
+    "inputTokens": 15487,
+    "outputTokens": 3,
+    "latencyMs": 14565
+  },
+  {
+    "questionId": "q134",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "183631",
+    "actual": "183631",
+    "correct": true,
+    "inputTokens": 15370,
+    "outputTokens": 6,
+    "latencyMs": 1559
+  },
+  {
+    "questionId": "q134",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "183631",
+    "actual": "183631",
+    "correct": true,
+    "inputTokens": 13177,
+    "outputTokens": 3,
+    "latencyMs": 1600
+  },
+  {
+    "questionId": "q134",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "183631",
+    "actual": "183631",
+    "correct": true,
+    "inputTokens": 14486,
+    "outputTokens": 6,
+    "latencyMs": 1179
+  },
+  {
+    "questionId": "q135",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "29246",
+    "actual": "29246",
+    "correct": true,
+    "inputTokens": 15192,
+    "outputTokens": 3,
+    "latencyMs": 2508
+  },
+  {
+    "questionId": "q135",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "29246",
+    "actual": "29246",
+    "correct": true,
+    "inputTokens": 17412,
+    "outputTokens": 6,
+    "latencyMs": 1359
+  },
+  {
+    "questionId": "q135",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "29246",
+    "actual": "29246",
+    "correct": true,
+    "inputTokens": 8793,
+    "outputTokens": 3,
+    "latencyMs": 1188
+  },
+  {
+    "questionId": "q135",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "29246",
+    "actual": "29246",
+    "correct": true,
+    "inputTokens": 9282,
+    "outputTokens": 6,
+    "latencyMs": 1204
+  },
+  {
+    "questionId": "q135",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "29246",
+    "actual": "29246",
+    "correct": true,
+    "inputTokens": 8561,
+    "outputTokens": 3,
+    "latencyMs": 2448
+  },
+  {
+    "questionId": "q135",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "29246",
+    "actual": "29246",
+    "correct": true,
+    "inputTokens": 9128,
+    "outputTokens": 6,
+    "latencyMs": 1311
+  },
+  {
+    "questionId": "q135",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "29246",
+    "actual": "29246",
+    "correct": true,
+    "inputTokens": 15486,
+    "outputTokens": 3,
+    "latencyMs": 2442
+  },
+  {
+    "questionId": "q135",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "29246",
+    "actual": "29246",
+    "correct": true,
+    "inputTokens": 15370,
+    "outputTokens": 6,
+    "latencyMs": 1414
+  },
+  {
+    "questionId": "q135",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "29246",
+    "actual": "29246",
+    "correct": true,
+    "inputTokens": 13176,
+    "outputTokens": 3,
+    "latencyMs": 2254
+  },
+  {
+    "questionId": "q135",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "29246",
+    "actual": "29246",
+    "correct": true,
+    "inputTokens": 14486,
+    "outputTokens": 6,
+    "latencyMs": 1512
+  },
+  {
+    "questionId": "q136",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "135306",
+    "actual": "135306",
+    "correct": true,
+    "inputTokens": 15188,
+    "outputTokens": 3,
+    "latencyMs": 1565
+  },
+  {
+    "questionId": "q136",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "135306",
+    "actual": "135306",
+    "correct": true,
+    "inputTokens": 17407,
+    "outputTokens": 6,
+    "latencyMs": 1871
+  },
+  {
+    "questionId": "q136",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "135306",
+    "actual": "135306",
+    "correct": true,
+    "inputTokens": 8789,
+    "outputTokens": 3,
+    "latencyMs": 1963
+  },
+  {
+    "questionId": "q136",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "135306",
+    "actual": "135306",
+    "correct": true,
+    "inputTokens": 9277,
+    "outputTokens": 6,
+    "latencyMs": 1533
+  },
+  {
+    "questionId": "q136",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "135306",
+    "actual": "135306",
+    "correct": true,
+    "inputTokens": 8557,
+    "outputTokens": 3,
+    "latencyMs": 1561
+  },
+  {
+    "questionId": "q136",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "135306",
+    "actual": "135306",
+    "correct": true,
+    "inputTokens": 9123,
+    "outputTokens": 6,
+    "latencyMs": 1200
+  },
+  {
+    "questionId": "q136",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "135306",
+    "actual": "135306",
+    "correct": true,
+    "inputTokens": 15482,
+    "outputTokens": 3,
+    "latencyMs": 1657
+  },
+  {
+    "questionId": "q136",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "135306",
+    "actual": "135306",
+    "correct": true,
+    "inputTokens": 15365,
+    "outputTokens": 6,
+    "latencyMs": 1582
+  },
+  {
+    "questionId": "q136",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "135306",
+    "actual": "135306",
+    "correct": true,
+    "inputTokens": 13172,
+    "outputTokens": 3,
+    "latencyMs": 3402
+  },
+  {
+    "questionId": "q136",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "135306",
+    "actual": "135306",
+    "correct": true,
+    "inputTokens": 14481,
+    "outputTokens": 6,
+    "latencyMs": 1251
+  },
+  {
+    "questionId": "q137",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "24914",
+    "actual": "24914",
+    "correct": true,
+    "inputTokens": 15187,
+    "outputTokens": 3,
+    "latencyMs": 2019
+  },
+  {
+    "questionId": "q137",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "24914",
+    "actual": "24914",
+    "correct": true,
+    "inputTokens": 17408,
+    "outputTokens": 6,
+    "latencyMs": 1517
+  },
+  {
+    "questionId": "q137",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "24914",
+    "actual": "The repository undefined/react-native does not exist in the provided data.",
+    "correct": false,
+    "inputTokens": 8788,
+    "outputTokens": 14,
+    "latencyMs": 1737
+  },
+  {
+    "questionId": "q137",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "24914",
+    "actual": "24914",
+    "correct": true,
+    "inputTokens": 9278,
+    "outputTokens": 6,
+    "latencyMs": 1467
+  },
+  {
+    "questionId": "q137",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "24914",
+    "actual": "24914",
+    "correct": true,
+    "inputTokens": 8556,
+    "outputTokens": 3,
+    "latencyMs": 3442
+  },
+  {
+    "questionId": "q137",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "24914",
+    "actual": "24914",
+    "correct": true,
+    "inputTokens": 9124,
+    "outputTokens": 6,
+    "latencyMs": 1300
+  },
+  {
+    "questionId": "q137",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "24914",
+    "actual": "24914",
+    "correct": true,
+    "inputTokens": 15481,
+    "outputTokens": 3,
+    "latencyMs": 1825
+  },
+  {
+    "questionId": "q137",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "24914",
+    "actual": "24914",
+    "correct": true,
+    "inputTokens": 15366,
+    "outputTokens": 6,
+    "latencyMs": 1443
+  },
+  {
+    "questionId": "q137",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "24914",
+    "actual": "124320",
+    "correct": false,
+    "inputTokens": 13171,
+    "outputTokens": 3,
+    "latencyMs": 1783
+  },
+  {
+    "questionId": "q137",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "24914",
+    "actual": "24914",
+    "correct": true,
+    "inputTokens": 14482,
+    "outputTokens": 6,
+    "latencyMs": 1362
+  },
+  {
+    "questionId": "q138",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "111683",
+    "actual": "111683",
+    "correct": true,
+    "inputTokens": 15187,
+    "outputTokens": 3,
+    "latencyMs": 1824
+  },
+  {
+    "questionId": "q138",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "111683",
+    "actual": "111683",
+    "correct": true,
+    "inputTokens": 17407,
+    "outputTokens": 6,
+    "latencyMs": 1479
+  },
+  {
+    "questionId": "q138",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "111683",
+    "actual": "108017",
+    "correct": false,
+    "inputTokens": 8788,
+    "outputTokens": 3,
+    "latencyMs": 3315
+  },
+  {
+    "questionId": "q138",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "111683",
+    "actual": "111683",
+    "correct": true,
+    "inputTokens": 9277,
+    "outputTokens": 6,
+    "latencyMs": 1270
+  },
+  {
+    "questionId": "q138",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "111683",
+    "actual": "111683",
+    "correct": true,
+    "inputTokens": 8556,
+    "outputTokens": 3,
+    "latencyMs": 1384
+  },
+  {
+    "questionId": "q138",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "111683",
+    "actual": "111683",
+    "correct": true,
+    "inputTokens": 9123,
+    "outputTokens": 6,
+    "latencyMs": 1252
+  },
+  {
+    "questionId": "q138",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "111683",
+    "actual": "111683",
+    "correct": true,
+    "inputTokens": 15481,
+    "outputTokens": 3,
+    "latencyMs": 3048
+  },
+  {
+    "questionId": "q138",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "111683",
+    "actual": "111683",
+    "correct": true,
+    "inputTokens": 15365,
+    "outputTokens": 6,
+    "latencyMs": 1381
+  },
+  {
+    "questionId": "q138",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "111683",
+    "actual": "111683",
+    "correct": true,
+    "inputTokens": 13171,
+    "outputTokens": 3,
+    "latencyMs": 3804
+  },
+  {
+    "questionId": "q138",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "111683",
+    "actual": "111683",
+    "correct": true,
+    "inputTokens": 14481,
+    "outputTokens": 6,
+    "latencyMs": 1498
+  },
+  {
+    "questionId": "q139",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "13364",
+    "actual": "13364",
+    "correct": true,
+    "inputTokens": 15194,
+    "outputTokens": 3,
+    "latencyMs": 1726
+  },
+  {
+    "questionId": "q139",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "13364",
+    "actual": "13364",
+    "correct": true,
+    "inputTokens": 17412,
+    "outputTokens": 6,
+    "latencyMs": 1526
+  },
+  {
+    "questionId": "q139",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "13364",
+    "actual": "13364",
+    "correct": true,
+    "inputTokens": 8795,
+    "outputTokens": 3,
+    "latencyMs": 1685
+  },
+  {
+    "questionId": "q139",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "13364",
+    "actual": "13364",
+    "correct": true,
+    "inputTokens": 9282,
+    "outputTokens": 6,
+    "latencyMs": 1140
+  },
+  {
+    "questionId": "q139",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "13364",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 8563,
+    "outputTokens": 2,
+    "latencyMs": 1933
+  },
+  {
+    "questionId": "q139",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "13364",
+    "actual": "13364",
+    "correct": true,
+    "inputTokens": 9128,
+    "outputTokens": 6,
+    "latencyMs": 1157
+  },
+  {
+    "questionId": "q139",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "13364",
+    "actual": "13364",
+    "correct": true,
+    "inputTokens": 15488,
+    "outputTokens": 3,
+    "latencyMs": 1249
+  },
+  {
+    "questionId": "q139",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "13364",
+    "actual": "13364",
+    "correct": true,
+    "inputTokens": 15370,
+    "outputTokens": 6,
+    "latencyMs": 1347
+  },
+  {
+    "questionId": "q139",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "13364",
+    "actual": "13364",
+    "correct": true,
+    "inputTokens": 13178,
+    "outputTokens": 3,
+    "latencyMs": 2174
+  },
+  {
+    "questionId": "q139",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "13364",
+    "actual": "13364",
+    "correct": true,
+    "inputTokens": 14486,
+    "outputTokens": 6,
+    "latencyMs": 1197
+  },
+  {
+    "questionId": "q140",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "98464",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 15186,
+    "outputTokens": 2,
+    "latencyMs": 3252
+  },
+  {
+    "questionId": "q140",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "98464",
+    "actual": "98464",
+    "correct": true,
+    "inputTokens": 17405,
+    "outputTokens": 6,
+    "latencyMs": 1667
+  },
+  {
+    "questionId": "q140",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "98464",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 8787,
+    "outputTokens": 2,
+    "latencyMs": 1192
+  },
+  {
+    "questionId": "q140",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "98464",
+    "actual": "98464",
+    "correct": true,
+    "inputTokens": 9275,
+    "outputTokens": 6,
+    "latencyMs": 1113
+  },
+  {
+    "questionId": "q140",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "98464",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 8555,
+    "outputTokens": 2,
+    "latencyMs": 2198
+  },
+  {
+    "questionId": "q140",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "98464",
+    "actual": "98464",
+    "correct": true,
+    "inputTokens": 9121,
+    "outputTokens": 6,
+    "latencyMs": 1187
+  },
+  {
+    "questionId": "q140",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "98464",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 15480,
+    "outputTokens": 2,
+    "latencyMs": 8573
+  },
+  {
+    "questionId": "q140",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "98464",
+    "actual": "98464",
+    "correct": true,
+    "inputTokens": 15363,
+    "outputTokens": 6,
+    "latencyMs": 1311
+  },
+  {
+    "questionId": "q140",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "98464",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 13170,
+    "outputTokens": 2,
+    "latencyMs": 3471
+  },
+  {
+    "questionId": "q140",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "98464",
+    "actual": "98464",
+    "correct": true,
+    "inputTokens": 14479,
+    "outputTokens": 6,
+    "latencyMs": 1457
+  },
+  {
+    "questionId": "q141",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "6378",
+    "actual": "6378",
+    "correct": true,
+    "inputTokens": 15188,
+    "outputTokens": 3,
+    "latencyMs": 1363
+  },
+  {
+    "questionId": "q141",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "6378",
+    "actual": "6378",
+    "correct": true,
+    "inputTokens": 17408,
+    "outputTokens": 6,
+    "latencyMs": 1803
+  },
+  {
+    "questionId": "q141",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "6378",
+    "actual": "6378",
+    "correct": true,
+    "inputTokens": 8789,
+    "outputTokens": 3,
+    "latencyMs": 3696
+  },
+  {
+    "questionId": "q141",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "6378",
+    "actual": "6378",
+    "correct": true,
+    "inputTokens": 9278,
+    "outputTokens": 6,
+    "latencyMs": 1391
+  },
+  {
+    "questionId": "q141",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "6378",
+    "actual": "93731",
+    "correct": false,
+    "inputTokens": 8557,
+    "outputTokens": 3,
+    "latencyMs": 7861
+  },
+  {
+    "questionId": "q141",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "6378",
+    "actual": "6378",
+    "correct": true,
+    "inputTokens": 9124,
+    "outputTokens": 6,
+    "latencyMs": 1420
+  },
+  {
+    "questionId": "q141",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "6378",
+    "actual": "6378",
+    "correct": true,
+    "inputTokens": 15482,
+    "outputTokens": 3,
+    "latencyMs": 1769
+  },
+  {
+    "questionId": "q141",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "6378",
+    "actual": "6378",
+    "correct": true,
+    "inputTokens": 15366,
+    "outputTokens": 6,
+    "latencyMs": 1233
+  },
+  {
+    "questionId": "q141",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "6378",
+    "actual": "93731",
+    "correct": false,
+    "inputTokens": 13172,
+    "outputTokens": 3,
+    "latencyMs": 1831
+  },
+  {
+    "questionId": "q141",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "6378",
+    "actual": "6378",
+    "correct": true,
+    "inputTokens": 14482,
+    "outputTokens": 6,
+    "latencyMs": 1507
+  },
+  {
+    "questionId": "q142",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "254916",
+    "actual": "254916",
+    "correct": true,
+    "inputTokens": 15190,
+    "outputTokens": 3,
+    "latencyMs": 10752
+  },
+  {
+    "questionId": "q142",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "254916",
+    "actual": "254916",
+    "correct": true,
+    "inputTokens": 17409,
+    "outputTokens": 6,
+    "latencyMs": 1672
+  },
+  {
+    "questionId": "q142",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "254916",
+    "actual": "254916",
+    "correct": true,
+    "inputTokens": 8791,
+    "outputTokens": 3,
+    "latencyMs": 1788
+  },
+  {
+    "questionId": "q142",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "254916",
+    "actual": "254916",
+    "correct": true,
+    "inputTokens": 9279,
+    "outputTokens": 6,
+    "latencyMs": 1633
+  },
+  {
+    "questionId": "q142",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "254916",
+    "actual": "254916",
+    "correct": true,
+    "inputTokens": 8559,
+    "outputTokens": 3,
+    "latencyMs": 1365
+  },
+  {
+    "questionId": "q142",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "254916",
+    "actual": "254916",
+    "correct": true,
+    "inputTokens": 9125,
+    "outputTokens": 6,
+    "latencyMs": 1242
+  },
+  {
+    "questionId": "q142",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "254916",
+    "actual": "254916",
+    "correct": true,
+    "inputTokens": 15484,
+    "outputTokens": 3,
+    "latencyMs": 2237
+  },
+  {
+    "questionId": "q142",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "254916",
+    "actual": "254916",
+    "correct": true,
+    "inputTokens": 15367,
+    "outputTokens": 6,
+    "latencyMs": 1275
+  },
+  {
+    "questionId": "q142",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "254916",
+    "actual": "254916",
+    "correct": true,
+    "inputTokens": 13174,
+    "outputTokens": 3,
+    "latencyMs": 3028
+  },
+  {
+    "questionId": "q142",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "254916",
+    "actual": "254916",
+    "correct": true,
+    "inputTokens": 14483,
+    "outputTokens": 6,
+    "latencyMs": 1615
+  },
+  {
+    "questionId": "q143",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "32413",
+    "actual": "32413",
+    "correct": true,
+    "inputTokens": 15188,
+    "outputTokens": 3,
+    "latencyMs": 1972
+  },
+  {
+    "questionId": "q143",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "32413",
+    "actual": "32413",
+    "correct": true,
+    "inputTokens": 17410,
+    "outputTokens": 6,
+    "latencyMs": 2308
+  },
+  {
+    "questionId": "q143",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "32413",
+    "actual": "32413",
+    "correct": true,
+    "inputTokens": 8789,
+    "outputTokens": 3,
+    "latencyMs": 1361
+  },
+  {
+    "questionId": "q143",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "32413",
+    "actual": "32413",
+    "correct": true,
+    "inputTokens": 9280,
+    "outputTokens": 6,
+    "latencyMs": 1162
+  },
+  {
+    "questionId": "q143",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "32413",
+    "actual": "32413",
+    "correct": true,
+    "inputTokens": 8557,
+    "outputTokens": 3,
+    "latencyMs": 2196
+  },
+  {
+    "questionId": "q143",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "32413",
+    "actual": "32413",
+    "correct": true,
+    "inputTokens": 9126,
+    "outputTokens": 6,
+    "latencyMs": 1199
+  },
+  {
+    "questionId": "q143",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "32413",
+    "actual": "32413",
+    "correct": true,
+    "inputTokens": 15482,
+    "outputTokens": 3,
+    "latencyMs": 1758
+  },
+  {
+    "questionId": "q143",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "32413",
+    "actual": "32413",
+    "correct": true,
+    "inputTokens": 15368,
+    "outputTokens": 6,
+    "latencyMs": 1340
+  },
+  {
+    "questionId": "q143",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "32413",
+    "actual": "32413",
+    "correct": true,
+    "inputTokens": 13172,
+    "outputTokens": 3,
+    "latencyMs": 2122
+  },
+  {
+    "questionId": "q143",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "32413",
+    "actual": "32413",
+    "correct": true,
+    "inputTokens": 14484,
+    "outputTokens": 6,
+    "latencyMs": 1156
+  },
+  {
+    "questionId": "q144",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "240059",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 15186,
+    "outputTokens": 2,
+    "latencyMs": 1208
+  },
+  {
+    "questionId": "q144",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "240059",
+    "actual": "240059",
+    "correct": true,
+    "inputTokens": 17405,
+    "outputTokens": 6,
+    "latencyMs": 1826
+  },
+  {
+    "questionId": "q144",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "240059",
+    "actual": "undefined",
+    "correct": false,
+    "inputTokens": 8787,
+    "outputTokens": 2,
+    "latencyMs": 2224
+  },
+  {
+    "questionId": "q144",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "240059",
+    "actual": "240059",
+    "correct": true,
+    "inputTokens": 9275,
+    "outputTokens": 6,
+    "latencyMs": 1220
+  },
+  {
+    "questionId": "q144",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "240059",
+    "actual": "undefined",
+    "correct": false,
+    "inputTokens": 8555,
+    "outputTokens": 2,
+    "latencyMs": 1199
+  },
+  {
+    "questionId": "q144",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "240059",
+    "actual": "240059",
+    "correct": true,
+    "inputTokens": 9121,
+    "outputTokens": 6,
+    "latencyMs": 1264
+  },
+  {
+    "questionId": "q144",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "240059",
+    "actual": "undefined/react does not exist in the provided data.",
+    "correct": false,
+    "inputTokens": 15480,
+    "outputTokens": 11,
+    "latencyMs": 3072
+  },
+  {
+    "questionId": "q144",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "240059",
+    "actual": "240059",
+    "correct": true,
+    "inputTokens": 15363,
+    "outputTokens": 6,
+    "latencyMs": 1609
+  },
+  {
+    "questionId": "q144",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "240059",
+    "actual": "undefined/react does not exist in the provided data.",
+    "correct": false,
+    "inputTokens": 13170,
+    "outputTokens": 11,
+    "latencyMs": 2608
+  },
+  {
+    "questionId": "q144",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "240059",
+    "actual": "240059",
+    "correct": true,
+    "inputTokens": 14479,
+    "outputTokens": 6,
+    "latencyMs": 1237
+  },
+  {
+    "questionId": "q145",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "48986",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 15187,
+    "outputTokens": 2,
+    "latencyMs": 1906
+  },
+  {
+    "questionId": "q145",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "48986",
+    "actual": "48986",
+    "correct": true,
+    "inputTokens": 17406,
+    "outputTokens": 6,
+    "latencyMs": 1399
+  },
+  {
+    "questionId": "q145",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "48986",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 8788,
+    "outputTokens": 2,
+    "latencyMs": 2026
+  },
+  {
+    "questionId": "q145",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "48986",
+    "actual": "48986",
+    "correct": true,
+    "inputTokens": 9276,
+    "outputTokens": 6,
+    "latencyMs": 1318
+  },
+  {
+    "questionId": "q145",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "48986",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 8556,
+    "outputTokens": 2,
+    "latencyMs": 1605
+  },
+  {
+    "questionId": "q145",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "48986",
+    "actual": "48986",
+    "correct": true,
+    "inputTokens": 9122,
+    "outputTokens": 6,
+    "latencyMs": 1270
+  },
+  {
+    "questionId": "q145",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "48986",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 15481,
+    "outputTokens": 2,
+    "latencyMs": 5367
+  },
+  {
+    "questionId": "q145",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "48986",
+    "actual": "48986",
+    "correct": true,
+    "inputTokens": 15364,
+    "outputTokens": 6,
+    "latencyMs": 1204
+  },
+  {
+    "questionId": "q145",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "48986",
+    "actual": "The repository \"undefined/Python\" does not exist in the provided data.",
+    "correct": false,
+    "inputTokens": 13171,
+    "outputTokens": 16,
+    "latencyMs": 6329
+  },
+  {
+    "questionId": "q145",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "48986",
+    "actual": "48986",
+    "correct": true,
+    "inputTokens": 14480,
+    "outputTokens": 6,
+    "latencyMs": 1369
+  },
+  {
+    "questionId": "q146",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "209624",
+    "actual": "209624",
+    "correct": true,
+    "inputTokens": 15186,
+    "outputTokens": 3,
+    "latencyMs": 2063
+  },
+  {
+    "questionId": "q146",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "209624",
+    "actual": "209624",
+    "correct": true,
+    "inputTokens": 17405,
+    "outputTokens": 6,
+    "latencyMs": 1470
+  },
+  {
+    "questionId": "q146",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "209624",
+    "actual": "209624",
+    "correct": true,
+    "inputTokens": 8787,
+    "outputTokens": 3,
+    "latencyMs": 1386
+  },
+  {
+    "questionId": "q146",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "209624",
+    "actual": "209624",
+    "correct": true,
+    "inputTokens": 9275,
+    "outputTokens": 6,
+    "latencyMs": 1104
+  },
+  {
+    "questionId": "q146",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "209624",
+    "actual": "209624",
+    "correct": true,
+    "inputTokens": 8555,
+    "outputTokens": 3,
+    "latencyMs": 1747
+  },
+  {
+    "questionId": "q146",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "209624",
+    "actual": "209624",
+    "correct": true,
+    "inputTokens": 9121,
+    "outputTokens": 6,
+    "latencyMs": 1300
+  },
+  {
+    "questionId": "q146",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "209624",
+    "actual": "209624",
+    "correct": true,
+    "inputTokens": 15480,
+    "outputTokens": 3,
+    "latencyMs": 1443
+  },
+  {
+    "questionId": "q146",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "209624",
+    "actual": "209624",
+    "correct": true,
+    "inputTokens": 15363,
+    "outputTokens": 6,
+    "latencyMs": 1282
+  },
+  {
+    "questionId": "q146",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "209624",
+    "actual": "209624",
+    "correct": true,
+    "inputTokens": 13170,
+    "outputTokens": 3,
+    "latencyMs": 2185
+  },
+  {
+    "questionId": "q146",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "209624",
+    "actual": "209624",
+    "correct": true,
+    "inputTokens": 14479,
+    "outputTokens": 6,
+    "latencyMs": 1407
+  },
+  {
+    "questionId": "q147",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "58023",
+    "actual": "58023",
+    "correct": true,
+    "inputTokens": 15186,
+    "outputTokens": 3,
+    "latencyMs": 1743
+  },
+  {
+    "questionId": "q147",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "58023",
+    "actual": "58023",
+    "correct": true,
+    "inputTokens": 17406,
+    "outputTokens": 6,
+    "latencyMs": 1564
+  },
+  {
+    "questionId": "q147",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "58023",
+    "actual": "58023",
+    "correct": true,
+    "inputTokens": 8787,
+    "outputTokens": 3,
+    "latencyMs": 1317
+  },
+  {
+    "questionId": "q147",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "58023",
+    "actual": "58023",
+    "correct": true,
+    "inputTokens": 9276,
+    "outputTokens": 6,
+    "latencyMs": 1258
+  },
+  {
+    "questionId": "q147",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "58023",
+    "actual": "58023",
+    "correct": true,
+    "inputTokens": 8555,
+    "outputTokens": 3,
+    "latencyMs": 2419
+  },
+  {
+    "questionId": "q147",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "58023",
+    "actual": "58023",
+    "correct": true,
+    "inputTokens": 9122,
+    "outputTokens": 6,
+    "latencyMs": 1171
+  },
+  {
+    "questionId": "q147",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "58023",
+    "actual": "undefined/linux does not exist in the provided data.",
+    "correct": false,
+    "inputTokens": 15480,
+    "outputTokens": 11,
+    "latencyMs": 1680
+  },
+  {
+    "questionId": "q147",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "58023",
+    "actual": "58023",
+    "correct": true,
+    "inputTokens": 15364,
+    "outputTokens": 6,
+    "latencyMs": 1396
+  },
+  {
+    "questionId": "q147",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "58023",
+    "actual": "The repository \"undefined/linux\" does not exist in the provided data.",
+    "correct": false,
+    "inputTokens": 13170,
+    "outputTokens": 15,
+    "latencyMs": 1418
+  },
+  {
+    "questionId": "q147",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "58023",
+    "actual": "58023",
+    "correct": true,
+    "inputTokens": 14480,
+    "outputTokens": 6,
+    "latencyMs": 1399
+  },
+  {
+    "questionId": "q148",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "196024",
+    "actual": "196024",
+    "correct": true,
+    "inputTokens": 15189,
+    "outputTokens": 3,
+    "latencyMs": 1673
+  },
+  {
+    "questionId": "q148",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "196024",
+    "actual": "196024",
+    "correct": true,
+    "inputTokens": 17407,
+    "outputTokens": 6,
+    "latencyMs": 1736
+  },
+  {
+    "questionId": "q148",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "196024",
+    "actual": "196024",
+    "correct": true,
+    "inputTokens": 8790,
+    "outputTokens": 3,
+    "latencyMs": 1754
+  },
+  {
+    "questionId": "q148",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "196024",
+    "actual": "196024",
+    "correct": true,
+    "inputTokens": 9277,
+    "outputTokens": 6,
+    "latencyMs": 1317
+  },
+  {
+    "questionId": "q148",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "196024",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 8558,
+    "outputTokens": 2,
+    "latencyMs": 3219
+  },
+  {
+    "questionId": "q148",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "196024",
+    "actual": "196024",
+    "correct": true,
+    "inputTokens": 9123,
+    "outputTokens": 6,
+    "latencyMs": 1311
+  },
+  {
+    "questionId": "q148",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "196024",
+    "actual": "196024",
+    "correct": true,
+    "inputTokens": 15483,
+    "outputTokens": 3,
+    "latencyMs": 1346
+  },
+  {
+    "questionId": "q148",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "196024",
+    "actual": "196024",
+    "correct": true,
+    "inputTokens": 15365,
+    "outputTokens": 6,
+    "latencyMs": 1560
+  },
+  {
+    "questionId": "q148",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "196024",
+    "actual": "196024",
+    "correct": true,
+    "inputTokens": 13173,
+    "outputTokens": 3,
+    "latencyMs": 1009
+  },
+  {
+    "questionId": "q148",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "196024",
+    "actual": "196024",
+    "correct": true,
+    "inputTokens": 14481,
+    "outputTokens": 6,
+    "latencyMs": 1446
+  },
+  {
+    "questionId": "q149",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "30919",
+    "actual": "30919",
+    "correct": true,
+    "inputTokens": 15189,
+    "outputTokens": 3,
+    "latencyMs": 3361
+  },
+  {
+    "questionId": "q149",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "30919",
+    "actual": "30919",
+    "correct": true,
+    "inputTokens": 17408,
+    "outputTokens": 6,
+    "latencyMs": 1788
+  },
+  {
+    "questionId": "q149",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "30919",
+    "actual": "30919",
+    "correct": true,
+    "inputTokens": 8790,
+    "outputTokens": 3,
+    "latencyMs": 1123
+  },
+  {
+    "questionId": "q149",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "30919",
+    "actual": "30919",
+    "correct": true,
+    "inputTokens": 9278,
+    "outputTokens": 6,
+    "latencyMs": 1235
+  },
+  {
+    "questionId": "q149",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "30919",
+    "actual": "30919",
+    "correct": true,
+    "inputTokens": 8558,
+    "outputTokens": 3,
+    "latencyMs": 1100
+  },
+  {
+    "questionId": "q149",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "30919",
+    "actual": "30919",
+    "correct": true,
+    "inputTokens": 9124,
+    "outputTokens": 6,
+    "latencyMs": 1188
+  },
+  {
+    "questionId": "q149",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "30919",
+    "actual": "30919",
+    "correct": true,
+    "inputTokens": 15483,
+    "outputTokens": 3,
+    "latencyMs": 1557
+  },
+  {
+    "questionId": "q149",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "30919",
+    "actual": "30919",
+    "correct": true,
+    "inputTokens": 15366,
+    "outputTokens": 6,
+    "latencyMs": 1352
+  },
+  {
+    "questionId": "q149",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "30919",
+    "actual": "30919",
+    "correct": true,
+    "inputTokens": 13173,
+    "outputTokens": 3,
+    "latencyMs": 1280
+  },
+  {
+    "questionId": "q149",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "30919",
+    "actual": "30919",
+    "correct": true,
+    "inputTokens": 14482,
+    "outputTokens": 6,
+    "latencyMs": 1247
+  },
+  {
+    "questionId": "q150",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "192220",
+    "actual": "192220",
+    "correct": true,
+    "inputTokens": 15188,
+    "outputTokens": 3,
+    "latencyMs": 1394
+  },
+  {
+    "questionId": "q150",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "192220",
+    "actual": "192220",
+    "correct": true,
+    "inputTokens": 17405,
+    "outputTokens": 6,
+    "latencyMs": 1801
+  },
+  {
+    "questionId": "q150",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "192220",
+    "actual": "192220",
+    "correct": true,
+    "inputTokens": 8789,
+    "outputTokens": 3,
+    "latencyMs": 2052
+  },
+  {
+    "questionId": "q150",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "192220",
+    "actual": "192220",
+    "correct": true,
+    "inputTokens": 9275,
+    "outputTokens": 6,
+    "latencyMs": 1176
+  },
+  {
+    "questionId": "q150",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "192220",
+    "actual": "192220",
+    "correct": true,
+    "inputTokens": 8557,
+    "outputTokens": 3,
+    "latencyMs": 2084
+  },
+  {
+    "questionId": "q150",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "192220",
+    "actual": "192220",
+    "correct": true,
+    "inputTokens": 9121,
+    "outputTokens": 6,
+    "latencyMs": 1191
+  },
+  {
+    "questionId": "q150",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "192220",
+    "actual": "192220",
+    "correct": true,
+    "inputTokens": 15482,
+    "outputTokens": 3,
+    "latencyMs": 1261
+  },
+  {
+    "questionId": "q150",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "192220",
+    "actual": "192220",
+    "correct": true,
+    "inputTokens": 15363,
+    "outputTokens": 6,
+    "latencyMs": 1355
+  },
+  {
+    "questionId": "q150",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "192220",
+    "actual": "192220",
+    "correct": true,
+    "inputTokens": 13172,
+    "outputTokens": 3,
+    "latencyMs": 3388
+  },
+  {
+    "questionId": "q150",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "192220",
+    "actual": "192220",
+    "correct": true,
+    "inputTokens": 14479,
+    "outputTokens": 6,
+    "latencyMs": 1591
+  },
+  {
+    "questionId": "q151",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "11763",
+    "actual": "11763",
+    "correct": true,
+    "inputTokens": 15191,
+    "outputTokens": 3,
+    "latencyMs": 1942
+  },
+  {
+    "questionId": "q151",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "11763",
+    "actual": "11763",
+    "correct": true,
+    "inputTokens": 17414,
+    "outputTokens": 6,
+    "latencyMs": 1340
+  },
+  {
+    "questionId": "q151",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "11763",
+    "actual": "11763",
+    "correct": true,
+    "inputTokens": 8792,
+    "outputTokens": 3,
+    "latencyMs": 1443
+  },
+  {
+    "questionId": "q151",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "11763",
+    "actual": "11763",
+    "correct": true,
+    "inputTokens": 9284,
+    "outputTokens": 6,
+    "latencyMs": 1732
+  },
+  {
+    "questionId": "q151",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "11763",
+    "actual": "11763",
+    "correct": true,
+    "inputTokens": 8560,
+    "outputTokens": 3,
+    "latencyMs": 1994
+  },
+  {
+    "questionId": "q151",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "11763",
+    "actual": "11763",
+    "correct": true,
+    "inputTokens": 9130,
+    "outputTokens": 6,
+    "latencyMs": 1198
+  },
+  {
+    "questionId": "q151",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "11763",
+    "actual": "11763",
+    "correct": true,
+    "inputTokens": 15485,
+    "outputTokens": 3,
+    "latencyMs": 5013
+  },
+  {
+    "questionId": "q151",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "11763",
+    "actual": "11763",
+    "correct": true,
+    "inputTokens": 15372,
+    "outputTokens": 6,
+    "latencyMs": 1463
+  },
+  {
+    "questionId": "q151",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "11763",
+    "actual": "11763",
+    "correct": true,
+    "inputTokens": 13175,
+    "outputTokens": 3,
+    "latencyMs": 1296
+  },
+  {
+    "questionId": "q151",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "11763",
+    "actual": "11763",
+    "correct": true,
+    "inputTokens": 14488,
+    "outputTokens": 6,
+    "latencyMs": 2877
+  },
+  {
+    "questionId": "q152",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 15188,
+    "outputTokens": 2,
+    "latencyMs": 2160
+  },
+  {
+    "questionId": "q152",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 17406,
+    "outputTokens": 5,
+    "latencyMs": 1947
+  },
+  {
+    "questionId": "q152",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 8789,
+    "outputTokens": 2,
+    "latencyMs": 1222
+  },
+  {
+    "questionId": "q152",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 9276,
+    "outputTokens": 5,
+    "latencyMs": 1487
+  },
+  {
+    "questionId": "q152",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 8557,
+    "outputTokens": 2,
+    "latencyMs": 1450
+  },
+  {
+    "questionId": "q152",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 9122,
+    "outputTokens": 5,
+    "latencyMs": 1358
+  },
+  {
+    "questionId": "q152",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 15482,
+    "outputTokens": 2,
+    "latencyMs": 873
+  },
+  {
+    "questionId": "q152",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "100",
+    "correct": true,
+    "inputTokens": 15364,
+    "outputTokens": 5,
+    "latencyMs": 1500
+  },
+  {
+    "questionId": "q152",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 13172,
+    "outputTokens": 2,
+    "latencyMs": 7031
+  },
+  {
+    "questionId": "q152",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "0",
+    "correct": false,
+    "inputTokens": 14480,
+    "outputTokens": 5,
+    "latencyMs": 1916
+  },
+  {
+    "questionId": "q153",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "15404143",
+    "actual": "43115556",
+    "correct": false,
+    "inputTokens": 15189,
+    "outputTokens": 4,
+    "latencyMs": 3324
+  },
+  {
+    "questionId": "q153",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "15404143",
+    "actual": "13,847,892",
+    "correct": false,
+    "inputTokens": 17407,
+    "outputTokens": 9,
+    "latencyMs": 1607
+  },
+  {
+    "questionId": "q153",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "15404143",
+    "actual": "10419582",
+    "correct": false,
+    "inputTokens": 8790,
+    "outputTokens": 4,
+    "latencyMs": 900
+  },
+  {
+    "questionId": "q153",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "15404143",
+    "actual": "13,847,892",
+    "correct": false,
+    "inputTokens": 9277,
+    "outputTokens": 9,
+    "latencyMs": 1385
+  },
+  {
+    "questionId": "q153",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "15404143",
+    "actual": "10419582",
+    "correct": false,
+    "inputTokens": 8558,
+    "outputTokens": 4,
+    "latencyMs": 1922
+  },
+  {
+    "questionId": "q153",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "15404143",
+    "actual": "15,847,892",
+    "correct": false,
+    "inputTokens": 9123,
+    "outputTokens": 9,
+    "latencyMs": 1230
+  },
+  {
+    "questionId": "q153",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "15404143",
+    "actual": "10419580",
+    "correct": false,
+    "inputTokens": 15483,
+    "outputTokens": 4,
+    "latencyMs": 1716
+  },
+  {
+    "questionId": "q153",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "15404143",
+    "actual": "13,847,892",
+    "correct": false,
+    "inputTokens": 15365,
+    "outputTokens": 9,
+    "latencyMs": 1384
+  },
+  {
+    "questionId": "q153",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "15404143",
+    "actual": "Total number of stars across all repositories is 4,978,155.",
+    "correct": false,
+    "inputTokens": 13173,
+    "outputTokens": 16,
+    "latencyMs": 3411
+  },
+  {
+    "questionId": "q153",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "15404143",
+    "actual": "13,847,892",
+    "correct": false,
+    "inputTokens": 14481,
+    "outputTokens": 9,
+    "latencyMs": 1539
+  },
+  {
+    "questionId": "q154",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "77",
+    "correct": false,
+    "inputTokens": 15189,
+    "outputTokens": 2,
+    "latencyMs": 2523
+  },
+  {
+    "questionId": "q154",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "100",
+    "correct": true,
+    "inputTokens": 17408,
+    "outputTokens": 5,
+    "latencyMs": 1885
+  },
+  {
+    "questionId": "q154",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "42",
+    "correct": false,
+    "inputTokens": 8790,
+    "outputTokens": 2,
+    "latencyMs": 1148
+  },
+  {
+    "questionId": "q154",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "100",
+    "correct": true,
+    "inputTokens": 9278,
+    "outputTokens": 5,
+    "latencyMs": 1378
+  },
+  {
+    "questionId": "q154",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "42",
+    "correct": false,
+    "inputTokens": 8558,
+    "outputTokens": 2,
+    "latencyMs": 1364
+  },
+  {
+    "questionId": "q154",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "100",
+    "correct": true,
+    "inputTokens": 9124,
+    "outputTokens": 5,
+    "latencyMs": 1125
+  },
+  {
+    "questionId": "q154",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "104",
+    "correct": false,
+    "inputTokens": 15483,
+    "outputTokens": 2,
+    "latencyMs": 1276
+  },
+  {
+    "questionId": "q154",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "100",
+    "correct": true,
+    "inputTokens": 15366,
+    "outputTokens": 5,
+    "latencyMs": 1331
+  },
+  {
+    "questionId": "q154",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "77",
+    "correct": false,
+    "inputTokens": 13173,
+    "outputTokens": 2,
+    "latencyMs": 1534
+  },
+  {
+    "questionId": "q154",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "100",
+    "correct": true,
+    "inputTokens": 14482,
+    "outputTokens": 5,
+    "latencyMs": 1282
+  },
+  {
+    "questionId": "q155",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "19",
+    "correct": false,
+    "inputTokens": 15189,
+    "outputTokens": 2,
+    "latencyMs": 2206
+  },
+  {
+    "questionId": "q155",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "71",
+    "correct": false,
+    "inputTokens": 17408,
+    "outputTokens": 5,
+    "latencyMs": 1568
+  },
+  {
+    "questionId": "q155",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "15",
+    "correct": false,
+    "inputTokens": 8790,
+    "outputTokens": 2,
+    "latencyMs": 1478
+  },
+  {
+    "questionId": "q155",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "42",
+    "correct": false,
+    "inputTokens": 9278,
+    "outputTokens": 5,
+    "latencyMs": 1314
+  },
+  {
+    "questionId": "q155",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "12",
+    "correct": false,
+    "inputTokens": 8558,
+    "outputTokens": 2,
+    "latencyMs": 2149
+  },
+  {
+    "questionId": "q155",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "47",
+    "correct": false,
+    "inputTokens": 9124,
+    "outputTokens": 5,
+    "latencyMs": 1485
+  },
+  {
+    "questionId": "q155",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "34",
+    "correct": false,
+    "inputTokens": 15483,
+    "outputTokens": 2,
+    "latencyMs": 1043
+  },
+  {
+    "questionId": "q155",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "71",
+    "correct": false,
+    "inputTokens": 15366,
+    "outputTokens": 5,
+    "latencyMs": 1371
+  },
+  {
+    "questionId": "q155",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "34",
+    "correct": false,
+    "inputTokens": 13173,
+    "outputTokens": 2,
+    "latencyMs": 1693
+  },
+  {
+    "questionId": "q155",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "71",
+    "correct": false,
+    "inputTokens": 14482,
+    "outputTokens": 5,
+    "latencyMs": 1237
+  },
+  {
+    "questionId": "q156",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "76",
+    "actual": "82",
+    "correct": false,
+    "inputTokens": 15189,
+    "outputTokens": 2,
+    "latencyMs": 927
+  },
+  {
+    "questionId": "q156",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "76",
+    "actual": "100",
+    "correct": false,
+    "inputTokens": 17408,
+    "outputTokens": 5,
+    "latencyMs": 1274
+  },
+  {
+    "questionId": "q156",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "76",
+    "actual": "34",
+    "correct": false,
+    "inputTokens": 8790,
+    "outputTokens": 2,
+    "latencyMs": 2541
+  },
+  {
+    "questionId": "q156",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "76",
+    "actual": "100",
+    "correct": false,
+    "inputTokens": 9278,
+    "outputTokens": 5,
+    "latencyMs": 1116
+  },
+  {
+    "questionId": "q156",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "76",
+    "actual": "34",
+    "correct": false,
+    "inputTokens": 8558,
+    "outputTokens": 2,
+    "latencyMs": 997
+  },
+  {
+    "questionId": "q156",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "76",
+    "actual": "100",
+    "correct": false,
+    "inputTokens": 9124,
+    "outputTokens": 5,
+    "latencyMs": 1513
+  },
+  {
+    "questionId": "q156",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "76",
+    "actual": "104",
+    "correct": false,
+    "inputTokens": 15483,
+    "outputTokens": 2,
+    "latencyMs": 3168
+  },
+  {
+    "questionId": "q156",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "76",
+    "actual": "100",
+    "correct": false,
+    "inputTokens": 15366,
+    "outputTokens": 5,
+    "latencyMs": 1498
+  },
+  {
+    "questionId": "q156",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "76",
+    "actual": "66",
+    "correct": false,
+    "inputTokens": 13173,
+    "outputTokens": 2,
+    "latencyMs": 1600
+  },
+  {
+    "questionId": "q156",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "76",
+    "actual": "100",
+    "correct": false,
+    "inputTokens": 14482,
+    "outputTokens": 5,
+    "latencyMs": 1519
+  },
+  {
+    "questionId": "q157",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "77",
+    "correct": false,
+    "inputTokens": 15189,
+    "outputTokens": 2,
+    "latencyMs": 1809
+  },
+  {
+    "questionId": "q157",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "89",
+    "correct": false,
+    "inputTokens": 17409,
+    "outputTokens": 5,
+    "latencyMs": 1409
+  },
+  {
+    "questionId": "q157",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "66",
+    "correct": false,
+    "inputTokens": 8790,
+    "outputTokens": 2,
+    "latencyMs": 1367
+  },
+  {
+    "questionId": "q157",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "73",
+    "correct": false,
+    "inputTokens": 9279,
+    "outputTokens": 5,
+    "latencyMs": 1296
+  },
+  {
+    "questionId": "q157",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "66",
+    "correct": false,
+    "inputTokens": 8558,
+    "outputTokens": 2,
+    "latencyMs": 1162
+  },
+  {
+    "questionId": "q157",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "89",
+    "correct": false,
+    "inputTokens": 9125,
+    "outputTokens": 5,
+    "latencyMs": 1435
+  },
+  {
+    "questionId": "q157",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "77",
+    "correct": false,
+    "inputTokens": 15483,
+    "outputTokens": 2,
+    "latencyMs": 1774
+  },
+  {
+    "questionId": "q157",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "95",
+    "correct": false,
+    "inputTokens": 15367,
+    "outputTokens": 5,
+    "latencyMs": 1479
+  },
+  {
+    "questionId": "q157",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "100",
+    "actual": "66",
+    "correct": false,
+    "inputTokens": 13173,
+    "outputTokens": 2,
+    "latencyMs": 2710
+  },
+  {
+    "questionId": "q157",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "100",
+    "actual": "95",
+    "correct": false,
+    "inputTokens": 14483,
+    "outputTokens": 5,
+    "latencyMs": 1272
+  },
+  {
+    "questionId": "q158",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "95",
+    "actual": "42",
+    "correct": false,
+    "inputTokens": 15189,
+    "outputTokens": 2,
+    "latencyMs": 3038
+  },
+  {
+    "questionId": "q158",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "95",
+    "actual": "42",
+    "correct": false,
+    "inputTokens": 17409,
+    "outputTokens": 5,
+    "latencyMs": 1562
+  },
+  {
+    "questionId": "q158",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "95",
+    "actual": "38",
+    "correct": false,
+    "inputTokens": 8790,
+    "outputTokens": 2,
+    "latencyMs": 1536
+  },
+  {
+    "questionId": "q158",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "95",
+    "actual": "42",
+    "correct": false,
+    "inputTokens": 9279,
+    "outputTokens": 5,
+    "latencyMs": 1216
+  },
+  {
+    "questionId": "q158",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "95",
+    "actual": "34",
+    "correct": false,
+    "inputTokens": 8558,
+    "outputTokens": 2,
+    "latencyMs": 1760
+  },
+  {
+    "questionId": "q158",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "95",
+    "actual": "42",
+    "correct": false,
+    "inputTokens": 9125,
+    "outputTokens": 5,
+    "latencyMs": 1255
+  },
+  {
+    "questionId": "q158",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "95",
+    "actual": "66",
+    "correct": false,
+    "inputTokens": 15483,
+    "outputTokens": 2,
+    "latencyMs": 1683
+  },
+  {
+    "questionId": "q158",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "95",
+    "actual": "47",
+    "correct": false,
+    "inputTokens": 15367,
+    "outputTokens": 5,
+    "latencyMs": 2256
+  },
+  {
+    "questionId": "q158",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "95",
+    "actual": "38",
+    "correct": false,
+    "inputTokens": 13173,
+    "outputTokens": 2,
+    "latencyMs": 2831
+  },
+  {
+    "questionId": "q158",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "95",
+    "actual": "47",
+    "correct": false,
+    "inputTokens": 14483,
+    "outputTokens": 5,
+    "latencyMs": 1980
+  },
+  {
+    "questionId": "q159",
+    "format": "json",
+    "model": "gpt-4o-mini",
+    "expected": "83",
+    "actual": "66",
+    "correct": false,
+    "inputTokens": 15189,
+    "outputTokens": 2,
+    "latencyMs": 1327
+  },
+  {
+    "questionId": "q159",
+    "format": "json",
+    "model": "claude-haiku-4-5",
+    "expected": "83",
+    "actual": "71",
+    "correct": false,
+    "inputTokens": 17409,
+    "outputTokens": 5,
+    "latencyMs": 1894
+  },
+  {
+    "questionId": "q159",
+    "format": "toon",
+    "model": "gpt-4o-mini",
+    "expected": "83",
+    "actual": "34",
+    "correct": false,
+    "inputTokens": 8790,
+    "outputTokens": 2,
+    "latencyMs": 784
+  },
+  {
+    "questionId": "q159",
+    "format": "toon",
+    "model": "claude-haiku-4-5",
+    "expected": "83",
+    "actual": "73",
+    "correct": false,
+    "inputTokens": 9279,
+    "outputTokens": 5,
+    "latencyMs": 1422
+  },
+  {
+    "questionId": "q159",
+    "format": "csv",
+    "model": "gpt-4o-mini",
+    "expected": "83",
+    "actual": "34",
+    "correct": false,
+    "inputTokens": 8558,
+    "outputTokens": 2,
+    "latencyMs": 2644
+  },
+  {
+    "questionId": "q159",
+    "format": "csv",
+    "model": "claude-haiku-4-5",
+    "expected": "83",
+    "actual": "73",
+    "correct": false,
+    "inputTokens": 9125,
+    "outputTokens": 5,
+    "latencyMs": 1109
+  },
+  {
+    "questionId": "q159",
+    "format": "markdown-kv",
+    "model": "gpt-4o-mini",
+    "expected": "83",
+    "actual": "66",
+    "correct": false,
+    "inputTokens": 15483,
+    "outputTokens": 2,
+    "latencyMs": 1826
+  },
+  {
+    "questionId": "q159",
+    "format": "markdown-kv",
+    "model": "claude-haiku-4-5",
+    "expected": "83",
+    "actual": "71",
+    "correct": false,
+    "inputTokens": 15367,
+    "outputTokens": 5,
+    "latencyMs": 1342
+  },
+  {
+    "questionId": "q159",
+    "format": "yaml",
+    "model": "gpt-4o-mini",
+    "expected": "83",
+    "actual": "38",
+    "correct": false,
+    "inputTokens": 13173,
+    "outputTokens": 2,
+    "latencyMs": 2055
+  },
+  {
+    "questionId": "q159",
+    "format": "yaml",
+    "model": "claude-haiku-4-5",
+    "expected": "83",
+    "actual": "71",
+    "correct": false,
+    "inputTokens": 14483,
+    "outputTokens": 5,
+    "latencyMs": 1537
+  }
+]
\ No newline at end of file
diff --git a/benchmarks/results/accuracy/report.md b/benchmarks/results/accuracy/report.md
new file mode 100644
index 0000000..9991de9
--- /dev/null
+++ b/benchmarks/results/accuracy/report.md
@@ -0,0 +1,96 @@
+### Retrieval Accuracy
+
+Tested across **2 LLMs** with data retrieval tasks:
+
+```
+gpt-4o-mini          ██████████████░░░░░░ 72.3% accuracy
+claude-haiku-4-5     ███████████████░░░░░ 76.7% accuracy
+```
+
+**TOON achieves 73.9% accuracy (vs JSON's 73.6%) while using 46.3% fewer tokens.**
+
+| Format | Accuracy | Average Tokens |
+| ------ | -------- | -------------- |
+| `toon` | 73.9% | 4,678 |
+| `json` | 73.6% | 8,713 |
+| `markdown-kv` | 73.6% | 8,649 |
+| `csv` | 72.3% | 4,745 |
+| `yaml` | 71.7% | 7,091 |
+
+<details>
+<summary><strong>View detailed breakdown by dataset and model</strong></summary>
+
+#### Performance by Dataset
+
+##### Uniform employee records (TOON optimal format)
+
+| Format | Accuracy | Tokens | Correct/Total |
+|--------|----------|--------|---------------|
+| `toon` | 72.4% | 2,483 | 84/116 |
+| `csv` | 69.0% | 2,337 | 80/116 |
+| `yaml` | 68.1% | 4,969 | 79/116 |
+| `markdown-kv` | 68.1% | 6,270 | 79/116 |
+| `json` | 68.1% | 6,347 | 79/116 |
+
+##### E-commerce orders with nested structures
+
+| Format | Accuracy | Tokens | Correct/Total |
+|--------|----------|--------|---------------|
+| `toon` | 84.1% | 5,967 | 74/88 |
+| `csv` | 83.0% | 6,735 | 73/88 |
+| `yaml` | 81.8% | 7,328 | 72/88 |
+| `markdown-kv` | 86.4% | 9,110 | 76/88 |
+| `json` | 84.1% | 9,694 | 74/88 |
+
+##### Time-series analytics data
+
+| Format | Accuracy | Tokens | Correct/Total |
+|--------|----------|--------|---------------|
+| `csv` | 72.4% | 1,393 | 42/58 |
+| `toon` | 70.7% | 1,515 | 41/58 |
+| `yaml` | 72.4% | 2,938 | 42/58 |
+| `json` | 74.1% | 3,665 | 43/58 |
+| `markdown-kv` | 70.7% | 3,779 | 41/58 |
+
+##### Popular GitHub repositories
+
+| Format | Accuracy | Tokens | Correct/Total |
+|--------|----------|--------|---------------|
+| `toon` | 64.3% | 8,745 | 36/56 |
+| `csv` | 62.5% | 8,513 | 35/56 |
+| `json` | 67.9% | 15,145 | 38/56 |
+| `markdown-kv` | 67.9% | 15,436 | 38/56 |
+| `yaml` | 62.5% | 13,129 | 35/56 |
+
+
+#### Performance by Model
+
+##### gpt-4o-mini
+
+| Format | Accuracy | Correct/Total |
+|--------|----------|---------------|
+| `toon` | 72.3% | 115/159 |
+| `json` | 71.7% | 114/159 |
+| `markdown-kv` | 70.4% | 112/159 |
+| `csv` | 69.2% | 110/159 |
+| `yaml` | 68.6% | 109/159 |
+
+##### claude-haiku-4-5
+
+| Format | Accuracy | Correct/Total |
+|--------|----------|---------------|
+| `markdown-kv` | 76.7% | 122/159 |
+| `toon` | 75.5% | 120/159 |
+| `json` | 75.5% | 120/159 |
+| `csv` | 75.5% | 120/159 |
+| `yaml` | 74.8% | 119/159 |
+
+
+#### Methodology
+
+- **Semantic validation**: LLM-as-judge validates responses semantically (not exact string matching).
+- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding.
+- **Question types**: Field retrieval, aggregation, and filtering tasks.
+- **Datasets**: Synthetic Faker.js-generated datasets plus real GitHub repository data.
+
+</details>
diff --git a/benchmarks/results/accuracy/summary.json b/benchmarks/results/accuracy/summary.json
new file mode 100644
index 0000000..b5dddc2
--- /dev/null
+++ b/benchmarks/results/accuracy/summary.json
@@ -0,0 +1,95 @@
+{
+  "formatResults": [
+    {
+      "format": "toon",
+      "accuracy": 0.7389937106918238,
+      "totalTokens": 4678,
+      "avgInputTokens": 4675,
+      "avgLatency": 1424,
+      "correctCount": 235,
+      "totalCount": 318
+    },
+    {
+      "format": "json",
+      "accuracy": 0.7358490566037735,
+      "totalTokens": 8713,
+      "avgInputTokens": 9177,
+      "avgLatency": 1678,
+      "correctCount": 234,
+      "totalCount": 318
+    },
+    {
+      "format": "markdown-kv",
+      "accuracy": 0.7358490566037735,
+      "totalTokens": 8649,
+      "avgInputTokens": 8242,
+      "avgLatency": 1724,
+      "correctCount": 234,
+      "totalCount": 318
+    },
+    {
+      "format": "csv",
+      "accuracy": 0.7232704402515723,
+      "totalTokens": 4745,
+      "avgInputTokens": 4878,
+      "avgLatency": 1573,
+      "correctCount": 230,
+      "totalCount": 318
+    },
+    {
+      "format": "yaml",
+      "accuracy": 0.7169811320754716,
+      "totalTokens": 7091,
+      "avgInputTokens": 7136,
+      "avgLatency": 1602,
+      "correctCount": 228,
+      "totalCount": 318
+    }
+  ],
+  "questions": 159,
+  "models": [
+    "gpt-4o-mini",
+    "claude-haiku-4-5"
+  ],
+  "datasets": [
+    {
+      "name": "tabular",
+      "description": "Uniform employee records (TOON optimal format)"
+    },
+    {
+      "name": "nested",
+      "description": "E-commerce orders with nested structures"
+    },
+    {
+      "name": "analytics",
+      "description": "Time-series analytics data"
+    },
+    {
+      "name": "github",
+      "description": "Popular GitHub repositories"
+    }
+  ],
+  "tokenCounts": {
+    "json-tabular": 6347,
+    "json-nested": 9694,
+    "json-analytics": 3665,
+    "json-github": 15145,
+    "toon-tabular": 2483,
+    "toon-nested": 5967,
+    "toon-analytics": 1515,
+    "toon-github": 8745,
+    "csv-tabular": 2337,
+    "csv-nested": 6735,
+    "csv-analytics": 1393,
+    "csv-github": 8513,
+    "markdown-kv-tabular": 6270,
+    "markdown-kv-nested": 9110,
+    "markdown-kv-analytics": 3779,
+    "markdown-kv-github": 15436,
+    "yaml-tabular": 4969,
+    "yaml-nested": 7328,
+    "yaml-analytics": 2938,
+    "yaml-github": 13129
+  },
+  "timestamp": "2025-10-27T10:46:35.127Z"
+}
\ No newline at end of file
diff --git a/benchmarks/results/token-efficiency.md b/benchmarks/results/token-efficiency.md
new file mode 100644
index 0000000..090397a
--- /dev/null
+++ b/benchmarks/results/token-efficiency.md
@@ -0,0 +1,141 @@
+### Token Efficiency
+
+```
+⭐ GitHub Repositories       ██████████████░░░░░░░░░░░   8,745 tokens  (JSON: 15,145)  💰 42.3% saved
+📈 Analytics Time Series     ██████████░░░░░░░░░░░░░░░   3,631 tokens  (JSON:  9,024)  💰 59.8% saved
+👥 API Response              ██████████████░░░░░░░░░░░   2,593 tokens  (JSON:  4,589)  💰 43.5% saved
+🛒 E-commerce Order          ███████████████░░░░░░░░░░     203 tokens  (JSON:    338)  💰 39.9% saved
+```
+
+**Total:** 15,172 tokens (TOON) vs 29,096 tokens (JSON) → 47.9% savings
+
+<details>
+<summary><strong>View detailed examples</strong></summary>
+
+#### ⭐ GitHub Repositories
+
+**Configuration:** Top 100 GitHub repositories with stars, forks, and metadata
+
+**Savings:** 6,400 tokens (42.3% reduction)
+
+**JSON** (15,145 tokens):
+
+```json
+{
+  "repositories": [
+    {
+      "id": 28457823,
+      "name": "freeCodeCamp",
+      "repo": "freeCodeCamp/freeCodeCamp",
+      "description": "freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,...",
+      "createdAt": "2014-12-24T17:49:19Z",
+      "updatedAt": "2025-10-27T07:40:58Z",
+      "pushedAt": "2025-10-26T11:31:08Z",
+      "stars": 430828,
+      "watchers": 8582,
+      "forks": 42136,
+      "defaultBranch": "main"
+    },
+    {
+      "id": 132750724,
+      "name": "build-your-own-x",
+      "repo": "codecrafters-io/build-your-own-x",
+      "description": "Master programming by recreating your favorite technologies from scratch.",
+      "createdAt": "2018-05-09T12:03:18Z",
+      "updatedAt": "2025-10-27T07:43:25Z",
+      "pushedAt": "2025-10-10T18:45:01Z",
+      "stars": 430102,
+      "watchers": 6322,
+      "forks": 40388,
+      "defaultBranch": "master"
+    },
+    {
+      "id": 21737465,
+      "name": "awesome",
+      "repo": "sindresorhus/awesome",
+      "description": "😎 Awesome lists about all kinds of interesting topics",
+      "createdAt": "2014-07-11T13:42:37Z",
+      "updatedAt": "2025-10-27T07:44:27Z",
+      "pushedAt": "2025-10-23T17:26:53Z",
+      "stars": 409760,
+      "watchers": 8016,
+      "forks": 32015,
+      "defaultBranch": "main"
+    }
+  ]
+}
+```
+
+**TOON** (8,745 tokens):
+
+```
+repositories[3]{id,name,repo,description,createdAt,updatedAt,pushedAt,stars,watchers,forks,defaultBranch}:
+  28457823,freeCodeCamp,freeCodeCamp/freeCodeCamp,"freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,...","2014-12-24T17:49:19Z","2025-10-27T07:40:58Z","2025-10-26T11:31:08Z",430828,8582,42136,main
+  132750724,build-your-own-x,codecrafters-io/build-your-own-x,Master programming by recreating your favorite technologies from scratch.,"2018-05-09T12:03:18Z","2025-10-27T07:43:25Z","2025-10-10T18:45:01Z",430102,6322,40388,master
+  21737465,awesome,sindresorhus/awesome,😎 Awesome lists about all kinds of interesting topics,"2014-07-11T13:42:37Z","2025-10-27T07:44:27Z","2025-10-23T17:26:53Z",409760,8016,32015,main
+```
+
+---
+
+#### 📈 Analytics Time Series
+
+**Configuration:** 180 days of web metrics (views, clicks, conversions, revenue)
+
+**Savings:** 5,393 tokens (59.8% reduction)
+
+**JSON** (9,024 tokens):
+
+```json
+{
+  "metrics": [
+    {
+      "date": "2024-12-31",
+      "views": 3769,
+      "clicks": 400,
+      "conversions": 59,
+      "revenue": 198.98
+    },
+    {
+      "date": "2025-01-01",
+      "views": 5742,
+      "clicks": 463,
+      "conversions": 28,
+      "revenue": 295.77
+    },
+    {
+      "date": "2025-01-02",
+      "views": 3669,
+      "clicks": 336,
+      "conversions": 102,
+      "revenue": 624.23
+    },
+    {
+      "date": "2025-01-03",
+      "views": 1332,
+      "clicks": 304,
+      "conversions": 99,
+      "revenue": 113.06
+    },
+    {
+      "date": "2025-01-04",
+      "views": 1444,
+      "clicks": 222,
+      "conversions": 88,
+      "revenue": 986.69
+    }
+  ]
+}
+```
+
+**TOON** (3,631 tokens):
+
+```
+metrics[5]{date,views,clicks,conversions,revenue}:
+  2024-12-31,3769,400,59,198.98
+  2025-01-01,5742,463,28,295.77
+  2025-01-02,3669,336,102,624.23
+  2025-01-03,1332,304,99,113.06
+  2025-01-04,1444,222,88,986.69
+```
+
+</details>
diff --git a/benchmarks/scripts/accuracy-benchmark.ts b/benchmarks/scripts/accuracy-benchmark.ts
new file mode 100644
index 0000000..9867e5c
--- /dev/null
+++ b/benchmarks/scripts/accuracy-benchmark.ts
@@ -0,0 +1,140 @@
+/**
+ * TOON LLM Accuracy Benchmark
+ *
+ * Main entry point that orchestrates the full benchmark:
+ * 1. Generate questions from datasets
+ * 2. Format data in all formats (JSON, TOON, CSV, YAML, Markdown-kv)
+ * 3. Evaluate each question with each format using LLMs
+ * 4. Generate reports
+ */
+
+import type { EvaluationResult, Question } from '../src/types'
+import * as fsp from 'node:fs/promises'
+import * as path from 'node:path'
+import { consola } from 'consola'
+import pMap from 'p-map'
+import { BENCHMARKS_DIR, DEFAULT_CONCURRENCY, DRY_RUN, DRY_RUN_LIMITS, ROOT_DIR } from '../src/constants'
+import { datasets } from '../src/datasets'
+import { evaluateQuestion, models } from '../src/evaluate'
+import { formatters } from '../src/formatters'
+import { generateQuestions } from '../src/questions'
+import { calculateFormatResults, calculateTokenCounts, saveResults } from '../src/report'
+
+consola.start('LLM Accuracy Benchmark for TOON')
+
+// Look for artifacts saved by a previous run; when both load, the LLM evaluation further down is skipped
+const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy')
+const rawResultsPath = path.join(resultsDir, 'raw-results.json')
+const summaryPath = path.join(resultsDir, 'summary.json')
+
+let existingResults: EvaluationResult[] | undefined // stays undefined when nothing could be loaded
+let existingTokenCounts: Record<string, number> | undefined
+
+try {
+  const [rawData, summaryData] = await Promise.all([ // read both files concurrently; either failure rejects
+    fsp.readFile(rawResultsPath, 'utf-8'),
+    fsp.readFile(summaryPath, 'utf-8'),
+  ])
+  existingResults = JSON.parse(rawData)
+  const summary = JSON.parse(summaryData)
+  existingTokenCounts = summary.tokenCounts // undefined if the summary has no `tokenCounts` key
+  consola.info('Found existing results – regenerating report only')
+}
+catch {
+  // Any read or JSON.parse failure lands here (not only a missing file); the full evaluation then runs
+}
+
+if (DRY_RUN) {
+  consola.info('Limiting questions and models for dry run')
+}
+
+let questions = generateQuestions()
+
+// Dry run: keep only the first `maxQuestions` questions (a falsy limit disables the cap)
+if (DRY_RUN && DRY_RUN_LIMITS.maxQuestions) {
+  questions = questions.slice(0, DRY_RUN_LIMITS.maxQuestions)
+}
+
+// Dry run: restrict to the models named in `allowedModels`; an empty allow-list keeps every configured model
+const activeModels = DRY_RUN && DRY_RUN_LIMITS.allowedModels.length > 0
+  ? Object.fromEntries(
+      Object.entries(models).filter(([name]) => DRY_RUN_LIMITS.allowedModels.includes(name)),
+    )
+  : models
+
+let results: EvaluationResult[]
+let tokenCounts: Record<string, number>
+
+if (existingResults && existingTokenCounts) {
+  // Both saved artifacts loaded: reuse them and make no LLM requests
+  results = existingResults
+  tokenCounts = existingTokenCounts
+}
+else {
+  // Run full evaluation: every question × every format × every active model
+  consola.info(`Evaluating ${questions.length} questions`)
+  consola.info(`Testing ${Object.keys(formatters).length} formats`)
+  consola.info(`Using ${Object.keys(activeModels).length} models: ${Object.keys(activeModels).join(', ')}`)
+
+  // Calculate token counts for all format+dataset combinations
+  tokenCounts = calculateTokenCounts(formatters)
+
+  // Format each dataset once per format; indexed as formattedDatasets[formatName][dataset.name]
+  const formattedDatasets: Record<string, Record<string, string>> = {}
+  for (const [formatName, formatter] of Object.entries(formatters)) {
+    formattedDatasets[formatName] = {}
+    for (const dataset of datasets) {
+      const formatted = formatter(dataset.data)
+      formattedDatasets[formatName]![dataset.name] = formatted
+    }
+  }
+
+  // Build the flat task list: one task per (question, format, model) combination
+  const tasks: { question: Question, formatName: string, modelName: string }[] = []
+  for (const question of questions) {
+    for (const [formatName] of Object.entries(formatters)) {
+      for (const [modelName] of Object.entries(activeModels)) {
+        tasks.push({ question, formatName, modelName })
+      }
+    }
+  }
+
+  const total = tasks.length
+
+  consola.start(`Running ${total} evaluations with concurrency: ${DEFAULT_CONCURRENCY}`)
+
+  // Evaluate tasks concurrently, with the concurrency option set to DEFAULT_CONCURRENCY
+  results = await pMap(
+    tasks,
+    async (task, index) => {
+      const formattedData = formattedDatasets[task.formatName]![task.question.dataset]!
+      const model = activeModels[task.modelName as keyof typeof activeModels]
+
+      const result = await evaluateQuestion(
+        task.question,
+        task.formatName,
+        formattedData,
+        model,
+        task.modelName,
+      )
+
+      // Progress is keyed on the task's index, not on how many tasks have finished, so it is approximate
+      if ((index + 1) % 10 === 0) {
+        const percent = (((index + 1) / total) * 100).toFixed(1)
+        console.log(`⏳ Progress: ${index + 1}/${total} (${percent}%)`)
+      }
+
+      return result
+    },
+    { concurrency: DEFAULT_CONCURRENCY },
+  )
+
+  consola.success('Evaluation complete!')
+}
+
+// Aggregate per-format results and hand everything to saveResults (runs for fresh and reused results alike)
+const formatResults = calculateFormatResults(results, tokenCounts)
+await saveResults(results, formatResults, questions, tokenCounts)
+consola.info(`Results saved to: \`${path.relative(ROOT_DIR, resultsDir)}\``)
+// Mirror the reuse condition above: cached results without token counts still trigger a full evaluation
+consola.success(existingResults && existingTokenCounts ? 'Markdown report regenerated!' : 'Evaluation complete!')
diff --git a/benchmarks/scripts/fetch-github-data.ts b/benchmarks/scripts/fetch-github-data.ts
new file mode 100644
index 0000000..335dd77
--- /dev/null
+++ b/benchmarks/scripts/fetch-github-data.ts
@@ -0,0 +1,78 @@
+import * as fsp from 'node:fs/promises'
+import * as path from 'node:path'
+import process from 'node:process'
+import { consola } from 'consola'
+import { ofetch } from 'ofetch'
+import { BENCHMARKS_DIR } from '../src/constants'
+
+try {
+  // Two steps: list the top-100 repo names via GitHub search, then load each repo's details from ungh.cc
+  const repoList = await searchTop100Repos()
+  const repos = await fetchRepoDetails(repoList)
+
+  if (repos.length === 0) {
+    consola.error('❌ No repositories fetched. Exiting.')
+    process.exit(1)
+  }
+
+  // Sort in place by stars, descending, so the saved file is ordered most-starred first
+  repos.sort((a, b) => b.stars - a.stars)
+
+  await saveRepos(repos)
+
+  consola.success('Done!')
+}
+catch (error) { // any rejection from the steps above is logged and ends the process with exit code 1
+  consola.error(error)
+  process.exit(1)
+}
+
+async function searchTop100Repos(): Promise<string[]> {
+  consola.start('Fetching top 100 starred repositories from GitHub API…')
+  // One search page: repos with more than one star, sorted by stars descending, 100 per page
+  const query = {
+    q: 'stars:>1',
+    sort: 'stars',
+    order: 'desc',
+    per_page: 100,
+  }
+  const headers = {
+    'Accept': 'application/vnd.github+json',
+    'X-GitHub-Api-Version': '2022-11-28',
+  }
+
+  const { items } = await ofetch<{ items: { full_name: string }[] }>(
+    'https://api.github.com/search/repositories',
+    { query, headers },
+  )
+  // Only each hit's `full_name` is returned to the caller
+  return items.map(({ full_name }) => full_name)
+}
+
+async function fetchRepoDetails(repoList: string[]): Promise<Record<string, any>[]> {
+  consola.start(`Fetching ${repoList.length} GitHub repositories…`)
+  // Requests run one at a time, in list order; a failed request rejects and aborts the whole fetch
+  const repos: Record<string, any>[] = []
+
+  for (let i = 0; i < repoList.length; i++) {
+    const repoPath = repoList[i]!
+    console.log(`[${i + 1}/${repoList.length}] Fetching ${repoPath}…`)
+    const { repo } = await ofetch(`https://ungh.cc/repos/${repoPath}`) // payload is unwrapped from its `repo` field
+    repos.push(repo)
+  }
+
+  consola.success(`Successfully fetched ${repos.length}/${repoList.length} repositories`)
+
+  return repos
+}
+
+async function saveRepos(repos: Record<string, any>[]): Promise<void> {
+  // Write the repos as 2-space-indented JSON to <BENCHMARKS_DIR>/data/github-repos.json
+  const dataDir = path.join(BENCHMARKS_DIR, 'data')
+  const targetPath = path.join(dataDir, 'github-repos.json')
+
+  await fsp.mkdir(dataDir, { recursive: true })
+  await fsp.writeFile(targetPath, JSON.stringify(repos, undefined, 2))
+
+  consola.info(`Saved to \`${path.relative(BENCHMARKS_DIR, targetPath)}\``)
+}
diff --git a/benchmarks/scripts/token-efficiency-benchmark.ts b/benchmarks/scripts/token-efficiency-benchmark.ts
new file mode 100644
index 0000000..5957115
--- /dev/null
+++ b/benchmarks/scripts/token-efficiency-benchmark.ts
@@ -0,0 +1,228 @@
+import * as fsp from 'node:fs/promises'
+import * as path from 'node:path'
+import { faker } from '@faker-js/faker'
+import { consola } from 'consola'
+import { encode as encodeTokens } from 'gpt-tokenizer' // o200k_base encoding (default)
+import { encode } from '../../src/index'
+import githubRepos from '../data/github-repos.json' with { type: 'json' }
+import { BENCHMARKS_DIR, ROOT_DIR } from '../src/constants'
+
+interface BenchmarkResult { // one measured example: its data plus JSON-vs-TOON token statistics
+  name: string
+  emoji: string
+  description: string
+  data: any // the generated/loaded payload that was encoded
+  jsonTokens: number // token count of the 2-space-indented JSON string
+  toonTokens: number // token count of the TOON encoding
+  savings: number // jsonTokens - toonTokens
+  savingsPercent: string // savings relative to jsonTokens, one decimal place, without the % sign
+  showDetailed: boolean // whether the example appears in the detailed-examples section
+}
+
+const outputFilePath = path.join(BENCHMARKS_DIR, 'results', 'token-efficiency.md')
+// Examples to measure, in report order; `showDetailed` selects which ones get a JSON-vs-TOON listing
+const BENCHMARK_EXAMPLES = [
+  {
+    name: 'GitHub Repositories', // this exact name is matched later when truncating the detailed example
+    emoji: '⭐',
+    description: 'Top 100 GitHub repositories with stars, forks, and metadata',
+    getData: () => ({ repositories: githubRepos }),
+    showDetailed: true,
+  },
+  {
+    name: 'Analytics Time Series', // this exact name is matched later when truncating the detailed example
+    emoji: '📈',
+    description: '180 days of web metrics (views, clicks, conversions, revenue)',
+    getData: () => generateAnalytics(180),
+    showDetailed: true,
+  },
+  {
+    name: 'API Response',
+    emoji: '👥',
+    description: '50 user records with metadata and timestamps',
+    getData: () => generateUsers(50),
+    showDetailed: false,
+  },
+  {
+    name: 'E-commerce Order',
+    emoji: '🛒',
+    description: 'Nested order with customer and items',
+    getData: generateOrder, // passed by reference; invoked without arguments in the measurement loop
+    showDetailed: false,
+  },
+] as const
+
+// Running totals across all examples, used for the overall savings line of the report
+let totalJsonTokens = 0
+let totalToonTokens = 0
+
+const results: BenchmarkResult[] = []
+
+for (const example of BENCHMARK_EXAMPLES) {
+  const data = await example.getData() // the generators defined in this file are synchronous; await is a no-op for them
+
+  const jsonString = JSON.stringify(data, undefined, 2) // JSON baseline is measured with 2-space indentation
+  const toonString = encode(data)
+
+  const jsonTokens = encodeTokens(jsonString).length
+  const toonTokens = encodeTokens(toonString).length
+  const savings = jsonTokens - toonTokens
+  const savingsPercent = ((savings / jsonTokens) * 100).toFixed(1) // string with one decimal place
+
+  totalJsonTokens += jsonTokens
+  totalToonTokens += toonTokens
+
+  results.push({
+    name: example.name,
+    emoji: example.emoji,
+    description: example.description,
+    data,
+    jsonTokens,
+    toonTokens,
+    savings,
+    savingsPercent,
+    showDetailed: example.showDetailed,
+  })
+}
+
+const totalSavings = totalJsonTokens - totalToonTokens
+const totalSavingsPercent = ((totalSavings / totalJsonTokens) * 100).toFixed(1)
+
+// One text row per example: emoji, padded name, bar, TOON token count, JSON token count, savings percentage
+const barChartSection = results
+  .map((result) => {
+    const percentage = Number.parseFloat(result.savingsPercent)
+    const bar = generateBarChart(100 - percentage) // Invert: the filled part is TOON's share of the JSON token count
+    const jsonStr = result.jsonTokens.toLocaleString('en-US')
+    const toonStr = result.toonTokens.toLocaleString('en-US')
+    return `${result.emoji} ${result.name.padEnd(25)} ${bar}  ${toonStr.padStart(6)} tokens  (JSON: ${jsonStr.padStart(6)})  💰 ${result.savingsPercent}% saved`
+  })
+  .join('\n')
+
+// Markdown section per example flagged `showDetailed`: header, savings, then the JSON and TOON renderings
+const detailedExamples = results
+  .filter(result => result.showDetailed)
+  .map((result, i, filtered) => {
+    // Truncate large datasets for display; the token counts in the headings still refer to the full data
+    let displayData = result.data
+    if (result.name === 'GitHub Repositories') {
+      displayData = {
+        repositories: result.data.repositories.slice(0, 3).map((repo: any) => ({
+          ...repo,
+          description: repo.description == null ? repo.description : repo.description.slice(0, 80) + (repo.description.length > 80 ? '...' : ''), // a missing description stays null/undefined instead of becoming the text "undefined"
+        })),
+      }
+    }
+    else if (result.name === 'Analytics Time Series') {
+      displayData = { metrics: result.data.metrics.slice(0, 5) }
+    }
+
+    const separator = i < filtered.length - 1 ? '\n\n---' : '' // horizontal rule between sections, none after the last
+
+    return `#### ${result.emoji} ${result.name}
+
+**Configuration:** ${result.description}
+
+**Savings:** ${result.savings.toLocaleString('en-US')} tokens (${result.savingsPercent}% reduction)
+
+**JSON** (${result.jsonTokens.toLocaleString('en-US')} tokens):
+
+\`\`\`json
+${JSON.stringify(displayData, undefined, 2)}
+\`\`\`
+
+**TOON** (${result.toonTokens.toLocaleString('en-US')} tokens):
+
+\`\`\`
+${encode(displayData)}
+\`\`\`${separator}`
+  })
+  .join('\n\n')
+
+const markdown = `### Token Efficiency
+
+\`\`\`
+${barChartSection}
+\`\`\`
+
+**Total:** ${totalToonTokens.toLocaleString('en-US')} tokens (TOON) vs ${totalJsonTokens.toLocaleString('en-US')} tokens (JSON) → ${totalSavingsPercent}% savings
+
+<details>
+<summary><strong>View detailed examples</strong></summary>
+
+${detailedExamples}
+
+</details>
+`.trimStart()
+
+console.log(markdown) // echo the full report to stdout as well as writing it to disk
+
+await fsp.mkdir(path.join(BENCHMARKS_DIR, 'results'), { recursive: true }) // ensure the results folder exists
+await fsp.writeFile(outputFilePath, markdown, 'utf-8')
+
+consola.success(`Benchmark written to \`${path.relative(ROOT_DIR, outputFilePath)}\``)
+
+// Render a `maxWidth`-cell text bar: `percentage` percent as full blocks, the remainder as light-shade cells
+function generateBarChart(percentage: number, maxWidth: number = 25): string {
+  const filledCells = Math.round((percentage / 100) * maxWidth)
+  const emptyCells = maxWidth - filledCells
+  return ['█'.repeat(filledCells), '░'.repeat(emptyCells)].join('')
+}
+
+// Build `days` consecutive daily metric rows starting at 2025-01-01 (UTC); the numbers are unseeded Math.random() noise
+function generateAnalytics(days: number) {
+  return {
+    metrics: Array.from({ length: days }, (_, i) => {
+      const date = new Date(Date.UTC(2025, 0, 1)) // UTC midnight, so toISOString() below cannot shift the calendar day
+      date.setUTCDate(date.getUTCDate() + i) // day overflow rolls into the following months
+      return {
+        date: date.toISOString().split('T')[0],
+        views: Math.floor(Math.random() * 5000) + 1000,
+        clicks: Math.floor(Math.random() * 500) + 50,
+        conversions: Math.floor(Math.random() * 100) + 10,
+        revenue: Number((Math.random() * 1000 + 100).toFixed(2)),
+      }
+    }),
+  }
+}
+
+// Build a paginated-API-style payload: `count` fake user records plus `total` and `page` fields
+function generateUsers(count: number) {
+  const roles = ['admin', 'user', 'moderator']
+
+  const users = Array.from({ length: count }, (_, index) => ({
+    id: index + 1,
+    name: faker.person.fullName(),
+    email: faker.internet.email(),
+    role: faker.helpers.arrayElement(roles),
+    active: faker.datatype.boolean(),
+    createdAt: faker.date.past({ years: 2 }).toISOString(),
+    lastLogin: faker.date.recent({ days: 30 }).toISOString(),
+  }))
+
+  return { users, total: count, page: 1 }
+}
+
+// Generate one nested e-commerce order: scalar fields, a `customer` object and an `items` array of 2–5 entries
+function generateOrder() {
+  return {
+    orderId: faker.string.alphanumeric({ length: 12, casing: 'upper' }),
+    customer: {
+      id: faker.number.int({ min: 1000, max: 9999 }),
+      name: faker.person.fullName(),
+      email: faker.internet.email(),
+      phone: faker.phone.number(),
+    },
+    items: Array.from({ length: faker.number.int({ min: 2, max: 5 }) }, () => ({
+      sku: faker.string.alphanumeric({ length: 8, casing: 'upper' }),
+      name: faker.commerce.productName(),
+      quantity: faker.number.int({ min: 1, max: 5 }),
+      price: Number(faker.commerce.price({ min: 10, max: 200 })),
+    })),
+    subtotal: Number(faker.commerce.price({ min: 100, max: 500 })), // subtotal, tax and total are drawn independently, not derived from `items`
+    tax: Number(faker.commerce.price({ min: 10, max: 50 })),
+    total: Number(faker.commerce.price({ min: 110, max: 550 })),
+    status: faker.helpers.arrayElement(['pending', 'processing', 'shipped', 'delivered']),
+    createdAt: faker.date.recent({ days: 7 }).toISOString(),
+  }
+}
diff --git a/benchmarks/src/constants.ts b/benchmarks/src/constants.ts
new file mode 100644
index 0000000..e146db0
--- /dev/null
+++ b/benchmarks/src/constants.ts
@@ -0,0 +1,39 @@
+import process from 'node:process'
+import * as url from 'node:url'
+
+export const ROOT_DIR: string = url.fileURLToPath(new URL('../../', import.meta.url)) // two directories above this file
+export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.meta.url)) // one directory above this file
+
+/**
+ * Benchmark execution configuration
+ */
+
+/**
+ * Enable dry run mode for quick testing with limited AI requests
+ *
+ * @remarks
+ * Set via environment variable: `DRY_RUN=true` (only the exact string `true` enables it)
+ */
+export const DRY_RUN: boolean = process.env.DRY_RUN === 'true'
+
+/**
+ * Limits applied when DRY_RUN is enabled
+ */
+export const DRY_RUN_LIMITS = {
+  /** Maximum number of questions to evaluate (the first N generated questions are kept) */
+  maxQuestions: 10,
+  /** Maximum number of formats to test. NOTE(review): not read by accuracy-benchmark.ts — confirm it is applied anywhere */
+  maxFormats: undefined as number | undefined,
+  /** Names of the models to use in dry run; an empty list keeps all configured models */
+  allowedModels: [] as string[],
+}
+
+/**
+ * Default concurrency for parallel evaluations
+ */
+export const DEFAULT_CONCURRENCY = 20
+
+/**
+ * Delay applied after API requests to avoid rate limiting (in milliseconds)
+ */
+export const RATE_LIMIT_DELAY_MS = 100
diff --git a/benchmarks/src/datasets.ts b/benchmarks/src/datasets.ts
new file mode 100644
index 0000000..87643f2
--- /dev/null
+++ b/benchmarks/src/datasets.ts
@@ -0,0 +1,146 @@
+/**
+ * Datasets for TOON benchmarks, designed to test TOON's strengths and weaknesses:
+ *
+ * - Tabular: Uniform records (TOON optimal)
+ * - Nested: Complex structures with nested objects
+ * - Analytics: Time-series data
+ * - GitHub: Real-world repository metadata
+ */
+
+import type { Dataset } from './types'
+import { faker } from '@faker-js/faker'
+import githubRepos from '../data/github-repos.json' with { type: 'json' }
+
+// Seed for reproducibility
+faker.seed(12345)
+
+/**
+ * Tabular dataset: 100 uniform employee records (every row has the same seven flat fields)
+ *
+ * @remarks
+ * Tests TOON's tabular array format. Departments are assigned round-robin by row index; other fields come from faker.
+ */
+const departments = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance']
+const tabularDataset: Dataset = {
+  name: 'tabular',
+  description: 'Uniform employee records (TOON optimal format)',
+  data: {
+    employees: Array.from({ length: 100 }, (_, i) => {
+      const yearsExp = faker.number.int({ min: 1, max: 20 }) // drawn before the other fields; reordering faker calls would change the seeded output
+      return {
+        id: i + 1,
+        name: faker.person.fullName(),
+        email: faker.internet.email().toLowerCase(),
+        department: departments[i % departments.length]!,
+        salary: faker.number.int({ min: 45000, max: 150000 }),
+        yearsExperience: yearsExp,
+        active: faker.datatype.boolean(0.8), // 80% active
+      }
+    }),
+  },
+}
+
+/**
+ * Nested dataset: 50 e-commerce orders, each with a nested `customer` object and an `items` array of 1–4 entries
+ *
+ * @remarks
+ * Tests TOON's handling of complex nested objects. Item names and order statuses cycle through the lists below by index.
+ */
+const productNames = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp']
+const statuses = ['pending', 'processing', 'shipped', 'delivered', 'cancelled']
+
+const nestedDataset: Dataset = {
+  name: 'nested',
+  description: 'E-commerce orders with nested structures',
+  data: {
+    orders: Array.from({ length: 50 }, (_, i) => {
+      const customerId = (i % 20) + 1 // ids cycle 1–20, so each id recurs across the 50 orders
+      const itemCount = faker.number.int({ min: 1, max: 4 })
+
+      const items = Array.from({ length: itemCount }, (_, j) => {
+        const price = faker.number.float({ min: 9.99, max: 199.99, fractionDigits: 2 })
+        const quantity = faker.number.int({ min: 1, max: 5 })
+        return {
+          sku: `SKU-${faker.string.alphanumeric({ length: 6 }).toUpperCase()}`,
+          name: productNames[j % productNames.length]!,
+          quantity,
+          price,
+        }
+      })
+
+      const total = Number(items.reduce((sum, item) => sum + (item.price * item.quantity), 0).toFixed(2)) // sum of price × quantity, rounded to 2 decimals
+
+      return {
+        orderId: `ORD-${String(i + 1).padStart(4, '0')}`,
+        customer: {
+          id: customerId,
+          name: faker.person.fullName(), // generated per order, so a recurring customer id gets a fresh name and email each time
+          email: faker.internet.email().toLowerCase(),
+        },
+        items,
+        total,
+        status: statuses[i % statuses.length]!,
+        orderDate: faker.date.recent({ days: 90 }).toISOString().split('T')[0], // NOTE(review): presumably relative to the run date unless a reference date is fixed — confirm
+      }
+    }),
+  },
+}
+
+/**
+ * Analytics dataset: 60 consecutive days of time-series metrics starting at 2025-01-01 (UTC)
+ *
+ * @remarks
+ * Tests TOON's handling of numeric data and date fields. All date arithmetic is done in UTC to match `toISOString()`.
+ */
+const analyticsDataset: Dataset = {
+  name: 'analytics',
+  description: 'Time-series analytics data',
+  data: {
+    metrics: Array.from({ length: 60 }, (_, i) => {
+      const date = new Date('2025-01-01') // date-only ISO strings are parsed as UTC midnight
+      date.setUTCDate(date.getUTCDate() + i)
+
+      // Simulate realistic web traffic with some variation; the weekday is read in UTC so it matches the emitted date
+      const baseViews = 5000
+      const weekendMultiplier = date.getUTCDay() === 0 || date.getUTCDay() === 6 ? 0.7 : 1.0
+      const views = Math.round(baseViews * weekendMultiplier + faker.number.int({ min: -1000, max: 3000 }))
+      const clicks = Math.round(views * faker.number.float({ min: 0.02, max: 0.08 }))
+      const conversions = Math.round(clicks * faker.number.float({ min: 0.05, max: 0.15 }))
+      const avgOrderValue = faker.number.float({ min: 49.99, max: 299.99 })
+      const revenue = Number((conversions * avgOrderValue).toFixed(2))
+
+      return {
+        date: date.toISOString().split('T')[0]!,
+        views,
+        clicks,
+        conversions,
+        revenue,
+        bounceRate: faker.number.float({ min: 0.3, max: 0.7, fractionDigits: 2 }),
+      }
+    }),
+  },
+}
+
+/**
+ * GitHub dataset: Popular repositories, loaded from `../data/github-repos.json`
+ *
+ * @remarks
+ * Tests TOON's tabular format with real-world data
+ */
+const githubDataset: Dataset = {
+  name: 'github',
+  description: 'Popular GitHub repositories',
+  data: {
+    repositories: githubRepos.slice(0, 200), // at most the first 200 entries of the JSON file
+  },
+}
+
+/**
+ * All datasets used in the benchmark
+ */
+export const datasets: Dataset[] = [
+  tabularDataset,
+  nestedDataset,
+  analyticsDataset,
+  githubDataset,
+]
diff --git a/benchmarks/src/evaluate.ts b/benchmarks/src/evaluate.ts
new file mode 100644
index 0000000..ec1c3ec
--- /dev/null
+++ b/benchmarks/src/evaluate.ts
@@ -0,0 +1,133 @@
+/**
+ * LLM evaluation logic for TOON benchmarks
+ *
+ * Handles:
+ * - Model configuration
+ * - Question evaluation with LLMs
+ * - Answer validation using LLM-as-judge
+ */
+
+import type { LanguageModelV2 } from '@ai-sdk/provider'
+import type { EvaluationResult, Question } from './types'
+import { setTimeout } from 'node:timers/promises'
+import { anthropic } from '@ai-sdk/anthropic'
+import { openai } from '@ai-sdk/openai'
+import { generateText } from 'ai'
+import { consola } from 'consola'
+import { RATE_LIMIT_DELAY_MS } from './constants'
+
+/**
+ * Models used for evaluation, keyed by the name the report matches against `EvaluationResult.model`
+ */
+export const models: Record<string, LanguageModelV2> = {
+  'gpt-4o-mini': openai('gpt-4o-mini'), // also serves as the judge in `validateAnswer`
+  'claude-haiku-4-5': anthropic('claude-haiku-4-5-20251001'),
+}
+
+/**
+ * Validate an answer using LLM-as-judge approach (more robust than string matching for LLM outputs)
+ *
+ * @returns whether the judge's reply starts with "YES"; if the judge call fails, the result of a trimmed, case-insensitive string comparison
+ */
+export async function validateAnswer(
+  actual: string,
+  expected: string,
+  question: string,
+): Promise<boolean> {
+  const prompt = `You are validating answers to questions about structured data.
+
+Question: ${question}
+Expected answer: ${expected}
+Actual answer: ${actual}
+
+Is the actual answer correct? Consider:
+- Exact matches are correct
+- Semantically equivalent answers are correct (e.g., "50000" vs "$50,000" vs "50000 dollars")
+- Minor formatting differences are acceptable
+- Case-insensitive comparison for text
+
+Respond with only "YES" or "NO".`
+
+  try {
+    const { text } = await generateText({
+      model: models['gpt-4o-mini']!,
+      prompt,
+      temperature: 0,
+      maxOutputTokens: 16,
+    })
+    await setTimeout(RATE_LIMIT_DELAY_MS)
+    // Accept "YES", "Yes." or "YES, ..." so trailing punctuation cannot turn a correct answer into a miss
+    return /^YES\b/i.test(text.trim())
+  }
+  catch (error) {
+    consola.error('Validation error:', error)
+    // Fallback to simple string comparison
+    return actual.toLowerCase().trim() === expected.toLowerCase().trim()
+  }
+}
+
+/**
+ * Evaluate a single question with a specific format and model
+ *
+ * @returns the scored result (`latencyMs` excludes the rate-limit pause and the judge call); a failed model call is logged and returned as incorrect with zero tokens
+ */
+export async function evaluateQuestion(
+  question: Question,
+  formatName: string,
+  formattedData: string,
+  model: any,
+  modelName: string,
+): Promise<EvaluationResult> {
+  const prompt = `Given the following data in ${formatName} format:
+
+\`\`\`
+${formattedData}
+\`\`\`
+
+Question: ${question.prompt}
+
+Provide only the direct answer, without any additional explanation or formatting.`
+
+  const startTime = Date.now()
+
+  try {
+    const { text, usage } = await generateText({
+      model,
+      prompt,
+      temperature: 0,
+      maxOutputTokens: 50,
+    })
+    // Stop the clock before pausing: the rate-limit delay is not model latency
+    const latencyMs = Date.now() - startTime
+    await setTimeout(RATE_LIMIT_DELAY_MS)
+    const correct = await validateAnswer(text.trim(), question.groundTruth, question.prompt)
+
+    return {
+      questionId: question.id,
+      format: formatName,
+      model: modelName,
+      expected: question.groundTruth,
+      actual: text.trim(),
+      correct,
+      inputTokens: usage.inputTokens ?? 0,
+      outputTokens: usage.outputTokens ?? 0,
+      latencyMs,
+    }
+  }
+  catch (error) {
+    consola.error(`Error evaluating ${question.id} with ${formatName}/${modelName}:`, error)
+    const latencyMs = Date.now() - startTime
+    await setTimeout(RATE_LIMIT_DELAY_MS)
+    return {
+      questionId: question.id,
+      format: formatName,
+      model: modelName,
+      expected: question.groundTruth,
+      actual: '',
+      correct: false,
+      inputTokens: 0,
+      outputTokens: 0,
+      latencyMs,
+    }
+  }
+}
diff --git a/benchmarks/src/formatters.ts b/benchmarks/src/formatters.ts
new file mode 100644
index 0000000..e1081e3
--- /dev/null
+++ b/benchmarks/src/formatters.ts
@@ -0,0 +1,90 @@
+/**
+ * Format converters for TOON benchmarks
+ *
+ * Converts data to different formats:
+ * - JSON
+ * - TOON
+ * - CSV
+ * - Markdown key-value
+ * - YAML
+ */
+
+import { stringify as stringifyCSV } from 'csv-stringify/sync'
+import { stringify as stringifyYAML } from 'yaml'
+import { encode as encodeToon } from '../../src/index'
+
+export const formatters = { // format name -> function serializing a dataset payload to text
+  'json': (data: unknown): string => JSON.stringify(data, undefined, 2), // pretty-printed, 2-space indent
+  'toon': (data: unknown): string => encodeToon(data),
+  'csv': (data: unknown): string => toCSV(data), // only array data is emitted, see `toCSV`
+  'markdown-kv': (data: unknown): string => toMarkdownKV(data),
+  'yaml': (data: unknown): string => stringifyYAML(data),
+}
+
+/**
+ * Render data as CSV text
+ *
+ * A root-level array becomes one table; a top-level object becomes one `# key` titled table per non-empty array property. Anything else yields ''
+ */
+function toCSV(data: unknown): string {
+  // Root-level array
+  if (Array.isArray(data))
+    return data.length > 0 ? stringifyCSV(data, { header: true }).trim() : ''
+
+  // Primitives and null have no tabular form
+  if (typeof data !== 'object' || data === null)
+    return ''
+
+  // Handle top-level object with arrays
+  const sections = Object.entries(data)
+    .filter(([, value]) => Array.isArray(value) && value.length > 0)
+    .flatMap(([key, value]) => [`# ${key}`, stringifyCSV(value, { header: true })])
+
+  return sections.join('\n').trim()
+}
+
+/**
+ * Render data as indented Markdown `**key**: value` lines
+ *
+ * Objects and arrays nested under a key get a `**key**:` heading followed by
+ * their own rendering one level deeper (two spaces per level). Array items
+ * that are not plain objects become `- item` bullets; an object item is
+ * followed by an empty line unless it is the last item.
+ */
+function toMarkdownKV(data: unknown, indent = 0): string {
+  const pad = '  '.repeat(indent)
+  const isNested = (value: unknown): boolean => typeof value === 'object' && value !== null
+
+  // Key/value lines of one object; nested values recurse one level deeper
+  const renderEntries = (record: object): string[] =>
+    Object.entries(record).flatMap(([key, value]) =>
+      isNested(value)
+        ? [`${pad}**${key}**:`, toMarkdownKV(value, indent + 1)]
+        : [`${pad}**${key}**: ${value}`],
+    )
+
+  // Scalars render as themselves
+  if (!isNested(data))
+    return `${pad}${data}`
+
+  // A lone object is just its entries
+  if (!Array.isArray(data))
+    return renderEntries(data as object).join('\n')
+
+  // Arrays: bullets for scalars and nested arrays, entry lists for objects
+  const out: string[] = []
+  data.forEach((item, index) => {
+    const isPlainObject = isNested(item) && !Array.isArray(item)
+    if (!isPlainObject) {
+      out.push(`${pad}- ${item}`)
+      return
+    }
+
+    out.push(...renderEntries(item))
+    // Blank separator between items, but not after the last one
+    if (index < data.length - 1)
+      out.push('')
+  })
+
+  return out.join('\n')
+}
diff --git a/benchmarks/src/questions.ts b/benchmarks/src/questions.ts
new file mode 100644
index 0000000..e211dce
--- /dev/null
+++ b/benchmarks/src/questions.ts
@@ -0,0 +1,398 @@
+/* eslint-disable no-console */
+
+/**
+ * Question generation for TOON benchmarks
+ *
+ * Generates roughly 160 questions across different types:
+ * - Field retrieval (roughly 70%): "What is X's Y?"
+ * - Aggregation (roughly 15%): "How many X have Y?"
+ * - Filtering (roughly 15%): "List/count X where Y"
+ *
+ * Questions are generated dynamically based on actual data values
+ */
+
+import type { Question } from './types'
+import { datasets } from './datasets'
+
+/**
+ * Generate all questions from datasets; ids run `q1`, `q2`, … in generation order, and a per-dataset breakdown is logged
+ */
+export function generateQuestions(): Question[] {
+  const questions: Question[] = []
+  let idCounter = 1
+
+  // Get datasets
+  const tabular = datasets.find(d => d.name === 'tabular')?.data.employees as any[] || []
+  const nested = datasets.find(d => d.name === 'nested')?.data.orders as any[] || []
+  const analytics = datasets.find(d => d.name === 'analytics')?.data.metrics as any[] || []
+  const github = datasets.find(d => d.name === 'github')?.data.repositories as any[] || []
+
+  // ========================================
+  // TABULAR DATASET QUESTIONS (up to 58 questions)
+  // ========================================
+
+  if (tabular.length > 0) {
+    // Field retrieval: specific employees (up to 40 questions)
+    for (let i = 0; i < Math.min(40, tabular.length); i++) {
+      const emp = tabular[i * 2] || tabular[i] // every 2nd row; row i once i * 2 runs past the end
+      if (!emp)
+        continue
+
+      // Alternate between different field types
+      if (i % 3 === 0) {
+        questions.push({
+          id: `q${idCounter++}`,
+          prompt: `What is the salary of ${emp.name}?`,
+          groundTruth: String(emp.salary),
+          type: 'field-retrieval',
+          dataset: 'tabular',
+        })
+      }
+      else if (i % 3 === 1) {
+        questions.push({
+          id: `q${idCounter++}`,
+          prompt: `What department does ${emp.name} work in?`,
+          groundTruth: emp.department,
+          type: 'field-retrieval',
+          dataset: 'tabular',
+        })
+      }
+      else {
+        questions.push({
+          id: `q${idCounter++}`,
+          prompt: `What is the email address of ${emp.name}?`,
+          groundTruth: emp.email,
+          type: 'field-retrieval',
+          dataset: 'tabular',
+        })
+      }
+    }
+
+    // Aggregation: count by department (first 6 distinct departments)
+    const departments = [...new Set(tabular.map((e: any) => e.department))]
+    for (const dept of departments.slice(0, 6)) {
+      const count = tabular.filter((e: any) => e.department === dept).length
+      questions.push({
+        id: `q${idCounter++}`,
+        prompt: `How many employees work in ${dept}?`,
+        groundTruth: String(count),
+        type: 'aggregation',
+        dataset: 'tabular',
+      })
+    }
+
+    // Aggregation: salary ranges (4 questions)
+    const salaryThresholds = [60000, 80000, 100000, 120000]
+    for (const threshold of salaryThresholds) {
+      const count = tabular.filter((e: any) => e.salary > threshold).length
+      questions.push({
+        id: `q${idCounter++}`,
+        prompt: `How many employees have a salary greater than ${threshold}?`,
+        groundTruth: String(count),
+        type: 'aggregation',
+        dataset: 'tabular',
+      })
+    }
+
+    // Filtering: active status (2 questions)
+    const activeCount = tabular.filter((e: any) => e.active).length
+    const inactiveCount = tabular.filter((e: any) => !e.active).length
+    questions.push(
+      {
+        id: `q${idCounter++}`,
+        prompt: 'How many employees are active?',
+        groundTruth: String(activeCount),
+        type: 'filtering',
+        dataset: 'tabular',
+      },
+      {
+        id: `q${idCounter++}`,
+        prompt: 'How many employees are inactive?',
+        groundTruth: String(inactiveCount),
+        type: 'filtering',
+        dataset: 'tabular',
+      },
+    )
+
+    // Complex filtering: multi-condition (up to 6 questions: first 4 departments + 2 experience levels)
+    for (const dept of departments.slice(0, 4)) {
+      const count = tabular.filter((e: any) => e.department === dept && e.salary > 80000).length
+      questions.push({
+        id: `q${idCounter++}`,
+        prompt: `How many employees in ${dept} have a salary greater than 80000?`,
+        groundTruth: String(count),
+        type: 'filtering',
+        dataset: 'tabular',
+      })
+    }
+
+    for (const exp of [5, 10]) {
+      const count = tabular.filter((e: any) => e.yearsExperience > exp && e.active).length
+      questions.push({
+        id: `q${idCounter++}`,
+        prompt: `How many active employees have more than ${exp} years of experience?`,
+        groundTruth: String(count),
+        type: 'filtering',
+        dataset: 'tabular',
+      })
+    }
+  }
+
+  // ========================================
+  // NESTED DATASET QUESTIONS (up to 39, plus one per distinct order status)
+  // ========================================
+
+  if (nested.length > 0) {
+    // Field retrieval: order totals and statuses, alternating (up to 20 questions)
+    for (let i = 0; i < Math.min(20, nested.length); i++) {
+      const order = nested[i * 2] || nested[i] // every 2nd order; order i once i * 2 runs past the end
+      if (!order)
+        continue
+
+      if (i % 2 === 0) {
+        questions.push({
+          id: `q${idCounter++}`,
+          prompt: `What is the total amount for order ${order.orderId}?`,
+          groundTruth: String(order.total),
+          type: 'field-retrieval',
+          dataset: 'nested',
+        })
+      }
+      else {
+        questions.push({
+          id: `q${idCounter++}`,
+          prompt: `What is the status of order ${order.orderId}?`,
+          groundTruth: order.status,
+          type: 'field-retrieval',
+          dataset: 'nested',
+        })
+      }
+    }
+
+    // Field retrieval: customer info (up to 15 questions)
+    for (let i = 0; i < Math.min(15, nested.length); i++) {
+      const order = nested[i * 3] || nested[i] // every 3rd order; order i once i * 3 runs past the end
+      if (!order)
+        continue
+
+      questions.push({
+        id: `q${idCounter++}`,
+        prompt: `What is the customer name for order ${order.orderId}?`,
+        groundTruth: order.customer.name,
+        type: 'field-retrieval',
+        dataset: 'nested',
+      })
+    }
+
+    // Filtering: count by status (one question per distinct status; recorded as type 'filtering')
+    const statuses = [...new Set(nested.map((o: any) => o.status))]
+    for (const status of statuses) {
+      const count = nested.filter((o: any) => o.status === status).length
+      questions.push({
+        id: `q${idCounter++}`,
+        prompt: `How many orders have status "${status}"?`,
+        groundTruth: String(count),
+        type: 'filtering',
+        dataset: 'nested',
+      })
+    }
+
+    // Aggregation: total revenue (1 question, rounded to 2 decimals)
+    const totalRevenue = nested.reduce((sum: number, o: any) => sum + o.total, 0)
+    questions.push({
+      id: `q${idCounter++}`,
+      prompt: 'What is the total revenue across all orders?',
+      groundTruth: String(totalRevenue.toFixed(2)),
+      type: 'aggregation',
+      dataset: 'nested',
+    })
+
+    // Filtering: high-value orders (3 questions)
+    const highValueThresholds = [200, 400, 600]
+    for (const threshold of highValueThresholds) {
+      const count = nested.filter((o: any) => o.total > threshold).length
+      questions.push({
+        id: `q${idCounter++}`,
+        prompt: `How many orders have a total greater than ${threshold}?`,
+        groundTruth: String(count),
+        type: 'filtering',
+        dataset: 'nested',
+      })
+    }
+  }
+
+  // ========================================
+  // ANALYTICS DATASET QUESTIONS (up to 29 questions)
+  // ========================================
+
+  if (analytics.length > 0) {
+    // Field retrieval: specific dates, alternating views and revenue (up to 20 questions)
+    for (let i = 0; i < Math.min(20, analytics.length); i++) {
+      const metric = analytics[i * 3] || analytics[i] // every 3rd day; day i once i * 3 runs past the end
+      if (!metric)
+        continue
+
+      if (i % 2 === 0) {
+        questions.push({
+          id: `q${idCounter++}`,
+          prompt: `How many views were recorded on ${metric.date}?`,
+          groundTruth: String(metric.views),
+          type: 'field-retrieval',
+          dataset: 'analytics',
+        })
+      }
+      else {
+        questions.push({
+          id: `q${idCounter++}`,
+          prompt: `What was the revenue on ${metric.date}?`,
+          groundTruth: String(metric.revenue),
+          type: 'field-retrieval',
+          dataset: 'analytics',
+        })
+      }
+    }
+
+    // Aggregation: totals (3 questions: views, revenue, conversions)
+    const totalViews = analytics.reduce((sum: number, m: any) => sum + m.views, 0)
+    const totalRevenue = analytics.reduce((sum: number, m: any) => sum + m.revenue, 0)
+    const totalConversions = analytics.reduce((sum: number, m: any) => sum + m.conversions, 0)
+
+    questions.push(
+      {
+        id: `q${idCounter++}`,
+        prompt: 'What is the total number of views across all dates?',
+        groundTruth: String(totalViews),
+        type: 'aggregation',
+        dataset: 'analytics',
+      },
+      {
+        id: `q${idCounter++}`,
+        prompt: 'What is the total revenue across all dates?',
+        groundTruth: String(totalRevenue.toFixed(2)),
+        type: 'aggregation',
+        dataset: 'analytics',
+      },
+      {
+        id: `q${idCounter++}`,
+        prompt: 'What is the total number of conversions across all dates?',
+        groundTruth: String(totalConversions),
+        type: 'aggregation',
+        dataset: 'analytics',
+      },
+    )
+
+    // Filtering: high-performing days (6 questions: 3 view thresholds + 3 conversion thresholds)
+    const viewThresholds = [5000, 6000, 7000]
+    for (const threshold of viewThresholds) {
+      const count = analytics.filter((m: any) => m.views > threshold).length
+      questions.push({
+        id: `q${idCounter++}`,
+        prompt: `How many days had more than ${threshold} views?`,
+        groundTruth: String(count),
+        type: 'filtering',
+        dataset: 'analytics',
+      })
+    }
+
+    const conversionThresholds = [10, 20, 30]
+    for (const threshold of conversionThresholds) {
+      const count = analytics.filter((m: any) => m.conversions > threshold).length
+      questions.push({
+        id: `q${idCounter++}`,
+        prompt: `How many days had more than ${threshold} conversions?`,
+        groundTruth: String(count),
+        type: 'filtering',
+        dataset: 'analytics',
+      })
+    }
+  }
+
+  // ========================================
+  // GITHUB DATASET QUESTIONS (up to 32 questions)
+  // ========================================
+
+  if (github.length > 0) {
+    // Field retrieval: specific repos, alternating stars and forks (up to 20 questions)
+    for (let i = 0; i < Math.min(20, github.length); i++) {
+      const repo = github[i * 10] || github[i] // every 10th repo; repo i once i * 10 runs past the end
+      if (!repo)
+        continue
+
+      if (i % 2 === 0) {
+        questions.push({
+          id: `q${idCounter++}`,
+          prompt: `How many stars does ${repo.owner}/${repo.name} have?`,
+          groundTruth: String(repo.stars),
+          type: 'field-retrieval',
+          dataset: 'github',
+        })
+      }
+      else {
+        questions.push({
+          id: `q${idCounter++}`,
+          prompt: `How many forks does ${repo.owner}/${repo.name} have?`,
+          groundTruth: String(repo.forks),
+          type: 'field-retrieval',
+          dataset: 'github',
+        })
+      }
+    }
+
+    // Aggregation: count by owner (first 5 distinct owners)
+    const owners = [...new Set(github.map((r: any) => r.owner))]
+    for (const owner of owners.slice(0, 5)) {
+      const count = github.filter((r: any) => r.owner === owner).length
+      questions.push({
+        id: `q${idCounter++}`,
+        prompt: `How many repositories does ${owner} have in the dataset?`,
+        groundTruth: String(count),
+        type: 'aggregation',
+        dataset: 'github',
+      })
+    }
+
+    // Aggregation: total stars (1 question)
+    const totalStars = github.reduce((sum: number, r: any) => sum + r.stars, 0)
+    questions.push({
+      id: `q${idCounter++}`,
+      prompt: 'What is the total number of stars across all repositories?',
+      groundTruth: String(totalStars),
+      type: 'aggregation',
+      dataset: 'github',
+    })
+
+    // Filtering: popular repos (6 questions: 3 star thresholds + 3 fork thresholds)
+    const starThresholds = [10000, 50000, 100000]
+    for (const threshold of starThresholds) {
+      const count = github.filter((r: any) => r.stars > threshold).length
+      questions.push({
+        id: `q${idCounter++}`,
+        prompt: `How many repositories have more than ${threshold} stars?`,
+        groundTruth: String(count),
+        type: 'filtering',
+        dataset: 'github',
+      })
+    }
+
+    const forkThresholds = [1000, 5000, 10000]
+    for (const threshold of forkThresholds) {
+      const count = github.filter((r: any) => r.forks > threshold).length
+      questions.push({
+        id: `q${idCounter++}`,
+        prompt: `How many repositories have more than ${threshold} forks?`,
+        groundTruth: String(count),
+        type: 'filtering',
+        dataset: 'github',
+      })
+    }
+  }
+
+  console.log(`📊 Question breakdown:`)
+  console.log(`   Tabular: ${questions.filter(q => q.dataset === 'tabular').length}`)
+  console.log(`   Nested: ${questions.filter(q => q.dataset === 'nested').length}`)
+  console.log(`   Analytics: ${questions.filter(q => q.dataset === 'analytics').length}`)
+  console.log(`   GitHub: ${questions.filter(q => q.dataset === 'github').length}`)
+  console.log(`   Total: ${questions.length}`)
+
+  return questions
+}
diff --git a/benchmarks/src/report.ts b/benchmarks/src/report.ts
new file mode 100644
index 0000000..2638622
--- /dev/null
+++ b/benchmarks/src/report.ts
@@ -0,0 +1,288 @@
+/**
+ * Report generation for TOON benchmarks
+ *
+ * Handles:
+ * - Statistical analysis
+ * - Twitter-ready markdown report generation with visual elements
+ * - Per-dataset and per-model breakdowns
+ * - Token counts per format and dataset
+ * - Result file saving
+ */
+
+import type { EvaluationResult, FormatResult, Question } from './types'
+import * as fsp from 'node:fs/promises'
+import * as path from 'node:path'
+import { encode } from 'gpt-tokenizer'
+import { BENCHMARKS_DIR } from './constants'
+import { datasets } from './datasets'
+import { models } from './evaluate'
+
+/**
+ * Summarise evaluation results per format, sorted by accuracy (best first)
+ */
+export function calculateFormatResults(
+  results: EvaluationResult[],
+  tokenCounts: Record<string, number>,
+): FormatResult[] {
+  // Bucket results by format; Map iteration keeps first-seen order
+  const byFormat = new Map<string, EvaluationResult[]>()
+  for (const result of results) {
+    if (!byFormat.has(result.format))
+      byFormat.set(result.format, [])
+    byFormat.get(result.format)!.push(result)
+  }
+
+  return [...byFormat].map(([format, rows]): FormatResult => {
+    const totalCount = rows.length
+    const correctCount = rows.filter(row => row.correct).length
+    const sumOf = (pick: (row: EvaluationResult) => number): number => rows.reduce((total, row) => total + pick(row), 0)
+    // Token counts are keyed `${format}-${dataset}`: sum this format's entries, then average over all datasets
+    const tokenSum = Object.keys(tokenCounts)
+      .filter(key => key.startsWith(`${format}-`))
+      .reduce((total, key) => total + tokenCounts[key]!, 0)
+    return {
+      format,
+      accuracy: correctCount / totalCount,
+      totalTokens: Math.round(tokenSum / datasets.length),
+      avgInputTokens: Math.round(sumOf(row => row.inputTokens) / totalCount),
+      avgLatency: Math.round(sumOf(row => row.latencyMs) / totalCount),
+      correctCount,
+      totalCount,
+    }
+  }).sort((a, b) => b.accuracy - a.accuracy)
+}
+
+/**
+ * Generate embeddable markdown report from results
+ *
+ * Sections: best accuracy per model, TOON vs JSON summary, per-format table, then a collapsible per-dataset / per-model breakdown and methodology notes
+ */
+export function generateMarkdownReport(
+  formatResults: FormatResult[],
+  results: EvaluationResult[],
+  questions: Question[],
+  tokenCounts: Record<string, number>,
+): string {
+  const lines: string[] = [
+    '### Retrieval Accuracy',
+    '',
+  ]
+
+  const toon = formatResults.find(r => r.format === 'toon')
+  const json = formatResults.find(r => r.format === 'json')
+
+  // Model-by-model breakdown (most interesting result)
+  const modelCount = Object.keys(models).length
+  lines.push(`Tested across **${modelCount} ${modelCount === 1 ? 'LLM' : 'LLMs'}** with data retrieval tasks:`, '', '```')
+
+  for (const modelName of Object.keys(models)) {
+    const modelResults = formatResults.map((fr) => {
+      const modelFormatResults = results.filter(r => r.model === modelName && r.format === fr.format)
+      const correctCount = modelFormatResults.filter(r => r.correct).length
+      const totalCount = modelFormatResults.length
+      const accuracy = totalCount > 0 ? correctCount / totalCount : 0
+
+      return {
+        format: fr.format,
+        accuracy,
+        correctCount,
+        totalCount,
+      }
+    }).sort((a, b) => b.accuracy - a.accuracy)
+
+    const bestResult = modelResults[0]!
+    const bar = createTokenBar(bestResult.accuracy, 1, 20)
+
+    lines.push(`${modelName.padEnd(20)} ${bar} ${(bestResult.accuracy * 100).toFixed(1)}% accuracy`)
+  }
+
+  lines.push('```', '')
+
+  // Summary comparison
+  if (toon && json) {
+    const tokenSavings = ((1 - toon.totalTokens / json.totalTokens) * 100).toFixed(1)
+    lines.push(
+      `**TOON achieves ${(toon.accuracy * 100).toFixed(1)}% accuracy (vs JSON's ${(json.accuracy * 100).toFixed(1)}%) while using ${tokenSavings}% fewer tokens.**`,
+      '',
+    )
+  }
+
+  // Simple format comparison table
+  lines.push(
+    '| Format | Accuracy | Average Tokens |',
+    '| ------ | -------- | -------------- |',
+  )
+
+  for (const result of formatResults) {
+    lines.push(
+      `| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.totalTokens.toLocaleString()} |`,
+    )
+  }
+
+  lines.push('', '<details>', '<summary><strong>View detailed breakdown by dataset and model</strong></summary>', '', '#### Performance by Dataset', '')
+
+  for (const dataset of datasets) {
+    lines.push(`##### ${dataset.description}`, '')
+    // Resolve this dataset's results once instead of per format; the Set replaces a `questions.find` per result (question ids are unique)
+    const datasetQuestionIds = new Set(questions.filter(q => q.dataset === dataset.name).map(q => q.id))
+    const datasetFormatResults = results.filter(r => r.questionId.includes(dataset.name) || datasetQuestionIds.has(r.questionId))
+    const datasetResults = formatResults.map((fr) => {
+      const formatDatasetResults = datasetFormatResults.filter(r => r.format === fr.format)
+      if (formatDatasetResults.length === 0)
+        return undefined
+
+      const correctCount = formatDatasetResults.filter(r => r.correct).length
+      const totalCount = formatDatasetResults.length
+      const accuracy = totalCount > 0 ? correctCount / totalCount : 0
+
+      // Get token count for this dataset+format
+      const tokenKey = `${fr.format}-${dataset.name}`
+      const tokens = tokenCounts[tokenKey] || fr.totalTokens
+
+      return {
+        format: fr.format,
+        accuracy,
+        tokens,
+        correctCount,
+        totalCount,
+      }
+    }).filter(Boolean) as { format: string, accuracy: number, tokens: number, correctCount: number, totalCount: number }[]
+
+    if (datasetResults.length === 0)
+      continue
+
+    // Sort by efficiency
+    datasetResults.sort((a, b) => {
+      const effA = (a.accuracy ** 2) / (a.tokens / 1000)
+      const effB = (b.accuracy ** 2) / (b.tokens / 1000)
+      return effB - effA
+    })
+
+    lines.push(
+      '| Format | Accuracy | Tokens | Correct/Total |',
+      '|--------|----------|--------|---------------|',
+    )
+
+    for (const result of datasetResults.slice(0, 6)) {
+      lines.push(
+        `| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.tokens.toLocaleString()} | ${result.correctCount}/${result.totalCount} |`,
+      )
+    }
+
+    lines.push('')
+  }
+
+  // Model breakdown
+  lines.push('', '#### Performance by Model', '')
+
+  for (const modelName of Object.keys(models)) {
+    lines.push(`##### ${modelName}`, '')
+
+    const modelResults = formatResults.map((fr) => {
+      const modelFormatResults = results.filter(r => r.model === modelName && r.format === fr.format)
+      const correctCount = modelFormatResults.filter(r => r.correct).length
+      const totalCount = modelFormatResults.length
+      const accuracy = totalCount > 0 ? correctCount / totalCount : 0
+
+      return {
+        format: fr.format,
+        accuracy,
+        correctCount,
+        totalCount,
+      }
+    }).sort((a, b) => b.accuracy - a.accuracy)
+
+    lines.push('| Format | Accuracy | Correct/Total |', '|--------|----------|---------------|')
+
+    for (const result of modelResults) {
+      lines.push(`| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.correctCount}/${result.totalCount} |`)
+    }
+
+    lines.push('')
+  }
+
+  // Methodology
+  lines.push(
+    '',
+    '#### Methodology',
+    '',
+    '- **Semantic validation**: LLM-as-judge validates responses semantically (not exact string matching).',
+    '- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding.',
+    '- **Question types**: Field retrieval, aggregation, and filtering tasks.',
+    '- **Real data**: Faker.js-generated datasets + GitHub repositories.',
+    '',
+    '</details>',
+    '',
+  )
+
+  return lines.join('\n')
+}
+
+/**
+ * Count the tokens of every dataset as rendered by every formatter
+ *
+ * @returns token counts keyed by `${formatName}-${datasetName}`
+ */
+export function calculateTokenCounts(
+  formatters: Record<string, (data: any) => string>,
+): Record<string, number> {
+  const entries = Object.entries(formatters).flatMap(([formatName, format]) =>
+    datasets.map((dataset): [string, number] => {
+      const tokenCount = encode(format(dataset.data)).length
+      return [`${formatName}-${dataset.name}`, tokenCount]
+    }),
+  )
+
+  // One flat record: format-major, dataset-minor insertion order
+  return Object.fromEntries(entries)
+}
+
+/**
+ * Save results to disk
+ *
+ * Writes `raw-results.json`, `summary.json` and `report.md` (in that order)
+ * into `results/accuracy` below `BENCHMARKS_DIR`, creating the directory if needed
+ *
+ * @returns the directory the files were written to
+ */
+export async function saveResults(
+  results: EvaluationResult[],
+  formatResults: FormatResult[],
+  questions: Question[],
+  tokenCounts: Record<string, number>,
+): Promise<string> {
+  const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy')
+  const writeResultFile = (fileName: string, content: string): Promise<void> =>
+    fsp.writeFile(path.join(resultsDir, fileName), content)
+
+  await fsp.mkdir(resultsDir, { recursive: true })
+
+  // Raw per-question results
+  await writeResultFile('raw-results.json', JSON.stringify(results, undefined, 2))
+
+  // Aggregated summary, stamped with the time of writing
+  const summary = {
+    formatResults,
+    questions: questions.length,
+    models: Object.keys(models),
+    datasets: datasets.map(d => ({ name: d.name, description: d.description })),
+    tokenCounts,
+    timestamp: new Date().toISOString(),
+  }
+  await writeResultFile('summary.json', JSON.stringify(summary, undefined, 2))
+
+  // Markdown report
+  const report = generateMarkdownReport(formatResults, results, questions, tokenCounts)
+  await writeResultFile('report.md', report)
+
+  return resultsDir
+}
+
+/**
+ * Generate visual bar chart for token counts: `width` cells, filled in proportion to `tokens / maxTokens` (clamped to 0–1)
+ */
+function createTokenBar(tokens: number, maxTokens: number, width = 30): string {
+  const share = maxTokens > 0 ? Math.min(Math.max(tokens / maxTokens, 0), 1) : 0
+  const filled = Math.round((share || 0) * width) // `|| 0`: a NaN share renders as an empty bar
+  return '█'.repeat(filled) + '░'.repeat(width - filled)
+}
diff --git a/benchmarks/src/types.ts b/benchmarks/src/types.ts
new file mode 100644
index 0000000..bca48fa
--- /dev/null
+++ b/benchmarks/src/types.ts
@@ -0,0 +1,35 @@
+export interface Dataset {
+  name: string // lookup key such as 'tabular' or 'github'; also the suffix of token-count keys
+  description: string // human-readable title, used as a heading in the report
+  data: any // raw payload handed to the formatters
+}
+
+export interface Question {
+  id: string // sequential id such as `q1`
+  prompt: string // question text put to the model
+  groundTruth: string // expected answer, stringified
+  type: 'field-retrieval' | 'aggregation' | 'filtering' | 'comparison'
+  dataset: string // `Dataset.name` the question is about
+}
+
+export interface EvaluationResult {
+  questionId: string
+  format: string
+  model: string
+  expected: string
+  actual: string // trimmed model answer; '' when the model call failed
+  correct: boolean // judge verdict; always false when the model call failed
+  inputTokens: number // 0 when unreported or when the model call failed
+  outputTokens: number // 0 when unreported or when the model call failed
+  latencyMs: number
+}
+
+export interface FormatResult {
+  format: string
+  accuracy: number // correctCount / totalCount, in the range 0–1
+  totalTokens: number // despite the name: rounded average formatted size in tokens per dataset
+  avgInputTokens: number // rounded mean of `EvaluationResult.inputTokens`
+  avgLatency: number // rounded mean of `EvaluationResult.latencyMs`
+  correctCount: number
+  totalCount: number
+}
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
deleted file mode 100644
index 146fbed..0000000
--- a/docs/benchmarks.md
+++ /dev/null
@@ -1,158 +0,0 @@
-| Example | JSON | TOON | Tokens Saved | Reduction |
-| ------- | ---- | ---- | ------------ | --------- |
-| 👤 Simple user object | 31 | 18 | 13 | **41.9%** |
-| 🏷️ User with tags | 48 | 28 | 20 | **41.7%** |
-| 📦 Small product catalog | 117 | 49 | 68 | **58.1%** |
-| 👥 API response with users | 123 | 53 | 70 | **56.9%** |
-| ⚙️ Nested configuration | 68 | 42 | 26 | **38.2%** |
-| 🛒 E-commerce order | 163 | 94 | 69 | **42.3%** |
-| 📊 Analytics data | 209 | 94 | 115 | **55.0%** |
-| 📈 Large dataset (50 records) | 2159 | 762 | 1397 | **64.7%** |
-| **Total** | **2918** | **1140** | **1778** | **60.9%** |
-
-<details>
-<summary><strong>View detailed results</strong></summary>
-
-### 📦 Small product catalog
-
-**Savings: 68 tokens (58.1% reduction)**
-
-**JSON** (117 tokens):
-
-```json
-{
-  "items": [
-    {
-      "sku": "A1",
-      "name": "Widget",
-      "qty": 2,
-      "price": 9.99
-    },
-    {
-      "sku": "B2",
-      "name": "Gadget",
-      "qty": 1,
-      "price": 14.5
-    },
-    {
-      "sku": "C3",
-      "name": "Doohickey",
-      "qty": 5,
-      "price": 7.25
-    }
-  ]
-}
-```
-
-**TOON** (49 tokens):
-
-```
-items[3]{sku,name,qty,price}:
-  A1,Widget,2,9.99
-  B2,Gadget,1,14.5
-  C3,Doohickey,5,7.25
-```
-
----
-
-### 👥 API response with users
-
-**Savings: 70 tokens (56.9% reduction)**
-
-**JSON** (123 tokens):
-
-```json
-{
-  "users": [
-    {
-      "id": 1,
-      "name": "Alice",
-      "email": "alice@example.com",
-      "active": true
-    },
-    {
-      "id": 2,
-      "name": "Bob",
-      "email": "bob@example.com",
-      "active": true
-    },
-    {
-      "id": 3,
-      "name": "Charlie",
-      "email": "charlie@example.com",
-      "active": false
-    }
-  ],
-  "total": 3,
-  "page": 1
-}
-```
-
-**TOON** (53 tokens):
-
-```
-users[3]{id,name,email,active}:
-  1,Alice,alice@example.com,true
-  2,Bob,bob@example.com,true
-  3,Charlie,charlie@example.com,false
-total: 3
-page: 1
-```
-
----
-
-### 📊 Analytics data
-
-**Savings: 115 tokens (55.0% reduction)**
-
-**JSON** (209 tokens):
-
-```json
-{
-  "metrics": [
-    {
-      "date": "2025-01-01",
-      "views": 1234,
-      "clicks": 89,
-      "conversions": 12
-    },
-    {
-      "date": "2025-01-02",
-      "views": 2345,
-      "clicks": 156,
-      "conversions": 23
-    },
-    {
-      "date": "2025-01-03",
-      "views": 1890,
-      "clicks": 123,
-      "conversions": 18
-    },
-    {
-      "date": "2025-01-04",
-      "views": 3456,
-      "clicks": 234,
-      "conversions": 34
-    },
-    {
-      "date": "2025-01-05",
-      "views": 2789,
-      "clicks": 178,
-      "conversions": 27
-    }
-  ]
-}
-```
-
-**TOON** (94 tokens):
-
-```
-metrics[5]{date,views,clicks,conversions}:
-  2025-01-01,1234,89,12
-  2025-01-02,2345,156,23
-  2025-01-03,1890,123,18
-  2025-01-04,3456,234,34
-  2025-01-05,2789,178,27
-```
-
-</details>
diff --git a/package.json b/package.json
index 61525de..8f13df7 100644
--- a/package.json
+++ b/package.json
@@ -26,7 +26,7 @@
     "dist"
   ],
   "scripts": {
-    "automd": "tsx scripts/generate-bench.ts && automd",
+    "automd": "automd",
     "build": "tsdown",
     "lint": "eslint .",
     "lint:fix": "eslint . --fix",
@@ -35,16 +35,16 @@
     "release": "bumpp"
   },
   "devDependencies": {
-    "@antfu/eslint-config": "^6.0.0",
+    "@antfu/eslint-config": "^6.1.0",
     "@types/node": "^24.9.1",
     "automd": "^0.4.2",
     "bumpp": "^10.3.1",
     "eslint": "^9.38.0",
     "gpt-tokenizer": "^3.2.0",
-    "tsdown": "^0.15.9",
+    "tsdown": "^0.15.10",
     "tsx": "^4.20.6",
     "typescript": "^5.9.3",
-    "vitest": "^3.2.4"
+    "vitest": "^4.0.3"
   },
   "pnpm": {
     "onlyBuiltDependencies": [
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 20244df..3894eb4 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -9,8 +9,8 @@ importers:
   .:
     devDependencies:
       '@antfu/eslint-config':
-        specifier: ^6.0.0
-        version: 6.0.0(@vue/compiler-sfc@3.5.22)(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@3.2.4(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))
+        specifier: ^6.1.0
+        version: 6.1.0(@vue/compiler-sfc@3.5.22)(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))
       '@types/node':
         specifier: ^24.9.1
         version: 24.9.1
@@ -27,8 +27,8 @@ importers:
         specifier: ^3.2.0
         version: 3.2.0
       tsdown:
-        specifier: ^0.15.9
-        version: 0.15.9(typescript@5.9.3)
+        specifier: ^0.15.10
+        version: 0.15.10(typescript@5.9.3)
       tsx:
         specifier: ^4.20.6
         version: 4.20.6
@@ -36,17 +36,93 @@ importers:
         specifier: ^5.9.3
         version: 5.9.3
       vitest:
-        specifier: ^3.2.4
-        version: 3.2.4(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1)
+        specifier: ^4.0.3
+        version: 4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1)
+
+  benchmarks:
+    devDependencies:
+      '@ai-sdk/anthropic':
+        specifier: ^2.0.37
+        version: 2.0.37(zod@4.1.12)
+      '@ai-sdk/google':
+        specifier: ^2.0.23
+        version: 2.0.23(zod@4.1.12)
+      '@ai-sdk/openai':
+        specifier: ^2.0.53
+        version: 2.0.53(zod@4.1.12)
+      '@ai-sdk/provider':
+        specifier: ^2.0.0
+        version: 2.0.0
+      '@antfu/eslint-config':
+        specifier: ^6.1.0
+        version: 6.1.0(@vue/compiler-sfc@3.5.22)(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))
+      '@faker-js/faker':
+        specifier: ^10.1.0
+        version: 10.1.0
+      ai:
+        specifier: ^5.0.80
+        version: 5.0.80(zod@4.1.12)
+      consola:
+        specifier: ^3.4.2
+        version: 3.4.2
+      csv-stringify:
+        specifier: ^6.6.0
+        version: 6.6.0
+      gpt-tokenizer:
+        specifier: ^3.2.0
+        version: 3.2.0
+      ofetch:
+        specifier: ^1.4.1
+        version: 1.4.1
+      p-map:
+        specifier: ^7.0.3
+        version: 7.0.3
+      yaml:
+        specifier: ^2.8.1
+        version: 2.8.1
 
 packages:
 
-  '@antfu/eslint-config@6.0.0':
-    resolution: {integrity: sha512-M2RM+x+hpxpASEZzQh4d5uaUEHn8sYNVlTB+CySpLkDs2rr3QFvRR7KqNdnox/OIPc6YWMsIEnM/XUbQP52nTA==}
+  '@ai-sdk/anthropic@2.0.37':
+    resolution: {integrity: sha512-r2e9BWoobisH9B5b7x3yYG/k9WlsZqa4D94o7gkwktReqrjjv83zNMop4KmlJsh/zBhbsaP8S8SUfiwK+ESxgg==}
+    engines: {node: '>=18'}
+    peerDependencies:
+      zod: ^3.25.76 || ^4.1.8
+
+  '@ai-sdk/gateway@2.0.1':
+    resolution: {integrity: sha512-vPVIbnP35ZnayS937XLo85vynR85fpBQWHCdUweq7apzqFOTU2YkUd4V3msebEHbQ2Zro60ZShDDy9SMiyWTqA==}
+    engines: {node: '>=18'}
+    peerDependencies:
+      zod: ^3.25.76 || ^4.1.8
+
+  '@ai-sdk/google@2.0.23':
+    resolution: {integrity: sha512-VbCnKR+6aWUVLkAiSW5gUEtST7KueEmlt+d6qwDikxlLnFG9pzy59je8MiDVeM5G2tuSXbvZQF78PGIfXDBmow==}
+    engines: {node: '>=18'}
+    peerDependencies:
+      zod: ^3.25.76 || ^4.1.8
+
+  '@ai-sdk/openai@2.0.53':
+    resolution: {integrity: sha512-GIkR3+Fyif516ftXv+YPSPstnAHhcZxNoR2s8uSHhQ1yBT7I7aQYTVwpjAuYoT3GR+TeP50q7onj2/nDRbT2FQ==}
+    engines: {node: '>=18'}
+    peerDependencies:
+      zod: ^3.25.76 || ^4.1.8
+
+  '@ai-sdk/provider-utils@3.0.12':
+    resolution: {integrity: sha512-ZtbdvYxdMoria+2SlNarEk6Hlgyf+zzcznlD55EAl+7VZvJaSg2sqPvwArY7L6TfDEDJsnCq0fdhBSkYo0Xqdg==}
+    engines: {node: '>=18'}
+    peerDependencies:
+      zod: ^3.25.76 || ^4.1.8
+
+  '@ai-sdk/provider@2.0.0':
+    resolution: {integrity: sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA==}
+    engines: {node: '>=18'}
+
+  '@antfu/eslint-config@6.1.0':
+    resolution: {integrity: sha512-m/L9TGvtG3r4tkfq5BY6THz7pk0g6yuJwwA0SkLEDHJJpt0upuABhs8v3SU8yaPtCGUxq8k2QTLMZ3WPg4vSdw==}
     hasBin: true
     peerDependencies:
       '@eslint-react/eslint-plugin': ^2.0.1
-      '@next/eslint-plugin-next': ^15.4.0-canary.115
+      '@next/eslint-plugin-next': '>=15.0.0'
       '@prettier/plugin-xml': ^3.4.1
       '@unocss/eslint-plugin': '>=0.50.0'
       astro-eslint-parser: ^1.0.2
@@ -99,20 +175,20 @@ packages:
   '@antfu/install-pkg@1.1.0':
     resolution: {integrity: sha512-MGQsmw10ZyI+EJo45CdSER4zEb+p31LpDAFp2Z3gkSd1yqVZGi0Ebx++YTEMonJy4oChEMLsxZ64j8FH6sSqtQ==}
 
-  '@babel/generator@7.28.3':
-    resolution: {integrity: sha512-3lSpxGgvnmZznmBkCRnVREPUFJv2wrv9iAoFDvADJc0ypmdOxdUtcLeBgBJ6zE0PMeTKnxeQzyk0xTBq4Ep7zw==}
+  '@babel/generator@7.28.5':
+    resolution: {integrity: sha512-3EwLFhZ38J4VyIP6WNtt2kUdW9dokXA9Cr4IVIFHuCpZ3H8/YFOl5JjZHisrn1fATPBmKKqXzDFvh9fUwHz6CQ==}
     engines: {node: '>=6.9.0'}
 
   '@babel/helper-string-parser@7.27.1':
     resolution: {integrity: sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA==}
     engines: {node: '>=6.9.0'}
 
-  '@babel/helper-validator-identifier@7.27.1':
-    resolution: {integrity: sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow==}
+  '@babel/helper-validator-identifier@7.28.5':
+    resolution: {integrity: sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==}
     engines: {node: '>=6.9.0'}
 
-  '@babel/parser@7.28.4':
-    resolution: {integrity: sha512-yZbBqeM6TkpP9du/I2pUZnJsRMGGvOuIrhjzC1AwHwW+6he4mni6Bp/m8ijn0iOuZuPI2BfkCoSRunpyjnrQKg==}
+  '@babel/parser@7.28.5':
+    resolution: {integrity: sha512-KKBU1VGYR7ORr3At5HAtUQ+TV3SzRCXmA/8OdDZiLDBIZxVyzXuztPjfLd3BV1PRAQGCMWWSHYhL0F8d5uHBDQ==}
     engines: {node: '>=6.0.0'}
     hasBin: true
 
@@ -120,8 +196,8 @@ packages:
     resolution: {integrity: sha512-Q/N6JNWvIvPnLDvjlE1OUBLPQHH6l3CltCEsHIujp45zQUSSh8K+gHnaEX45yAT1nyngnINhvWtzN+Nb9D8RAQ==}
     engines: {node: '>=6.9.0'}
 
-  '@babel/types@7.28.4':
-    resolution: {integrity: sha512-bkFqkLhh3pMBUQQkpVgWDWq/lqzc2678eUyDlTBhRqhCHFguYYGM0Efga7tYk4TogG/3x0EEl66/OQ+WGbWB/Q==}
+  '@babel/types@7.28.5':
+    resolution: {integrity: sha512-qQ5m48eI/MFLQ5PxQj4PFaprjyCTLI37ElWMmNs0K8Lk3dVeOdNpB3ks8jc7yM5CDmVC73eMVk/trk3fgmrUpA==}
     engines: {node: '>=6.9.0'}
 
   '@clack/core@0.5.0':
@@ -143,10 +219,14 @@ packages:
     resolution: {integrity: sha512-YAdE/IJSpwbOTiaURNCKECdAwqrJuFiZhylmesBcIRawtYKnBR2wxPhoIewMg+Yu+QuYvHfJNReWpoxGBKOChA==}
     engines: {node: '>=18'}
 
-  '@es-joy/jsdoccomment@0.58.0':
-    resolution: {integrity: sha512-smMc5pDht/UVsCD3hhw/a/e/p8m0RdRYiluXToVfd+d4yaQQh7nn9bACjkk6nXJvat7EWPAxuFkMEFfrxeGa3Q==}
+  '@es-joy/jsdoccomment@0.76.0':
+    resolution: {integrity: sha512-g+RihtzFgGTx2WYCuTHbdOXJeAlGnROws0TeALx9ow/ZmOROOZkVg5wp/B44n0WJgI4SQFP1eWM2iRPlU2Y14w==}
     engines: {node: '>=20.11.0'}
 
+  '@es-joy/resolve.exports@1.0.0':
+    resolution: {integrity: sha512-bbrmzsAZ9GA/3oBS6r8PWMtZarEhKHr413hak8ArwMEZ5DtaLErnkcyEWUsXy7urBcmVu/TpDzHPDVM5uIbx9A==}
+    engines: {node: '>=10'}
+
   '@esbuild/aix-ppc64@0.25.11':
     resolution: {integrity: sha512-Xt1dOL13m8u0WE8iplx9Ibbm+hFAO0GsU2P34UNoDGvZYkY8ifSiy6Zuc1lYxfG7svWE2fzqCUmFp5HCn51gJg==}
     engines: {node: '>=18'}
@@ -368,6 +448,10 @@ packages:
     resolution: {integrity: sha512-sB5uyeq+dwCWyPi31B2gQlVlo+j5brPlWx4yZBrEaRo/nhdDE8Xke1gsGgtiBdaBTxuTkceLVuVt/pclrasb0A==}
     engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0}
 
+  '@faker-js/faker@10.1.0':
+    resolution: {integrity: sha512-C3mrr3b5dRVlKPJdfrAXS8+dq+rq8Qm5SNRazca0JKgw1HQERFmrVb0towvMmw5uu8hHKNiQasMaR/tydf3Zsg==}
+    engines: {node: ^20.19.0 || ^22.13.0 || ^23.5.0 || >=24.0.0, npm: '>=10'}
+
   '@humanfs/core@0.19.1':
     resolution: {integrity: sha512-5DyQ4+1JEUzejeK1JGICcideyfUbGixgS9jNgex5nqkW+cY7WZhxBigmieN5Qnw9ZosSNVC9KQKyb+GUaGyKUA==}
     engines: {node: '>=18.18.0'}
@@ -412,6 +496,14 @@ packages:
     resolution: {integrity: sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==}
     engines: {node: '>= 8'}
 
+  '@opentelemetry/api@1.9.0':
+    resolution: {integrity: sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==}
+    engines: {node: '>=8.0.0'}
+
+  '@oxc-project/runtime@0.95.0':
+    resolution: {integrity: sha512-qJS5pNepwMGnafO9ayKGz7rfPQgUBuunHpnP1//9Qa0zK3oT3t1EhT+I+pV9MUA+ZKez//OFqxCxf1vijCKb2Q==}
+    engines: {node: ^20.19.0 || >=22.12.0}
+
   '@oxc-project/types@0.95.0':
     resolution: {integrity: sha512-vACy7vhpMPhjEJhULNxrdR0D943TkA/MigMpJCHmBHvMXxRStRi/dPtTlfQ3uDwWSzRpT8z+7ImjZVf8JWBocQ==}
 
@@ -700,6 +792,13 @@ packages:
     cpu: [x64]
     os: [win32]
 
+  '@sindresorhus/base62@1.0.0':
+    resolution: {integrity: sha512-TeheYy0ILzBEI/CO55CP6zJCSdSWeRtGnHy8U8dWSUH4I68iqTsy7HkMktR4xakThc9jotkPQUXT4ITdbV7cHA==}
+    engines: {node: '>=18'}
+
+  '@standard-schema/spec@1.0.0':
+    resolution: {integrity: sha512-m2bOd0f2RT9k8QJx1JN85cZYyH1RqFBdlwtkSlf4tBDYLCiiZnv1fIIwacK6cqwXavOydf0NPToMQgpKq+dVlA==}
+
   '@stylistic/eslint-plugin@5.5.0':
     resolution: {integrity: sha512-IeZF+8H0ns6prg4VrkhgL+yrvDXWDH2cKchrbh80ejG9dQgZWp10epHMbgRuQvgchLII/lfh6Xn3lu6+6L86Hw==}
     engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0}
@@ -795,12 +894,16 @@ packages:
     resolution: {integrity: sha512-tUFMXI4gxzzMXt4xpGJEsBsTox0XbNQ1y94EwlD/CuZwFcQP79xfQqMhau9HsRc/J0cAPA/HZt1dZPtGn9V/7w==}
     engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0}
 
-  '@vitest/eslint-plugin@1.3.23':
-    resolution: {integrity: sha512-kp1vjoJTdVf8jWdzr/JpHIPfh3HMR6JBr2p7XuH4YNx0UXmV4XWdgzvCpAmH8yb39Gry31LULiuBcuhyc/OqkQ==}
+  '@vercel/oidc@3.0.3':
+    resolution: {integrity: sha512-yNEQvPcVrK9sIe637+I0jD6leluPxzwJKx/Haw6F4H77CdDsszUn5V3o96LPziXkSNE2B83+Z3mjqGKBK/R6Gg==}
+    engines: {node: '>= 20'}
+
+  '@vitest/eslint-plugin@1.3.25':
+    resolution: {integrity: sha512-7qM/FrA2VyUmrorP0TQ/Oqhn6wsAcktg6euBn0XmpgF0yT2mDxjziu2QLy86i2mOJ41Wtt55z6aUWo+bfmyAeg==}
     engines: {node: '>=18'}
     peerDependencies:
-      eslint: '>= 8.57.0'
-      typescript: '>= 5.0.0'
+      eslint: '>=8.57.0'
+      typescript: '>=5.0.0'
       vitest: '*'
     peerDependenciesMeta:
       typescript:
@@ -808,34 +911,34 @@ packages:
       vitest:
         optional: true
 
-  '@vitest/expect@3.2.4':
-    resolution: {integrity: sha512-Io0yyORnB6sikFlt8QW5K7slY4OjqNX9jmJQ02QDda8lyM6B5oNgVWoSoKPac8/kgnCUzuHQKrSLtu/uOqqrig==}
+  '@vitest/expect@4.0.3':
+    resolution: {integrity: sha512-v3eSDx/bF25pzar6aEJrrdTXJduEBU3uSGXHslIdGIpJVP8tQQHV6x1ZfzbFQ/bLIomLSbR/2ZCfnaEGkWkiVQ==}
 
-  '@vitest/mocker@3.2.4':
-    resolution: {integrity: sha512-46ryTE9RZO/rfDd7pEqFl7etuyzekzEhUbTW3BvmeO/BcCMEgq59BKhek3dXDWgAj4oMK6OZi+vRr1wPW6qjEQ==}
+  '@vitest/mocker@4.0.3':
+    resolution: {integrity: sha512-evZcRspIPbbiJEe748zI2BRu94ThCBE+RkjCpVF8yoVYuTV7hMe+4wLF/7K86r8GwJHSmAPnPbZhpXWWrg1qbA==}
     peerDependencies:
       msw: ^2.4.9
-      vite: ^5.0.0 || ^6.0.0 || ^7.0.0-0
+      vite: ^6.0.0 || ^7.0.0-0
     peerDependenciesMeta:
       msw:
         optional: true
       vite:
         optional: true
 
-  '@vitest/pretty-format@3.2.4':
-    resolution: {integrity: sha512-IVNZik8IVRJRTr9fxlitMKeJeXFFFN0JaB9PHPGQ8NKQbGpfjlTx9zO4RefN8gp7eqjNy8nyK3NZmBzOPeIxtA==}
+  '@vitest/pretty-format@4.0.3':
+    resolution: {integrity: sha512-N7gly/DRXzxa9w9sbDXwD9QNFYP2hw90LLLGDobPNwiWgyW95GMxsCt29/COIKKh3P7XJICR38PSDePenMBtsw==}
 
-  '@vitest/runner@3.2.4':
-    resolution: {integrity: sha512-oukfKT9Mk41LreEW09vt45f8wx7DordoWUZMYdY/cyAk7w5TWkTRCNZYF7sX7n2wB7jyGAl74OxgwhPgKaqDMQ==}
+  '@vitest/runner@4.0.3':
+    resolution: {integrity: sha512-1/aK6fPM0lYXWyGKwop2Gbvz1plyTps/HDbIIJXYtJtspHjpXIeB3If07eWpVH4HW7Rmd3Rl+IS/+zEAXrRtXA==}
 
-  '@vitest/snapshot@3.2.4':
-    resolution: {integrity: sha512-dEYtS7qQP2CjU27QBC5oUOxLE/v5eLkGqPE0ZKEIDGMs4vKWe7IjgLOeauHsR0D5YuuycGRO5oSRXnwnmA78fQ==}
+  '@vitest/snapshot@4.0.3':
+    resolution: {integrity: sha512-amnYmvZ5MTjNCP1HZmdeczAPLRD6iOm9+2nMRUGxbe/6sQ0Ymur0NnR9LIrWS8JA3wKE71X25D6ya/3LN9YytA==}
 
-  '@vitest/spy@3.2.4':
-    resolution: {integrity: sha512-vAfasCOe6AIK70iP5UD11Ac4siNUNJ9i/9PZ3NKx07sG6sUxeag1LWdNrMWeKKYBLlzuK+Gn65Yd5nyL6ds+nw==}
+  '@vitest/spy@4.0.3':
+    resolution: {integrity: sha512-82vVL8Cqz7rbXaNUl35V2G7xeNMAjBdNOVaHbrzznT9BmiCiPOzhf0FhU3eP41nP1bLDm/5wWKZqkG4nyU95DQ==}
 
-  '@vitest/utils@3.2.4':
-    resolution: {integrity: sha512-fB2V0JFrQSMsCo9HiSq3Ezpdv4iYaXRG1Sx8edX3MwxfyNn83mKiGzOcH+Fkxt4MHxr3y42fQi1oeAInqgX2QA==}
+  '@vitest/utils@4.0.3':
+    resolution: {integrity: sha512-qV6KJkq8W3piW6MDIbGOmn1xhvcW4DuA07alqaQ+vdx7YA49J85pnwnxigZVQFQw3tWnQNRKWwhz5wbP6iv/GQ==}
 
   '@vue/compiler-core@3.5.22':
     resolution: {integrity: sha512-jQ0pFPmZwTEiRNSb+i9Ow/I/cHv2tXYqsnHKKyCQ08irI2kdF5qmYedmF8si8mA7zepUFmJ2hqzS8CQmNOWOkQ==}
@@ -862,6 +965,12 @@ packages:
     engines: {node: '>=0.4.0'}
     hasBin: true
 
+  ai@5.0.80:
+    resolution: {integrity: sha512-g1o6pjxm1eTtyh295dRhsg0gvZaHFlSo2oruWrK2rIR7KafWEhNB2A2/aJ9hyPT9AMI8JnQJyto1Tl9DMqwc9w==}
+    engines: {node: '>=18'}
+    peerDependencies:
+      zod: ^3.25.76 || ^4.1.8
+
   ajv@6.12.6:
     resolution: {integrity: sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==}
 
@@ -898,8 +1007,8 @@ packages:
   balanced-match@1.0.2:
     resolution: {integrity: sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==}
 
-  baseline-browser-mapping@2.8.19:
-    resolution: {integrity: sha512-zoKGUdu6vb2jd3YOq0nnhEDQVbPcHhco3UImJrv5dSkvxTc2pl2WjOPsjZXDwPDSl5eghIMuY3R6J9NDKF3KcQ==}
+  baseline-browser-mapping@2.8.20:
+    resolution: {integrity: sha512-JMWsdF+O8Orq3EMukbUN1QfbLK9mX2CkUmQBcW2T0s8OmdAUL5LLM/6wFwSrqXzlXB13yhyK9gTKS1rIizOduQ==}
     hasBin: true
 
   birpc@2.6.1:
@@ -954,8 +1063,8 @@ packages:
   ccount@2.0.1:
     resolution: {integrity: sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg==}
 
-  chai@5.3.3:
-    resolution: {integrity: sha512-4zNhdJD/iOjSH0A05ea+Ke6MU5mmpQcbQsSOkgdaUMJ9zTlDTD/GYlwohmIE2u0gaxHYiVHEn1Fw9mZ/ktJWgw==}
+  chai@6.2.0:
+    resolution: {integrity: sha512-aUTnJc/JipRzJrNADXVvpVqi6CO0dn3nx4EVPxijri+fj3LUUDyZQOgVeW54Ob3Y1Xh9Iz8f+CgaCl8v0mn9bA==}
     engines: {node: '>=18'}
 
   chalk@4.1.2:
@@ -968,10 +1077,6 @@ packages:
   character-entities@2.0.2:
     resolution: {integrity: sha512-shx7oQ0Awen/BRIdkjkvz54PnEEI/EjwXDSIZp86/KKdbafHh1Df/RYGBhn4hbe2+uKC9FnT5UCEdyPz3ai9hQ==}
 
-  check-error@2.1.1:
-    resolution: {integrity: sha512-OAlb+T7V4Op9OwdkjmguYRqncdlx5JiofwOAUkmTF+jNdHwzTaTs4sRAGpzLF3oOz5xAyDGrPgeIDFQmDOTiJw==}
-    engines: {node: '>= 16'}
-
   chokidar@4.0.3:
     resolution: {integrity: sha512-Qgzu8kfBvo+cA4962jnP1KkS6Dop5NS6g7R5LFYJr4b8Ub94PPQXUksCw9PvXoeXPRRddRNC5C1JQUR2SMGtnA==}
     engines: {node: '>= 14.16.0'}
@@ -1023,6 +1128,9 @@ packages:
     engines: {node: '>=4'}
     hasBin: true
 
+  csv-stringify@6.6.0:
+    resolution: {integrity: sha512-YW32lKOmIBgbxtu3g5SaiqWNwa/9ISQt2EcgOq0+RAIFufFp9is6tqNnKahqE5kuKvrnYAzs28r+s6pXJR8Vcw==}
+
   debug@4.4.3:
     resolution: {integrity: sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==}
     engines: {node: '>=6.0'}
@@ -1035,10 +1143,6 @@ packages:
   decode-named-character-reference@1.2.0:
     resolution: {integrity: sha512-c6fcElNV6ShtZXmsgNgFFV5tVX2PaV4g+MOAkb8eXHvn6sryJBrZa9r0zV6+dtTyoCKxtDy5tyQ5ZwQuidtd+Q==}
 
-  deep-eql@5.0.2:
-    resolution: {integrity: sha512-h5k/5U50IJJFpzfL6nO9jaaumfjO/f2NjK/oYB2Djzm4p9L+3T9qWpZqZ2hAbLPuuYq9wrU08WQyBTL5GbPk5Q==}
-    engines: {node: '>=6'}
-
   deep-is@0.1.4:
     resolution: {integrity: sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==}
 
@@ -1085,8 +1189,8 @@ packages:
       oxc-resolver:
         optional: true
 
-  electron-to-chromium@1.5.238:
-    resolution: {integrity: sha512-khBdc+w/Gv+cS8e/Pbnaw/FXcBUeKrRVik9IxfXtgREOWyJhR4tj43n3amkVogJ/yeQUqzkrZcFhtIxIdqmmcQ==}
+  electron-to-chromium@1.5.240:
+    resolution: {integrity: sha512-OBwbZjWgrCOH+g6uJsA2/7Twpas2OlepS9uvByJjR2datRDuKGYeD+nP8lBBks2qnB7bGJNHDUx7c/YLaT3QMQ==}
 
   empathic@2.0.0:
     resolution: {integrity: sha512-i6UzDscO/XfAcNYD75CfICkmfLedpyPDdozrLMmQc5ORaQcdMoc21OnlEylMIqI7U8eniKrPMxxtj8k0vhmJhA==}
@@ -1186,8 +1290,8 @@ packages:
       typescript:
         optional: true
 
-  eslint-plugin-jsdoc@59.1.0:
-    resolution: {integrity: sha512-sg9mzjjzfnMynyY4W8FDiQv3i8eFcKVEHDt4Xh7MLskP3QkMt2z6p7FuzSw7jJSKFues6RaK2GWvmkB1FLPxXg==}
+  eslint-plugin-jsdoc@61.1.9:
+    resolution: {integrity: sha512-X2AzSGbq1CzBRgKcVAu2qzOV9ogqygkUDk5AX6eNK5G+kY3I5Op5E5b99fE+FN0/bGnk2KGcsMIG6ZLF+di69A==}
     engines: {node: '>=20.11.0'}
     peerDependencies:
       eslint: ^7.0.0 || ^8.0.0 || ^9.0.0
@@ -1324,6 +1428,10 @@ packages:
     resolution: {integrity: sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==}
     engines: {node: '>=0.10.0'}
 
+  eventsource-parser@3.0.6:
+    resolution: {integrity: sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==}
+    engines: {node: '>=18.0.0'}
+
   expect-type@1.2.2:
     resolution: {integrity: sha512-JhFGDVJ7tmDJItKhYgJCGLOWjuK9vPxiXoUFLwLDc99NlmklilbiQJwoctZtt13+xMw91MCk/REan6MWHqDjyA==}
     engines: {node: '>=12.0.0'}
@@ -1444,6 +1552,9 @@ packages:
   hookable@5.5.3:
     resolution: {integrity: sha512-Yc+BQe8SvoXH1643Qez1zqLRmbA5rCL+sSmk6TVos0LWVfNIB7PGncdlId77WzLGSIB5KaWgTaNTs2lNVEI6VQ==}
 
+  html-entities@2.6.0:
+    resolution: {integrity: sha512-kig+rMn/QOVRvr7c86gQ8lWXq+Hkv6CbAH1hLu+RG338StTpE8Z0b44SDVaqVu7HGKf27frdmUYEs9hTUX/cLQ==}
+
   ignore@5.3.2:
     resolution: {integrity: sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==}
     engines: {node: '>= 4'}
@@ -1487,9 +1598,6 @@ packages:
     resolution: {integrity: sha512-ekilCSN1jwRvIbgeg/57YFh8qQDNbwDb9xT/qu2DAHbFFZUicIl4ygVaAvzveMhMVr3LnpSKTNnwt8PoOfmKhQ==}
     hasBin: true
 
-  js-tokens@9.0.1:
-    resolution: {integrity: sha512-mxa9E9ITFOt0ban3j6L5MpjwegGz6lBQmM1IJkWeBZGcMxto50+eWdjC/52xDbS2vy0k7vIMK0Fe2wfL9OQSpQ==}
-
   js-yaml@4.1.0:
     resolution: {integrity: sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==}
     hasBin: true
@@ -1502,9 +1610,9 @@ packages:
     resolution: {integrity: sha512-iZ8Bdb84lWRuGHamRXFyML07r21pcwBrLkHEuHgEY5UbCouBwv7ECknDRKzsQIXMiqpPymqtIf8TC/shYKB5rw==}
     engines: {node: '>=12.0.0'}
 
-  jsdoc-type-pratt-parser@5.4.0:
-    resolution: {integrity: sha512-F9GQ+F1ZU6qvSrZV8fNFpjDNf614YzR2eF6S0+XbDjAcUI28FSoXnYZFjQmb1kFx3rrJb5PnxUH3/Yti6fcM+g==}
-    engines: {node: '>=12.0.0'}
+  jsdoc-type-pratt-parser@6.10.0:
+    resolution: {integrity: sha512-+LexoTRyYui5iOhJGn13N9ZazL23nAHGkXsa1p/C8yeq79WRfLBag6ZZ0FQG2aRoc9yfo59JT9EYCQonOkHKkQ==}
+    engines: {node: '>=20.0.0'}
 
   jsesc@3.0.2:
     resolution: {integrity: sha512-xKqzzWXDttJuOcawBt4KnKHHIf5oQ/Cxax+0PWFG+DFDgHNAdi+TXECADI+RYiFUMmx8792xsMbbgXj4CwnP4g==}
@@ -1522,6 +1630,9 @@ packages:
   json-schema-traverse@0.4.1:
     resolution: {integrity: sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==}
 
+  json-schema@0.4.0:
+    resolution: {integrity: sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==}
+
   json-stable-stringify-without-jsonify@1.0.1:
     resolution: {integrity: sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==}
 
@@ -1562,11 +1673,8 @@ packages:
   longest-streak@3.1.0:
     resolution: {integrity: sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g==}
 
-  loupe@3.2.1:
-    resolution: {integrity: sha512-CdzqowRJCeLU72bHvWqwRBBlLcMEtIvGrlvef74kMnV2AolS9Y8xUv1I0U/MNAWMhBlKIoyuEgoJ0t/bbwHbLQ==}
-
-  magic-string@0.30.19:
-    resolution: {integrity: sha512-2N21sPY9Ws53PZvsEpVtNuSW+ScYbQdp4b9qUaL+9QkHUrGFKo56Lg9Emg5s9V/qrtNBmiR01sYhUOwu3H+VOw==}
+  magic-string@0.30.21:
+    resolution: {integrity: sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ==}
 
   markdown-table@3.0.4:
     resolution: {integrity: sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw==}
@@ -1750,8 +1858,8 @@ packages:
     engines: {node: ^14.16.0 || >=16.10.0}
     hasBin: true
 
-  object-deep-merge@1.0.5:
-    resolution: {integrity: sha512-3DioFgOzetbxbeUq8pB2NunXo8V0n4EvqsWM/cJoI6IA9zghd7cl/2pBOuWRf4dlvA+fcg5ugFMZaN2/RuoaGg==}
+  object-deep-merge@2.0.0:
+    resolution: {integrity: sha512-3DC3UMpeffLTHiuXSy/UG4NOIYTLlY9u3V82+djSCLYClWobZiS4ivYzpIUWrRY/nfsJ8cWsKyG3QfyLePmhvg==}
 
   ofetch@1.4.1:
     resolution: {integrity: sha512-QZj2DfGplQAr2oj9KzceK9Hwz6Whxazmn85yYeVuS3u9XTMOGMRx0kO95MQ+vLsj/S/NwBDMMLU5hpxvI6Tklw==}
@@ -1771,6 +1879,10 @@ packages:
     resolution: {integrity: sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw==}
     engines: {node: '>=10'}
 
+  p-map@7.0.3:
+    resolution: {integrity: sha512-VkndIv2fIB99swvQoA65bm+fsmt6UNdGeIB0oxBs+WhAhdh08QA04JXpI7rbB9r08/nkbysKoya9rtDERYOYMA==}
+    engines: {node: '>=18'}
+
   package-manager-detector@1.5.0:
     resolution: {integrity: sha512-uBj69dVlYe/+wxj8JOpr97XfsxH/eumMt6HqjNTmJDf/6NO9s+0uxeOneIz3AsPt2m6y9PqzDzd3ATcU17MNfw==}
 
@@ -1799,10 +1911,6 @@ packages:
   pathe@2.0.3:
     resolution: {integrity: sha512-WUjGcAqP1gQacoQe+OBJsFA7Ld4DyXuUIjZ5cc75cLHvJ7dtNsTugphxIADwspS+AraAUePCKrSVtPLFj/F88w==}
 
-  pathval@2.0.1:
-    resolution: {integrity: sha512-//nshmD55c46FuFw26xV/xFAaB5HF9Xdap7HJBBnrKdAd6/GxDBaNA1870O79+9ueg61cZLSVc+OaFlfmObYVQ==}
-    engines: {node: '>= 14.16'}
-
   perfect-debounce@2.0.0:
     resolution: {integrity: sha512-fkEH/OBiKrqqI/yIgjR92lMfs2K8105zt/VT6+7eTjNwisrsh47CeIED9z58zI7DfKdH3uHAn25ziRZn3kgAow==}
 
@@ -1875,6 +1983,10 @@ packages:
     resolution: {integrity: sha512-cnE+y8bz4NhMjISKbgeVJtqNbtf5QpjZP+Bslo+UqkIt9QPnX9q095eiRRASJG1/tz6dlNr6Z5NsBiWYokp6EQ==}
     hasBin: true
 
+  reserved-identifiers@1.2.0:
+    resolution: {integrity: sha512-yE7KUfFvaBFzGPs5H3Ops1RevfUEsDc5Iz65rOwWg4lE8HJSYtle77uul3+573457oHvBKuHYDl/xqUkKpEEdw==}
+    engines: {node: '>=18'}
+
   resolve-from@4.0.0:
     resolution: {integrity: sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==}
     engines: {node: '>=4'}
@@ -1886,13 +1998,13 @@ packages:
     resolution: {integrity: sha512-g6QUff04oZpHs0eG5p83rFLhHeV00ug/Yf9nZM6fLeUrPguBTkTQOdpAWWspMh55TZfVQDPaN3NQJfbVRAxdIw==}
     engines: {iojs: '>=1.0.0', node: '>=0.10.0'}
 
-  rolldown-plugin-dts@0.16.12:
-    resolution: {integrity: sha512-9dGjm5oqtKcbZNhpzyBgb8KrYiU616A7IqcFWG7Msp1RKAXQ/hapjivRg+g5IYWSiFhnk3OKYV5T4Ft1t8Cczg==}
+  rolldown-plugin-dts@0.17.1:
+    resolution: {integrity: sha512-dQfoYD9kwSau7UQPg0UubprCDcwWeEKYd9SU9O2MpOdKy3VHy3/DaDF+x6w9+KE/w6J8qxkHVjwG1K2QmmQAFA==}
     engines: {node: '>=20.18.0'}
     peerDependencies:
       '@ts-macro/tsc': ^0.3.6
       '@typescript/native-preview': '>=7.0.0-dev.20250601.1'
-      rolldown: ^1.0.0-beta.9
+      rolldown: ^1.0.0-beta.44
       typescript: ^5.0.0
       vue-tsc: ~3.1.0
     peerDependenciesMeta:
@@ -1971,9 +2083,6 @@ packages:
     resolution: {integrity: sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==}
     engines: {node: '>=8'}
 
-  strip-literal@3.1.0:
-    resolution: {integrity: sha512-8r3mkIM/2+PpjHoOtiAW8Rg3jJLHaV7xPwG+YRGrv6FP0wwk/toTpATxWYOW0BKdWwl82VT2tFYi5DlROa0Mxg==}
-
   supports-color@7.2.0:
     resolution: {integrity: sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==}
     engines: {node: '>=8'}
@@ -1999,22 +2108,18 @@ packages:
     resolution: {integrity: sha512-j2Zq4NyQYG5XMST4cbs02Ak8iJUdxRM0XI5QyxXuZOzKOINmWurp3smXu3y5wDcJrptwpSjgXHzIQxR0omXljQ==}
     engines: {node: '>=12.0.0'}
 
-  tinypool@1.1.1:
-    resolution: {integrity: sha512-Zba82s87IFq9A9XmjiX5uZA/ARWDrB03OHlq+Vw1fSdt0I+4/Kutwy8BP4Y/y/aORMo61FQ0vIb5j44vSo5Pkg==}
-    engines: {node: ^18.0.0 || >=20.0.0}
-
-  tinyrainbow@2.0.0:
-    resolution: {integrity: sha512-op4nsTR47R6p0vMUUoYl/a+ljLFVtlfaXkLQmqfLR1qHma1h/ysYk4hEXZ880bf2CYgTskvTa/e196Vd5dDQXw==}
-    engines: {node: '>=14.0.0'}
-
-  tinyspy@4.0.4:
-    resolution: {integrity: sha512-azl+t0z7pw/z958Gy9svOTuzqIk6xq+NSheJzn5MMWtWTFywIacg2wUlzKFGtt3cthx0r2SxMK0yzJOR0IES7Q==}
+  tinyrainbow@3.0.3:
+    resolution: {integrity: sha512-PSkbLUoxOFRzJYjjxHJt9xro7D+iilgMX/C9lawzVuYiIdcihh9DXmVibBe8lmcFrRi/VzlPjBxbN7rH24q8/Q==}
     engines: {node: '>=14.0.0'}
 
   to-regex-range@5.0.1:
     resolution: {integrity: sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==}
     engines: {node: '>=8.0'}
 
+  to-valid-identifier@1.0.0:
+    resolution: {integrity: sha512-41wJyvKep3yT2tyPqX/4blcfybknGB4D+oETKLs7Q76UiPqRpUJK3hr1nxelyYO0PHKVzJwlu0aCeEAsGI6rpw==}
+    engines: {node: '>=20'}
+
   toml-eslint-parser@0.10.0:
     resolution: {integrity: sha512-khrZo4buq4qVmsGzS5yQjKe/WsFvV8fGfOjDQN0q4iy9FjRfPWRgTFrU8u1R2iu/SfWLhY9WnCi4Jhdrcbtg+g==}
     engines: {node: ^12.22.0 || ^14.17.0 || >=16.0.0}
@@ -2034,8 +2139,8 @@ packages:
     peerDependencies:
       typescript: '>=4.0.0'
 
-  tsdown@0.15.9:
-    resolution: {integrity: sha512-C0EJYpXIYdlJokTumIL4lmv/wEiB20oa6iiYsXFE7Q0VKF3Ju6TQ7XAn4JQdm+2iQGEfl8cnEKcX5DB7iVR5Dw==}
+  tsdown@0.15.10:
+    resolution: {integrity: sha512-8zbSN4GW7ZzhjIYl/rWrruGzl1cJiDtAjb8l5XVF2cVme1+aDLVcExw+Ph4gNcfdGg6ZfYPh5kmcpIfh5xHisw==}
     engines: {node: '>=20.19.0'}
     hasBin: true
     peerDependencies:
@@ -2068,10 +2173,6 @@ packages:
     resolution: {integrity: sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==}
     engines: {node: '>= 0.8.0'}
 
-  type-fest@4.2.0:
-    resolution: {integrity: sha512-5zknd7Dss75pMSED270A1RQS3KloqRJA9XbXLe0eCxyw7xXFb3rd+9B0UQ/0E+LQT6lnrLviEolYORlRWamn4w==}
-    engines: {node: '>=16'}
-
   typescript@5.9.3:
     resolution: {integrity: sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==}
     engines: {node: '>=14.17'}
@@ -2098,6 +2199,11 @@ packages:
   unist-util-visit@5.0.0:
     resolution: {integrity: sha512-MR04uvD+07cwl/yhVuVWAtw+3GOR/knlL55Nd/wAdblk27GCVt3lqpTivy/tkJcZoNPzTwS1Y+KMojlLDhoTzg==}
 
+  unrun@0.2.0:
+    resolution: {integrity: sha512-iaCxWG/6kmjP3wUTBheowjFm6LuI8fd/A3Uz7DbMoz8HvQsJThh7tWZKWJfVltOSK3LuIJFzepr7g6fbuhUasw==}
+    engines: {node: '>=20.19.0'}
+    hasBin: true
+
   untyped@2.0.0:
     resolution: {integrity: sha512-nwNCjxJTjNuLCgFr42fEak5OcLuB3ecca+9ksPFNvtfYSLpjf+iJqSIaSnIile6ZPbKYxI5k2AfXqeopGudK/g==}
     hasBin: true
@@ -2114,13 +2220,8 @@ packages:
   util-deprecate@1.0.2:
     resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==}
 
-  vite-node@3.2.4:
-    resolution: {integrity: sha512-EbKSKh+bh1E1IFxeO0pg1n4dvoOTt0UDiXMd/qn++r98+jPO1xtJilvXldeuQ8giIB5IkpjCgMleHMNEsGH6pg==}
-    engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0}
-    hasBin: true
-
-  vite@7.1.11:
-    resolution: {integrity: sha512-uzcxnSDVjAopEUjljkWh8EIrg6tlzrjFUfMcR1EVsRDGwf/ccef0qQPRyOrROwhrTDaApueq+ja+KLPlzR/zdg==}
+  vite@7.1.12:
+    resolution: {integrity: sha512-ZWyE8YXEXqJrrSLvYgrRP7p62OziLW7xI5HYGWFzOvupfAlrLvURSzv/FyGyy0eidogEM3ujU+kUG1zuHgb6Ug==}
     engines: {node: ^20.19.0 || >=22.12.0}
     hasBin: true
     peerDependencies:
@@ -2159,16 +2260,18 @@ packages:
       yaml:
         optional: true
 
-  vitest@3.2.4:
-    resolution: {integrity: sha512-LUCP5ev3GURDysTWiP47wRRUpLKMOfPh+yKTx3kVIEiu5KOMeqzpnYNsKyOoVrULivR8tLcks4+lga33Whn90A==}
-    engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0}
+  vitest@4.0.3:
+    resolution: {integrity: sha512-IUSop8jgaT7w0g1yOM/35qVtKjr/8Va4PrjzH1OUb0YH4c3OXB2lCZDkMAB6glA8T5w8S164oJGsbcmAecr4sA==}
+    engines: {node: ^20.0.0 || ^22.0.0 || >=24.0.0}
     hasBin: true
     peerDependencies:
       '@edge-runtime/vm': '*'
       '@types/debug': ^4.1.12
-      '@types/node': ^18.0.0 || ^20.0.0 || >=22.0.0
-      '@vitest/browser': 3.2.4
-      '@vitest/ui': 3.2.4
+      '@types/node': ^20.0.0 || ^22.0.0 || >=24.0.0
+      '@vitest/browser-playwright': 4.0.3
+      '@vitest/browser-preview': 4.0.3
+      '@vitest/browser-webdriverio': 4.0.3
+      '@vitest/ui': 4.0.3
       happy-dom: '*'
       jsdom: '*'
     peerDependenciesMeta:
@@ -2178,7 +2281,11 @@ packages:
         optional: true
       '@types/node':
         optional: true
-      '@vitest/browser':
+      '@vitest/browser-playwright':
+        optional: true
+      '@vitest/browser-preview':
+        optional: true
+      '@vitest/browser-webdriverio':
         optional: true
       '@vitest/ui':
         optional: true
@@ -2224,12 +2331,51 @@ packages:
     resolution: {integrity: sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==}
     engines: {node: '>=10'}
 
+  zod@4.1.12:
+    resolution: {integrity: sha512-JInaHOamG8pt5+Ey8kGmdcAcg3OL9reK8ltczgHTAwNhMys/6ThXHityHxVV2p3fkw/c+MAvBHFVYHFZDmjMCQ==}
+
   zwitch@2.0.4:
     resolution: {integrity: sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A==}
 
 snapshots:
 
-  '@antfu/eslint-config@6.0.0(@vue/compiler-sfc@3.5.22)(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@3.2.4(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))':
+  '@ai-sdk/anthropic@2.0.37(zod@4.1.12)':
+    dependencies:
+      '@ai-sdk/provider': 2.0.0
+      '@ai-sdk/provider-utils': 3.0.12(zod@4.1.12)
+      zod: 4.1.12
+
+  '@ai-sdk/gateway@2.0.1(zod@4.1.12)':
+    dependencies:
+      '@ai-sdk/provider': 2.0.0
+      '@ai-sdk/provider-utils': 3.0.12(zod@4.1.12)
+      '@vercel/oidc': 3.0.3
+      zod: 4.1.12
+
+  '@ai-sdk/google@2.0.23(zod@4.1.12)':
+    dependencies:
+      '@ai-sdk/provider': 2.0.0
+      '@ai-sdk/provider-utils': 3.0.12(zod@4.1.12)
+      zod: 4.1.12
+
+  '@ai-sdk/openai@2.0.53(zod@4.1.12)':
+    dependencies:
+      '@ai-sdk/provider': 2.0.0
+      '@ai-sdk/provider-utils': 3.0.12(zod@4.1.12)
+      zod: 4.1.12
+
+  '@ai-sdk/provider-utils@3.0.12(zod@4.1.12)':
+    dependencies:
+      '@ai-sdk/provider': 2.0.0
+      '@standard-schema/spec': 1.0.0
+      eventsource-parser: 3.0.6
+      zod: 4.1.12
+
+  '@ai-sdk/provider@2.0.0':
+    dependencies:
+      json-schema: 0.4.0
+
+  '@antfu/eslint-config@6.1.0(@vue/compiler-sfc@3.5.22)(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))':
     dependencies:
       '@antfu/install-pkg': 1.1.0
       '@clack/prompts': 0.11.0
@@ -2238,7 +2384,7 @@ snapshots:
       '@stylistic/eslint-plugin': 5.5.0(eslint@9.38.0(jiti@2.6.1))
       '@typescript-eslint/eslint-plugin': 8.46.2(@typescript-eslint/parser@8.46.2(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3))(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)
       '@typescript-eslint/parser': 8.46.2(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)
-      '@vitest/eslint-plugin': 1.3.23(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@3.2.4(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))
+      '@vitest/eslint-plugin': 1.3.25(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))
       ansis: 4.2.0
       cac: 6.7.14
       eslint: 9.38.0(jiti@2.6.1)
@@ -2248,7 +2394,7 @@ snapshots:
       eslint-plugin-antfu: 3.1.1(eslint@9.38.0(jiti@2.6.1))
       eslint-plugin-command: 3.3.1(eslint@9.38.0(jiti@2.6.1))
       eslint-plugin-import-lite: 0.3.0(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)
-      eslint-plugin-jsdoc: 59.1.0(eslint@9.38.0(jiti@2.6.1))
+      eslint-plugin-jsdoc: 61.1.9(eslint@9.38.0(jiti@2.6.1))
       eslint-plugin-jsonc: 2.21.0(eslint@9.38.0(jiti@2.6.1))
       eslint-plugin-n: 17.23.1(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)
       eslint-plugin-no-only-tests: 3.3.0
@@ -2280,28 +2426,28 @@ snapshots:
       package-manager-detector: 1.5.0
       tinyexec: 1.0.1
 
-  '@babel/generator@7.28.3':
+  '@babel/generator@7.28.5':
     dependencies:
-      '@babel/parser': 7.28.4
-      '@babel/types': 7.28.4
+      '@babel/parser': 7.28.5
+      '@babel/types': 7.28.5
       '@jridgewell/gen-mapping': 0.3.13
       '@jridgewell/trace-mapping': 0.3.31
       jsesc: 3.1.0
 
   '@babel/helper-string-parser@7.27.1': {}
 
-  '@babel/helper-validator-identifier@7.27.1': {}
+  '@babel/helper-validator-identifier@7.28.5': {}
 
-  '@babel/parser@7.28.4':
+  '@babel/parser@7.28.5':
     dependencies:
-      '@babel/types': 7.28.4
+      '@babel/types': 7.28.5
 
   '@babel/runtime@7.28.4': {}
 
-  '@babel/types@7.28.4':
+  '@babel/types@7.28.5':
     dependencies:
       '@babel/helper-string-parser': 7.27.1
-      '@babel/helper-validator-identifier': 7.27.1
+      '@babel/helper-validator-identifier': 7.28.5
 
   '@clack/core@0.5.0':
     dependencies:
@@ -2338,13 +2484,15 @@ snapshots:
       esquery: 1.6.0
       jsdoc-type-pratt-parser: 4.1.0
 
-  '@es-joy/jsdoccomment@0.58.0':
+  '@es-joy/jsdoccomment@0.76.0':
     dependencies:
       '@types/estree': 1.0.8
       '@typescript-eslint/types': 8.46.2
       comment-parser: 1.4.1
       esquery: 1.6.0
-      jsdoc-type-pratt-parser: 5.4.0
+      jsdoc-type-pratt-parser: 6.10.0
+
+  '@es-joy/resolve.exports@1.0.0': {}
 
   '@esbuild/aix-ppc64@0.25.11':
     optional: true
@@ -2505,6 +2653,8 @@ snapshots:
       '@eslint/core': 0.16.0
       levn: 0.4.1
 
+  '@faker-js/faker@10.1.0': {}
+
   '@humanfs/core@0.19.1': {}
 
   '@humanfs/node@0.16.7':
@@ -2549,6 +2699,10 @@ snapshots:
       '@nodelib/fs.scandir': 2.1.5
       fastq: 1.19.1
 
+  '@opentelemetry/api@1.9.0': {}
+
+  '@oxc-project/runtime@0.95.0': {}
+
   '@oxc-project/types@0.95.0': {}
 
   '@parcel/watcher-android-arm64@2.5.1':
@@ -2729,6 +2883,10 @@ snapshots:
   '@rollup/rollup-win32-x64-msvc@4.52.5':
     optional: true
 
+  '@sindresorhus/base62@1.0.0': {}
+
+  '@standard-schema/spec@1.0.0': {}
+
   '@stylistic/eslint-plugin@5.5.0(eslint@9.38.0(jiti@2.6.1))':
     dependencies:
       '@eslint-community/eslint-utils': 4.9.0(eslint@9.38.0(jiti@2.6.1))
@@ -2864,62 +3022,61 @@ snapshots:
       '@typescript-eslint/types': 8.46.2
       eslint-visitor-keys: 4.2.1
 
-  '@vitest/eslint-plugin@1.3.23(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@3.2.4(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))':
+  '@vercel/oidc@3.0.3': {}
+
+  '@vitest/eslint-plugin@1.3.25(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))':
     dependencies:
       '@typescript-eslint/scope-manager': 8.46.2
       '@typescript-eslint/utils': 8.46.2(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)
       eslint: 9.38.0(jiti@2.6.1)
     optionalDependencies:
       typescript: 5.9.3
-      vitest: 3.2.4(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1)
+      vitest: 4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1)
     transitivePeerDependencies:
       - supports-color
 
-  '@vitest/expect@3.2.4':
+  '@vitest/expect@4.0.3':
     dependencies:
+      '@standard-schema/spec': 1.0.0
       '@types/chai': 5.2.3
-      '@vitest/spy': 3.2.4
-      '@vitest/utils': 3.2.4
-      chai: 5.3.3
-      tinyrainbow: 2.0.0
+      '@vitest/spy': 4.0.3
+      '@vitest/utils': 4.0.3
+      chai: 6.2.0
+      tinyrainbow: 3.0.3
 
-  '@vitest/mocker@3.2.4(vite@7.1.11(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))':
+  '@vitest/mocker@4.0.3(vite@7.1.12(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))':
     dependencies:
-      '@vitest/spy': 3.2.4
+      '@vitest/spy': 4.0.3
       estree-walker: 3.0.3
-      magic-string: 0.30.19
+      magic-string: 0.30.21
     optionalDependencies:
-      vite: 7.1.11(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1)
+      vite: 7.1.12(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1)
 
-  '@vitest/pretty-format@3.2.4':
+  '@vitest/pretty-format@4.0.3':
     dependencies:
-      tinyrainbow: 2.0.0
+      tinyrainbow: 3.0.3
 
-  '@vitest/runner@3.2.4':
+  '@vitest/runner@4.0.3':
     dependencies:
-      '@vitest/utils': 3.2.4
-      pathe: 2.0.3
-      strip-literal: 3.1.0
-
-  '@vitest/snapshot@3.2.4':
-    dependencies:
-      '@vitest/pretty-format': 3.2.4
-      magic-string: 0.30.19
+      '@vitest/utils': 4.0.3
       pathe: 2.0.3
 
-  '@vitest/spy@3.2.4':
+  '@vitest/snapshot@4.0.3':
     dependencies:
-      tinyspy: 4.0.4
+      '@vitest/pretty-format': 4.0.3
+      magic-string: 0.30.21
+      pathe: 2.0.3
 
-  '@vitest/utils@3.2.4':
+  '@vitest/spy@4.0.3': {}
+
+  '@vitest/utils@4.0.3':
     dependencies:
-      '@vitest/pretty-format': 3.2.4
-      loupe: 3.2.1
-      tinyrainbow: 2.0.0
+      '@vitest/pretty-format': 4.0.3
+      tinyrainbow: 3.0.3
 
   '@vue/compiler-core@3.5.22':
     dependencies:
-      '@babel/parser': 7.28.4
+      '@babel/parser': 7.28.5
       '@vue/shared': 3.5.22
       entities: 4.5.0
       estree-walker: 2.0.2
@@ -2932,13 +3089,13 @@ snapshots:
 
   '@vue/compiler-sfc@3.5.22':
     dependencies:
-      '@babel/parser': 7.28.4
+      '@babel/parser': 7.28.5
       '@vue/compiler-core': 3.5.22
       '@vue/compiler-dom': 3.5.22
       '@vue/compiler-ssr': 3.5.22
       '@vue/shared': 3.5.22
       estree-walker: 2.0.2
-      magic-string: 0.30.19
+      magic-string: 0.30.21
       postcss: 8.5.6
       source-map-js: 1.2.1
 
@@ -2955,6 +3112,14 @@ snapshots:
 
   acorn@8.15.0: {}
 
+  ai@5.0.80(zod@4.1.12):
+    dependencies:
+      '@ai-sdk/gateway': 2.0.1(zod@4.1.12)
+      '@ai-sdk/provider': 2.0.0
+      '@ai-sdk/provider-utils': 3.0.12(zod@4.1.12)
+      '@opentelemetry/api': 1.9.0
+      zod: 4.1.12
+
   ajv@6.12.6:
     dependencies:
       fast-deep-equal: 3.1.3
@@ -2978,7 +3143,7 @@ snapshots:
 
   ast-kit@2.1.3:
     dependencies:
-      '@babel/parser': 7.28.4
+      '@babel/parser': 7.28.5
       pathe: 2.0.3
 
   automd@0.4.2:
@@ -2990,7 +3155,7 @@ snapshots:
       defu: 6.1.4
       destr: 2.0.5
       didyoumean2: 7.0.4
-      magic-string: 0.30.19
+      magic-string: 0.30.21
       mdbox: 0.1.1
       mlly: 1.8.0
       ofetch: 1.4.1
@@ -3005,7 +3170,7 @@ snapshots:
 
   balanced-match@1.0.2: {}
 
-  baseline-browser-mapping@2.8.19: {}
+  baseline-browser-mapping@2.8.20: {}
 
   birpc@2.6.1: {}
 
@@ -3026,9 +3191,9 @@ snapshots:
 
   browserslist@4.27.0:
     dependencies:
-      baseline-browser-mapping: 2.8.19
+      baseline-browser-mapping: 2.8.20
       caniuse-lite: 1.0.30001751
-      electron-to-chromium: 1.5.238
+      electron-to-chromium: 1.5.240
       node-releases: 2.0.26
       update-browserslist-db: 1.1.4(browserslist@4.27.0)
 
@@ -3073,13 +3238,7 @@ snapshots:
 
   ccount@2.0.1: {}
 
-  chai@5.3.3:
-    dependencies:
-      assertion-error: 2.0.1
-      check-error: 2.1.1
-      deep-eql: 5.0.2
-      loupe: 3.2.1
-      pathval: 2.0.1
+  chai@6.2.0: {}
 
   chalk@4.1.2:
     dependencies:
@@ -3090,8 +3249,6 @@ snapshots:
 
   character-entities@2.0.2: {}
 
-  check-error@2.1.1: {}
-
   chokidar@4.0.3:
     dependencies:
       readdirp: 4.1.2
@@ -3134,6 +3291,8 @@ snapshots:
 
   cssesc@3.0.0: {}
 
+  csv-stringify@6.6.0: {}
+
   debug@4.4.3:
     dependencies:
       ms: 2.1.3
@@ -3142,8 +3301,6 @@ snapshots:
     dependencies:
       character-entities: 2.0.2
 
-  deep-eql@5.0.2: {}
-
   deep-is@0.1.4: {}
 
   defu@6.1.4: {}
@@ -3172,7 +3329,7 @@ snapshots:
 
   dts-resolver@2.1.2: {}
 
-  electron-to-chromium@1.5.238: {}
+  electron-to-chromium@1.5.240: {}
 
   empathic@2.0.0: {}
 
@@ -3275,9 +3432,10 @@ snapshots:
     optionalDependencies:
       typescript: 5.9.3
 
-  eslint-plugin-jsdoc@59.1.0(eslint@9.38.0(jiti@2.6.1)):
+  eslint-plugin-jsdoc@61.1.9(eslint@9.38.0(jiti@2.6.1)):
     dependencies:
-      '@es-joy/jsdoccomment': 0.58.0
+      '@es-joy/jsdoccomment': 0.76.0
+      '@es-joy/resolve.exports': 1.0.0
       are-docs-informative: 0.0.2
       comment-parser: 1.4.1
       debug: 4.4.3
@@ -3285,10 +3443,12 @@ snapshots:
       eslint: 9.38.0(jiti@2.6.1)
       espree: 10.4.0
       esquery: 1.6.0
-      object-deep-merge: 1.0.5
+      html-entities: 2.6.0
+      object-deep-merge: 2.0.0
       parse-imports-exports: 0.2.4
       semver: 7.7.3
       spdx-expression-parse: 4.0.0
+      to-valid-identifier: 1.0.0
     transitivePeerDependencies:
       - supports-color
 
@@ -3367,7 +3527,7 @@ snapshots:
 
   eslint-plugin-unicorn@61.0.2(eslint@9.38.0(jiti@2.6.1)):
     dependencies:
-      '@babel/helper-validator-identifier': 7.27.1
+      '@babel/helper-validator-identifier': 7.28.5
       '@eslint-community/eslint-utils': 4.9.0(eslint@9.38.0(jiti@2.6.1))
       '@eslint/plugin-kit': 0.3.5
       change-case: 5.4.4
@@ -3504,6 +3664,8 @@ snapshots:
 
   esutils@2.0.3: {}
 
+  eventsource-parser@3.0.6: {}
+
   expect-type@1.2.2: {}
 
   exsolve@1.0.7: {}
@@ -3604,6 +3766,8 @@ snapshots:
 
   hookable@5.5.3: {}
 
+  html-entities@2.6.0: {}
+
   ignore@5.3.2: {}
 
   ignore@7.0.5: {}
@@ -3633,8 +3797,6 @@ snapshots:
 
   jiti@2.6.1: {}
 
-  js-tokens@9.0.1: {}
-
   js-yaml@4.1.0:
     dependencies:
       argparse: 2.0.1
@@ -3643,7 +3805,7 @@ snapshots:
 
   jsdoc-type-pratt-parser@4.8.0: {}
 
-  jsdoc-type-pratt-parser@5.4.0: {}
+  jsdoc-type-pratt-parser@6.10.0: {}
 
   jsesc@3.0.2: {}
 
@@ -3653,6 +3815,8 @@ snapshots:
 
   json-schema-traverse@0.4.1: {}
 
+  json-schema@0.4.0: {}
+
   json-stable-stringify-without-jsonify@1.0.1: {}
 
   jsonc-eslint-parser@2.4.1:
@@ -3693,9 +3857,7 @@ snapshots:
 
   longest-streak@3.1.0: {}
 
-  loupe@3.2.1: {}
-
-  magic-string@0.30.19:
+  magic-string@0.30.21:
     dependencies:
       '@jridgewell/sourcemap-codec': 1.5.5
 
@@ -4066,9 +4228,7 @@ snapshots:
       pkg-types: 2.3.0
       tinyexec: 1.0.1
 
-  object-deep-merge@1.0.5:
-    dependencies:
-      type-fest: 4.2.0
+  object-deep-merge@2.0.0: {}
 
   ofetch@1.4.1:
     dependencies:
@@ -4095,6 +4255,8 @@ snapshots:
     dependencies:
       p-limit: 3.1.0
 
+  p-map@7.0.3: {}
+
   package-manager-detector@1.5.0: {}
 
   parent-module@1.0.1:
@@ -4115,8 +4277,6 @@ snapshots:
 
   pathe@2.0.3: {}
 
-  pathval@2.0.1: {}
-
   perfect-debounce@2.0.0: {}
 
   picocolors@1.1.1: {}
@@ -4184,23 +4344,25 @@ snapshots:
     dependencies:
       jsesc: 3.0.2
 
+  reserved-identifiers@1.2.0: {}
+
   resolve-from@4.0.0: {}
 
   resolve-pkg-maps@1.0.0: {}
 
   reusify@1.1.0: {}
 
-  rolldown-plugin-dts@0.16.12(rolldown@1.0.0-beta.44)(typescript@5.9.3):
+  rolldown-plugin-dts@0.17.1(rolldown@1.0.0-beta.44)(typescript@5.9.3):
     dependencies:
-      '@babel/generator': 7.28.3
-      '@babel/parser': 7.28.4
-      '@babel/types': 7.28.4
+      '@babel/generator': 7.28.5
+      '@babel/parser': 7.28.5
+      '@babel/types': 7.28.5
       ast-kit: 2.1.3
       birpc: 2.6.1
       debug: 4.4.3
       dts-resolver: 2.1.2
       get-tsconfig: 4.13.0
-      magic-string: 0.30.19
+      magic-string: 0.30.21
       rolldown: 1.0.0-beta.44
     optionalDependencies:
       typescript: 5.9.3
@@ -4299,10 +4461,6 @@ snapshots:
 
   strip-json-comments@3.1.1: {}
 
-  strip-literal@3.1.0:
-    dependencies:
-      js-tokens: 9.0.1
-
   supports-color@7.2.0:
     dependencies:
       has-flag: 4.0.0
@@ -4324,16 +4482,17 @@ snapshots:
       fdir: 6.5.0(picomatch@4.0.3)
       picomatch: 4.0.3
 
-  tinypool@1.1.1: {}
-
-  tinyrainbow@2.0.0: {}
-
-  tinyspy@4.0.4: {}
+  tinyrainbow@3.0.3: {}
 
   to-regex-range@5.0.1:
     dependencies:
       is-number: 7.0.0
 
+  to-valid-identifier@1.0.0:
+    dependencies:
+      '@sindresorhus/base62': 1.0.0
+      reserved-identifiers: 1.2.0
+
   toml-eslint-parser@0.10.0:
     dependencies:
       eslint-visitor-keys: 3.4.3
@@ -4349,7 +4508,7 @@ snapshots:
       picomatch: 4.0.3
       typescript: 5.9.3
 
-  tsdown@0.15.9(typescript@5.9.3):
+  tsdown@0.15.10(typescript@5.9.3):
     dependencies:
       ansis: 4.2.0
       cac: 6.7.14
@@ -4359,12 +4518,13 @@ snapshots:
       empathic: 2.0.0
       hookable: 5.5.3
       rolldown: 1.0.0-beta.44
-      rolldown-plugin-dts: 0.16.12(rolldown@1.0.0-beta.44)(typescript@5.9.3)
+      rolldown-plugin-dts: 0.17.1(rolldown@1.0.0-beta.44)(typescript@5.9.3)
       semver: 7.7.3
       tinyexec: 1.0.1
       tinyglobby: 0.2.15
       tree-kill: 1.2.2
       unconfig: 7.3.3
+      unrun: 0.2.0
     optionalDependencies:
       typescript: 5.9.3
     transitivePeerDependencies:
@@ -4388,8 +4548,6 @@ snapshots:
     dependencies:
       prelude-ls: 1.2.1
 
-  type-fest@4.2.0: {}
-
   typescript@5.9.3: {}
 
   ufo@1.6.1: {}
@@ -4422,6 +4580,12 @@ snapshots:
       unist-util-is: 6.0.1
       unist-util-visit-parents: 6.0.2
 
+  unrun@0.2.0:
+    dependencies:
+      '@oxc-project/runtime': 0.95.0
+      rolldown: 1.0.0-beta.44
+      synckit: 0.11.11
+
   untyped@2.0.0:
     dependencies:
       citty: 0.1.6
@@ -4442,28 +4606,7 @@ snapshots:
 
   util-deprecate@1.0.2: {}
 
-  vite-node@3.2.4(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1):
-    dependencies:
-      cac: 6.7.14
-      debug: 4.4.3
-      es-module-lexer: 1.7.0
-      pathe: 2.0.3
-      vite: 7.1.11(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1)
-    transitivePeerDependencies:
-      - '@types/node'
-      - jiti
-      - less
-      - lightningcss
-      - sass
-      - sass-embedded
-      - stylus
-      - sugarss
-      - supports-color
-      - terser
-      - tsx
-      - yaml
-
-  vite@7.1.11(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1):
+  vite@7.1.12(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1):
     dependencies:
       esbuild: 0.25.11
       fdir: 6.5.0(picomatch@4.0.3)
@@ -4478,30 +4621,27 @@ snapshots:
       tsx: 4.20.6
       yaml: 2.8.1
 
-  vitest@3.2.4(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1):
+  vitest@4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1):
     dependencies:
-      '@types/chai': 5.2.3
-      '@vitest/expect': 3.2.4
-      '@vitest/mocker': 3.2.4(vite@7.1.11(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))
-      '@vitest/pretty-format': 3.2.4
-      '@vitest/runner': 3.2.4
-      '@vitest/snapshot': 3.2.4
-      '@vitest/spy': 3.2.4
-      '@vitest/utils': 3.2.4
-      chai: 5.3.3
+      '@vitest/expect': 4.0.3
+      '@vitest/mocker': 4.0.3(vite@7.1.12(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))
+      '@vitest/pretty-format': 4.0.3
+      '@vitest/runner': 4.0.3
+      '@vitest/snapshot': 4.0.3
+      '@vitest/spy': 4.0.3
+      '@vitest/utils': 4.0.3
       debug: 4.4.3
+      es-module-lexer: 1.7.0
       expect-type: 1.2.2
-      magic-string: 0.30.19
+      magic-string: 0.30.21
       pathe: 2.0.3
       picomatch: 4.0.3
       std-env: 3.10.0
       tinybench: 2.9.0
       tinyexec: 0.3.2
       tinyglobby: 0.2.15
-      tinypool: 1.1.1
-      tinyrainbow: 2.0.0
-      vite: 7.1.11(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1)
-      vite-node: 3.2.4(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1)
+      tinyrainbow: 3.0.3
+      vite: 7.1.12(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1)
       why-is-node-running: 2.3.0
     optionalDependencies:
       '@types/debug': 4.1.12
@@ -4554,4 +4694,6 @@ snapshots:
 
   yocto-queue@0.1.0: {}
 
+  zod@4.1.12: {}
+
   zwitch@2.0.4: {}
diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml
new file mode 100644
index 0000000..76137e2
--- /dev/null
+++ b/pnpm-workspace.yaml
@@ -0,0 +1,2 @@
+packages:
+  - benchmarks
diff --git a/scripts/generate-bench.ts b/scripts/generate-bench.ts
deleted file mode 100644
index a4d8950..0000000
--- a/scripts/generate-bench.ts
+++ /dev/null
@@ -1,213 +0,0 @@
-import * as fsp from 'node:fs/promises'
-import * as path from 'node:path'
-import * as url from 'node:url'
-import { encode } from 'gpt-tokenizer' // o200k_base encoding (default)
-import { encode as encodeToon } from '../src/index'
-
-interface BenchmarkResult {
-  name: string
-  emoji: string
-  jsonTokens: number
-  toonTokens: number
-  savings: number
-  savingsPercent: string
-}
-
-const rootDir = url.fileURLToPath(new URL('../', import.meta.url))
-const benchPath = path.join(rootDir, 'docs', 'benchmarks.md')
-
-const BENCHMARK_EXAMPLES = [
-  {
-    name: 'Simple user object',
-    emoji: '👤',
-    data: {
-      id: 123,
-      name: 'Alice',
-      email: 'alice@example.com',
-      active: true,
-    },
-  },
-  {
-    name: 'User with tags',
-    emoji: '🏷️',
-    data: {
-      user: {
-        id: 123,
-        name: 'Ada',
-        tags: ['reading', 'gaming', 'coding'],
-        active: true,
-      },
-    },
-  },
-  {
-    name: 'Small product catalog',
-    emoji: '📦',
-    data: {
-      items: [
-        { sku: 'A1', name: 'Widget', qty: 2, price: 9.99 },
-        { sku: 'B2', name: 'Gadget', qty: 1, price: 14.5 },
-        { sku: 'C3', name: 'Doohickey', qty: 5, price: 7.25 },
-      ],
-    },
-  },
-  {
-    name: 'API response with users',
-    emoji: '👥',
-    data: {
-      users: [
-        { id: 1, name: 'Alice', email: 'alice@example.com', active: true },
-        { id: 2, name: 'Bob', email: 'bob@example.com', active: true },
-        { id: 3, name: 'Charlie', email: 'charlie@example.com', active: false },
-      ],
-      total: 3,
-      page: 1,
-    },
-  },
-  {
-    name: 'Nested configuration',
-    emoji: '⚙️',
-    data: {
-      database: {
-        host: 'localhost',
-        port: 5432,
-        credentials: {
-          username: 'dbuser',
-          password: 'secret123',
-        },
-      },
-      cache: {
-        enabled: true,
-        ttl: 3600,
-      },
-    },
-  },
-  {
-    name: 'E-commerce order',
-    emoji: '🛒',
-    data: {
-      orderId: 'ORD-2025-001',
-      customer: {
-        id: 456,
-        name: 'Jane Doe',
-        email: 'jane@example.com',
-      },
-      items: [
-        { sku: 'PROD-A', name: 'Premium Widget', quantity: 2, price: 29.99 },
-        { sku: 'PROD-B', name: 'Deluxe Gadget', quantity: 1, price: 49.99 },
-      ],
-      subtotal: 109.97,
-      tax: 10.99,
-      total: 120.96,
-      status: 'pending',
-    },
-  },
-  {
-    name: 'Analytics data',
-    emoji: '📊',
-    data: {
-      metrics: [
-        { date: '2025-01-01', views: 1234, clicks: 89, conversions: 12 },
-        { date: '2025-01-02', views: 2345, clicks: 156, conversions: 23 },
-        { date: '2025-01-03', views: 1890, clicks: 123, conversions: 18 },
-        { date: '2025-01-04', views: 3456, clicks: 234, conversions: 34 },
-        { date: '2025-01-05', views: 2789, clicks: 178, conversions: 27 },
-      ],
-    },
-  },
-  {
-    name: 'Large dataset (50 records)',
-    emoji: '📈',
-    data: {
-      records: Array.from({ length: 50 }, (_, i) => ({
-        id: i + 1,
-        name: `User ${i + 1}`,
-        email: `user${i + 1}@example.com`,
-        score: (i * 7) % 100,
-        active: i % 3 !== 0,
-      })),
-    },
-  },
-] as const
-
-const DETAILED_EXAMPLE_INDICES = [2, 3, 6] // Small product catalog, API response, Analytics data
-
-// Calculate total savings
-let totalJsonTokens = 0
-let totalToonTokens = 0
-
-const results: BenchmarkResult[] = []
-
-for (const example of BENCHMARK_EXAMPLES) {
-  const jsonString = JSON.stringify(example.data, null, 2)
-  const toonString = encodeToon(example.data)
-
-  const jsonTokens = encode(jsonString).length
-  const toonTokens = encode(toonString).length
-  const savings = jsonTokens - toonTokens
-  const savingsPercent = ((savings / jsonTokens) * 100).toFixed(1)
-
-  totalJsonTokens += jsonTokens
-  totalToonTokens += toonTokens
-
-  results.push({
-    name: example.name,
-    emoji: example.emoji,
-    jsonTokens,
-    toonTokens,
-    savings,
-    savingsPercent,
-  })
-}
-
-const totalSavings = totalJsonTokens - totalToonTokens
-const totalSavingsPercent = ((totalSavings / totalJsonTokens) * 100).toFixed(1)
-
-// Generate markdown content matching README style
-const summaryRows = results
-  .map(result => `| ${result.emoji} ${result.name} | ${result.jsonTokens} | ${result.toonTokens} | ${result.savings} | **${result.savingsPercent}%** |`)
-  .join('\n')
-
-const detailedExamples = DETAILED_EXAMPLE_INDICES
-  .map((exampleIndex, i) => {
-    const example = BENCHMARK_EXAMPLES[exampleIndex]!
-    const result = results[exampleIndex]!
-    const separator = i < DETAILED_EXAMPLE_INDICES.length - 1 ? '\n\n---' : ''
-
-    return `### ${result.emoji} ${result.name}
-
-**Savings: ${result.savings} tokens (${result.savingsPercent}% reduction)**
-
-**JSON** (${result.jsonTokens} tokens):
-
-\`\`\`json
-${JSON.stringify(example.data, null, 2)}
-\`\`\`
-
-**TOON** (${result.toonTokens} tokens):
-
-\`\`\`
-${encodeToon(example.data)}
-\`\`\`${separator}`
-  })
-  .join('\n\n')
-
-const markdown = `
-| Example | JSON | TOON | Tokens Saved | Reduction |
-| ------- | ---- | ---- | ------------ | --------- |
-${summaryRows}
-| **Total** | **${totalJsonTokens}** | **${totalToonTokens}** | **${totalSavings}** | **${totalSavingsPercent}%** |
-
-<details>
-<summary><strong>View detailed results</strong></summary>
-
-${detailedExamples}
-
-</details>
-`.trimStart()
-
-console.log(markdown)
-
-await fsp.mkdir(path.join(rootDir, 'docs'), { recursive: true })
-await fsp.writeFile(benchPath, markdown, 'utf-8')
-
-console.log(`✅ Benchmark written to ${benchPath}`)