From 3c840259feb12a0f241d642769cb60e15a820209 Mon Sep 17 00:00:00 2001 From: Johann Schopplich Date: Mon, 27 Oct 2025 11:48:33 +0100 Subject: [PATCH] test: add LLM retrieval accuracy tests --- .gitignore | 1 + README.md | 301 +- benchmarks/.env.example | 3 + benchmarks/data/github-repos.json | 1302 ++ benchmarks/package.json | 26 + benchmarks/results/accuracy/accuracy.md | 96 + benchmarks/results/accuracy/raw-results.json | 17492 ++++++++++++++++ benchmarks/results/accuracy/report.md | 96 + benchmarks/results/accuracy/summary.json | 95 + benchmarks/results/token-efficiency.md | 141 + benchmarks/scripts/accuracy-benchmark.ts | 140 + benchmarks/scripts/fetch-github-data.ts | 78 + .../scripts/token-efficiency-benchmark.ts | 228 + benchmarks/src/constants.ts | 39 + benchmarks/src/datasets.ts | 146 + benchmarks/src/evaluate.ts | 133 + benchmarks/src/formatters.ts | 90 + benchmarks/src/questions.ts | 398 + benchmarks/src/report.ts | 288 + benchmarks/src/types.ts | 35 + docs/benchmarks.md | 158 - package.json | 8 +- pnpm-lock.yaml | 618 +- pnpm-workspace.yaml | 2 + scripts/generate-bench.ts | 213 - 25 files changed, 21404 insertions(+), 723 deletions(-) create mode 100644 benchmarks/.env.example create mode 100644 benchmarks/data/github-repos.json create mode 100644 benchmarks/package.json create mode 100644 benchmarks/results/accuracy/accuracy.md create mode 100644 benchmarks/results/accuracy/raw-results.json create mode 100644 benchmarks/results/accuracy/report.md create mode 100644 benchmarks/results/accuracy/summary.json create mode 100644 benchmarks/results/token-efficiency.md create mode 100644 benchmarks/scripts/accuracy-benchmark.ts create mode 100644 benchmarks/scripts/fetch-github-data.ts create mode 100644 benchmarks/scripts/token-efficiency-benchmark.ts create mode 100644 benchmarks/src/constants.ts create mode 100644 benchmarks/src/datasets.ts create mode 100644 benchmarks/src/evaluate.ts create mode 100644 benchmarks/src/formatters.ts create mode 100644 
benchmarks/src/questions.ts create mode 100644 benchmarks/src/report.ts create mode 100644 benchmarks/src/types.ts delete mode 100644 docs/benchmarks.md create mode 100644 pnpm-workspace.yaml delete mode 100644 scripts/generate-bench.ts diff --git a/.gitignore b/.gitignore index b186605..f73f2b4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ dist node_modules .DS_Store +.env diff --git a/README.md b/README.md index 546493c..271be53 100644 --- a/README.md +++ b/README.md @@ -42,168 +42,148 @@ users[2]{id,name,role}: - ๐Ÿ“ **Indentation-based structure:** replaces braces with whitespace for better readability - ๐Ÿงบ **Tabular arrays:** declare keys once, then stream rows without repetition -## Token Benchmarks +## Benchmarks -> [!NOTE] -> Benchmarks for LLM accuracy and retrieval are currently in development. + - +### Token Efficiency -| Example | JSON | TOON | Tokens Saved | Reduction | -| ------- | ---- | ---- | ------------ | --------- | -| ๐Ÿ‘ค Simple user object | 31 | 18 | 13 | **41.9%** | -| ๐Ÿท๏ธ User with tags | 48 | 28 | 20 | **41.7%** | -| ๐Ÿ“ฆ Small product catalog | 117 | 49 | 68 | **58.1%** | -| ๐Ÿ‘ฅ API response with users | 123 | 53 | 70 | **56.9%** | -| โš™๏ธ Nested configuration | 68 | 42 | 26 | **38.2%** | -| ๐Ÿ›’ E-commerce order | 163 | 94 | 69 | **42.3%** | -| ๐Ÿ“Š Analytics data | 209 | 94 | 115 | **55.0%** | -| ๐Ÿ“ˆ Large dataset (50 records) | 2159 | 762 | 1397 | **64.7%** | -| **Total** | **2918** | **1140** | **1778** | **60.9%** | +``` +โญ GitHub Repositories โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ 8,745 tokens (JSON: 15,145) ๐Ÿ’ฐ 42.3% saved +๐Ÿ“ˆ Analytics Time Series โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ 3,631 tokens (JSON: 9,024) ๐Ÿ’ฐ 59.8% saved +๐Ÿ‘ฅ API Response โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ 2,593 tokens (JSON: 4,589) ๐Ÿ’ฐ 43.5% saved +๐Ÿ›’ E-commerce Order 
███████████████░░░░░░░░░░ 203 tokens (JSON: 338) 💰 39.9% saved +``` + +**Total:** 15,172 tokens (TOON) vs 29,096 tokens (JSON) → 47.9% savings
-View detailed results +View detailed examples -### ๐Ÿ“ฆ Small product catalog +#### โญ GitHub Repositories -**Savings: 68 tokens (58.1% reduction)** +**Configuration:** Top 100 GitHub repositories with stars, forks, and metadata -**JSON** (117 tokens): +**Savings:** 6,400 tokens (42.3% reduction) + +**JSON** (15,145 tokens): ```json { - "items": [ + "repositories": [ { - "sku": "A1", - "name": "Widget", - "qty": 2, - "price": 9.99 + "id": 28457823, + "name": "freeCodeCamp", + "repo": "freeCodeCamp/freeCodeCamp", + "description": "freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,...", + "createdAt": "2014-12-24T17:49:19Z", + "updatedAt": "2025-10-27T07:40:58Z", + "pushedAt": "2025-10-26T11:31:08Z", + "stars": 430828, + "watchers": 8582, + "forks": 42136, + "defaultBranch": "main" }, { - "sku": "B2", - "name": "Gadget", - "qty": 1, - "price": 14.5 + "id": 132750724, + "name": "build-your-own-x", + "repo": "codecrafters-io/build-your-own-x", + "description": "Master programming by recreating your favorite technologies from scratch.", + "createdAt": "2018-05-09T12:03:18Z", + "updatedAt": "2025-10-27T07:43:25Z", + "pushedAt": "2025-10-10T18:45:01Z", + "stars": 430102, + "watchers": 6322, + "forks": 40388, + "defaultBranch": "master" }, { - "sku": "C3", - "name": "Doohickey", - "qty": 5, - "price": 7.25 + "id": 21737465, + "name": "awesome", + "repo": "sindresorhus/awesome", + "description": "๐Ÿ˜Ž Awesome lists about all kinds of interesting topics", + "createdAt": "2014-07-11T13:42:37Z", + "updatedAt": "2025-10-27T07:44:27Z", + "pushedAt": "2025-10-23T17:26:53Z", + "stars": 409760, + "watchers": 8016, + "forks": 32015, + "defaultBranch": "main" } ] } ``` -**TOON** (49 tokens): +**TOON** (8,745 tokens): ``` -items[3]{sku,name,qty,price}: - A1,Widget,2,9.99 - B2,Gadget,1,14.5 - C3,Doohickey,5,7.25 +repositories[3]{id,name,repo,description,createdAt,updatedAt,pushedAt,stars,watchers,forks,defaultBranch}: + 
28457823,freeCodeCamp,freeCodeCamp/freeCodeCamp,"freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,...","2014-12-24T17:49:19Z","2025-10-27T07:40:58Z","2025-10-26T11:31:08Z",430828,8582,42136,main + 132750724,build-your-own-x,codecrafters-io/build-your-own-x,Master programming by recreating your favorite technologies from scratch.,"2018-05-09T12:03:18Z","2025-10-27T07:43:25Z","2025-10-10T18:45:01Z",430102,6322,40388,master + 21737465,awesome,sindresorhus/awesome,๐Ÿ˜Ž Awesome lists about all kinds of interesting topics,"2014-07-11T13:42:37Z","2025-10-27T07:44:27Z","2025-10-23T17:26:53Z",409760,8016,32015,main ``` --- -### ๐Ÿ‘ฅ API response with users +#### ๐Ÿ“ˆ Analytics Time Series -**Savings: 70 tokens (56.9% reduction)** +**Configuration:** 180 days of web metrics (views, clicks, conversions, revenue) -**JSON** (123 tokens): +**Savings:** 5,393 tokens (59.8% reduction) -```json -{ - "users": [ - { - "id": 1, - "name": "Alice", - "email": "alice@example.com", - "active": true - }, - { - "id": 2, - "name": "Bob", - "email": "bob@example.com", - "active": true - }, - { - "id": 3, - "name": "Charlie", - "email": "charlie@example.com", - "active": false - } - ], - "total": 3, - "page": 1 -} -``` - -**TOON** (53 tokens): - -``` -users[3]{id,name,email,active}: - 1,Alice,alice@example.com,true - 2,Bob,bob@example.com,true - 3,Charlie,charlie@example.com,false -total: 3 -page: 1 -``` - ---- - -### ๐Ÿ“Š Analytics data - -**Savings: 115 tokens (55.0% reduction)** - -**JSON** (209 tokens): +**JSON** (9,024 tokens): ```json { "metrics": [ + { + "date": "2024-12-31", + "views": 3769, + "clicks": 400, + "conversions": 59, + "revenue": 198.98 + }, { "date": "2025-01-01", - "views": 1234, - "clicks": 89, - "conversions": 12 + "views": 5742, + "clicks": 463, + "conversions": 28, + "revenue": 295.77 }, { "date": "2025-01-02", - "views": 2345, - "clicks": 156, - "conversions": 23 + "views": 3669, + "clicks": 336, + "conversions": 102, + "revenue": 
624.23 }, { "date": "2025-01-03", - "views": 1890, - "clicks": 123, - "conversions": 18 + "views": 1332, + "clicks": 304, + "conversions": 99, + "revenue": 113.06 }, { "date": "2025-01-04", - "views": 3456, - "clicks": 234, - "conversions": 34 - }, - { - "date": "2025-01-05", - "views": 2789, - "clicks": 178, - "conversions": 27 + "views": 1444, + "clicks": 222, + "conversions": 88, + "revenue": 986.69 } ] } ``` -**TOON** (94 tokens): +**TOON** (3,631 tokens): ``` -metrics[5]{date,views,clicks,conversions}: - 2025-01-01,1234,89,12 - 2025-01-02,2345,156,23 - 2025-01-03,1890,123,18 - 2025-01-04,3456,234,34 - 2025-01-05,2789,178,27 +metrics[5]{date,views,clicks,conversions,revenue}: + 2024-12-31,3769,400,59,198.98 + 2025-01-01,5742,463,28,295.77 + 2025-01-02,3669,336,102,624.23 + 2025-01-03,1332,304,99,113.06 + 2025-01-04,1444,222,88,986.69 ```
@@ -213,6 +193,107 @@ metrics[5]{date,views,clicks,conversions}: > [!NOTE] > Measured with [`gpt-tokenizer`](https://github.com/niieani/gpt-tokenizer) using `o200k_base` encoding (used by GPT-5 and other modern models). Savings will vary across models and tokenizers. + + +### Retrieval Accuracy + +Tested across **2 LLMs** with data retrieval tasks: + +``` +gpt-4o-mini ██████████████░░░░░░ 72.3% accuracy +claude-haiku-4-5 ███████████████░░░░░ 76.7% accuracy +``` + +**TOON achieves 73.9% accuracy (vs JSON's 73.6%) while using 46.3% fewer tokens.** + +| Format | Accuracy | Average Tokens | +| ------ | -------- | -------------- | +| `toon` | 73.9% | 4,678 | +| `json` | 73.6% | 8,713 | +| `markdown-kv` | 73.6% | 8,649 | +| `csv` | 72.3% | 4,745 | +| `yaml` | 71.7% | 7,091 | + +
+View detailed breakdown by dataset and model + +#### Performance by Dataset + +##### Uniform employee records (TOON optimal format) + +| Format | Accuracy | Tokens | Correct/Total | +|--------|----------|--------|---------------| +| `toon` | 72.4% | 2,483 | 84/116 | +| `csv` | 69.0% | 2,337 | 80/116 | +| `yaml` | 68.1% | 4,969 | 79/116 | +| `markdown-kv` | 68.1% | 6,270 | 79/116 | +| `json` | 68.1% | 6,347 | 79/116 | + +##### E-commerce orders with nested structures + +| Format | Accuracy | Tokens | Correct/Total | +|--------|----------|--------|---------------| +| `toon` | 84.1% | 5,967 | 74/88 | +| `csv` | 83.0% | 6,735 | 73/88 | +| `yaml` | 81.8% | 7,328 | 72/88 | +| `markdown-kv` | 86.4% | 9,110 | 76/88 | +| `json` | 84.1% | 9,694 | 74/88 | + +##### Time-series analytics data + +| Format | Accuracy | Tokens | Correct/Total | +|--------|----------|--------|---------------| +| `csv` | 72.4% | 1,393 | 42/58 | +| `toon` | 70.7% | 1,515 | 41/58 | +| `yaml` | 72.4% | 2,938 | 42/58 | +| `json` | 74.1% | 3,665 | 43/58 | +| `markdown-kv` | 70.7% | 3,779 | 41/58 | + +##### Popular GitHub repositories + +| Format | Accuracy | Tokens | Correct/Total | +|--------|----------|--------|---------------| +| `toon` | 64.3% | 8,745 | 36/56 | +| `csv` | 62.5% | 8,513 | 35/56 | +| `json` | 67.9% | 15,145 | 38/56 | +| `markdown-kv` | 67.9% | 15,436 | 38/56 | +| `yaml` | 62.5% | 13,129 | 35/56 | + + +#### Performance by Model + +##### gpt-4o-mini + +| Format | Accuracy | Correct/Total | +|--------|----------|---------------| +| `toon` | 72.3% | 115/159 | +| `json` | 71.7% | 114/159 | +| `markdown-kv` | 70.4% | 112/159 | +| `csv` | 69.2% | 110/159 | +| `yaml` | 68.6% | 109/159 | + +##### claude-haiku-4-5 + +| Format | Accuracy | Correct/Total | +|--------|----------|---------------| +| `markdown-kv` | 76.7% | 122/159 | +| `toon` | 75.5% | 120/159 | +| `json` | 75.5% | 120/159 | +| `csv` | 75.5% | 120/159 | +| `yaml` | 74.8% | 119/159 | + + +#### Methodology + +- **Semantic 
validation**: LLM-as-judge validates responses semantically (not exact string matching). +- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding. +- **Question types**: Field retrieval, aggregation, and filtering tasks. +- **Datasets**: Synthetic Faker.js-generated datasets plus real GitHub repository data. + +
+ + + ## Installation ```bash diff --git a/benchmarks/.env.example b/benchmarks/.env.example new file mode 100644 index 0000000..df70883 --- /dev/null +++ b/benchmarks/.env.example @@ -0,0 +1,3 @@ +OPENAI_API_KEY= +ANTHROPIC_API_KEY= +GOOGLE_GENERATIVE_AI_API_KEY= diff --git a/benchmarks/data/github-repos.json b/benchmarks/data/github-repos.json new file mode 100644 index 0000000..b7ed072 --- /dev/null +++ b/benchmarks/data/github-repos.json @@ -0,0 +1,1302 @@ +[ + { + "id": 28457823, + "name": "freeCodeCamp", + "repo": "freeCodeCamp/freeCodeCamp", + "description": "freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming, and computer science for free.", + "createdAt": "2014-12-24T17:49:19Z", + "updatedAt": "2025-10-27T07:40:58Z", + "pushedAt": "2025-10-26T11:31:08Z", + "stars": 430828, + "watchers": 8582, + "forks": 42136, + "defaultBranch": "main" + }, + { + "id": 132750724, + "name": "build-your-own-x", + "repo": "codecrafters-io/build-your-own-x", + "description": "Master programming by recreating your favorite technologies from scratch.", + "createdAt": "2018-05-09T12:03:18Z", + "updatedAt": "2025-10-27T07:43:25Z", + "pushedAt": "2025-10-10T18:45:01Z", + "stars": 430102, + "watchers": 6322, + "forks": 40388, + "defaultBranch": "master" + }, + { + "id": 21737465, + "name": "awesome", + "repo": "sindresorhus/awesome", + "description": "๐Ÿ˜Ž Awesome lists about all kinds of interesting topics", + "createdAt": "2014-07-11T13:42:37Z", + "updatedAt": "2025-10-27T07:44:27Z", + "pushedAt": "2025-10-23T17:26:53Z", + "stars": 409760, + "watchers": 8016, + "forks": 32015, + "defaultBranch": "main" + }, + { + "id": 13491895, + "name": "free-programming-books", + "repo": "EbookFoundation/free-programming-books", + "description": ":books: Freely available programming books", + "createdAt": "2013-10-11T06:50:37Z", + "updatedAt": "2025-10-27T07:36:14Z", + "pushedAt": "2025-10-26T23:24:34Z", + "stars": 375134, + "watchers": 9788, + "forks": 65149, + 
"defaultBranch": "main" + }, + { + "id": 54346799, + "name": "public-apis", + "repo": "public-apis/public-apis", + "description": "A collective list of free APIs", + "createdAt": "2016-03-20T23:49:42Z", + "updatedAt": "2025-10-27T07:45:41Z", + "pushedAt": "2025-05-20T15:56:34Z", + "stars": 373288, + "watchers": 4392, + "forks": 39386, + "defaultBranch": "master" + }, + { + "id": 85077558, + "name": "developer-roadmap", + "repo": "kamranahmedse/developer-roadmap", + "description": "Interactive roadmaps, guides and other educational content to help developers grow in their careers.", + "createdAt": "2017-03-15T13:45:52Z", + "updatedAt": "2025-10-27T07:26:36Z", + "pushedAt": "2025-10-24T10:20:46Z", + "stars": 342038, + "watchers": 6887, + "forks": 43222, + "defaultBranch": "master" + }, + { + "id": 60493101, + "name": "coding-interview-university", + "repo": "jwasham/coding-interview-university", + "description": "A complete computer science study plan to become a software engineer.", + "createdAt": "2016-06-06T02:34:12Z", + "updatedAt": "2025-10-27T07:46:31Z", + "pushedAt": "2025-08-28T14:42:47Z", + "stars": 331885, + "watchers": 8512, + "forks": 81046, + "defaultBranch": "main" + }, + { + "id": 83222441, + "name": "system-design-primer", + "repo": "donnemartin/system-design-primer", + "description": "Learn how to design large-scale systems. Prep for the system design interview. Includes Anki flashcards.", + "createdAt": "2017-02-26T16:15:28Z", + "updatedAt": "2025-10-27T07:38:55Z", + "pushedAt": "2025-05-21T11:13:33Z", + "stars": 324162, + "watchers": 6818, + "forks": 52866, + "defaultBranch": "master" + }, + { + "id": 177736533, + "name": "996.ICU", + "repo": "996icu/996.ICU", + "description": "Repo for counting stars and contributing. 
Press F to pay respect to glorious developers.", + "createdAt": "2019-03-26T07:31:14Z", + "updatedAt": "2025-10-27T07:35:11Z", + "pushedAt": "2025-08-22T06:01:29Z", + "stars": 274700, + "watchers": 4217, + "forks": 21033, + "defaultBranch": "master" + }, + { + "id": 21289110, + "name": "awesome-python", + "repo": "vinta/awesome-python", + "description": "An opinionated list of awesome Python frameworks, libraries, software and resources.", + "createdAt": "2014-06-27T21:00:06Z", + "updatedAt": "2025-10-27T07:40:04Z", + "pushedAt": "2025-10-16T13:40:58Z", + "stars": 266460, + "watchers": 6127, + "forks": 26579, + "defaultBranch": "master" + }, + { + "id": 36633370, + "name": "awesome-selfhosted", + "repo": "awesome-selfhosted/awesome-selfhosted", + "description": "A list of Free Software network services and web applications which can be hosted on your own servers", + "createdAt": "2015-06-01T02:33:17Z", + "updatedAt": "2025-10-27T07:43:02Z", + "pushedAt": "2025-10-23T10:47:33Z", + "stars": 254916, + "watchers": 2995, + "forks": 11798, + "defaultBranch": "master" + }, + { + "id": 88011908, + "name": "project-based-learning", + "repo": "practical-tutorials/project-based-learning", + "description": "Curated list of project-based tutorials", + "createdAt": "2017-04-12T05:07:46Z", + "updatedAt": "2025-10-27T07:45:41Z", + "pushedAt": "2024-08-15T05:33:54Z", + "stars": 247930, + "watchers": 3445, + "forks": 32413, + "defaultBranch": "master" + }, + { + "id": 10270250, + "name": "react", + "repo": "facebook/react", + "description": "The library for web and native user interfaces.", + "createdAt": "2013-05-24T16:15:54Z", + "updatedAt": "2025-10-27T06:47:16Z", + "pushedAt": "2025-10-24T22:08:43Z", + "stars": 240059, + "watchers": 6687, + "forks": 49664, + "defaultBranch": "main" + }, + { + "id": 63476337, + "name": "Python", + "repo": "TheAlgorithms/Python", + "description": "All Algorithms implemented in Python", + "createdAt": "2016-07-16T09:44:01Z", + "updatedAt": 
"2025-10-27T07:26:23Z", + "pushedAt": "2025-10-20T00:59:36Z", + "stars": 212044, + "watchers": 5975, + "forks": 48986, + "defaultBranch": "master" + }, + { + "id": 11730342, + "name": "vue", + "repo": "vuejs/vue", + "description": "This is the repo for Vue 2. For Vue 3, go to https://github.com/vuejs/core", + "createdAt": "2013-07-29T03:24:51Z", + "updatedAt": "2025-10-27T05:37:40Z", + "pushedAt": "2024-10-10T07:24:15Z", + "stars": 209624, + "watchers": 5787, + "forks": 33796, + "defaultBranch": "main" + }, + { + "id": 2325298, + "name": "linux", + "repo": "torvalds/linux", + "description": "Linux kernel source tree", + "createdAt": "2011-09-04T22:48:12Z", + "updatedAt": "2025-10-27T07:25:34Z", + "pushedAt": "2025-10-26T23:00:24Z", + "stars": 205761, + "watchers": 7739, + "forks": 58023, + "defaultBranch": "master" + }, + { + "id": 19415064, + "name": "computer-science", + "repo": "ossu/computer-science", + "description": "๐ŸŽ“ Path to a free self-taught education in Computer Science!", + "createdAt": "2014-05-04T00:18:39Z", + "updatedAt": "2025-10-27T07:25:53Z", + "pushedAt": "2025-08-23T18:48:52Z", + "stars": 196024, + "watchers": 5935, + "forks": 24465, + "defaultBranch": "master" + }, + { + "id": 126577260, + "name": "javascript-algorithms", + "repo": "trekhleb/javascript-algorithms", + "description": "๐Ÿ“ Algorithms and data structures implemented in JavaScript with explanations and links to further readings", + "createdAt": "2018-03-24T07:47:04Z", + "updatedAt": "2025-10-27T07:26:50Z", + "pushedAt": "2025-10-22T15:03:29Z", + "stars": 193648, + "watchers": 4267, + "forks": 30919, + "defaultBranch": "master" + }, + { + "id": 45717250, + "name": "tensorflow", + "repo": "tensorflow/tensorflow", + "description": "An Open Source Machine Learning Framework for Everyone", + "createdAt": "2015-11-07T01:19:20Z", + "updatedAt": "2025-10-27T07:33:01Z", + "pushedAt": "2025-10-27T06:15:29Z", + "stars": 192220, + "watchers": 7431, + "forks": 74928, + "defaultBranch": 
"master" + }, + { + "id": 138393139, + "name": "the-book-of-secret-knowledge", + "repo": "trimstray/the-book-of-secret-knowledge", + "description": "A collection of inspiring lists, manuals, cheatsheets, blogs, hacks, one-liners, cli/web tools and more.", + "createdAt": "2018-06-23T10:43:14Z", + "updatedAt": "2025-10-27T07:43:08Z", + "pushedAt": "2024-11-19T14:00:38Z", + "stars": 191315, + "watchers": 2679, + "forks": 11763, + "defaultBranch": "master" + }, + { + "id": 14440270, + "name": "You-Dont-Know-JS", + "repo": "getify/You-Dont-Know-JS", + "description": "A book series (2 published editions) on the JS language.", + "createdAt": "2013-11-16T02:37:24Z", + "updatedAt": "2025-10-27T07:25:47Z", + "pushedAt": "2025-05-20T14:22:36Z", + "stars": 183631, + "watchers": 5802, + "forks": 33668, + "defaultBranch": "2nd-ed" + }, + { + "id": 121395510, + "name": "CS-Notes", + "repo": "CyC2018/CS-Notes", + "description": ":books: ๆŠ€ๆœฏ้ข่ฏ•ๅฟ…ๅค‡ๅŸบ็ก€็Ÿฅ่ฏ†ใ€Leetcodeใ€่ฎก็ฎ—ๆœบๆ“ไฝœ็ณป็ปŸใ€่ฎก็ฎ—ๆœบ็ฝ‘็ปœใ€็ณป็ปŸ่ฎพ่ฎก", + "createdAt": "2018-02-13T14:56:24Z", + "updatedAt": "2025-10-27T07:19:57Z", + "pushedAt": "2024-08-21T09:40:10Z", + "stars": 182646, + "watchers": 5252, + "forks": 51251, + "defaultBranch": "master" + }, + { + "id": 291137, + "name": "ohmyzsh", + "repo": "ohmyzsh/ohmyzsh", + "description": "๐Ÿ™ƒ A delightful community-driven (with 2,400+ contributors) framework for managing your zsh configuration. 
Includes 300+ optional plugins (rails, git, macOS, hub, docker, homebrew, node, php, python, etc), 140+ themes to spice up your morning, and an auto-update tool that makes it easy to keep up with the latest updates from the community.", + "createdAt": "2009-08-28T18:15:37Z", + "updatedAt": "2025-10-27T07:25:29Z", + "pushedAt": "2025-10-26T13:17:47Z", + "stars": 182297, + "watchers": 2618, + "forks": 26259, + "defaultBranch": "master" + }, + { + "id": 614765452, + "name": "AutoGPT", + "repo": "Significant-Gravitas/AutoGPT", + "description": "AutoGPT is the vision of accessible AI for everyone, to use and to build on. Our mission is to provide the tools, so that you can focus on what matters.", + "createdAt": "2023-03-16T09:21:07Z", + "updatedAt": "2025-10-27T07:34:44Z", + "pushedAt": "2025-10-27T00:10:36Z", + "stars": 179292, + "watchers": 1547, + "forks": 46077, + "defaultBranch": "master" + }, + { + "id": 41881900, + "name": "vscode", + "repo": "microsoft/vscode", + "description": "Visual Studio Code", + "createdAt": "2015-09-03T20:23:38Z", + "updatedAt": "2025-10-27T07:26:11Z", + "pushedAt": "2025-10-27T07:29:25Z", + "stars": 177925, + "watchers": 3364, + "forks": 35788, + "defaultBranch": "main" + }, + { + "id": 123458551, + "name": "Python-100-Days", + "repo": "jackfrued/Python-100-Days", + "description": "Python - 100ๅคฉไปŽๆ–ฐๆ‰‹ๅˆฐๅคงๅธˆ", + "createdAt": "2018-03-01T16:05:52Z", + "updatedAt": "2025-10-27T07:26:50Z", + "pushedAt": "2025-03-28T10:29:23Z", + "stars": 173752, + "watchers": 6098, + "forks": 54771, + "defaultBranch": "master" + }, + { + "id": 2126244, + "name": "bootstrap", + "repo": "twbs/bootstrap", + "description": "The most popular HTML, CSS, and JavaScript framework for developing responsive, mobile first projects on the web.", + "createdAt": "2011-07-29T21:19:00Z", + "updatedAt": "2025-10-27T07:25:34Z", + "pushedAt": "2025-10-26T18:41:31Z", + "stars": 173599, + "watchers": 6681, + "forks": 79156, + "defaultBranch": "main" + }, + { + "id": 
31792824, + "name": "flutter", + "repo": "flutter/flutter", + "description": "Flutter makes it easy and fast to build beautiful apps for mobile and beyond", + "createdAt": "2015-03-06T22:54:58Z", + "updatedAt": "2025-10-27T07:31:00Z", + "pushedAt": "2025-10-27T05:33:32Z", + "stars": 173546, + "watchers": 3481, + "forks": 29414, + "defaultBranch": "master" + }, + { + "id": 1062897, + "name": "gitignore", + "repo": "github/gitignore", + "description": "A collection of useful .gitignore templates", + "createdAt": "2010-11-08T20:17:14Z", + "updatedAt": "2025-10-27T07:34:35Z", + "pushedAt": "2025-09-10T18:42:03Z", + "stars": 170298, + "watchers": 3366, + "forks": 82998, + "defaultBranch": "main" + }, + { + "id": 35955666, + "name": "the-art-of-command-line", + "repo": "jlevy/the-art-of-command-line", + "description": "Master the command line, in one page", + "createdAt": "2015-05-20T15:11:03Z", + "updatedAt": "2025-10-27T07:26:07Z", + "pushedAt": "2024-06-25T18:13:44Z", + "stars": 158582, + "watchers": 2812, + "forks": 14754, + "defaultBranch": "master" + }, + { + "id": 527591471, + "name": "stable-diffusion-webui", + "repo": "AUTOMATIC1111/stable-diffusion-webui", + "description": "Stable Diffusion web UI", + "createdAt": "2022-08-22T14:05:26Z", + "updatedAt": "2025-10-27T07:49:02Z", + "pushedAt": "2025-10-07T20:06:10Z", + "stars": 157565, + "watchers": 1154, + "forks": 29246, + "defaultBranch": "master" + }, + { + "id": 21540759, + "name": "awesome-go", + "repo": "avelino/awesome-go", + "description": "A curated list of awesome Go frameworks, libraries and software", + "createdAt": "2014-07-06T13:42:15Z", + "updatedAt": "2025-10-27T07:49:36Z", + "pushedAt": "2025-10-22T12:15:14Z", + "stars": 155801, + "watchers": 2818, + "forks": 12706, + "defaultBranch": "main" + }, + { + "id": 658928958, + "name": "ollama", + "repo": "ollama/ollama", + "description": "Get up and running with OpenAI gpt-oss, DeepSeek-R1, Gemma 3 and other models.", + "createdAt": 
"2023-06-26T19:39:32Z", + "updatedAt": "2025-10-27T07:43:05Z", + "pushedAt": "2025-10-27T01:25:05Z", + "stars": 154808, + "watchers": 877, + "forks": 13467, + "defaultBranch": "main" + }, + { + "id": 233472199, + "name": "Microsoft-Activation-Scripts", + "repo": "massgravel/Microsoft-Activation-Scripts", + "description": "Open-source Windows and Office activator featuring HWID, Ohook, TSforge, KMS38, and Online KMS activation methods, along with advanced troubleshooting.", + "createdAt": "2020-01-12T23:03:34Z", + "updatedAt": "2025-10-27T07:44:35Z", + "pushedAt": "2025-09-30T22:22:59Z", + "stars": 153864, + "watchers": 1319, + "forks": 14861, + "defaultBranch": "master" + }, + { + "id": 132464395, + "name": "JavaGuide", + "repo": "Snailclimb/JavaGuide", + "description": "ใ€ŒJavaๅญฆไน +้ข่ฏ•ๆŒ‡ๅ—ใ€ไธ€ไปฝๆถต็›–ๅคง้ƒจๅˆ† Java ็จ‹ๅบๅ‘˜ๆ‰€้œ€่ฆๆŽŒๆก็š„ๆ ธๅฟƒ็Ÿฅ่ฏ†ใ€‚ๅ‡†ๅค‡ Java ้ข่ฏ•๏ผŒ้ฆ–้€‰ JavaGuide๏ผ", + "createdAt": "2018-05-07T13:27:00Z", + "updatedAt": "2025-10-27T07:25:13Z", + "pushedAt": "2025-10-20T08:53:33Z", + "stars": 152308, + "watchers": 4470, + "forks": 46020, + "defaultBranch": "main" + }, + { + "id": 193215554, + "name": "n8n", + "repo": "n8n-io/n8n", + "description": "Fair-code workflow automation platform with native AI capabilities. Combine visual building with custom code, self-host or cloud, 400+ integrations.", + "createdAt": "2019-06-22T09:24:21Z", + "updatedAt": "2025-10-27T07:48:50Z", + "pushedAt": "2025-10-27T07:12:52Z", + "stars": 151975, + "watchers": 880, + "forks": 48459, + "defaultBranch": "master" + }, + { + "id": 155220641, + "name": "transformers", + "repo": "huggingface/transformers", + "description": "๐Ÿค— Transformers: the model-definition framework for state-of-the-art machine learning models in text, vision, audio, and multimodal models, for both inference and training. 
", + "createdAt": "2018-10-29T13:56:00Z", + "updatedAt": "2025-10-27T07:45:24Z", + "pushedAt": "2025-10-25T16:31:22Z", + "stars": 151659, + "watchers": 1166, + "forks": 30955, + "defaultBranch": "main" + }, + { + "id": 6498492, + "name": "javascript", + "repo": "airbnb/javascript", + "description": "JavaScript Style Guide", + "createdAt": "2012-11-01T23:13:50Z", + "updatedAt": "2025-10-27T06:50:33Z", + "pushedAt": "2025-09-17T18:12:44Z", + "stars": 147687, + "watchers": 3705, + "forks": 26797, + "defaultBranch": "master" + }, + { + "id": 1039520, + "name": "youtube-dl", + "repo": "ytdl-org/youtube-dl", + "description": "Command-line program to download videos from YouTube.com and other video sites", + "createdAt": "2010-10-31T14:35:07Z", + "updatedAt": "2025-10-27T07:30:15Z", + "pushedAt": "2025-10-18T10:02:28Z", + "stars": 138545, + "watchers": 2160, + "forks": 10527, + "defaultBranch": "master" + }, + { + "id": 574523116, + "name": "awesome-chatgpt-prompts", + "repo": "f/awesome-chatgpt-prompts", + "description": "This repo includes ChatGPT prompt curation to use ChatGPT and other LLM tools better.", + "createdAt": "2022-12-05T13:54:13Z", + "updatedAt": "2025-10-27T07:42:24Z", + "pushedAt": "2025-10-14T17:23:13Z", + "stars": 135794, + "watchers": 1562, + "forks": 18073, + "defaultBranch": "main" + }, + { + "id": 70107786, + "name": "next.js", + "repo": "vercel/next.js", + "description": "The React Framework", + "createdAt": "2016-10-05T23:32:51Z", + "updatedAt": "2025-10-27T07:38:47Z", + "pushedAt": "2025-10-27T07:02:37Z", + "stars": 135306, + "watchers": 1497, + "forks": 29680, + "defaultBranch": "canary" + }, + { + "id": 599320067, + "name": "langflow", + "repo": "langflow-ai/langflow", + "description": "Langflow is a powerful tool for building and deploying AI-powered agents and workflows.", + "createdAt": "2023-02-08T22:28:03Z", + "updatedAt": "2025-10-27T07:22:05Z", + "pushedAt": "2025-10-27T00:28:51Z", + "stars": 134904, + "watchers": 453, + "forks": 7853, 
+ "defaultBranch": "main" + }, + { + "id": 307260205, + "name": "yt-dlp", + "repo": "yt-dlp/yt-dlp", + "description": "A feature-rich command-line audio/video downloader", + "createdAt": "2020-10-26T04:22:55Z", + "updatedAt": "2025-10-27T07:35:17Z", + "pushedAt": "2025-10-25T22:47:00Z", + "stars": 132793, + "watchers": 675, + "forks": 10659, + "defaultBranch": "master" + }, + { + "id": 58028038, + "name": "HelloGitHub", + "repo": "521xueweihan/HelloGitHub", + "description": ":octocat: ๅˆ†ไบซ GitHub ไธŠๆœ‰่ถฃใ€ๅ…ฅ้—จ็บง็š„ๅผ€ๆบ้กน็›ฎใ€‚Share interesting, entry-level open source projects on GitHub.", + "createdAt": "2016-05-04T06:24:11Z", + "updatedAt": "2025-10-27T07:49:37Z", + "pushedAt": "2025-09-28T02:00:22Z", + "stars": 132228, + "watchers": 4182, + "forks": 10822, + "defaultBranch": "master" + }, + { + "id": 62607227, + "name": "tech-interview-handbook", + "repo": "yangshun/tech-interview-handbook", + "description": "๐Ÿ’ฏ Curated coding interview preparation materials for busy software engineers", + "createdAt": "2016-07-05T05:00:48Z", + "updatedAt": "2025-10-27T07:26:22Z", + "pushedAt": "2025-08-27T00:17:33Z", + "stars": 131399, + "watchers": 2182, + "forks": 15942, + "defaultBranch": "main" + }, + { + "id": 23096959, + "name": "go", + "repo": "golang/go", + "description": "The Go programming language", + "createdAt": "2014-08-19T04:33:40Z", + "updatedAt": "2025-10-27T07:25:58Z", + "pushedAt": "2025-10-27T04:49:52Z", + "stars": 130538, + "watchers": 3346, + "forks": 18415, + "defaultBranch": "master" + }, + { + "id": 111583593, + "name": "scrcpy", + "repo": "Genymobile/scrcpy", + "description": "Display and control your Android device", + "createdAt": "2017-11-21T18:00:27Z", + "updatedAt": "2025-10-27T07:30:24Z", + "pushedAt": "2025-10-26T10:52:03Z", + "stars": 130238, + "watchers": 1321, + "forks": 12191, + "defaultBranch": "master" + }, + { + "id": 241576270, + "name": "fucking-algorithm", + "repo": "labuladong/fucking-algorithm", + "description": 
"ๅˆท็ฎ—ๆณ•ๅ…จ้ ๅฅ—่ทฏ๏ผŒ่ฎคๅ‡† labuladong ๅฐฑๅคŸไบ†๏ผEnglish version supported! Crack LeetCode, not only how, but also why. ", + "createdAt": "2020-02-19T09:01:23Z", + "updatedAt": "2025-10-27T07:27:20Z", + "pushedAt": "2025-10-08T04:06:00Z", + "stars": 129651, + "watchers": 2283, + "forks": 23450, + "defaultBranch": "master" + }, + { + "id": 112507086, + "name": "30-seconds-of-code", + "repo": "Chalarangelo/30-seconds-of-code", + "description": "Coding articles to level up your development skills", + "createdAt": "2017-11-29T17:35:03Z", + "updatedAt": "2025-10-27T07:26:47Z", + "pushedAt": "2025-10-22T12:51:11Z", + "stars": 125630, + "watchers": 2594, + "forks": 12358, + "defaultBranch": "master" + }, + { + "id": 184456251, + "name": "PowerToys", + "repo": "microsoft/PowerToys", + "description": "Microsoft PowerToys is a collection of utilities that help you customize Windows and streamline everyday tasks", + "createdAt": "2019-05-01T17:44:02Z", + "updatedAt": "2025-10-27T07:50:46Z", + "pushedAt": "2025-10-27T02:44:52Z", + "stars": 125223, + "watchers": 1164, + "forks": 7451, + "defaultBranch": "main" + }, + { + "id": 29028775, + "name": "react-native", + "repo": "facebook/react-native", + "description": "A framework for building native applications using React", + "createdAt": "2015-01-09T18:10:16Z", + "updatedAt": "2025-10-27T07:20:37Z", + "pushedAt": "2025-10-27T06:53:57Z", + "stars": 124320, + "watchers": 3563, + "forks": 24914, + "defaultBranch": "main" + }, + { + "id": 9384267, + "name": "electron", + "repo": "electron/electron", + "description": ":electron: Build cross-platform desktop apps with JavaScript, HTML, and CSS", + "createdAt": "2013-04-12T01:47:36Z", + "updatedAt": "2025-10-27T07:25:42Z", + "pushedAt": "2025-10-27T06:46:57Z", + "stars": 118841, + "watchers": 2801, + "forks": 16578, + "defaultBranch": "main" + }, + { + "id": 20580498, + "name": "kubernetes", + "repo": "kubernetes/kubernetes", + "description": "Production-Grade Container 
Scheduling and Management", + "createdAt": "2014-06-06T22:56:04Z", + "updatedAt": "2025-10-27T07:31:13Z", + "pushedAt": "2025-10-26T22:21:34Z", + "stars": 118226, + "watchers": 3189, + "forks": 41578, + "defaultBranch": "master" + }, + { + "id": 552661142, + "name": "langchain", + "repo": "langchain-ai/langchain", + "description": "๐Ÿฆœ๐Ÿ”— Build context-aware reasoning applications", + "createdAt": "2022-10-17T02:58:36Z", + "updatedAt": "2025-10-27T07:37:09Z", + "pushedAt": "2025-10-27T07:39:14Z", + "stars": 118140, + "watchers": 775, + "forks": 19453, + "defaultBranch": "master" + }, + { + "id": 561730219, + "name": "hello-algo", + "repo": "krahets/hello-algo", + "description": "ใ€ŠHello ็ฎ—ๆณ•ใ€‹๏ผšๅŠจ็”ปๅ›พ่งฃใ€ไธ€้”ฎ่ฟ่กŒ็š„ๆ•ฐๆฎ็ป“ๆž„ไธŽ็ฎ—ๆณ•ๆ•™็จ‹ใ€‚ๆ”ฏๆŒ Python, Java, C++, C, C#, JS, Go, Swift, Rust, Ruby, Kotlin, TS, Dart ไปฃ็ ใ€‚็ฎ€ไฝ“็‰ˆๅ’Œ็นไฝ“็‰ˆๅŒๆญฅๆ›ดๆ–ฐ๏ผŒEnglish version in translation", + "createdAt": "2022-11-04T11:08:34Z", + "updatedAt": "2025-10-27T07:28:05Z", + "pushedAt": "2025-10-16T21:33:36Z", + "stars": 118081, + "watchers": 582, + "forks": 14500, + "defaultBranch": "main" + }, + { + "id": 626805178, + "name": "dify", + "repo": "langgenius/dify", + "description": "Production-ready platform for agentic workflow development.", + "createdAt": "2023-04-12T07:40:24Z", + "updatedAt": "2025-10-27T07:45:31Z", + "pushedAt": "2025-10-27T07:48:43Z", + "stars": 117359, + "watchers": 697, + "forks": 18125, + "defaultBranch": "main" + }, + { + "id": 14098069, + "name": "free-programming-books-zh_CN", + "repo": "justjavac/free-programming-books-zh_CN", + "description": ":books: ๅ…่ดน็š„่ฎก็ฎ—ๆœบ็ผ–็จ‹็ฑปไธญๆ–‡ไนฆ็ฑ๏ผŒๆฌข่ฟŽๆŠ•็จฟ", + "createdAt": "2013-11-04T01:59:19Z", + "updatedAt": "2025-10-27T07:25:46Z", + "pushedAt": "2024-07-15T08:55:20Z", + "stars": 115537, + "watchers": 5860, + "forks": 28362, + "defaultBranch": "main" + }, + { + "id": 32484381, + "name": "free-for-dev", + "repo": "ripienaar/free-for-dev", + "description": "A list of 
SaaS, PaaS and IaaS offerings that have free tiers of interest to devops and infradev", + "createdAt": "2015-03-18T21:06:26Z", + "updatedAt": "2025-10-27T07:26:05Z", + "pushedAt": "2025-10-23T04:49:00Z", + "stars": 114093, + "watchers": 1734, + "forks": 11683, + "defaultBranch": "master" + }, + { + "id": 27193779, + "name": "node", + "repo": "nodejs/node", + "description": "Node.js JavaScript runtime โœจ๐Ÿข๐Ÿš€โœจ", + "createdAt": "2014-11-26T19:57:11Z", + "updatedAt": "2025-10-27T07:38:07Z", + "pushedAt": "2025-10-27T01:02:07Z", + "stars": 113974, + "watchers": 2964, + "forks": 33571, + "defaultBranch": "main" + }, + { + "id": 701547123, + "name": "open-webui", + "repo": "open-webui/open-webui", + "description": "User-friendly AI Interface (Supports Ollama, OpenAI API, ...)", + "createdAt": "2023-10-06T22:08:27Z", + "updatedAt": "2025-10-27T07:32:58Z", + "pushedAt": "2025-10-27T05:20:59Z", + "stars": 113474, + "watchers": 516, + "forks": 15764, + "defaultBranch": "main" + }, + { + "id": 943149, + "name": "d3", + "repo": "d3/d3", + "description": "Bring data to life with SVG, Canvas and HTML. 
:bar_chart::chart_with_upwards_trend::tada:", + "createdAt": "2010-09-27T17:22:42Z", + "updatedAt": "2025-10-27T07:25:31Z", + "pushedAt": "2025-07-27T11:30:40Z", + "stars": 111683, + "watchers": 3558, + "forks": 22851, + "defaultBranch": "main" + }, + { + "id": 808144141, + "name": "FreeDomain", + "repo": "DigitalPlatDev/FreeDomain", + "description": "DigitalPlat FreeDomain: Free Domain For Everyone", + "createdAt": "2024-05-30T13:23:00Z", + "updatedAt": "2025-10-27T07:49:47Z", + "pushedAt": "2025-09-25T12:12:01Z", + "stars": 111350, + "watchers": 120, + "forks": 2066, + "defaultBranch": "main" + }, + { + "id": 231283452, + "name": "excalidraw", + "repo": "excalidraw/excalidraw", + "description": "Virtual whiteboard for sketching hand-drawn like diagrams", + "createdAt": "2020-01-02T01:04:43Z", + "updatedAt": "2025-10-27T07:49:00Z", + "pushedAt": "2025-10-27T06:42:25Z", + "stars": 109225, + "watchers": 467, + "forks": 11332, + "defaultBranch": "master" + }, + { + "id": 576201, + "name": "three.js", + "repo": "mrdoob/three.js", + "description": "JavaScript 3D Library.", + "createdAt": "2010-03-23T18:58:01Z", + "updatedAt": "2025-10-27T07:25:30Z", + "pushedAt": "2025-10-26T17:25:47Z", + "stars": 109123, + "watchers": 2517, + "forks": 36051, + "defaultBranch": "dev" + }, + { + "id": 23088740, + "name": "axios", + "repo": "axios/axios", + "description": "Promise based HTTP client for the browser and node.js", + "createdAt": "2014-08-18T22:30:27Z", + "updatedAt": "2025-10-27T05:22:18Z", + "pushedAt": "2025-10-26T22:46:40Z", + "stars": 108017, + "watchers": 1169, + "forks": 11366, + "defaultBranch": "v1.x" + }, + { + "id": 724712, + "name": "rust", + "repo": "rust-lang/rust", + "description": "Empowering everyone to build reliable and efficient software.", + "createdAt": "2010-06-16T20:39:03Z", + "updatedAt": "2025-10-27T06:39:34Z", + "pushedAt": "2025-10-27T07:25:41Z", + "stars": 107453, + "watchers": 1467, + "forks": 13897, + "defaultBranch": "master" + }, + { + "id": 
20929025, + "name": "TypeScript", + "repo": "microsoft/TypeScript", + "description": "TypeScript is a superset of JavaScript that compiles to clean JavaScript output.", + "createdAt": "2014-06-17T15:28:39Z", + "updatedAt": "2025-10-27T07:20:39Z", + "pushedAt": "2025-10-27T00:06:54Z", + "stars": 106530, + "watchers": 2148, + "forks": 13084, + "defaultBranch": "main" + }, + { + "id": 133442384, + "name": "deno", + "repo": "denoland/deno", + "description": "A modern runtime for JavaScript and TypeScript.", + "createdAt": "2018-05-15T01:34:26Z", + "updatedAt": "2025-10-27T07:14:57Z", + "pushedAt": "2025-10-24T23:41:20Z", + "stars": 104915, + "watchers": 1398, + "forks": 5753, + "defaultBranch": "main" + }, + { + "id": 103633984, + "name": "nodebestpractices", + "repo": "goldbergyoni/nodebestpractices", + "description": ":white_check_mark: The Node.js best practices list (July 2024)", + "createdAt": "2017-09-15T08:33:19Z", + "updatedAt": "2025-10-27T07:26:43Z", + "pushedAt": "2025-04-15T21:52:42Z", + "stars": 104439, + "watchers": 1944, + "forks": 10627, + "defaultBranch": "master" + }, + { + "id": 63537249, + "name": "create-react-app", + "repo": "facebook/create-react-app", + "description": "Set up a modern web app by running one command.", + "createdAt": "2016-07-17T14:55:11Z", + "updatedAt": "2025-10-27T07:26:24Z", + "pushedAt": "2025-02-15T01:32:11Z", + "stars": 103811, + "watchers": 1892, + "forks": 27146, + "defaultBranch": "main" + }, + { + "id": 206462776, + "name": "GitHub-Chinese-Top-Charts", + "repo": "GrowingGit/GitHub-Chinese-Top-Charts", + "description": ":cn: GitHubไธญๆ–‡ๆŽ’่กŒๆฆœ๏ผŒๅ„่ฏญ่จ€ๅˆ†่ฎพใ€Œ่ฝฏไปถ | ่ต„ๆ–™ใ€ๆฆœๅ•๏ผŒ็ฒพๅ‡†ๅฎšไฝไธญๆ–‡ๅฅฝ้กน็›ฎใ€‚ๅ„ๅ–ๆ‰€้œ€๏ผŒ้ซ˜ๆ•ˆๅญฆไน ใ€‚", + "createdAt": "2019-09-05T03:01:56Z", + "updatedAt": "2025-10-27T06:04:01Z", + "pushedAt": "2024-10-12T06:51:36Z", + "stars": 103336, + "watchers": 2607, + "forks": 13364, + "defaultBranch": "master" + }, + { + "id": 15634981, + "name": "godot", + "repo": 
"godotengine/godot", + "description": "Godot Engine โ€“ Multi-platform 2D and 3D game engine", + "createdAt": "2014-01-04T16:05:36Z", + "updatedAt": "2025-10-27T07:16:51Z", + "pushedAt": "2025-10-25T20:48:20Z", + "stars": 102604, + "watchers": 1493, + "forks": 23450, + "defaultBranch": "master" + }, + { + "id": 299354207, + "name": "rustdesk", + "repo": "rustdesk/rustdesk", + "description": "An open-source remote desktop application designed for self-hosting, as an alternative to TeamViewer.", + "createdAt": "2020-09-28T15:36:08Z", + "updatedAt": "2025-10-27T07:42:29Z", + "pushedAt": "2025-10-26T13:28:57Z", + "stars": 101456, + "watchers": 548, + "forks": 14837, + "defaultBranch": "master" + }, + { + "id": 655806940, + "name": "generative-ai-for-beginners", + "repo": "microsoft/generative-ai-for-beginners", + "description": "21 Lessons, Get Started Building with Generative AI ", + "createdAt": "2023-06-19T16:28:59Z", + "updatedAt": "2025-10-27T07:38:12Z", + "pushedAt": "2025-10-27T03:19:39Z", + "stars": 100935, + "watchers": 889, + "forks": 53478, + "defaultBranch": "main" + }, + { + "id": 100060912, + "name": "terminal", + "repo": "microsoft/terminal", + "description": "The new Windows Terminal and the original Windows console host, all in the same place!", + "createdAt": "2017-08-11T18:38:22Z", + "updatedAt": "2025-10-27T05:40:24Z", + "pushedAt": "2025-10-22T01:31:33Z", + "stars": 100726, + "watchers": 1334, + "forks": 8879, + "defaultBranch": "main" + }, + { + "id": 48378947, + "name": "frp", + "repo": "fatedier/frp", + "description": "A fast reverse proxy to help you expose a local server behind a NAT or firewall to the internet.", + "createdAt": "2015-12-21T15:24:59Z", + "updatedAt": "2025-10-27T07:00:25Z", + "pushedAt": "2025-10-17T02:53:43Z", + "stars": 100015, + "watchers": 1563, + "forks": 14562, + "defaultBranch": "dev" + }, + { + "id": 908531752, + "name": "DeepSeek-V3", + "repo": "deepseek-ai/DeepSeek-V3", + "description": null, + "createdAt": 
"2024-12-26T09:52:40Z", + "updatedAt": "2025-10-27T07:28:30Z", + "pushedAt": "2025-08-28T03:24:37Z", + "stars": 99981, + "watchers": 750, + "forks": 16309, + "defaultBranch": "main" + }, + { + "id": 55076063, + "name": "Awesome-Hacking", + "repo": "Hack-with-Github/Awesome-Hacking", + "description": "A collection of various awesome lists for hackers, pentesters and security researchers", + "createdAt": "2016-03-30T15:47:10Z", + "updatedAt": "2025-10-27T07:49:40Z", + "pushedAt": "2025-01-18T01:48:02Z", + "stars": 99684, + "watchers": 3931, + "forks": 9634, + "defaultBranch": "master" + }, + { + "id": 15204860, + "name": "papers-we-love", + "repo": "papers-we-love/papers-we-love", + "description": "Papers from the computer science community to read and discuss.", + "createdAt": "2013-12-15T14:31:41Z", + "updatedAt": "2025-10-27T07:49:42Z", + "pushedAt": "2025-10-10T15:35:14Z", + "stars": 99626, + "watchers": 3159, + "forks": 6144, + "defaultBranch": "main" + }, + { + "id": 24195339, + "name": "angular", + "repo": "angular/angular", + "description": "Deliver web apps with confidence ๐Ÿš€", + "createdAt": "2014-09-18T16:12:01Z", + "updatedAt": "2025-10-27T07:05:22Z", + "pushedAt": "2025-10-24T19:28:33Z", + "stars": 99167, + "watchers": 2980, + "forks": 26724, + "defaultBranch": "main" + }, + { + "id": 585146387, + "name": "ui", + "repo": "shadcn-ui/ui", + "description": "A set of beautifully-designed, accessible components and a code distribution platform. Works with your favorite frameworks. Open Source. 
Open Code.", + "createdAt": "2023-01-04T12:43:27Z", + "updatedAt": "2025-10-27T07:34:00Z", + "pushedAt": "2025-10-27T07:18:39Z", + "stars": 98464, + "watchers": 306, + "forks": 7031, + "defaultBranch": "main" + }, + { + "id": 196701619, + "name": "tauri", + "repo": "tauri-apps/tauri", + "description": "Build smaller, faster, and more secure desktop and mobile applications with a web frontend.", + "createdAt": "2019-07-13T09:09:37Z", + "updatedAt": "2025-10-27T07:27:10Z", + "pushedAt": "2025-10-26T13:55:16Z", + "stars": 98199, + "watchers": 530, + "forks": 3133, + "defaultBranch": "dev" + }, + { + "id": 157616880, + "name": "iptv", + "repo": "iptv-org/iptv", + "description": "Collection of publicly available IPTV channels from all over the world", + "createdAt": "2018-11-14T22:00:57Z", + "updatedAt": "2025-10-27T07:13:48Z", + "pushedAt": "2025-10-27T00:13:17Z", + "stars": 98051, + "watchers": 1950, + "forks": 4195, + "defaultBranch": "master" + }, + { + "id": 23083156, + "name": "material-ui", + "repo": "mui/material-ui", + "description": "Material UI: Comprehensive React component library that implements Google's Material Design. 
Free forever.", + "createdAt": "2014-08-18T19:11:54Z", + "updatedAt": "2025-10-27T07:25:58Z", + "pushedAt": "2025-10-27T07:11:45Z", + "stars": 96875, + "watchers": 1312, + "forks": 32696, + "defaultBranch": "master" + }, + { + "id": 34526884, + "name": "ant-design", + "repo": "ant-design/ant-design", + "description": "An enterprise-class UI design language and React UI library", + "createdAt": "2015-04-24T15:37:24Z", + "updatedAt": "2025-10-27T07:19:39Z", + "pushedAt": "2025-10-27T07:44:37Z", + "stars": 96467, + "watchers": 236, + "forks": 53873, + "defaultBranch": "master" + }, + { + "id": 243950408, + "name": "HowToCook", + "repo": "Anduin2017/HowToCook", + "description": "็จ‹ๅบๅ‘˜ๅœจๅฎถๅš้ฅญๆ–นๆณ•ๆŒ‡ๅ—ใ€‚Programmer's guide about how to cook at home (Simplified Chinese only).", + "createdAt": "2020-02-29T10:43:49Z", + "updatedAt": "2025-10-27T07:31:17Z", + "pushedAt": "2025-10-23T12:40:47Z", + "stars": 95393, + "watchers": 488, + "forks": 10650, + "defaultBranch": "master" + }, + { + "id": 33614304, + "name": "thefuck", + "repo": "nvbn/thefuck", + "description": "Magnificent app which corrects your previous console command.", + "createdAt": "2015-04-08T15:08:04Z", + "updatedAt": "2025-10-27T07:26:06Z", + "pushedAt": "2024-07-19T14:56:13Z", + "stars": 94482, + "watchers": 825, + "forks": 3792, + "defaultBranch": "master" + }, + { + "id": 65600975, + "name": "pytorch", + "repo": "pytorch/pytorch", + "description": "Tensors and Dynamic neural networks in Python with strong GPU acceleration", + "createdAt": "2016-08-13T05:26:41Z", + "updatedAt": "2025-10-27T07:51:08Z", + "pushedAt": "2025-10-27T07:51:03Z", + "stars": 94273, + "watchers": 1771, + "forks": 25671, + "defaultBranch": "main" + }, + { + "id": 74791366, + "name": "clean-code-javascript", + "repo": "ryanmcdermott/clean-code-javascript", + "description": "Clean Code concepts adapted for JavaScript", + "createdAt": "2016-11-25T22:25:41Z", + "updatedAt": "2025-10-27T03:36:56Z", + "pushedAt": 
"2024-07-29T07:24:37Z", + "stars": 93960, + "watchers": 1744, + "forks": 12496, + "defaultBranch": "master" + }, + { + "id": 101296881, + "name": "every-programmer-should-know", + "repo": "mtdvio/every-programmer-should-know", + "description": "A collection of (mostly) technical things every software developer should know about", + "createdAt": "2017-08-24T13:18:26Z", + "updatedAt": "2025-10-27T07:26:42Z", + "pushedAt": "2025-10-22T15:21:18Z", + "stars": 93814, + "watchers": 2011, + "forks": 8436, + "defaultBranch": "master" + }, + { + "id": 16408992, + "name": "neovim", + "repo": "neovim/neovim", + "description": "Vim-fork focused on extensibility and usability", + "createdAt": "2014-01-31T13:39:22Z", + "updatedAt": "2025-10-27T07:30:43Z", + "pushedAt": "2025-10-27T05:15:23Z", + "stars": 93731, + "watchers": 971, + "forks": 6378, + "defaultBranch": "master" + }, + { + "id": 943398999, + "name": "system-prompts-and-models-of-ai-tools", + "repo": "x1xhlol/system-prompts-and-models-of-ai-tools", + "description": "FULL Augment Code, Claude Code, Cluely, CodeBuddy, Comet, Cursor, Devin AI, Junie, Kiro, Leap.new, Lovable, Manus Agent Tools, NotionAI, Orchids.app, Perplexity, Poke, Qoder, Replit, Same.dev, Trae, Traycer AI, VSCode Agent, Warp.dev, Windsurf, Xcode, Z.ai Code, dia & v0. 
(And other Open Sourced) System Prompts, Internal Tools & AI Models", + "createdAt": "2025-03-05T16:38:29Z", + "updatedAt": "2025-10-27T07:37:40Z", + "pushedAt": "2025-10-19T18:44:24Z", + "stars": 93282, + "watchers": 1183, + "forks": 25228, + "defaultBranch": "main" + }, + { + "id": 22790488, + "name": "java-design-patterns", + "repo": "iluwatar/java-design-patterns", + "description": "Design patterns implemented in Java", + "createdAt": "2014-08-09T16:45:18Z", + "updatedAt": "2025-10-27T07:35:54Z", + "pushedAt": "2025-10-21T21:30:34Z", + "stars": 93215, + "watchers": 3717, + "forks": 27309, + "defaultBranch": "master" + }, + { + "id": 90796663, + "name": "puppeteer", + "repo": "puppeteer/puppeteer", + "description": "JavaScript API for Chrome and Firefox", + "createdAt": "2017-05-09T22:16:13Z", + "updatedAt": "2025-10-27T07:31:12Z", + "pushedAt": "2025-10-26T04:03:55Z", + "stars": 92724, + "watchers": 1184, + "forks": 9314, + "defaultBranch": "main" + }, + { + "id": 311525798, + "name": "Web-Dev-For-Beginners", + "repo": "microsoft/Web-Dev-For-Beginners", + "description": "24 Lessons, 12 Weeks, Get Started as a Web Developer", + "createdAt": "2020-11-10T02:44:00Z", + "updatedAt": "2025-10-27T07:27:35Z", + "pushedAt": "2025-10-25T00:47:36Z", + "stars": 92476, + "watchers": 2690, + "forks": 14330, + "defaultBranch": "main" + }, + { + "id": 589831718, + "name": "ComfyUI", + "repo": "comfyanonymous/ComfyUI", + "description": "The most powerful and modular diffusion model GUI, api and backend with a graph/nodes interface.", + "createdAt": "2023-01-17T03:15:56Z", + "updatedAt": "2025-10-27T07:46:53Z", + "pushedAt": "2025-10-27T00:23:05Z", + "stars": 92036, + "watchers": 614, + "forks": 10341, + "defaultBranch": "master" + }, + { + "id": 63539055, + "name": "awesome-mac", + "repo": "jaywcjlove/awesome-mac", + "description": "๏ฃฟ Now we have become very big, Different from the original idea. 
Collect premium software in various categories.", + "createdAt": "2016-07-17T15:33:47Z", + "updatedAt": "2025-10-27T07:50:40Z", + "pushedAt": "2025-10-25T04:02:03Z", + "stars": 91815, + "watchers": 1517, + "forks": 6947, + "defaultBranch": "master" + }, + { + "id": 919443098, + "name": "DeepSeek-R1", + "repo": "deepseek-ai/DeepSeek-R1", + "description": null, + "createdAt": "2025-01-20T11:57:28Z", + "updatedAt": "2025-10-27T06:56:07Z", + "pushedAt": "2025-06-27T08:35:54Z", + "stars": 91380, + "watchers": 607, + "forks": 11766, + "defaultBranch": "main" + }, + { + "id": 160919119, + "name": "fastapi", + "repo": "fastapi/fastapi", + "description": "FastAPI framework, high performance, easy to learn, fast to code, ready for production", + "createdAt": "2018-12-08T08:21:47Z", + "updatedAt": "2025-10-27T07:49:54Z", + "pushedAt": "2025-10-23T20:55:59Z", + "stars": 91203, + "watchers": 723, + "forks": 8123, + "defaultBranch": "master" + }, + { + "id": 106017343, + "name": "tailwindcss", + "repo": "tailwindlabs/tailwindcss", + "description": "A utility-first CSS framework for rapid UI development.", + "createdAt": "2017-10-06T14:59:14Z", + "updatedAt": "2025-10-27T07:48:03Z", + "pushedAt": "2025-10-24T11:53:16Z", + "stars": 90800, + "watchers": 615, + "forks": 4771, + "defaultBranch": "main" + } +] diff --git a/benchmarks/package.json b/benchmarks/package.json new file mode 100644 index 0000000..b3c7f71 --- /dev/null +++ b/benchmarks/package.json @@ -0,0 +1,26 @@ +{ + "name": "@toon/benchmarks", + "type": "module", + "private": true, + "scripts": { + "benchmark:token-efficiency": "tsx scripts/token-efficiency-benchmark.ts", + "benchmark:accuracy": "tsx --env-file=.env scripts/accuracy-benchmark.ts", + "fetch-github-data": "tsx scripts/fetch-github-data.ts", + "test": "vitest" + }, + "devDependencies": { + "@ai-sdk/anthropic": "^2.0.37", + "@ai-sdk/google": "^2.0.23", + "@ai-sdk/openai": "^2.0.53", + "@ai-sdk/provider": "^2.0.0", + "@antfu/eslint-config": "^6.1.0", + 
"@faker-js/faker": "^10.1.0", + "ai": "^5.0.80", + "consola": "^3.4.2", + "csv-stringify": "^6.6.0", + "gpt-tokenizer": "^3.2.0", + "ofetch": "^1.4.1", + "p-map": "^7.0.3", + "yaml": "^2.8.1" + } +} diff --git a/benchmarks/results/accuracy/accuracy.md b/benchmarks/results/accuracy/accuracy.md new file mode 100644 index 0000000..e435df6 --- /dev/null +++ b/benchmarks/results/accuracy/accuracy.md @@ -0,0 +1,96 @@ +### Retrieval Accuracy + +Tested across **2 LLMs** with data retrieval tasks: + +``` +gpt-4o-mini โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘โ–‘ 72.3% accuracy +claude-haiku-4-5 โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–‘โ–‘โ–‘โ–‘โ–‘ 76.7% accuracy +``` + +**TOON achieves 73.9% accuracy (vs JSON's 73.6%) while using 46.3% fewer tokens.** + +| Format | Accuracy | Average Tokens | +| ------ | -------- | -------------- | +| `toon` | 73.9% | 4.678 | +| `json` | 73.6% | 8.713 | +| `markdown-kv` | 73.6% | 8.649 | +| `csv` | 72.3% | 4.745 | +| `yaml` | 71.7% | 7.091 | + +
+View detailed breakdown by dataset and model + +#### Performance by Dataset + +##### Uniform employee records (TOON optimal format) + +| Format | Accuracy | Tokens | Correct/Total | +|--------|----------|--------|---------------| +| `toon` | 72.4% | 2.483 | 84/116 | +| `csv` | 69.0% | 2.337 | 80/116 | +| `yaml` | 68.1% | 4.969 | 79/116 | +| `markdown-kv` | 68.1% | 6.270 | 79/116 | +| `json` | 68.1% | 6.347 | 79/116 | + +##### E-commerce orders with nested structures + +| Format | Accuracy | Tokens | Correct/Total | +|--------|----------|--------|---------------| +| `toon` | 84.1% | 5.967 | 74/88 | +| `csv` | 83.0% | 6.735 | 73/88 | +| `yaml` | 81.8% | 7.328 | 72/88 | +| `markdown-kv` | 86.4% | 9.110 | 76/88 | +| `json` | 84.1% | 9.694 | 74/88 | + +##### Time-series analytics data + +| Format | Accuracy | Tokens | Correct/Total | +|--------|----------|--------|---------------| +| `csv` | 72.4% | 1.393 | 42/58 | +| `toon` | 70.7% | 1.515 | 41/58 | +| `yaml` | 72.4% | 2.938 | 42/58 | +| `json` | 74.1% | 3.665 | 43/58 | +| `markdown-kv` | 70.7% | 3.779 | 41/58 | + +##### Popular GitHub repositories + +| Format | Accuracy | Tokens | Correct/Total | +|--------|----------|--------|---------------| +| `toon` | 64.3% | 8.745 | 36/56 | +| `csv` | 62.5% | 8.513 | 35/56 | +| `json` | 67.9% | 15.145 | 38/56 | +| `markdown-kv` | 67.9% | 15.436 | 38/56 | +| `yaml` | 62.5% | 13.129 | 35/56 | + + +#### Performance by Model + +##### gpt-4o-mini + +| Format | Accuracy | Correct/Total | +|--------|----------|---------------| +| `toon` | 72.3% | 115/159 | +| `json` | 71.7% | 114/159 | +| `markdown-kv` | 70.4% | 112/159 | +| `csv` | 69.2% | 110/159 | +| `yaml` | 68.6% | 109/159 | + +##### claude-haiku-4-5 + +| Format | Accuracy | Correct/Total | +|--------|----------|---------------| +| `markdown-kv` | 76.7% | 122/159 | +| `toon` | 75.5% | 120/159 | +| `json` | 75.5% | 120/159 | +| `csv` | 75.5% | 120/159 | +| `yaml` | 74.8% | 119/159 | + + +#### Methodology + +- **Semantic 
validation**: LLM-as-judge validates responses semantically (not exact string matching) +- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding +- **Question types**: Field retrieval, aggregation, and filtering tasks +- **Real data**: faker.js-generated datasets + real GitHub repository data + +
diff --git a/benchmarks/results/accuracy/raw-results.json b/benchmarks/results/accuracy/raw-results.json new file mode 100644 index 0000000..a5a21a0 --- /dev/null +++ b/benchmarks/results/accuracy/raw-results.json @@ -0,0 +1,17492 @@ +[ + { + "questionId": "q1", + "format": "json", + "model": "gpt-4o-mini", + "expected": "56176", + "actual": "56176", + "correct": true, + "inputTokens": 6391, + "outputTokens": 3, + "latencyMs": 1313 + }, + { + "questionId": "q1", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "56176", + "actual": "56176", + "correct": true, + "inputTokens": 7870, + "outputTokens": 6, + "latencyMs": 1346 + }, + { + "questionId": "q1", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "56176", + "actual": "56176", + "correct": true, + "inputTokens": 2528, + "outputTokens": 3, + "latencyMs": 1191 + }, + { + "questionId": "q1", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "56176", + "actual": "56176", + "correct": true, + "inputTokens": 2982, + "outputTokens": 6, + "latencyMs": 1399 + }, + { + "questionId": "q1", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "56176", + "actual": "56176", + "correct": true, + "inputTokens": 2382, + "outputTokens": 3, + "latencyMs": 5010 + }, + { + "questionId": "q1", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "56176", + "actual": "56176", + "correct": true, + "inputTokens": 2856, + "outputTokens": 6, + "latencyMs": 1472 + }, + { + "questionId": "q1", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "56176", + "actual": "56176", + "correct": true, + "inputTokens": 6317, + "outputTokens": 3, + "latencyMs": 1667 + }, + { + "questionId": "q1", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "56176", + "actual": "56176", + "correct": true, + "inputTokens": 6365, + "outputTokens": 6, + "latencyMs": 1507 + }, + { + "questionId": "q1", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "56176", + 
"actual": "56176", + "correct": true, + "inputTokens": 5013, + "outputTokens": 3, + "latencyMs": 1325 + }, + { + "questionId": "q1", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "56176", + "actual": "56176", + "correct": true, + "inputTokens": 5760, + "outputTokens": 6, + "latencyMs": 2280 + }, + { + "questionId": "q2", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6391, + "outputTokens": 2, + "latencyMs": 3167 + }, + { + "questionId": "q2", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 7869, + "outputTokens": 4, + "latencyMs": 1267 + }, + { + "questionId": "q2", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2528, + "outputTokens": 2, + "latencyMs": 1402 + }, + { + "questionId": "q2", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2981, + "outputTokens": 4, + "latencyMs": 1290 + }, + { + "questionId": "q2", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2382, + "outputTokens": 2, + "latencyMs": 5070 + }, + { + "questionId": "q2", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2855, + "outputTokens": 4, + "latencyMs": 1320 + }, + { + "questionId": "q2", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6317, + "outputTokens": 2, + "latencyMs": 1745 + }, + { + "questionId": "q2", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6364, + 
"outputTokens": 4, + "latencyMs": 1191 + }, + { + "questionId": "q2", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5013, + "outputTokens": 2, + "latencyMs": 2713 + }, + { + "questionId": "q2", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5759, + "outputTokens": 4, + "latencyMs": 1309 + }, + { + "questionId": "q3", + "format": "json", + "model": "gpt-4o-mini", + "expected": "lorenza.kunze@yahoo.com", + "actual": "lorenza.kunze@yahoo.com", + "correct": true, + "inputTokens": 6393, + "outputTokens": 7, + "latencyMs": 1160 + }, + { + "questionId": "q3", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "lorenza.kunze@yahoo.com", + "actual": "lorenza.kunze@yahoo.com", + "correct": true, + "inputTokens": 7874, + "outputTokens": 12, + "latencyMs": 1338 + }, + { + "questionId": "q3", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "lorenza.kunze@yahoo.com", + "actual": "lorenza.kunze@yahoo.com", + "correct": true, + "inputTokens": 2530, + "outputTokens": 7, + "latencyMs": 1478 + }, + { + "questionId": "q3", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "lorenza.kunze@yahoo.com", + "actual": "lorenza.kunze@yahoo.com", + "correct": true, + "inputTokens": 2986, + "outputTokens": 12, + "latencyMs": 1563 + }, + { + "questionId": "q3", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "lorenza.kunze@yahoo.com", + "actual": "lorenza.kunze@yahoo.com", + "correct": true, + "inputTokens": 2384, + "outputTokens": 7, + "latencyMs": 1310 + }, + { + "questionId": "q3", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "lorenza.kunze@yahoo.com", + "actual": "lorenza.kunze@yahoo.com", + "correct": true, + "inputTokens": 2860, + "outputTokens": 12, + "latencyMs": 1236 + }, + { + "questionId": "q3", + "format": "markdown-kv", + "model": 
"gpt-4o-mini", + "expected": "lorenza.kunze@yahoo.com", + "actual": "lorenza.kunze@yahoo.com", + "correct": true, + "inputTokens": 6319, + "outputTokens": 7, + "latencyMs": 2236 + }, + { + "questionId": "q3", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "lorenza.kunze@yahoo.com", + "actual": "lorenza.kunze@yahoo.com", + "correct": true, + "inputTokens": 6369, + "outputTokens": 12, + "latencyMs": 1253 + }, + { + "questionId": "q3", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "lorenza.kunze@yahoo.com", + "actual": "lorenza.kunze@yahoo.com", + "correct": true, + "inputTokens": 5015, + "outputTokens": 7, + "latencyMs": 1917 + }, + { + "questionId": "q3", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "lorenza.kunze@yahoo.com", + "actual": "lorenza.kunze@yahoo.com", + "correct": true, + "inputTokens": 5764, + "outputTokens": 12, + "latencyMs": 1332 + }, + { + "questionId": "q4", + "format": "json", + "model": "gpt-4o-mini", + "expected": "117381", + "actual": "117381", + "correct": true, + "inputTokens": 6391, + "outputTokens": 3, + "latencyMs": 2945 + }, + { + "questionId": "q4", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "117381", + "actual": "117381", + "correct": true, + "inputTokens": 7870, + "outputTokens": 6, + "latencyMs": 1773 + }, + { + "questionId": "q4", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "117381", + "actual": "117381", + "correct": true, + "inputTokens": 2528, + "outputTokens": 3, + "latencyMs": 1294 + }, + { + "questionId": "q4", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "117381", + "actual": "117381", + "correct": true, + "inputTokens": 2982, + "outputTokens": 6, + "latencyMs": 980 + }, + { + "questionId": "q4", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "117381", + "actual": "117381", + "correct": true, + "inputTokens": 2382, + "outputTokens": 3, + "latencyMs": 1747 + }, + { + "questionId": "q4", + "format": 
"csv", + "model": "claude-haiku-4-5", + "expected": "117381", + "actual": "117381", + "correct": true, + "inputTokens": 2856, + "outputTokens": 6, + "latencyMs": 1197 + }, + { + "questionId": "q4", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "117381", + "actual": "117381", + "correct": true, + "inputTokens": 6317, + "outputTokens": 3, + "latencyMs": 1039 + }, + { + "questionId": "q4", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "117381", + "actual": "117381", + "correct": true, + "inputTokens": 6365, + "outputTokens": 6, + "latencyMs": 1453 + }, + { + "questionId": "q4", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "117381", + "actual": "117381", + "correct": true, + "inputTokens": 5013, + "outputTokens": 3, + "latencyMs": 1056 + }, + { + "questionId": "q4", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "117381", + "actual": "117381", + "correct": true, + "inputTokens": 5760, + "outputTokens": 6, + "latencyMs": 1564 + }, + { + "questionId": "q5", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6390, + "outputTokens": 2, + "latencyMs": 1263 + }, + { + "questionId": "q5", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 7868, + "outputTokens": 4, + "latencyMs": 1097 + }, + { + "questionId": "q5", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2527, + "outputTokens": 2, + "latencyMs": 1248 + }, + { + "questionId": "q5", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2980, + "outputTokens": 4, + "latencyMs": 1486 + }, + { + "questionId": "q5", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": 
"Marketing", + "correct": true, + "inputTokens": 2381, + "outputTokens": 2, + "latencyMs": 1311 + }, + { + "questionId": "q5", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2854, + "outputTokens": 4, + "latencyMs": 1019 + }, + { + "questionId": "q5", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6316, + "outputTokens": 2, + "latencyMs": 1287 + }, + { + "questionId": "q5", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6363, + "outputTokens": 4, + "latencyMs": 1243 + }, + { + "questionId": "q5", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5012, + "outputTokens": 2, + "latencyMs": 1339 + }, + { + "questionId": "q5", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5758, + "outputTokens": 4, + "latencyMs": 1621 + }, + { + "questionId": "q6", + "format": "json", + "model": "gpt-4o-mini", + "expected": "jayda60@hotmail.com", + "actual": "jayda60@hotmail.com", + "correct": true, + "inputTokens": 6391, + "outputTokens": 6, + "latencyMs": 1625 + }, + { + "questionId": "q6", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "jayda60@hotmail.com", + "actual": "jayda60@hotmail.com", + "correct": true, + "inputTokens": 7871, + "outputTokens": 11, + "latencyMs": 1328 + }, + { + "questionId": "q6", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "jayda60@hotmail.com", + "actual": "jayda60@hotmail.com", + "correct": true, + "inputTokens": 2528, + "outputTokens": 6, + "latencyMs": 1463 + }, + { + "questionId": "q6", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": 
"jayda60@hotmail.com", + "actual": "jayda60@hotmail.com", + "correct": true, + "inputTokens": 2983, + "outputTokens": 11, + "latencyMs": 1149 + }, + { + "questionId": "q6", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "jayda60@hotmail.com", + "actual": "jayda60@hotmail.com", + "correct": true, + "inputTokens": 2382, + "outputTokens": 6, + "latencyMs": 1474 + }, + { + "questionId": "q6", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "jayda60@hotmail.com", + "actual": "jayda60@hotmail.com", + "correct": true, + "inputTokens": 2857, + "outputTokens": 11, + "latencyMs": 977 + }, + { + "questionId": "q6", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "jayda60@hotmail.com", + "actual": "jayda60@hotmail.com", + "correct": true, + "inputTokens": 6317, + "outputTokens": 6, + "latencyMs": 2079 + }, + { + "questionId": "q6", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "jayda60@hotmail.com", + "actual": "jayda60@hotmail.com", + "correct": true, + "inputTokens": 6366, + "outputTokens": 11, + "latencyMs": 1134 + }, + { + "questionId": "q6", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "jayda60@hotmail.com", + "actual": "jayda60@hotmail.com", + "correct": true, + "inputTokens": 5013, + "outputTokens": 6, + "latencyMs": 1124 + }, + { + "questionId": "q6", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "jayda60@hotmail.com", + "actual": "jayda60@hotmail.com", + "correct": true, + "inputTokens": 5761, + "outputTokens": 11, + "latencyMs": 1053 + }, + { + "questionId": "q7", + "format": "json", + "model": "gpt-4o-mini", + "expected": "92971", + "actual": "92971", + "correct": true, + "inputTokens": 6391, + "outputTokens": 3, + "latencyMs": 1427 + }, + { + "questionId": "q7", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "92971", + "actual": "92971", + "correct": true, + "inputTokens": 7870, + "outputTokens": 6, + "latencyMs": 1246 + }, + { + 
"questionId": "q7", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "92971", + "actual": "92971", + "correct": true, + "inputTokens": 2528, + "outputTokens": 3, + "latencyMs": 1171 + }, + { + "questionId": "q7", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "92971", + "actual": "92971", + "correct": true, + "inputTokens": 2982, + "outputTokens": 6, + "latencyMs": 1547 + }, + { + "questionId": "q7", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "92971", + "actual": "92971", + "correct": true, + "inputTokens": 2382, + "outputTokens": 3, + "latencyMs": 1523 + }, + { + "questionId": "q7", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "92971", + "actual": "92971", + "correct": true, + "inputTokens": 2856, + "outputTokens": 6, + "latencyMs": 1148 + }, + { + "questionId": "q7", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "92971", + "actual": "92971", + "correct": true, + "inputTokens": 6317, + "outputTokens": 3, + "latencyMs": 1360 + }, + { + "questionId": "q7", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "92971", + "actual": "92971", + "correct": true, + "inputTokens": 6365, + "outputTokens": 6, + "latencyMs": 1100 + }, + { + "questionId": "q7", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "92971", + "actual": "92971", + "correct": true, + "inputTokens": 5013, + "outputTokens": 3, + "latencyMs": 1116 + }, + { + "questionId": "q7", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "92971", + "actual": "92971", + "correct": true, + "inputTokens": 5760, + "outputTokens": 6, + "latencyMs": 1202 + }, + { + "questionId": "q8", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Operations", + "correct": false, + "inputTokens": 6391, + "outputTokens": 2, + "latencyMs": 974 + }, + { + "questionId": "q8", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", 
+ "correct": true, + "inputTokens": 7871, + "outputTokens": 4, + "latencyMs": 1357 + }, + { + "questionId": "q8", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2528, + "outputTokens": 2, + "latencyMs": 1107 + }, + { + "questionId": "q8", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2983, + "outputTokens": 4, + "latencyMs": 1126 + }, + { + "questionId": "q8", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2382, + "outputTokens": 2, + "latencyMs": 1124 + }, + { + "questionId": "q8", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2857, + "outputTokens": 4, + "latencyMs": 1208 + }, + { + "questionId": "q8", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Operations", + "correct": false, + "inputTokens": 6317, + "outputTokens": 2, + "latencyMs": 1463 + }, + { + "questionId": "q8", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6366, + "outputTokens": 4, + "latencyMs": 1175 + }, + { + "questionId": "q8", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5013, + "outputTokens": 2, + "latencyMs": 1952 + }, + { + "questionId": "q8", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5761, + "outputTokens": 4, + "latencyMs": 1271 + }, + { + "questionId": "q9", + "format": "json", + "model": "gpt-4o-mini", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrance.hansen@yahoo.com", + "correct": true, + 
"inputTokens": 6393, + "outputTokens": 7, + "latencyMs": 1301 + }, + { + "questionId": "q9", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrance.hansen@yahoo.com", + "correct": true, + "inputTokens": 7871, + "outputTokens": 11, + "latencyMs": 1371 + }, + { + "questionId": "q9", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrance.hansen@yahoo.com", + "correct": true, + "inputTokens": 2530, + "outputTokens": 7, + "latencyMs": 1197 + }, + { + "questionId": "q9", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrance.hansen@yahoo.com", + "correct": true, + "inputTokens": 2983, + "outputTokens": 11, + "latencyMs": 1088 + }, + { + "questionId": "q9", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrance.hansen@yahoo.com", + "correct": true, + "inputTokens": 2384, + "outputTokens": 7, + "latencyMs": 1310 + }, + { + "questionId": "q9", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrance.hansen@yahoo.com", + "correct": true, + "inputTokens": 2857, + "outputTokens": 11, + "latencyMs": 1300 + }, + { + "questionId": "q9", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrance.hansen@yahoo.com", + "correct": true, + "inputTokens": 6319, + "outputTokens": 7, + "latencyMs": 1531 + }, + { + "questionId": "q9", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrance.hansen@yahoo.com", + "correct": true, + "inputTokens": 6366, + "outputTokens": 11, + "latencyMs": 1275 + }, + { + "questionId": "q9", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrence.hansen@yahoo.com", + "correct": false, + 
"inputTokens": 5015, + "outputTokens": 7, + "latencyMs": 1245 + }, + { + "questionId": "q9", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrance.hansen@yahoo.com", + "correct": true, + "inputTokens": 5761, + "outputTokens": 11, + "latencyMs": 1215 + }, + { + "questionId": "q10", + "format": "json", + "model": "gpt-4o-mini", + "expected": "107744", + "actual": "107744", + "correct": true, + "inputTokens": 6392, + "outputTokens": 3, + "latencyMs": 4959 + }, + { + "questionId": "q10", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "107744", + "actual": "107744", + "correct": true, + "inputTokens": 7870, + "outputTokens": 6, + "latencyMs": 1269 + }, + { + "questionId": "q10", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "107744", + "actual": "107744", + "correct": true, + "inputTokens": 2529, + "outputTokens": 3, + "latencyMs": 1111 + }, + { + "questionId": "q10", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "107744", + "actual": "107744", + "correct": true, + "inputTokens": 2982, + "outputTokens": 6, + "latencyMs": 1254 + }, + { + "questionId": "q10", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "107744", + "actual": "107744", + "correct": true, + "inputTokens": 2383, + "outputTokens": 3, + "latencyMs": 1616 + }, + { + "questionId": "q10", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "107744", + "actual": "107744", + "correct": true, + "inputTokens": 2856, + "outputTokens": 6, + "latencyMs": 1123 + }, + { + "questionId": "q10", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "107744", + "actual": "107744", + "correct": true, + "inputTokens": 6318, + "outputTokens": 3, + "latencyMs": 1201 + }, + { + "questionId": "q10", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "107744", + "actual": "107744", + "correct": true, + "inputTokens": 6365, + "outputTokens": 6, + "latencyMs": 1371 
+ }, + { + "questionId": "q10", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "107744", + "actual": "107744", + "correct": true, + "inputTokens": 5014, + "outputTokens": 3, + "latencyMs": 1503 + }, + { + "questionId": "q10", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "107744", + "actual": "107744", + "correct": true, + "inputTokens": 5760, + "outputTokens": 6, + "latencyMs": 1249 + }, + { + "questionId": "q11", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6391, + "outputTokens": 2, + "latencyMs": 1383 + }, + { + "questionId": "q11", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 7869, + "outputTokens": 4, + "latencyMs": 1081 + }, + { + "questionId": "q11", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2528, + "outputTokens": 2, + "latencyMs": 1677 + }, + { + "questionId": "q11", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2981, + "outputTokens": 4, + "latencyMs": 1072 + }, + { + "questionId": "q11", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2382, + "outputTokens": 2, + "latencyMs": 1142 + }, + { + "questionId": "q11", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2855, + "outputTokens": 4, + "latencyMs": 991 + }, + { + "questionId": "q11", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6317, + "outputTokens": 2, + "latencyMs": 1339 + }, + { + "questionId": "q11", + "format": "markdown-kv", + "model": 
"claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6364, + "outputTokens": 4, + "latencyMs": 1117 + }, + { + "questionId": "q11", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5013, + "outputTokens": 2, + "latencyMs": 2483 + }, + { + "questionId": "q11", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5759, + "outputTokens": 4, + "latencyMs": 1187 + }, + { + "questionId": "q12", + "format": "json", + "model": "gpt-4o-mini", + "expected": "allan21@gmail.com", + "actual": "allan21@gmail.com", + "correct": true, + "inputTokens": 6390, + "outputTokens": 5, + "latencyMs": 1827 + }, + { + "questionId": "q12", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "allan21@gmail.com", + "actual": "allan21@gmail.com", + "correct": true, + "inputTokens": 7867, + "outputTokens": 9, + "latencyMs": 1121 + }, + { + "questionId": "q12", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "allan21@gmail.com", + "actual": "allan21@gmail.com", + "correct": true, + "inputTokens": 2527, + "outputTokens": 5, + "latencyMs": 1373 + }, + { + "questionId": "q12", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "allan21@gmail.com", + "actual": "allan21@gmail.com", + "correct": true, + "inputTokens": 2979, + "outputTokens": 9, + "latencyMs": 1284 + }, + { + "questionId": "q12", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "allan21@gmail.com", + "actual": "allan21@gmail.com", + "correct": true, + "inputTokens": 2381, + "outputTokens": 5, + "latencyMs": 1751 + }, + { + "questionId": "q12", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "allan21@gmail.com", + "actual": "allan21@gmail.com", + "correct": true, + "inputTokens": 2853, + "outputTokens": 9, + "latencyMs": 1140 + }, + { + "questionId": 
"q12", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "allan21@gmail.com", + "actual": "allan21@gmail.com", + "correct": true, + "inputTokens": 6316, + "outputTokens": 5, + "latencyMs": 1624 + }, + { + "questionId": "q12", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "allan21@gmail.com", + "actual": "allan21@gmail.com", + "correct": true, + "inputTokens": 6362, + "outputTokens": 9, + "latencyMs": 1071 + }, + { + "questionId": "q12", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "allan21@gmail.com", + "actual": "allan21@gmail.com", + "correct": true, + "inputTokens": 5012, + "outputTokens": 5, + "latencyMs": 1970 + }, + { + "questionId": "q12", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "allan21@gmail.com", + "actual": "allan21@gmail.com", + "correct": true, + "inputTokens": 5757, + "outputTokens": 9, + "latencyMs": 1437 + }, + { + "questionId": "q13", + "format": "json", + "model": "gpt-4o-mini", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 6389, + "outputTokens": 3, + "latencyMs": 1263 + }, + { + "questionId": "q13", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 7868, + "outputTokens": 6, + "latencyMs": 1277 + }, + { + "questionId": "q13", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 2526, + "outputTokens": 3, + "latencyMs": 1151 + }, + { + "questionId": "q13", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 2980, + "outputTokens": 6, + "latencyMs": 1260 + }, + { + "questionId": "q13", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 2380, + "outputTokens": 3, + "latencyMs": 1071 + }, + { + "questionId": "q13", + 
"format": "csv", + "model": "claude-haiku-4-5", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 2854, + "outputTokens": 6, + "latencyMs": 891 + }, + { + "questionId": "q13", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 6315, + "outputTokens": 3, + "latencyMs": 1548 + }, + { + "questionId": "q13", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 6363, + "outputTokens": 6, + "latencyMs": 1456 + }, + { + "questionId": "q13", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 5011, + "outputTokens": 3, + "latencyMs": 1268 + }, + { + "questionId": "q13", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 5758, + "outputTokens": 6, + "latencyMs": 1205 + }, + { + "questionId": "q14", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6390, + "outputTokens": 2, + "latencyMs": 1310 + }, + { + "questionId": "q14", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 7868, + "outputTokens": 4, + "latencyMs": 1071 + }, + { + "questionId": "q14", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2527, + "outputTokens": 2, + "latencyMs": 895 + }, + { + "questionId": "q14", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2980, + "outputTokens": 4, + "latencyMs": 1020 + }, + { + "questionId": "q14", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Marketing", + 
"actual": "Marketing", + "correct": true, + "inputTokens": 2381, + "outputTokens": 2, + "latencyMs": 1168 + }, + { + "questionId": "q14", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2854, + "outputTokens": 4, + "latencyMs": 977 + }, + { + "questionId": "q14", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Operations", + "correct": false, + "inputTokens": 6316, + "outputTokens": 2, + "latencyMs": 1370 + }, + { + "questionId": "q14", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6363, + "outputTokens": 4, + "latencyMs": 1508 + }, + { + "questionId": "q14", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5012, + "outputTokens": 2, + "latencyMs": 3622 + }, + { + "questionId": "q14", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5758, + "outputTokens": 4, + "latencyMs": 1249 + }, + { + "questionId": "q15", + "format": "json", + "model": "gpt-4o-mini", + "expected": "alexandria61@gmail.com", + "actual": "alexandria61@gmail.com", + "correct": true, + "inputTokens": 6391, + "outputTokens": 7, + "latencyMs": 3269 + }, + { + "questionId": "q15", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "alexandria61@gmail.com", + "actual": "alexandria61@gmail.com", + "correct": true, + "inputTokens": 7869, + "outputTokens": 9, + "latencyMs": 1538 + }, + { + "questionId": "q15", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "alexandria61@gmail.com", + "actual": "alexandria61@gmail.com", + "correct": true, + "inputTokens": 2528, + "outputTokens": 7, + "latencyMs": 1413 + }, + { + "questionId": "q15", + "format": "toon", + "model": 
"claude-haiku-4-5", + "expected": "alexandria61@gmail.com", + "actual": "alexandria61@gmail.com", + "correct": true, + "inputTokens": 2981, + "outputTokens": 9, + "latencyMs": 1027 + }, + { + "questionId": "q15", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "alexandria61@gmail.com", + "actual": "alexandria61@gmail.com", + "correct": true, + "inputTokens": 2382, + "outputTokens": 7, + "latencyMs": 1257 + }, + { + "questionId": "q15", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "alexandria61@gmail.com", + "actual": "alexandria61@gmail.com", + "correct": true, + "inputTokens": 2855, + "outputTokens": 9, + "latencyMs": 1169 + }, + { + "questionId": "q15", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "alexandria61@gmail.com", + "actual": "alexandria61@gmail.com", + "correct": true, + "inputTokens": 6317, + "outputTokens": 7, + "latencyMs": 1464 + }, + { + "questionId": "q15", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "alexandria61@gmail.com", + "actual": "alexandria61@gmail.com", + "correct": true, + "inputTokens": 6364, + "outputTokens": 9, + "latencyMs": 1799 + }, + { + "questionId": "q15", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "alexandria61@gmail.com", + "actual": "alexandria61@gmail.com", + "correct": true, + "inputTokens": 5013, + "outputTokens": 7, + "latencyMs": 1616 + }, + { + "questionId": "q15", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "alexandria61@gmail.com", + "actual": "alexandria61@gmail.com", + "correct": true, + "inputTokens": 5759, + "outputTokens": 9, + "latencyMs": 1349 + }, + { + "questionId": "q16", + "format": "json", + "model": "gpt-4o-mini", + "expected": "89436", + "actual": "89436", + "correct": true, + "inputTokens": 6390, + "outputTokens": 3, + "latencyMs": 1298 + }, + { + "questionId": "q16", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "89436", + "actual": "89436", + "correct": true, + 
"inputTokens": 7870, + "outputTokens": 6, + "latencyMs": 1115 + }, + { + "questionId": "q16", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "89436", + "actual": "89436", + "correct": true, + "inputTokens": 2527, + "outputTokens": 3, + "latencyMs": 1180 + }, + { + "questionId": "q16", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "89436", + "actual": "89436", + "correct": true, + "inputTokens": 2982, + "outputTokens": 6, + "latencyMs": 1110 + }, + { + "questionId": "q16", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "89436", + "actual": "89436", + "correct": true, + "inputTokens": 2381, + "outputTokens": 3, + "latencyMs": 1235 + }, + { + "questionId": "q16", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "89436", + "actual": "89436", + "correct": true, + "inputTokens": 2856, + "outputTokens": 6, + "latencyMs": 1228 + }, + { + "questionId": "q16", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "89436", + "actual": "89436", + "correct": true, + "inputTokens": 6316, + "outputTokens": 3, + "latencyMs": 1832 + }, + { + "questionId": "q16", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "89436", + "actual": "89436", + "correct": true, + "inputTokens": 6365, + "outputTokens": 6, + "latencyMs": 1401 + }, + { + "questionId": "q16", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "89436", + "actual": "89436", + "correct": true, + "inputTokens": 5012, + "outputTokens": 3, + "latencyMs": 933 + }, + { + "questionId": "q16", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "89436", + "actual": "89436", + "correct": true, + "inputTokens": 5760, + "outputTokens": 6, + "latencyMs": 1570 + }, + { + "questionId": "q17", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6393, + "outputTokens": 2, + "latencyMs": 1221 + }, + { + "questionId": "q17", + "format": "json", + 
"model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 7872, + "outputTokens": 4, + "latencyMs": 1293 + }, + { + "questionId": "q17", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2530, + "outputTokens": 2, + "latencyMs": 1147 + }, + { + "questionId": "q17", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2984, + "outputTokens": 4, + "latencyMs": 923 + }, + { + "questionId": "q17", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2384, + "outputTokens": 2, + "latencyMs": 1180 + }, + { + "questionId": "q17", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2858, + "outputTokens": 4, + "latencyMs": 1025 + }, + { + "questionId": "q17", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6319, + "outputTokens": 2, + "latencyMs": 1748 + }, + { + "questionId": "q17", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6367, + "outputTokens": 4, + "latencyMs": 1188 + }, + { + "questionId": "q17", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5015, + "outputTokens": 2, + "latencyMs": 1452 + }, + { + "questionId": "q17", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5762, + "outputTokens": 4, + "latencyMs": 1329 + }, + { + "questionId": "q18", + "format": "json", + "model": "gpt-4o-mini", + "expected": 
"kelvin54@yahoo.com", + "actual": "kelvin54@yahoo.com", + "correct": true, + "inputTokens": 6391, + "outputTokens": 6, + "latencyMs": 768 + }, + { + "questionId": "q18", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "kelvin54@yahoo.com", + "actual": "kelvin54@yahoo.com", + "correct": true, + "inputTokens": 7871, + "outputTokens": 10, + "latencyMs": 1150 + }, + { + "questionId": "q18", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "kelvin54@yahoo.com", + "actual": "kelvin54@yahoo.com", + "correct": true, + "inputTokens": 2528, + "outputTokens": 6, + "latencyMs": 1501 + }, + { + "questionId": "q18", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "kelvin54@yahoo.com", + "actual": "kelvin54@yahoo.com", + "correct": true, + "inputTokens": 2983, + "outputTokens": 10, + "latencyMs": 1201 + }, + { + "questionId": "q18", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "kelvin54@yahoo.com", + "actual": "kelvin54@yahoo.com", + "correct": true, + "inputTokens": 2382, + "outputTokens": 6, + "latencyMs": 1604 + }, + { + "questionId": "q18", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "kelvin54@yahoo.com", + "actual": "kelvin54@yahoo.com", + "correct": true, + "inputTokens": 2857, + "outputTokens": 10, + "latencyMs": 1060 + }, + { + "questionId": "q18", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "kelvin54@yahoo.com", + "actual": "kelvin54@yahoo.com", + "correct": true, + "inputTokens": 6317, + "outputTokens": 6, + "latencyMs": 1350 + }, + { + "questionId": "q18", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "kelvin54@yahoo.com", + "actual": "kelvin54@yahoo.com", + "correct": true, + "inputTokens": 6366, + "outputTokens": 10, + "latencyMs": 1154 + }, + { + "questionId": "q18", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "kelvin54@yahoo.com", + "actual": "kelvin54@yahoo.com", + "correct": true, + "inputTokens": 5013, + "outputTokens": 
6, + "latencyMs": 1199 + }, + { + "questionId": "q18", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "kelvin54@yahoo.com", + "actual": "kelvin54@yahoo.com", + "correct": true, + "inputTokens": 5761, + "outputTokens": 10, + "latencyMs": 1216 + }, + { + "questionId": "q19", + "format": "json", + "model": "gpt-4o-mini", + "expected": "143365", + "actual": "143365", + "correct": true, + "inputTokens": 6391, + "outputTokens": 3, + "latencyMs": 1412 + }, + { + "questionId": "q19", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "143365", + "actual": "143365", + "correct": true, + "inputTokens": 7872, + "outputTokens": 6, + "latencyMs": 1908 + }, + { + "questionId": "q19", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "143365", + "actual": "143365", + "correct": true, + "inputTokens": 2528, + "outputTokens": 3, + "latencyMs": 1366 + }, + { + "questionId": "q19", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "143365", + "actual": "143365", + "correct": true, + "inputTokens": 2984, + "outputTokens": 6, + "latencyMs": 1054 + }, + { + "questionId": "q19", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "143365", + "actual": "143365", + "correct": true, + "inputTokens": 2382, + "outputTokens": 3, + "latencyMs": 1121 + }, + { + "questionId": "q19", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "143365", + "actual": "143365", + "correct": true, + "inputTokens": 2858, + "outputTokens": 6, + "latencyMs": 1262 + }, + { + "questionId": "q19", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "143365", + "actual": "143365", + "correct": true, + "inputTokens": 6317, + "outputTokens": 3, + "latencyMs": 4632 + }, + { + "questionId": "q19", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "143365", + "actual": "143365", + "correct": true, + "inputTokens": 6367, + "outputTokens": 6, + "latencyMs": 1118 + }, + { + "questionId": "q19", + "format": "yaml", 
+ "model": "gpt-4o-mini", + "expected": "143365", + "actual": "143365", + "correct": true, + "inputTokens": 5013, + "outputTokens": 3, + "latencyMs": 928 + }, + { + "questionId": "q19", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "143365", + "actual": "143365", + "correct": true, + "inputTokens": 5762, + "outputTokens": 6, + "latencyMs": 1191 + }, + { + "questionId": "q20", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6390, + "outputTokens": 2, + "latencyMs": 1053 + }, + { + "questionId": "q20", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 7868, + "outputTokens": 4, + "latencyMs": 1096 + }, + { + "questionId": "q20", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2527, + "outputTokens": 2, + "latencyMs": 1784 + }, + { + "questionId": "q20", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2980, + "outputTokens": 4, + "latencyMs": 1093 + }, + { + "questionId": "q20", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2381, + "outputTokens": 2, + "latencyMs": 1335 + }, + { + "questionId": "q20", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2854, + "outputTokens": 4, + "latencyMs": 1546 + }, + { + "questionId": "q20", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6316, + "outputTokens": 2, + "latencyMs": 1293 + }, + { + "questionId": "q20", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + 
"actual": "Marketing", + "correct": true, + "inputTokens": 6363, + "outputTokens": 4, + "latencyMs": 1230 + }, + { + "questionId": "q20", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5012, + "outputTokens": 2, + "latencyMs": 1467 + }, + { + "questionId": "q20", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5758, + "outputTokens": 4, + "latencyMs": 1370 + }, + { + "questionId": "q21", + "format": "json", + "model": "gpt-4o-mini", + "expected": "dean19@gmail.com", + "actual": "dean19@gmail.com", + "correct": true, + "inputTokens": 6394, + "outputTokens": 6, + "latencyMs": 5026 + }, + { + "questionId": "q21", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "dean19@gmail.com", + "actual": "dean19@gmail.com", + "correct": true, + "inputTokens": 7876, + "outputTokens": 9, + "latencyMs": 1786 + }, + { + "questionId": "q21", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "dean19@gmail.com", + "actual": "dean19@gmail.com", + "correct": true, + "inputTokens": 2531, + "outputTokens": 6, + "latencyMs": 826 + }, + { + "questionId": "q21", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "dean19@gmail.com", + "actual": "dean19@gmail.com", + "correct": true, + "inputTokens": 2988, + "outputTokens": 9, + "latencyMs": 909 + }, + { + "questionId": "q21", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "dean19@gmail.com", + "actual": "dean19@gmail.com", + "correct": true, + "inputTokens": 2385, + "outputTokens": 6, + "latencyMs": 1120 + }, + { + "questionId": "q21", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "dean19@gmail.com", + "actual": "dean19@gmail.com", + "correct": true, + "inputTokens": 2862, + "outputTokens": 9, + "latencyMs": 996 + }, + { + "questionId": "q21", + "format": "markdown-kv", + "model": "gpt-4o-mini", + 
"expected": "dean19@gmail.com", + "actual": "dean19@gmail.com", + "correct": true, + "inputTokens": 6320, + "outputTokens": 6, + "latencyMs": 1639 + }, + { + "questionId": "q21", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "dean19@gmail.com", + "actual": "dean19@gmail.com", + "correct": true, + "inputTokens": 6371, + "outputTokens": 9, + "latencyMs": 1299 + }, + { + "questionId": "q21", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "dean19@gmail.com", + "actual": "dean19@gmail.com", + "correct": true, + "inputTokens": 5016, + "outputTokens": 6, + "latencyMs": 1151 + }, + { + "questionId": "q21", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "dean19@gmail.com", + "actual": "dean19@gmail.com", + "correct": true, + "inputTokens": 5766, + "outputTokens": 9, + "latencyMs": 1246 + }, + { + "questionId": "q22", + "format": "json", + "model": "gpt-4o-mini", + "expected": "111314", + "actual": "111314", + "correct": true, + "inputTokens": 6392, + "outputTokens": 3, + "latencyMs": 1838 + }, + { + "questionId": "q22", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "111314", + "actual": "111314", + "correct": true, + "inputTokens": 7871, + "outputTokens": 6, + "latencyMs": 1191 + }, + { + "questionId": "q22", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "111314", + "actual": "111314", + "correct": true, + "inputTokens": 2529, + "outputTokens": 3, + "latencyMs": 980 + }, + { + "questionId": "q22", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "111314", + "actual": "111314", + "correct": true, + "inputTokens": 2983, + "outputTokens": 6, + "latencyMs": 1299 + }, + { + "questionId": "q22", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "111314", + "actual": "111314", + "correct": true, + "inputTokens": 2383, + "outputTokens": 3, + "latencyMs": 1027 + }, + { + "questionId": "q22", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "111314", + 
"actual": "111314", + "correct": true, + "inputTokens": 2857, + "outputTokens": 6, + "latencyMs": 1433 + }, + { + "questionId": "q22", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "111314", + "actual": "111314", + "correct": true, + "inputTokens": 6318, + "outputTokens": 3, + "latencyMs": 2256 + }, + { + "questionId": "q22", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "111314", + "actual": "111314", + "correct": true, + "inputTokens": 6366, + "outputTokens": 6, + "latencyMs": 1091 + }, + { + "questionId": "q22", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "111314", + "actual": "111314", + "correct": true, + "inputTokens": 5014, + "outputTokens": 3, + "latencyMs": 1288 + }, + { + "questionId": "q22", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "111314", + "actual": "111314", + "correct": true, + "inputTokens": 5761, + "outputTokens": 6, + "latencyMs": 1306 + }, + { + "questionId": "q23", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6389, + "outputTokens": 2, + "latencyMs": 1951 + }, + { + "questionId": "q23", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 7868, + "outputTokens": 4, + "latencyMs": 1440 + }, + { + "questionId": "q23", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2526, + "outputTokens": 2, + "latencyMs": 978 + }, + { + "questionId": "q23", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2980, + "outputTokens": 4, + "latencyMs": 1385 + }, + { + "questionId": "q23", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2380, + 
"outputTokens": 2, + "latencyMs": 2311 + }, + { + "questionId": "q23", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2854, + "outputTokens": 4, + "latencyMs": 1066 + }, + { + "questionId": "q23", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6315, + "outputTokens": 2, + "latencyMs": 1914 + }, + { + "questionId": "q23", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6363, + "outputTokens": 4, + "latencyMs": 1596 + }, + { + "questionId": "q23", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5011, + "outputTokens": 2, + "latencyMs": 1820 + }, + { + "questionId": "q23", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5758, + "outputTokens": 4, + "latencyMs": 1067 + }, + { + "questionId": "q24", + "format": "json", + "model": "gpt-4o-mini", + "expected": "laurel54@yahoo.com", + "actual": "laurel54@yahoo.com", + "correct": true, + "inputTokens": 6391, + "outputTokens": 6, + "latencyMs": 2594 + }, + { + "questionId": "q24", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "laurel54@yahoo.com", + "actual": "laurel54@yahoo.com", + "correct": true, + "inputTokens": 7869, + "outputTokens": 10, + "latencyMs": 1139 + }, + { + "questionId": "q24", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "laurel54@yahoo.com", + "actual": "laurel54@yahoo.com", + "correct": true, + "inputTokens": 2528, + "outputTokens": 6, + "latencyMs": 1225 + }, + { + "questionId": "q24", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "laurel54@yahoo.com", + "actual": "laurel54@yahoo.com", + "correct": 
true, + "inputTokens": 2981, + "outputTokens": 10, + "latencyMs": 1082 + }, + { + "questionId": "q24", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "laurel54@yahoo.com", + "actual": "laurel54@yahoo.com", + "correct": true, + "inputTokens": 2382, + "outputTokens": 6, + "latencyMs": 4857 + }, + { + "questionId": "q24", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "laurel54@yahoo.com", + "actual": "laurel54@yahoo.com", + "correct": true, + "inputTokens": 2855, + "outputTokens": 10, + "latencyMs": 1082 + }, + { + "questionId": "q24", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "laurel54@yahoo.com", + "actual": "laurel54@yahoo.com", + "correct": true, + "inputTokens": 6317, + "outputTokens": 6, + "latencyMs": 1272 + }, + { + "questionId": "q24", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "laurel54@yahoo.com", + "actual": "laurel54@yahoo.com", + "correct": true, + "inputTokens": 6364, + "outputTokens": 10, + "latencyMs": 1201 + }, + { + "questionId": "q24", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "laurel54@yahoo.com", + "actual": "laurel54@yahoo.com", + "correct": true, + "inputTokens": 5013, + "outputTokens": 6, + "latencyMs": 1197 + }, + { + "questionId": "q24", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "laurel54@yahoo.com", + "actual": "laurel54@yahoo.com", + "correct": true, + "inputTokens": 5759, + "outputTokens": 10, + "latencyMs": 1198 + }, + { + "questionId": "q25", + "format": "json", + "model": "gpt-4o-mini", + "expected": "89553", + "actual": "89553", + "correct": true, + "inputTokens": 6392, + "outputTokens": 3, + "latencyMs": 1085 + }, + { + "questionId": "q25", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "89553", + "actual": "89553", + "correct": true, + "inputTokens": 7873, + "outputTokens": 6, + "latencyMs": 1102 + }, + { + "questionId": "q25", + "format": "toon", + "model": "gpt-4o-mini", + "expected": 
"89553", + "actual": "89553", + "correct": true, + "inputTokens": 2529, + "outputTokens": 3, + "latencyMs": 1350 + }, + { + "questionId": "q25", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "89553", + "actual": "89553", + "correct": true, + "inputTokens": 2985, + "outputTokens": 6, + "latencyMs": 1300 + }, + { + "questionId": "q25", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "89553", + "actual": "89553", + "correct": true, + "inputTokens": 2383, + "outputTokens": 3, + "latencyMs": 998 + }, + { + "questionId": "q25", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "89553", + "actual": "89553", + "correct": true, + "inputTokens": 2859, + "outputTokens": 6, + "latencyMs": 972 + }, + { + "questionId": "q25", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "89553", + "actual": "89553", + "correct": true, + "inputTokens": 6318, + "outputTokens": 3, + "latencyMs": 1331 + }, + { + "questionId": "q25", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "89553", + "actual": "89553", + "correct": true, + "inputTokens": 6368, + "outputTokens": 6, + "latencyMs": 1027 + }, + { + "questionId": "q25", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "89553", + "actual": "89553", + "correct": true, + "inputTokens": 5014, + "outputTokens": 3, + "latencyMs": 1170 + }, + { + "questionId": "q25", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "89553", + "actual": "89553", + "correct": true, + "inputTokens": 5763, + "outputTokens": 6, + "latencyMs": 1074 + }, + { + "questionId": "q26", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6389, + "outputTokens": 2, + "latencyMs": 1862 + }, + { + "questionId": "q26", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 7866, + "outputTokens": 4, + 
"latencyMs": 1435 + }, + { + "questionId": "q26", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2526, + "outputTokens": 2, + "latencyMs": 989 + }, + { + "questionId": "q26", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2978, + "outputTokens": 4, + "latencyMs": 1035 + }, + { + "questionId": "q26", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2380, + "outputTokens": 2, + "latencyMs": 2157 + }, + { + "questionId": "q26", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2852, + "outputTokens": 4, + "latencyMs": 1094 + }, + { + "questionId": "q26", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6315, + "outputTokens": 2, + "latencyMs": 1912 + }, + { + "questionId": "q26", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6361, + "outputTokens": 4, + "latencyMs": 1364 + }, + { + "questionId": "q26", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5011, + "outputTokens": 2, + "latencyMs": 1435 + }, + { + "questionId": "q26", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5756, + "outputTokens": 4, + "latencyMs": 1082 + }, + { + "questionId": "q27", + "format": "json", + "model": "gpt-4o-mini", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 6392, + "outputTokens": 9, + "latencyMs": 
1274 + }, + { + "questionId": "q27", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 7871, + "outputTokens": 14, + "latencyMs": 1130 + }, + { + "questionId": "q27", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 2529, + "outputTokens": 9, + "latencyMs": 1795 + }, + { + "questionId": "q27", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 2983, + "outputTokens": 14, + "latencyMs": 1309 + }, + { + "questionId": "q27", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 2383, + "outputTokens": 9, + "latencyMs": 1406 + }, + { + "questionId": "q27", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 2857, + "outputTokens": 14, + "latencyMs": 1398 + }, + { + "questionId": "q27", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 6318, + "outputTokens": 9, + "latencyMs": 1114 + }, + { + "questionId": "q27", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 6366, + "outputTokens": 14, + "latencyMs": 1251 + }, + { + "questionId": "q27", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 5014, + 
"outputTokens": 9, + "latencyMs": 1941 + }, + { + "questionId": "q27", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 5761, + "outputTokens": 14, + "latencyMs": 1218 + }, + { + "questionId": "q28", + "format": "json", + "model": "gpt-4o-mini", + "expected": "104053", + "actual": "104053", + "correct": true, + "inputTokens": 6391, + "outputTokens": 3, + "latencyMs": 1395 + }, + { + "questionId": "q28", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "104053", + "actual": "104053", + "correct": true, + "inputTokens": 7871, + "outputTokens": 6, + "latencyMs": 1342 + }, + { + "questionId": "q28", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "104053", + "actual": "104053", + "correct": true, + "inputTokens": 2528, + "outputTokens": 3, + "latencyMs": 919 + }, + { + "questionId": "q28", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "104053", + "actual": "104053", + "correct": true, + "inputTokens": 2983, + "outputTokens": 6, + "latencyMs": 1187 + }, + { + "questionId": "q28", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "104053", + "actual": "104053", + "correct": true, + "inputTokens": 2382, + "outputTokens": 3, + "latencyMs": 1131 + }, + { + "questionId": "q28", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "104053", + "actual": "104053", + "correct": true, + "inputTokens": 2857, + "outputTokens": 6, + "latencyMs": 1191 + }, + { + "questionId": "q28", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "104053", + "actual": "104053", + "correct": true, + "inputTokens": 6317, + "outputTokens": 3, + "latencyMs": 1435 + }, + { + "questionId": "q28", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "104053", + "actual": "104053", + "correct": true, + "inputTokens": 6366, + "outputTokens": 6, + "latencyMs": 1095 + }, + { + 
"questionId": "q28", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "104053", + "actual": "104053", + "correct": true, + "inputTokens": 5013, + "outputTokens": 3, + "latencyMs": 4588 + }, + { + "questionId": "q28", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "104053", + "actual": "104053", + "correct": true, + "inputTokens": 5761, + "outputTokens": 6, + "latencyMs": 1291 + }, + { + "questionId": "q29", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6392, + "outputTokens": 2, + "latencyMs": 1688 + }, + { + "questionId": "q29", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 7872, + "outputTokens": 4, + "latencyMs": 1301 + }, + { + "questionId": "q29", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2529, + "outputTokens": 2, + "latencyMs": 1914 + }, + { + "questionId": "q29", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2984, + "outputTokens": 4, + "latencyMs": 1447 + }, + { + "questionId": "q29", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2383, + "outputTokens": 2, + "latencyMs": 1725 + }, + { + "questionId": "q29", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2858, + "outputTokens": 4, + "latencyMs": 923 + }, + { + "questionId": "q29", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6318, + "outputTokens": 2, + "latencyMs": 879 + }, + { + "questionId": "q29", + "format": "markdown-kv", + "model": 
"claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6367, + "outputTokens": 4, + "latencyMs": 1322 + }, + { + "questionId": "q29", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5014, + "outputTokens": 2, + "latencyMs": 1394 + }, + { + "questionId": "q29", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5762, + "outputTokens": 4, + "latencyMs": 1008 + }, + { + "questionId": "q30", + "format": "json", + "model": "gpt-4o-mini", + "expected": "carley.bauch@yahoo.com", + "actual": "carley.bauch@yahoo.com", + "correct": true, + "inputTokens": 6391, + "outputTokens": 7, + "latencyMs": 894 + }, + { + "questionId": "q30", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "carley.bauch@yahoo.com", + "actual": "carley.bauch@yahoo.com", + "correct": true, + "inputTokens": 7869, + "outputTokens": 12, + "latencyMs": 1220 + }, + { + "questionId": "q30", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "carley.bauch@yahoo.com", + "actual": "carley.bauch@yahoo.com", + "correct": true, + "inputTokens": 2528, + "outputTokens": 7, + "latencyMs": 2225 + }, + { + "questionId": "q30", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "carley.bauch@yahoo.com", + "actual": "carley.bauch@yahoo.com", + "correct": true, + "inputTokens": 2981, + "outputTokens": 12, + "latencyMs": 1282 + }, + { + "questionId": "q30", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "carley.bauch@yahoo.com", + "actual": "carley.bauch@yahoo.com", + "correct": true, + "inputTokens": 2382, + "outputTokens": 7, + "latencyMs": 1414 + }, + { + "questionId": "q30", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "carley.bauch@yahoo.com", + "actual": "carley.bauch@yahoo.com", + "correct": true, + "inputTokens": 2855, + 
"outputTokens": 12, + "latencyMs": 1686 + }, + { + "questionId": "q30", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "carley.bauch@yahoo.com", + "actual": "carley.bauch@yahoo.com", + "correct": true, + "inputTokens": 6317, + "outputTokens": 7, + "latencyMs": 1113 + }, + { + "questionId": "q30", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "carley.bauch@yahoo.com", + "actual": "carley.bauch@yahoo.com", + "correct": true, + "inputTokens": 6364, + "outputTokens": 12, + "latencyMs": 1089 + }, + { + "questionId": "q30", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "carley.bauch@yahoo.com", + "actual": "carley.bauch@yahoo.com", + "correct": true, + "inputTokens": 5013, + "outputTokens": 7, + "latencyMs": 949 + }, + { + "questionId": "q30", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "carley.bauch@yahoo.com", + "actual": "carley.bauch@yahoo.com", + "correct": true, + "inputTokens": 5759, + "outputTokens": 12, + "latencyMs": 1273 + }, + { + "questionId": "q31", + "format": "json", + "model": "gpt-4o-mini", + "expected": "142029", + "actual": "142029", + "correct": true, + "inputTokens": 6394, + "outputTokens": 3, + "latencyMs": 4741 + }, + { + "questionId": "q31", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "142029", + "actual": "142029", + "correct": true, + "inputTokens": 7874, + "outputTokens": 6, + "latencyMs": 1132 + }, + { + "questionId": "q31", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "142029", + "actual": "142029", + "correct": true, + "inputTokens": 2531, + "outputTokens": 3, + "latencyMs": 1184 + }, + { + "questionId": "q31", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "142029", + "actual": "142029", + "correct": true, + "inputTokens": 2986, + "outputTokens": 6, + "latencyMs": 1137 + }, + { + "questionId": "q31", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "142029", + "actual": "142029", + "correct": 
true, + "inputTokens": 2385, + "outputTokens": 3, + "latencyMs": 963 + }, + { + "questionId": "q31", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "142029", + "actual": "142029", + "correct": true, + "inputTokens": 2860, + "outputTokens": 6, + "latencyMs": 1096 + }, + { + "questionId": "q31", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "142029", + "actual": "142029", + "correct": true, + "inputTokens": 6320, + "outputTokens": 3, + "latencyMs": 1399 + }, + { + "questionId": "q31", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "142029", + "actual": "142029", + "correct": true, + "inputTokens": 6369, + "outputTokens": 6, + "latencyMs": 1594 + }, + { + "questionId": "q31", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "142029", + "actual": "142029", + "correct": true, + "inputTokens": 5016, + "outputTokens": 3, + "latencyMs": 1900 + }, + { + "questionId": "q31", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "142029", + "actual": "142029", + "correct": true, + "inputTokens": 5764, + "outputTokens": 6, + "latencyMs": 1274 + }, + { + "questionId": "q32", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Sales", + "correct": false, + "inputTokens": 6390, + "outputTokens": 2, + "latencyMs": 5224 + }, + { + "questionId": "q32", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 7869, + "outputTokens": 4, + "latencyMs": 1038 + }, + { + "questionId": "q32", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2527, + "outputTokens": 2, + "latencyMs": 1902 + }, + { + "questionId": "q32", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2981, + "outputTokens": 4, + "latencyMs": 1010 + }, + { 
+ "questionId": "q32", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2381, + "outputTokens": 2, + "latencyMs": 3263 + }, + { + "questionId": "q32", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2855, + "outputTokens": 4, + "latencyMs": 871 + }, + { + "questionId": "q32", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Sales", + "correct": false, + "inputTokens": 6316, + "outputTokens": 2, + "latencyMs": 1278 + }, + { + "questionId": "q32", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6364, + "outputTokens": 4, + "latencyMs": 1048 + }, + { + "questionId": "q32", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Sales", + "correct": false, + "inputTokens": 5012, + "outputTokens": 2, + "latencyMs": 1271 + }, + { + "questionId": "q32", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5759, + "outputTokens": 4, + "latencyMs": 1075 + }, + { + "questionId": "q33", + "format": "json", + "model": "gpt-4o-mini", + "expected": "cheyenne_skiles@hotmail.com", + "actual": "cheyenne_skiles@hotmail.com", + "correct": true, + "inputTokens": 6394, + "outputTokens": 7, + "latencyMs": 1139 + }, + { + "questionId": "q33", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "cheyenne_skiles@hotmail.com", + "actual": "cheyenne_skiles@hotmail.com", + "correct": true, + "inputTokens": 7872, + "outputTokens": 14, + "latencyMs": 1319 + }, + { + "questionId": "q33", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "cheyenne_skiles@hotmail.com", + "actual": "cheyenne_skiles@hotmail.com", + "correct": true, + "inputTokens": 
2531, + "outputTokens": 7, + "latencyMs": 1856 + }, + { + "questionId": "q33", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "cheyenne_skiles@hotmail.com", + "actual": "cheyenne_skiles@hotmail.com", + "correct": true, + "inputTokens": 2984, + "outputTokens": 14, + "latencyMs": 1393 + }, + { + "questionId": "q33", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "cheyenne_skiles@hotmail.com", + "actual": "cheyenne_skiles@hotmail.com", + "correct": true, + "inputTokens": 2385, + "outputTokens": 7, + "latencyMs": 1766 + }, + { + "questionId": "q33", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "cheyenne_skiles@hotmail.com", + "actual": "cheyenne_skiles@hotmail.com", + "correct": true, + "inputTokens": 2858, + "outputTokens": 14, + "latencyMs": 1609 + }, + { + "questionId": "q33", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "cheyenne_skiles@hotmail.com", + "actual": "cheyenne_skiles@hotmail.com", + "correct": true, + "inputTokens": 6320, + "outputTokens": 7, + "latencyMs": 1329 + }, + { + "questionId": "q33", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "cheyenne_skiles@hotmail.com", + "actual": "cheyenne_skiles@hotmail.com", + "correct": true, + "inputTokens": 6367, + "outputTokens": 14, + "latencyMs": 1178 + }, + { + "questionId": "q33", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "cheyenne_skiles@hotmail.com", + "actual": "cheyenne_skiles@hotmail.com", + "correct": true, + "inputTokens": 5016, + "outputTokens": 7, + "latencyMs": 1890 + }, + { + "questionId": "q33", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "cheyenne_skiles@hotmail.com", + "actual": "cheyenne_skiles@hotmail.com", + "correct": true, + "inputTokens": 5762, + "outputTokens": 14, + "latencyMs": 1326 + }, + { + "questionId": "q34", + "format": "json", + "model": "gpt-4o-mini", + "expected": "84650", + "actual": "84650", + "correct": true, + "inputTokens": 6392, + 
"outputTokens": 3, + "latencyMs": 1898 + }, + { + "questionId": "q34", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "84650", + "actual": "84650", + "correct": true, + "inputTokens": 7871, + "outputTokens": 6, + "latencyMs": 1074 + }, + { + "questionId": "q34", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "84650", + "actual": "84650", + "correct": true, + "inputTokens": 2529, + "outputTokens": 3, + "latencyMs": 1382 + }, + { + "questionId": "q34", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "84650", + "actual": "84650", + "correct": true, + "inputTokens": 2983, + "outputTokens": 6, + "latencyMs": 1060 + }, + { + "questionId": "q34", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "84650", + "actual": "84650", + "correct": true, + "inputTokens": 2383, + "outputTokens": 3, + "latencyMs": 1286 + }, + { + "questionId": "q34", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "84650", + "actual": "84650", + "correct": true, + "inputTokens": 2857, + "outputTokens": 6, + "latencyMs": 1591 + }, + { + "questionId": "q34", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "84650", + "actual": "84650", + "correct": true, + "inputTokens": 6318, + "outputTokens": 3, + "latencyMs": 2158 + }, + { + "questionId": "q34", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "84650", + "actual": "84650", + "correct": true, + "inputTokens": 6366, + "outputTokens": 6, + "latencyMs": 1532 + }, + { + "questionId": "q34", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "84650", + "actual": "84650", + "correct": true, + "inputTokens": 5014, + "outputTokens": 3, + "latencyMs": 1381 + }, + { + "questionId": "q34", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "84650", + "actual": "84650", + "correct": true, + "inputTokens": 5761, + "outputTokens": 6, + "latencyMs": 2262 + }, + { + "questionId": "q35", + "format": "json", + "model": "gpt-4o-mini", 
+ "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6391, + "outputTokens": 2, + "latencyMs": 2664 + }, + { + "questionId": "q35", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 7871, + "outputTokens": 4, + "latencyMs": 1260 + }, + { + "questionId": "q35", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2528, + "outputTokens": 2, + "latencyMs": 1563 + }, + { + "questionId": "q35", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2983, + "outputTokens": 4, + "latencyMs": 1415 + }, + { + "questionId": "q35", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2382, + "outputTokens": 2, + "latencyMs": 1038 + }, + { + "questionId": "q35", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2857, + "outputTokens": 4, + "latencyMs": 1021 + }, + { + "questionId": "q35", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6317, + "outputTokens": 2, + "latencyMs": 4276 + }, + { + "questionId": "q35", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6366, + "outputTokens": 4, + "latencyMs": 1301 + }, + { + "questionId": "q35", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5013, + "outputTokens": 2, + "latencyMs": 1399 + }, + { + "questionId": "q35", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": 
"Marketing", + "correct": true, + "inputTokens": 5761, + "outputTokens": 4, + "latencyMs": 1197 + }, + { + "questionId": "q36", + "format": "json", + "model": "gpt-4o-mini", + "expected": "macey.gottlieb5@yahoo.com", + "actual": "macey.gottlieb5@yahoo.com", + "correct": true, + "inputTokens": 6390, + "outputTokens": 9, + "latencyMs": 1390 + }, + { + "questionId": "q36", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "macey.gottlieb5@yahoo.com", + "actual": "macey.gottlieb5@yahoo.com", + "correct": true, + "inputTokens": 7869, + "outputTokens": 14, + "latencyMs": 1482 + }, + { + "questionId": "q36", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "macey.gottlieb5@yahoo.com", + "actual": "macey.gottlieb5@yahoo.com", + "correct": true, + "inputTokens": 2527, + "outputTokens": 9, + "latencyMs": 1754 + }, + { + "questionId": "q36", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "macey.gottlieb5@yahoo.com", + "actual": "macey.gottlieb5@yahoo.com", + "correct": true, + "inputTokens": 2981, + "outputTokens": 14, + "latencyMs": 1100 + }, + { + "questionId": "q36", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "macey.gottlieb5@yahoo.com", + "actual": "macey.gottlieb5@yahoo.com", + "correct": true, + "inputTokens": 2381, + "outputTokens": 9, + "latencyMs": 1421 + }, + { + "questionId": "q36", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "macey.gottlieb5@yahoo.com", + "actual": "macey.gottlieb5@yahoo.com", + "correct": true, + "inputTokens": 2855, + "outputTokens": 14, + "latencyMs": 2173 + }, + { + "questionId": "q36", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "macey.gottlieb5@yahoo.com", + "actual": "macey.gottlieb5@yahoo.com", + "correct": true, + "inputTokens": 6316, + "outputTokens": 9, + "latencyMs": 2911 + }, + { + "questionId": "q36", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "macey.gottlieb5@yahoo.com", + "actual": 
"macey.gottlieb5@yahoo.com", + "correct": true, + "inputTokens": 6364, + "outputTokens": 14, + "latencyMs": 1235 + }, + { + "questionId": "q36", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "macey.gottlieb5@yahoo.com", + "actual": "macey.gottlieb5@yahoo.com", + "correct": true, + "inputTokens": 5012, + "outputTokens": 9, + "latencyMs": 1303 + }, + { + "questionId": "q36", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "macey.gottlieb5@yahoo.com", + "actual": "macey.gottlieb5@yahoo.com", + "correct": true, + "inputTokens": 5759, + "outputTokens": 14, + "latencyMs": 1148 + }, + { + "questionId": "q37", + "format": "json", + "model": "gpt-4o-mini", + "expected": "89773", + "actual": "89773", + "correct": true, + "inputTokens": 6390, + "outputTokens": 3, + "latencyMs": 1430 + }, + { + "questionId": "q37", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "89773", + "actual": "89773", + "correct": true, + "inputTokens": 7868, + "outputTokens": 6, + "latencyMs": 1089 + }, + { + "questionId": "q37", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "89773", + "actual": "89773", + "correct": true, + "inputTokens": 2527, + "outputTokens": 3, + "latencyMs": 1059 + }, + { + "questionId": "q37", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "89773", + "actual": "89773", + "correct": true, + "inputTokens": 2980, + "outputTokens": 6, + "latencyMs": 1057 + }, + { + "questionId": "q37", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "89773", + "actual": "89773", + "correct": true, + "inputTokens": 2381, + "outputTokens": 3, + "latencyMs": 1716 + }, + { + "questionId": "q37", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "89773", + "actual": "89773", + "correct": true, + "inputTokens": 2854, + "outputTokens": 6, + "latencyMs": 904 + }, + { + "questionId": "q37", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "89773", + "actual": "89773", + "correct": true, 
+ "inputTokens": 6316, + "outputTokens": 3, + "latencyMs": 2950 + }, + { + "questionId": "q37", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "89773", + "actual": "89773", + "correct": true, + "inputTokens": 6363, + "outputTokens": 6, + "latencyMs": 1189 + }, + { + "questionId": "q37", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "89773", + "actual": "89773", + "correct": true, + "inputTokens": 5012, + "outputTokens": 3, + "latencyMs": 1050 + }, + { + "questionId": "q37", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "89773", + "actual": "89773", + "correct": true, + "inputTokens": 5758, + "outputTokens": 6, + "latencyMs": 1329 + }, + { + "questionId": "q38", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6390, + "outputTokens": 2, + "latencyMs": 3410 + }, + { + "questionId": "q38", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 7868, + "outputTokens": 4, + "latencyMs": 1891 + }, + { + "questionId": "q38", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2527, + "outputTokens": 2, + "latencyMs": 1010 + }, + { + "questionId": "q38", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2980, + "outputTokens": 4, + "latencyMs": 988 + }, + { + "questionId": "q38", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2381, + "outputTokens": 2, + "latencyMs": 1364 + }, + { + "questionId": "q38", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2854, + "outputTokens": 4, + "latencyMs": 1395 + }, + { + 
"questionId": "q38", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6316, + "outputTokens": 2, + "latencyMs": 2293 + }, + { + "questionId": "q38", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6363, + "outputTokens": 4, + "latencyMs": 1137 + }, + { + "questionId": "q38", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5012, + "outputTokens": 2, + "latencyMs": 1451 + }, + { + "questionId": "q38", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 5758, + "outputTokens": 4, + "latencyMs": 1100 + }, + { + "questionId": "q39", + "format": "json", + "model": "gpt-4o-mini", + "expected": "georgianna_renner@yahoo.com", + "actual": "georgianna_renner@yahoo.com", + "correct": true, + "inputTokens": 6390, + "outputTokens": 10, + "latencyMs": 1674 + }, + { + "questionId": "q39", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "georgianna_renner@yahoo.com", + "actual": "georgianna_renner@yahoo.com", + "correct": true, + "inputTokens": 7869, + "outputTokens": 13, + "latencyMs": 1403 + }, + { + "questionId": "q39", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "georgianna_renner@yahoo.com", + "actual": "georgianna_renner@yahoo.com", + "correct": true, + "inputTokens": 2527, + "outputTokens": 10, + "latencyMs": 1413 + }, + { + "questionId": "q39", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "georgianna_renner@yahoo.com", + "actual": "georgianna_renner@yahoo.com", + "correct": true, + "inputTokens": 2981, + "outputTokens": 13, + "latencyMs": 1200 + }, + { + "questionId": "q39", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "georgianna_renner@yahoo.com", + 
"actual": "georgianna_renner@yahoo.com", + "correct": true, + "inputTokens": 2381, + "outputTokens": 10, + "latencyMs": 1730 + }, + { + "questionId": "q39", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "georgianna_renner@yahoo.com", + "actual": "georgianna_renner@yahoo.com", + "correct": true, + "inputTokens": 2855, + "outputTokens": 13, + "latencyMs": 1226 + }, + { + "questionId": "q39", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "georgianna_renner@yahoo.com", + "actual": "georgianna_renner@yahoo.com", + "correct": true, + "inputTokens": 6316, + "outputTokens": 10, + "latencyMs": 1251 + }, + { + "questionId": "q39", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "georgianna_renner@yahoo.com", + "actual": "georgianna_renner@yahoo.com", + "correct": true, + "inputTokens": 6364, + "outputTokens": 13, + "latencyMs": 1337 + }, + { + "questionId": "q39", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "georgianna_renner@yahoo.com", + "actual": "georgianna_renner@yahoo.com", + "correct": true, + "inputTokens": 5012, + "outputTokens": 10, + "latencyMs": 2368 + }, + { + "questionId": "q39", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "georgianna_renner@yahoo.com", + "actual": "georgianna_renner@yahoo.com", + "correct": true, + "inputTokens": 5759, + "outputTokens": 13, + "latencyMs": 1251 + }, + { + "questionId": "q40", + "format": "json", + "model": "gpt-4o-mini", + "expected": "49741", + "actual": "49741", + "correct": true, + "inputTokens": 6391, + "outputTokens": 3, + "latencyMs": 3815 + }, + { + "questionId": "q40", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "49741", + "actual": "49741", + "correct": true, + "inputTokens": 7871, + "outputTokens": 6, + "latencyMs": 1169 + }, + { + "questionId": "q40", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "49741", + "actual": "49741", + "correct": true, + "inputTokens": 2528, + "outputTokens": 
3, + "latencyMs": 1070 + }, + { + "questionId": "q40", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "49741", + "actual": "49741", + "correct": true, + "inputTokens": 2983, + "outputTokens": 6, + "latencyMs": 1162 + }, + { + "questionId": "q40", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "49741", + "actual": "49741", + "correct": true, + "inputTokens": 2382, + "outputTokens": 3, + "latencyMs": 1115 + }, + { + "questionId": "q40", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "49741", + "actual": "144426", + "correct": false, + "inputTokens": 2857, + "outputTokens": 6, + "latencyMs": 1365 + }, + { + "questionId": "q40", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "49741", + "actual": "49741", + "correct": true, + "inputTokens": 6317, + "outputTokens": 3, + "latencyMs": 2004 + }, + { + "questionId": "q40", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "49741", + "actual": "49741", + "correct": true, + "inputTokens": 6366, + "outputTokens": 6, + "latencyMs": 1113 + }, + { + "questionId": "q40", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "49741", + "actual": "49741", + "correct": true, + "inputTokens": 5013, + "outputTokens": 3, + "latencyMs": 3055 + }, + { + "questionId": "q40", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "49741", + "actual": "49741", + "correct": true, + "inputTokens": 5761, + "outputTokens": 6, + "latencyMs": 1392 + }, + { + "questionId": "q41", + "format": "json", + "model": "gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 6388, + "outputTokens": 2, + "latencyMs": 3877 + }, + { + "questionId": "q41", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "15", + "correct": false, + "inputTokens": 7865, + "outputTokens": 5, + "latencyMs": 1128 + }, + { + "questionId": "q41", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "17", + 
"actual": "20", + "correct": false, + "inputTokens": 2525, + "outputTokens": 2, + "latencyMs": 966 + }, + { + "questionId": "q41", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "15", + "correct": false, + "inputTokens": 2977, + "outputTokens": 5, + "latencyMs": 1070 + }, + { + "questionId": "q41", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 2379, + "outputTokens": 2, + "latencyMs": 2411 + }, + { + "questionId": "q41", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "15", + "correct": false, + "inputTokens": 2851, + "outputTokens": 5, + "latencyMs": 1286 + }, + { + "questionId": "q41", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 6314, + "outputTokens": 2, + "latencyMs": 2082 + }, + { + "questionId": "q41", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "15", + "correct": false, + "inputTokens": 6360, + "outputTokens": 5, + "latencyMs": 1107 + }, + { + "questionId": "q41", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 5010, + "outputTokens": 2, + "latencyMs": 1216 + }, + { + "questionId": "q41", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "15", + "correct": false, + "inputTokens": 5755, + "outputTokens": 5, + "latencyMs": 1052 + }, + { + "questionId": "q42", + "format": "json", + "model": "gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 6388, + "outputTokens": 2, + "latencyMs": 1572 + }, + { + "questionId": "q42", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "15", + "correct": false, + "inputTokens": 7865, + "outputTokens": 5, + "latencyMs": 1084 + }, + { + "questionId": "q42", + "format": "toon", + "model": 
"gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 2525, + "outputTokens": 2, + "latencyMs": 1377 + }, + { + "questionId": "q42", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "14", + "correct": false, + "inputTokens": 2977, + "outputTokens": 5, + "latencyMs": 1197 + }, + { + "questionId": "q42", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 2379, + "outputTokens": 2, + "latencyMs": 2705 + }, + { + "questionId": "q42", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "15", + "correct": false, + "inputTokens": 2851, + "outputTokens": 5, + "latencyMs": 1020 + }, + { + "questionId": "q42", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 6314, + "outputTokens": 2, + "latencyMs": 5345 + }, + { + "questionId": "q42", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "14", + "correct": false, + "inputTokens": 6360, + "outputTokens": 5, + "latencyMs": 1207 + }, + { + "questionId": "q42", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 5010, + "outputTokens": 2, + "latencyMs": 921 + }, + { + "questionId": "q42", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "15", + "correct": false, + "inputTokens": 5755, + "outputTokens": 5, + "latencyMs": 1289 + }, + { + "questionId": "q43", + "format": "json", + "model": "gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 6388, + "outputTokens": 2, + "latencyMs": 2423 + }, + { + "questionId": "q43", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "15", + "correct": false, + "inputTokens": 7865, + "outputTokens": 5, + "latencyMs": 1273 + }, + { + "questionId": 
"q43", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 2525, + "outputTokens": 2, + "latencyMs": 975 + }, + { + "questionId": "q43", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "15", + "correct": false, + "inputTokens": 2977, + "outputTokens": 5, + "latencyMs": 1301 + }, + { + "questionId": "q43", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 2379, + "outputTokens": 2, + "latencyMs": 1423 + }, + { + "questionId": "q43", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "15", + "correct": false, + "inputTokens": 2851, + "outputTokens": 5, + "latencyMs": 927 + }, + { + "questionId": "q43", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 6314, + "outputTokens": 2, + "latencyMs": 1258 + }, + { + "questionId": "q43", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "15", + "correct": false, + "inputTokens": 6360, + "outputTokens": 5, + "latencyMs": 1250 + }, + { + "questionId": "q43", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 5010, + "outputTokens": 2, + "latencyMs": 872 + }, + { + "questionId": "q43", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "15", + "correct": false, + "inputTokens": 5755, + "outputTokens": 5, + "latencyMs": 1385 + }, + { + "questionId": "q44", + "format": "json", + "model": "gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 6388, + "outputTokens": 2, + "latencyMs": 1201 + }, + { + "questionId": "q44", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "15", + "correct": false, + "inputTokens": 7865, + "outputTokens": 5, + 
"latencyMs": 1149 + }, + { + "questionId": "q44", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 2525, + "outputTokens": 2, + "latencyMs": 1498 + }, + { + "questionId": "q44", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "15", + "correct": false, + "inputTokens": 2977, + "outputTokens": 5, + "latencyMs": 1149 + }, + { + "questionId": "q44", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 2379, + "outputTokens": 2, + "latencyMs": 1098 + }, + { + "questionId": "q44", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "15", + "correct": false, + "inputTokens": 2851, + "outputTokens": 5, + "latencyMs": 1121 + }, + { + "questionId": "q44", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 6314, + "outputTokens": 2, + "latencyMs": 2522 + }, + { + "questionId": "q44", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "10", + "correct": false, + "inputTokens": 6360, + "outputTokens": 5, + "latencyMs": 1532 + }, + { + "questionId": "q44", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "17", + "actual": "20", + "correct": false, + "inputTokens": 5010, + "outputTokens": 2, + "latencyMs": 4914 + }, + { + "questionId": "q44", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "17", + "actual": "15", + "correct": false, + "inputTokens": 5755, + "outputTokens": 5, + "latencyMs": 1324 + }, + { + "questionId": "q45", + "format": "json", + "model": "gpt-4o-mini", + "expected": "16", + "actual": "20", + "correct": false, + "inputTokens": 6388, + "outputTokens": 2, + "latencyMs": 1446 + }, + { + "questionId": "q45", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "16", + "actual": "12", + "correct": false, + 
"inputTokens": 7865, + "outputTokens": 5, + "latencyMs": 1105 + }, + { + "questionId": "q45", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "16", + "actual": "20", + "correct": false, + "inputTokens": 2525, + "outputTokens": 2, + "latencyMs": 1297 + }, + { + "questionId": "q45", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "16", + "actual": "15", + "correct": false, + "inputTokens": 2977, + "outputTokens": 5, + "latencyMs": 1251 + }, + { + "questionId": "q45", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "16", + "actual": "20", + "correct": false, + "inputTokens": 2379, + "outputTokens": 2, + "latencyMs": 1561 + }, + { + "questionId": "q45", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "16", + "actual": "15", + "correct": false, + "inputTokens": 2851, + "outputTokens": 5, + "latencyMs": 1292 + }, + { + "questionId": "q45", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "16", + "actual": "20", + "correct": false, + "inputTokens": 6314, + "outputTokens": 2, + "latencyMs": 1127 + }, + { + "questionId": "q45", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "16", + "actual": "12", + "correct": false, + "inputTokens": 6360, + "outputTokens": 5, + "latencyMs": 1207 + }, + { + "questionId": "q45", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "16", + "actual": "20", + "correct": false, + "inputTokens": 5010, + "outputTokens": 2, + "latencyMs": 1582 + }, + { + "questionId": "q45", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "16", + "actual": "15", + "correct": false, + "inputTokens": 5755, + "outputTokens": 5, + "latencyMs": 1278 + }, + { + "questionId": "q46", + "format": "json", + "model": "gpt-4o-mini", + "expected": "16", + "actual": "20", + "correct": false, + "inputTokens": 6388, + "outputTokens": 2, + "latencyMs": 1278 + }, + { + "questionId": "q46", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "16", + 
"actual": "10", + "correct": false, + "inputTokens": 7865, + "outputTokens": 5, + "latencyMs": 3084 + }, + { + "questionId": "q46", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "16", + "actual": "20", + "correct": false, + "inputTokens": 2525, + "outputTokens": 2, + "latencyMs": 1289 + }, + { + "questionId": "q46", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "16", + "actual": "15", + "correct": false, + "inputTokens": 2977, + "outputTokens": 5, + "latencyMs": 1591 + }, + { + "questionId": "q46", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "16", + "actual": "20", + "correct": false, + "inputTokens": 2379, + "outputTokens": 2, + "latencyMs": 3038 + }, + { + "questionId": "q46", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "16", + "actual": "15", + "correct": false, + "inputTokens": 2851, + "outputTokens": 5, + "latencyMs": 1447 + }, + { + "questionId": "q46", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "16", + "actual": "20", + "correct": false, + "inputTokens": 6314, + "outputTokens": 2, + "latencyMs": 1224 + }, + { + "questionId": "q46", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "16", + "actual": "10", + "correct": false, + "inputTokens": 6360, + "outputTokens": 5, + "latencyMs": 1250 + }, + { + "questionId": "q46", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "16", + "actual": "20", + "correct": false, + "inputTokens": 5010, + "outputTokens": 2, + "latencyMs": 1364 + }, + { + "questionId": "q46", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "16", + "actual": "12", + "correct": false, + "inputTokens": 5755, + "outputTokens": 5, + "latencyMs": 1560 + }, + { + "questionId": "q47", + "format": "json", + "model": "gpt-4o-mini", + "expected": "91", + "actual": "66", + "correct": false, + "inputTokens": 6393, + "outputTokens": 2, + "latencyMs": 989 + }, + { + "questionId": "q47", + "format": "json", + "model": 
"claude-haiku-4-5", + "expected": "91", + "actual": "89", + "correct": false, + "inputTokens": 7870, + "outputTokens": 5, + "latencyMs": 1358 + }, + { + "questionId": "q47", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "91", + "actual": "66", + "correct": false, + "inputTokens": 2530, + "outputTokens": 2, + "latencyMs": 1406 + }, + { + "questionId": "q47", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "91", + "actual": "85", + "correct": false, + "inputTokens": 2982, + "outputTokens": 5, + "latencyMs": 1123 + }, + { + "questionId": "q47", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "91", + "actual": "66", + "correct": false, + "inputTokens": 2384, + "outputTokens": 2, + "latencyMs": 4883 + }, + { + "questionId": "q47", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "91", + "actual": "85", + "correct": false, + "inputTokens": 2856, + "outputTokens": 5, + "latencyMs": 1402 + }, + { + "questionId": "q47", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "91", + "actual": "66", + "correct": false, + "inputTokens": 6319, + "outputTokens": 2, + "latencyMs": 1915 + }, + { + "questionId": "q47", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "91", + "actual": "89", + "correct": false, + "inputTokens": 6365, + "outputTokens": 5, + "latencyMs": 1263 + }, + { + "questionId": "q47", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "91", + "actual": "66", + "correct": false, + "inputTokens": 5015, + "outputTokens": 2, + "latencyMs": 1448 + }, + { + "questionId": "q47", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "91", + "actual": "89", + "correct": false, + "inputTokens": 5760, + "outputTokens": 5, + "latencyMs": 1243 + }, + { + "questionId": "q48", + "format": "json", + "model": "gpt-4o-mini", + "expected": "67", + "actual": "54", + "correct": false, + "inputTokens": 6393, + "outputTokens": 2, + "latencyMs": 1456 + }, + { + "questionId": 
"q48", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "67", + "actual": "57", + "correct": false, + "inputTokens": 7870, + "outputTokens": 5, + "latencyMs": 1186 + }, + { + "questionId": "q48", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "67", + "actual": "54", + "correct": false, + "inputTokens": 2530, + "outputTokens": 2, + "latencyMs": 1076 + }, + { + "questionId": "q48", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "67", + "actual": "47", + "correct": false, + "inputTokens": 2982, + "outputTokens": 5, + "latencyMs": 1168 + }, + { + "questionId": "q48", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "67", + "actual": "56", + "correct": false, + "inputTokens": 2384, + "outputTokens": 2, + "latencyMs": 3105 + }, + { + "questionId": "q48", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "67", + "actual": "47", + "correct": false, + "inputTokens": 2856, + "outputTokens": 5, + "latencyMs": 1375 + }, + { + "questionId": "q48", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "67", + "actual": "66", + "correct": false, + "inputTokens": 6319, + "outputTokens": 2, + "latencyMs": 1618 + }, + { + "questionId": "q48", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "67", + "actual": "47", + "correct": false, + "inputTokens": 6365, + "outputTokens": 5, + "latencyMs": 1454 + }, + { + "questionId": "q48", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "67", + "actual": "54", + "correct": false, + "inputTokens": 5015, + "outputTokens": 2, + "latencyMs": 1244 + }, + { + "questionId": "q48", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "67", + "actual": "57", + "correct": false, + "inputTokens": 5760, + "outputTokens": 5, + "latencyMs": 1113 + }, + { + "questionId": "q49", + "format": "json", + "model": "gpt-4o-mini", + "expected": "41", + "actual": "30", + "correct": false, + "inputTokens": 6393, + "outputTokens": 2, + 
"latencyMs": 1267 + }, + { + "questionId": "q49", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "41", + "actual": "31", + "correct": false, + "inputTokens": 7870, + "outputTokens": 5, + "latencyMs": 1227 + }, + { + "questionId": "q49", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "41", + "actual": "30", + "correct": false, + "inputTokens": 2530, + "outputTokens": 2, + "latencyMs": 1246 + }, + { + "questionId": "q49", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "41", + "actual": "27", + "correct": false, + "inputTokens": 2982, + "outputTokens": 5, + "latencyMs": 1127 + }, + { + "questionId": "q49", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "41", + "actual": "34", + "correct": false, + "inputTokens": 2384, + "outputTokens": 2, + "latencyMs": 1260 + }, + { + "questionId": "q49", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "41", + "actual": "31", + "correct": false, + "inputTokens": 2856, + "outputTokens": 5, + "latencyMs": 1293 + }, + { + "questionId": "q49", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "41", + "actual": "24", + "correct": false, + "inputTokens": 6319, + "outputTokens": 2, + "latencyMs": 1246 + }, + { + "questionId": "q49", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "41", + "actual": "27", + "correct": false, + "inputTokens": 6365, + "outputTokens": 5, + "latencyMs": 1598 + }, + { + "questionId": "q49", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "41", + "actual": "24", + "correct": false, + "inputTokens": 5015, + "outputTokens": 2, + "latencyMs": 1471 + }, + { + "questionId": "q49", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "41", + "actual": "31", + "correct": false, + "inputTokens": 5760, + "outputTokens": 5, + "latencyMs": 1311 + }, + { + "questionId": "q50", + "format": "json", + "model": "gpt-4o-mini", + "expected": "26", + "actual": "22", + "correct": false, + 
"inputTokens": 6393, + "outputTokens": 2, + "latencyMs": 3950 + }, + { + "questionId": "q50", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "26", + "actual": "20", + "correct": false, + "inputTokens": 7870, + "outputTokens": 5, + "latencyMs": 1075 + }, + { + "questionId": "q50", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "26", + "actual": "22", + "correct": false, + "inputTokens": 2530, + "outputTokens": 2, + "latencyMs": 1868 + }, + { + "questionId": "q50", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "26", + "actual": "16", + "correct": false, + "inputTokens": 2982, + "outputTokens": 5, + "latencyMs": 1075 + }, + { + "questionId": "q50", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "26", + "actual": "24", + "correct": false, + "inputTokens": 2384, + "outputTokens": 2, + "latencyMs": 1973 + }, + { + "questionId": "q50", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "26", + "actual": "16", + "correct": false, + "inputTokens": 2856, + "outputTokens": 5, + "latencyMs": 947 + }, + { + "questionId": "q50", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "26", + "actual": "22", + "correct": false, + "inputTokens": 6319, + "outputTokens": 2, + "latencyMs": 1414 + }, + { + "questionId": "q50", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "26", + "actual": "16", + "correct": false, + "inputTokens": 6365, + "outputTokens": 5, + "latencyMs": 1221 + }, + { + "questionId": "q50", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "26", + "actual": "18", + "correct": false, + "inputTokens": 5015, + "outputTokens": 2, + "latencyMs": 1148 + }, + { + "questionId": "q50", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "26", + "actual": "20", + "correct": false, + "inputTokens": 5760, + "outputTokens": 5, + "latencyMs": 1286 + }, + { + "questionId": "q51", + "format": "json", + "model": "gpt-4o-mini", + "expected": "78", + 
"actual": "66", + "correct": false, + "inputTokens": 6387, + "outputTokens": 2, + "latencyMs": 2525 + }, + { + "questionId": "q51", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "78", + "actual": "81", + "correct": false, + "inputTokens": 7864, + "outputTokens": 5, + "latencyMs": 1613 + }, + { + "questionId": "q51", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "78", + "actual": "66", + "correct": false, + "inputTokens": 2524, + "outputTokens": 2, + "latencyMs": 1132 + }, + { + "questionId": "q51", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "78", + "actual": "78", + "correct": true, + "inputTokens": 2976, + "outputTokens": 5, + "latencyMs": 1104 + }, + { + "questionId": "q51", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "78", + "actual": "77", + "correct": false, + "inputTokens": 2378, + "outputTokens": 2, + "latencyMs": 1069 + }, + { + "questionId": "q51", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "78", + "actual": "73", + "correct": false, + "inputTokens": 2850, + "outputTokens": 5, + "latencyMs": 1113 + }, + { + "questionId": "q51", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "78", + "actual": "66", + "correct": false, + "inputTokens": 6313, + "outputTokens": 2, + "latencyMs": 1999 + }, + { + "questionId": "q51", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "78", + "actual": "78", + "correct": true, + "inputTokens": 6359, + "outputTokens": 5, + "latencyMs": 1214 + }, + { + "questionId": "q51", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "78", + "actual": "66", + "correct": false, + "inputTokens": 5009, + "outputTokens": 2, + "latencyMs": 1613 + }, + { + "questionId": "q51", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "78", + "actual": "77", + "correct": false, + "inputTokens": 5754, + "outputTokens": 5, + "latencyMs": 1012 + }, + { + "questionId": "q52", + "format": "json", + "model": 
"gpt-4o-mini", + "expected": "22", + "actual": "30", + "correct": false, + "inputTokens": 6387, + "outputTokens": 2, + "latencyMs": 1580 + }, + { + "questionId": "q52", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "22", + "actual": "15", + "correct": false, + "inputTokens": 7864, + "outputTokens": 5, + "latencyMs": 1688 + }, + { + "questionId": "q52", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "22", + "actual": "22", + "correct": true, + "inputTokens": 2524, + "outputTokens": 2, + "latencyMs": 1290 + }, + { + "questionId": "q52", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "22", + "actual": "16", + "correct": false, + "inputTokens": 2976, + "outputTokens": 5, + "latencyMs": 1121 + }, + { + "questionId": "q52", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "22", + "actual": "10", + "correct": false, + "inputTokens": 2378, + "outputTokens": 2, + "latencyMs": 1544 + }, + { + "questionId": "q52", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "22", + "actual": "20", + "correct": false, + "inputTokens": 2850, + "outputTokens": 5, + "latencyMs": 822 + }, + { + "questionId": "q52", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "22", + "actual": "34", + "correct": false, + "inputTokens": 6313, + "outputTokens": 2, + "latencyMs": 2718 + }, + { + "questionId": "q52", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "22", + "actual": "15", + "correct": false, + "inputTokens": 6359, + "outputTokens": 5, + "latencyMs": 1211 + }, + { + "questionId": "q52", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "22", + "actual": "34", + "correct": false, + "inputTokens": 5009, + "outputTokens": 2, + "latencyMs": 1162 + }, + { + "questionId": "q52", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "22", + "actual": "16", + "correct": false, + "inputTokens": 5754, + "outputTokens": 5, + "latencyMs": 1156 + }, + { + "questionId": 
"q53", + "format": "json", + "model": "gpt-4o-mini", + "expected": "12", + "actual": "24", + "correct": false, + "inputTokens": 6395, + "outputTokens": 2, + "latencyMs": 1089 + }, + { + "questionId": "q53", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "12", + "actual": "9", + "correct": false, + "inputTokens": 7872, + "outputTokens": 5, + "latencyMs": 1368 + }, + { + "questionId": "q53", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "12", + "actual": "24", + "correct": false, + "inputTokens": 2532, + "outputTokens": 2, + "latencyMs": 1850 + }, + { + "questionId": "q53", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "12", + "actual": "9", + "correct": false, + "inputTokens": 2984, + "outputTokens": 5, + "latencyMs": 914 + }, + { + "questionId": "q53", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "12", + "actual": "34", + "correct": false, + "inputTokens": 2386, + "outputTokens": 2, + "latencyMs": 1156 + }, + { + "questionId": "q53", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "12", + "actual": "10", + "correct": false, + "inputTokens": 2858, + "outputTokens": 5, + "latencyMs": 1118 + }, + { + "questionId": "q53", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "12", + "actual": "22", + "correct": false, + "inputTokens": 6321, + "outputTokens": 2, + "latencyMs": 1020 + }, + { + "questionId": "q53", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "12", + "actual": "8", + "correct": false, + "inputTokens": 6367, + "outputTokens": 5, + "latencyMs": 1021 + }, + { + "questionId": "q53", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "12", + "actual": "18", + "correct": false, + "inputTokens": 5017, + "outputTokens": 2, + "latencyMs": 1236 + }, + { + "questionId": "q53", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "12", + "actual": "10", + "correct": false, + "inputTokens": 5762, + "outputTokens": 5, + 
"latencyMs": 1574 + }, + { + "questionId": "q54", + "format": "json", + "model": "gpt-4o-mini", + "expected": "11", + "actual": "24", + "correct": false, + "inputTokens": 6395, + "outputTokens": 2, + "latencyMs": 1437 + }, + { + "questionId": "q54", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "7", + "correct": false, + "inputTokens": 7872, + "outputTokens": 5, + "latencyMs": 1091 + }, + { + "questionId": "q54", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "11", + "actual": "24", + "correct": false, + "inputTokens": 2532, + "outputTokens": 2, + "latencyMs": 1917 + }, + { + "questionId": "q54", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "6", + "correct": false, + "inputTokens": 2984, + "outputTokens": 5, + "latencyMs": 1095 + }, + { + "questionId": "q54", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "11", + "actual": "34", + "correct": false, + "inputTokens": 2386, + "outputTokens": 2, + "latencyMs": 4230 + }, + { + "questionId": "q54", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "8", + "correct": false, + "inputTokens": 2858, + "outputTokens": 5, + "latencyMs": 1187 + }, + { + "questionId": "q54", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "11", + "actual": "24", + "correct": false, + "inputTokens": 6321, + "outputTokens": 2, + "latencyMs": 1197 + }, + { + "questionId": "q54", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "6", + "correct": false, + "inputTokens": 6367, + "outputTokens": 5, + "latencyMs": 1176 + }, + { + "questionId": "q54", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "11", + "actual": "18", + "correct": false, + "inputTokens": 5017, + "outputTokens": 2, + "latencyMs": 1249 + }, + { + "questionId": "q54", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "8", + "correct": false, + 
"inputTokens": 5762, + "outputTokens": 5, + "latencyMs": 1383 + }, + { + "questionId": "q55", + "format": "json", + "model": "gpt-4o-mini", + "expected": "11", + "actual": "30", + "correct": false, + "inputTokens": 6395, + "outputTokens": 2, + "latencyMs": 1149 + }, + { + "questionId": "q55", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "8", + "correct": false, + "inputTokens": 7872, + "outputTokens": 5, + "latencyMs": 1072 + }, + { + "questionId": "q55", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "11", + "actual": "18", + "correct": false, + "inputTokens": 2532, + "outputTokens": 2, + "latencyMs": 1213 + }, + { + "questionId": "q55", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "7", + "correct": false, + "inputTokens": 2984, + "outputTokens": 5, + "latencyMs": 1507 + }, + { + "questionId": "q55", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "11", + "actual": "34", + "correct": false, + "inputTokens": 2386, + "outputTokens": 2, + "latencyMs": 1826 + }, + { + "questionId": "q55", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "8", + "correct": false, + "inputTokens": 2858, + "outputTokens": 5, + "latencyMs": 1162 + }, + { + "questionId": "q55", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "11", + "actual": "24", + "correct": false, + "inputTokens": 6321, + "outputTokens": 2, + "latencyMs": 1008 + }, + { + "questionId": "q55", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "7", + "correct": false, + "inputTokens": 6367, + "outputTokens": 5, + "latencyMs": 1285 + }, + { + "questionId": "q55", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "11", + "actual": "22", + "correct": false, + "inputTokens": 5017, + "outputTokens": 2, + "latencyMs": 1124 + }, + { + "questionId": "q55", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "11", + 
"actual": "9", + "correct": false, + "inputTokens": 5762, + "outputTokens": 5, + "latencyMs": 1212 + }, + { + "questionId": "q56", + "format": "json", + "model": "gpt-4o-mini", + "expected": "12", + "actual": "22", + "correct": false, + "inputTokens": 6395, + "outputTokens": 2, + "latencyMs": 1232 + }, + { + "questionId": "q56", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "12", + "actual": "7", + "correct": false, + "inputTokens": 7872, + "outputTokens": 5, + "latencyMs": 1792 + }, + { + "questionId": "q56", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "12", + "actual": "12", + "correct": true, + "inputTokens": 2532, + "outputTokens": 2, + "latencyMs": 1357 + }, + { + "questionId": "q56", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "12", + "actual": "6", + "correct": false, + "inputTokens": 2984, + "outputTokens": 5, + "latencyMs": 1247 + }, + { + "questionId": "q56", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "12", + "actual": "22", + "correct": false, + "inputTokens": 2386, + "outputTokens": 2, + "latencyMs": 1043 + }, + { + "questionId": "q56", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "12", + "actual": "7", + "correct": false, + "inputTokens": 2858, + "outputTokens": 5, + "latencyMs": 1065 + }, + { + "questionId": "q56", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "12", + "actual": "10", + "correct": false, + "inputTokens": 6321, + "outputTokens": 2, + "latencyMs": 1298 + }, + { + "questionId": "q56", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "12", + "actual": "7", + "correct": false, + "inputTokens": 6367, + "outputTokens": 5, + "latencyMs": 1767 + }, + { + "questionId": "q56", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "12", + "actual": "10", + "correct": false, + "inputTokens": 5017, + "outputTokens": 2, + "latencyMs": 3525 + }, + { + "questionId": "q56", + "format": "yaml", + "model": 
"claude-haiku-4-5", + "expected": "12", + "actual": "8", + "correct": false, + "inputTokens": 5762, + "outputTokens": 5, + "latencyMs": 1355 + }, + { + "questionId": "q57", + "format": "json", + "model": "gpt-4o-mini", + "expected": "62", + "actual": "54", + "correct": false, + "inputTokens": 6394, + "outputTokens": 2, + "latencyMs": 1359 + }, + { + "questionId": "q57", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "62", + "actual": "62", + "correct": true, + "inputTokens": 7872, + "outputTokens": 5, + "latencyMs": 1447 + }, + { + "questionId": "q57", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "62", + "actual": "54", + "correct": false, + "inputTokens": 2531, + "outputTokens": 2, + "latencyMs": 3832 + }, + { + "questionId": "q57", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "62", + "actual": "62", + "correct": true, + "inputTokens": 2984, + "outputTokens": 5, + "latencyMs": 1143 + }, + { + "questionId": "q57", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "62", + "actual": "66", + "correct": false, + "inputTokens": 2385, + "outputTokens": 2, + "latencyMs": 1370 + }, + { + "questionId": "q57", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "62", + "actual": "62", + "correct": true, + "inputTokens": 2858, + "outputTokens": 5, + "latencyMs": 1042 + }, + { + "questionId": "q57", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "62", + "actual": "54", + "correct": false, + "inputTokens": 6320, + "outputTokens": 2, + "latencyMs": 1015 + }, + { + "questionId": "q57", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "62", + "actual": "62", + "correct": true, + "inputTokens": 6367, + "outputTokens": 5, + "latencyMs": 1395 + }, + { + "questionId": "q57", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "62", + "actual": "54", + "correct": false, + "inputTokens": 5016, + "outputTokens": 2, + "latencyMs": 1008 + }, + { + "questionId": 
"q57", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "62", + "actual": "62", + "correct": true, + "inputTokens": 5762, + "outputTokens": 5, + "latencyMs": 1191 + }, + { + "questionId": "q58", + "format": "json", + "model": "gpt-4o-mini", + "expected": "45", + "actual": "38", + "correct": false, + "inputTokens": 6394, + "outputTokens": 2, + "latencyMs": 1304 + }, + { + "questionId": "q58", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "45", + "actual": "42", + "correct": false, + "inputTokens": 7872, + "outputTokens": 5, + "latencyMs": 1386 + }, + { + "questionId": "q58", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "45", + "actual": "38", + "correct": false, + "inputTokens": 2531, + "outputTokens": 2, + "latencyMs": 1433 + }, + { + "questionId": "q58", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "45", + "actual": "42", + "correct": false, + "inputTokens": 2984, + "outputTokens": 5, + "latencyMs": 967 + }, + { + "questionId": "q58", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "45", + "actual": "42", + "correct": false, + "inputTokens": 2385, + "outputTokens": 2, + "latencyMs": 2469 + }, + { + "questionId": "q58", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "45", + "actual": "42", + "correct": false, + "inputTokens": 2858, + "outputTokens": 5, + "latencyMs": 1382 + }, + { + "questionId": "q58", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "45", + "actual": "38", + "correct": false, + "inputTokens": 6320, + "outputTokens": 2, + "latencyMs": 1658 + }, + { + "questionId": "q58", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "45", + "actual": "42", + "correct": false, + "inputTokens": 6367, + "outputTokens": 5, + "latencyMs": 1450 + }, + { + "questionId": "q58", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "45", + "actual": "38", + "correct": false, + "inputTokens": 5016, + "outputTokens": 2, + 
"latencyMs": 1428 + }, + { + "questionId": "q58", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "45", + "actual": "38", + "correct": false, + "inputTokens": 5762, + "outputTokens": 5, + "latencyMs": 1144 + }, + { + "questionId": "q59", + "format": "json", + "model": "gpt-4o-mini", + "expected": "96.17", + "actual": "96.17", + "correct": true, + "inputTokens": 9740, + "outputTokens": 4, + "latencyMs": 1577 + }, + { + "questionId": "q59", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "96.17", + "actual": "96.17", + "correct": true, + "inputTokens": 11907, + "outputTokens": 7, + "latencyMs": 1181 + }, + { + "questionId": "q59", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "96.17", + "actual": "96.17", + "correct": true, + "inputTokens": 6014, + "outputTokens": 4, + "latencyMs": 1231 + }, + { + "questionId": "q59", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "96.17", + "actual": "96.17", + "correct": true, + "inputTokens": 6993, + "outputTokens": 7, + "latencyMs": 1407 + }, + { + "questionId": "q59", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "96.17", + "actual": "96.17", + "correct": true, + "inputTokens": 6782, + "outputTokens": 4, + "latencyMs": 1393 + }, + { + "questionId": "q59", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "96.17", + "actual": "96.17", + "correct": true, + "inputTokens": 8414, + "outputTokens": 7, + "latencyMs": 1534 + }, + { + "questionId": "q59", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "96.17", + "actual": "96.17", + "correct": true, + "inputTokens": 9159, + "outputTokens": 4, + "latencyMs": 1456 + }, + { + "questionId": "q59", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "96.17", + "actual": "96.17", + "correct": true, + "inputTokens": 9289, + "outputTokens": 7, + "latencyMs": 1933 + }, + { + "questionId": "q59", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "96.17", + 
"actual": "96.17", + "correct": true, + "inputTokens": 7374, + "outputTokens": 4, + "latencyMs": 1472 + }, + { + "questionId": "q59", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "96.17", + "actual": "96.17", + "correct": true, + "inputTokens": 8385, + "outputTokens": 7, + "latencyMs": 1224 + }, + { + "questionId": "q60", + "format": "json", + "model": "gpt-4o-mini", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 9739, + "outputTokens": 3, + "latencyMs": 2069 + }, + { + "questionId": "q60", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 11906, + "outputTokens": 4, + "latencyMs": 1172 + }, + { + "questionId": "q60", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 6013, + "outputTokens": 3, + "latencyMs": 1236 + }, + { + "questionId": "q60", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 6992, + "outputTokens": 4, + "latencyMs": 1157 + }, + { + "questionId": "q60", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 6781, + "outputTokens": 3, + "latencyMs": 1364 + }, + { + "questionId": "q60", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 8413, + "outputTokens": 4, + "latencyMs": 1041 + }, + { + "questionId": "q60", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 9158, + "outputTokens": 3, + "latencyMs": 1478 + }, + { + "questionId": "q60", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 9288, + "outputTokens": 4, + 
"latencyMs": 1266 + }, + { + "questionId": "q60", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 7373, + "outputTokens": 3, + "latencyMs": 3477 + }, + { + "questionId": "q60", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 8384, + "outputTokens": 4, + "latencyMs": 2630 + }, + { + "questionId": "q61", + "format": "json", + "model": "gpt-4o-mini", + "expected": "599.39", + "actual": "599.39", + "correct": true, + "inputTokens": 9740, + "outputTokens": 4, + "latencyMs": 1479 + }, + { + "questionId": "q61", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "599.39", + "actual": "599.39", + "correct": true, + "inputTokens": 11907, + "outputTokens": 7, + "latencyMs": 1270 + }, + { + "questionId": "q61", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "599.39", + "actual": "599.39", + "correct": true, + "inputTokens": 6014, + "outputTokens": 4, + "latencyMs": 1270 + }, + { + "questionId": "q61", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "599.39", + "actual": "599.39", + "correct": true, + "inputTokens": 6993, + "outputTokens": 7, + "latencyMs": 1342 + }, + { + "questionId": "q61", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "599.39", + "actual": "599.39", + "correct": true, + "inputTokens": 6782, + "outputTokens": 4, + "latencyMs": 1350 + }, + { + "questionId": "q61", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "599.39", + "actual": "599.39", + "correct": true, + "inputTokens": 8414, + "outputTokens": 7, + "latencyMs": 1205 + }, + { + "questionId": "q61", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "599.39", + "actual": "599.39", + "correct": true, + "inputTokens": 9159, + "outputTokens": 4, + "latencyMs": 1502 + }, + { + "questionId": "q61", + "format": "markdown-kv", + "model": 
"claude-haiku-4-5", + "expected": "599.39", + "actual": "599.39", + "correct": true, + "inputTokens": 9289, + "outputTokens": 7, + "latencyMs": 1571 + }, + { + "questionId": "q61", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "599.39", + "actual": "599.39", + "correct": true, + "inputTokens": 7374, + "outputTokens": 4, + "latencyMs": 2013 + }, + { + "questionId": "q61", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "599.39", + "actual": "599.39", + "correct": true, + "inputTokens": 8385, + "outputTokens": 7, + "latencyMs": 1428 + }, + { + "questionId": "q62", + "format": "json", + "model": "gpt-4o-mini", + "expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 9739, + "outputTokens": 2, + "latencyMs": 1666 + }, + { + "questionId": "q62", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 11906, + "outputTokens": 4, + "latencyMs": 1549 + }, + { + "questionId": "q62", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 6013, + "outputTokens": 2, + "latencyMs": 1033 + }, + { + "questionId": "q62", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 6992, + "outputTokens": 4, + "latencyMs": 1061 + }, + { + "questionId": "q62", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 6781, + "outputTokens": 2, + "latencyMs": 2008 + }, + { + "questionId": "q62", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 8413, + "outputTokens": 4, + "latencyMs": 1214 + }, + { + "questionId": "q62", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "processing", + "actual": 
"processing", + "correct": true, + "inputTokens": 9158, + "outputTokens": 2, + "latencyMs": 1321 + }, + { + "questionId": "q62", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 9288, + "outputTokens": 4, + "latencyMs": 1311 + }, + { + "questionId": "q62", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 7373, + "outputTokens": 2, + "latencyMs": 1769 + }, + { + "questionId": "q62", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 8384, + "outputTokens": 4, + "latencyMs": 1157 + }, + { + "questionId": "q63", + "format": "json", + "model": "gpt-4o-mini", + "expected": "528.71", + "actual": "528.71", + "correct": true, + "inputTokens": 9740, + "outputTokens": 4, + "latencyMs": 1213 + }, + { + "questionId": "q63", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "528.71", + "actual": "528.71", + "correct": true, + "inputTokens": 11907, + "outputTokens": 7, + "latencyMs": 1332 + }, + { + "questionId": "q63", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "528.71", + "actual": "528.71", + "correct": true, + "inputTokens": 6014, + "outputTokens": 4, + "latencyMs": 3749 + }, + { + "questionId": "q63", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "528.71", + "actual": "528.71", + "correct": true, + "inputTokens": 6993, + "outputTokens": 7, + "latencyMs": 1326 + }, + { + "questionId": "q63", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "528.71", + "actual": "528.71", + "correct": true, + "inputTokens": 6782, + "outputTokens": 4, + "latencyMs": 947 + }, + { + "questionId": "q63", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "528.71", + "actual": "528.71", + "correct": true, + "inputTokens": 8414, + "outputTokens": 7, + 
"latencyMs": 1251 + }, + { + "questionId": "q63", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "528.71", + "actual": "528.71", + "correct": true, + "inputTokens": 9159, + "outputTokens": 4, + "latencyMs": 1428 + }, + { + "questionId": "q63", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "528.71", + "actual": "528.71", + "correct": true, + "inputTokens": 9289, + "outputTokens": 7, + "latencyMs": 1659 + }, + { + "questionId": "q63", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "528.71", + "actual": "528.71", + "correct": true, + "inputTokens": 7374, + "outputTokens": 4, + "latencyMs": 5584 + }, + { + "questionId": "q63", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "528.71", + "actual": "528.71", + "correct": true, + "inputTokens": 8385, + "outputTokens": 7, + "latencyMs": 1251 + }, + { + "questionId": "q64", + "format": "json", + "model": "gpt-4o-mini", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 9739, + "outputTokens": 2, + "latencyMs": 2425 + }, + { + "questionId": "q64", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 11906, + "outputTokens": 4, + "latencyMs": 1481 + }, + { + "questionId": "q64", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 6013, + "outputTokens": 2, + "latencyMs": 1109 + }, + { + "questionId": "q64", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 6992, + "outputTokens": 4, + "latencyMs": 1048 + }, + { + "questionId": "q64", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 6781, + "outputTokens": 2, + "latencyMs": 1256 + }, + { + "questionId": "q64", + "format": "csv", + "model": 
"claude-haiku-4-5", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 8413, + "outputTokens": 4, + "latencyMs": 1117 + }, + { + "questionId": "q64", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 9158, + "outputTokens": 2, + "latencyMs": 1168 + }, + { + "questionId": "q64", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 9288, + "outputTokens": 4, + "latencyMs": 1504 + }, + { + "questionId": "q64", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 7373, + "outputTokens": 2, + "latencyMs": 1134 + }, + { + "questionId": "q64", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 8384, + "outputTokens": 4, + "latencyMs": 1059 + }, + { + "questionId": "q65", + "format": "json", + "model": "gpt-4o-mini", + "expected": "1687.82", + "actual": "1687.82", + "correct": true, + "inputTokens": 9740, + "outputTokens": 5, + "latencyMs": 2361 + }, + { + "questionId": "q65", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "1687.82", + "actual": "1687.82", + "correct": true, + "inputTokens": 11907, + "outputTokens": 8, + "latencyMs": 1158 + }, + { + "questionId": "q65", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "1687.82", + "actual": "1687.82", + "correct": true, + "inputTokens": 6014, + "outputTokens": 5, + "latencyMs": 1493 + }, + { + "questionId": "q65", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "1687.82", + "actual": "1687.82", + "correct": true, + "inputTokens": 6993, + "outputTokens": 8, + "latencyMs": 1068 + }, + { + "questionId": "q65", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "1687.82", + "actual": "1687.82", + "correct": 
true, + "inputTokens": 6782, + "outputTokens": 5, + "latencyMs": 1490 + }, + { + "questionId": "q65", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "1687.82", + "actual": "1687.82", + "correct": true, + "inputTokens": 8414, + "outputTokens": 8, + "latencyMs": 1386 + }, + { + "questionId": "q65", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "1687.82", + "actual": "1687.82", + "correct": true, + "inputTokens": 9159, + "outputTokens": 5, + "latencyMs": 1470 + }, + { + "questionId": "q65", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "1687.82", + "actual": "1687.82", + "correct": true, + "inputTokens": 9289, + "outputTokens": 8, + "latencyMs": 1189 + }, + { + "questionId": "q65", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "1687.82", + "actual": "1687.82", + "correct": true, + "inputTokens": 7374, + "outputTokens": 5, + "latencyMs": 2824 + }, + { + "questionId": "q65", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "1687.82", + "actual": "1687.82", + "correct": true, + "inputTokens": 8385, + "outputTokens": 8, + "latencyMs": 1565 + }, + { + "questionId": "q66", + "format": "json", + "model": "gpt-4o-mini", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 9739, + "outputTokens": 3, + "latencyMs": 1480 + }, + { + "questionId": "q66", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 11906, + "outputTokens": 4, + "latencyMs": 1354 + }, + { + "questionId": "q66", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 6013, + "outputTokens": 3, + "latencyMs": 5334 + }, + { + "questionId": "q66", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 6992, + "outputTokens": 4, + 
"latencyMs": 1158 + }, + { + "questionId": "q66", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 6781, + "outputTokens": 3, + "latencyMs": 2043 + }, + { + "questionId": "q66", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 8413, + "outputTokens": 4, + "latencyMs": 1302 + }, + { + "questionId": "q66", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 9158, + "outputTokens": 3, + "latencyMs": 1006 + }, + { + "questionId": "q66", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 9288, + "outputTokens": 4, + "latencyMs": 1106 + }, + { + "questionId": "q66", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 7373, + "outputTokens": 3, + "latencyMs": 1801 + }, + { + "questionId": "q66", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 8384, + "outputTokens": 4, + "latencyMs": 1626 + }, + { + "questionId": "q67", + "format": "json", + "model": "gpt-4o-mini", + "expected": "423.6", + "actual": "423.6", + "correct": true, + "inputTokens": 9740, + "outputTokens": 4, + "latencyMs": 2107 + }, + { + "questionId": "q67", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "423.6", + "actual": "423.6", + "correct": true, + "inputTokens": 11907, + "outputTokens": 7, + "latencyMs": 1183 + }, + { + "questionId": "q67", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "423.6", + "actual": "423.6", + "correct": true, + "inputTokens": 6014, + "outputTokens": 4, + "latencyMs": 7091 + }, + { + "questionId": "q67", + "format": "toon", + 
"model": "claude-haiku-4-5", + "expected": "423.6", + "actual": "423.6", + "correct": true, + "inputTokens": 6993, + "outputTokens": 7, + "latencyMs": 1730 + }, + { + "questionId": "q67", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "423.6", + "actual": "423.6", + "correct": true, + "inputTokens": 6782, + "outputTokens": 4, + "latencyMs": 1222 + }, + { + "questionId": "q67", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "423.6", + "actual": "423.6", + "correct": true, + "inputTokens": 8414, + "outputTokens": 7, + "latencyMs": 1447 + }, + { + "questionId": "q67", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "423.6", + "actual": "423.6", + "correct": true, + "inputTokens": 9159, + "outputTokens": 4, + "latencyMs": 10295 + }, + { + "questionId": "q67", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "423.6", + "actual": "423.6", + "correct": true, + "inputTokens": 9289, + "outputTokens": 7, + "latencyMs": 1228 + }, + { + "questionId": "q67", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "423.6", + "actual": "423.6", + "correct": true, + "inputTokens": 7374, + "outputTokens": 4, + "latencyMs": 1748 + }, + { + "questionId": "q67", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "423.6", + "actual": "423.6", + "correct": true, + "inputTokens": 8385, + "outputTokens": 7, + "latencyMs": 1373 + }, + { + "questionId": "q68", + "format": "json", + "model": "gpt-4o-mini", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 9739, + "outputTokens": 3, + "latencyMs": 3836 + }, + { + "questionId": "q68", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 11906, + "outputTokens": 4, + "latencyMs": 1297 + }, + { + "questionId": "q68", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "delivered", + "actual": "delivered", + "correct": true, + 
"inputTokens": 6013, + "outputTokens": 3, + "latencyMs": 1927 + }, + { + "questionId": "q68", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 6992, + "outputTokens": 4, + "latencyMs": 1171 + }, + { + "questionId": "q68", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 6781, + "outputTokens": 3, + "latencyMs": 1551 + }, + { + "questionId": "q68", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 8413, + "outputTokens": 4, + "latencyMs": 1273 + }, + { + "questionId": "q68", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 9158, + "outputTokens": 3, + "latencyMs": 1387 + }, + { + "questionId": "q68", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 9288, + "outputTokens": 4, + "latencyMs": 1237 + }, + { + "questionId": "q68", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 7373, + "outputTokens": 3, + "latencyMs": 1934 + }, + { + "questionId": "q68", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 8384, + "outputTokens": 4, + "latencyMs": 1132 + }, + { + "questionId": "q69", + "format": "json", + "model": "gpt-4o-mini", + "expected": "784.03", + "actual": "784.03", + "correct": true, + "inputTokens": 9740, + "outputTokens": 4, + "latencyMs": 2267 + }, + { + "questionId": "q69", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "784.03", + "actual": "784.03", + "correct": true, + "inputTokens": 11907, + "outputTokens": 7, + "latencyMs": 
1772 + }, + { + "questionId": "q69", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "784.03", + "actual": "784.03", + "correct": true, + "inputTokens": 6014, + "outputTokens": 4, + "latencyMs": 1315 + }, + { + "questionId": "q69", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "784.03", + "actual": "784.03", + "correct": true, + "inputTokens": 6993, + "outputTokens": 7, + "latencyMs": 1165 + }, + { + "questionId": "q69", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "784.03", + "actual": "784.03", + "correct": true, + "inputTokens": 6782, + "outputTokens": 4, + "latencyMs": 1097 + }, + { + "questionId": "q69", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "784.03", + "actual": "784.03", + "correct": true, + "inputTokens": 8414, + "outputTokens": 7, + "latencyMs": 1299 + }, + { + "questionId": "q69", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "784.03", + "actual": "784.03", + "correct": true, + "inputTokens": 9159, + "outputTokens": 4, + "latencyMs": 1779 + }, + { + "questionId": "q69", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "784.03", + "actual": "784.03", + "correct": true, + "inputTokens": 9289, + "outputTokens": 7, + "latencyMs": 3153 + }, + { + "questionId": "q69", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "784.03", + "actual": "784.03", + "correct": true, + "inputTokens": 7374, + "outputTokens": 4, + "latencyMs": 1813 + }, + { + "questionId": "q69", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "784.03", + "actual": "784.03", + "correct": true, + "inputTokens": 8385, + "outputTokens": 7, + "latencyMs": 1867 + }, + { + "questionId": "q70", + "format": "json", + "model": "gpt-4o-mini", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 9739, + "outputTokens": 3, + "latencyMs": 1611 + }, + { + "questionId": "q70", + "format": "json", + "model": "claude-haiku-4-5", + "expected": 
"shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 11906, + "outputTokens": 4, + "latencyMs": 1173 + }, + { + "questionId": "q70", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 6013, + "outputTokens": 3, + "latencyMs": 1977 + }, + { + "questionId": "q70", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 6992, + "outputTokens": 4, + "latencyMs": 1108 + }, + { + "questionId": "q70", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 6781, + "outputTokens": 3, + "latencyMs": 1324 + }, + { + "questionId": "q70", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 8413, + "outputTokens": 4, + "latencyMs": 1225 + }, + { + "questionId": "q70", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 9158, + "outputTokens": 3, + "latencyMs": 1416 + }, + { + "questionId": "q70", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 9288, + "outputTokens": 4, + "latencyMs": 1200 + }, + { + "questionId": "q70", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 7373, + "outputTokens": 3, + "latencyMs": 1259 + }, + { + "questionId": "q70", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "shipped", + "actual": "shipped", + "correct": true, + "inputTokens": 8384, + "outputTokens": 4, + "latencyMs": 1433 + }, + { + "questionId": "q71", + "format": "json", + "model": "gpt-4o-mini", + "expected": "645.88", + "actual": "645.88", + "correct": true, + "inputTokens": 9740, + 
"outputTokens": 4, + "latencyMs": 1729 + }, + { + "questionId": "q71", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "645.88", + "actual": "645.88", + "correct": true, + "inputTokens": 11907, + "outputTokens": 7, + "latencyMs": 1143 + }, + { + "questionId": "q71", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "645.88", + "actual": "645.88", + "correct": true, + "inputTokens": 6014, + "outputTokens": 4, + "latencyMs": 1837 + }, + { + "questionId": "q71", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "645.88", + "actual": "645.88", + "correct": true, + "inputTokens": 6993, + "outputTokens": 7, + "latencyMs": 1147 + }, + { + "questionId": "q71", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "645.88", + "actual": "645.88", + "correct": true, + "inputTokens": 6782, + "outputTokens": 4, + "latencyMs": 1777 + }, + { + "questionId": "q71", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "645.88", + "actual": "645.88", + "correct": true, + "inputTokens": 8414, + "outputTokens": 7, + "latencyMs": 1295 + }, + { + "questionId": "q71", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "645.88", + "actual": "645.88", + "correct": true, + "inputTokens": 9159, + "outputTokens": 4, + "latencyMs": 1081 + }, + { + "questionId": "q71", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "645.88", + "actual": "645.88", + "correct": true, + "inputTokens": 9289, + "outputTokens": 7, + "latencyMs": 1692 + }, + { + "questionId": "q71", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "645.88", + "actual": "645.88", + "correct": true, + "inputTokens": 7374, + "outputTokens": 4, + "latencyMs": 1661 + }, + { + "questionId": "q71", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "645.88", + "actual": "645.88", + "correct": true, + "inputTokens": 8385, + "outputTokens": 7, + "latencyMs": 1475 + }, + { + "questionId": "q72", + "format": "json", + 
"model": "gpt-4o-mini", + "expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 9739, + "outputTokens": 2, + "latencyMs": 2979 + }, + { + "questionId": "q72", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 11906, + "outputTokens": 4, + "latencyMs": 1187 + }, + { + "questionId": "q72", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 6013, + "outputTokens": 2, + "latencyMs": 1620 + }, + { + "questionId": "q72", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 6992, + "outputTokens": 4, + "latencyMs": 1532 + }, + { + "questionId": "q72", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 6781, + "outputTokens": 2, + "latencyMs": 1616 + }, + { + "questionId": "q72", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 8413, + "outputTokens": 4, + "latencyMs": 1435 + }, + { + "questionId": "q72", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 9158, + "outputTokens": 2, + "latencyMs": 1190 + }, + { + "questionId": "q72", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 9288, + "outputTokens": 4, + "latencyMs": 1414 + }, + { + "questionId": "q72", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 7373, + "outputTokens": 2, + "latencyMs": 2335 + }, + { + "questionId": "q72", + "format": "yaml", + "model": "claude-haiku-4-5", + 
"expected": "processing", + "actual": "processing", + "correct": true, + "inputTokens": 8384, + "outputTokens": 4, + "latencyMs": 1308 + }, + { + "questionId": "q73", + "format": "json", + "model": "gpt-4o-mini", + "expected": "371.91", + "actual": "371.91", + "correct": true, + "inputTokens": 9740, + "outputTokens": 4, + "latencyMs": 3359 + }, + { + "questionId": "q73", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "371.91", + "actual": "371.91", + "correct": true, + "inputTokens": 11907, + "outputTokens": 7, + "latencyMs": 1227 + }, + { + "questionId": "q73", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "371.91", + "actual": "371.91", + "correct": true, + "inputTokens": 6014, + "outputTokens": 4, + "latencyMs": 1439 + }, + { + "questionId": "q73", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "371.91", + "actual": "371.91", + "correct": true, + "inputTokens": 6993, + "outputTokens": 7, + "latencyMs": 1179 + }, + { + "questionId": "q73", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "371.91", + "actual": "371.91", + "correct": true, + "inputTokens": 6782, + "outputTokens": 4, + "latencyMs": 1064 + }, + { + "questionId": "q73", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "371.91", + "actual": "371.91", + "correct": true, + "inputTokens": 8414, + "outputTokens": 7, + "latencyMs": 1144 + }, + { + "questionId": "q73", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "371.91", + "actual": "371.91", + "correct": true, + "inputTokens": 9159, + "outputTokens": 4, + "latencyMs": 1873 + }, + { + "questionId": "q73", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "371.91", + "actual": "371.91", + "correct": true, + "inputTokens": 9289, + "outputTokens": 7, + "latencyMs": 1302 + }, + { + "questionId": "q73", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "371.91", + "actual": "371.91", + "correct": true, + "inputTokens": 7374, + 
"outputTokens": 4, + "latencyMs": 1956 + }, + { + "questionId": "q73", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "371.91", + "actual": "371.91", + "correct": true, + "inputTokens": 8385, + "outputTokens": 7, + "latencyMs": 1281 + }, + { + "questionId": "q74", + "format": "json", + "model": "gpt-4o-mini", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 9739, + "outputTokens": 2, + "latencyMs": 1591 + }, + { + "questionId": "q74", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 11906, + "outputTokens": 4, + "latencyMs": 1279 + }, + { + "questionId": "q74", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 6013, + "outputTokens": 2, + "latencyMs": 3152 + }, + { + "questionId": "q74", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 6992, + "outputTokens": 4, + "latencyMs": 1061 + }, + { + "questionId": "q74", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 6781, + "outputTokens": 2, + "latencyMs": 1557 + }, + { + "questionId": "q74", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 8413, + "outputTokens": 4, + "latencyMs": 1313 + }, + { + "questionId": "q74", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 9158, + "outputTokens": 2, + "latencyMs": 1433 + }, + { + "questionId": "q74", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 9288, + "outputTokens": 4, + "latencyMs": 1812 + }, + { + "questionId": "q74", + "format": 
"yaml", + "model": "gpt-4o-mini", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 7373, + "outputTokens": 2, + "latencyMs": 1024 + }, + { + "questionId": "q74", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "pending", + "actual": "pending", + "correct": true, + "inputTokens": 8384, + "outputTokens": 4, + "latencyMs": 1243 + }, + { + "questionId": "q75", + "format": "json", + "model": "gpt-4o-mini", + "expected": "1066", + "actual": "1066", + "correct": true, + "inputTokens": 9740, + "outputTokens": 3, + "latencyMs": 1500 + }, + { + "questionId": "q75", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "1066", + "actual": "1066", + "correct": true, + "inputTokens": 11907, + "outputTokens": 6, + "latencyMs": 1275 + }, + { + "questionId": "q75", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "1066", + "actual": "1066", + "correct": true, + "inputTokens": 6014, + "outputTokens": 3, + "latencyMs": 1841 + }, + { + "questionId": "q75", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "1066", + "actual": "1066", + "correct": true, + "inputTokens": 6993, + "outputTokens": 6, + "latencyMs": 1080 + }, + { + "questionId": "q75", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "1066", + "actual": "1066", + "correct": true, + "inputTokens": 6782, + "outputTokens": 3, + "latencyMs": 1209 + }, + { + "questionId": "q75", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "1066", + "actual": "1066", + "correct": true, + "inputTokens": 8414, + "outputTokens": 6, + "latencyMs": 1308 + }, + { + "questionId": "q75", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "1066", + "actual": "1066", + "correct": true, + "inputTokens": 9159, + "outputTokens": 3, + "latencyMs": 1556 + }, + { + "questionId": "q75", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "1066", + "actual": "1066", + "correct": true, + "inputTokens": 9289, + 
"outputTokens": 6, + "latencyMs": 1240 + }, + { + "questionId": "q75", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "1066", + "actual": "1066", + "correct": true, + "inputTokens": 7374, + "outputTokens": 3, + "latencyMs": 1254 + }, + { + "questionId": "q75", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "1066", + "actual": "1066", + "correct": true, + "inputTokens": 8385, + "outputTokens": 6, + "latencyMs": 1305 + }, + { + "questionId": "q76", + "format": "json", + "model": "gpt-4o-mini", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 9739, + "outputTokens": 3, + "latencyMs": 2606 + }, + { + "questionId": "q76", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 11906, + "outputTokens": 4, + "latencyMs": 1422 + }, + { + "questionId": "q76", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 6013, + "outputTokens": 3, + "latencyMs": 2688 + }, + { + "questionId": "q76", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 6992, + "outputTokens": 4, + "latencyMs": 1041 + }, + { + "questionId": "q76", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 6781, + "outputTokens": 3, + "latencyMs": 3070 + }, + { + "questionId": "q76", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 8413, + "outputTokens": 4, + "latencyMs": 1167 + }, + { + "questionId": "q76", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 9158, + "outputTokens": 3, + "latencyMs": 1702 + }, + { + "questionId": "q76", + 
"format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 9288, + "outputTokens": 4, + "latencyMs": 1182 + }, + { + "questionId": "q76", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 7373, + "outputTokens": 3, + "latencyMs": 1740 + }, + { + "questionId": "q76", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "cancelled", + "actual": "cancelled", + "correct": true, + "inputTokens": 8384, + "outputTokens": 4, + "latencyMs": 1404 + }, + { + "questionId": "q77", + "format": "json", + "model": "gpt-4o-mini", + "expected": "1697.4", + "actual": "1697.4", + "correct": true, + "inputTokens": 9740, + "outputTokens": 5, + "latencyMs": 1596 + }, + { + "questionId": "q77", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "1697.4", + "actual": "1697.4", + "correct": true, + "inputTokens": 11907, + "outputTokens": 8, + "latencyMs": 2314 + }, + { + "questionId": "q77", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "1697.4", + "actual": "1697.4", + "correct": true, + "inputTokens": 6014, + "outputTokens": 5, + "latencyMs": 1114 + }, + { + "questionId": "q77", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "1697.4", + "actual": "1697.4", + "correct": true, + "inputTokens": 6993, + "outputTokens": 8, + "latencyMs": 1289 + }, + { + "questionId": "q77", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "1697.4", + "actual": "1697.4", + "correct": true, + "inputTokens": 6782, + "outputTokens": 5, + "latencyMs": 2428 + }, + { + "questionId": "q77", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "1697.4", + "actual": "1697.4", + "correct": true, + "inputTokens": 8414, + "outputTokens": 8, + "latencyMs": 1325 + }, + { + "questionId": "q77", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "1697.4", + "actual": 
"1697.4", + "correct": true, + "inputTokens": 9159, + "outputTokens": 5, + "latencyMs": 1343 + }, + { + "questionId": "q77", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "1697.4", + "actual": "1697.4", + "correct": true, + "inputTokens": 9289, + "outputTokens": 8, + "latencyMs": 1783 + }, + { + "questionId": "q77", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "1697.4", + "actual": "1697.4", + "correct": true, + "inputTokens": 7374, + "outputTokens": 5, + "latencyMs": 918 + }, + { + "questionId": "q77", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "1697.4", + "actual": "1697.4", + "correct": true, + "inputTokens": 8385, + "outputTokens": 8, + "latencyMs": 1308 + }, + { + "questionId": "q78", + "format": "json", + "model": "gpt-4o-mini", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 9739, + "outputTokens": 3, + "latencyMs": 1396 + }, + { + "questionId": "q78", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 11906, + "outputTokens": 4, + "latencyMs": 1225 + }, + { + "questionId": "q78", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 6013, + "outputTokens": 3, + "latencyMs": 2294 + }, + { + "questionId": "q78", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 6992, + "outputTokens": 4, + "latencyMs": 1418 + }, + { + "questionId": "q78", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 6781, + "outputTokens": 3, + "latencyMs": 1613 + }, + { + "questionId": "q78", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 8413, + "outputTokens": 
4, + "latencyMs": 1374 + }, + { + "questionId": "q78", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 9158, + "outputTokens": 3, + "latencyMs": 1341 + }, + { + "questionId": "q78", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 9288, + "outputTokens": 4, + "latencyMs": 1223 + }, + { + "questionId": "q78", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 7373, + "outputTokens": 3, + "latencyMs": 2230 + }, + { + "questionId": "q78", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "delivered", + "actual": "delivered", + "correct": true, + "inputTokens": 8384, + "outputTokens": 4, + "latencyMs": 1425 + }, + { + "questionId": "q79", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Valerie Braun", + "actual": "Valerie Braun", + "correct": true, + "inputTokens": 9740, + "outputTokens": 4, + "latencyMs": 1377 + }, + { + "questionId": "q79", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Valerie Braun", + "actual": "Valerie Braun", + "correct": true, + "inputTokens": 11907, + "outputTokens": 9, + "latencyMs": 1550 + }, + { + "questionId": "q79", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Valerie Braun", + "actual": "Valerie Braun", + "correct": true, + "inputTokens": 6014, + "outputTokens": 4, + "latencyMs": 1394 + }, + { + "questionId": "q79", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Valerie Braun", + "actual": "Valerie Braun", + "correct": true, + "inputTokens": 6993, + "outputTokens": 9, + "latencyMs": 1202 + }, + { + "questionId": "q79", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Valerie Braun", + "actual": "Valerie Braun", + "correct": true, + "inputTokens": 6782, + "outputTokens": 4, + 
"latencyMs": 1435 + }, + { + "questionId": "q79", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Valerie Braun", + "actual": "Valerie Braun", + "correct": true, + "inputTokens": 8414, + "outputTokens": 9, + "latencyMs": 1277 + }, + { + "questionId": "q79", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Valerie Braun", + "actual": "Valerie Braun", + "correct": true, + "inputTokens": 9159, + "outputTokens": 4, + "latencyMs": 1564 + }, + { + "questionId": "q79", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Valerie Braun", + "actual": "Valerie Braun", + "correct": true, + "inputTokens": 9289, + "outputTokens": 9, + "latencyMs": 1200 + }, + { + "questionId": "q79", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Valerie Braun", + "actual": "Valerie Braun", + "correct": true, + "inputTokens": 7374, + "outputTokens": 4, + "latencyMs": 1596 + }, + { + "questionId": "q79", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Valerie Braun", + "actual": "Valerie Braun", + "correct": true, + "inputTokens": 8385, + "outputTokens": 9, + "latencyMs": 1151 + }, + { + "questionId": "q80", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Anita Kozey", + "actual": "Anita Kozey", + "correct": true, + "inputTokens": 9740, + "outputTokens": 5, + "latencyMs": 1458 + }, + { + "questionId": "q80", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Anita Kozey", + "actual": "Anita Kozey", + "correct": true, + "inputTokens": 11907, + "outputTokens": 9, + "latencyMs": 1283 + }, + { + "questionId": "q80", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Anita Kozey", + "actual": "Anita Kozey", + "correct": true, + "inputTokens": 6014, + "outputTokens": 5, + "latencyMs": 4702 + }, + { + "questionId": "q80", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Anita Kozey", + "actual": "Anita Kozey", + "correct": true, + "inputTokens": 6993, + 
"outputTokens": 9, + "latencyMs": 1360 + }, + { + "questionId": "q80", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Anita Kozey", + "actual": "Anita Kozey", + "correct": true, + "inputTokens": 6782, + "outputTokens": 5, + "latencyMs": 6167 + }, + { + "questionId": "q80", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Anita Kozey", + "actual": "Anita Kozey", + "correct": true, + "inputTokens": 8414, + "outputTokens": 9, + "latencyMs": 1449 + }, + { + "questionId": "q80", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Anita Kozey", + "actual": "Anita Kozey", + "correct": true, + "inputTokens": 9159, + "outputTokens": 5, + "latencyMs": 6096 + }, + { + "questionId": "q80", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Anita Kozey", + "actual": "Anita Kozey", + "correct": true, + "inputTokens": 9289, + "outputTokens": 9, + "latencyMs": 1194 + }, + { + "questionId": "q80", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Anita Kozey", + "actual": "Anita Kozey", + "correct": true, + "inputTokens": 7374, + "outputTokens": 5, + "latencyMs": 7357 + }, + { + "questionId": "q80", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Anita Kozey", + "actual": "Anita Kozey", + "correct": true, + "inputTokens": 8385, + "outputTokens": 9, + "latencyMs": 1213 + }, + { + "questionId": "q81", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Elmer Kub PhD", + "actual": "Elmer Kub PhD", + "correct": true, + "inputTokens": 9740, + "outputTokens": 6, + "latencyMs": 2539 + }, + { + "questionId": "q81", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Elmer Kub PhD", + "actual": "Elmer Kub PhD", + "correct": true, + "inputTokens": 11907, + "outputTokens": 10, + "latencyMs": 1532 + }, + { + "questionId": "q81", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Elmer Kub PhD", + "actual": "Elmer Kub PhD", + "correct": true, + "inputTokens": 6014, + 
"outputTokens": 6, + "latencyMs": 2960 + }, + { + "questionId": "q81", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Elmer Kub PhD", + "actual": "Elmer Kub PhD", + "correct": true, + "inputTokens": 6993, + "outputTokens": 10, + "latencyMs": 1547 + }, + { + "questionId": "q81", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Elmer Kub PhD", + "actual": "Elmer Kub PhD", + "correct": true, + "inputTokens": 6782, + "outputTokens": 6, + "latencyMs": 1358 + }, + { + "questionId": "q81", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Elmer Kub PhD", + "actual": "Elmer Kub PhD", + "correct": true, + "inputTokens": 8414, + "outputTokens": 10, + "latencyMs": 1424 + }, + { + "questionId": "q81", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Elmer Kub PhD", + "actual": "Elmer Kub PhD", + "correct": true, + "inputTokens": 9159, + "outputTokens": 6, + "latencyMs": 958 + }, + { + "questionId": "q81", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Elmer Kub PhD", + "actual": "Elmer Kub PhD", + "correct": true, + "inputTokens": 9289, + "outputTokens": 10, + "latencyMs": 1381 + }, + { + "questionId": "q81", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Elmer Kub PhD", + "actual": "Elmer Kub PhD", + "correct": true, + "inputTokens": 7374, + "outputTokens": 6, + "latencyMs": 1372 + }, + { + "questionId": "q81", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Elmer Kub PhD", + "actual": "Elmer Kub PhD", + "correct": true, + "inputTokens": 8385, + "outputTokens": 10, + "latencyMs": 1715 + }, + { + "questionId": "q82", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Maxine Zemlak", + "actual": "Maxine Zemlak", + "correct": true, + "inputTokens": 9740, + "outputTokens": 5, + "latencyMs": 1972 + }, + { + "questionId": "q82", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Maxine Zemlak", + "actual": "Maxine Zemlak", + "correct": 
true, + "inputTokens": 11907, + "outputTokens": 10, + "latencyMs": 1315 + }, + { + "questionId": "q82", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Maxine Zemlak", + "actual": "Maxine Zemlak", + "correct": true, + "inputTokens": 6014, + "outputTokens": 5, + "latencyMs": 1634 + }, + { + "questionId": "q82", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Maxine Zemlak", + "actual": "Maxine Zemlak", + "correct": true, + "inputTokens": 6993, + "outputTokens": 10, + "latencyMs": 1264 + }, + { + "questionId": "q82", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Maxine Zemlak", + "actual": "Maxine Zemlak", + "correct": true, + "inputTokens": 6782, + "outputTokens": 5, + "latencyMs": 1153 + }, + { + "questionId": "q82", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Maxine Zemlak", + "actual": "Maxine Zemlak", + "correct": true, + "inputTokens": 8414, + "outputTokens": 10, + "latencyMs": 1252 + }, + { + "questionId": "q82", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Maxine Zemlak", + "actual": "Maxine Zemlak", + "correct": true, + "inputTokens": 9159, + "outputTokens": 5, + "latencyMs": 1697 + }, + { + "questionId": "q82", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Maxine Zemlak", + "actual": "Maxine Zemlak", + "correct": true, + "inputTokens": 9289, + "outputTokens": 10, + "latencyMs": 1198 + }, + { + "questionId": "q82", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Maxine Zemlak", + "actual": "Maxine Zemlak", + "correct": true, + "inputTokens": 7374, + "outputTokens": 5, + "latencyMs": 1854 + }, + { + "questionId": "q82", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Maxine Zemlak", + "actual": "Maxine Zemlak", + "correct": true, + "inputTokens": 8385, + "outputTokens": 10, + "latencyMs": 1752 + }, + { + "questionId": "q83", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Emanuel Littel", + "actual": 
"Emanuel Littel", + "correct": true, + "inputTokens": 9740, + "outputTokens": 5, + "latencyMs": 2076 + }, + { + "questionId": "q83", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Emanuel Littel", + "actual": "Emanuel Littel", + "correct": true, + "inputTokens": 11907, + "outputTokens": 7, + "latencyMs": 1398 + }, + { + "questionId": "q83", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Emanuel Littel", + "actual": "Emanuel Littel", + "correct": true, + "inputTokens": 6014, + "outputTokens": 5, + "latencyMs": 2263 + }, + { + "questionId": "q83", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Emanuel Littel", + "actual": "Emanuel Littel", + "correct": true, + "inputTokens": 6993, + "outputTokens": 7, + "latencyMs": 3101 + }, + { + "questionId": "q83", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Emanuel Littel", + "actual": "Emanuel Littel", + "correct": true, + "inputTokens": 6782, + "outputTokens": 5, + "latencyMs": 1453 + }, + { + "questionId": "q83", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Emanuel Littel", + "actual": "Emanuel Littel", + "correct": true, + "inputTokens": 8414, + "outputTokens": 7, + "latencyMs": 1265 + }, + { + "questionId": "q83", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Emanuel Littel", + "actual": "Emanuel Littel", + "correct": true, + "inputTokens": 9159, + "outputTokens": 5, + "latencyMs": 8807 + }, + { + "questionId": "q83", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Emanuel Littel", + "actual": "Emanuel Littel", + "correct": true, + "inputTokens": 9289, + "outputTokens": 7, + "latencyMs": 1097 + }, + { + "questionId": "q83", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Emanuel Littel", + "actual": "Emanuel Littel", + "correct": true, + "inputTokens": 7374, + "outputTokens": 5, + "latencyMs": 1667 + }, + { + "questionId": "q83", + "format": "yaml", + "model": 
"claude-haiku-4-5", + "expected": "Emanuel Littel", + "actual": "Emanuel Littel", + "correct": true, + "inputTokens": 8385, + "outputTokens": 7, + "latencyMs": 1198 + }, + { + "questionId": "q84", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Andrew Kling", + "actual": "Andrew Kling", + "correct": true, + "inputTokens": 9740, + "outputTokens": 3, + "latencyMs": 2292 + }, + { + "questionId": "q84", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Andrew Kling", + "actual": "Andrew Kling", + "correct": true, + "inputTokens": 11907, + "outputTokens": 7, + "latencyMs": 1202 + }, + { + "questionId": "q84", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Andrew Kling", + "actual": "Andrew Kling", + "correct": true, + "inputTokens": 6014, + "outputTokens": 3, + "latencyMs": 1801 + }, + { + "questionId": "q84", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Andrew Kling", + "actual": "Andrew Kling", + "correct": true, + "inputTokens": 6993, + "outputTokens": 7, + "latencyMs": 1287 + }, + { + "questionId": "q84", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Andrew Kling", + "actual": "Andrew Kling", + "correct": true, + "inputTokens": 6782, + "outputTokens": 3, + "latencyMs": 1340 + }, + { + "questionId": "q84", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Andrew Kling", + "actual": "Andrew Kling", + "correct": true, + "inputTokens": 8414, + "outputTokens": 7, + "latencyMs": 1163 + }, + { + "questionId": "q84", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Andrew Kling", + "actual": "Andrew Kling", + "correct": true, + "inputTokens": 9159, + "outputTokens": 3, + "latencyMs": 2685 + }, + { + "questionId": "q84", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Andrew Kling", + "actual": "Andrew Kling", + "correct": true, + "inputTokens": 9289, + "outputTokens": 7, + "latencyMs": 1397 + }, + { + "questionId": "q84", + "format": 
"yaml", + "model": "gpt-4o-mini", + "expected": "Andrew Kling", + "actual": "Andrew Kling", + "correct": true, + "inputTokens": 7374, + "outputTokens": 3, + "latencyMs": 1289 + }, + { + "questionId": "q84", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Andrew Kling", + "actual": "Andrew Kling", + "correct": true, + "inputTokens": 8385, + "outputTokens": 7, + "latencyMs": 1155 + }, + { + "questionId": "q85", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Morris O'Hara", + "actual": "Morris O'Hara", + "correct": true, + "inputTokens": 9740, + "outputTokens": 6, + "latencyMs": 1601 + }, + { + "questionId": "q85", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Morris O'Hara", + "actual": "Morris O'Hara", + "correct": true, + "inputTokens": 11907, + "outputTokens": 9, + "latencyMs": 1340 + }, + { + "questionId": "q85", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Morris O'Hara", + "actual": "Morris O'Hara", + "correct": true, + "inputTokens": 6014, + "outputTokens": 6, + "latencyMs": 3525 + }, + { + "questionId": "q85", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Morris O'Hara", + "actual": "Morris O'Hara", + "correct": true, + "inputTokens": 6993, + "outputTokens": 9, + "latencyMs": 1710 + }, + { + "questionId": "q85", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Morris O'Hara", + "actual": "Morris O'Hara", + "correct": true, + "inputTokens": 6782, + "outputTokens": 6, + "latencyMs": 2333 + }, + { + "questionId": "q85", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Morris O'Hara", + "actual": "Morris O'Hara", + "correct": true, + "inputTokens": 8414, + "outputTokens": 9, + "latencyMs": 1168 + }, + { + "questionId": "q85", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Morris O'Hara", + "actual": "Morris O'Hara", + "correct": true, + "inputTokens": 9159, + "outputTokens": 6, + "latencyMs": 1781 + }, + { + "questionId": "q85", 
+ "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Morris O'Hara", + "actual": "Morris O'Hara", + "correct": true, + "inputTokens": 9289, + "outputTokens": 9, + "latencyMs": 1552 + }, + { + "questionId": "q85", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Morris O'Hara", + "actual": "Morris O'Hara", + "correct": true, + "inputTokens": 7374, + "outputTokens": 6, + "latencyMs": 1584 + }, + { + "questionId": "q85", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Morris O'Hara", + "actual": "Morris O'Hara", + "correct": true, + "inputTokens": 8385, + "outputTokens": 9, + "latencyMs": 1548 + }, + { + "questionId": "q86", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Elijah Franecki", + "actual": "Elijah Franecki", + "correct": true, + "inputTokens": 9740, + "outputTokens": 6, + "latencyMs": 7230 + }, + { + "questionId": "q86", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Elijah Franecki", + "actual": "Elijah Franecki", + "correct": true, + "inputTokens": 11907, + "outputTokens": 9, + "latencyMs": 1933 + }, + { + "questionId": "q86", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Elijah Franecki", + "actual": "Elijah Franecki", + "correct": true, + "inputTokens": 6014, + "outputTokens": 6, + "latencyMs": 1067 + }, + { + "questionId": "q86", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Elijah Franecki", + "actual": "Elijah Franecki", + "correct": true, + "inputTokens": 6993, + "outputTokens": 9, + "latencyMs": 1288 + }, + { + "questionId": "q86", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Elijah Franecki", + "actual": "Elijah Franecki", + "correct": true, + "inputTokens": 6782, + "outputTokens": 6, + "latencyMs": 3954 + }, + { + "questionId": "q86", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Elijah Franecki", + "actual": "Elijah Franecki", + "correct": true, + "inputTokens": 8414, + "outputTokens": 9, + 
"latencyMs": 1314 + }, + { + "questionId": "q86", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Elijah Franecki", + "actual": "Elijah Franecki", + "correct": true, + "inputTokens": 9159, + "outputTokens": 6, + "latencyMs": 1334 + }, + { + "questionId": "q86", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Elijah Franecki", + "actual": "Elijah Franecki", + "correct": true, + "inputTokens": 9289, + "outputTokens": 9, + "latencyMs": 2441 + }, + { + "questionId": "q86", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Elijah Franecki", + "actual": "Elijah Franecki", + "correct": true, + "inputTokens": 7374, + "outputTokens": 6, + "latencyMs": 1650 + }, + { + "questionId": "q86", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Elijah Franecki", + "actual": "Elijah Franecki", + "correct": true, + "inputTokens": 8385, + "outputTokens": 9, + "latencyMs": 1495 + }, + { + "questionId": "q87", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Malcolm Erdman", + "actual": "Malcolm Erdman", + "correct": true, + "inputTokens": 9740, + "outputTokens": 5, + "latencyMs": 1262 + }, + { + "questionId": "q87", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Malcolm Erdman", + "actual": "Malcolm Erdman", + "correct": true, + "inputTokens": 11907, + "outputTokens": 7, + "latencyMs": 1367 + }, + { + "questionId": "q87", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Malcolm Erdman", + "actual": "Malcolm Erdman", + "correct": true, + "inputTokens": 6014, + "outputTokens": 5, + "latencyMs": 1385 + }, + { + "questionId": "q87", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Malcolm Erdman", + "actual": "Malcolm Erdman", + "correct": true, + "inputTokens": 6993, + "outputTokens": 7, + "latencyMs": 1313 + }, + { + "questionId": "q87", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Malcolm Erdman", + "actual": "Malcolm Erdman", + "correct": 
true, + "inputTokens": 6782, + "outputTokens": 5, + "latencyMs": 1141 + }, + { + "questionId": "q87", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Malcolm Erdman", + "actual": "Malcolm Erdman", + "correct": true, + "inputTokens": 8414, + "outputTokens": 7, + "latencyMs": 1300 + }, + { + "questionId": "q87", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Malcolm Erdman", + "actual": "Malcolm Erdman", + "correct": true, + "inputTokens": 9159, + "outputTokens": 5, + "latencyMs": 3347 + }, + { + "questionId": "q87", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Malcolm Erdman", + "actual": "Malcolm Erdman", + "correct": true, + "inputTokens": 9289, + "outputTokens": 7, + "latencyMs": 1457 + }, + { + "questionId": "q87", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Malcolm Erdman", + "actual": "Malcolm Erdman", + "correct": true, + "inputTokens": 7374, + "outputTokens": 5, + "latencyMs": 1276 + }, + { + "questionId": "q87", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Malcolm Erdman", + "actual": "Malcolm Erdman", + "correct": true, + "inputTokens": 8385, + "outputTokens": 7, + "latencyMs": 1211 + }, + { + "questionId": "q88", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Fannie Skiles", + "actual": "Fannie Skiles", + "correct": true, + "inputTokens": 9740, + "outputTokens": 5, + "latencyMs": 1635 + }, + { + "questionId": "q88", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Fannie Skiles", + "actual": "Fannie Skiles", + "correct": true, + "inputTokens": 11907, + "outputTokens": 9, + "latencyMs": 1582 + }, + { + "questionId": "q88", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Fannie Skiles", + "actual": "Fannie Skiles", + "correct": true, + "inputTokens": 6014, + "outputTokens": 5, + "latencyMs": 1695 + }, + { + "questionId": "q88", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Fannie Skiles", + 
"actual": "Fannie Skiles", + "correct": true, + "inputTokens": 6993, + "outputTokens": 9, + "latencyMs": 1318 + }, + { + "questionId": "q88", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Fannie Skiles", + "actual": "Fannie Skiles", + "correct": true, + "inputTokens": 6782, + "outputTokens": 5, + "latencyMs": 936 + }, + { + "questionId": "q88", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Fannie Skiles", + "actual": "Fannie Skiles", + "correct": true, + "inputTokens": 8414, + "outputTokens": 9, + "latencyMs": 1204 + }, + { + "questionId": "q88", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Fannie Skiles", + "actual": "Fannie Skiles", + "correct": true, + "inputTokens": 9159, + "outputTokens": 5, + "latencyMs": 996 + }, + { + "questionId": "q88", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Fannie Skiles", + "actual": "Fannie Skiles", + "correct": true, + "inputTokens": 9289, + "outputTokens": 9, + "latencyMs": 1261 + }, + { + "questionId": "q88", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Fannie Skiles", + "actual": "Fannie Skiles", + "correct": true, + "inputTokens": 7374, + "outputTokens": 5, + "latencyMs": 2276 + }, + { + "questionId": "q88", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Fannie Skiles", + "actual": "Fannie Skiles", + "correct": true, + "inputTokens": 8385, + "outputTokens": 9, + "latencyMs": 1380 + }, + { + "questionId": "q89", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Sonja Emmerich", + "actual": "Sonja Emmerich", + "correct": true, + "inputTokens": 9740, + "outputTokens": 6, + "latencyMs": 1451 + }, + { + "questionId": "q89", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Sonja Emmerich", + "actual": "Sonja Emmerich", + "correct": true, + "inputTokens": 11907, + "outputTokens": 10, + "latencyMs": 1977 + }, + { + "questionId": "q89", + "format": "toon", + "model": "gpt-4o-mini", + 
"expected": "Sonja Emmerich", + "actual": "Sonja Emmerich", + "correct": true, + "inputTokens": 6014, + "outputTokens": 6, + "latencyMs": 1376 + }, + { + "questionId": "q89", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Sonja Emmerich", + "actual": "Sonja Emmerich", + "correct": true, + "inputTokens": 6993, + "outputTokens": 10, + "latencyMs": 1250 + }, + { + "questionId": "q89", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Sonja Emmerich", + "actual": "Sonja Emmerich", + "correct": true, + "inputTokens": 6782, + "outputTokens": 6, + "latencyMs": 1273 + }, + { + "questionId": "q89", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Sonja Emmerich", + "actual": "Sonja Emmerich", + "correct": true, + "inputTokens": 8414, + "outputTokens": 10, + "latencyMs": 1359 + }, + { + "questionId": "q89", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Sonja Emmerich", + "actual": "Sonja Emmerich", + "correct": true, + "inputTokens": 9159, + "outputTokens": 6, + "latencyMs": 1791 + }, + { + "questionId": "q89", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Sonja Emmerich", + "actual": "Sonja Emmerich", + "correct": true, + "inputTokens": 9289, + "outputTokens": 10, + "latencyMs": 1273 + }, + { + "questionId": "q89", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Sonja Emmerich", + "actual": "Sonja Emmerich", + "correct": true, + "inputTokens": 7374, + "outputTokens": 6, + "latencyMs": 2832 + }, + { + "questionId": "q89", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Sonja Emmerich", + "actual": "Sonja Emmerich", + "correct": true, + "inputTokens": 8385, + "outputTokens": 10, + "latencyMs": 1172 + }, + { + "questionId": "q90", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Frank Emmerich DVM", + "actual": "Frank Emmerich DVM", + "correct": true, + "inputTokens": 9740, + "outputTokens": 7, + "latencyMs": 1491 + }, + { + "questionId": 
"q90", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Frank Emmerich DVM", + "actual": "Frank Emmerich DVM", + "correct": true, + "inputTokens": 11907, + "outputTokens": 10, + "latencyMs": 1414 + }, + { + "questionId": "q90", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Frank Emmerich DVM", + "actual": "Frank Emmerich DVM", + "correct": true, + "inputTokens": 6014, + "outputTokens": 7, + "latencyMs": 1396 + }, + { + "questionId": "q90", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Frank Emmerich DVM", + "actual": "Frank Emmerich DVM", + "correct": true, + "inputTokens": 6993, + "outputTokens": 10, + "latencyMs": 1514 + }, + { + "questionId": "q90", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Frank Emmerich DVM", + "actual": "Frank Emmerich DVM", + "correct": true, + "inputTokens": 6782, + "outputTokens": 7, + "latencyMs": 1573 + }, + { + "questionId": "q90", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Frank Emmerich DVM", + "actual": "Frank Emmerich DVM", + "correct": true, + "inputTokens": 8414, + "outputTokens": 10, + "latencyMs": 1284 + }, + { + "questionId": "q90", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Frank Emmerich DVM", + "actual": "Frank Emmerich DVM", + "correct": true, + "inputTokens": 9159, + "outputTokens": 7, + "latencyMs": 5400 + }, + { + "questionId": "q90", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Frank Emmerich DVM", + "actual": "Frank Emmerich DVM", + "correct": true, + "inputTokens": 9289, + "outputTokens": 10, + "latencyMs": 1486 + }, + { + "questionId": "q90", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Frank Emmerich DVM", + "actual": "Frank Emmerich DVM", + "correct": true, + "inputTokens": 7374, + "outputTokens": 7, + "latencyMs": 1420 + }, + { + "questionId": "q90", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Frank Emmerich DVM", + "actual": "Frank 
Emmerich DVM", + "correct": true, + "inputTokens": 8385, + "outputTokens": 10, + "latencyMs": 1410 + }, + { + "questionId": "q91", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Ronald Collins", + "actual": "Ronald Collins", + "correct": true, + "inputTokens": 9740, + "outputTokens": 4, + "latencyMs": 1248 + }, + { + "questionId": "q91", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Ronald Collins", + "actual": "Ronald Collins", + "correct": true, + "inputTokens": 11907, + "outputTokens": 5, + "latencyMs": 1177 + }, + { + "questionId": "q91", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Ronald Collins", + "actual": "Ronald Collins", + "correct": true, + "inputTokens": 6014, + "outputTokens": 4, + "latencyMs": 1601 + }, + { + "questionId": "q91", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Ronald Collins", + "actual": "Ronald Collins", + "correct": true, + "inputTokens": 6993, + "outputTokens": 5, + "latencyMs": 1822 + }, + { + "questionId": "q91", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Ronald Collins", + "actual": "Ronald Collins", + "correct": true, + "inputTokens": 6782, + "outputTokens": 4, + "latencyMs": 1103 + }, + { + "questionId": "q91", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Ronald Collins", + "actual": "Ronald Collins", + "correct": true, + "inputTokens": 8414, + "outputTokens": 5, + "latencyMs": 1247 + }, + { + "questionId": "q91", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Ronald Collins", + "actual": "Ronald Collins", + "correct": true, + "inputTokens": 9159, + "outputTokens": 4, + "latencyMs": 1184 + }, + { + "questionId": "q91", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Ronald Collins", + "actual": "Ronald Collins", + "correct": true, + "inputTokens": 9289, + "outputTokens": 5, + "latencyMs": 1137 + }, + { + "questionId": "q91", + "format": "yaml", + "model": "gpt-4o-mini", + 
"expected": "Ronald Collins", + "actual": "Ronald Collins", + "correct": true, + "inputTokens": 7374, + "outputTokens": 4, + "latencyMs": 949 + }, + { + "questionId": "q91", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Ronald Collins", + "actual": "Ronald Collins", + "correct": true, + "inputTokens": 8385, + "outputTokens": 5, + "latencyMs": 1143 + }, + { + "questionId": "q92", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Jeannie Klein", + "actual": "Jeannie Klein", + "correct": true, + "inputTokens": 9740, + "outputTokens": 4, + "latencyMs": 1021 + }, + { + "questionId": "q92", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Jeannie Klein", + "actual": "Jeannie Klein", + "correct": true, + "inputTokens": 11907, + "outputTokens": 8, + "latencyMs": 1301 + }, + { + "questionId": "q92", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Jeannie Klein", + "actual": "Jeannie Klein", + "correct": true, + "inputTokens": 6014, + "outputTokens": 4, + "latencyMs": 1254 + }, + { + "questionId": "q92", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Jeannie Klein", + "actual": "Jeannie Klein", + "correct": true, + "inputTokens": 6993, + "outputTokens": 8, + "latencyMs": 1375 + }, + { + "questionId": "q92", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Jeannie Klein", + "actual": "Jeannie Klein", + "correct": true, + "inputTokens": 6782, + "outputTokens": 4, + "latencyMs": 1316 + }, + { + "questionId": "q92", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Jeannie Klein", + "actual": "Jeannie Klein", + "correct": true, + "inputTokens": 8414, + "outputTokens": 8, + "latencyMs": 2681 + }, + { + "questionId": "q92", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Jeannie Klein", + "actual": "Jeannie Klein", + "correct": true, + "inputTokens": 9159, + "outputTokens": 4, + "latencyMs": 2427 + }, + { + "questionId": "q92", + "format": "markdown-kv", + 
"model": "claude-haiku-4-5", + "expected": "Jeannie Klein", + "actual": "Jeannie Klein", + "correct": true, + "inputTokens": 9289, + "outputTokens": 8, + "latencyMs": 1526 + }, + { + "questionId": "q92", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Jeannie Klein", + "actual": "Jeannie Klein", + "correct": true, + "inputTokens": 7374, + "outputTokens": 4, + "latencyMs": 1252 + }, + { + "questionId": "q92", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Jeannie Klein", + "actual": "Jeannie Klein", + "correct": true, + "inputTokens": 8385, + "outputTokens": 8, + "latencyMs": 1324 + }, + { + "questionId": "q93", + "format": "json", + "model": "gpt-4o-mini", + "expected": "Joshua Watsica", + "actual": "Joshua Watsica", + "correct": true, + "inputTokens": 9740, + "outputTokens": 5, + "latencyMs": 1606 + }, + { + "questionId": "q93", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Joshua Watsica", + "actual": "Joshua Watsica", + "correct": true, + "inputTokens": 11907, + "outputTokens": 8, + "latencyMs": 1223 + }, + { + "questionId": "q93", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "Joshua Watsica", + "actual": "Joshua Watsica", + "correct": true, + "inputTokens": 6014, + "outputTokens": 5, + "latencyMs": 1965 + }, + { + "questionId": "q93", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Joshua Watsica", + "actual": "Joshua Watsica", + "correct": true, + "inputTokens": 6993, + "outputTokens": 8, + "latencyMs": 1300 + }, + { + "questionId": "q93", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "Joshua Watsica", + "actual": "Joshua Watsica", + "correct": true, + "inputTokens": 6782, + "outputTokens": 5, + "latencyMs": 1110 + }, + { + "questionId": "q93", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Joshua Watsica", + "actual": "Joshua Watsica", + "correct": true, + "inputTokens": 8414, + "outputTokens": 8, + "latencyMs": 1819 + }, + { + "questionId": 
"q93", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "Joshua Watsica", + "actual": "Joshua Watsica", + "correct": true, + "inputTokens": 9159, + "outputTokens": 5, + "latencyMs": 1010 + }, + { + "questionId": "q93", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "Joshua Watsica", + "actual": "Joshua Watsica", + "correct": true, + "inputTokens": 9289, + "outputTokens": 8, + "latencyMs": 1224 + }, + { + "questionId": "q93", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "Joshua Watsica", + "actual": "Joshua Watsica", + "correct": true, + "inputTokens": 7374, + "outputTokens": 5, + "latencyMs": 1430 + }, + { + "questionId": "q93", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "Joshua Watsica", + "actual": "Joshua Watsica", + "correct": true, + "inputTokens": 8385, + "outputTokens": 8, + "latencyMs": 1158 + }, + { + "questionId": "q94", + "format": "json", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 9736, + "outputTokens": 2, + "latencyMs": 1352 + }, + { + "questionId": "q94", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 11902, + "outputTokens": 5, + "latencyMs": 1498 + }, + { + "questionId": "q94", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "12", + "correct": false, + "inputTokens": 6010, + "outputTokens": 2, + "latencyMs": 1249 + }, + { + "questionId": "q94", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 6988, + "outputTokens": 5, + "latencyMs": 1080 + }, + { + "questionId": "q94", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "12", + "correct": false, + "inputTokens": 6778, + "outputTokens": 2, + "latencyMs": 1760 + }, + { + "questionId": "q94", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "10", 
+ "actual": "8", + "correct": false, + "inputTokens": 8409, + "outputTokens": 5, + "latencyMs": 1156 + }, + { + "questionId": "q94", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 9155, + "outputTokens": 2, + "latencyMs": 9923 + }, + { + "questionId": "q94", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 9284, + "outputTokens": 5, + "latencyMs": 1138 + }, + { + "questionId": "q94", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "12", + "correct": false, + "inputTokens": 7370, + "outputTokens": 2, + "latencyMs": 1070 + }, + { + "questionId": "q94", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 8380, + "outputTokens": 5, + "latencyMs": 1114 + }, + { + "questionId": "q95", + "format": "json", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 9736, + "outputTokens": 2, + "latencyMs": 830 + }, + { + "questionId": "q95", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 11902, + "outputTokens": 5, + "latencyMs": 1085 + }, + { + "questionId": "q95", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 6010, + "outputTokens": 2, + "latencyMs": 2362 + }, + { + "questionId": "q95", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "7", + "correct": false, + "inputTokens": 6988, + "outputTokens": 5, + "latencyMs": 1198 + }, + { + "questionId": "q95", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 6778, + "outputTokens": 2, + "latencyMs": 1630 + }, + { + "questionId": "q95", + "format": "csv", + "model": 
"claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 8409, + "outputTokens": 5, + "latencyMs": 1219 + }, + { + "questionId": "q95", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 9155, + "outputTokens": 2, + "latencyMs": 2666 + }, + { + "questionId": "q95", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 9284, + "outputTokens": 5, + "latencyMs": 1044 + }, + { + "questionId": "q95", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "12", + "correct": false, + "inputTokens": 7370, + "outputTokens": 2, + "latencyMs": 2187 + }, + { + "questionId": "q95", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 8380, + "outputTokens": 5, + "latencyMs": 1313 + }, + { + "questionId": "q96", + "format": "json", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "20", + "correct": false, + "inputTokens": 9737, + "outputTokens": 2, + "latencyMs": 1087 + }, + { + "questionId": "q96", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 11902, + "outputTokens": 5, + "latencyMs": 1292 + }, + { + "questionId": "q96", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "15", + "correct": false, + "inputTokens": 6011, + "outputTokens": 2, + "latencyMs": 1979 + }, + { + "questionId": "q96", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "7", + "correct": false, + "inputTokens": 6988, + "outputTokens": 5, + "latencyMs": 1095 + }, + { + "questionId": "q96", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "15", + "correct": false, + "inputTokens": 6779, + "outputTokens": 2, + "latencyMs": 1385 + }, + { + "questionId": 
"q96", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 8409, + "outputTokens": 5, + "latencyMs": 1507 + }, + { + "questionId": "q96", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 9156, + "outputTokens": 2, + "latencyMs": 1579 + }, + { + "questionId": "q96", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 9284, + "outputTokens": 5, + "latencyMs": 1365 + }, + { + "questionId": "q96", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "20", + "correct": false, + "inputTokens": 7371, + "outputTokens": 2, + "latencyMs": 1661 + }, + { + "questionId": "q96", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "7", + "correct": false, + "inputTokens": 8380, + "outputTokens": 5, + "latencyMs": 1423 + }, + { + "questionId": "q97", + "format": "json", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "15", + "correct": false, + "inputTokens": 9737, + "outputTokens": 2, + "latencyMs": 1815 + }, + { + "questionId": "q97", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 11902, + "outputTokens": 5, + "latencyMs": 1345 + }, + { + "questionId": "q97", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 6011, + "outputTokens": 2, + "latencyMs": 2193 + }, + { + "questionId": "q97", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 6988, + "outputTokens": 5, + "latencyMs": 1417 + }, + { + "questionId": "q97", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "15", + "correct": false, + "inputTokens": 6779, + "outputTokens": 2, + 
"latencyMs": 1721 + }, + { + "questionId": "q97", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 8409, + "outputTokens": 5, + "latencyMs": 1114 + }, + { + "questionId": "q97", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "15", + "correct": false, + "inputTokens": 9156, + "outputTokens": 2, + "latencyMs": 2208 + }, + { + "questionId": "q97", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 9284, + "outputTokens": 5, + "latencyMs": 1895 + }, + { + "questionId": "q97", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "15", + "correct": false, + "inputTokens": 7371, + "outputTokens": 2, + "latencyMs": 1287 + }, + { + "questionId": "q97", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 8380, + "outputTokens": 5, + "latencyMs": 1281 + }, + { + "questionId": "q98", + "format": "json", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 9737, + "outputTokens": 2, + "latencyMs": 1387 + }, + { + "questionId": "q98", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 11902, + "outputTokens": 5, + "latencyMs": 1243 + }, + { + "questionId": "q98", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 6011, + "outputTokens": 2, + "latencyMs": 1284 + }, + { + "questionId": "q98", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 6988, + "outputTokens": 5, + "latencyMs": 1161 + }, + { + "questionId": "q98", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "15", + "correct": false, + 
"inputTokens": 6779, + "outputTokens": 2, + "latencyMs": 10406 + }, + { + "questionId": "q98", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 8409, + "outputTokens": 5, + "latencyMs": 1335 + }, + { + "questionId": "q98", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 9156, + "outputTokens": 2, + "latencyMs": 1517 + }, + { + "questionId": "q98", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 9284, + "outputTokens": 5, + "latencyMs": 1702 + }, + { + "questionId": "q98", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 7371, + "outputTokens": 2, + "latencyMs": 1676 + }, + { + "questionId": "q98", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 8380, + "outputTokens": 5, + "latencyMs": 1218 + }, + { + "questionId": "q99", + "format": "json", + "model": "gpt-4o-mini", + "expected": "42342.25", + "actual": "$50,000.00", + "correct": false, + "inputTokens": 9737, + "outputTokens": 7, + "latencyMs": 1407 + }, + { + "questionId": "q99", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "42342.25", + "actual": "50,847.47", + "correct": false, + "inputTokens": 11902, + "outputTokens": 9, + "latencyMs": 1443 + }, + { + "questionId": "q99", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "42342.25", + "actual": "Total revenue across all orders is 42,195.36.", + "correct": false, + "inputTokens": 6011, + "outputTokens": 14, + "latencyMs": 1150 + }, + { + "questionId": "q99", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "42342.25", + "actual": "41,847.47", + "correct": false, + "inputTokens": 6988, + "outputTokens": 9, + "latencyMs": 1774 + }, + { + 
"questionId": "q99", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "42342.25", + "actual": "$32,186.73", + "correct": false, + "inputTokens": 6779, + "outputTokens": 7, + "latencyMs": 2654 + }, + { + "questionId": "q99", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "42342.25", + "actual": "48,847.47", + "correct": false, + "inputTokens": 8409, + "outputTokens": 9, + "latencyMs": 1386 + }, + { + "questionId": "q99", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "42342.25", + "actual": "$34,186.73", + "correct": false, + "inputTokens": 9156, + "outputTokens": 7, + "latencyMs": 1506 + }, + { + "questionId": "q99", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "42342.25", + "actual": "48,847.47", + "correct": false, + "inputTokens": 9284, + "outputTokens": 9, + "latencyMs": 1509 + }, + { + "questionId": "q99", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "42342.25", + "actual": "Total revenue across all orders is 48780.73.", + "correct": false, + "inputTokens": 7371, + "outputTokens": 13, + "latencyMs": 1700 + }, + { + "questionId": "q99", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "42342.25", + "actual": "47,847.47", + "correct": false, + "inputTokens": 8380, + "outputTokens": 9, + "latencyMs": 1230 + }, + { + "questionId": "q100", + "format": "json", + "model": "gpt-4o-mini", + "expected": "44", + "actual": "36", + "correct": false, + "inputTokens": 9739, + "outputTokens": 2, + "latencyMs": 1725 + }, + { + "questionId": "q100", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "44", + "actual": "48", + "correct": false, + "inputTokens": 11904, + "outputTokens": 5, + "latencyMs": 1377 + }, + { + "questionId": "q100", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "44", + "actual": "34", + "correct": false, + "inputTokens": 6013, + "outputTokens": 2, + "latencyMs": 1399 + }, + { + "questionId": "q100", + "format": "toon", + 
"model": "claude-haiku-4-5", + "expected": "44", + "actual": "47", + "correct": false, + "inputTokens": 6990, + "outputTokens": 5, + "latencyMs": 1094 + }, + { + "questionId": "q100", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "44", + "actual": "34", + "correct": false, + "inputTokens": 6781, + "outputTokens": 2, + "latencyMs": 1617 + }, + { + "questionId": "q100", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "44", + "actual": "47", + "correct": false, + "inputTokens": 8411, + "outputTokens": 5, + "latencyMs": 1344 + }, + { + "questionId": "q100", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "44", + "actual": "36", + "correct": false, + "inputTokens": 9158, + "outputTokens": 2, + "latencyMs": 2396 + }, + { + "questionId": "q100", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "44", + "actual": "48", + "correct": false, + "inputTokens": 9286, + "outputTokens": 5, + "latencyMs": 1145 + }, + { + "questionId": "q100", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "44", + "actual": "36", + "correct": false, + "inputTokens": 7373, + "outputTokens": 2, + "latencyMs": 951 + }, + { + "questionId": "q100", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "44", + "actual": "45", + "correct": false, + "inputTokens": 8382, + "outputTokens": 5, + "latencyMs": 1311 + }, + { + "questionId": "q101", + "format": "json", + "model": "gpt-4o-mini", + "expected": "39", + "actual": "34", + "correct": false, + "inputTokens": 9739, + "outputTokens": 2, + "latencyMs": 866 + }, + { + "questionId": "q101", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "39", + "actual": "38", + "correct": false, + "inputTokens": 11904, + "outputTokens": 5, + "latencyMs": 1964 + }, + { + "questionId": "q101", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "39", + "actual": "30", + "correct": false, + "inputTokens": 6013, + "outputTokens": 2, + "latencyMs": 1994 + }, + { 
+ "questionId": "q101", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "39", + "actual": "38", + "correct": false, + "inputTokens": 6990, + "outputTokens": 5, + "latencyMs": 1277 + }, + { + "questionId": "q101", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "39", + "actual": "32", + "correct": false, + "inputTokens": 6781, + "outputTokens": 2, + "latencyMs": 1884 + }, + { + "questionId": "q101", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "39", + "actual": "38", + "correct": false, + "inputTokens": 8411, + "outputTokens": 5, + "latencyMs": 1282 + }, + { + "questionId": "q101", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "39", + "actual": "32", + "correct": false, + "inputTokens": 9158, + "outputTokens": 2, + "latencyMs": 1761 + }, + { + "questionId": "q101", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "39", + "actual": "38", + "correct": false, + "inputTokens": 9286, + "outputTokens": 5, + "latencyMs": 1250 + }, + { + "questionId": "q101", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "39", + "actual": "32", + "correct": false, + "inputTokens": 7373, + "outputTokens": 2, + "latencyMs": 1316 + }, + { + "questionId": "q101", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "39", + "actual": "38", + "correct": false, + "inputTokens": 8382, + "outputTokens": 5, + "latencyMs": 1373 + }, + { + "questionId": "q102", + "format": "json", + "model": "gpt-4o-mini", + "expected": "32", + "actual": "27", + "correct": false, + "inputTokens": 9739, + "outputTokens": 2, + "latencyMs": 1389 + }, + { + "questionId": "q102", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "32", + "actual": "28", + "correct": false, + "inputTokens": 11904, + "outputTokens": 5, + "latencyMs": 1215 + }, + { + "questionId": "q102", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "32", + "actual": "24", + "correct": false, + "inputTokens": 6013, + 
"outputTokens": 2, + "latencyMs": 1034 + }, + { + "questionId": "q102", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "32", + "actual": "26", + "correct": false, + "inputTokens": 6990, + "outputTokens": 5, + "latencyMs": 1063 + }, + { + "questionId": "q102", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "32", + "actual": "25", + "correct": false, + "inputTokens": 6781, + "outputTokens": 2, + "latencyMs": 7312 + }, + { + "questionId": "q102", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "32", + "actual": "28", + "correct": false, + "inputTokens": 8411, + "outputTokens": 5, + "latencyMs": 1387 + }, + { + "questionId": "q102", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "32", + "actual": "27", + "correct": false, + "inputTokens": 9158, + "outputTokens": 2, + "latencyMs": 1488 + }, + { + "questionId": "q102", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "32", + "actual": "28", + "correct": false, + "inputTokens": 9286, + "outputTokens": 5, + "latencyMs": 1268 + }, + { + "questionId": "q102", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "32", + "actual": "27", + "correct": false, + "inputTokens": 7373, + "outputTokens": 2, + "latencyMs": 1274 + }, + { + "questionId": "q102", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "32", + "actual": "26", + "correct": false, + "inputTokens": 8382, + "outputTokens": 5, + "latencyMs": 1354 + }, + { + "questionId": "q103", + "format": "json", + "model": "gpt-4o-mini", + "expected": "6975", + "actual": "6975", + "correct": true, + "inputTokens": 3713, + "outputTokens": 3, + "latencyMs": 1330 + }, + { + "questionId": "q103", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "6975", + "actual": "6975", + "correct": true, + "inputTokens": 4080, + "outputTokens": 6, + "latencyMs": 1437 + }, + { + "questionId": "q103", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "6975", + 
"actual": "6975", + "correct": true, + "inputTokens": 1564, + "outputTokens": 3, + "latencyMs": 1341 + }, + { + "questionId": "q103", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "6975", + "actual": "6975", + "correct": true, + "inputTokens": 1509, + "outputTokens": 6, + "latencyMs": 1231 + }, + { + "questionId": "q103", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "6975", + "actual": "6975", + "correct": true, + "inputTokens": 1442, + "outputTokens": 3, + "latencyMs": 2515 + }, + { + "questionId": "q103", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "6975", + "actual": "6975", + "correct": true, + "inputTokens": 1445, + "outputTokens": 6, + "latencyMs": 1162 + }, + { + "questionId": "q103", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "6975", + "actual": "6975", + "correct": true, + "inputTokens": 3830, + "outputTokens": 3, + "latencyMs": 868 + }, + { + "questionId": "q103", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "6975", + "actual": "6975", + "correct": true, + "inputTokens": 3415, + "outputTokens": 6, + "latencyMs": 1149 + }, + { + "questionId": "q103", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "6975", + "actual": "6975", + "correct": true, + "inputTokens": 2986, + "outputTokens": 3, + "latencyMs": 1183 + }, + { + "questionId": "q103", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "6975", + "actual": "6975", + "correct": true, + "inputTokens": 3110, + "outputTokens": 6, + "latencyMs": 1119 + }, + { + "questionId": "q104", + "format": "json", + "model": "gpt-4o-mini", + "expected": "6686.23", + "actual": "6686.23", + "correct": true, + "inputTokens": 3712, + "outputTokens": 5, + "latencyMs": 1273 + }, + { + "questionId": "q104", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "6686.23", + "actual": "6686.23", + "correct": true, + "inputTokens": 4079, + "outputTokens": 8, + "latencyMs": 1371 + }, + { + 
"questionId": "q104", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "6686.23", + "actual": "6686.23", + "correct": true, + "inputTokens": 1563, + "outputTokens": 5, + "latencyMs": 2052 + }, + { + "questionId": "q104", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "6686.23", + "actual": "6686.23", + "correct": true, + "inputTokens": 1508, + "outputTokens": 8, + "latencyMs": 997 + }, + { + "questionId": "q104", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "6686.23", + "actual": "6686.23", + "correct": true, + "inputTokens": 1441, + "outputTokens": 5, + "latencyMs": 1152 + }, + { + "questionId": "q104", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "6686.23", + "actual": "6686.23", + "correct": true, + "inputTokens": 1444, + "outputTokens": 8, + "latencyMs": 1188 + }, + { + "questionId": "q104", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "6686.23", + "actual": "6686.23", + "correct": true, + "inputTokens": 3829, + "outputTokens": 5, + "latencyMs": 1259 + }, + { + "questionId": "q104", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "6686.23", + "actual": "6686.23", + "correct": true, + "inputTokens": 3414, + "outputTokens": 8, + "latencyMs": 1239 + }, + { + "questionId": "q104", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "6686.23", + "actual": "6686.23", + "correct": true, + "inputTokens": 2985, + "outputTokens": 5, + "latencyMs": 1096 + }, + { + "questionId": "q104", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "6686.23", + "actual": "6686.23", + "correct": true, + "inputTokens": 3109, + "outputTokens": 8, + "latencyMs": 1247 + }, + { + "questionId": "q105", + "format": "json", + "model": "gpt-4o-mini", + "expected": "7500", + "actual": "7500", + "correct": true, + "inputTokens": 3713, + "outputTokens": 3, + "latencyMs": 1354 + }, + { + "questionId": "q105", + "format": "json", + "model": "claude-haiku-4-5", + 
"expected": "7500", + "actual": "7500", + "correct": true, + "inputTokens": 4080, + "outputTokens": 6, + "latencyMs": 1083 + }, + { + "questionId": "q105", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "7500", + "actual": "7500", + "correct": true, + "inputTokens": 1564, + "outputTokens": 3, + "latencyMs": 869 + }, + { + "questionId": "q105", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "7500", + "actual": "7500", + "correct": true, + "inputTokens": 1509, + "outputTokens": 6, + "latencyMs": 1051 + }, + { + "questionId": "q105", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "7500", + "actual": "7500", + "correct": true, + "inputTokens": 1442, + "outputTokens": 3, + "latencyMs": 1528 + }, + { + "questionId": "q105", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "7500", + "actual": "7500", + "correct": true, + "inputTokens": 1445, + "outputTokens": 6, + "latencyMs": 1126 + }, + { + "questionId": "q105", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "7500", + "actual": "7500", + "correct": true, + "inputTokens": 3830, + "outputTokens": 3, + "latencyMs": 1136 + }, + { + "questionId": "q105", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "7500", + "actual": "7500", + "correct": true, + "inputTokens": 3415, + "outputTokens": 6, + "latencyMs": 1121 + }, + { + "questionId": "q105", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "7500", + "actual": "7500", + "correct": true, + "inputTokens": 2986, + "outputTokens": 3, + "latencyMs": 1217 + }, + { + "questionId": "q105", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "7500", + "actual": "7500", + "correct": true, + "inputTokens": 3110, + "outputTokens": 6, + "latencyMs": 1099 + }, + { + "questionId": "q106", + "format": "json", + "model": "gpt-4o-mini", + "expected": "14297.05", + "actual": "14297.05", + "correct": true, + "inputTokens": 3712, + "outputTokens": 5, + "latencyMs": 1416 + 
}, + { + "questionId": "q106", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "14297.05", + "actual": "14297.05", + "correct": true, + "inputTokens": 4079, + "outputTokens": 8, + "latencyMs": 1526 + }, + { + "questionId": "q106", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "14297.05", + "actual": "14297.05", + "correct": true, + "inputTokens": 1563, + "outputTokens": 5, + "latencyMs": 1350 + }, + { + "questionId": "q106", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "14297.05", + "actual": "14297.05", + "correct": true, + "inputTokens": 1508, + "outputTokens": 8, + "latencyMs": 1330 + }, + { + "questionId": "q106", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "14297.05", + "actual": "14297.05", + "correct": true, + "inputTokens": 1441, + "outputTokens": 5, + "latencyMs": 2337 + }, + { + "questionId": "q106", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "14297.05", + "actual": "14297.05", + "correct": true, + "inputTokens": 1444, + "outputTokens": 8, + "latencyMs": 1171 + }, + { + "questionId": "q106", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "14297.05", + "actual": "14297.05", + "correct": true, + "inputTokens": 3829, + "outputTokens": 5, + "latencyMs": 3128 + }, + { + "questionId": "q106", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "14297.05", + "actual": "14297.05", + "correct": true, + "inputTokens": 3414, + "outputTokens": 8, + "latencyMs": 1151 + }, + { + "questionId": "q106", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "14297.05", + "actual": "14297.05", + "correct": true, + "inputTokens": 2985, + "outputTokens": 5, + "latencyMs": 1988 + }, + { + "questionId": "q106", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "14297.05", + "actual": "14297.05", + "correct": true, + "inputTokens": 3109, + "outputTokens": 8, + "latencyMs": 1166 + }, + { + "questionId": "q107", + "format": "json", + 
"model": "gpt-4o-mini", + "expected": "6692", + "actual": "6692", + "correct": true, + "inputTokens": 3713, + "outputTokens": 3, + "latencyMs": 2217 + }, + { + "questionId": "q107", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "6692", + "actual": "6692", + "correct": true, + "inputTokens": 4080, + "outputTokens": 6, + "latencyMs": 1114 + }, + { + "questionId": "q107", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "6692", + "actual": "6692", + "correct": true, + "inputTokens": 1564, + "outputTokens": 3, + "latencyMs": 1360 + }, + { + "questionId": "q107", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "6692", + "actual": "6692", + "correct": true, + "inputTokens": 1509, + "outputTokens": 6, + "latencyMs": 1079 + }, + { + "questionId": "q107", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "6692", + "actual": "6692", + "correct": true, + "inputTokens": 1442, + "outputTokens": 3, + "latencyMs": 1951 + }, + { + "questionId": "q107", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "6692", + "actual": "6692", + "correct": true, + "inputTokens": 1445, + "outputTokens": 6, + "latencyMs": 1173 + }, + { + "questionId": "q107", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "6692", + "actual": "6692", + "correct": true, + "inputTokens": 3830, + "outputTokens": 3, + "latencyMs": 1076 + }, + { + "questionId": "q107", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "6692", + "actual": "6692", + "correct": true, + "inputTokens": 3415, + "outputTokens": 6, + "latencyMs": 1098 + }, + { + "questionId": "q107", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "6692", + "actual": "6692", + "correct": true, + "inputTokens": 2986, + "outputTokens": 3, + "latencyMs": 1101 + }, + { + "questionId": "q107", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "6692", + "actual": "6692", + "correct": true, + "inputTokens": 3110, + "outputTokens": 
6, + "latencyMs": 1254 + }, + { + "questionId": "q108", + "format": "json", + "model": "gpt-4o-mini", + "expected": "9302.76", + "actual": "9302.76", + "correct": true, + "inputTokens": 3712, + "outputTokens": 5, + "latencyMs": 2041 + }, + { + "questionId": "q108", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "9302.76", + "actual": "9302.76", + "correct": true, + "inputTokens": 4079, + "outputTokens": 8, + "latencyMs": 1405 + }, + { + "questionId": "q108", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "9302.76", + "actual": "9302.76", + "correct": true, + "inputTokens": 1563, + "outputTokens": 5, + "latencyMs": 1170 + }, + { + "questionId": "q108", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "9302.76", + "actual": "9302.76", + "correct": true, + "inputTokens": 1508, + "outputTokens": 8, + "latencyMs": 1161 + }, + { + "questionId": "q108", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "9302.76", + "actual": "9302.76", + "correct": true, + "inputTokens": 1441, + "outputTokens": 5, + "latencyMs": 1326 + }, + { + "questionId": "q108", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "9302.76", + "actual": "9302.76", + "correct": true, + "inputTokens": 1444, + "outputTokens": 8, + "latencyMs": 1259 + }, + { + "questionId": "q108", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "9302.76", + "actual": "9302.76", + "correct": true, + "inputTokens": 3829, + "outputTokens": 5, + "latencyMs": 3006 + }, + { + "questionId": "q108", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "9302.76", + "actual": "9302.76", + "correct": true, + "inputTokens": 3414, + "outputTokens": 8, + "latencyMs": 1461 + }, + { + "questionId": "q108", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "9302.76", + "actual": "9302.76", + "correct": true, + "inputTokens": 2985, + "outputTokens": 5, + "latencyMs": 3824 + }, + { + "questionId": "q108", + "format": "yaml", + 
"model": "claude-haiku-4-5", + "expected": "9302.76", + "actual": "9302.76", + "correct": true, + "inputTokens": 3109, + "outputTokens": 8, + "latencyMs": 1391 + }, + { + "questionId": "q109", + "format": "json", + "model": "gpt-4o-mini", + "expected": "3285", + "actual": "3285", + "correct": true, + "inputTokens": 3713, + "outputTokens": 3, + "latencyMs": 1091 + }, + { + "questionId": "q109", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "3285", + "actual": "3285", + "correct": true, + "inputTokens": 4080, + "outputTokens": 6, + "latencyMs": 1188 + }, + { + "questionId": "q109", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "3285", + "actual": "3285", + "correct": true, + "inputTokens": 1564, + "outputTokens": 3, + "latencyMs": 1450 + }, + { + "questionId": "q109", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "3285", + "actual": "3285", + "correct": true, + "inputTokens": 1509, + "outputTokens": 6, + "latencyMs": 1614 + }, + { + "questionId": "q109", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "3285", + "actual": "3285", + "correct": true, + "inputTokens": 1442, + "outputTokens": 3, + "latencyMs": 1642 + }, + { + "questionId": "q109", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "3285", + "actual": "3285", + "correct": true, + "inputTokens": 1445, + "outputTokens": 6, + "latencyMs": 1311 + }, + { + "questionId": "q109", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "3285", + "actual": "3285", + "correct": true, + "inputTokens": 3830, + "outputTokens": 3, + "latencyMs": 1201 + }, + { + "questionId": "q109", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "3285", + "actual": "3285", + "correct": true, + "inputTokens": 3415, + "outputTokens": 6, + "latencyMs": 1261 + }, + { + "questionId": "q109", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "3285", + "actual": "3285", + "correct": true, + "inputTokens": 2986, + 
"outputTokens": 3, + "latencyMs": 856 + }, + { + "questionId": "q109", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "3285", + "actual": "3285", + "correct": true, + "inputTokens": 3110, + "outputTokens": 6, + "latencyMs": 980 + }, + { + "questionId": "q110", + "format": "json", + "model": "gpt-4o-mini", + "expected": "3826.93", + "actual": "3826.93", + "correct": true, + "inputTokens": 3712, + "outputTokens": 5, + "latencyMs": 3090 + }, + { + "questionId": "q110", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "3826.93", + "actual": "3826.93", + "correct": true, + "inputTokens": 4079, + "outputTokens": 8, + "latencyMs": 1123 + }, + { + "questionId": "q110", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "3826.93", + "actual": "3826.93", + "correct": true, + "inputTokens": 1563, + "outputTokens": 5, + "latencyMs": 2911 + }, + { + "questionId": "q110", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "3826.93", + "actual": "3826.93", + "correct": true, + "inputTokens": 1508, + "outputTokens": 8, + "latencyMs": 979 + }, + { + "questionId": "q110", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "3826.93", + "actual": "3826.93", + "correct": true, + "inputTokens": 1441, + "outputTokens": 5, + "latencyMs": 1118 + }, + { + "questionId": "q110", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "3826.93", + "actual": "3826.93", + "correct": true, + "inputTokens": 1444, + "outputTokens": 8, + "latencyMs": 943 + }, + { + "questionId": "q110", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "3826.93", + "actual": "3826.93", + "correct": true, + "inputTokens": 3829, + "outputTokens": 5, + "latencyMs": 2639 + }, + { + "questionId": "q110", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "3826.93", + "actual": "3826.93", + "correct": true, + "inputTokens": 3414, + "outputTokens": 8, + "latencyMs": 1187 + }, + { + "questionId": "q110", + 
"format": "yaml", + "model": "gpt-4o-mini", + "expected": "3826.93", + "actual": "3826.93", + "correct": true, + "inputTokens": 2985, + "outputTokens": 5, + "latencyMs": 2402 + }, + { + "questionId": "q110", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "3826.93", + "actual": "3826.93", + "correct": true, + "inputTokens": 3109, + "outputTokens": 8, + "latencyMs": 1723 + }, + { + "questionId": "q111", + "format": "json", + "model": "gpt-4o-mini", + "expected": "6191", + "actual": "6191", + "correct": true, + "inputTokens": 3713, + "outputTokens": 3, + "latencyMs": 2401 + }, + { + "questionId": "q111", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "6191", + "actual": "6191", + "correct": true, + "inputTokens": 4080, + "outputTokens": 6, + "latencyMs": 1117 + }, + { + "questionId": "q111", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "6191", + "actual": "6191", + "correct": true, + "inputTokens": 1564, + "outputTokens": 3, + "latencyMs": 1568 + }, + { + "questionId": "q111", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "6191", + "actual": "6191", + "correct": true, + "inputTokens": 1509, + "outputTokens": 6, + "latencyMs": 1132 + }, + { + "questionId": "q111", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "6191", + "actual": "6191", + "correct": true, + "inputTokens": 1442, + "outputTokens": 3, + "latencyMs": 1478 + }, + { + "questionId": "q111", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "6191", + "actual": "6191", + "correct": true, + "inputTokens": 1445, + "outputTokens": 6, + "latencyMs": 1831 + }, + { + "questionId": "q111", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "6191", + "actual": "6191", + "correct": true, + "inputTokens": 3830, + "outputTokens": 3, + "latencyMs": 1631 + }, + { + "questionId": "q111", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "6191", + "actual": "6191", + "correct": true, + 
"inputTokens": 3415, + "outputTokens": 6, + "latencyMs": 1371 + }, + { + "questionId": "q111", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "6191", + "actual": "6191", + "correct": true, + "inputTokens": 2986, + "outputTokens": 3, + "latencyMs": 1209 + }, + { + "questionId": "q111", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "6191", + "actual": "6191", + "correct": true, + "inputTokens": 3110, + "outputTokens": 6, + "latencyMs": 1411 + }, + { + "questionId": "q112", + "format": "json", + "model": "gpt-4o-mini", + "expected": "1854.66", + "actual": "1854.66", + "correct": true, + "inputTokens": 3712, + "outputTokens": 5, + "latencyMs": 1773 + }, + { + "questionId": "q112", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "1854.66", + "actual": "1854.66", + "correct": true, + "inputTokens": 4079, + "outputTokens": 8, + "latencyMs": 1090 + }, + { + "questionId": "q112", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "1854.66", + "actual": "1854.66", + "correct": true, + "inputTokens": 1563, + "outputTokens": 5, + "latencyMs": 1354 + }, + { + "questionId": "q112", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "1854.66", + "actual": "1854.66", + "correct": true, + "inputTokens": 1508, + "outputTokens": 8, + "latencyMs": 1095 + }, + { + "questionId": "q112", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "1854.66", + "actual": "1854.66", + "correct": true, + "inputTokens": 1441, + "outputTokens": 5, + "latencyMs": 1135 + }, + { + "questionId": "q112", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "1854.66", + "actual": "1854.66", + "correct": true, + "inputTokens": 1444, + "outputTokens": 8, + "latencyMs": 976 + }, + { + "questionId": "q112", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "1854.66", + "actual": "1854.66", + "correct": true, + "inputTokens": 3829, + "outputTokens": 5, + "latencyMs": 1311 + }, + { + "questionId": "q112", + 
"format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "1854.66", + "actual": "1854.66", + "correct": true, + "inputTokens": 3414, + "outputTokens": 8, + "latencyMs": 1287 + }, + { + "questionId": "q112", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "1854.66", + "actual": "1854.66", + "correct": true, + "inputTokens": 2985, + "outputTokens": 5, + "latencyMs": 1288 + }, + { + "questionId": "q112", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "1854.66", + "actual": "1854.66", + "correct": true, + "inputTokens": 3109, + "outputTokens": 8, + "latencyMs": 1157 + }, + { + "questionId": "q113", + "format": "json", + "model": "gpt-4o-mini", + "expected": "4696", + "actual": "4696", + "correct": true, + "inputTokens": 3713, + "outputTokens": 3, + "latencyMs": 1328 + }, + { + "questionId": "q113", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "4696", + "actual": "4696", + "correct": true, + "inputTokens": 4080, + "outputTokens": 6, + "latencyMs": 1068 + }, + { + "questionId": "q113", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "4696", + "actual": "4696", + "correct": true, + "inputTokens": 1564, + "outputTokens": 3, + "latencyMs": 1020 + }, + { + "questionId": "q113", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "4696", + "actual": "4696", + "correct": true, + "inputTokens": 1509, + "outputTokens": 6, + "latencyMs": 1069 + }, + { + "questionId": "q113", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "4696", + "actual": "4696", + "correct": true, + "inputTokens": 1442, + "outputTokens": 3, + "latencyMs": 968 + }, + { + "questionId": "q113", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "4696", + "actual": "4696", + "correct": true, + "inputTokens": 1445, + "outputTokens": 6, + "latencyMs": 1436 + }, + { + "questionId": "q113", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "4696", + "actual": "4696", + "correct": true, + 
"inputTokens": 3830, + "outputTokens": 3, + "latencyMs": 1171 + }, + { + "questionId": "q113", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "4696", + "actual": "4696", + "correct": true, + "inputTokens": 3415, + "outputTokens": 6, + "latencyMs": 1273 + }, + { + "questionId": "q113", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "4696", + "actual": "4696", + "correct": true, + "inputTokens": 2986, + "outputTokens": 3, + "latencyMs": 1788 + }, + { + "questionId": "q113", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "4696", + "actual": "4696", + "correct": true, + "inputTokens": 3110, + "outputTokens": 6, + "latencyMs": 1050 + }, + { + "questionId": "q114", + "format": "json", + "model": "gpt-4o-mini", + "expected": "4211.6", + "actual": "4211.6", + "correct": true, + "inputTokens": 3712, + "outputTokens": 5, + "latencyMs": 1414 + }, + { + "questionId": "q114", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "4211.6", + "actual": "4211.6", + "correct": true, + "inputTokens": 4079, + "outputTokens": 8, + "latencyMs": 1192 + }, + { + "questionId": "q114", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "4211.6", + "actual": "4211.6", + "correct": true, + "inputTokens": 1563, + "outputTokens": 5, + "latencyMs": 893 + }, + { + "questionId": "q114", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "4211.6", + "actual": "4211.6", + "correct": true, + "inputTokens": 1508, + "outputTokens": 8, + "latencyMs": 1065 + }, + { + "questionId": "q114", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "4211.6", + "actual": "4211.6", + "correct": true, + "inputTokens": 1441, + "outputTokens": 5, + "latencyMs": 1155 + }, + { + "questionId": "q114", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "4211.6", + "actual": "4211.6", + "correct": true, + "inputTokens": 1444, + "outputTokens": 8, + "latencyMs": 1842 + }, + { + "questionId": "q114", + "format": 
"markdown-kv", + "model": "gpt-4o-mini", + "expected": "4211.6", + "actual": "4211.6", + "correct": true, + "inputTokens": 3829, + "outputTokens": 5, + "latencyMs": 2740 + }, + { + "questionId": "q114", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "4211.6", + "actual": "4211.6", + "correct": true, + "inputTokens": 3414, + "outputTokens": 8, + "latencyMs": 1295 + }, + { + "questionId": "q114", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "4211.6", + "actual": "4211.6", + "correct": true, + "inputTokens": 2985, + "outputTokens": 5, + "latencyMs": 1053 + }, + { + "questionId": "q114", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "4211.6", + "actual": "4211.6", + "correct": true, + "inputTokens": 3109, + "outputTokens": 8, + "latencyMs": 1118 + }, + { + "questionId": "q115", + "format": "json", + "model": "gpt-4o-mini", + "expected": "6196", + "actual": "6196", + "correct": true, + "inputTokens": 3713, + "outputTokens": 3, + "latencyMs": 1452 + }, + { + "questionId": "q115", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "6196", + "actual": "6196", + "correct": true, + "inputTokens": 4080, + "outputTokens": 6, + "latencyMs": 1272 + }, + { + "questionId": "q115", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "6196", + "actual": "6196", + "correct": true, + "inputTokens": 1564, + "outputTokens": 3, + "latencyMs": 1039 + }, + { + "questionId": "q115", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "6196", + "actual": "6196", + "correct": true, + "inputTokens": 1509, + "outputTokens": 6, + "latencyMs": 1155 + }, + { + "questionId": "q115", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "6196", + "actual": "6196", + "correct": true, + "inputTokens": 1442, + "outputTokens": 3, + "latencyMs": 796 + }, + { + "questionId": "q115", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "6196", + "actual": "6196", + "correct": true, + "inputTokens": 
1445, + "outputTokens": 6, + "latencyMs": 1048 + }, + { + "questionId": "q115", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "6196", + "actual": "6196", + "correct": true, + "inputTokens": 3830, + "outputTokens": 3, + "latencyMs": 2282 + }, + { + "questionId": "q115", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "6196", + "actual": "6196", + "correct": true, + "inputTokens": 3415, + "outputTokens": 6, + "latencyMs": 1592 + }, + { + "questionId": "q115", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "6196", + "actual": "6196", + "correct": true, + "inputTokens": 2986, + "outputTokens": 3, + "latencyMs": 2691 + }, + { + "questionId": "q115", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "6196", + "actual": "6196", + "correct": true, + "inputTokens": 3110, + "outputTokens": 6, + "latencyMs": 1126 + }, + { + "questionId": "q116", + "format": "json", + "model": "gpt-4o-mini", + "expected": "6105.3", + "actual": "6105.3", + "correct": true, + "inputTokens": 3712, + "outputTokens": 5, + "latencyMs": 1288 + }, + { + "questionId": "q116", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "6105.3", + "actual": "6105.30", + "correct": true, + "inputTokens": 4079, + "outputTokens": 8, + "latencyMs": 991 + }, + { + "questionId": "q116", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "6105.3", + "actual": "6105.3", + "correct": true, + "inputTokens": 1563, + "outputTokens": 5, + "latencyMs": 1257 + }, + { + "questionId": "q116", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "6105.3", + "actual": "6105.3", + "correct": true, + "inputTokens": 1508, + "outputTokens": 8, + "latencyMs": 1004 + }, + { + "questionId": "q116", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "6105.3", + "actual": "6105.3", + "correct": true, + "inputTokens": 1441, + "outputTokens": 5, + "latencyMs": 1620 + }, + { + "questionId": "q116", + "format": "csv", + "model": 
"claude-haiku-4-5", + "expected": "6105.3", + "actual": "6105.3", + "correct": true, + "inputTokens": 1444, + "outputTokens": 8, + "latencyMs": 991 + }, + { + "questionId": "q116", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "6105.3", + "actual": "6105.3", + "correct": true, + "inputTokens": 3829, + "outputTokens": 5, + "latencyMs": 1048 + }, + { + "questionId": "q116", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "6105.3", + "actual": "6105.3", + "correct": true, + "inputTokens": 3414, + "outputTokens": 8, + "latencyMs": 1189 + }, + { + "questionId": "q116", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "6105.3", + "actual": "6105.3", + "correct": true, + "inputTokens": 2985, + "outputTokens": 5, + "latencyMs": 3282 + }, + { + "questionId": "q116", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "6105.3", + "actual": "6105.3", + "correct": true, + "inputTokens": 3109, + "outputTokens": 8, + "latencyMs": 985 + }, + { + "questionId": "q117", + "format": "json", + "model": "gpt-4o-mini", + "expected": "6528", + "actual": "6528", + "correct": true, + "inputTokens": 3713, + "outputTokens": 3, + "latencyMs": 871 + }, + { + "questionId": "q117", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "6528", + "actual": "6528", + "correct": true, + "inputTokens": 4080, + "outputTokens": 6, + "latencyMs": 1042 + }, + { + "questionId": "q117", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "6528", + "actual": "6528", + "correct": true, + "inputTokens": 1564, + "outputTokens": 3, + "latencyMs": 999 + }, + { + "questionId": "q117", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "6528", + "actual": "6528", + "correct": true, + "inputTokens": 1509, + "outputTokens": 6, + "latencyMs": 1111 + }, + { + "questionId": "q117", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "6528", + "actual": "6528", + "correct": true, + "inputTokens": 1442, + 
"outputTokens": 3, + "latencyMs": 1132 + }, + { + "questionId": "q117", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "6528", + "actual": "6528", + "correct": true, + "inputTokens": 1445, + "outputTokens": 6, + "latencyMs": 1004 + }, + { + "questionId": "q117", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "6528", + "actual": "6528", + "correct": true, + "inputTokens": 3830, + "outputTokens": 3, + "latencyMs": 1162 + }, + { + "questionId": "q117", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "6528", + "actual": "6528", + "correct": true, + "inputTokens": 3415, + "outputTokens": 6, + "latencyMs": 1271 + }, + { + "questionId": "q117", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "6528", + "actual": "6528", + "correct": true, + "inputTokens": 2986, + "outputTokens": 3, + "latencyMs": 961 + }, + { + "questionId": "q117", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "6528", + "actual": "6528", + "correct": true, + "inputTokens": 3110, + "outputTokens": 6, + "latencyMs": 1289 + }, + { + "questionId": "q118", + "format": "json", + "model": "gpt-4o-mini", + "expected": "1136.09", + "actual": "1136.09", + "correct": true, + "inputTokens": 3712, + "outputTokens": 5, + "latencyMs": 1634 + }, + { + "questionId": "q118", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "1136.09", + "actual": "1136.09", + "correct": true, + "inputTokens": 4079, + "outputTokens": 8, + "latencyMs": 1198 + }, + { + "questionId": "q118", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "1136.09", + "actual": "1136.09", + "correct": true, + "inputTokens": 1563, + "outputTokens": 5, + "latencyMs": 2678 + }, + { + "questionId": "q118", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "1136.09", + "actual": "1136.09", + "correct": true, + "inputTokens": 1508, + "outputTokens": 8, + "latencyMs": 1155 + }, + { + "questionId": "q118", + "format": "csv", + "model": 
"gpt-4o-mini", + "expected": "1136.09", + "actual": "1136.09", + "correct": true, + "inputTokens": 1441, + "outputTokens": 5, + "latencyMs": 1104 + }, + { + "questionId": "q118", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "1136.09", + "actual": "1136.09", + "correct": true, + "inputTokens": 1444, + "outputTokens": 8, + "latencyMs": 1109 + }, + { + "questionId": "q118", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "1136.09", + "actual": "1136.09", + "correct": true, + "inputTokens": 3829, + "outputTokens": 5, + "latencyMs": 3756 + }, + { + "questionId": "q118", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "1136.09", + "actual": "1136.09", + "correct": true, + "inputTokens": 3414, + "outputTokens": 8, + "latencyMs": 1082 + }, + { + "questionId": "q118", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "1136.09", + "actual": "1136.09", + "correct": true, + "inputTokens": 2985, + "outputTokens": 5, + "latencyMs": 1451 + }, + { + "questionId": "q118", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "1136.09", + "actual": "1136.09", + "correct": true, + "inputTokens": 3109, + "outputTokens": 8, + "latencyMs": 1730 + }, + { + "questionId": "q119", + "format": "json", + "model": "gpt-4o-mini", + "expected": "4689", + "actual": "4689", + "correct": true, + "inputTokens": 3713, + "outputTokens": 3, + "latencyMs": 1327 + }, + { + "questionId": "q119", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "4689", + "actual": "4689", + "correct": true, + "inputTokens": 4080, + "outputTokens": 6, + "latencyMs": 1282 + }, + { + "questionId": "q119", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "4689", + "actual": "4689", + "correct": true, + "inputTokens": 1564, + "outputTokens": 3, + "latencyMs": 1368 + }, + { + "questionId": "q119", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "4689", + "actual": "4689", + "correct": true, + 
"inputTokens": 1509, + "outputTokens": 6, + "latencyMs": 1487 + }, + { + "questionId": "q119", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "4689", + "actual": "4689", + "correct": true, + "inputTokens": 1442, + "outputTokens": 3, + "latencyMs": 2752 + }, + { + "questionId": "q119", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "4689", + "actual": "4689", + "correct": true, + "inputTokens": 1445, + "outputTokens": 6, + "latencyMs": 909 + }, + { + "questionId": "q119", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "4689", + "actual": "4689", + "correct": true, + "inputTokens": 3830, + "outputTokens": 3, + "latencyMs": 3502 + }, + { + "questionId": "q119", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "4689", + "actual": "4689", + "correct": true, + "inputTokens": 3415, + "outputTokens": 6, + "latencyMs": 1212 + }, + { + "questionId": "q119", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "4689", + "actual": "4689", + "correct": true, + "inputTokens": 2986, + "outputTokens": 3, + "latencyMs": 1218 + }, + { + "questionId": "q119", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "4689", + "actual": "4689", + "correct": true, + "inputTokens": 3110, + "outputTokens": 6, + "latencyMs": 1064 + }, + { + "questionId": "q120", + "format": "json", + "model": "gpt-4o-mini", + "expected": "2637.73", + "actual": "2637.73", + "correct": true, + "inputTokens": 3712, + "outputTokens": 5, + "latencyMs": 2777 + }, + { + "questionId": "q120", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "2637.73", + "actual": "2637.73", + "correct": true, + "inputTokens": 4079, + "outputTokens": 8, + "latencyMs": 1246 + }, + { + "questionId": "q120", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "2637.73", + "actual": "2637.73", + "correct": true, + "inputTokens": 1563, + "outputTokens": 5, + "latencyMs": 1424 + }, + { + "questionId": "q120", + "format": "toon", 
+ "model": "claude-haiku-4-5", + "expected": "2637.73", + "actual": "2637.73", + "correct": true, + "inputTokens": 1508, + "outputTokens": 8, + "latencyMs": 1074 + }, + { + "questionId": "q120", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "2637.73", + "actual": "2637.73", + "correct": true, + "inputTokens": 1441, + "outputTokens": 5, + "latencyMs": 2803 + }, + { + "questionId": "q120", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "2637.73", + "actual": "2637.73", + "correct": true, + "inputTokens": 1444, + "outputTokens": 8, + "latencyMs": 1107 + }, + { + "questionId": "q120", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "2637.73", + "actual": "2637.73", + "correct": true, + "inputTokens": 3829, + "outputTokens": 5, + "latencyMs": 1066 + }, + { + "questionId": "q120", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "2637.73", + "actual": "2637.73", + "correct": true, + "inputTokens": 3414, + "outputTokens": 8, + "latencyMs": 1325 + }, + { + "questionId": "q120", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "2637.73", + "actual": "2637.73", + "correct": true, + "inputTokens": 2985, + "outputTokens": 5, + "latencyMs": 1330 + }, + { + "questionId": "q120", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "2637.73", + "actual": "2637.73", + "correct": true, + "inputTokens": 3109, + "outputTokens": 8, + "latencyMs": 1192 + }, + { + "questionId": "q121", + "format": "json", + "model": "gpt-4o-mini", + "expected": "5685", + "actual": "5685", + "correct": true, + "inputTokens": 3713, + "outputTokens": 3, + "latencyMs": 1139 + }, + { + "questionId": "q121", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "5685", + "actual": "5685", + "correct": true, + "inputTokens": 4080, + "outputTokens": 6, + "latencyMs": 994 + }, + { + "questionId": "q121", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "5685", + "actual": "5685", + "correct": 
true, + "inputTokens": 1564, + "outputTokens": 3, + "latencyMs": 1309 + }, + { + "questionId": "q121", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "5685", + "actual": "5685", + "correct": true, + "inputTokens": 1509, + "outputTokens": 6, + "latencyMs": 1184 + }, + { + "questionId": "q121", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "5685", + "actual": "5685", + "correct": true, + "inputTokens": 1442, + "outputTokens": 3, + "latencyMs": 1182 + }, + { + "questionId": "q121", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "5685", + "actual": "5685", + "correct": true, + "inputTokens": 1445, + "outputTokens": 6, + "latencyMs": 1381 + }, + { + "questionId": "q121", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "5685", + "actual": "5685", + "correct": true, + "inputTokens": 3830, + "outputTokens": 3, + "latencyMs": 1103 + }, + { + "questionId": "q121", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "5685", + "actual": "5685", + "correct": true, + "inputTokens": 3415, + "outputTokens": 6, + "latencyMs": 1220 + }, + { + "questionId": "q121", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "5685", + "actual": "5685", + "correct": true, + "inputTokens": 2986, + "outputTokens": 3, + "latencyMs": 1169 + }, + { + "questionId": "q121", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "5685", + "actual": "5685", + "correct": true, + "inputTokens": 3110, + "outputTokens": 6, + "latencyMs": 1208 + }, + { + "questionId": "q122", + "format": "json", + "model": "gpt-4o-mini", + "expected": "3421.06", + "actual": "3421.06", + "correct": true, + "inputTokens": 3712, + "outputTokens": 5, + "latencyMs": 1037 + }, + { + "questionId": "q122", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "3421.06", + "actual": "3421.06", + "correct": true, + "inputTokens": 4079, + "outputTokens": 8, + "latencyMs": 1278 + }, + { + "questionId": "q122", + "format": 
"toon", + "model": "gpt-4o-mini", + "expected": "3421.06", + "actual": "3421.06", + "correct": true, + "inputTokens": 1563, + "outputTokens": 5, + "latencyMs": 1441 + }, + { + "questionId": "q122", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "3421.06", + "actual": "3421.06", + "correct": true, + "inputTokens": 1508, + "outputTokens": 8, + "latencyMs": 1204 + }, + { + "questionId": "q122", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "3421.06", + "actual": "3421.06", + "correct": true, + "inputTokens": 1441, + "outputTokens": 5, + "latencyMs": 1782 + }, + { + "questionId": "q122", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "3421.06", + "actual": "3421.06", + "correct": true, + "inputTokens": 1444, + "outputTokens": 8, + "latencyMs": 1088 + }, + { + "questionId": "q122", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "3421.06", + "actual": "3421.06", + "correct": true, + "inputTokens": 3829, + "outputTokens": 5, + "latencyMs": 1447 + }, + { + "questionId": "q122", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "3421.06", + "actual": "3421.06", + "correct": true, + "inputTokens": 3414, + "outputTokens": 8, + "latencyMs": 1356 + }, + { + "questionId": "q122", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "3421.06", + "actual": "3421.06", + "correct": true, + "inputTokens": 2985, + "outputTokens": 5, + "latencyMs": 1309 + }, + { + "questionId": "q122", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "3421.06", + "actual": "3421.06", + "correct": true, + "inputTokens": 3109, + "outputTokens": 8, + "latencyMs": 995 + }, + { + "questionId": "q123", + "format": "json", + "model": "gpt-4o-mini", + "expected": "344498", + "actual": "188,000", + "correct": false, + "inputTokens": 3710, + "outputTokens": 4, + "latencyMs": 1405 + }, + { + "questionId": "q123", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "344498", + "actual": 
"188,945", + "correct": false, + "inputTokens": 4077, + "outputTokens": 7, + "latencyMs": 1110 + }, + { + "questionId": "q123", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "344498", + "actual": "186,000", + "correct": false, + "inputTokens": 1561, + "outputTokens": 4, + "latencyMs": 1306 + }, + { + "questionId": "q123", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "344498", + "actual": "337,045", + "correct": false, + "inputTokens": 1506, + "outputTokens": 7, + "latencyMs": 1292 + }, + { + "questionId": "q123", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "344498", + "actual": "188,000", + "correct": false, + "inputTokens": 1439, + "outputTokens": 4, + "latencyMs": 2659 + }, + { + "questionId": "q123", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "344498", + "actual": "372,915", + "correct": false, + "inputTokens": 1442, + "outputTokens": 7, + "latencyMs": 966 + }, + { + "questionId": "q123", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "344498", + "actual": "174,000", + "correct": false, + "inputTokens": 3827, + "outputTokens": 4, + "latencyMs": 1177 + }, + { + "questionId": "q123", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "344498", + "actual": "188,647", + "correct": false, + "inputTokens": 3412, + "outputTokens": 7, + "latencyMs": 1018 + }, + { + "questionId": "q123", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "344498", + "actual": "188,000", + "correct": false, + "inputTokens": 2983, + "outputTokens": 4, + "latencyMs": 1659 + }, + { + "questionId": "q123", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "344498", + "actual": "181,854", + "correct": false, + "inputTokens": 3107, + "outputTokens": 7, + "latencyMs": 1894 + }, + { + "questionId": "q124", + "format": "json", + "model": "gpt-4o-mini", + "expected": "312818.50", + "actual": "188,174.36", + "correct": false, + "inputTokens": 3708, + "outputTokens": 
6, + "latencyMs": 2900 + }, + { + "questionId": "q124", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "312818.50", + "actual": "287,745.89", + "correct": false, + "inputTokens": 4075, + "outputTokens": 9, + "latencyMs": 1196 + }, + { + "questionId": "q124", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "312818.50", + "actual": "Total revenue across all dates is 139,155.36.", + "correct": false, + "inputTokens": 1559, + "outputTokens": 14, + "latencyMs": 1401 + }, + { + "questionId": "q124", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "312818.50", + "actual": "487,891.45", + "correct": false, + "inputTokens": 1504, + "outputTokens": 9, + "latencyMs": 1118 + }, + { + "questionId": "q124", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "312818.50", + "actual": "Total revenue across all dates is 155,000.00.", + "correct": false, + "inputTokens": 1437, + "outputTokens": 14, + "latencyMs": 1308 + }, + { + "questionId": "q124", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "312818.50", + "actual": "487,891.89", + "correct": false, + "inputTokens": 1440, + "outputTokens": 9, + "latencyMs": 1120 + }, + { + "questionId": "q124", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "312818.50", + "actual": "Total revenue across all dates is 155,155.36.", + "correct": false, + "inputTokens": 3825, + "outputTokens": 14, + "latencyMs": 1143 + }, + { + "questionId": "q124", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "312818.50", + "actual": "381,968.89", + "correct": false, + "inputTokens": 3410, + "outputTokens": 9, + "latencyMs": 1172 + }, + { + "questionId": "q124", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "312818.50", + "actual": "Total revenue across all dates is 155,155.36.", + "correct": false, + "inputTokens": 2981, + "outputTokens": 14, + "latencyMs": 1179 + }, + { + "questionId": "q124", + "format": "yaml", + "model": 
"claude-haiku-4-5", + "expected": "312818.50", + "actual": "381,847.89", + "correct": false, + "inputTokens": 3105, + "outputTokens": 9, + "latencyMs": 1073 + }, + { + "questionId": "q125", + "format": "json", + "model": "gpt-4o-mini", + "expected": "1811", + "actual": "1030", + "correct": false, + "inputTokens": 3710, + "outputTokens": 3, + "latencyMs": 3823 + }, + { + "questionId": "q125", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "1811", + "actual": "1,234", + "correct": false, + "inputTokens": 4078, + "outputTokens": 7, + "latencyMs": 1153 + }, + { + "questionId": "q125", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "1811", + "actual": "1040", + "correct": false, + "inputTokens": 1561, + "outputTokens": 3, + "latencyMs": 1472 + }, + { + "questionId": "q125", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "1811", + "actual": "1,945", + "correct": false, + "inputTokens": 1507, + "outputTokens": 7, + "latencyMs": 940 + }, + { + "questionId": "q125", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "1811", + "actual": "1030", + "correct": false, + "inputTokens": 1439, + "outputTokens": 3, + "latencyMs": 1067 + }, + { + "questionId": "q125", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "1811", + "actual": "1,945", + "correct": false, + "inputTokens": 1443, + "outputTokens": 7, + "latencyMs": 1183 + }, + { + "questionId": "q125", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "1811", + "actual": "Total conversions: 1030", + "correct": false, + "inputTokens": 3827, + "outputTokens": 7, + "latencyMs": 1103 + }, + { + "questionId": "q125", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "1811", + "actual": "1,454", + "correct": false, + "inputTokens": 3413, + "outputTokens": 7, + "latencyMs": 1067 + }, + { + "questionId": "q125", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "1811", + "actual": "1040", + "correct": false, + 
"inputTokens": 2983, + "outputTokens": 3, + "latencyMs": 932 + }, + { + "questionId": "q125", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "1811", + "actual": "1,454", + "correct": false, + "inputTokens": 3108, + "outputTokens": 7, + "latencyMs": 1530 + }, + { + "questionId": "q126", + "format": "json", + "model": "gpt-4o-mini", + "expected": "42", + "actual": "42", + "correct": true, + "inputTokens": 3710, + "outputTokens": 2, + "latencyMs": 1016 + }, + { + "questionId": "q126", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "42", + "actual": "42", + "correct": true, + "inputTokens": 4078, + "outputTokens": 5, + "latencyMs": 1440 + }, + { + "questionId": "q126", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "42", + "actual": "24", + "correct": false, + "inputTokens": 1561, + "outputTokens": 2, + "latencyMs": 1206 + }, + { + "questionId": "q126", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "42", + "actual": "42", + "correct": true, + "inputTokens": 1507, + "outputTokens": 5, + "latencyMs": 1452 + }, + { + "questionId": "q126", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "42", + "actual": "22", + "correct": false, + "inputTokens": 1439, + "outputTokens": 2, + "latencyMs": 1249 + }, + { + "questionId": "q126", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "42", + "actual": "42", + "correct": true, + "inputTokens": 1443, + "outputTokens": 5, + "latencyMs": 1248 + }, + { + "questionId": "q126", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "42", + "actual": "20", + "correct": false, + "inputTokens": 3827, + "outputTokens": 2, + "latencyMs": 1420 + }, + { + "questionId": "q126", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "42", + "actual": "47", + "correct": false, + "inputTokens": 3413, + "outputTokens": 5, + "latencyMs": 900 + }, + { + "questionId": "q126", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": 
"42", + "actual": "42", + "correct": true, + "inputTokens": 2983, + "outputTokens": 2, + "latencyMs": 1309 + }, + { + "questionId": "q126", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "42", + "actual": "47", + "correct": false, + "inputTokens": 3108, + "outputTokens": 5, + "latencyMs": 1216 + }, + { + "questionId": "q127", + "format": "json", + "model": "gpt-4o-mini", + "expected": "28", + "actual": "38", + "correct": false, + "inputTokens": 3710, + "outputTokens": 2, + "latencyMs": 3911 + }, + { + "questionId": "q127", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "28", + "actual": "24", + "correct": false, + "inputTokens": 4078, + "outputTokens": 5, + "latencyMs": 1056 + }, + { + "questionId": "q127", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "28", + "actual": "20", + "correct": false, + "inputTokens": 1561, + "outputTokens": 2, + "latencyMs": 839 + }, + { + "questionId": "q127", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "28", + "actual": "26", + "correct": false, + "inputTokens": 1507, + "outputTokens": 5, + "latencyMs": 965 + }, + { + "questionId": "q127", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "28", + "actual": "20", + "correct": false, + "inputTokens": 1439, + "outputTokens": 2, + "latencyMs": 2163 + }, + { + "questionId": "q127", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "28", + "actual": "23", + "correct": false, + "inputTokens": 1443, + "outputTokens": 5, + "latencyMs": 1006 + }, + { + "questionId": "q127", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "28", + "actual": "18", + "correct": false, + "inputTokens": 3827, + "outputTokens": 2, + "latencyMs": 2619 + }, + { + "questionId": "q127", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "28", + "actual": "24", + "correct": false, + "inputTokens": 3413, + "outputTokens": 5, + "latencyMs": 989 + }, + { + "questionId": "q127", + "format": 
"yaml", + "model": "gpt-4o-mini", + "expected": "28", + "actual": "22", + "correct": false, + "inputTokens": 2983, + "outputTokens": 2, + "latencyMs": 1830 + }, + { + "questionId": "q127", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "28", + "actual": "23", + "correct": false, + "inputTokens": 3108, + "outputTokens": 5, + "latencyMs": 1001 + }, + { + "questionId": "q128", + "format": "json", + "model": "gpt-4o-mini", + "expected": "11", + "actual": "15", + "correct": false, + "inputTokens": 3710, + "outputTokens": 2, + "latencyMs": 1217 + }, + { + "questionId": "q128", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "11", + "correct": true, + "inputTokens": 4078, + "outputTokens": 5, + "latencyMs": 3180 + }, + { + "questionId": "q128", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "11", + "actual": "15", + "correct": false, + "inputTokens": 1561, + "outputTokens": 2, + "latencyMs": 1076 + }, + { + "questionId": "q128", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "12", + "correct": false, + "inputTokens": 1507, + "outputTokens": 5, + "latencyMs": 912 + }, + { + "questionId": "q128", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "11", + "actual": "15", + "correct": false, + "inputTokens": 1439, + "outputTokens": 2, + "latencyMs": 2900 + }, + { + "questionId": "q128", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "11", + "correct": true, + "inputTokens": 1443, + "outputTokens": 5, + "latencyMs": 1389 + }, + { + "questionId": "q128", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "11", + "actual": "12", + "correct": false, + "inputTokens": 3827, + "outputTokens": 2, + "latencyMs": 1107 + }, + { + "questionId": "q128", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "11", + "correct": true, + "inputTokens": 3413, + "outputTokens": 5, + "latencyMs": 1150 + 
}, + { + "questionId": "q128", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "11", + "actual": "18", + "correct": false, + "inputTokens": 2983, + "outputTokens": 2, + "latencyMs": 1047 + }, + { + "questionId": "q128", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "11", + "correct": true, + "inputTokens": 3108, + "outputTokens": 5, + "latencyMs": 1169 + }, + { + "questionId": "q129", + "format": "json", + "model": "gpt-4o-mini", + "expected": "58", + "actual": "36", + "correct": false, + "inputTokens": 3709, + "outputTokens": 2, + "latencyMs": 1007 + }, + { + "questionId": "q129", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "58", + "actual": "50", + "correct": false, + "inputTokens": 4078, + "outputTokens": 5, + "latencyMs": 1342 + }, + { + "questionId": "q129", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "58", + "actual": "24", + "correct": false, + "inputTokens": 1560, + "outputTokens": 2, + "latencyMs": 828 + }, + { + "questionId": "q129", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "58", + "actual": "47", + "correct": false, + "inputTokens": 1507, + "outputTokens": 5, + "latencyMs": 1305 + }, + { + "questionId": "q129", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "58", + "actual": "15", + "correct": false, + "inputTokens": 1438, + "outputTokens": 2, + "latencyMs": 1305 + }, + { + "questionId": "q129", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "58", + "actual": "54", + "correct": false, + "inputTokens": 1443, + "outputTokens": 5, + "latencyMs": 1406 + }, + { + "questionId": "q129", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "58", + "actual": "18", + "correct": false, + "inputTokens": 3826, + "outputTokens": 2, + "latencyMs": 1513 + }, + { + "questionId": "q129", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "58", + "actual": "47", + "correct": false, + "inputTokens": 
3413, + "outputTokens": 5, + "latencyMs": 1026 + }, + { + "questionId": "q129", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "58", + "actual": "42", + "correct": false, + "inputTokens": 2982, + "outputTokens": 2, + "latencyMs": 1373 + }, + { + "questionId": "q129", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "58", + "actual": "54", + "correct": false, + "inputTokens": 3108, + "outputTokens": 5, + "latencyMs": 1112 + }, + { + "questionId": "q130", + "format": "json", + "model": "gpt-4o-mini", + "expected": "41", + "actual": "34", + "correct": false, + "inputTokens": 3709, + "outputTokens": 2, + "latencyMs": 1248 + }, + { + "questionId": "q130", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "41", + "actual": "31", + "correct": false, + "inputTokens": 4078, + "outputTokens": 5, + "latencyMs": 1083 + }, + { + "questionId": "q130", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "41", + "actual": "24", + "correct": false, + "inputTokens": 1560, + "outputTokens": 2, + "latencyMs": 895 + }, + { + "questionId": "q130", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "41", + "actual": "38", + "correct": false, + "inputTokens": 1507, + "outputTokens": 5, + "latencyMs": 1087 + }, + { + "questionId": "q130", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "41", + "actual": "18", + "correct": false, + "inputTokens": 1438, + "outputTokens": 2, + "latencyMs": 1157 + }, + { + "questionId": "q130", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "41", + "actual": "38", + "correct": false, + "inputTokens": 1443, + "outputTokens": 5, + "latencyMs": 1155 + }, + { + "questionId": "q130", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "41", + "actual": "18", + "correct": false, + "inputTokens": 3826, + "outputTokens": 2, + "latencyMs": 1959 + }, + { + "questionId": "q130", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "41", + 
"actual": "31", + "correct": false, + "inputTokens": 3413, + "outputTokens": 5, + "latencyMs": 1110 + }, + { + "questionId": "q130", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "41", + "actual": "34", + "correct": false, + "inputTokens": 2982, + "outputTokens": 2, + "latencyMs": 4540 + }, + { + "questionId": "q130", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "41", + "actual": "31", + "correct": false, + "inputTokens": 3108, + "outputTokens": 5, + "latencyMs": 1286 + }, + { + "questionId": "q131", + "format": "json", + "model": "gpt-4o-mini", + "expected": "23", + "actual": "18", + "correct": false, + "inputTokens": 3709, + "outputTokens": 2, + "latencyMs": 1059 + }, + { + "questionId": "q131", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "23", + "actual": "20", + "correct": false, + "inputTokens": 4078, + "outputTokens": 5, + "latencyMs": 1302 + }, + { + "questionId": "q131", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "23", + "actual": "18", + "correct": false, + "inputTokens": 1560, + "outputTokens": 2, + "latencyMs": 1019 + }, + { + "questionId": "q131", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "23", + "actual": "20", + "correct": false, + "inputTokens": 1507, + "outputTokens": 5, + "latencyMs": 975 + }, + { + "questionId": "q131", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "23", + "actual": "18", + "correct": false, + "inputTokens": 1438, + "outputTokens": 2, + "latencyMs": 1056 + }, + { + "questionId": "q131", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "23", + "actual": "20", + "correct": false, + "inputTokens": 1443, + "outputTokens": 5, + "latencyMs": 984 + }, + { + "questionId": "q131", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "23", + "actual": "15", + "correct": false, + "inputTokens": 3826, + "outputTokens": 2, + "latencyMs": 1420 + }, + { + "questionId": "q131", + "format": "markdown-kv", + 
"model": "claude-haiku-4-5", + "expected": "23", + "actual": "21", + "correct": false, + "inputTokens": 3413, + "outputTokens": 5, + "latencyMs": 1139 + }, + { + "questionId": "q131", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "23", + "actual": "18", + "correct": false, + "inputTokens": 2982, + "outputTokens": 2, + "latencyMs": 1097 + }, + { + "questionId": "q131", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "23", + "actual": "21", + "correct": false, + "inputTokens": 3108, + "outputTokens": 5, + "latencyMs": 1203 + }, + { + "questionId": "q132", + "format": "json", + "model": "gpt-4o-mini", + "expected": "430828", + "actual": "430828", + "correct": true, + "inputTokens": 15188, + "outputTokens": 3, + "latencyMs": 2257 + }, + { + "questionId": "q132", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "430828", + "actual": "430828", + "correct": true, + "inputTokens": 17409, + "outputTokens": 6, + "latencyMs": 1292 + }, + { + "questionId": "q132", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "430828", + "actual": "430828", + "correct": true, + "inputTokens": 8789, + "outputTokens": 3, + "latencyMs": 1877 + }, + { + "questionId": "q132", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "430828", + "actual": "430828", + "correct": true, + "inputTokens": 9279, + "outputTokens": 6, + "latencyMs": 1118 + }, + { + "questionId": "q132", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "430828", + "actual": "430828", + "correct": true, + "inputTokens": 8557, + "outputTokens": 3, + "latencyMs": 4023 + }, + { + "questionId": "q132", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "430828", + "actual": "430828", + "correct": true, + "inputTokens": 9125, + "outputTokens": 6, + "latencyMs": 1134 + }, + { + "questionId": "q132", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "430828", + "actual": "430828", + "correct": true, + "inputTokens": 15482, + 
"outputTokens": 3, + "latencyMs": 5304 + }, + { + "questionId": "q132", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "430828", + "actual": "430828", + "correct": true, + "inputTokens": 15367, + "outputTokens": 6, + "latencyMs": 1442 + }, + { + "questionId": "q132", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "430828", + "actual": "430828", + "correct": true, + "inputTokens": 13172, + "outputTokens": 3, + "latencyMs": 2157 + }, + { + "questionId": "q132", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "430828", + "actual": "430828", + "correct": true, + "inputTokens": 14483, + "outputTokens": 6, + "latencyMs": 1483 + }, + { + "questionId": "q133", + "format": "json", + "model": "gpt-4o-mini", + "expected": "11798", + "actual": "11798", + "correct": true, + "inputTokens": 15190, + "outputTokens": 3, + "latencyMs": 2084 + }, + { + "questionId": "q133", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "11798", + "actual": "11798", + "correct": true, + "inputTokens": 17410, + "outputTokens": 6, + "latencyMs": 2592 + }, + { + "questionId": "q133", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "11798", + "actual": "11798", + "correct": true, + "inputTokens": 8791, + "outputTokens": 3, + "latencyMs": 1208 + }, + { + "questionId": "q133", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "11798", + "actual": "11798", + "correct": true, + "inputTokens": 9280, + "outputTokens": 6, + "latencyMs": 1261 + }, + { + "questionId": "q133", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "11798", + "actual": "11798", + "correct": true, + "inputTokens": 8559, + "outputTokens": 3, + "latencyMs": 1697 + }, + { + "questionId": "q133", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "11798", + "actual": "11798", + "correct": true, + "inputTokens": 9126, + "outputTokens": 6, + "latencyMs": 1171 + }, + { + "questionId": "q133", + "format": "markdown-kv", + 
"model": "gpt-4o-mini", + "expected": "11798", + "actual": "11798", + "correct": true, + "inputTokens": 15484, + "outputTokens": 3, + "latencyMs": 1704 + }, + { + "questionId": "q133", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "11798", + "actual": "11798", + "correct": true, + "inputTokens": 15368, + "outputTokens": 6, + "latencyMs": 1637 + }, + { + "questionId": "q133", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "11798", + "actual": "11798", + "correct": true, + "inputTokens": 13174, + "outputTokens": 3, + "latencyMs": 1599 + }, + { + "questionId": "q133", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "11798", + "actual": "11798", + "correct": true, + "inputTokens": 14484, + "outputTokens": 6, + "latencyMs": 1505 + }, + { + "questionId": "q134", + "format": "json", + "model": "gpt-4o-mini", + "expected": "183631", + "actual": "183631", + "correct": true, + "inputTokens": 15193, + "outputTokens": 3, + "latencyMs": 2340 + }, + { + "questionId": "q134", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "183631", + "actual": "183631", + "correct": true, + "inputTokens": 17412, + "outputTokens": 6, + "latencyMs": 1380 + }, + { + "questionId": "q134", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "183631", + "actual": "183631", + "correct": true, + "inputTokens": 8794, + "outputTokens": 3, + "latencyMs": 1631 + }, + { + "questionId": "q134", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "183631", + "actual": "183631", + "correct": true, + "inputTokens": 9282, + "outputTokens": 6, + "latencyMs": 1271 + }, + { + "questionId": "q134", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "183631", + "actual": "183631", + "correct": true, + "inputTokens": 8562, + "outputTokens": 3, + "latencyMs": 1620 + }, + { + "questionId": "q134", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "183631", + "actual": "183631", + "correct": true, + 
"inputTokens": 9128, + "outputTokens": 6, + "latencyMs": 1279 + }, + { + "questionId": "q134", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "183631", + "actual": "183631", + "correct": true, + "inputTokens": 15487, + "outputTokens": 3, + "latencyMs": 14565 + }, + { + "questionId": "q134", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "183631", + "actual": "183631", + "correct": true, + "inputTokens": 15370, + "outputTokens": 6, + "latencyMs": 1559 + }, + { + "questionId": "q134", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "183631", + "actual": "183631", + "correct": true, + "inputTokens": 13177, + "outputTokens": 3, + "latencyMs": 1600 + }, + { + "questionId": "q134", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "183631", + "actual": "183631", + "correct": true, + "inputTokens": 14486, + "outputTokens": 6, + "latencyMs": 1179 + }, + { + "questionId": "q135", + "format": "json", + "model": "gpt-4o-mini", + "expected": "29246", + "actual": "29246", + "correct": true, + "inputTokens": 15192, + "outputTokens": 3, + "latencyMs": 2508 + }, + { + "questionId": "q135", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "29246", + "actual": "29246", + "correct": true, + "inputTokens": 17412, + "outputTokens": 6, + "latencyMs": 1359 + }, + { + "questionId": "q135", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "29246", + "actual": "29246", + "correct": true, + "inputTokens": 8793, + "outputTokens": 3, + "latencyMs": 1188 + }, + { + "questionId": "q135", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "29246", + "actual": "29246", + "correct": true, + "inputTokens": 9282, + "outputTokens": 6, + "latencyMs": 1204 + }, + { + "questionId": "q135", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "29246", + "actual": "29246", + "correct": true, + "inputTokens": 8561, + "outputTokens": 3, + "latencyMs": 2448 + }, + { + "questionId": "q135", + 
"format": "csv", + "model": "claude-haiku-4-5", + "expected": "29246", + "actual": "29246", + "correct": true, + "inputTokens": 9128, + "outputTokens": 6, + "latencyMs": 1311 + }, + { + "questionId": "q135", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "29246", + "actual": "29246", + "correct": true, + "inputTokens": 15486, + "outputTokens": 3, + "latencyMs": 2442 + }, + { + "questionId": "q135", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "29246", + "actual": "29246", + "correct": true, + "inputTokens": 15370, + "outputTokens": 6, + "latencyMs": 1414 + }, + { + "questionId": "q135", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "29246", + "actual": "29246", + "correct": true, + "inputTokens": 13176, + "outputTokens": 3, + "latencyMs": 2254 + }, + { + "questionId": "q135", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "29246", + "actual": "29246", + "correct": true, + "inputTokens": 14486, + "outputTokens": 6, + "latencyMs": 1512 + }, + { + "questionId": "q136", + "format": "json", + "model": "gpt-4o-mini", + "expected": "135306", + "actual": "135306", + "correct": true, + "inputTokens": 15188, + "outputTokens": 3, + "latencyMs": 1565 + }, + { + "questionId": "q136", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "135306", + "actual": "135306", + "correct": true, + "inputTokens": 17407, + "outputTokens": 6, + "latencyMs": 1871 + }, + { + "questionId": "q136", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "135306", + "actual": "135306", + "correct": true, + "inputTokens": 8789, + "outputTokens": 3, + "latencyMs": 1963 + }, + { + "questionId": "q136", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "135306", + "actual": "135306", + "correct": true, + "inputTokens": 9277, + "outputTokens": 6, + "latencyMs": 1533 + }, + { + "questionId": "q136", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "135306", + "actual": "135306", + 
"correct": true, + "inputTokens": 8557, + "outputTokens": 3, + "latencyMs": 1561 + }, + { + "questionId": "q136", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "135306", + "actual": "135306", + "correct": true, + "inputTokens": 9123, + "outputTokens": 6, + "latencyMs": 1200 + }, + { + "questionId": "q136", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "135306", + "actual": "135306", + "correct": true, + "inputTokens": 15482, + "outputTokens": 3, + "latencyMs": 1657 + }, + { + "questionId": "q136", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "135306", + "actual": "135306", + "correct": true, + "inputTokens": 15365, + "outputTokens": 6, + "latencyMs": 1582 + }, + { + "questionId": "q136", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "135306", + "actual": "135306", + "correct": true, + "inputTokens": 13172, + "outputTokens": 3, + "latencyMs": 3402 + }, + { + "questionId": "q136", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "135306", + "actual": "135306", + "correct": true, + "inputTokens": 14481, + "outputTokens": 6, + "latencyMs": 1251 + }, + { + "questionId": "q137", + "format": "json", + "model": "gpt-4o-mini", + "expected": "24914", + "actual": "24914", + "correct": true, + "inputTokens": 15187, + "outputTokens": 3, + "latencyMs": 2019 + }, + { + "questionId": "q137", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "24914", + "actual": "24914", + "correct": true, + "inputTokens": 17408, + "outputTokens": 6, + "latencyMs": 1517 + }, + { + "questionId": "q137", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "24914", + "actual": "The repository undefined/react-native does not exist in the provided data.", + "correct": false, + "inputTokens": 8788, + "outputTokens": 14, + "latencyMs": 1737 + }, + { + "questionId": "q137", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "24914", + "actual": "24914", + "correct": true, + 
"inputTokens": 9278, + "outputTokens": 6, + "latencyMs": 1467 + }, + { + "questionId": "q137", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "24914", + "actual": "24914", + "correct": true, + "inputTokens": 8556, + "outputTokens": 3, + "latencyMs": 3442 + }, + { + "questionId": "q137", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "24914", + "actual": "24914", + "correct": true, + "inputTokens": 9124, + "outputTokens": 6, + "latencyMs": 1300 + }, + { + "questionId": "q137", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "24914", + "actual": "24914", + "correct": true, + "inputTokens": 15481, + "outputTokens": 3, + "latencyMs": 1825 + }, + { + "questionId": "q137", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "24914", + "actual": "24914", + "correct": true, + "inputTokens": 15366, + "outputTokens": 6, + "latencyMs": 1443 + }, + { + "questionId": "q137", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "24914", + "actual": "124320", + "correct": false, + "inputTokens": 13171, + "outputTokens": 3, + "latencyMs": 1783 + }, + { + "questionId": "q137", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "24914", + "actual": "24914", + "correct": true, + "inputTokens": 14482, + "outputTokens": 6, + "latencyMs": 1362 + }, + { + "questionId": "q138", + "format": "json", + "model": "gpt-4o-mini", + "expected": "111683", + "actual": "111683", + "correct": true, + "inputTokens": 15187, + "outputTokens": 3, + "latencyMs": 1824 + }, + { + "questionId": "q138", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "111683", + "actual": "111683", + "correct": true, + "inputTokens": 17407, + "outputTokens": 6, + "latencyMs": 1479 + }, + { + "questionId": "q138", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "111683", + "actual": "108017", + "correct": false, + "inputTokens": 8788, + "outputTokens": 3, + "latencyMs": 3315 + }, + { + "questionId": "q138", + 
"format": "toon", + "model": "claude-haiku-4-5", + "expected": "111683", + "actual": "111683", + "correct": true, + "inputTokens": 9277, + "outputTokens": 6, + "latencyMs": 1270 + }, + { + "questionId": "q138", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "111683", + "actual": "111683", + "correct": true, + "inputTokens": 8556, + "outputTokens": 3, + "latencyMs": 1384 + }, + { + "questionId": "q138", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "111683", + "actual": "111683", + "correct": true, + "inputTokens": 9123, + "outputTokens": 6, + "latencyMs": 1252 + }, + { + "questionId": "q138", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "111683", + "actual": "111683", + "correct": true, + "inputTokens": 15481, + "outputTokens": 3, + "latencyMs": 3048 + }, + { + "questionId": "q138", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "111683", + "actual": "111683", + "correct": true, + "inputTokens": 15365, + "outputTokens": 6, + "latencyMs": 1381 + }, + { + "questionId": "q138", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "111683", + "actual": "111683", + "correct": true, + "inputTokens": 13171, + "outputTokens": 3, + "latencyMs": 3804 + }, + { + "questionId": "q138", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "111683", + "actual": "111683", + "correct": true, + "inputTokens": 14481, + "outputTokens": 6, + "latencyMs": 1498 + }, + { + "questionId": "q139", + "format": "json", + "model": "gpt-4o-mini", + "expected": "13364", + "actual": "13364", + "correct": true, + "inputTokens": 15194, + "outputTokens": 3, + "latencyMs": 1726 + }, + { + "questionId": "q139", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "13364", + "actual": "13364", + "correct": true, + "inputTokens": 17412, + "outputTokens": 6, + "latencyMs": 1526 + }, + { + "questionId": "q139", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "13364", + "actual": "13364", 
+ "correct": true, + "inputTokens": 8795, + "outputTokens": 3, + "latencyMs": 1685 + }, + { + "questionId": "q139", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "13364", + "actual": "13364", + "correct": true, + "inputTokens": 9282, + "outputTokens": 6, + "latencyMs": 1140 + }, + { + "questionId": "q139", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "13364", + "actual": "0", + "correct": false, + "inputTokens": 8563, + "outputTokens": 2, + "latencyMs": 1933 + }, + { + "questionId": "q139", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "13364", + "actual": "13364", + "correct": true, + "inputTokens": 9128, + "outputTokens": 6, + "latencyMs": 1157 + }, + { + "questionId": "q139", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "13364", + "actual": "13364", + "correct": true, + "inputTokens": 15488, + "outputTokens": 3, + "latencyMs": 1249 + }, + { + "questionId": "q139", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "13364", + "actual": "13364", + "correct": true, + "inputTokens": 15370, + "outputTokens": 6, + "latencyMs": 1347 + }, + { + "questionId": "q139", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "13364", + "actual": "13364", + "correct": true, + "inputTokens": 13178, + "outputTokens": 3, + "latencyMs": 2174 + }, + { + "questionId": "q139", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "13364", + "actual": "13364", + "correct": true, + "inputTokens": 14486, + "outputTokens": 6, + "latencyMs": 1197 + }, + { + "questionId": "q140", + "format": "json", + "model": "gpt-4o-mini", + "expected": "98464", + "actual": "0", + "correct": false, + "inputTokens": 15186, + "outputTokens": 2, + "latencyMs": 3252 + }, + { + "questionId": "q140", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "98464", + "actual": "98464", + "correct": true, + "inputTokens": 17405, + "outputTokens": 6, + "latencyMs": 1667 + }, + { + "questionId": 
"q140", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "98464", + "actual": "0", + "correct": false, + "inputTokens": 8787, + "outputTokens": 2, + "latencyMs": 1192 + }, + { + "questionId": "q140", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "98464", + "actual": "98464", + "correct": true, + "inputTokens": 9275, + "outputTokens": 6, + "latencyMs": 1113 + }, + { + "questionId": "q140", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "98464", + "actual": "0", + "correct": false, + "inputTokens": 8555, + "outputTokens": 2, + "latencyMs": 2198 + }, + { + "questionId": "q140", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "98464", + "actual": "98464", + "correct": true, + "inputTokens": 9121, + "outputTokens": 6, + "latencyMs": 1187 + }, + { + "questionId": "q140", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "98464", + "actual": "0", + "correct": false, + "inputTokens": 15480, + "outputTokens": 2, + "latencyMs": 8573 + }, + { + "questionId": "q140", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "98464", + "actual": "98464", + "correct": true, + "inputTokens": 15363, + "outputTokens": 6, + "latencyMs": 1311 + }, + { + "questionId": "q140", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "98464", + "actual": "0", + "correct": false, + "inputTokens": 13170, + "outputTokens": 2, + "latencyMs": 3471 + }, + { + "questionId": "q140", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "98464", + "actual": "98464", + "correct": true, + "inputTokens": 14479, + "outputTokens": 6, + "latencyMs": 1457 + }, + { + "questionId": "q141", + "format": "json", + "model": "gpt-4o-mini", + "expected": "6378", + "actual": "6378", + "correct": true, + "inputTokens": 15188, + "outputTokens": 3, + "latencyMs": 1363 + }, + { + "questionId": "q141", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "6378", + "actual": "6378", + "correct": true, + 
"inputTokens": 17408, + "outputTokens": 6, + "latencyMs": 1803 + }, + { + "questionId": "q141", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "6378", + "actual": "6378", + "correct": true, + "inputTokens": 8789, + "outputTokens": 3, + "latencyMs": 3696 + }, + { + "questionId": "q141", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "6378", + "actual": "6378", + "correct": true, + "inputTokens": 9278, + "outputTokens": 6, + "latencyMs": 1391 + }, + { + "questionId": "q141", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "6378", + "actual": "93731", + "correct": false, + "inputTokens": 8557, + "outputTokens": 3, + "latencyMs": 7861 + }, + { + "questionId": "q141", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "6378", + "actual": "6378", + "correct": true, + "inputTokens": 9124, + "outputTokens": 6, + "latencyMs": 1420 + }, + { + "questionId": "q141", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "6378", + "actual": "6378", + "correct": true, + "inputTokens": 15482, + "outputTokens": 3, + "latencyMs": 1769 + }, + { + "questionId": "q141", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "6378", + "actual": "6378", + "correct": true, + "inputTokens": 15366, + "outputTokens": 6, + "latencyMs": 1233 + }, + { + "questionId": "q141", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "6378", + "actual": "93731", + "correct": false, + "inputTokens": 13172, + "outputTokens": 3, + "latencyMs": 1831 + }, + { + "questionId": "q141", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "6378", + "actual": "6378", + "correct": true, + "inputTokens": 14482, + "outputTokens": 6, + "latencyMs": 1507 + }, + { + "questionId": "q142", + "format": "json", + "model": "gpt-4o-mini", + "expected": "254916", + "actual": "254916", + "correct": true, + "inputTokens": 15190, + "outputTokens": 3, + "latencyMs": 10752 + }, + { + "questionId": "q142", + "format": "json", + 
"model": "claude-haiku-4-5", + "expected": "254916", + "actual": "254916", + "correct": true, + "inputTokens": 17409, + "outputTokens": 6, + "latencyMs": 1672 + }, + { + "questionId": "q142", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "254916", + "actual": "254916", + "correct": true, + "inputTokens": 8791, + "outputTokens": 3, + "latencyMs": 1788 + }, + { + "questionId": "q142", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "254916", + "actual": "254916", + "correct": true, + "inputTokens": 9279, + "outputTokens": 6, + "latencyMs": 1633 + }, + { + "questionId": "q142", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "254916", + "actual": "254916", + "correct": true, + "inputTokens": 8559, + "outputTokens": 3, + "latencyMs": 1365 + }, + { + "questionId": "q142", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "254916", + "actual": "254916", + "correct": true, + "inputTokens": 9125, + "outputTokens": 6, + "latencyMs": 1242 + }, + { + "questionId": "q142", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "254916", + "actual": "254916", + "correct": true, + "inputTokens": 15484, + "outputTokens": 3, + "latencyMs": 2237 + }, + { + "questionId": "q142", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "254916", + "actual": "254916", + "correct": true, + "inputTokens": 15367, + "outputTokens": 6, + "latencyMs": 1275 + }, + { + "questionId": "q142", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "254916", + "actual": "254916", + "correct": true, + "inputTokens": 13174, + "outputTokens": 3, + "latencyMs": 3028 + }, + { + "questionId": "q142", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "254916", + "actual": "254916", + "correct": true, + "inputTokens": 14483, + "outputTokens": 6, + "latencyMs": 1615 + }, + { + "questionId": "q143", + "format": "json", + "model": "gpt-4o-mini", + "expected": "32413", + "actual": "32413", + "correct": 
true, + "inputTokens": 15188, + "outputTokens": 3, + "latencyMs": 1972 + }, + { + "questionId": "q143", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "32413", + "actual": "32413", + "correct": true, + "inputTokens": 17410, + "outputTokens": 6, + "latencyMs": 2308 + }, + { + "questionId": "q143", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "32413", + "actual": "32413", + "correct": true, + "inputTokens": 8789, + "outputTokens": 3, + "latencyMs": 1361 + }, + { + "questionId": "q143", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "32413", + "actual": "32413", + "correct": true, + "inputTokens": 9280, + "outputTokens": 6, + "latencyMs": 1162 + }, + { + "questionId": "q143", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "32413", + "actual": "32413", + "correct": true, + "inputTokens": 8557, + "outputTokens": 3, + "latencyMs": 2196 + }, + { + "questionId": "q143", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "32413", + "actual": "32413", + "correct": true, + "inputTokens": 9126, + "outputTokens": 6, + "latencyMs": 1199 + }, + { + "questionId": "q143", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "32413", + "actual": "32413", + "correct": true, + "inputTokens": 15482, + "outputTokens": 3, + "latencyMs": 1758 + }, + { + "questionId": "q143", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "32413", + "actual": "32413", + "correct": true, + "inputTokens": 15368, + "outputTokens": 6, + "latencyMs": 1340 + }, + { + "questionId": "q143", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "32413", + "actual": "32413", + "correct": true, + "inputTokens": 13172, + "outputTokens": 3, + "latencyMs": 2122 + }, + { + "questionId": "q143", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "32413", + "actual": "32413", + "correct": true, + "inputTokens": 14484, + "outputTokens": 6, + "latencyMs": 1156 + }, + { + "questionId": "q144", 
+ "format": "json", + "model": "gpt-4o-mini", + "expected": "240059", + "actual": "0", + "correct": false, + "inputTokens": 15186, + "outputTokens": 2, + "latencyMs": 1208 + }, + { + "questionId": "q144", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "240059", + "actual": "240059", + "correct": true, + "inputTokens": 17405, + "outputTokens": 6, + "latencyMs": 1826 + }, + { + "questionId": "q144", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "240059", + "actual": "undefined", + "correct": false, + "inputTokens": 8787, + "outputTokens": 2, + "latencyMs": 2224 + }, + { + "questionId": "q144", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "240059", + "actual": "240059", + "correct": true, + "inputTokens": 9275, + "outputTokens": 6, + "latencyMs": 1220 + }, + { + "questionId": "q144", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "240059", + "actual": "undefined", + "correct": false, + "inputTokens": 8555, + "outputTokens": 2, + "latencyMs": 1199 + }, + { + "questionId": "q144", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "240059", + "actual": "240059", + "correct": true, + "inputTokens": 9121, + "outputTokens": 6, + "latencyMs": 1264 + }, + { + "questionId": "q144", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "240059", + "actual": "undefined/react does not exist in the provided data.", + "correct": false, + "inputTokens": 15480, + "outputTokens": 11, + "latencyMs": 3072 + }, + { + "questionId": "q144", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "240059", + "actual": "240059", + "correct": true, + "inputTokens": 15363, + "outputTokens": 6, + "latencyMs": 1609 + }, + { + "questionId": "q144", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "240059", + "actual": "undefined/react does not exist in the provided data.", + "correct": false, + "inputTokens": 13170, + "outputTokens": 11, + "latencyMs": 2608 + }, + { + 
"questionId": "q144", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "240059", + "actual": "240059", + "correct": true, + "inputTokens": 14479, + "outputTokens": 6, + "latencyMs": 1237 + }, + { + "questionId": "q145", + "format": "json", + "model": "gpt-4o-mini", + "expected": "48986", + "actual": "0", + "correct": false, + "inputTokens": 15187, + "outputTokens": 2, + "latencyMs": 1906 + }, + { + "questionId": "q145", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "48986", + "actual": "48986", + "correct": true, + "inputTokens": 17406, + "outputTokens": 6, + "latencyMs": 1399 + }, + { + "questionId": "q145", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "48986", + "actual": "0", + "correct": false, + "inputTokens": 8788, + "outputTokens": 2, + "latencyMs": 2026 + }, + { + "questionId": "q145", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "48986", + "actual": "48986", + "correct": true, + "inputTokens": 9276, + "outputTokens": 6, + "latencyMs": 1318 + }, + { + "questionId": "q145", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "48986", + "actual": "0", + "correct": false, + "inputTokens": 8556, + "outputTokens": 2, + "latencyMs": 1605 + }, + { + "questionId": "q145", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "48986", + "actual": "48986", + "correct": true, + "inputTokens": 9122, + "outputTokens": 6, + "latencyMs": 1270 + }, + { + "questionId": "q145", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "48986", + "actual": "0", + "correct": false, + "inputTokens": 15481, + "outputTokens": 2, + "latencyMs": 5367 + }, + { + "questionId": "q145", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "48986", + "actual": "48986", + "correct": true, + "inputTokens": 15364, + "outputTokens": 6, + "latencyMs": 1204 + }, + { + "questionId": "q145", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "48986", + "actual": "The 
repository \"undefined/Python\" does not exist in the provided data.", + "correct": false, + "inputTokens": 13171, + "outputTokens": 16, + "latencyMs": 6329 + }, + { + "questionId": "q145", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "48986", + "actual": "48986", + "correct": true, + "inputTokens": 14480, + "outputTokens": 6, + "latencyMs": 1369 + }, + { + "questionId": "q146", + "format": "json", + "model": "gpt-4o-mini", + "expected": "209624", + "actual": "209624", + "correct": true, + "inputTokens": 15186, + "outputTokens": 3, + "latencyMs": 2063 + }, + { + "questionId": "q146", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "209624", + "actual": "209624", + "correct": true, + "inputTokens": 17405, + "outputTokens": 6, + "latencyMs": 1470 + }, + { + "questionId": "q146", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "209624", + "actual": "209624", + "correct": true, + "inputTokens": 8787, + "outputTokens": 3, + "latencyMs": 1386 + }, + { + "questionId": "q146", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "209624", + "actual": "209624", + "correct": true, + "inputTokens": 9275, + "outputTokens": 6, + "latencyMs": 1104 + }, + { + "questionId": "q146", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "209624", + "actual": "209624", + "correct": true, + "inputTokens": 8555, + "outputTokens": 3, + "latencyMs": 1747 + }, + { + "questionId": "q146", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "209624", + "actual": "209624", + "correct": true, + "inputTokens": 9121, + "outputTokens": 6, + "latencyMs": 1300 + }, + { + "questionId": "q146", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "209624", + "actual": "209624", + "correct": true, + "inputTokens": 15480, + "outputTokens": 3, + "latencyMs": 1443 + }, + { + "questionId": "q146", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "209624", + "actual": "209624", + "correct": 
true, + "inputTokens": 15363, + "outputTokens": 6, + "latencyMs": 1282 + }, + { + "questionId": "q146", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "209624", + "actual": "209624", + "correct": true, + "inputTokens": 13170, + "outputTokens": 3, + "latencyMs": 2185 + }, + { + "questionId": "q146", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "209624", + "actual": "209624", + "correct": true, + "inputTokens": 14479, + "outputTokens": 6, + "latencyMs": 1407 + }, + { + "questionId": "q147", + "format": "json", + "model": "gpt-4o-mini", + "expected": "58023", + "actual": "58023", + "correct": true, + "inputTokens": 15186, + "outputTokens": 3, + "latencyMs": 1743 + }, + { + "questionId": "q147", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "58023", + "actual": "58023", + "correct": true, + "inputTokens": 17406, + "outputTokens": 6, + "latencyMs": 1564 + }, + { + "questionId": "q147", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "58023", + "actual": "58023", + "correct": true, + "inputTokens": 8787, + "outputTokens": 3, + "latencyMs": 1317 + }, + { + "questionId": "q147", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "58023", + "actual": "58023", + "correct": true, + "inputTokens": 9276, + "outputTokens": 6, + "latencyMs": 1258 + }, + { + "questionId": "q147", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "58023", + "actual": "58023", + "correct": true, + "inputTokens": 8555, + "outputTokens": 3, + "latencyMs": 2419 + }, + { + "questionId": "q147", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "58023", + "actual": "58023", + "correct": true, + "inputTokens": 9122, + "outputTokens": 6, + "latencyMs": 1171 + }, + { + "questionId": "q147", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "58023", + "actual": "undefined/linux does not exist in the provided data.", + "correct": false, + "inputTokens": 15480, + "outputTokens": 11, + 
"latencyMs": 1680 + }, + { + "questionId": "q147", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "58023", + "actual": "58023", + "correct": true, + "inputTokens": 15364, + "outputTokens": 6, + "latencyMs": 1396 + }, + { + "questionId": "q147", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "58023", + "actual": "The repository \"undefined/linux\" does not exist in the provided data.", + "correct": false, + "inputTokens": 13170, + "outputTokens": 15, + "latencyMs": 1418 + }, + { + "questionId": "q147", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "58023", + "actual": "58023", + "correct": true, + "inputTokens": 14480, + "outputTokens": 6, + "latencyMs": 1399 + }, + { + "questionId": "q148", + "format": "json", + "model": "gpt-4o-mini", + "expected": "196024", + "actual": "196024", + "correct": true, + "inputTokens": 15189, + "outputTokens": 3, + "latencyMs": 1673 + }, + { + "questionId": "q148", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "196024", + "actual": "196024", + "correct": true, + "inputTokens": 17407, + "outputTokens": 6, + "latencyMs": 1736 + }, + { + "questionId": "q148", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "196024", + "actual": "196024", + "correct": true, + "inputTokens": 8790, + "outputTokens": 3, + "latencyMs": 1754 + }, + { + "questionId": "q148", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "196024", + "actual": "196024", + "correct": true, + "inputTokens": 9277, + "outputTokens": 6, + "latencyMs": 1317 + }, + { + "questionId": "q148", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "196024", + "actual": "0", + "correct": false, + "inputTokens": 8558, + "outputTokens": 2, + "latencyMs": 3219 + }, + { + "questionId": "q148", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "196024", + "actual": "196024", + "correct": true, + "inputTokens": 9123, + "outputTokens": 6, + "latencyMs": 1311 + }, + { + 
"questionId": "q148", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "196024", + "actual": "196024", + "correct": true, + "inputTokens": 15483, + "outputTokens": 3, + "latencyMs": 1346 + }, + { + "questionId": "q148", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "196024", + "actual": "196024", + "correct": true, + "inputTokens": 15365, + "outputTokens": 6, + "latencyMs": 1560 + }, + { + "questionId": "q148", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "196024", + "actual": "196024", + "correct": true, + "inputTokens": 13173, + "outputTokens": 3, + "latencyMs": 1009 + }, + { + "questionId": "q148", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "196024", + "actual": "196024", + "correct": true, + "inputTokens": 14481, + "outputTokens": 6, + "latencyMs": 1446 + }, + { + "questionId": "q149", + "format": "json", + "model": "gpt-4o-mini", + "expected": "30919", + "actual": "30919", + "correct": true, + "inputTokens": 15189, + "outputTokens": 3, + "latencyMs": 3361 + }, + { + "questionId": "q149", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "30919", + "actual": "30919", + "correct": true, + "inputTokens": 17408, + "outputTokens": 6, + "latencyMs": 1788 + }, + { + "questionId": "q149", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "30919", + "actual": "30919", + "correct": true, + "inputTokens": 8790, + "outputTokens": 3, + "latencyMs": 1123 + }, + { + "questionId": "q149", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "30919", + "actual": "30919", + "correct": true, + "inputTokens": 9278, + "outputTokens": 6, + "latencyMs": 1235 + }, + { + "questionId": "q149", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "30919", + "actual": "30919", + "correct": true, + "inputTokens": 8558, + "outputTokens": 3, + "latencyMs": 1100 + }, + { + "questionId": "q149", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "30919", + 
"actual": "30919", + "correct": true, + "inputTokens": 9124, + "outputTokens": 6, + "latencyMs": 1188 + }, + { + "questionId": "q149", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "30919", + "actual": "30919", + "correct": true, + "inputTokens": 15483, + "outputTokens": 3, + "latencyMs": 1557 + }, + { + "questionId": "q149", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "30919", + "actual": "30919", + "correct": true, + "inputTokens": 15366, + "outputTokens": 6, + "latencyMs": 1352 + }, + { + "questionId": "q149", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "30919", + "actual": "30919", + "correct": true, + "inputTokens": 13173, + "outputTokens": 3, + "latencyMs": 1280 + }, + { + "questionId": "q149", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "30919", + "actual": "30919", + "correct": true, + "inputTokens": 14482, + "outputTokens": 6, + "latencyMs": 1247 + }, + { + "questionId": "q150", + "format": "json", + "model": "gpt-4o-mini", + "expected": "192220", + "actual": "192220", + "correct": true, + "inputTokens": 15188, + "outputTokens": 3, + "latencyMs": 1394 + }, + { + "questionId": "q150", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "192220", + "actual": "192220", + "correct": true, + "inputTokens": 17405, + "outputTokens": 6, + "latencyMs": 1801 + }, + { + "questionId": "q150", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "192220", + "actual": "192220", + "correct": true, + "inputTokens": 8789, + "outputTokens": 3, + "latencyMs": 2052 + }, + { + "questionId": "q150", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "192220", + "actual": "192220", + "correct": true, + "inputTokens": 9275, + "outputTokens": 6, + "latencyMs": 1176 + }, + { + "questionId": "q150", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "192220", + "actual": "192220", + "correct": true, + "inputTokens": 8557, + "outputTokens": 3, + "latencyMs": 
2084 + }, + { + "questionId": "q150", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "192220", + "actual": "192220", + "correct": true, + "inputTokens": 9121, + "outputTokens": 6, + "latencyMs": 1191 + }, + { + "questionId": "q150", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "192220", + "actual": "192220", + "correct": true, + "inputTokens": 15482, + "outputTokens": 3, + "latencyMs": 1261 + }, + { + "questionId": "q150", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "192220", + "actual": "192220", + "correct": true, + "inputTokens": 15363, + "outputTokens": 6, + "latencyMs": 1355 + }, + { + "questionId": "q150", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "192220", + "actual": "192220", + "correct": true, + "inputTokens": 13172, + "outputTokens": 3, + "latencyMs": 3388 + }, + { + "questionId": "q150", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "192220", + "actual": "192220", + "correct": true, + "inputTokens": 14479, + "outputTokens": 6, + "latencyMs": 1591 + }, + { + "questionId": "q151", + "format": "json", + "model": "gpt-4o-mini", + "expected": "11763", + "actual": "11763", + "correct": true, + "inputTokens": 15191, + "outputTokens": 3, + "latencyMs": 1942 + }, + { + "questionId": "q151", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "11763", + "actual": "11763", + "correct": true, + "inputTokens": 17414, + "outputTokens": 6, + "latencyMs": 1340 + }, + { + "questionId": "q151", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "11763", + "actual": "11763", + "correct": true, + "inputTokens": 8792, + "outputTokens": 3, + "latencyMs": 1443 + }, + { + "questionId": "q151", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "11763", + "actual": "11763", + "correct": true, + "inputTokens": 9284, + "outputTokens": 6, + "latencyMs": 1732 + }, + { + "questionId": "q151", + "format": "csv", + "model": "gpt-4o-mini", + 
"expected": "11763", + "actual": "11763", + "correct": true, + "inputTokens": 8560, + "outputTokens": 3, + "latencyMs": 1994 + }, + { + "questionId": "q151", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "11763", + "actual": "11763", + "correct": true, + "inputTokens": 9130, + "outputTokens": 6, + "latencyMs": 1198 + }, + { + "questionId": "q151", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "11763", + "actual": "11763", + "correct": true, + "inputTokens": 15485, + "outputTokens": 3, + "latencyMs": 5013 + }, + { + "questionId": "q151", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "11763", + "actual": "11763", + "correct": true, + "inputTokens": 15372, + "outputTokens": 6, + "latencyMs": 1463 + }, + { + "questionId": "q151", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "11763", + "actual": "11763", + "correct": true, + "inputTokens": 13175, + "outputTokens": 3, + "latencyMs": 1296 + }, + { + "questionId": "q151", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "11763", + "actual": "11763", + "correct": true, + "inputTokens": 14488, + "outputTokens": 6, + "latencyMs": 2877 + }, + { + "questionId": "q152", + "format": "json", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "0", + "correct": false, + "inputTokens": 15188, + "outputTokens": 2, + "latencyMs": 2160 + }, + { + "questionId": "q152", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "0", + "correct": false, + "inputTokens": 17406, + "outputTokens": 5, + "latencyMs": 1947 + }, + { + "questionId": "q152", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "0", + "correct": false, + "inputTokens": 8789, + "outputTokens": 2, + "latencyMs": 1222 + }, + { + "questionId": "q152", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "0", + "correct": false, + "inputTokens": 9276, + "outputTokens": 5, + "latencyMs": 
1487 + }, + { + "questionId": "q152", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "0", + "correct": false, + "inputTokens": 8557, + "outputTokens": 2, + "latencyMs": 1450 + }, + { + "questionId": "q152", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "0", + "correct": false, + "inputTokens": 9122, + "outputTokens": 5, + "latencyMs": 1358 + }, + { + "questionId": "q152", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "0", + "correct": false, + "inputTokens": 15482, + "outputTokens": 2, + "latencyMs": 873 + }, + { + "questionId": "q152", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "100", + "correct": true, + "inputTokens": 15364, + "outputTokens": 5, + "latencyMs": 1500 + }, + { + "questionId": "q152", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "0", + "correct": false, + "inputTokens": 13172, + "outputTokens": 2, + "latencyMs": 7031 + }, + { + "questionId": "q152", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "0", + "correct": false, + "inputTokens": 14480, + "outputTokens": 5, + "latencyMs": 1916 + }, + { + "questionId": "q153", + "format": "json", + "model": "gpt-4o-mini", + "expected": "15404143", + "actual": "43115556", + "correct": false, + "inputTokens": 15189, + "outputTokens": 4, + "latencyMs": 3324 + }, + { + "questionId": "q153", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "15404143", + "actual": "13,847,892", + "correct": false, + "inputTokens": 17407, + "outputTokens": 9, + "latencyMs": 1607 + }, + { + "questionId": "q153", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "15404143", + "actual": "10419582", + "correct": false, + "inputTokens": 8790, + "outputTokens": 4, + "latencyMs": 900 + }, + { + "questionId": "q153", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "15404143", 
+ "actual": "13,847,892", + "correct": false, + "inputTokens": 9277, + "outputTokens": 9, + "latencyMs": 1385 + }, + { + "questionId": "q153", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "15404143", + "actual": "10419582", + "correct": false, + "inputTokens": 8558, + "outputTokens": 4, + "latencyMs": 1922 + }, + { + "questionId": "q153", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "15404143", + "actual": "15,847,892", + "correct": false, + "inputTokens": 9123, + "outputTokens": 9, + "latencyMs": 1230 + }, + { + "questionId": "q153", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "15404143", + "actual": "10419580", + "correct": false, + "inputTokens": 15483, + "outputTokens": 4, + "latencyMs": 1716 + }, + { + "questionId": "q153", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "15404143", + "actual": "13,847,892", + "correct": false, + "inputTokens": 15365, + "outputTokens": 9, + "latencyMs": 1384 + }, + { + "questionId": "q153", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "15404143", + "actual": "Total number of stars across all repositories is 4,978,155.", + "correct": false, + "inputTokens": 13173, + "outputTokens": 16, + "latencyMs": 3411 + }, + { + "questionId": "q153", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "15404143", + "actual": "13,847,892", + "correct": false, + "inputTokens": 14481, + "outputTokens": 9, + "latencyMs": 1539 + }, + { + "questionId": "q154", + "format": "json", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "77", + "correct": false, + "inputTokens": 15189, + "outputTokens": 2, + "latencyMs": 2523 + }, + { + "questionId": "q154", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "100", + "correct": true, + "inputTokens": 17408, + "outputTokens": 5, + "latencyMs": 1885 + }, + { + "questionId": "q154", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "100", + "actual": 
"42", + "correct": false, + "inputTokens": 8790, + "outputTokens": 2, + "latencyMs": 1148 + }, + { + "questionId": "q154", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "100", + "correct": true, + "inputTokens": 9278, + "outputTokens": 5, + "latencyMs": 1378 + }, + { + "questionId": "q154", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "42", + "correct": false, + "inputTokens": 8558, + "outputTokens": 2, + "latencyMs": 1364 + }, + { + "questionId": "q154", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "100", + "correct": true, + "inputTokens": 9124, + "outputTokens": 5, + "latencyMs": 1125 + }, + { + "questionId": "q154", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "104", + "correct": false, + "inputTokens": 15483, + "outputTokens": 2, + "latencyMs": 1276 + }, + { + "questionId": "q154", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "100", + "correct": true, + "inputTokens": 15366, + "outputTokens": 5, + "latencyMs": 1331 + }, + { + "questionId": "q154", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "77", + "correct": false, + "inputTokens": 13173, + "outputTokens": 2, + "latencyMs": 1534 + }, + { + "questionId": "q154", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "100", + "correct": true, + "inputTokens": 14482, + "outputTokens": 5, + "latencyMs": 1282 + }, + { + "questionId": "q155", + "format": "json", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "19", + "correct": false, + "inputTokens": 15189, + "outputTokens": 2, + "latencyMs": 2206 + }, + { + "questionId": "q155", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "71", + "correct": false, + "inputTokens": 17408, + "outputTokens": 5, + "latencyMs": 1568 + }, + { + "questionId": "q155", + "format": 
"toon", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "15", + "correct": false, + "inputTokens": 8790, + "outputTokens": 2, + "latencyMs": 1478 + }, + { + "questionId": "q155", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "42", + "correct": false, + "inputTokens": 9278, + "outputTokens": 5, + "latencyMs": 1314 + }, + { + "questionId": "q155", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "12", + "correct": false, + "inputTokens": 8558, + "outputTokens": 2, + "latencyMs": 2149 + }, + { + "questionId": "q155", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "47", + "correct": false, + "inputTokens": 9124, + "outputTokens": 5, + "latencyMs": 1485 + }, + { + "questionId": "q155", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "34", + "correct": false, + "inputTokens": 15483, + "outputTokens": 2, + "latencyMs": 1043 + }, + { + "questionId": "q155", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "71", + "correct": false, + "inputTokens": 15366, + "outputTokens": 5, + "latencyMs": 1371 + }, + { + "questionId": "q155", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "34", + "correct": false, + "inputTokens": 13173, + "outputTokens": 2, + "latencyMs": 1693 + }, + { + "questionId": "q155", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "71", + "correct": false, + "inputTokens": 14482, + "outputTokens": 5, + "latencyMs": 1237 + }, + { + "questionId": "q156", + "format": "json", + "model": "gpt-4o-mini", + "expected": "76", + "actual": "82", + "correct": false, + "inputTokens": 15189, + "outputTokens": 2, + "latencyMs": 927 + }, + { + "questionId": "q156", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "76", + "actual": "100", + "correct": false, + "inputTokens": 17408, + "outputTokens": 5, + 
"latencyMs": 1274 + }, + { + "questionId": "q156", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "76", + "actual": "34", + "correct": false, + "inputTokens": 8790, + "outputTokens": 2, + "latencyMs": 2541 + }, + { + "questionId": "q156", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "76", + "actual": "100", + "correct": false, + "inputTokens": 9278, + "outputTokens": 5, + "latencyMs": 1116 + }, + { + "questionId": "q156", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "76", + "actual": "34", + "correct": false, + "inputTokens": 8558, + "outputTokens": 2, + "latencyMs": 997 + }, + { + "questionId": "q156", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "76", + "actual": "100", + "correct": false, + "inputTokens": 9124, + "outputTokens": 5, + "latencyMs": 1513 + }, + { + "questionId": "q156", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "76", + "actual": "104", + "correct": false, + "inputTokens": 15483, + "outputTokens": 2, + "latencyMs": 3168 + }, + { + "questionId": "q156", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "76", + "actual": "100", + "correct": false, + "inputTokens": 15366, + "outputTokens": 5, + "latencyMs": 1498 + }, + { + "questionId": "q156", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "76", + "actual": "66", + "correct": false, + "inputTokens": 13173, + "outputTokens": 2, + "latencyMs": 1600 + }, + { + "questionId": "q156", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "76", + "actual": "100", + "correct": false, + "inputTokens": 14482, + "outputTokens": 5, + "latencyMs": 1519 + }, + { + "questionId": "q157", + "format": "json", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "77", + "correct": false, + "inputTokens": 15189, + "outputTokens": 2, + "latencyMs": 1809 + }, + { + "questionId": "q157", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "89", + 
"correct": false, + "inputTokens": 17409, + "outputTokens": 5, + "latencyMs": 1409 + }, + { + "questionId": "q157", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "66", + "correct": false, + "inputTokens": 8790, + "outputTokens": 2, + "latencyMs": 1367 + }, + { + "questionId": "q157", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "73", + "correct": false, + "inputTokens": 9279, + "outputTokens": 5, + "latencyMs": 1296 + }, + { + "questionId": "q157", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "66", + "correct": false, + "inputTokens": 8558, + "outputTokens": 2, + "latencyMs": 1162 + }, + { + "questionId": "q157", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "89", + "correct": false, + "inputTokens": 9125, + "outputTokens": 5, + "latencyMs": 1435 + }, + { + "questionId": "q157", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "77", + "correct": false, + "inputTokens": 15483, + "outputTokens": 2, + "latencyMs": 1774 + }, + { + "questionId": "q157", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "95", + "correct": false, + "inputTokens": 15367, + "outputTokens": 5, + "latencyMs": 1479 + }, + { + "questionId": "q157", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "100", + "actual": "66", + "correct": false, + "inputTokens": 13173, + "outputTokens": 2, + "latencyMs": 2710 + }, + { + "questionId": "q157", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "95", + "correct": false, + "inputTokens": 14483, + "outputTokens": 5, + "latencyMs": 1272 + }, + { + "questionId": "q158", + "format": "json", + "model": "gpt-4o-mini", + "expected": "95", + "actual": "42", + "correct": false, + "inputTokens": 15189, + "outputTokens": 2, + "latencyMs": 3038 + }, + { + "questionId": "q158", + "format": "json", + 
"model": "claude-haiku-4-5", + "expected": "95", + "actual": "42", + "correct": false, + "inputTokens": 17409, + "outputTokens": 5, + "latencyMs": 1562 + }, + { + "questionId": "q158", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "95", + "actual": "38", + "correct": false, + "inputTokens": 8790, + "outputTokens": 2, + "latencyMs": 1536 + }, + { + "questionId": "q158", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "95", + "actual": "42", + "correct": false, + "inputTokens": 9279, + "outputTokens": 5, + "latencyMs": 1216 + }, + { + "questionId": "q158", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "95", + "actual": "34", + "correct": false, + "inputTokens": 8558, + "outputTokens": 2, + "latencyMs": 1760 + }, + { + "questionId": "q158", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "95", + "actual": "42", + "correct": false, + "inputTokens": 9125, + "outputTokens": 5, + "latencyMs": 1255 + }, + { + "questionId": "q158", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "95", + "actual": "66", + "correct": false, + "inputTokens": 15483, + "outputTokens": 2, + "latencyMs": 1683 + }, + { + "questionId": "q158", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "95", + "actual": "47", + "correct": false, + "inputTokens": 15367, + "outputTokens": 5, + "latencyMs": 2256 + }, + { + "questionId": "q158", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "95", + "actual": "38", + "correct": false, + "inputTokens": 13173, + "outputTokens": 2, + "latencyMs": 2831 + }, + { + "questionId": "q158", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "95", + "actual": "47", + "correct": false, + "inputTokens": 14483, + "outputTokens": 5, + "latencyMs": 1980 + }, + { + "questionId": "q159", + "format": "json", + "model": "gpt-4o-mini", + "expected": "83", + "actual": "66", + "correct": false, + "inputTokens": 15189, + "outputTokens": 2, + "latencyMs": 1327 + 
}, + { + "questionId": "q159", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "83", + "actual": "71", + "correct": false, + "inputTokens": 17409, + "outputTokens": 5, + "latencyMs": 1894 + }, + { + "questionId": "q159", + "format": "toon", + "model": "gpt-4o-mini", + "expected": "83", + "actual": "34", + "correct": false, + "inputTokens": 8790, + "outputTokens": 2, + "latencyMs": 784 + }, + { + "questionId": "q159", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "83", + "actual": "73", + "correct": false, + "inputTokens": 9279, + "outputTokens": 5, + "latencyMs": 1422 + }, + { + "questionId": "q159", + "format": "csv", + "model": "gpt-4o-mini", + "expected": "83", + "actual": "34", + "correct": false, + "inputTokens": 8558, + "outputTokens": 2, + "latencyMs": 2644 + }, + { + "questionId": "q159", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "83", + "actual": "73", + "correct": false, + "inputTokens": 9125, + "outputTokens": 5, + "latencyMs": 1109 + }, + { + "questionId": "q159", + "format": "markdown-kv", + "model": "gpt-4o-mini", + "expected": "83", + "actual": "66", + "correct": false, + "inputTokens": 15483, + "outputTokens": 2, + "latencyMs": 1826 + }, + { + "questionId": "q159", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "83", + "actual": "71", + "correct": false, + "inputTokens": 15367, + "outputTokens": 5, + "latencyMs": 1342 + }, + { + "questionId": "q159", + "format": "yaml", + "model": "gpt-4o-mini", + "expected": "83", + "actual": "38", + "correct": false, + "inputTokens": 13173, + "outputTokens": 2, + "latencyMs": 2055 + }, + { + "questionId": "q159", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "83", + "actual": "71", + "correct": false, + "inputTokens": 14483, + "outputTokens": 5, + "latencyMs": 1537 + } +] \ No newline at end of file diff --git a/benchmarks/results/accuracy/report.md b/benchmarks/results/accuracy/report.md new file mode 100644 
index 0000000..9991de9 --- /dev/null +++ b/benchmarks/results/accuracy/report.md @@ -0,0 +1,96 @@ +### Retrieval Accuracy + +Tested across **2 LLMs** with data retrieval tasks: + +``` +gpt-4o-mini ██████████████░░░░░░ 72.3% accuracy +claude-haiku-4-5 ███████████████░░░░░ 76.7% accuracy +``` + +**TOON achieves 73.9% accuracy (vs JSON's 73.6%) while using 46.3% fewer tokens.** + +| Format | Accuracy | Average Tokens | +| ------ | -------- | -------------- | +| `toon` | 73.9% | 4.678 | +| `json` | 73.6% | 8.713 | +| `markdown-kv` | 73.6% | 8.649 | +| `csv` | 72.3% | 4.745 | +| `yaml` | 71.7% | 7.091 | + +
+View detailed breakdown by dataset and model + +#### Performance by Dataset + +##### Uniform employee records (TOON optimal format) + +| Format | Accuracy | Tokens | Correct/Total | +|--------|----------|--------|---------------| +| `toon` | 72.4% | 2.483 | 84/116 | +| `csv` | 69.0% | 2.337 | 80/116 | +| `yaml` | 68.1% | 4.969 | 79/116 | +| `markdown-kv` | 68.1% | 6.270 | 79/116 | +| `json` | 68.1% | 6.347 | 79/116 | + +##### E-commerce orders with nested structures + +| Format | Accuracy | Tokens | Correct/Total | +|--------|----------|--------|---------------| +| `toon` | 84.1% | 5.967 | 74/88 | +| `csv` | 83.0% | 6.735 | 73/88 | +| `yaml` | 81.8% | 7.328 | 72/88 | +| `markdown-kv` | 86.4% | 9.110 | 76/88 | +| `json` | 84.1% | 9.694 | 74/88 | + +##### Time-series analytics data + +| Format | Accuracy | Tokens | Correct/Total | +|--------|----------|--------|---------------| +| `csv` | 72.4% | 1.393 | 42/58 | +| `toon` | 70.7% | 1.515 | 41/58 | +| `yaml` | 72.4% | 2.938 | 42/58 | +| `json` | 74.1% | 3.665 | 43/58 | +| `markdown-kv` | 70.7% | 3.779 | 41/58 | + +##### Popular GitHub repositories + +| Format | Accuracy | Tokens | Correct/Total | +|--------|----------|--------|---------------| +| `toon` | 64.3% | 8.745 | 36/56 | +| `csv` | 62.5% | 8.513 | 35/56 | +| `json` | 67.9% | 15.145 | 38/56 | +| `markdown-kv` | 67.9% | 15.436 | 38/56 | +| `yaml` | 62.5% | 13.129 | 35/56 | + + +#### Performance by Model + +##### gpt-4o-mini + +| Format | Accuracy | Correct/Total | +|--------|----------|---------------| +| `toon` | 72.3% | 115/159 | +| `json` | 71.7% | 114/159 | +| `markdown-kv` | 70.4% | 112/159 | +| `csv` | 69.2% | 110/159 | +| `yaml` | 68.6% | 109/159 | + +##### claude-haiku-4-5 + +| Format | Accuracy | Correct/Total | +|--------|----------|---------------| +| `markdown-kv` | 76.7% | 122/159 | +| `toon` | 75.5% | 120/159 | +| `json` | 75.5% | 120/159 | +| `csv` | 75.5% | 120/159 | +| `yaml` | 74.8% | 119/159 | + + +#### Methodology + +- **Semantic 
validation**: LLM-as-judge validates responses semantically (not exact string matching). +- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding. +- **Question types**: Field retrieval, aggregation, and filtering tasks. +- **Real data**: Faker.js-generated datasets + GitHub repositories. + +
diff --git a/benchmarks/results/accuracy/summary.json b/benchmarks/results/accuracy/summary.json new file mode 100644 index 0000000..b5dddc2 --- /dev/null +++ b/benchmarks/results/accuracy/summary.json @@ -0,0 +1,95 @@ +{ + "formatResults": [ + { + "format": "toon", + "accuracy": 0.7389937106918238, + "totalTokens": 4678, + "avgInputTokens": 4675, + "avgLatency": 1424, + "correctCount": 235, + "totalCount": 318 + }, + { + "format": "json", + "accuracy": 0.7358490566037735, + "totalTokens": 8713, + "avgInputTokens": 9177, + "avgLatency": 1678, + "correctCount": 234, + "totalCount": 318 + }, + { + "format": "markdown-kv", + "accuracy": 0.7358490566037735, + "totalTokens": 8649, + "avgInputTokens": 8242, + "avgLatency": 1724, + "correctCount": 234, + "totalCount": 318 + }, + { + "format": "csv", + "accuracy": 0.7232704402515723, + "totalTokens": 4745, + "avgInputTokens": 4878, + "avgLatency": 1573, + "correctCount": 230, + "totalCount": 318 + }, + { + "format": "yaml", + "accuracy": 0.7169811320754716, + "totalTokens": 7091, + "avgInputTokens": 7136, + "avgLatency": 1602, + "correctCount": 228, + "totalCount": 318 + } + ], + "questions": 159, + "models": [ + "gpt-4o-mini", + "claude-haiku-4-5" + ], + "datasets": [ + { + "name": "tabular", + "description": "Uniform employee records (TOON optimal format)" + }, + { + "name": "nested", + "description": "E-commerce orders with nested structures" + }, + { + "name": "analytics", + "description": "Time-series analytics data" + }, + { + "name": "github", + "description": "Popular GitHub repositories" + } + ], + "tokenCounts": { + "json-tabular": 6347, + "json-nested": 9694, + "json-analytics": 3665, + "json-github": 15145, + "toon-tabular": 2483, + "toon-nested": 5967, + "toon-analytics": 1515, + "toon-github": 8745, + "csv-tabular": 2337, + "csv-nested": 6735, + "csv-analytics": 1393, + "csv-github": 8513, + "markdown-kv-tabular": 6270, + "markdown-kv-nested": 9110, + "markdown-kv-analytics": 3779, + "markdown-kv-github": 
15436, + "yaml-tabular": 4969, + "yaml-nested": 7328, + "yaml-analytics": 2938, + "yaml-github": 13129 + }, + "timestamp": "2025-10-27T10:46:35.127Z" +} \ No newline at end of file diff --git a/benchmarks/results/token-efficiency.md b/benchmarks/results/token-efficiency.md new file mode 100644 index 0000000..090397a --- /dev/null +++ b/benchmarks/results/token-efficiency.md @@ -0,0 +1,141 @@ +### Token Efficiency + +``` +⭐ GitHub Repositories ██████████████░░░░░░░░░░░ 8,745 tokens (JSON: 15,145) 💰 42.3% saved +📈 Analytics Time Series ██████████░░░░░░░░░░░░░░░ 3,631 tokens (JSON: 9,024) 💰 59.8% saved +👥 API Response ██████████████░░░░░░░░░░░ 2,593 tokens (JSON: 4,589) 💰 43.5% saved +🛒 E-commerce Order ███████████████░░░░░░░░░░ 203 tokens (JSON: 338) 💰 39.9% saved +``` + +**Total:** 15,172 tokens (TOON) vs 29,096 tokens (JSON) → 47.9% savings + +
+View detailed examples + +#### โญ GitHub Repositories + +**Configuration:** Top 100 GitHub repositories with stars, forks, and metadata + +**Savings:** 6,400 tokens (42.3% reduction) + +**JSON** (15,145 tokens): + +```json +{ + "repositories": [ + { + "id": 28457823, + "name": "freeCodeCamp", + "repo": "freeCodeCamp/freeCodeCamp", + "description": "freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,...", + "createdAt": "2014-12-24T17:49:19Z", + "updatedAt": "2025-10-27T07:40:58Z", + "pushedAt": "2025-10-26T11:31:08Z", + "stars": 430828, + "watchers": 8582, + "forks": 42136, + "defaultBranch": "main" + }, + { + "id": 132750724, + "name": "build-your-own-x", + "repo": "codecrafters-io/build-your-own-x", + "description": "Master programming by recreating your favorite technologies from scratch.", + "createdAt": "2018-05-09T12:03:18Z", + "updatedAt": "2025-10-27T07:43:25Z", + "pushedAt": "2025-10-10T18:45:01Z", + "stars": 430102, + "watchers": 6322, + "forks": 40388, + "defaultBranch": "master" + }, + { + "id": 21737465, + "name": "awesome", + "repo": "sindresorhus/awesome", + "description": "๐Ÿ˜Ž Awesome lists about all kinds of interesting topics", + "createdAt": "2014-07-11T13:42:37Z", + "updatedAt": "2025-10-27T07:44:27Z", + "pushedAt": "2025-10-23T17:26:53Z", + "stars": 409760, + "watchers": 8016, + "forks": 32015, + "defaultBranch": "main" + } + ] +} +``` + +**TOON** (8,745 tokens): + +``` +repositories[3]{id,name,repo,description,createdAt,updatedAt,pushedAt,stars,watchers,forks,defaultBranch}: + 28457823,freeCodeCamp,freeCodeCamp/freeCodeCamp,"freeCodeCamp.org's open-source codebase and curriculum. 
Learn math, programming,...","2014-12-24T17:49:19Z","2025-10-27T07:40:58Z","2025-10-26T11:31:08Z",430828,8582,42136,main + 132750724,build-your-own-x,codecrafters-io/build-your-own-x,Master programming by recreating your favorite technologies from scratch.,"2018-05-09T12:03:18Z","2025-10-27T07:43:25Z","2025-10-10T18:45:01Z",430102,6322,40388,master + 21737465,awesome,sindresorhus/awesome,๐Ÿ˜Ž Awesome lists about all kinds of interesting topics,"2014-07-11T13:42:37Z","2025-10-27T07:44:27Z","2025-10-23T17:26:53Z",409760,8016,32015,main +``` + +--- + +#### ๐Ÿ“ˆ Analytics Time Series + +**Configuration:** 180 days of web metrics (views, clicks, conversions, revenue) + +**Savings:** 5,393 tokens (59.8% reduction) + +**JSON** (9,024 tokens): + +```json +{ + "metrics": [ + { + "date": "2024-12-31", + "views": 3769, + "clicks": 400, + "conversions": 59, + "revenue": 198.98 + }, + { + "date": "2025-01-01", + "views": 5742, + "clicks": 463, + "conversions": 28, + "revenue": 295.77 + }, + { + "date": "2025-01-02", + "views": 3669, + "clicks": 336, + "conversions": 102, + "revenue": 624.23 + }, + { + "date": "2025-01-03", + "views": 1332, + "clicks": 304, + "conversions": 99, + "revenue": 113.06 + }, + { + "date": "2025-01-04", + "views": 1444, + "clicks": 222, + "conversions": 88, + "revenue": 986.69 + } + ] +} +``` + +**TOON** (3,631 tokens): + +``` +metrics[5]{date,views,clicks,conversions,revenue}: + 2024-12-31,3769,400,59,198.98 + 2025-01-01,5742,463,28,295.77 + 2025-01-02,3669,336,102,624.23 + 2025-01-03,1332,304,99,113.06 + 2025-01-04,1444,222,88,986.69 +``` + +
diff --git a/benchmarks/scripts/accuracy-benchmark.ts b/benchmarks/scripts/accuracy-benchmark.ts new file mode 100644 index 0000000..9867e5c --- /dev/null +++ b/benchmarks/scripts/accuracy-benchmark.ts @@ -0,0 +1,140 @@ +/** + * TOON LLM Accuracy Benchmark + * + * Main entry point that orchestrates the full benchmark: + * 1. Generate questions from datasets + * 2. Format data in all formats (JSON, TOON, YAML, Markdown-kv) + * 3. Evaluate each question with each format using LLMs + * 4. Generate reports + */ + +import type { EvaluationResult, Question } from '../src/types' +import * as fsp from 'node:fs/promises' +import * as path from 'node:path' +import { consola } from 'consola' +import pMap from 'p-map' +import { BENCHMARKS_DIR, DEFAULT_CONCURRENCY, DRY_RUN, DRY_RUN_LIMITS, ROOT_DIR } from '../src/constants' +import { datasets } from '../src/datasets' +import { evaluateQuestion, models } from '../src/evaluate' +import { formatters } from '../src/formatters' +import { generateQuestions } from '../src/questions' +import { calculateFormatResults, calculateTokenCounts, saveResults } from '../src/report' + +consola.start('LLM Accuracy Benchmark for TOON') + +// Check if results already exist +const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy') +const rawResultsPath = path.join(resultsDir, 'raw-results.json') +const summaryPath = path.join(resultsDir, 'summary.json') + +let existingResults: EvaluationResult[] | undefined +let existingTokenCounts: Record | undefined + +try { + const [rawData, summaryData] = await Promise.all([ + fsp.readFile(rawResultsPath, 'utf-8'), + fsp.readFile(summaryPath, 'utf-8'), + ]) + existingResults = JSON.parse(rawData) + const summary = JSON.parse(summaryData) + existingTokenCounts = summary.tokenCounts + consola.info('Found existing results โ€“ regenerating report only') +} +catch { + // Results don't exist, will run full evaluation +} + +if (DRY_RUN) { + consola.info('Limiting questions and models for dry run') +} + +let 
questions = generateQuestions() + +// Apply dry run limits if enabled +if (DRY_RUN && DRY_RUN_LIMITS.maxQuestions) { + questions = questions.slice(0, DRY_RUN_LIMITS.maxQuestions) +} + +// Filter models for dry run +const activeModels = DRY_RUN && DRY_RUN_LIMITS.allowedModels.length > 0 + ? Object.fromEntries( + Object.entries(models).filter(([name]) => DRY_RUN_LIMITS.allowedModels.includes(name)), + ) + : models + +let results: EvaluationResult[] +let tokenCounts: Record + +if (existingResults && existingTokenCounts) { + // Reuse existing results + results = existingResults + tokenCounts = existingTokenCounts +} +else { + // Run full evaluation + consola.info(`Evaluating ${questions.length} questions`) + consola.info(`Testing ${Object.keys(formatters).length} formats`) + consola.info(`Using ${Object.keys(activeModels).length} models: ${Object.keys(activeModels).join(', ')}`) + + // Calculate token counts for all format+dataset combinations + tokenCounts = calculateTokenCounts(formatters) + + // Format datasets once (reuse for all questions) + const formattedDatasets: Record> = {} + for (const [formatName, formatter] of Object.entries(formatters)) { + formattedDatasets[formatName] = {} + for (const dataset of datasets) { + const formatted = formatter(dataset.data) + formattedDatasets[formatName]![dataset.name] = formatted + } + } + + // Generate evaluation tasks + const tasks: { question: Question, formatName: string, modelName: string }[] = [] + for (const question of questions) { + for (const [formatName] of Object.entries(formatters)) { + for (const [modelName] of Object.entries(activeModels)) { + tasks.push({ question, formatName, modelName }) + } + } + } + + const total = tasks.length + + consola.start(`Running ${total} evaluations with concurrency: ${DEFAULT_CONCURRENCY}`) + + // Evaluate all tasks in parallel + results = await pMap( + tasks, + async (task, index) => { + const formattedData = formattedDatasets[task.formatName]![task.question.dataset]! 
+ const model = activeModels[task.modelName as keyof typeof activeModels] + + const result = await evaluateQuestion( + task.question, + task.formatName, + formattedData, + model, + task.modelName, + ) + + // Progress update + if ((index + 1) % 10 === 0) { + const percent = (((index + 1) / total) * 100).toFixed(1) + console.log(`โณ Progress: ${index + 1}/${total} (${percent}%)`) + } + + return result + }, + { concurrency: DEFAULT_CONCURRENCY }, + ) + + consola.success('Evaluation complete!') +} + +// Generate/regenerate markdown report +const formatResults = calculateFormatResults(results, tokenCounts) +await saveResults(results, formatResults, questions, tokenCounts) + +consola.info(`Results saved to: \`${path.relative(ROOT_DIR, resultsDir)}\``) +consola.success(existingResults ? 'Markdown report regenerated!' : 'Evaluation complete!') diff --git a/benchmarks/scripts/fetch-github-data.ts b/benchmarks/scripts/fetch-github-data.ts new file mode 100644 index 0000000..335dd77 --- /dev/null +++ b/benchmarks/scripts/fetch-github-data.ts @@ -0,0 +1,78 @@ +import * as fsp from 'node:fs/promises' +import * as path from 'node:path' +import process from 'node:process' +import { consola } from 'consola' +import { ofetch } from 'ofetch' +import { BENCHMARKS_DIR } from '../src/constants' + +try { + // Fetch top 100 repos from GitHub + const repoList = await searchTop100Repos() + const repos = await fetchRepoDetails(repoList) + + if (repos.length === 0) { + consola.error('โŒ No repositories fetched. 
Exiting.') + process.exit(1) + } + + // Sort by stars descending + repos.sort((a, b) => b.stars - a.stars) + + await saveRepos(repos) + + consola.success('Done!') +} +catch (error) { + consola.error(error) + process.exit(1) +} + +async function searchTop100Repos(): Promise { + consola.start('Fetching top 100 starred repositories from GitHub APIโ€ฆ') + + const response = await ofetch<{ items: { full_name: string }[] }>( + 'https://api.github.com/search/repositories', + { + query: { + q: 'stars:>1', + sort: 'stars', + order: 'desc', + per_page: 100, + }, + headers: { + 'Accept': 'application/vnd.github+json', + 'X-GitHub-Api-Version': '2022-11-28', + }, + }, + ) + + return response.items.map(item => item.full_name) +} + +async function fetchRepoDetails(repoList: string[]): Promise[]> { + consola.start(`Fetching ${repoList.length} GitHub repositoriesโ€ฆ`) + + const repos: Record[] = [] + + for (let i = 0; i < repoList.length; i++) { + const repoPath = repoList[i]! + console.log(`[${i + 1}/${repoList.length}] Fetching ${repoPath}โ€ฆ`) + const { repo } = await await ofetch(`https://ungh.cc/repos/${repoPath}`) + repos.push(repo) + } + + consola.success(`Successfully fetched ${repos.length}/${repoList.length} repositories`) + + return repos +} + +async function saveRepos(repos: Record[]): Promise { + const outputDir = path.join(BENCHMARKS_DIR, 'data') + const outputFile = path.join(outputDir, 'github-repos.json') + + await fsp.mkdir(outputDir, { recursive: true }) + await fsp.writeFile(outputFile, JSON.stringify(repos, undefined, 2)) + + const relativePath = path.relative(BENCHMARKS_DIR, outputFile) + consola.info(`Saved to \`${relativePath}\``) +} diff --git a/benchmarks/scripts/token-efficiency-benchmark.ts b/benchmarks/scripts/token-efficiency-benchmark.ts new file mode 100644 index 0000000..5957115 --- /dev/null +++ b/benchmarks/scripts/token-efficiency-benchmark.ts @@ -0,0 +1,228 @@ +import * as fsp from 'node:fs/promises' +import * as path from 'node:path' +import { 
faker } from '@faker-js/faker' +import { consola } from 'consola' +import { encode as encodeTokens } from 'gpt-tokenizer' // o200k_base encoding (default) +import { encode } from '../../src/index' +import githubRepos from '../data/github-repos.json' with { type: 'json' } +import { BENCHMARKS_DIR, ROOT_DIR } from '../src/constants' + +interface BenchmarkResult { + name: string + emoji: string + description: string + data: any + jsonTokens: number + toonTokens: number + savings: number + savingsPercent: string + showDetailed: boolean +} + +const outputFilePath = path.join(BENCHMARKS_DIR, 'results', 'token-efficiency.md') + +const BENCHMARK_EXAMPLES = [ + { + name: 'GitHub Repositories', + emoji: 'โญ', + description: 'Top 100 GitHub repositories with stars, forks, and metadata', + getData: () => ({ repositories: githubRepos }), + showDetailed: true, + }, + { + name: 'Analytics Time Series', + emoji: '๐Ÿ“ˆ', + description: '180 days of web metrics (views, clicks, conversions, revenue)', + getData: () => generateAnalytics(180), + showDetailed: true, + }, + { + name: 'API Response', + emoji: '๐Ÿ‘ฅ', + description: '50 user records with metadata and timestamps', + getData: () => generateUsers(50), + showDetailed: false, + }, + { + name: 'E-commerce Order', + emoji: '๐Ÿ›’', + description: 'Nested order with customer and items', + getData: generateOrder, + showDetailed: false, + }, +] as const + +// Calculate total savings +let totalJsonTokens = 0 +let totalToonTokens = 0 + +const results: BenchmarkResult[] = [] + +for (const example of BENCHMARK_EXAMPLES) { + const data = await example.getData() + + const jsonString = JSON.stringify(data, undefined, 2) + const toonString = encode(data) + + const jsonTokens = encodeTokens(jsonString).length + const toonTokens = encodeTokens(toonString).length + const savings = jsonTokens - toonTokens + const savingsPercent = ((savings / jsonTokens) * 100).toFixed(1) + + totalJsonTokens += jsonTokens + totalToonTokens += toonTokens + + 
results.push({ + name: example.name, + emoji: example.emoji, + description: example.description, + data, + jsonTokens, + toonTokens, + savings, + savingsPercent, + showDetailed: example.showDetailed, + }) +} + +const totalSavings = totalJsonTokens - totalToonTokens +const totalSavingsPercent = ((totalSavings / totalJsonTokens) * 100).toFixed(1) + +// Generate ASCII bar chart visualization +const barChartSection = results + .map((result) => { + const percentage = Number.parseFloat(result.savingsPercent) + const bar = generateBarChart(100 - percentage) // Invert to show TOON tokens + const jsonStr = result.jsonTokens.toLocaleString('en-US') + const toonStr = result.toonTokens.toLocaleString('en-US') + return `${result.emoji} ${result.name.padEnd(25)} ${bar} ${toonStr.padStart(6)} tokens (JSON: ${jsonStr.padStart(6)}) ๐Ÿ’ฐ ${result.savingsPercent}% saved` + }) + .join('\n') + +// Generate detailed examples (only for selected examples) +const detailedExamples = results + .filter(result => result.showDetailed) + .map((result, i, filtered) => { + // Truncate large datasets for display + let displayData = result.data + if (result.name === 'GitHub Repositories') { + displayData = { + repositories: result.data.repositories.slice(0, 3).map((repo: any) => ({ + ...repo, + description: repo.description?.slice(0, 80) + (repo.description?.length > 80 ? '...' : ''), + })), + } + } + else if (result.name === 'Analytics Time Series') { + displayData = { metrics: result.data.metrics.slice(0, 5) } + } + + const separator = i < filtered.length - 1 ? 
'\n\n---' : '' + + return `#### ${result.emoji} ${result.name} + +**Configuration:** ${result.description} + +**Savings:** ${result.savings.toLocaleString('en-US')} tokens (${result.savingsPercent}% reduction) + +**JSON** (${result.jsonTokens.toLocaleString('en-US')} tokens): + +\`\`\`json +${JSON.stringify(displayData, undefined, 2)} +\`\`\` + +**TOON** (${result.toonTokens.toLocaleString('en-US')} tokens): + +\`\`\` +${encode(displayData)} +\`\`\`${separator}` + }) + .join('\n\n') + +const markdown = `### Token Efficiency + +\`\`\` +${barChartSection} +\`\`\` + +**Total:** ${totalToonTokens.toLocaleString('en-US')} tokens (TOON) vs ${totalJsonTokens.toLocaleString('en-US')} tokens (JSON) โ†’ ${totalSavingsPercent}% savings + +
+View detailed examples + +${detailedExamples} + +
+`.trimStart() + +console.log(markdown) + +await fsp.mkdir(path.join(BENCHMARKS_DIR, 'results'), { recursive: true }) +await fsp.writeFile(outputFilePath, markdown, 'utf-8') + +consola.success(`Benchmark written to \`${path.relative(ROOT_DIR, outputFilePath)}\``) + +// Generate ASCII bar chart +function generateBarChart(percentage: number, maxWidth: number = 25): string { + const filled = Math.round((percentage / 100) * maxWidth) + const empty = maxWidth - filled + return 'โ–ˆ'.repeat(filled) + 'โ–‘'.repeat(empty) +} + +// Generate analytics time series data +function generateAnalytics(days: number) { + return { + metrics: Array.from({ length: days }, (_, i) => { + const date = new Date(2025, 0, 1) + date.setDate(date.getDate() + i) + return { + date: date.toISOString().split('T')[0], + views: Math.floor(Math.random() * 5000) + 1000, + clicks: Math.floor(Math.random() * 500) + 50, + conversions: Math.floor(Math.random() * 100) + 10, + revenue: Number((Math.random() * 1000 + 100).toFixed(2)), + } + }), + } +} + +// Generate user API response +function generateUsers(count: number) { + return { + users: Array.from({ length: count }, (_, i) => ({ + id: i + 1, + name: faker.person.fullName(), + email: faker.internet.email(), + role: faker.helpers.arrayElement(['admin', 'user', 'moderator']), + active: faker.datatype.boolean(), + createdAt: faker.date.past({ years: 2 }).toISOString(), + lastLogin: faker.date.recent({ days: 30 }).toISOString(), + })), + total: count, + page: 1, + } +} + +// Generate nested e-commerce order +function generateOrder() { + return { + orderId: faker.string.alphanumeric({ length: 12, casing: 'upper' }), + customer: { + id: faker.number.int({ min: 1000, max: 9999 }), + name: faker.person.fullName(), + email: faker.internet.email(), + phone: faker.phone.number(), + }, + items: Array.from({ length: faker.number.int({ min: 2, max: 5 }) }, () => ({ + sku: faker.string.alphanumeric({ length: 8, casing: 'upper' }), + name: 
faker.commerce.productName(), + quantity: faker.number.int({ min: 1, max: 5 }), + price: Number(faker.commerce.price({ min: 10, max: 200 })), + })), + subtotal: Number(faker.commerce.price({ min: 100, max: 500 })), + tax: Number(faker.commerce.price({ min: 10, max: 50 })), + total: Number(faker.commerce.price({ min: 110, max: 550 })), + status: faker.helpers.arrayElement(['pending', 'processing', 'shipped', 'delivered']), + createdAt: faker.date.recent({ days: 7 }).toISOString(), + } +} diff --git a/benchmarks/src/constants.ts b/benchmarks/src/constants.ts new file mode 100644 index 0000000..e146db0 --- /dev/null +++ b/benchmarks/src/constants.ts @@ -0,0 +1,39 @@ +import process from 'node:process' +import * as url from 'node:url' + +export const ROOT_DIR: string = url.fileURLToPath(new URL('../../', import.meta.url)) +export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.meta.url)) + +/** + * Benchmark execution configuration + */ + +/** + * Enable dry run mode for quick testing with limited AI requests + * + * @remarks + * Set via environment variable: `DRY_RUN=true` + */ +export const DRY_RUN: boolean = process.env.DRY_RUN === 'true' + +/** + * Limits applied when DRY_RUN is enabled + */ +export const DRY_RUN_LIMITS = { + /** Maximum number of questions to evaluate */ + maxQuestions: 10, + /** Maximum number of formats to test */ + maxFormats: undefined as number | undefined, + /** Models to use in dry run */ + allowedModels: [] as string[], +} + +/** + * Default concurrency for parallel evaluations + */ +export const DEFAULT_CONCURRENCY = 20 + +/** + * Delay between API requests to avoid rate limiting (in milliseconds) + */ +export const RATE_LIMIT_DELAY_MS = 100 diff --git a/benchmarks/src/datasets.ts b/benchmarks/src/datasets.ts new file mode 100644 index 0000000..87643f2 --- /dev/null +++ b/benchmarks/src/datasets.ts @@ -0,0 +1,146 @@ +/** + * Datasets for TOON benchmarks + * + * These datasets are designed to test TOON's strengths 
and weaknesses: + * - Tabular: Uniform records (TOON optimal) + * - Nested: Complex structures with nested objects + * - Analytics: Time-series data + */ + +import type { Dataset } from './types' +import { faker } from '@faker-js/faker' +import githubRepos from '../data/github-repos.json' with { type: 'json' } + +// Seed for reproducibility +faker.seed(12345) + +/** + * Tabular dataset: 100 uniform employee records + * + * @remarks + * Tests TOON's tabular array format + */ +const departments = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] +const tabularDataset: Dataset = { + name: 'tabular', + description: 'Uniform employee records (TOON optimal format)', + data: { + employees: Array.from({ length: 100 }, (_, i) => { + const yearsExp = faker.number.int({ min: 1, max: 20 }) + return { + id: i + 1, + name: faker.person.fullName(), + email: faker.internet.email().toLowerCase(), + department: departments[i % departments.length]!, + salary: faker.number.int({ min: 45000, max: 150000 }), + yearsExperience: yearsExp, + active: faker.datatype.boolean(0.8), // 80% active + } + }), + }, +} + +/** + * Nested dataset: 50 e-commerce orders with nested structures + * + * @remarks + * Tests TOON's handling of complex nested objects + */ +const productNames = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp'] +const statuses = ['pending', 'processing', 'shipped', 'delivered', 'cancelled'] + +const nestedDataset: Dataset = { + name: 'nested', + description: 'E-commerce orders with nested structures', + data: { + orders: Array.from({ length: 50 }, (_, i) => { + const customerId = (i % 20) + 1 + const itemCount = faker.number.int({ min: 1, max: 4 }) + + const items = Array.from({ length: itemCount }, (_, j) => { + const price = faker.number.float({ min: 9.99, max: 199.99, fractionDigits: 2 }) + const quantity = faker.number.int({ min: 1, max: 5 }) + return { + sku: `SKU-${faker.string.alphanumeric({ 
length: 6 }).toUpperCase()}`, + name: productNames[j % productNames.length]!, + quantity, + price, + } + }) + + const total = Number(items.reduce((sum, item) => sum + (item.price * item.quantity), 0).toFixed(2)) + + return { + orderId: `ORD-${String(i + 1).padStart(4, '0')}`, + customer: { + id: customerId, + name: faker.person.fullName(), + email: faker.internet.email().toLowerCase(), + }, + items, + total, + status: statuses[i % statuses.length]!, + orderDate: faker.date.recent({ days: 90 }).toISOString().split('T')[0], + } + }), + }, +} + +/** + * Analytics dataset: 60 days of time-series metrics + * + * @remarks + * Tests TOON's handling of numeric data and date fields + */ +const analyticsDataset: Dataset = { + name: 'analytics', + description: 'Time-series analytics data', + data: { + metrics: Array.from({ length: 60 }, (_, i) => { + const date = new Date('2025-01-01') + date.setDate(date.getDate() + i) + + // Simulate realistic web traffic with some variation + const baseViews = 5000 + const weekendMultiplier = date.getDay() === 0 || date.getDay() === 6 ? 
0.7 : 1.0 + const views = Math.round(baseViews * weekendMultiplier + faker.number.int({ min: -1000, max: 3000 })) + const clicks = Math.round(views * faker.number.float({ min: 0.02, max: 0.08 })) + const conversions = Math.round(clicks * faker.number.float({ min: 0.05, max: 0.15 })) + const avgOrderValue = faker.number.float({ min: 49.99, max: 299.99 }) + const revenue = Number((conversions * avgOrderValue).toFixed(2)) + + return { + date: date.toISOString().split('T')[0]!, + views, + clicks, + conversions, + revenue, + bounceRate: faker.number.float({ min: 0.3, max: 0.7, fractionDigits: 2 }), + } + }), + }, +} + +/** + * GitHub dataset: Popular repositories + * + * @remarks + * Tests TOON's tabular format with real-world data + */ +const githubDataset: Dataset = { + name: 'github', + description: 'Popular GitHub repositories', + data: { + repositories: githubRepos.slice(0, 200), + }, +} + +/** + * All datasets used in the benchmark + */ +export const datasets: Dataset[] = [ + tabularDataset, + nestedDataset, + analyticsDataset, + githubDataset, +] diff --git a/benchmarks/src/evaluate.ts b/benchmarks/src/evaluate.ts new file mode 100644 index 0000000..ec1c3ec --- /dev/null +++ b/benchmarks/src/evaluate.ts @@ -0,0 +1,133 @@ +/** + * LLM evaluation logic for TOON benchmarks + * + * Handles: + * - Model configuration + * - Question evaluation with LLMs + * - Answer validation using LLM-as-judge + */ + +import type { LanguageModelV2 } from '@ai-sdk/provider' +import type { EvaluationResult, Question } from './types' +import { setTimeout } from 'node:timers/promises' +import { anthropic } from '@ai-sdk/anthropic' +import { openai } from '@ai-sdk/openai' +import { generateText } from 'ai' +import { consola } from 'consola' +import { RATE_LIMIT_DELAY_MS } from './constants' + +/** + * Models used for evaluation + */ +export const models: Record = { + 'gpt-4o-mini': openai('gpt-4o-mini'), + 'claude-haiku-4-5': anthropic('claude-haiku-4-5-20251001'), +} + +/** + * Validate 
an answer using LLM-as-judge approach + * More robust than string matching for LLM outputs + */ +export async function validateAnswer( + actual: string, + expected: string, + question: string, +): Promise { + const prompt = `You are validating answers to questions about structured data. + +Question: ${question} +Expected answer: ${expected} +Actual answer: ${actual} + +Is the actual answer correct? Consider: +- Exact matches are correct +- Semantically equivalent answers are correct (e.g., "50000" vs "$50,000" vs "50000 dollars") +- Minor formatting differences are acceptable +- Case-insensitive comparison for text + +Respond with only "YES" or "NO".` + + try { + const { text } = await generateText({ + model: models['gpt-4o-mini']!, + prompt, + temperature: 0, + maxOutputTokens: 16, + }) + + await setTimeout(RATE_LIMIT_DELAY_MS) + + return text.trim().toUpperCase() === 'YES' + } + catch (error) { + consola.error('Validation error:', error) + // Fallback to simple string comparison + return actual.toLowerCase().trim() === expected.toLowerCase().trim() + } +} + +/** + * Evaluate a single question with a specific format and model + */ +export async function evaluateQuestion( + question: Question, + formatName: string, + formattedData: string, + model: any, + modelName: string, +): Promise { + const prompt = `Given the following data in ${formatName} format: + +\`\`\` +${formattedData} +\`\`\` + +Question: ${question.prompt} + +Provide only the direct answer, without any additional explanation or formatting.` + + const startTime = Date.now() + + try { + const { text, usage } = await generateText({ + model, + prompt, + temperature: 0, + maxOutputTokens: 50, + }) + + await setTimeout(RATE_LIMIT_DELAY_MS) + + const latencyMs = Date.now() - startTime + const correct = await validateAnswer(text.trim(), question.groundTruth, question.prompt) + + return { + questionId: question.id, + format: formatName, + model: modelName, + expected: question.groundTruth, + actual: 
text.trim(), + correct, + inputTokens: usage.inputTokens ?? 0, + outputTokens: usage.outputTokens ?? 0, + latencyMs, + } + } + catch (error) { + consola.error(`Error evaluating ${question.id} with ${formatName}/${modelName}:`, error) + + await setTimeout(RATE_LIMIT_DELAY_MS) + + return { + questionId: question.id, + format: formatName, + model: modelName, + expected: question.groundTruth, + actual: '', + correct: false, + inputTokens: 0, + outputTokens: 0, + latencyMs: Date.now() - startTime, + } + } +} diff --git a/benchmarks/src/formatters.ts b/benchmarks/src/formatters.ts new file mode 100644 index 0000000..e1081e3 --- /dev/null +++ b/benchmarks/src/formatters.ts @@ -0,0 +1,90 @@ +/** + * Format converters for TOON benchmarks + * + * Converts data to different formats: + * - JSON + * - TOON + * - CSV + * - Markdown key-value + * - YAML + */ + +import { stringify as stringifyCSV } from 'csv-stringify/sync' +import { stringify as stringifyYAML } from 'yaml' +import { encode as encodeToon } from '../../src/index' + +export const formatters = { + 'json': (data: unknown): string => JSON.stringify(data, undefined, 2), + 'toon': (data: unknown): string => encodeToon(data), + 'csv': (data: unknown): string => toCSV(data), + 'markdown-kv': (data: unknown): string => toMarkdownKV(data), + 'yaml': (data: unknown): string => stringifyYAML(data), +} + +function toCSV(data: unknown): string { + const sections: string[] = [] + + // Handle top-level object with arrays + if (typeof data === 'object' && data !== null && !Array.isArray(data)) { + for (const [key, value] of Object.entries(data)) { + if (Array.isArray(value) && value.length > 0) { + sections.push(`# ${key}`) + sections.push(stringifyCSV(value, { header: true })) + } + } + return sections.join('\n').trim() + } + + // Root-level array + if (Array.isArray(data) && data.length > 0) { + return stringifyCSV(data, { header: true }).trim() + } + + return '' +} + +function toMarkdownKV(data: unknown, indent = 0): string { + 
const spaces = ' '.repeat(indent) + const lines: string[] = [] + + if (Array.isArray(data)) { + data.forEach((item, i) => { + if (typeof item === 'object' && item !== null && !Array.isArray(item)) { + Object.entries(item).forEach(([key, value]) => { + if (typeof value === 'object' && value !== null) { + lines.push(`${spaces}**${key}**:`) + lines.push(toMarkdownKV(value, indent + 1)) + } + else { + lines.push(`${spaces}**${key}**: ${value}`) + } + }) + if (i < data.length - 1) + lines.push('') + } + else { + lines.push(`${spaces}- ${item}`) + } + }) + } + else if (typeof data === 'object' && data !== null) { + Object.entries(data).forEach(([key, value]) => { + if (Array.isArray(value)) { + lines.push(`${spaces}**${key}**:`) + lines.push(toMarkdownKV(value, indent + 1)) + } + else if (typeof value === 'object' && value !== null) { + lines.push(`${spaces}**${key}**:`) + lines.push(toMarkdownKV(value, indent + 1)) + } + else { + lines.push(`${spaces}**${key}**: ${value}`) + } + }) + } + else { + lines.push(`${spaces}${data}`) + } + + return lines.join('\n') +} diff --git a/benchmarks/src/questions.ts b/benchmarks/src/questions.ts new file mode 100644 index 0000000..e211dce --- /dev/null +++ b/benchmarks/src/questions.ts @@ -0,0 +1,398 @@ +/* eslint-disable no-console */ + +/** + * Question generation for TOON benchmarks + * + * Generates ~200 questions across different types: + * - Field retrieval (50%): "What is X's Y?" + * - Aggregation (25%): "How many X have Y?" 
+ * - Filtering (25%): "List/count X where Y" + * + * Questions are generated dynamically based on actual data values + */ + +import type { Question } from './types' +import { datasets } from './datasets' + +/** + * Generate all questions from datasets + */ +export function generateQuestions(): Question[] { + const questions: Question[] = [] + let idCounter = 1 + + // Get datasets + const tabular = datasets.find(d => d.name === 'tabular')?.data.employees as any[] || [] + const nested = datasets.find(d => d.name === 'nested')?.data.orders as any[] || [] + const analytics = datasets.find(d => d.name === 'analytics')?.data.metrics as any[] || [] + const github = datasets.find(d => d.name === 'github')?.data.repositories as any[] || [] + + // ======================================== + // TABULAR DATASET QUESTIONS (70 questions) + // ======================================== + + if (tabular.length > 0) { + // Field retrieval: specific employees (40 questions) + for (let i = 0; i < Math.min(40, tabular.length); i++) { + const emp = tabular[i * 2] || tabular[i] + if (!emp) + continue + + // Alternate between different field types + if (i % 3 === 0) { + questions.push({ + id: `q${idCounter++}`, + prompt: `What is the salary of ${emp.name}?`, + groundTruth: String(emp.salary), + type: 'field-retrieval', + dataset: 'tabular', + }) + } + else if (i % 3 === 1) { + questions.push({ + id: `q${idCounter++}`, + prompt: `What department does ${emp.name} work in?`, + groundTruth: emp.department, + type: 'field-retrieval', + dataset: 'tabular', + }) + } + else { + questions.push({ + id: `q${idCounter++}`, + prompt: `What is the email address of ${emp.name}?`, + groundTruth: emp.email, + type: 'field-retrieval', + dataset: 'tabular', + }) + } + } + + // Aggregation: count by department + const departments = [...new Set(tabular.map((e: any) => e.department))] + for (const dept of departments.slice(0, 6)) { + const count = tabular.filter((e: any) => e.department === dept).length + 
questions.push({ + id: `q${idCounter++}`, + prompt: `How many employees work in ${dept}?`, + groundTruth: String(count), + type: 'aggregation', + dataset: 'tabular', + }) + } + + // Aggregation: salary ranges (4 questions) + const salaryThresholds = [60000, 80000, 100000, 120000] + for (const threshold of salaryThresholds) { + const count = tabular.filter((e: any) => e.salary > threshold).length + questions.push({ + id: `q${idCounter++}`, + prompt: `How many employees have a salary greater than ${threshold}?`, + groundTruth: String(count), + type: 'aggregation', + dataset: 'tabular', + }) + } + + // Filtering: active status + const activeCount = tabular.filter((e: any) => e.active).length + const inactiveCount = tabular.filter((e: any) => !e.active).length + questions.push( + { + id: `q${idCounter++}`, + prompt: 'How many employees are active?', + groundTruth: String(activeCount), + type: 'filtering', + dataset: 'tabular', + }, + { + id: `q${idCounter++}`, + prompt: 'How many employees are inactive?', + groundTruth: String(inactiveCount), + type: 'filtering', + dataset: 'tabular', + }, + ) + + // Complex filtering: multi-condition (8 questions) + for (const dept of departments.slice(0, 4)) { + const count = tabular.filter((e: any) => e.department === dept && e.salary > 80000).length + questions.push({ + id: `q${idCounter++}`, + prompt: `How many employees in ${dept} have a salary greater than 80000?`, + groundTruth: String(count), + type: 'filtering', + dataset: 'tabular', + }) + } + + for (const exp of [5, 10]) { + const count = tabular.filter((e: any) => e.yearsExperience > exp && e.active).length + questions.push({ + id: `q${idCounter++}`, + prompt: `How many active employees have more than ${exp} years of experience?`, + groundTruth: String(count), + type: 'filtering', + dataset: 'tabular', + }) + } + } + + // ======================================== + // NESTED DATASET QUESTIONS (50 questions) + // ======================================== + + if (nested.length 
> 0) { + // Field retrieval: order totals (20 questions) + for (let i = 0; i < Math.min(20, nested.length); i++) { + const order = nested[i * 2] || nested[i] + if (!order) + continue + + if (i % 2 === 0) { + questions.push({ + id: `q${idCounter++}`, + prompt: `What is the total amount for order ${order.orderId}?`, + groundTruth: String(order.total), + type: 'field-retrieval', + dataset: 'nested', + }) + } + else { + questions.push({ + id: `q${idCounter++}`, + prompt: `What is the status of order ${order.orderId}?`, + groundTruth: order.status, + type: 'field-retrieval', + dataset: 'nested', + }) + } + } + + // Field retrieval: customer info (15 questions) + for (let i = 0; i < Math.min(15, nested.length); i++) { + const order = nested[i * 3] || nested[i] + if (!order) + continue + + questions.push({ + id: `q${idCounter++}`, + prompt: `What is the customer name for order ${order.orderId}?`, + groundTruth: order.customer.name, + type: 'field-retrieval', + dataset: 'nested', + }) + } + + // Aggregation: count by status + const statuses = [...new Set(nested.map((o: any) => o.status))] + for (const status of statuses) { + const count = nested.filter((o: any) => o.status === status).length + questions.push({ + id: `q${idCounter++}`, + prompt: `How many orders have status "${status}"?`, + groundTruth: String(count), + type: 'filtering', + dataset: 'nested', + }) + } + + // Aggregation: total revenue + const totalRevenue = nested.reduce((sum: number, o: any) => sum + o.total, 0) + questions.push({ + id: `q${idCounter++}`, + prompt: 'What is the total revenue across all orders?', + groundTruth: String(totalRevenue.toFixed(2)), + type: 'aggregation', + dataset: 'nested', + }) + + // Filtering: high-value orders (3 questions) + const highValueThresholds = [200, 400, 600] + for (const threshold of highValueThresholds) { + const count = nested.filter((o: any) => o.total > threshold).length + questions.push({ + id: `q${idCounter++}`, + prompt: `How many orders have a total 
greater than ${threshold}?`, + groundTruth: String(count), + type: 'filtering', + dataset: 'nested', + }) + } + } + + // ======================================== + // ANALYTICS DATASET QUESTIONS (40 questions) + // ======================================== + + if (analytics.length > 0) { + // Field retrieval: specific dates (20 questions) + for (let i = 0; i < Math.min(20, analytics.length); i++) { + const metric = analytics[i * 3] || analytics[i] + if (!metric) + continue + + if (i % 2 === 0) { + questions.push({ + id: `q${idCounter++}`, + prompt: `How many views were recorded on ${metric.date}?`, + groundTruth: String(metric.views), + type: 'field-retrieval', + dataset: 'analytics', + }) + } + else { + questions.push({ + id: `q${idCounter++}`, + prompt: `What was the revenue on ${metric.date}?`, + groundTruth: String(metric.revenue), + type: 'field-retrieval', + dataset: 'analytics', + }) + } + } + + // Aggregation: totals (4 questions) + const totalViews = analytics.reduce((sum: number, m: any) => sum + m.views, 0) + const totalRevenue = analytics.reduce((sum: number, m: any) => sum + m.revenue, 0) + const totalConversions = analytics.reduce((sum: number, m: any) => sum + m.conversions, 0) + + questions.push( + { + id: `q${idCounter++}`, + prompt: 'What is the total number of views across all dates?', + groundTruth: String(totalViews), + type: 'aggregation', + dataset: 'analytics', + }, + { + id: `q${idCounter++}`, + prompt: 'What is the total revenue across all dates?', + groundTruth: String(totalRevenue.toFixed(2)), + type: 'aggregation', + dataset: 'analytics', + }, + { + id: `q${idCounter++}`, + prompt: 'What is the total number of conversions across all dates?', + groundTruth: String(totalConversions), + type: 'aggregation', + dataset: 'analytics', + }, + ) + + // Filtering: high-performing days (10 questions) + const viewThresholds = [5000, 6000, 7000] + for (const threshold of viewThresholds) { + const count = analytics.filter((m: any) => m.views > 
threshold).length + questions.push({ + id: `q${idCounter++}`, + prompt: `How many days had more than ${threshold} views?`, + groundTruth: String(count), + type: 'filtering', + dataset: 'analytics', + }) + } + + const conversionThresholds = [10, 20, 30] + for (const threshold of conversionThresholds) { + const count = analytics.filter((m: any) => m.conversions > threshold).length + questions.push({ + id: `q${idCounter++}`, + prompt: `How many days had more than ${threshold} conversions?`, + groundTruth: String(count), + type: 'filtering', + dataset: 'analytics', + }) + } + } + + // ======================================== + // GITHUB DATASET QUESTIONS (40 questions) + // ======================================== + + if (github.length > 0) { + // Field retrieval: specific repos (20 questions) + for (let i = 0; i < Math.min(20, github.length); i++) { + const repo = github[i * 10] || github[i] + if (!repo) + continue + + if (i % 2 === 0) { + questions.push({ + id: `q${idCounter++}`, + prompt: `How many stars does ${repo.owner}/${repo.name} have?`, + groundTruth: String(repo.stars), + type: 'field-retrieval', + dataset: 'github', + }) + } + else { + questions.push({ + id: `q${idCounter++}`, + prompt: `How many forks does ${repo.owner}/${repo.name} have?`, + groundTruth: String(repo.forks), + type: 'field-retrieval', + dataset: 'github', + }) + } + } + + // Aggregation: count by owner (5 questions) + const owners = [...new Set(github.map((r: any) => r.owner))] + for (const owner of owners.slice(0, 5)) { + const count = github.filter((r: any) => r.owner === owner).length + questions.push({ + id: `q${idCounter++}`, + prompt: `How many repositories does ${owner} have in the dataset?`, + groundTruth: String(count), + type: 'aggregation', + dataset: 'github', + }) + } + + // Aggregation: total stars + const totalStars = github.reduce((sum: number, r: any) => sum + r.stars, 0) + questions.push({ + id: `q${idCounter++}`, + prompt: 'What is the total number of stars across all 
repositories?', + groundTruth: String(totalStars), + type: 'aggregation', + dataset: 'github', + }) + + // Filtering: popular repos (8 questions) + const starThresholds = [10000, 50000, 100000] + for (const threshold of starThresholds) { + const count = github.filter((r: any) => r.stars > threshold).length + questions.push({ + id: `q${idCounter++}`, + prompt: `How many repositories have more than ${threshold} stars?`, + groundTruth: String(count), + type: 'filtering', + dataset: 'github', + }) + } + + const forkThresholds = [1000, 5000, 10000] + for (const threshold of forkThresholds) { + const count = github.filter((r: any) => r.forks > threshold).length + questions.push({ + id: `q${idCounter++}`, + prompt: `How many repositories have more than ${threshold} forks?`, + groundTruth: String(count), + type: 'filtering', + dataset: 'github', + }) + } + } + + console.log(`๐Ÿ“Š Question breakdown:`) + console.log(` Tabular: ${questions.filter(q => q.dataset === 'tabular').length}`) + console.log(` Nested: ${questions.filter(q => q.dataset === 'nested').length}`) + console.log(` Analytics: ${questions.filter(q => q.dataset === 'analytics').length}`) + console.log(` GitHub: ${questions.filter(q => q.dataset === 'github').length}`) + console.log(` Total: ${questions.length}`) + + return questions +} diff --git a/benchmarks/src/report.ts b/benchmarks/src/report.ts new file mode 100644 index 0000000..2638622 --- /dev/null +++ b/benchmarks/src/report.ts @@ -0,0 +1,288 @@ +/** + * Report generation for TOON benchmarks + * + * Handles: + * - Statistical analysis + * - Twitter-ready markdown report generation with visual elements + * - Per-dataset breakdowns + * - Cost analysis + * - Result file saving + */ + +import type { EvaluationResult, FormatResult, Question } from './types' +import * as fsp from 'node:fs/promises' +import * as path from 'node:path' +import { encode } from 'gpt-tokenizer' +import { BENCHMARKS_DIR } from './constants' +import { datasets } from './datasets' 
+import { models } from './evaluate' + +/** + * Calculate per-format statistics from evaluation results + */ +export function calculateFormatResults( + results: EvaluationResult[], + tokenCounts: Record, +): FormatResult[] { + const formatNames = [...new Set(results.map(r => r.format))] + + return formatNames.map((formatName) => { + const formatResults = results.filter(r => r.format === formatName) + const correctCount = formatResults.filter(r => r.correct).length + const totalCount = formatResults.length + const accuracy = correctCount / totalCount + + // Calculate average tokens across all datasets for this format + const avgTokens = Object.entries(tokenCounts) + .filter(([key]) => key.startsWith(`${formatName}-`)) + .reduce((sum, [, tokens]) => sum + tokens, 0) / datasets.length + + const avgInputTokens = formatResults.reduce((sum, r) => sum + r.inputTokens, 0) / totalCount + const avgLatency = formatResults.reduce((sum, r) => sum + r.latencyMs, 0) / totalCount + + return { + format: formatName, + accuracy, + totalTokens: Math.round(avgTokens), + avgInputTokens: Math.round(avgInputTokens), + avgLatency: Math.round(avgLatency), + correctCount, + totalCount, + } + }).sort((a, b) => b.accuracy - a.accuracy) +} + +/** + * Generate embeddable markdown report from results + */ +export function generateMarkdownReport( + formatResults: FormatResult[], + results: EvaluationResult[], + questions: Question[], + tokenCounts: Record, +): string { + const lines: string[] = [ + '### Retrieval Accuracy', + '', + ] + + const toon = formatResults.find(r => r.format === 'toon') + const json = formatResults.find(r => r.format === 'json') + + // Model-by-model breakdown (most interesting result) + const modelCount = Object.keys(models).length + lines.push(`Tested across **${modelCount} ${modelCount === 1 ? 
'LLM' : 'LLMs'}** with data retrieval tasks:`, '', '```') + + for (const modelName of Object.keys(models)) { + const modelResults = formatResults.map((fr) => { + const modelFormatResults = results.filter(r => r.model === modelName && r.format === fr.format) + const correctCount = modelFormatResults.filter(r => r.correct).length + const totalCount = modelFormatResults.length + const accuracy = totalCount > 0 ? correctCount / totalCount : 0 + + return { + format: fr.format, + accuracy, + correctCount, + totalCount, + } + }).sort((a, b) => b.accuracy - a.accuracy) + + const bestResult = modelResults[0]! + const bar = createTokenBar(bestResult.accuracy, 1, 20) + + lines.push(`${modelName.padEnd(20)} ${bar} ${(bestResult.accuracy * 100).toFixed(1)}% accuracy`) + } + + lines.push('```', '') + + // Summary comparison + if (toon && json) { + const tokenSavings = ((1 - toon.totalTokens / json.totalTokens) * 100).toFixed(1) + lines.push( + `**TOON achieves ${(toon.accuracy * 100).toFixed(1)}% accuracy (vs JSON's ${(json.accuracy * 100).toFixed(1)}%) while using ${tokenSavings}% fewer tokens.**`, + '', + ) + } + + // Simple format comparison table + lines.push( + '| Format | Accuracy | Average Tokens |', + '| ------ | -------- | -------------- |', + ) + + for (const result of formatResults) { + lines.push( + `| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.totalTokens.toLocaleString()} |`, + ) + } + + lines.push('', '
', 'View detailed breakdown by dataset and model', '', '#### Performance by Dataset', '') + + for (const dataset of datasets) { + lines.push(`##### ${dataset.description}`, '') + + const datasetResults = formatResults.map((fr) => { + const datasetFormatResults = results.filter(r => r.questionId.includes(dataset.name) || questions.find(q => q.id === r.questionId)?.dataset === dataset.name) + if (datasetFormatResults.length === 0) + return undefined + + const formatDatasetResults = datasetFormatResults.filter(r => r.format === fr.format) + if (formatDatasetResults.length === 0) + return undefined + + const correctCount = formatDatasetResults.filter(r => r.correct).length + const totalCount = formatDatasetResults.length + const accuracy = totalCount > 0 ? correctCount / totalCount : 0 + + // Get token count for this dataset+format + const tokenKey = `${fr.format}-${dataset.name}` + const tokens = tokenCounts[tokenKey] || fr.totalTokens + + return { + format: fr.format, + accuracy, + tokens, + correctCount, + totalCount, + } + }).filter(Boolean) as { format: string, accuracy: number, tokens: number, correctCount: number, totalCount: number }[] + + if (datasetResults.length === 0) + continue + + // Sort by efficiency + datasetResults.sort((a, b) => { + const effA = (a.accuracy ** 2) / (a.tokens / 1000) + const effB = (b.accuracy ** 2) / (b.tokens / 1000) + return effB - effA + }) + + lines.push( + '| Format | Accuracy | Tokens | Correct/Total |', + '|--------|----------|--------|---------------|', + ) + + for (const result of datasetResults.slice(0, 6)) { + lines.push( + `| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.tokens.toLocaleString()} | ${result.correctCount}/${result.totalCount} |`, + ) + } + + lines.push('') + } + + // Model breakdown + lines.push('', '#### Performance by Model', '') + + for (const modelName of Object.keys(models)) { + lines.push(`##### ${modelName}`, '') + + const modelResults = formatResults.map((fr) => { + const 
modelFormatResults = results.filter(r => r.model === modelName && r.format === fr.format) + const correctCount = modelFormatResults.filter(r => r.correct).length + const totalCount = modelFormatResults.length + const accuracy = correctCount / totalCount + + return { + format: fr.format, + accuracy, + correctCount, + totalCount, + } + }).sort((a, b) => b.accuracy - a.accuracy) + + lines.push('| Format | Accuracy | Correct/Total |', '|--------|----------|---------------|') + + for (const result of modelResults) { + lines.push(`| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.correctCount}/${result.totalCount} |`) + } + + lines.push('') + } + + // Methodology + lines.push( + '', + '#### Methodology', + '', + '- **Semantic validation**: LLM-as-judge validates responses semantically (not exact string matching).', + '- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding.', + '- **Question types**: Field retrieval, aggregation, and filtering tasks.', + '- **Real data**: Faker.js-generated datasets + GitHub repositories.', + '', + '
', + '', + ) + + return lines.join('\n') +} + +/** + * Calculate token counts for all format+dataset combinations + */ +export function calculateTokenCounts( + formatters: Record string>, +): Record { + const tokenCounts: Record = {} + + for (const [formatName, formatter] of Object.entries(formatters)) { + for (const dataset of datasets) { + const formatted = formatter(dataset.data) + const key = `${formatName}-${dataset.name}` + tokenCounts[key] = encode(formatted).length + } + } + + return tokenCounts +} + +/** + * Save results to disk + */ +export async function saveResults( + results: EvaluationResult[], + formatResults: FormatResult[], + questions: Question[], + tokenCounts: Record, +): Promise { + const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy') + await fsp.mkdir(resultsDir, { recursive: true }) + + // Save raw results + await fsp.writeFile( + path.join(resultsDir, 'raw-results.json'), + JSON.stringify(results, undefined, 2), + ) + + // Save summary + await fsp.writeFile( + path.join(resultsDir, 'summary.json'), + JSON.stringify({ + formatResults, + questions: questions.length, + models: Object.keys(models), + datasets: datasets.map(d => ({ name: d.name, description: d.description })), + tokenCounts, + timestamp: new Date().toISOString(), + }, undefined, 2), + ) + + // Generate markdown report + const report = generateMarkdownReport(formatResults, results, questions, tokenCounts) + await fsp.writeFile( + path.join(resultsDir, 'report.md'), + report, + ) + + return resultsDir +} + +/** + * Generate visual bar chart for token counts + */ +function createTokenBar(tokens: number, maxTokens: number, width = 30): string { + const filled = Math.round((tokens / maxTokens) * width) + const empty = width - filled + return 'โ–ˆ'.repeat(filled) + 'โ–‘'.repeat(empty) +} diff --git a/benchmarks/src/types.ts b/benchmarks/src/types.ts new file mode 100644 index 0000000..bca48fa --- /dev/null +++ b/benchmarks/src/types.ts @@ -0,0 +1,35 @@ +export interface 
Dataset { + name: string + description: string + data: any +} + +export interface Question { + id: string + prompt: string + groundTruth: string + type: 'field-retrieval' | 'aggregation' | 'filtering' | 'comparison' + dataset: string +} + +export interface EvaluationResult { + questionId: string + format: string + model: string + expected: string + actual: string + correct: boolean + inputTokens: number + outputTokens: number + latencyMs: number +} + +export interface FormatResult { + format: string + accuracy: number + totalTokens: number + avgInputTokens: number + avgLatency: number + correctCount: number + totalCount: number +} diff --git a/docs/benchmarks.md b/docs/benchmarks.md deleted file mode 100644 index 146fbed..0000000 --- a/docs/benchmarks.md +++ /dev/null @@ -1,158 +0,0 @@ -| Example | JSON | TOON | Tokens Saved | Reduction | -| ------- | ---- | ---- | ------------ | --------- | -| ๐Ÿ‘ค Simple user object | 31 | 18 | 13 | **41.9%** | -| ๐Ÿท๏ธ User with tags | 48 | 28 | 20 | **41.7%** | -| ๐Ÿ“ฆ Small product catalog | 117 | 49 | 68 | **58.1%** | -| ๐Ÿ‘ฅ API response with users | 123 | 53 | 70 | **56.9%** | -| โš™๏ธ Nested configuration | 68 | 42 | 26 | **38.2%** | -| ๐Ÿ›’ E-commerce order | 163 | 94 | 69 | **42.3%** | -| ๐Ÿ“Š Analytics data | 209 | 94 | 115 | **55.0%** | -| ๐Ÿ“ˆ Large dataset (50 records) | 2159 | 762 | 1397 | **64.7%** | -| **Total** | **2918** | **1140** | **1778** | **60.9%** | - -
-View detailed results - -### ๐Ÿ“ฆ Small product catalog - -**Savings: 68 tokens (58.1% reduction)** - -**JSON** (117 tokens): - -```json -{ - "items": [ - { - "sku": "A1", - "name": "Widget", - "qty": 2, - "price": 9.99 - }, - { - "sku": "B2", - "name": "Gadget", - "qty": 1, - "price": 14.5 - }, - { - "sku": "C3", - "name": "Doohickey", - "qty": 5, - "price": 7.25 - } - ] -} -``` - -**TOON** (49 tokens): - -``` -items[3]{sku,name,qty,price}: - A1,Widget,2,9.99 - B2,Gadget,1,14.5 - C3,Doohickey,5,7.25 -``` - ---- - -### ๐Ÿ‘ฅ API response with users - -**Savings: 70 tokens (56.9% reduction)** - -**JSON** (123 tokens): - -```json -{ - "users": [ - { - "id": 1, - "name": "Alice", - "email": "alice@example.com", - "active": true - }, - { - "id": 2, - "name": "Bob", - "email": "bob@example.com", - "active": true - }, - { - "id": 3, - "name": "Charlie", - "email": "charlie@example.com", - "active": false - } - ], - "total": 3, - "page": 1 -} -``` - -**TOON** (53 tokens): - -``` -users[3]{id,name,email,active}: - 1,Alice,alice@example.com,true - 2,Bob,bob@example.com,true - 3,Charlie,charlie@example.com,false -total: 3 -page: 1 -``` - ---- - -### ๐Ÿ“Š Analytics data - -**Savings: 115 tokens (55.0% reduction)** - -**JSON** (209 tokens): - -```json -{ - "metrics": [ - { - "date": "2025-01-01", - "views": 1234, - "clicks": 89, - "conversions": 12 - }, - { - "date": "2025-01-02", - "views": 2345, - "clicks": 156, - "conversions": 23 - }, - { - "date": "2025-01-03", - "views": 1890, - "clicks": 123, - "conversions": 18 - }, - { - "date": "2025-01-04", - "views": 3456, - "clicks": 234, - "conversions": 34 - }, - { - "date": "2025-01-05", - "views": 2789, - "clicks": 178, - "conversions": 27 - } - ] -} -``` - -**TOON** (94 tokens): - -``` -metrics[5]{date,views,clicks,conversions}: - 2025-01-01,1234,89,12 - 2025-01-02,2345,156,23 - 2025-01-03,1890,123,18 - 2025-01-04,3456,234,34 - 2025-01-05,2789,178,27 -``` - -
diff --git a/package.json b/package.json index 61525de..8f13df7 100644 --- a/package.json +++ b/package.json @@ -26,7 +26,7 @@ "dist" ], "scripts": { - "automd": "tsx scripts/generate-bench.ts && automd", + "automd": "automd", "build": "tsdown", "lint": "eslint .", "lint:fix": "eslint . --fix", @@ -35,16 +35,16 @@ "release": "bumpp" }, "devDependencies": { - "@antfu/eslint-config": "^6.0.0", + "@antfu/eslint-config": "^6.1.0", "@types/node": "^24.9.1", "automd": "^0.4.2", "bumpp": "^10.3.1", "eslint": "^9.38.0", "gpt-tokenizer": "^3.2.0", - "tsdown": "^0.15.9", + "tsdown": "^0.15.10", "tsx": "^4.20.6", "typescript": "^5.9.3", - "vitest": "^3.2.4" + "vitest": "^4.0.3" }, "pnpm": { "onlyBuiltDependencies": [ diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 20244df..3894eb4 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -9,8 +9,8 @@ importers: .: devDependencies: '@antfu/eslint-config': - specifier: ^6.0.0 - version: 6.0.0(@vue/compiler-sfc@3.5.22)(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@3.2.4(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1)) + specifier: ^6.1.0 + version: 6.1.0(@vue/compiler-sfc@3.5.22)(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1)) '@types/node': specifier: ^24.9.1 version: 24.9.1 @@ -27,8 +27,8 @@ importers: specifier: ^3.2.0 version: 3.2.0 tsdown: - specifier: ^0.15.9 - version: 0.15.9(typescript@5.9.3) + specifier: ^0.15.10 + version: 0.15.10(typescript@5.9.3) tsx: specifier: ^4.20.6 version: 4.20.6 @@ -36,17 +36,93 @@ importers: specifier: ^5.9.3 version: 5.9.3 vitest: - specifier: ^3.2.4 - version: 3.2.4(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1) + specifier: ^4.0.3 + version: 4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1) + + benchmarks: + devDependencies: + '@ai-sdk/anthropic': + specifier: ^2.0.37 + version: 2.0.37(zod@4.1.12) + 
'@ai-sdk/google': + specifier: ^2.0.23 + version: 2.0.23(zod@4.1.12) + '@ai-sdk/openai': + specifier: ^2.0.53 + version: 2.0.53(zod@4.1.12) + '@ai-sdk/provider': + specifier: ^2.0.0 + version: 2.0.0 + '@antfu/eslint-config': + specifier: ^6.1.0 + version: 6.1.0(@vue/compiler-sfc@3.5.22)(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1)) + '@faker-js/faker': + specifier: ^10.1.0 + version: 10.1.0 + ai: + specifier: ^5.0.80 + version: 5.0.80(zod@4.1.12) + consola: + specifier: ^3.4.2 + version: 3.4.2 + csv-stringify: + specifier: ^6.6.0 + version: 6.6.0 + gpt-tokenizer: + specifier: ^3.2.0 + version: 3.2.0 + ofetch: + specifier: ^1.4.1 + version: 1.4.1 + p-map: + specifier: ^7.0.3 + version: 7.0.3 + yaml: + specifier: ^2.8.1 + version: 2.8.1 packages: - '@antfu/eslint-config@6.0.0': - resolution: {integrity: sha512-M2RM+x+hpxpASEZzQh4d5uaUEHn8sYNVlTB+CySpLkDs2rr3QFvRR7KqNdnox/OIPc6YWMsIEnM/XUbQP52nTA==} + '@ai-sdk/anthropic@2.0.37': + resolution: {integrity: sha512-r2e9BWoobisH9B5b7x3yYG/k9WlsZqa4D94o7gkwktReqrjjv83zNMop4KmlJsh/zBhbsaP8S8SUfiwK+ESxgg==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.25.76 || ^4.1.8 + + '@ai-sdk/gateway@2.0.1': + resolution: {integrity: sha512-vPVIbnP35ZnayS937XLo85vynR85fpBQWHCdUweq7apzqFOTU2YkUd4V3msebEHbQ2Zro60ZShDDy9SMiyWTqA==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.25.76 || ^4.1.8 + + '@ai-sdk/google@2.0.23': + resolution: {integrity: sha512-VbCnKR+6aWUVLkAiSW5gUEtST7KueEmlt+d6qwDikxlLnFG9pzy59je8MiDVeM5G2tuSXbvZQF78PGIfXDBmow==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.25.76 || ^4.1.8 + + '@ai-sdk/openai@2.0.53': + resolution: {integrity: sha512-GIkR3+Fyif516ftXv+YPSPstnAHhcZxNoR2s8uSHhQ1yBT7I7aQYTVwpjAuYoT3GR+TeP50q7onj2/nDRbT2FQ==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.25.76 || ^4.1.8 + + '@ai-sdk/provider-utils@3.0.12': + resolution: {integrity: 
sha512-ZtbdvYxdMoria+2SlNarEk6Hlgyf+zzcznlD55EAl+7VZvJaSg2sqPvwArY7L6TfDEDJsnCq0fdhBSkYo0Xqdg==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.25.76 || ^4.1.8 + + '@ai-sdk/provider@2.0.0': + resolution: {integrity: sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA==} + engines: {node: '>=18'} + + '@antfu/eslint-config@6.1.0': + resolution: {integrity: sha512-m/L9TGvtG3r4tkfq5BY6THz7pk0g6yuJwwA0SkLEDHJJpt0upuABhs8v3SU8yaPtCGUxq8k2QTLMZ3WPg4vSdw==} hasBin: true peerDependencies: '@eslint-react/eslint-plugin': ^2.0.1 - '@next/eslint-plugin-next': ^15.4.0-canary.115 + '@next/eslint-plugin-next': '>=15.0.0' '@prettier/plugin-xml': ^3.4.1 '@unocss/eslint-plugin': '>=0.50.0' astro-eslint-parser: ^1.0.2 @@ -99,20 +175,20 @@ packages: '@antfu/install-pkg@1.1.0': resolution: {integrity: sha512-MGQsmw10ZyI+EJo45CdSER4zEb+p31LpDAFp2Z3gkSd1yqVZGi0Ebx++YTEMonJy4oChEMLsxZ64j8FH6sSqtQ==} - '@babel/generator@7.28.3': - resolution: {integrity: sha512-3lSpxGgvnmZznmBkCRnVREPUFJv2wrv9iAoFDvADJc0ypmdOxdUtcLeBgBJ6zE0PMeTKnxeQzyk0xTBq4Ep7zw==} + '@babel/generator@7.28.5': + resolution: {integrity: sha512-3EwLFhZ38J4VyIP6WNtt2kUdW9dokXA9Cr4IVIFHuCpZ3H8/YFOl5JjZHisrn1fATPBmKKqXzDFvh9fUwHz6CQ==} engines: {node: '>=6.9.0'} '@babel/helper-string-parser@7.27.1': resolution: {integrity: sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA==} engines: {node: '>=6.9.0'} - '@babel/helper-validator-identifier@7.27.1': - resolution: {integrity: sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow==} + '@babel/helper-validator-identifier@7.28.5': + resolution: {integrity: sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==} engines: {node: '>=6.9.0'} - '@babel/parser@7.28.4': - resolution: {integrity: sha512-yZbBqeM6TkpP9du/I2pUZnJsRMGGvOuIrhjzC1AwHwW+6he4mni6Bp/m8ijn0iOuZuPI2BfkCoSRunpyjnrQKg==} + '@babel/parser@7.28.5': + 
resolution: {integrity: sha512-KKBU1VGYR7ORr3At5HAtUQ+TV3SzRCXmA/8OdDZiLDBIZxVyzXuztPjfLd3BV1PRAQGCMWWSHYhL0F8d5uHBDQ==} engines: {node: '>=6.0.0'} hasBin: true @@ -120,8 +196,8 @@ packages: resolution: {integrity: sha512-Q/N6JNWvIvPnLDvjlE1OUBLPQHH6l3CltCEsHIujp45zQUSSh8K+gHnaEX45yAT1nyngnINhvWtzN+Nb9D8RAQ==} engines: {node: '>=6.9.0'} - '@babel/types@7.28.4': - resolution: {integrity: sha512-bkFqkLhh3pMBUQQkpVgWDWq/lqzc2678eUyDlTBhRqhCHFguYYGM0Efga7tYk4TogG/3x0EEl66/OQ+WGbWB/Q==} + '@babel/types@7.28.5': + resolution: {integrity: sha512-qQ5m48eI/MFLQ5PxQj4PFaprjyCTLI37ElWMmNs0K8Lk3dVeOdNpB3ks8jc7yM5CDmVC73eMVk/trk3fgmrUpA==} engines: {node: '>=6.9.0'} '@clack/core@0.5.0': @@ -143,10 +219,14 @@ packages: resolution: {integrity: sha512-YAdE/IJSpwbOTiaURNCKECdAwqrJuFiZhylmesBcIRawtYKnBR2wxPhoIewMg+Yu+QuYvHfJNReWpoxGBKOChA==} engines: {node: '>=18'} - '@es-joy/jsdoccomment@0.58.0': - resolution: {integrity: sha512-smMc5pDht/UVsCD3hhw/a/e/p8m0RdRYiluXToVfd+d4yaQQh7nn9bACjkk6nXJvat7EWPAxuFkMEFfrxeGa3Q==} + '@es-joy/jsdoccomment@0.76.0': + resolution: {integrity: sha512-g+RihtzFgGTx2WYCuTHbdOXJeAlGnROws0TeALx9ow/ZmOROOZkVg5wp/B44n0WJgI4SQFP1eWM2iRPlU2Y14w==} engines: {node: '>=20.11.0'} + '@es-joy/resolve.exports@1.0.0': + resolution: {integrity: sha512-bbrmzsAZ9GA/3oBS6r8PWMtZarEhKHr413hak8ArwMEZ5DtaLErnkcyEWUsXy7urBcmVu/TpDzHPDVM5uIbx9A==} + engines: {node: '>=10'} + '@esbuild/aix-ppc64@0.25.11': resolution: {integrity: sha512-Xt1dOL13m8u0WE8iplx9Ibbm+hFAO0GsU2P34UNoDGvZYkY8ifSiy6Zuc1lYxfG7svWE2fzqCUmFp5HCn51gJg==} engines: {node: '>=18'} @@ -368,6 +448,10 @@ packages: resolution: {integrity: sha512-sB5uyeq+dwCWyPi31B2gQlVlo+j5brPlWx4yZBrEaRo/nhdDE8Xke1gsGgtiBdaBTxuTkceLVuVt/pclrasb0A==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} + '@faker-js/faker@10.1.0': + resolution: {integrity: sha512-C3mrr3b5dRVlKPJdfrAXS8+dq+rq8Qm5SNRazca0JKgw1HQERFmrVb0towvMmw5uu8hHKNiQasMaR/tydf3Zsg==} + engines: {node: ^20.19.0 || ^22.13.0 || ^23.5.0 || >=24.0.0, npm: '>=10'} + 
'@humanfs/core@0.19.1': resolution: {integrity: sha512-5DyQ4+1JEUzejeK1JGICcideyfUbGixgS9jNgex5nqkW+cY7WZhxBigmieN5Qnw9ZosSNVC9KQKyb+GUaGyKUA==} engines: {node: '>=18.18.0'} @@ -412,6 +496,14 @@ packages: resolution: {integrity: sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==} engines: {node: '>= 8'} + '@opentelemetry/api@1.9.0': + resolution: {integrity: sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==} + engines: {node: '>=8.0.0'} + + '@oxc-project/runtime@0.95.0': + resolution: {integrity: sha512-qJS5pNepwMGnafO9ayKGz7rfPQgUBuunHpnP1//9Qa0zK3oT3t1EhT+I+pV9MUA+ZKez//OFqxCxf1vijCKb2Q==} + engines: {node: ^20.19.0 || >=22.12.0} + '@oxc-project/types@0.95.0': resolution: {integrity: sha512-vACy7vhpMPhjEJhULNxrdR0D943TkA/MigMpJCHmBHvMXxRStRi/dPtTlfQ3uDwWSzRpT8z+7ImjZVf8JWBocQ==} @@ -700,6 +792,13 @@ packages: cpu: [x64] os: [win32] + '@sindresorhus/base62@1.0.0': + resolution: {integrity: sha512-TeheYy0ILzBEI/CO55CP6zJCSdSWeRtGnHy8U8dWSUH4I68iqTsy7HkMktR4xakThc9jotkPQUXT4ITdbV7cHA==} + engines: {node: '>=18'} + + '@standard-schema/spec@1.0.0': + resolution: {integrity: sha512-m2bOd0f2RT9k8QJx1JN85cZYyH1RqFBdlwtkSlf4tBDYLCiiZnv1fIIwacK6cqwXavOydf0NPToMQgpKq+dVlA==} + '@stylistic/eslint-plugin@5.5.0': resolution: {integrity: sha512-IeZF+8H0ns6prg4VrkhgL+yrvDXWDH2cKchrbh80ejG9dQgZWp10epHMbgRuQvgchLII/lfh6Xn3lu6+6L86Hw==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} @@ -795,12 +894,16 @@ packages: resolution: {integrity: sha512-tUFMXI4gxzzMXt4xpGJEsBsTox0XbNQ1y94EwlD/CuZwFcQP79xfQqMhau9HsRc/J0cAPA/HZt1dZPtGn9V/7w==} engines: {node: ^18.18.0 || ^20.9.0 || >=21.1.0} - '@vitest/eslint-plugin@1.3.23': - resolution: {integrity: sha512-kp1vjoJTdVf8jWdzr/JpHIPfh3HMR6JBr2p7XuH4YNx0UXmV4XWdgzvCpAmH8yb39Gry31LULiuBcuhyc/OqkQ==} + '@vercel/oidc@3.0.3': + resolution: {integrity: sha512-yNEQvPcVrK9sIe637+I0jD6leluPxzwJKx/Haw6F4H77CdDsszUn5V3o96LPziXkSNE2B83+Z3mjqGKBK/R6Gg==} + engines: 
{node: '>= 20'} + + '@vitest/eslint-plugin@1.3.25': + resolution: {integrity: sha512-7qM/FrA2VyUmrorP0TQ/Oqhn6wsAcktg6euBn0XmpgF0yT2mDxjziu2QLy86i2mOJ41Wtt55z6aUWo+bfmyAeg==} engines: {node: '>=18'} peerDependencies: - eslint: '>= 8.57.0' - typescript: '>= 5.0.0' + eslint: '>=8.57.0' + typescript: '>=5.0.0' vitest: '*' peerDependenciesMeta: typescript: @@ -808,34 +911,34 @@ packages: vitest: optional: true - '@vitest/expect@3.2.4': - resolution: {integrity: sha512-Io0yyORnB6sikFlt8QW5K7slY4OjqNX9jmJQ02QDda8lyM6B5oNgVWoSoKPac8/kgnCUzuHQKrSLtu/uOqqrig==} + '@vitest/expect@4.0.3': + resolution: {integrity: sha512-v3eSDx/bF25pzar6aEJrrdTXJduEBU3uSGXHslIdGIpJVP8tQQHV6x1ZfzbFQ/bLIomLSbR/2ZCfnaEGkWkiVQ==} - '@vitest/mocker@3.2.4': - resolution: {integrity: sha512-46ryTE9RZO/rfDd7pEqFl7etuyzekzEhUbTW3BvmeO/BcCMEgq59BKhek3dXDWgAj4oMK6OZi+vRr1wPW6qjEQ==} + '@vitest/mocker@4.0.3': + resolution: {integrity: sha512-evZcRspIPbbiJEe748zI2BRu94ThCBE+RkjCpVF8yoVYuTV7hMe+4wLF/7K86r8GwJHSmAPnPbZhpXWWrg1qbA==} peerDependencies: msw: ^2.4.9 - vite: ^5.0.0 || ^6.0.0 || ^7.0.0-0 + vite: ^6.0.0 || ^7.0.0-0 peerDependenciesMeta: msw: optional: true vite: optional: true - '@vitest/pretty-format@3.2.4': - resolution: {integrity: sha512-IVNZik8IVRJRTr9fxlitMKeJeXFFFN0JaB9PHPGQ8NKQbGpfjlTx9zO4RefN8gp7eqjNy8nyK3NZmBzOPeIxtA==} + '@vitest/pretty-format@4.0.3': + resolution: {integrity: sha512-N7gly/DRXzxa9w9sbDXwD9QNFYP2hw90LLLGDobPNwiWgyW95GMxsCt29/COIKKh3P7XJICR38PSDePenMBtsw==} - '@vitest/runner@3.2.4': - resolution: {integrity: sha512-oukfKT9Mk41LreEW09vt45f8wx7DordoWUZMYdY/cyAk7w5TWkTRCNZYF7sX7n2wB7jyGAl74OxgwhPgKaqDMQ==} + '@vitest/runner@4.0.3': + resolution: {integrity: sha512-1/aK6fPM0lYXWyGKwop2Gbvz1plyTps/HDbIIJXYtJtspHjpXIeB3If07eWpVH4HW7Rmd3Rl+IS/+zEAXrRtXA==} - '@vitest/snapshot@3.2.4': - resolution: {integrity: sha512-dEYtS7qQP2CjU27QBC5oUOxLE/v5eLkGqPE0ZKEIDGMs4vKWe7IjgLOeauHsR0D5YuuycGRO5oSRXnwnmA78fQ==} + '@vitest/snapshot@4.0.3': + resolution: {integrity: 
sha512-amnYmvZ5MTjNCP1HZmdeczAPLRD6iOm9+2nMRUGxbe/6sQ0Ymur0NnR9LIrWS8JA3wKE71X25D6ya/3LN9YytA==} - '@vitest/spy@3.2.4': - resolution: {integrity: sha512-vAfasCOe6AIK70iP5UD11Ac4siNUNJ9i/9PZ3NKx07sG6sUxeag1LWdNrMWeKKYBLlzuK+Gn65Yd5nyL6ds+nw==} + '@vitest/spy@4.0.3': + resolution: {integrity: sha512-82vVL8Cqz7rbXaNUl35V2G7xeNMAjBdNOVaHbrzznT9BmiCiPOzhf0FhU3eP41nP1bLDm/5wWKZqkG4nyU95DQ==} - '@vitest/utils@3.2.4': - resolution: {integrity: sha512-fB2V0JFrQSMsCo9HiSq3Ezpdv4iYaXRG1Sx8edX3MwxfyNn83mKiGzOcH+Fkxt4MHxr3y42fQi1oeAInqgX2QA==} + '@vitest/utils@4.0.3': + resolution: {integrity: sha512-qV6KJkq8W3piW6MDIbGOmn1xhvcW4DuA07alqaQ+vdx7YA49J85pnwnxigZVQFQw3tWnQNRKWwhz5wbP6iv/GQ==} '@vue/compiler-core@3.5.22': resolution: {integrity: sha512-jQ0pFPmZwTEiRNSb+i9Ow/I/cHv2tXYqsnHKKyCQ08irI2kdF5qmYedmF8si8mA7zepUFmJ2hqzS8CQmNOWOkQ==} @@ -862,6 +965,12 @@ packages: engines: {node: '>=0.4.0'} hasBin: true + ai@5.0.80: + resolution: {integrity: sha512-g1o6pjxm1eTtyh295dRhsg0gvZaHFlSo2oruWrK2rIR7KafWEhNB2A2/aJ9hyPT9AMI8JnQJyto1Tl9DMqwc9w==} + engines: {node: '>=18'} + peerDependencies: + zod: ^3.25.76 || ^4.1.8 + ajv@6.12.6: resolution: {integrity: sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==} @@ -898,8 +1007,8 @@ packages: balanced-match@1.0.2: resolution: {integrity: sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==} - baseline-browser-mapping@2.8.19: - resolution: {integrity: sha512-zoKGUdu6vb2jd3YOq0nnhEDQVbPcHhco3UImJrv5dSkvxTc2pl2WjOPsjZXDwPDSl5eghIMuY3R6J9NDKF3KcQ==} + baseline-browser-mapping@2.8.20: + resolution: {integrity: sha512-JMWsdF+O8Orq3EMukbUN1QfbLK9mX2CkUmQBcW2T0s8OmdAUL5LLM/6wFwSrqXzlXB13yhyK9gTKS1rIizOduQ==} hasBin: true birpc@2.6.1: @@ -954,8 +1063,8 @@ packages: ccount@2.0.1: resolution: {integrity: sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg==} - chai@5.3.3: - resolution: {integrity: 
sha512-4zNhdJD/iOjSH0A05ea+Ke6MU5mmpQcbQsSOkgdaUMJ9zTlDTD/GYlwohmIE2u0gaxHYiVHEn1Fw9mZ/ktJWgw==} + chai@6.2.0: + resolution: {integrity: sha512-aUTnJc/JipRzJrNADXVvpVqi6CO0dn3nx4EVPxijri+fj3LUUDyZQOgVeW54Ob3Y1Xh9Iz8f+CgaCl8v0mn9bA==} engines: {node: '>=18'} chalk@4.1.2: @@ -968,10 +1077,6 @@ packages: character-entities@2.0.2: resolution: {integrity: sha512-shx7oQ0Awen/BRIdkjkvz54PnEEI/EjwXDSIZp86/KKdbafHh1Df/RYGBhn4hbe2+uKC9FnT5UCEdyPz3ai9hQ==} - check-error@2.1.1: - resolution: {integrity: sha512-OAlb+T7V4Op9OwdkjmguYRqncdlx5JiofwOAUkmTF+jNdHwzTaTs4sRAGpzLF3oOz5xAyDGrPgeIDFQmDOTiJw==} - engines: {node: '>= 16'} - chokidar@4.0.3: resolution: {integrity: sha512-Qgzu8kfBvo+cA4962jnP1KkS6Dop5NS6g7R5LFYJr4b8Ub94PPQXUksCw9PvXoeXPRRddRNC5C1JQUR2SMGtnA==} engines: {node: '>= 14.16.0'} @@ -1023,6 +1128,9 @@ packages: engines: {node: '>=4'} hasBin: true + csv-stringify@6.6.0: + resolution: {integrity: sha512-YW32lKOmIBgbxtu3g5SaiqWNwa/9ISQt2EcgOq0+RAIFufFp9is6tqNnKahqE5kuKvrnYAzs28r+s6pXJR8Vcw==} + debug@4.4.3: resolution: {integrity: sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==} engines: {node: '>=6.0'} @@ -1035,10 +1143,6 @@ packages: decode-named-character-reference@1.2.0: resolution: {integrity: sha512-c6fcElNV6ShtZXmsgNgFFV5tVX2PaV4g+MOAkb8eXHvn6sryJBrZa9r0zV6+dtTyoCKxtDy5tyQ5ZwQuidtd+Q==} - deep-eql@5.0.2: - resolution: {integrity: sha512-h5k/5U50IJJFpzfL6nO9jaaumfjO/f2NjK/oYB2Djzm4p9L+3T9qWpZqZ2hAbLPuuYq9wrU08WQyBTL5GbPk5Q==} - engines: {node: '>=6'} - deep-is@0.1.4: resolution: {integrity: sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==} @@ -1085,8 +1189,8 @@ packages: oxc-resolver: optional: true - electron-to-chromium@1.5.238: - resolution: {integrity: sha512-khBdc+w/Gv+cS8e/Pbnaw/FXcBUeKrRVik9IxfXtgREOWyJhR4tj43n3amkVogJ/yeQUqzkrZcFhtIxIdqmmcQ==} + electron-to-chromium@1.5.240: + resolution: {integrity: 
sha512-OBwbZjWgrCOH+g6uJsA2/7Twpas2OlepS9uvByJjR2datRDuKGYeD+nP8lBBks2qnB7bGJNHDUx7c/YLaT3QMQ==} empathic@2.0.0: resolution: {integrity: sha512-i6UzDscO/XfAcNYD75CfICkmfLedpyPDdozrLMmQc5ORaQcdMoc21OnlEylMIqI7U8eniKrPMxxtj8k0vhmJhA==} @@ -1186,8 +1290,8 @@ packages: typescript: optional: true - eslint-plugin-jsdoc@59.1.0: - resolution: {integrity: sha512-sg9mzjjzfnMynyY4W8FDiQv3i8eFcKVEHDt4Xh7MLskP3QkMt2z6p7FuzSw7jJSKFues6RaK2GWvmkB1FLPxXg==} + eslint-plugin-jsdoc@61.1.9: + resolution: {integrity: sha512-X2AzSGbq1CzBRgKcVAu2qzOV9ogqygkUDk5AX6eNK5G+kY3I5Op5E5b99fE+FN0/bGnk2KGcsMIG6ZLF+di69A==} engines: {node: '>=20.11.0'} peerDependencies: eslint: ^7.0.0 || ^8.0.0 || ^9.0.0 @@ -1324,6 +1428,10 @@ packages: resolution: {integrity: sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==} engines: {node: '>=0.10.0'} + eventsource-parser@3.0.6: + resolution: {integrity: sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==} + engines: {node: '>=18.0.0'} + expect-type@1.2.2: resolution: {integrity: sha512-JhFGDVJ7tmDJItKhYgJCGLOWjuK9vPxiXoUFLwLDc99NlmklilbiQJwoctZtt13+xMw91MCk/REan6MWHqDjyA==} engines: {node: '>=12.0.0'} @@ -1444,6 +1552,9 @@ packages: hookable@5.5.3: resolution: {integrity: sha512-Yc+BQe8SvoXH1643Qez1zqLRmbA5rCL+sSmk6TVos0LWVfNIB7PGncdlId77WzLGSIB5KaWgTaNTs2lNVEI6VQ==} + html-entities@2.6.0: + resolution: {integrity: sha512-kig+rMn/QOVRvr7c86gQ8lWXq+Hkv6CbAH1hLu+RG338StTpE8Z0b44SDVaqVu7HGKf27frdmUYEs9hTUX/cLQ==} + ignore@5.3.2: resolution: {integrity: sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==} engines: {node: '>= 4'} @@ -1487,9 +1598,6 @@ packages: resolution: {integrity: sha512-ekilCSN1jwRvIbgeg/57YFh8qQDNbwDb9xT/qu2DAHbFFZUicIl4ygVaAvzveMhMVr3LnpSKTNnwt8PoOfmKhQ==} hasBin: true - js-tokens@9.0.1: - resolution: {integrity: sha512-mxa9E9ITFOt0ban3j6L5MpjwegGz6lBQmM1IJkWeBZGcMxto50+eWdjC/52xDbS2vy0k7vIMK0Fe2wfL9OQSpQ==} - 
js-yaml@4.1.0: resolution: {integrity: sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==} hasBin: true @@ -1502,9 +1610,9 @@ packages: resolution: {integrity: sha512-iZ8Bdb84lWRuGHamRXFyML07r21pcwBrLkHEuHgEY5UbCouBwv7ECknDRKzsQIXMiqpPymqtIf8TC/shYKB5rw==} engines: {node: '>=12.0.0'} - jsdoc-type-pratt-parser@5.4.0: - resolution: {integrity: sha512-F9GQ+F1ZU6qvSrZV8fNFpjDNf614YzR2eF6S0+XbDjAcUI28FSoXnYZFjQmb1kFx3rrJb5PnxUH3/Yti6fcM+g==} - engines: {node: '>=12.0.0'} + jsdoc-type-pratt-parser@6.10.0: + resolution: {integrity: sha512-+LexoTRyYui5iOhJGn13N9ZazL23nAHGkXsa1p/C8yeq79WRfLBag6ZZ0FQG2aRoc9yfo59JT9EYCQonOkHKkQ==} + engines: {node: '>=20.0.0'} jsesc@3.0.2: resolution: {integrity: sha512-xKqzzWXDttJuOcawBt4KnKHHIf5oQ/Cxax+0PWFG+DFDgHNAdi+TXECADI+RYiFUMmx8792xsMbbgXj4CwnP4g==} @@ -1522,6 +1630,9 @@ packages: json-schema-traverse@0.4.1: resolution: {integrity: sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==} + json-schema@0.4.0: + resolution: {integrity: sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==} + json-stable-stringify-without-jsonify@1.0.1: resolution: {integrity: sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==} @@ -1562,11 +1673,8 @@ packages: longest-streak@3.1.0: resolution: {integrity: sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g==} - loupe@3.2.1: - resolution: {integrity: sha512-CdzqowRJCeLU72bHvWqwRBBlLcMEtIvGrlvef74kMnV2AolS9Y8xUv1I0U/MNAWMhBlKIoyuEgoJ0t/bbwHbLQ==} - - magic-string@0.30.19: - resolution: {integrity: sha512-2N21sPY9Ws53PZvsEpVtNuSW+ScYbQdp4b9qUaL+9QkHUrGFKo56Lg9Emg5s9V/qrtNBmiR01sYhUOwu3H+VOw==} + magic-string@0.30.21: + resolution: {integrity: sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ==} markdown-table@3.0.4: resolution: {integrity: 
sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw==} @@ -1750,8 +1858,8 @@ packages: engines: {node: ^14.16.0 || >=16.10.0} hasBin: true - object-deep-merge@1.0.5: - resolution: {integrity: sha512-3DioFgOzetbxbeUq8pB2NunXo8V0n4EvqsWM/cJoI6IA9zghd7cl/2pBOuWRf4dlvA+fcg5ugFMZaN2/RuoaGg==} + object-deep-merge@2.0.0: + resolution: {integrity: sha512-3DC3UMpeffLTHiuXSy/UG4NOIYTLlY9u3V82+djSCLYClWobZiS4ivYzpIUWrRY/nfsJ8cWsKyG3QfyLePmhvg==} ofetch@1.4.1: resolution: {integrity: sha512-QZj2DfGplQAr2oj9KzceK9Hwz6Whxazmn85yYeVuS3u9XTMOGMRx0kO95MQ+vLsj/S/NwBDMMLU5hpxvI6Tklw==} @@ -1771,6 +1879,10 @@ packages: resolution: {integrity: sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw==} engines: {node: '>=10'} + p-map@7.0.3: + resolution: {integrity: sha512-VkndIv2fIB99swvQoA65bm+fsmt6UNdGeIB0oxBs+WhAhdh08QA04JXpI7rbB9r08/nkbysKoya9rtDERYOYMA==} + engines: {node: '>=18'} + package-manager-detector@1.5.0: resolution: {integrity: sha512-uBj69dVlYe/+wxj8JOpr97XfsxH/eumMt6HqjNTmJDf/6NO9s+0uxeOneIz3AsPt2m6y9PqzDzd3ATcU17MNfw==} @@ -1799,10 +1911,6 @@ packages: pathe@2.0.3: resolution: {integrity: sha512-WUjGcAqP1gQacoQe+OBJsFA7Ld4DyXuUIjZ5cc75cLHvJ7dtNsTugphxIADwspS+AraAUePCKrSVtPLFj/F88w==} - pathval@2.0.1: - resolution: {integrity: sha512-//nshmD55c46FuFw26xV/xFAaB5HF9Xdap7HJBBnrKdAd6/GxDBaNA1870O79+9ueg61cZLSVc+OaFlfmObYVQ==} - engines: {node: '>= 14.16'} - perfect-debounce@2.0.0: resolution: {integrity: sha512-fkEH/OBiKrqqI/yIgjR92lMfs2K8105zt/VT6+7eTjNwisrsh47CeIED9z58zI7DfKdH3uHAn25ziRZn3kgAow==} @@ -1875,6 +1983,10 @@ packages: resolution: {integrity: sha512-cnE+y8bz4NhMjISKbgeVJtqNbtf5QpjZP+Bslo+UqkIt9QPnX9q095eiRRASJG1/tz6dlNr6Z5NsBiWYokp6EQ==} hasBin: true + reserved-identifiers@1.2.0: + resolution: {integrity: sha512-yE7KUfFvaBFzGPs5H3Ops1RevfUEsDc5Iz65rOwWg4lE8HJSYtle77uul3+573457oHvBKuHYDl/xqUkKpEEdw==} + engines: {node: '>=18'} + resolve-from@4.0.0: resolution: {integrity: 
sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==} engines: {node: '>=4'} @@ -1886,13 +1998,13 @@ packages: resolution: {integrity: sha512-g6QUff04oZpHs0eG5p83rFLhHeV00ug/Yf9nZM6fLeUrPguBTkTQOdpAWWspMh55TZfVQDPaN3NQJfbVRAxdIw==} engines: {iojs: '>=1.0.0', node: '>=0.10.0'} - rolldown-plugin-dts@0.16.12: - resolution: {integrity: sha512-9dGjm5oqtKcbZNhpzyBgb8KrYiU616A7IqcFWG7Msp1RKAXQ/hapjivRg+g5IYWSiFhnk3OKYV5T4Ft1t8Cczg==} + rolldown-plugin-dts@0.17.1: + resolution: {integrity: sha512-dQfoYD9kwSau7UQPg0UubprCDcwWeEKYd9SU9O2MpOdKy3VHy3/DaDF+x6w9+KE/w6J8qxkHVjwG1K2QmmQAFA==} engines: {node: '>=20.18.0'} peerDependencies: '@ts-macro/tsc': ^0.3.6 '@typescript/native-preview': '>=7.0.0-dev.20250601.1' - rolldown: ^1.0.0-beta.9 + rolldown: ^1.0.0-beta.44 typescript: ^5.0.0 vue-tsc: ~3.1.0 peerDependenciesMeta: @@ -1971,9 +2083,6 @@ packages: resolution: {integrity: sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==} engines: {node: '>=8'} - strip-literal@3.1.0: - resolution: {integrity: sha512-8r3mkIM/2+PpjHoOtiAW8Rg3jJLHaV7xPwG+YRGrv6FP0wwk/toTpATxWYOW0BKdWwl82VT2tFYi5DlROa0Mxg==} - supports-color@7.2.0: resolution: {integrity: sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==} engines: {node: '>=8'} @@ -1999,22 +2108,18 @@ packages: resolution: {integrity: sha512-j2Zq4NyQYG5XMST4cbs02Ak8iJUdxRM0XI5QyxXuZOzKOINmWurp3smXu3y5wDcJrptwpSjgXHzIQxR0omXljQ==} engines: {node: '>=12.0.0'} - tinypool@1.1.1: - resolution: {integrity: sha512-Zba82s87IFq9A9XmjiX5uZA/ARWDrB03OHlq+Vw1fSdt0I+4/Kutwy8BP4Y/y/aORMo61FQ0vIb5j44vSo5Pkg==} - engines: {node: ^18.0.0 || >=20.0.0} - - tinyrainbow@2.0.0: - resolution: {integrity: sha512-op4nsTR47R6p0vMUUoYl/a+ljLFVtlfaXkLQmqfLR1qHma1h/ysYk4hEXZ880bf2CYgTskvTa/e196Vd5dDQXw==} - engines: {node: '>=14.0.0'} - - tinyspy@4.0.4: - resolution: {integrity: 
sha512-azl+t0z7pw/z958Gy9svOTuzqIk6xq+NSheJzn5MMWtWTFywIacg2wUlzKFGtt3cthx0r2SxMK0yzJOR0IES7Q==} + tinyrainbow@3.0.3: + resolution: {integrity: sha512-PSkbLUoxOFRzJYjjxHJt9xro7D+iilgMX/C9lawzVuYiIdcihh9DXmVibBe8lmcFrRi/VzlPjBxbN7rH24q8/Q==} engines: {node: '>=14.0.0'} to-regex-range@5.0.1: resolution: {integrity: sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==} engines: {node: '>=8.0'} + to-valid-identifier@1.0.0: + resolution: {integrity: sha512-41wJyvKep3yT2tyPqX/4blcfybknGB4D+oETKLs7Q76UiPqRpUJK3hr1nxelyYO0PHKVzJwlu0aCeEAsGI6rpw==} + engines: {node: '>=20'} + toml-eslint-parser@0.10.0: resolution: {integrity: sha512-khrZo4buq4qVmsGzS5yQjKe/WsFvV8fGfOjDQN0q4iy9FjRfPWRgTFrU8u1R2iu/SfWLhY9WnCi4Jhdrcbtg+g==} engines: {node: ^12.22.0 || ^14.17.0 || >=16.0.0} @@ -2034,8 +2139,8 @@ packages: peerDependencies: typescript: '>=4.0.0' - tsdown@0.15.9: - resolution: {integrity: sha512-C0EJYpXIYdlJokTumIL4lmv/wEiB20oa6iiYsXFE7Q0VKF3Ju6TQ7XAn4JQdm+2iQGEfl8cnEKcX5DB7iVR5Dw==} + tsdown@0.15.10: + resolution: {integrity: sha512-8zbSN4GW7ZzhjIYl/rWrruGzl1cJiDtAjb8l5XVF2cVme1+aDLVcExw+Ph4gNcfdGg6ZfYPh5kmcpIfh5xHisw==} engines: {node: '>=20.19.0'} hasBin: true peerDependencies: @@ -2068,10 +2173,6 @@ packages: resolution: {integrity: sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==} engines: {node: '>= 0.8.0'} - type-fest@4.2.0: - resolution: {integrity: sha512-5zknd7Dss75pMSED270A1RQS3KloqRJA9XbXLe0eCxyw7xXFb3rd+9B0UQ/0E+LQT6lnrLviEolYORlRWamn4w==} - engines: {node: '>=16'} - typescript@5.9.3: resolution: {integrity: sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==} engines: {node: '>=14.17'} @@ -2098,6 +2199,11 @@ packages: unist-util-visit@5.0.0: resolution: {integrity: sha512-MR04uvD+07cwl/yhVuVWAtw+3GOR/knlL55Nd/wAdblk27GCVt3lqpTivy/tkJcZoNPzTwS1Y+KMojlLDhoTzg==} + unrun@0.2.0: + resolution: {integrity: 
sha512-iaCxWG/6kmjP3wUTBheowjFm6LuI8fd/A3Uz7DbMoz8HvQsJThh7tWZKWJfVltOSK3LuIJFzepr7g6fbuhUasw==} + engines: {node: '>=20.19.0'} + hasBin: true + untyped@2.0.0: resolution: {integrity: sha512-nwNCjxJTjNuLCgFr42fEak5OcLuB3ecca+9ksPFNvtfYSLpjf+iJqSIaSnIile6ZPbKYxI5k2AfXqeopGudK/g==} hasBin: true @@ -2114,13 +2220,8 @@ packages: util-deprecate@1.0.2: resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==} - vite-node@3.2.4: - resolution: {integrity: sha512-EbKSKh+bh1E1IFxeO0pg1n4dvoOTt0UDiXMd/qn++r98+jPO1xtJilvXldeuQ8giIB5IkpjCgMleHMNEsGH6pg==} - engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0} - hasBin: true - - vite@7.1.11: - resolution: {integrity: sha512-uzcxnSDVjAopEUjljkWh8EIrg6tlzrjFUfMcR1EVsRDGwf/ccef0qQPRyOrROwhrTDaApueq+ja+KLPlzR/zdg==} + vite@7.1.12: + resolution: {integrity: sha512-ZWyE8YXEXqJrrSLvYgrRP7p62OziLW7xI5HYGWFzOvupfAlrLvURSzv/FyGyy0eidogEM3ujU+kUG1zuHgb6Ug==} engines: {node: ^20.19.0 || >=22.12.0} hasBin: true peerDependencies: @@ -2159,16 +2260,18 @@ packages: yaml: optional: true - vitest@3.2.4: - resolution: {integrity: sha512-LUCP5ev3GURDysTWiP47wRRUpLKMOfPh+yKTx3kVIEiu5KOMeqzpnYNsKyOoVrULivR8tLcks4+lga33Whn90A==} - engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0} + vitest@4.0.3: + resolution: {integrity: sha512-IUSop8jgaT7w0g1yOM/35qVtKjr/8Va4PrjzH1OUb0YH4c3OXB2lCZDkMAB6glA8T5w8S164oJGsbcmAecr4sA==} + engines: {node: ^20.0.0 || ^22.0.0 || >=24.0.0} hasBin: true peerDependencies: '@edge-runtime/vm': '*' '@types/debug': ^4.1.12 - '@types/node': ^18.0.0 || ^20.0.0 || >=22.0.0 - '@vitest/browser': 3.2.4 - '@vitest/ui': 3.2.4 + '@types/node': ^20.0.0 || ^22.0.0 || >=24.0.0 + '@vitest/browser-playwright': 4.0.3 + '@vitest/browser-preview': 4.0.3 + '@vitest/browser-webdriverio': 4.0.3 + '@vitest/ui': 4.0.3 happy-dom: '*' jsdom: '*' peerDependenciesMeta: @@ -2178,7 +2281,11 @@ packages: optional: true '@types/node': optional: true - '@vitest/browser': + '@vitest/browser-playwright': + 
optional: true + '@vitest/browser-preview': + optional: true + '@vitest/browser-webdriverio': optional: true '@vitest/ui': optional: true @@ -2224,12 +2331,51 @@ packages: resolution: {integrity: sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==} engines: {node: '>=10'} + zod@4.1.12: + resolution: {integrity: sha512-JInaHOamG8pt5+Ey8kGmdcAcg3OL9reK8ltczgHTAwNhMys/6ThXHityHxVV2p3fkw/c+MAvBHFVYHFZDmjMCQ==} + zwitch@2.0.4: resolution: {integrity: sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A==} snapshots: - '@antfu/eslint-config@6.0.0(@vue/compiler-sfc@3.5.22)(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@3.2.4(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))': + '@ai-sdk/anthropic@2.0.37(zod@4.1.12)': + dependencies: + '@ai-sdk/provider': 2.0.0 + '@ai-sdk/provider-utils': 3.0.12(zod@4.1.12) + zod: 4.1.12 + + '@ai-sdk/gateway@2.0.1(zod@4.1.12)': + dependencies: + '@ai-sdk/provider': 2.0.0 + '@ai-sdk/provider-utils': 3.0.12(zod@4.1.12) + '@vercel/oidc': 3.0.3 + zod: 4.1.12 + + '@ai-sdk/google@2.0.23(zod@4.1.12)': + dependencies: + '@ai-sdk/provider': 2.0.0 + '@ai-sdk/provider-utils': 3.0.12(zod@4.1.12) + zod: 4.1.12 + + '@ai-sdk/openai@2.0.53(zod@4.1.12)': + dependencies: + '@ai-sdk/provider': 2.0.0 + '@ai-sdk/provider-utils': 3.0.12(zod@4.1.12) + zod: 4.1.12 + + '@ai-sdk/provider-utils@3.0.12(zod@4.1.12)': + dependencies: + '@ai-sdk/provider': 2.0.0 + '@standard-schema/spec': 1.0.0 + eventsource-parser: 3.0.6 + zod: 4.1.12 + + '@ai-sdk/provider@2.0.0': + dependencies: + json-schema: 0.4.0 + + '@antfu/eslint-config@6.1.0(@vue/compiler-sfc@3.5.22)(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))': dependencies: '@antfu/install-pkg': 1.1.0 '@clack/prompts': 0.11.0 @@ -2238,7 +2384,7 @@ snapshots: '@stylistic/eslint-plugin': 5.5.0(eslint@9.38.0(jiti@2.6.1)) 
'@typescript-eslint/eslint-plugin': 8.46.2(@typescript-eslint/parser@8.46.2(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3))(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3) '@typescript-eslint/parser': 8.46.2(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3) - '@vitest/eslint-plugin': 1.3.23(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@3.2.4(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1)) + '@vitest/eslint-plugin': 1.3.25(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1)) ansis: 4.2.0 cac: 6.7.14 eslint: 9.38.0(jiti@2.6.1) @@ -2248,7 +2394,7 @@ snapshots: eslint-plugin-antfu: 3.1.1(eslint@9.38.0(jiti@2.6.1)) eslint-plugin-command: 3.3.1(eslint@9.38.0(jiti@2.6.1)) eslint-plugin-import-lite: 0.3.0(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3) - eslint-plugin-jsdoc: 59.1.0(eslint@9.38.0(jiti@2.6.1)) + eslint-plugin-jsdoc: 61.1.9(eslint@9.38.0(jiti@2.6.1)) eslint-plugin-jsonc: 2.21.0(eslint@9.38.0(jiti@2.6.1)) eslint-plugin-n: 17.23.1(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3) eslint-plugin-no-only-tests: 3.3.0 @@ -2280,28 +2426,28 @@ snapshots: package-manager-detector: 1.5.0 tinyexec: 1.0.1 - '@babel/generator@7.28.3': + '@babel/generator@7.28.5': dependencies: - '@babel/parser': 7.28.4 - '@babel/types': 7.28.4 + '@babel/parser': 7.28.5 + '@babel/types': 7.28.5 '@jridgewell/gen-mapping': 0.3.13 '@jridgewell/trace-mapping': 0.3.31 jsesc: 3.1.0 '@babel/helper-string-parser@7.27.1': {} - '@babel/helper-validator-identifier@7.27.1': {} + '@babel/helper-validator-identifier@7.28.5': {} - '@babel/parser@7.28.4': + '@babel/parser@7.28.5': dependencies: - '@babel/types': 7.28.4 + '@babel/types': 7.28.5 '@babel/runtime@7.28.4': {} - '@babel/types@7.28.4': + '@babel/types@7.28.5': dependencies: '@babel/helper-string-parser': 7.27.1 - '@babel/helper-validator-identifier': 7.27.1 + '@babel/helper-validator-identifier': 7.28.5 '@clack/core@0.5.0': dependencies: @@ 
-2338,13 +2484,15 @@ snapshots: esquery: 1.6.0 jsdoc-type-pratt-parser: 4.1.0 - '@es-joy/jsdoccomment@0.58.0': + '@es-joy/jsdoccomment@0.76.0': dependencies: '@types/estree': 1.0.8 '@typescript-eslint/types': 8.46.2 comment-parser: 1.4.1 esquery: 1.6.0 - jsdoc-type-pratt-parser: 5.4.0 + jsdoc-type-pratt-parser: 6.10.0 + + '@es-joy/resolve.exports@1.0.0': {} '@esbuild/aix-ppc64@0.25.11': optional: true @@ -2505,6 +2653,8 @@ snapshots: '@eslint/core': 0.16.0 levn: 0.4.1 + '@faker-js/faker@10.1.0': {} + '@humanfs/core@0.19.1': {} '@humanfs/node@0.16.7': @@ -2549,6 +2699,10 @@ snapshots: '@nodelib/fs.scandir': 2.1.5 fastq: 1.19.1 + '@opentelemetry/api@1.9.0': {} + + '@oxc-project/runtime@0.95.0': {} + '@oxc-project/types@0.95.0': {} '@parcel/watcher-android-arm64@2.5.1': @@ -2729,6 +2883,10 @@ snapshots: '@rollup/rollup-win32-x64-msvc@4.52.5': optional: true + '@sindresorhus/base62@1.0.0': {} + + '@standard-schema/spec@1.0.0': {} + '@stylistic/eslint-plugin@5.5.0(eslint@9.38.0(jiti@2.6.1))': dependencies: '@eslint-community/eslint-utils': 4.9.0(eslint@9.38.0(jiti@2.6.1)) @@ -2864,62 +3022,61 @@ snapshots: '@typescript-eslint/types': 8.46.2 eslint-visitor-keys: 4.2.1 - '@vitest/eslint-plugin@1.3.23(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@3.2.4(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))': + '@vercel/oidc@3.0.3': {} + + '@vitest/eslint-plugin@1.3.25(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))': dependencies: '@typescript-eslint/scope-manager': 8.46.2 '@typescript-eslint/utils': 8.46.2(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3) eslint: 9.38.0(jiti@2.6.1) optionalDependencies: typescript: 5.9.3 - vitest: 3.2.4(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1) + vitest: 4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1) transitivePeerDependencies: - supports-color - 
'@vitest/expect@3.2.4': + '@vitest/expect@4.0.3': dependencies: + '@standard-schema/spec': 1.0.0 '@types/chai': 5.2.3 - '@vitest/spy': 3.2.4 - '@vitest/utils': 3.2.4 - chai: 5.3.3 - tinyrainbow: 2.0.0 + '@vitest/spy': 4.0.3 + '@vitest/utils': 4.0.3 + chai: 6.2.0 + tinyrainbow: 3.0.3 - '@vitest/mocker@3.2.4(vite@7.1.11(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))': + '@vitest/mocker@4.0.3(vite@7.1.12(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))': dependencies: - '@vitest/spy': 3.2.4 + '@vitest/spy': 4.0.3 estree-walker: 3.0.3 - magic-string: 0.30.19 + magic-string: 0.30.21 optionalDependencies: - vite: 7.1.11(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1) + vite: 7.1.12(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1) - '@vitest/pretty-format@3.2.4': + '@vitest/pretty-format@4.0.3': dependencies: - tinyrainbow: 2.0.0 + tinyrainbow: 3.0.3 - '@vitest/runner@3.2.4': + '@vitest/runner@4.0.3': dependencies: - '@vitest/utils': 3.2.4 - pathe: 2.0.3 - strip-literal: 3.1.0 - - '@vitest/snapshot@3.2.4': - dependencies: - '@vitest/pretty-format': 3.2.4 - magic-string: 0.30.19 + '@vitest/utils': 4.0.3 pathe: 2.0.3 - '@vitest/spy@3.2.4': + '@vitest/snapshot@4.0.3': dependencies: - tinyspy: 4.0.4 + '@vitest/pretty-format': 4.0.3 + magic-string: 0.30.21 + pathe: 2.0.3 - '@vitest/utils@3.2.4': + '@vitest/spy@4.0.3': {} + + '@vitest/utils@4.0.3': dependencies: - '@vitest/pretty-format': 3.2.4 - loupe: 3.2.1 - tinyrainbow: 2.0.0 + '@vitest/pretty-format': 4.0.3 + tinyrainbow: 3.0.3 '@vue/compiler-core@3.5.22': dependencies: - '@babel/parser': 7.28.4 + '@babel/parser': 7.28.5 '@vue/shared': 3.5.22 entities: 4.5.0 estree-walker: 2.0.2 @@ -2932,13 +3089,13 @@ snapshots: '@vue/compiler-sfc@3.5.22': dependencies: - '@babel/parser': 7.28.4 + '@babel/parser': 7.28.5 '@vue/compiler-core': 3.5.22 '@vue/compiler-dom': 3.5.22 '@vue/compiler-ssr': 3.5.22 '@vue/shared': 3.5.22 estree-walker: 2.0.2 - magic-string: 0.30.19 + magic-string: 0.30.21 postcss: 
8.5.6 source-map-js: 1.2.1 @@ -2955,6 +3112,14 @@ snapshots: acorn@8.15.0: {} + ai@5.0.80(zod@4.1.12): + dependencies: + '@ai-sdk/gateway': 2.0.1(zod@4.1.12) + '@ai-sdk/provider': 2.0.0 + '@ai-sdk/provider-utils': 3.0.12(zod@4.1.12) + '@opentelemetry/api': 1.9.0 + zod: 4.1.12 + ajv@6.12.6: dependencies: fast-deep-equal: 3.1.3 @@ -2978,7 +3143,7 @@ snapshots: ast-kit@2.1.3: dependencies: - '@babel/parser': 7.28.4 + '@babel/parser': 7.28.5 pathe: 2.0.3 automd@0.4.2: @@ -2990,7 +3155,7 @@ snapshots: defu: 6.1.4 destr: 2.0.5 didyoumean2: 7.0.4 - magic-string: 0.30.19 + magic-string: 0.30.21 mdbox: 0.1.1 mlly: 1.8.0 ofetch: 1.4.1 @@ -3005,7 +3170,7 @@ snapshots: balanced-match@1.0.2: {} - baseline-browser-mapping@2.8.19: {} + baseline-browser-mapping@2.8.20: {} birpc@2.6.1: {} @@ -3026,9 +3191,9 @@ snapshots: browserslist@4.27.0: dependencies: - baseline-browser-mapping: 2.8.19 + baseline-browser-mapping: 2.8.20 caniuse-lite: 1.0.30001751 - electron-to-chromium: 1.5.238 + electron-to-chromium: 1.5.240 node-releases: 2.0.26 update-browserslist-db: 1.1.4(browserslist@4.27.0) @@ -3073,13 +3238,7 @@ snapshots: ccount@2.0.1: {} - chai@5.3.3: - dependencies: - assertion-error: 2.0.1 - check-error: 2.1.1 - deep-eql: 5.0.2 - loupe: 3.2.1 - pathval: 2.0.1 + chai@6.2.0: {} chalk@4.1.2: dependencies: @@ -3090,8 +3249,6 @@ snapshots: character-entities@2.0.2: {} - check-error@2.1.1: {} - chokidar@4.0.3: dependencies: readdirp: 4.1.2 @@ -3134,6 +3291,8 @@ snapshots: cssesc@3.0.0: {} + csv-stringify@6.6.0: {} + debug@4.4.3: dependencies: ms: 2.1.3 @@ -3142,8 +3301,6 @@ snapshots: dependencies: character-entities: 2.0.2 - deep-eql@5.0.2: {} - deep-is@0.1.4: {} defu@6.1.4: {} @@ -3172,7 +3329,7 @@ snapshots: dts-resolver@2.1.2: {} - electron-to-chromium@1.5.238: {} + electron-to-chromium@1.5.240: {} empathic@2.0.0: {} @@ -3275,9 +3432,10 @@ snapshots: optionalDependencies: typescript: 5.9.3 - eslint-plugin-jsdoc@59.1.0(eslint@9.38.0(jiti@2.6.1)): + 
eslint-plugin-jsdoc@61.1.9(eslint@9.38.0(jiti@2.6.1)): dependencies: - '@es-joy/jsdoccomment': 0.58.0 + '@es-joy/jsdoccomment': 0.76.0 + '@es-joy/resolve.exports': 1.0.0 are-docs-informative: 0.0.2 comment-parser: 1.4.1 debug: 4.4.3 @@ -3285,10 +3443,12 @@ snapshots: eslint: 9.38.0(jiti@2.6.1) espree: 10.4.0 esquery: 1.6.0 - object-deep-merge: 1.0.5 + html-entities: 2.6.0 + object-deep-merge: 2.0.0 parse-imports-exports: 0.2.4 semver: 7.7.3 spdx-expression-parse: 4.0.0 + to-valid-identifier: 1.0.0 transitivePeerDependencies: - supports-color @@ -3367,7 +3527,7 @@ snapshots: eslint-plugin-unicorn@61.0.2(eslint@9.38.0(jiti@2.6.1)): dependencies: - '@babel/helper-validator-identifier': 7.27.1 + '@babel/helper-validator-identifier': 7.28.5 '@eslint-community/eslint-utils': 4.9.0(eslint@9.38.0(jiti@2.6.1)) '@eslint/plugin-kit': 0.3.5 change-case: 5.4.4 @@ -3504,6 +3664,8 @@ snapshots: esutils@2.0.3: {} + eventsource-parser@3.0.6: {} + expect-type@1.2.2: {} exsolve@1.0.7: {} @@ -3604,6 +3766,8 @@ snapshots: hookable@5.5.3: {} + html-entities@2.6.0: {} + ignore@5.3.2: {} ignore@7.0.5: {} @@ -3633,8 +3797,6 @@ snapshots: jiti@2.6.1: {} - js-tokens@9.0.1: {} - js-yaml@4.1.0: dependencies: argparse: 2.0.1 @@ -3643,7 +3805,7 @@ snapshots: jsdoc-type-pratt-parser@4.8.0: {} - jsdoc-type-pratt-parser@5.4.0: {} + jsdoc-type-pratt-parser@6.10.0: {} jsesc@3.0.2: {} @@ -3653,6 +3815,8 @@ snapshots: json-schema-traverse@0.4.1: {} + json-schema@0.4.0: {} + json-stable-stringify-without-jsonify@1.0.1: {} jsonc-eslint-parser@2.4.1: @@ -3693,9 +3857,7 @@ snapshots: longest-streak@3.1.0: {} - loupe@3.2.1: {} - - magic-string@0.30.19: + magic-string@0.30.21: dependencies: '@jridgewell/sourcemap-codec': 1.5.5 @@ -4066,9 +4228,7 @@ snapshots: pkg-types: 2.3.0 tinyexec: 1.0.1 - object-deep-merge@1.0.5: - dependencies: - type-fest: 4.2.0 + object-deep-merge@2.0.0: {} ofetch@1.4.1: dependencies: @@ -4095,6 +4255,8 @@ snapshots: dependencies: p-limit: 3.1.0 + p-map@7.0.3: {} + 
package-manager-detector@1.5.0: {} parent-module@1.0.1: @@ -4115,8 +4277,6 @@ snapshots: pathe@2.0.3: {} - pathval@2.0.1: {} - perfect-debounce@2.0.0: {} picocolors@1.1.1: {} @@ -4184,23 +4344,25 @@ snapshots: dependencies: jsesc: 3.0.2 + reserved-identifiers@1.2.0: {} + resolve-from@4.0.0: {} resolve-pkg-maps@1.0.0: {} reusify@1.1.0: {} - rolldown-plugin-dts@0.16.12(rolldown@1.0.0-beta.44)(typescript@5.9.3): + rolldown-plugin-dts@0.17.1(rolldown@1.0.0-beta.44)(typescript@5.9.3): dependencies: - '@babel/generator': 7.28.3 - '@babel/parser': 7.28.4 - '@babel/types': 7.28.4 + '@babel/generator': 7.28.5 + '@babel/parser': 7.28.5 + '@babel/types': 7.28.5 ast-kit: 2.1.3 birpc: 2.6.1 debug: 4.4.3 dts-resolver: 2.1.2 get-tsconfig: 4.13.0 - magic-string: 0.30.19 + magic-string: 0.30.21 rolldown: 1.0.0-beta.44 optionalDependencies: typescript: 5.9.3 @@ -4299,10 +4461,6 @@ snapshots: strip-json-comments@3.1.1: {} - strip-literal@3.1.0: - dependencies: - js-tokens: 9.0.1 - supports-color@7.2.0: dependencies: has-flag: 4.0.0 @@ -4324,16 +4482,17 @@ snapshots: fdir: 6.5.0(picomatch@4.0.3) picomatch: 4.0.3 - tinypool@1.1.1: {} - - tinyrainbow@2.0.0: {} - - tinyspy@4.0.4: {} + tinyrainbow@3.0.3: {} to-regex-range@5.0.1: dependencies: is-number: 7.0.0 + to-valid-identifier@1.0.0: + dependencies: + '@sindresorhus/base62': 1.0.0 + reserved-identifiers: 1.2.0 + toml-eslint-parser@0.10.0: dependencies: eslint-visitor-keys: 3.4.3 @@ -4349,7 +4508,7 @@ snapshots: picomatch: 4.0.3 typescript: 5.9.3 - tsdown@0.15.9(typescript@5.9.3): + tsdown@0.15.10(typescript@5.9.3): dependencies: ansis: 4.2.0 cac: 6.7.14 @@ -4359,12 +4518,13 @@ snapshots: empathic: 2.0.0 hookable: 5.5.3 rolldown: 1.0.0-beta.44 - rolldown-plugin-dts: 0.16.12(rolldown@1.0.0-beta.44)(typescript@5.9.3) + rolldown-plugin-dts: 0.17.1(rolldown@1.0.0-beta.44)(typescript@5.9.3) semver: 7.7.3 tinyexec: 1.0.1 tinyglobby: 0.2.15 tree-kill: 1.2.2 unconfig: 7.3.3 + unrun: 0.2.0 optionalDependencies: typescript: 5.9.3 
transitivePeerDependencies: @@ -4388,8 +4548,6 @@ snapshots: dependencies: prelude-ls: 1.2.1 - type-fest@4.2.0: {} - typescript@5.9.3: {} ufo@1.6.1: {} @@ -4422,6 +4580,12 @@ snapshots: unist-util-is: 6.0.1 unist-util-visit-parents: 6.0.2 + unrun@0.2.0: + dependencies: + '@oxc-project/runtime': 0.95.0 + rolldown: 1.0.0-beta.44 + synckit: 0.11.11 + untyped@2.0.0: dependencies: citty: 0.1.6 @@ -4442,28 +4606,7 @@ snapshots: util-deprecate@1.0.2: {} - vite-node@3.2.4(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1): - dependencies: - cac: 6.7.14 - debug: 4.4.3 - es-module-lexer: 1.7.0 - pathe: 2.0.3 - vite: 7.1.11(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1) - transitivePeerDependencies: - - '@types/node' - - jiti - - less - - lightningcss - - sass - - sass-embedded - - stylus - - sugarss - - supports-color - - terser - - tsx - - yaml - - vite@7.1.11(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1): + vite@7.1.12(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1): dependencies: esbuild: 0.25.11 fdir: 6.5.0(picomatch@4.0.3) @@ -4478,30 +4621,27 @@ snapshots: tsx: 4.20.6 yaml: 2.8.1 - vitest@3.2.4(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1): + vitest@4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1): dependencies: - '@types/chai': 5.2.3 - '@vitest/expect': 3.2.4 - '@vitest/mocker': 3.2.4(vite@7.1.11(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1)) - '@vitest/pretty-format': 3.2.4 - '@vitest/runner': 3.2.4 - '@vitest/snapshot': 3.2.4 - '@vitest/spy': 3.2.4 - '@vitest/utils': 3.2.4 - chai: 5.3.3 + '@vitest/expect': 4.0.3 + '@vitest/mocker': 4.0.3(vite@7.1.12(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1)) + '@vitest/pretty-format': 4.0.3 + '@vitest/runner': 4.0.3 + '@vitest/snapshot': 4.0.3 + '@vitest/spy': 4.0.3 + '@vitest/utils': 4.0.3 debug: 4.4.3 + es-module-lexer: 1.7.0 expect-type: 1.2.2 - magic-string: 0.30.19 + magic-string: 0.30.21 pathe: 2.0.3 
picomatch: 4.0.3 std-env: 3.10.0 tinybench: 2.9.0 tinyexec: 0.3.2 tinyglobby: 0.2.15 - tinypool: 1.1.1 - tinyrainbow: 2.0.0 - vite: 7.1.11(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1) - vite-node: 3.2.4(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1) + tinyrainbow: 3.0.3 + vite: 7.1.12(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1) why-is-node-running: 2.3.0 optionalDependencies: '@types/debug': 4.1.12 @@ -4554,4 +4694,6 @@ snapshots: yocto-queue@0.1.0: {} + zod@4.1.12: {} + zwitch@2.0.4: {} diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml new file mode 100644 index 0000000..76137e2 --- /dev/null +++ b/pnpm-workspace.yaml @@ -0,0 +1,2 @@ +packages: + - benchmarks diff --git a/scripts/generate-bench.ts b/scripts/generate-bench.ts deleted file mode 100644 index a4d8950..0000000 --- a/scripts/generate-bench.ts +++ /dev/null @@ -1,213 +0,0 @@ -import * as fsp from 'node:fs/promises' -import * as path from 'node:path' -import * as url from 'node:url' -import { encode } from 'gpt-tokenizer' // o200k_base encoding (default) -import { encode as encodeToon } from '../src/index' - -interface BenchmarkResult { - name: string - emoji: string - jsonTokens: number - toonTokens: number - savings: number - savingsPercent: string -} - -const rootDir = url.fileURLToPath(new URL('../', import.meta.url)) -const benchPath = path.join(rootDir, 'docs', 'benchmarks.md') - -const BENCHMARK_EXAMPLES = [ - { - name: 'Simple user object', - emoji: '๐Ÿ‘ค', - data: { - id: 123, - name: 'Alice', - email: 'alice@example.com', - active: true, - }, - }, - { - name: 'User with tags', - emoji: '๐Ÿท๏ธ', - data: { - user: { - id: 123, - name: 'Ada', - tags: ['reading', 'gaming', 'coding'], - active: true, - }, - }, - }, - { - name: 'Small product catalog', - emoji: '๐Ÿ“ฆ', - data: { - items: [ - { sku: 'A1', name: 'Widget', qty: 2, price: 9.99 }, - { sku: 'B2', name: 'Gadget', qty: 1, price: 14.5 }, - { sku: 'C3', name: 'Doohickey', qty: 5, price: 7.25 }, - ], - }, - 
}, - { - name: 'API response with users', - emoji: '๐Ÿ‘ฅ', - data: { - users: [ - { id: 1, name: 'Alice', email: 'alice@example.com', active: true }, - { id: 2, name: 'Bob', email: 'bob@example.com', active: true }, - { id: 3, name: 'Charlie', email: 'charlie@example.com', active: false }, - ], - total: 3, - page: 1, - }, - }, - { - name: 'Nested configuration', - emoji: 'โš™๏ธ', - data: { - database: { - host: 'localhost', - port: 5432, - credentials: { - username: 'dbuser', - password: 'secret123', - }, - }, - cache: { - enabled: true, - ttl: 3600, - }, - }, - }, - { - name: 'E-commerce order', - emoji: '๐Ÿ›’', - data: { - orderId: 'ORD-2025-001', - customer: { - id: 456, - name: 'Jane Doe', - email: 'jane@example.com', - }, - items: [ - { sku: 'PROD-A', name: 'Premium Widget', quantity: 2, price: 29.99 }, - { sku: 'PROD-B', name: 'Deluxe Gadget', quantity: 1, price: 49.99 }, - ], - subtotal: 109.97, - tax: 10.99, - total: 120.96, - status: 'pending', - }, - }, - { - name: 'Analytics data', - emoji: '๐Ÿ“Š', - data: { - metrics: [ - { date: '2025-01-01', views: 1234, clicks: 89, conversions: 12 }, - { date: '2025-01-02', views: 2345, clicks: 156, conversions: 23 }, - { date: '2025-01-03', views: 1890, clicks: 123, conversions: 18 }, - { date: '2025-01-04', views: 3456, clicks: 234, conversions: 34 }, - { date: '2025-01-05', views: 2789, clicks: 178, conversions: 27 }, - ], - }, - }, - { - name: 'Large dataset (50 records)', - emoji: '๐Ÿ“ˆ', - data: { - records: Array.from({ length: 50 }, (_, i) => ({ - id: i + 1, - name: `User ${i + 1}`, - email: `user${i + 1}@example.com`, - score: (i * 7) % 100, - active: i % 3 !== 0, - })), - }, - }, -] as const - -const DETAILED_EXAMPLE_INDICES = [2, 3, 6] // Small product catalog, API response, Analytics data - -// Calculate total savings -let totalJsonTokens = 0 -let totalToonTokens = 0 - -const results: BenchmarkResult[] = [] - -for (const example of BENCHMARK_EXAMPLES) { - const jsonString = JSON.stringify(example.data, 
null, 2) - const toonString = encodeToon(example.data) - - const jsonTokens = encode(jsonString).length - const toonTokens = encode(toonString).length - const savings = jsonTokens - toonTokens - const savingsPercent = ((savings / jsonTokens) * 100).toFixed(1) - - totalJsonTokens += jsonTokens - totalToonTokens += toonTokens - - results.push({ - name: example.name, - emoji: example.emoji, - jsonTokens, - toonTokens, - savings, - savingsPercent, - }) -} - -const totalSavings = totalJsonTokens - totalToonTokens -const totalSavingsPercent = ((totalSavings / totalJsonTokens) * 100).toFixed(1) - -// Generate markdown content matching README style -const summaryRows = results - .map(result => `| ${result.emoji} ${result.name} | ${result.jsonTokens} | ${result.toonTokens} | ${result.savings} | **${result.savingsPercent}%** |`) - .join('\n') - -const detailedExamples = DETAILED_EXAMPLE_INDICES - .map((exampleIndex, i) => { - const example = BENCHMARK_EXAMPLES[exampleIndex]! - const result = results[exampleIndex]! - const separator = i < DETAILED_EXAMPLE_INDICES.length - 1 ? '\n\n---' : '' - - return `### ${result.emoji} ${result.name} - -**Savings: ${result.savings} tokens (${result.savingsPercent}% reduction)** - -**JSON** (${result.jsonTokens} tokens): - -\`\`\`json -${JSON.stringify(example.data, null, 2)} -\`\`\` - -**TOON** (${result.toonTokens} tokens): - -\`\`\` -${encodeToon(example.data)} -\`\`\`${separator}` - }) - .join('\n\n') - -const markdown = ` -| Example | JSON | TOON | Tokens Saved | Reduction | -| ------- | ---- | ---- | ------------ | --------- | -${summaryRows} -| **Total** | **${totalJsonTokens}** | **${totalToonTokens}** | **${totalSavings}** | **${totalSavingsPercent}%** | - -
-View detailed results - -${detailedExamples} - -
-`.trimStart() - -console.log(markdown) - -await fsp.mkdir(path.join(rootDir, 'docs'), { recursive: true }) -await fsp.writeFile(benchPath, markdown, 'utf-8') - -console.log(`โœ… Benchmark written to ${benchPath}`)