diff --git a/README.md b/README.md
index a97c575..ff96795 100644
--- a/README.md
+++ b/README.md
@@ -87,11 +87,11 @@ Total ████████████░░░░░
"repo": "freeCodeCamp/freeCodeCamp",
"description": "freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,…",
"createdAt": "2014-12-24T17:49:19Z",
- "updatedAt": "2025-10-27T07:40:58Z",
- "pushedAt": "2025-10-26T11:31:08Z",
- "stars": 430828,
- "watchers": 8582,
- "forks": 42136,
+ "updatedAt": "2025-10-28T11:58:08Z",
+ "pushedAt": "2025-10-28T10:17:16Z",
+ "stars": 430886,
+ "watchers": 8583,
+ "forks": 42146,
"defaultBranch": "main"
},
{
@@ -100,11 +100,11 @@ Total ████████████░░░░░
"repo": "codecrafters-io/build-your-own-x",
"description": "Master programming by recreating your favorite technologies from scratch.",
"createdAt": "2018-05-09T12:03:18Z",
- "updatedAt": "2025-10-27T07:43:25Z",
+ "updatedAt": "2025-10-28T12:37:11Z",
"pushedAt": "2025-10-10T18:45:01Z",
- "stars": 430102,
- "watchers": 6322,
- "forks": 40388,
+ "stars": 430877,
+ "watchers": 6332,
+ "forks": 40453,
"defaultBranch": "master"
},
{
@@ -113,11 +113,11 @@ Total ████████████░░░░░
"repo": "sindresorhus/awesome",
"description": "😎 Awesome lists about all kinds of interesting topics",
"createdAt": "2014-07-11T13:42:37Z",
- "updatedAt": "2025-10-27T07:44:27Z",
- "pushedAt": "2025-10-23T17:26:53Z",
- "stars": 409760,
- "watchers": 8016,
- "forks": 32015,
+ "updatedAt": "2025-10-28T12:40:21Z",
+ "pushedAt": "2025-10-27T17:57:31Z",
+ "stars": 410052,
+ "watchers": 8017,
+ "forks": 32029,
"defaultBranch": "main"
}
]
@@ -128,9 +128,9 @@ Total ████████████░░░░░
```
repositories[3]{id,name,repo,description,createdAt,updatedAt,pushedAt,stars,watchers,forks,defaultBranch}:
- 28457823,freeCodeCamp,freeCodeCamp/freeCodeCamp,"freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,…","2014-12-24T17:49:19Z","2025-10-27T07:40:58Z","2025-10-26T11:31:08Z",430828,8582,42136,main
- 132750724,build-your-own-x,codecrafters-io/build-your-own-x,Master programming by recreating your favorite technologies from scratch.,"2018-05-09T12:03:18Z","2025-10-27T07:43:25Z","2025-10-10T18:45:01Z",430102,6322,40388,master
- 21737465,awesome,sindresorhus/awesome,😎 Awesome lists about all kinds of interesting topics,"2014-07-11T13:42:37Z","2025-10-27T07:44:27Z","2025-10-23T17:26:53Z",409760,8016,32015,main
+ 28457823,freeCodeCamp,freeCodeCamp/freeCodeCamp,"freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,…","2014-12-24T17:49:19Z","2025-10-28T11:58:08Z","2025-10-28T10:17:16Z",430886,8583,42146,main
+ 132750724,build-your-own-x,codecrafters-io/build-your-own-x,Master programming by recreating your favorite technologies from scratch.,"2018-05-09T12:03:18Z","2025-10-28T12:37:11Z","2025-10-10T18:45:01Z",430877,6332,40453,master
+ 21737465,awesome,sindresorhus/awesome,😎 Awesome lists about all kinds of interesting topics,"2014-07-11T13:42:37Z","2025-10-28T12:40:21Z","2025-10-27T17:57:31Z",410052,8017,32029,main
```
---
@@ -208,36 +208,36 @@ metrics[5]{date,views,clicks,conversions,revenue,bounceRate}:
> [!NOTE]
> Measured with [`gpt-tokenizer`](https://github.com/niieani/gpt-tokenizer) using `o200k_base` encoding (used by GPT-5 and other modern models). Savings will vary across models and tokenizers.
-
+
### Retrieval Accuracy
-Accuracy across **3 LLMs** on **159 data retrieval questions**:
+Accuracy across **3 LLMs** on **154 data retrieval questions**:
```
-gpt-5-nano
- toon ████████████████████ 99.4% (158/159)
- yaml ███████████████████░ 95.0% (151/159)
- csv ██████████████████░░ 92.5% (147/159)
- json ██████████████████░░ 92.5% (147/159)
- xml ██████████████████░░ 91.2% (145/159)
-
-claude-haiku-4-5
- toon ███████████████░░░░░ 75.5% (120/159)
- xml ███████████████░░░░░ 75.5% (120/159)
- csv ███████████████░░░░░ 75.5% (120/159)
- json ███████████████░░░░░ 75.5% (120/159)
- yaml ███████████████░░░░░ 74.2% (118/159)
-
gemini-2.5-flash
- xml ██████████████████░░ 91.8% (146/159)
- csv █████████████████░░░ 86.2% (137/159)
- toon █████████████████░░░ 84.9% (135/159)
- json ████████████████░░░░ 81.8% (130/159)
- yaml ████████████████░░░░ 78.6% (125/159)
+ xml ██████████████████░░ 90.3% (139/154)
+ csv ██████████████████░░ 89.0% (137/154)
+ toon █████████████████░░░ 87.0% (134/154)
+ json ████████████████░░░░ 79.2% (122/154)
+ yaml ███████████████░░░░░ 76.0% (117/154)
+
+gpt-5-nano
+ toon ███████████████████░ 96.1% (148/154)
+ csv ██████████████████░░ 90.3% (139/154)
+ yaml ██████████████████░░ 89.0% (137/154)
+ json ██████████████████░░ 87.7% (135/154)
+ xml █████████████████░░░ 83.8% (129/154)
+
+claude-haiku-4-5-20251001
+ json ██████████░░░░░░░░░░ 48.7% (75/154)
+ toon ██████████░░░░░░░░░░ 48.1% (74/154)
+ xml █████████░░░░░░░░░░░ 47.4% (73/154)
+ yaml █████████░░░░░░░░░░░ 47.4% (73/154)
+ csv █████████░░░░░░░░░░░ 45.5% (70/154)
```
-**Advantage:** TOON achieves **86.6% accuracy** (vs JSON's 83.2%) while using **46.3% fewer tokens**.
+**Advantage:** TOON achieves **77.1% accuracy** (vs JSON's 71.9%) while using **46.3% fewer tokens**.
Performance by dataset and model
@@ -248,73 +248,73 @@ gemini-2.5-flash
| Format | Accuracy | Tokens | Correct/Total |
| ------ | -------- | ------ | ------------- |
-| `toon` | 87.4% | 2.483 | 152/174 |
-| `csv` | 82.8% | 2.337 | 144/174 |
-| `yaml` | 83.9% | 4.969 | 146/174 |
-| `json` | 83.9% | 6.347 | 146/174 |
-| `xml` | 88.5% | 7.314 | 154/174 |
+| `csv` | 74.7% | 2,337 | 112/150 |
+| `toon` | 76.7% | 2,483 | 115/150 |
+| `yaml` | 70.7% | 4,969 | 106/150 |
+| `xml` | 77.3% | 7,314 | 116/150 |
+| `json` | 69.3% | 6,347 | 104/150 |
##### E-commerce orders with nested structures
| Format | Accuracy | Tokens | Correct/Total |
| ------ | -------- | ------ | ------------- |
-| `toon` | 90.9% | 5.967 | 120/132 |
-| `csv` | 93.9% | 6.735 | 124/132 |
-| `yaml` | 87.1% | 7.328 | 115/132 |
-| `json` | 87.9% | 9.694 | 116/132 |
-| `xml` | 93.2% | 10.992 | 123/132 |
+| `toon` | 80.0% | 5,967 | 96/120 |
+| `csv` | 75.8% | 6,735 | 91/120 |
+| `yaml` | 74.2% | 7,328 | 89/120 |
+| `json` | 79.2% | 9,694 | 95/120 |
+| `xml` | 78.3% | 10,992 | 94/120 |
##### Time-series analytics data
| Format | Accuracy | Tokens | Correct/Total |
| ------ | -------- | ------ | ------------- |
-| `csv` | 89.7% | 1.393 | 78/87 |
-| `toon` | 88.5% | 1.515 | 77/87 |
-| `yaml` | 83.9% | 2.938 | 73/87 |
-| `json` | 88.5% | 3.665 | 77/87 |
-| `xml` | 85.1% | 4.376 | 74/87 |
+| `csv` | 75.5% | 1,393 | 77/102 |
+| `toon` | 76.5% | 1,515 | 78/102 |
+| `yaml` | 74.5% | 2,938 | 76/102 |
+| `json` | 76.5% | 3,665 | 78/102 |
+| `xml` | 74.5% | 4,376 | 76/102 |
##### Top 100 GitHub repositories
| Format | Accuracy | Tokens | Correct/Total |
| ------ | -------- | ------ | ------------- |
-| `toon` | 76.2% | 8.745 | 64/84 |
-| `csv` | 69.0% | 8.513 | 58/84 |
-| `yaml` | 71.4% | 13.129 | 60/84 |
-| `json` | 69.0% | 15.145 | 58/84 |
-| `xml` | 71.4% | 17.095 | 60/84 |
+| `toon` | 74.4% | 8,745 | 67/90 |
+| `csv` | 73.3% | 8,513 | 66/90 |
+| `yaml` | 62.2% | 13,129 | 56/90 |
+| `json` | 61.1% | 15,145 | 55/90 |
+| `xml` | 61.1% | 17,095 | 55/90 |
#### Performance by Model
-##### gpt-5-nano
-
-| Format | Accuracy | Correct/Total |
-| ------ | -------- | ------------- |
-| `toon` | 99.4% | 158/159 |
-| `yaml` | 95.0% | 151/159 |
-| `csv` | 92.5% | 147/159 |
-| `json` | 92.5% | 147/159 |
-| `xml` | 91.2% | 145/159 |
-
-##### claude-haiku-4-5
-
-| Format | Accuracy | Correct/Total |
-| ------ | -------- | ------------- |
-| `toon` | 75.5% | 120/159 |
-| `xml` | 75.5% | 120/159 |
-| `csv` | 75.5% | 120/159 |
-| `json` | 75.5% | 120/159 |
-| `yaml` | 74.2% | 118/159 |
-
##### gemini-2.5-flash
| Format | Accuracy | Correct/Total |
| ------ | -------- | ------------- |
-| `xml` | 91.8% | 146/159 |
-| `csv` | 86.2% | 137/159 |
-| `toon` | 84.9% | 135/159 |
-| `json` | 81.8% | 130/159 |
-| `yaml` | 78.6% | 125/159 |
+| `xml` | 90.3% | 139/154 |
+| `csv` | 89.0% | 137/154 |
+| `toon` | 87.0% | 134/154 |
+| `json` | 79.2% | 122/154 |
+| `yaml` | 76.0% | 117/154 |
+
+##### gpt-5-nano
+
+| Format | Accuracy | Correct/Total |
+| ------ | -------- | ------------- |
+| `toon` | 96.1% | 148/154 |
+| `csv` | 90.3% | 139/154 |
+| `yaml` | 89.0% | 137/154 |
+| `json` | 87.7% | 135/154 |
+| `xml` | 83.8% | 129/154 |
+
+##### claude-haiku-4-5-20251001
+
+| Format | Accuracy | Correct/Total |
+| ------ | -------- | ------------- |
+| `json` | 48.7% | 75/154 |
+| `toon` | 48.1% | 74/154 |
+| `xml` | 47.4% | 73/154 |
+| `yaml` | 47.4% | 73/154 |
+| `csv` | 45.5% | 70/154 |
@@ -336,32 +336,34 @@ Four datasets designed to test different structural patterns:
#### Question Types
-159 questions are generated dynamically across three categories:
+154 questions are generated dynamically across three categories:
-- **Field retrieval (50%)**: Direct value lookups
+- **Field retrieval (40%)**: Direct lookups of values that can be read straight off a record (including booleans and simple counts such as array lengths)
- Example: "What is Alice's salary?" → `75000`
+ - Example: "How many items are in order ORD-0042?" → `3`
- Example: "What is the customer name for order ORD-0042?" → `John Doe`
-- **Aggregation (25%)**: Counting and summation tasks
+- **Aggregation (32%)**: Dataset-level totals and averages plus single-condition filters (counts, sums, min/max comparisons)
- Example: "How many employees work in Engineering?" → `17`
- Example: "What is the total revenue across all orders?" → `45123.50`
+ - Example: "How many employees have salary > 80000?" → `23`
-- **Filtering (25%)**: Conditional queries
+- **Filtering (28%)**: Multi-condition queries requiring compound logic (AND constraints across fields)
- Example: "How many employees in Sales have salary > 80000?" → `5`
- - Example: "How many orders have total > 400?" → `12`
+ - Example: "How many active employees have more than 10 years of experience?" → `8`
#### Evaluation Process
-1. **Format conversion:** Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML).
+1. **Format conversion**: Each dataset is converted to all 5 formats (TOON, CSV, XML, JSON, YAML).
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
-4. **Validate with LLM-as-judge**: `gpt-5-nano` validates if the answer is semantically correct (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`).
+3. **Validate with LLM-as-judge**: `gpt-5-nano` checks whether the answer is semantically correct (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`).
#### Models & Configuration
-- **Models tested**: `gpt-5-nano`, `claude-haiku-4-5`, `gemini-2.5-flash`
+- **Models tested**: `gemini-2.5-flash`, `gpt-5-nano`, `claude-haiku-4-5-20251001`
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
- **Temperature**: 0 (for non-reasoning models)
-- **Total evaluations**: 159 questions × 5 formats × 3 models = 2,385 LLM calls
+- **Total evaluations**: 154 questions × 5 formats × 3 models = 2,310 LLM calls
diff --git a/benchmarks/data/github-repos.json b/benchmarks/data/github-repos.json
index b7ed072..343e7c0 100644
--- a/benchmarks/data/github-repos.json
+++ b/benchmarks/data/github-repos.json
@@ -5,11 +5,11 @@
"repo": "freeCodeCamp/freeCodeCamp",
"description": "freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming, and computer science for free.",
"createdAt": "2014-12-24T17:49:19Z",
- "updatedAt": "2025-10-27T07:40:58Z",
- "pushedAt": "2025-10-26T11:31:08Z",
- "stars": 430828,
- "watchers": 8582,
- "forks": 42136,
+ "updatedAt": "2025-10-28T11:58:08Z",
+ "pushedAt": "2025-10-28T10:17:16Z",
+ "stars": 430886,
+ "watchers": 8583,
+ "forks": 42146,
"defaultBranch": "main"
},
{
@@ -18,11 +18,11 @@
"repo": "codecrafters-io/build-your-own-x",
"description": "Master programming by recreating your favorite technologies from scratch.",
"createdAt": "2018-05-09T12:03:18Z",
- "updatedAt": "2025-10-27T07:43:25Z",
+ "updatedAt": "2025-10-28T12:37:11Z",
"pushedAt": "2025-10-10T18:45:01Z",
- "stars": 430102,
- "watchers": 6322,
- "forks": 40388,
+ "stars": 430877,
+ "watchers": 6332,
+ "forks": 40453,
"defaultBranch": "master"
},
{
@@ -31,11 +31,11 @@
"repo": "sindresorhus/awesome",
"description": "😎 Awesome lists about all kinds of interesting topics",
"createdAt": "2014-07-11T13:42:37Z",
- "updatedAt": "2025-10-27T07:44:27Z",
- "pushedAt": "2025-10-23T17:26:53Z",
- "stars": 409760,
- "watchers": 8016,
- "forks": 32015,
+ "updatedAt": "2025-10-28T12:40:21Z",
+ "pushedAt": "2025-10-27T17:57:31Z",
+ "stars": 410052,
+ "watchers": 8017,
+ "forks": 32029,
"defaultBranch": "main"
},
{
@@ -44,11 +44,11 @@
"repo": "EbookFoundation/free-programming-books",
"description": ":books: Freely available programming books",
"createdAt": "2013-10-11T06:50:37Z",
- "updatedAt": "2025-10-27T07:36:14Z",
- "pushedAt": "2025-10-26T23:24:34Z",
- "stars": 375134,
- "watchers": 9788,
- "forks": 65149,
+ "updatedAt": "2025-10-28T12:16:59Z",
+ "pushedAt": "2025-10-28T01:52:13Z",
+ "stars": 375307,
+ "watchers": 9786,
+ "forks": 65199,
"defaultBranch": "main"
},
{
@@ -57,11 +57,11 @@
"repo": "public-apis/public-apis",
"description": "A collective list of free APIs",
"createdAt": "2016-03-20T23:49:42Z",
- "updatedAt": "2025-10-27T07:45:41Z",
+ "updatedAt": "2025-10-28T12:33:14Z",
"pushedAt": "2025-05-20T15:56:34Z",
- "stars": 373288,
- "watchers": 4392,
- "forks": 39386,
+ "stars": 374003,
+ "watchers": 4400,
+ "forks": 39473,
"defaultBranch": "master"
},
{
@@ -70,11 +70,11 @@
"repo": "kamranahmedse/developer-roadmap",
"description": "Interactive roadmaps, guides and other educational content to help developers grow in their careers.",
"createdAt": "2017-03-15T13:45:52Z",
- "updatedAt": "2025-10-27T07:26:36Z",
- "pushedAt": "2025-10-24T10:20:46Z",
- "stars": 342038,
- "watchers": 6887,
- "forks": 43222,
+ "updatedAt": "2025-10-28T12:31:02Z",
+ "pushedAt": "2025-10-28T11:09:58Z",
+ "stars": 342136,
+ "watchers": 6886,
+ "forks": 43234,
"defaultBranch": "master"
},
{
@@ -83,11 +83,11 @@
"repo": "jwasham/coding-interview-university",
"description": "A complete computer science study plan to become a software engineer.",
"createdAt": "2016-06-06T02:34:12Z",
- "updatedAt": "2025-10-27T07:46:31Z",
+ "updatedAt": "2025-10-28T12:21:02Z",
"pushedAt": "2025-08-28T14:42:47Z",
- "stars": 331885,
- "watchers": 8512,
- "forks": 81046,
+ "stars": 331947,
+ "watchers": 8511,
+ "forks": 81057,
"defaultBranch": "main"
},
{
@@ -96,11 +96,11 @@
"repo": "donnemartin/system-design-primer",
"description": "Learn how to design large-scale systems. Prep for the system design interview. Includes Anki flashcards.",
"createdAt": "2017-02-26T16:15:28Z",
- "updatedAt": "2025-10-27T07:38:55Z",
+ "updatedAt": "2025-10-28T12:32:56Z",
"pushedAt": "2025-05-21T11:13:33Z",
- "stars": 324162,
- "watchers": 6818,
- "forks": 52866,
+ "stars": 324409,
+ "watchers": 6819,
+ "forks": 52904,
"defaultBranch": "master"
},
{
@@ -109,11 +109,11 @@
"repo": "996icu/996.ICU",
"description": "Repo for counting stars and contributing. Press F to pay respect to glorious developers.",
"createdAt": "2019-03-26T07:31:14Z",
- "updatedAt": "2025-10-27T07:35:11Z",
+ "updatedAt": "2025-10-28T11:07:13Z",
"pushedAt": "2025-08-22T06:01:29Z",
- "stars": 274700,
- "watchers": 4217,
- "forks": 21033,
+ "stars": 274706,
+ "watchers": 4216,
+ "forks": 21029,
"defaultBranch": "master"
},
{
@@ -122,11 +122,11 @@
"repo": "vinta/awesome-python",
"description": "An opinionated list of awesome Python frameworks, libraries, software and resources.",
"createdAt": "2014-06-27T21:00:06Z",
- "updatedAt": "2025-10-27T07:40:04Z",
+ "updatedAt": "2025-10-28T12:28:13Z",
"pushedAt": "2025-10-16T13:40:58Z",
- "stars": 266460,
- "watchers": 6127,
- "forks": 26579,
+ "stars": 266661,
+ "watchers": 6128,
+ "forks": 26604,
"defaultBranch": "master"
},
{
@@ -135,11 +135,11 @@
"repo": "awesome-selfhosted/awesome-selfhosted",
"description": "A list of Free Software network services and web applications which can be hosted on your own servers",
"createdAt": "2015-06-01T02:33:17Z",
- "updatedAt": "2025-10-27T07:43:02Z",
- "pushedAt": "2025-10-23T10:47:33Z",
- "stars": 254916,
- "watchers": 2995,
- "forks": 11798,
+ "updatedAt": "2025-10-28T12:24:53Z",
+ "pushedAt": "2025-10-27T21:40:26Z",
+ "stars": 255143,
+ "watchers": 2990,
+ "forks": 11802,
"defaultBranch": "master"
},
{
@@ -148,11 +148,11 @@
"repo": "practical-tutorials/project-based-learning",
"description": "Curated list of project-based tutorials",
"createdAt": "2017-04-12T05:07:46Z",
- "updatedAt": "2025-10-27T07:45:41Z",
+ "updatedAt": "2025-10-28T12:22:51Z",
"pushedAt": "2024-08-15T05:33:54Z",
- "stars": 247930,
- "watchers": 3445,
- "forks": 32413,
+ "stars": 248050,
+ "watchers": 3446,
+ "forks": 32431,
"defaultBranch": "master"
},
{
@@ -161,11 +161,11 @@
"repo": "facebook/react",
"description": "The library for web and native user interfaces.",
"createdAt": "2013-05-24T16:15:54Z",
- "updatedAt": "2025-10-27T06:47:16Z",
- "pushedAt": "2025-10-24T22:08:43Z",
- "stars": 240059,
- "watchers": 6687,
- "forks": 49664,
+ "updatedAt": "2025-10-28T12:24:55Z",
+ "pushedAt": "2025-10-28T01:25:20Z",
+ "stars": 240100,
+ "watchers": 6686,
+ "forks": 49682,
"defaultBranch": "main"
},
{
@@ -174,11 +174,11 @@
"repo": "TheAlgorithms/Python",
"description": "All Algorithms implemented in Python",
"createdAt": "2016-07-16T09:44:01Z",
- "updatedAt": "2025-10-27T07:26:23Z",
+ "updatedAt": "2025-10-28T12:25:22Z",
"pushedAt": "2025-10-20T00:59:36Z",
- "stars": 212044,
+ "stars": 212119,
"watchers": 5975,
- "forks": 48986,
+ "forks": 49025,
"defaultBranch": "master"
},
{
@@ -187,11 +187,11 @@
"repo": "vuejs/vue",
"description": "This is the repo for Vue 2. For Vue 3, go to https://github.com/vuejs/core",
"createdAt": "2013-07-29T03:24:51Z",
- "updatedAt": "2025-10-27T05:37:40Z",
+ "updatedAt": "2025-10-28T10:39:45Z",
"pushedAt": "2024-10-10T07:24:15Z",
- "stars": 209624,
- "watchers": 5787,
- "forks": 33796,
+ "stars": 209636,
+ "watchers": 5786,
+ "forks": 33795,
"defaultBranch": "main"
},
{
@@ -200,11 +200,11 @@
"repo": "torvalds/linux",
"description": "Linux kernel source tree",
"createdAt": "2011-09-04T22:48:12Z",
- "updatedAt": "2025-10-27T07:25:34Z",
- "pushedAt": "2025-10-26T23:00:24Z",
- "stars": 205761,
- "watchers": 7739,
- "forks": 58023,
+ "updatedAt": "2025-10-28T12:39:23Z",
+ "pushedAt": "2025-10-27T18:11:32Z",
+ "stars": 205858,
+ "watchers": 7743,
+ "forks": 58047,
"defaultBranch": "master"
},
{
@@ -213,11 +213,11 @@
"repo": "ossu/computer-science",
"description": "🎓 Path to a free self-taught education in Computer Science!",
"createdAt": "2014-05-04T00:18:39Z",
- "updatedAt": "2025-10-27T07:25:53Z",
+ "updatedAt": "2025-10-28T12:41:20Z",
"pushedAt": "2025-08-23T18:48:52Z",
- "stars": 196024,
- "watchers": 5935,
- "forks": 24465,
+ "stars": 196086,
+ "watchers": 5936,
+ "forks": 24474,
"defaultBranch": "master"
},
{
@@ -226,11 +226,11 @@
"repo": "trekhleb/javascript-algorithms",
"description": "📝 Algorithms and data structures implemented in JavaScript with explanations and links to further readings",
"createdAt": "2018-03-24T07:47:04Z",
- "updatedAt": "2025-10-27T07:26:50Z",
+ "updatedAt": "2025-10-28T12:37:32Z",
"pushedAt": "2025-10-22T15:03:29Z",
- "stars": 193648,
- "watchers": 4267,
- "forks": 30919,
+ "stars": 193744,
+ "watchers": 4268,
+ "forks": 30929,
"defaultBranch": "master"
},
{
@@ -239,11 +239,11 @@
"repo": "tensorflow/tensorflow",
"description": "An Open Source Machine Learning Framework for Everyone",
"createdAt": "2015-11-07T01:19:20Z",
- "updatedAt": "2025-10-27T07:33:01Z",
- "pushedAt": "2025-10-27T06:15:29Z",
- "stars": 192220,
+ "updatedAt": "2025-10-28T11:56:54Z",
+ "pushedAt": "2025-10-28T12:37:04Z",
+ "stars": 192240,
"watchers": 7431,
- "forks": 74928,
+ "forks": 74932,
"defaultBranch": "master"
},
{
@@ -252,11 +252,11 @@
"repo": "trimstray/the-book-of-secret-knowledge",
"description": "A collection of inspiring lists, manuals, cheatsheets, blogs, hacks, one-liners, cli/web tools and more.",
"createdAt": "2018-06-23T10:43:14Z",
- "updatedAt": "2025-10-27T07:43:08Z",
+ "updatedAt": "2025-10-28T12:40:20Z",
"pushedAt": "2024-11-19T14:00:38Z",
- "stars": 191315,
- "watchers": 2679,
- "forks": 11763,
+ "stars": 191487,
+ "watchers": 2678,
+ "forks": 11764,
"defaultBranch": "master"
},
{
@@ -265,11 +265,11 @@
"repo": "getify/You-Dont-Know-JS",
"description": "A book series (2 published editions) on the JS language.",
"createdAt": "2013-11-16T02:37:24Z",
- "updatedAt": "2025-10-27T07:25:47Z",
+ "updatedAt": "2025-10-28T11:34:43Z",
"pushedAt": "2025-05-20T14:22:36Z",
- "stars": 183631,
- "watchers": 5802,
- "forks": 33668,
+ "stars": 183653,
+ "watchers": 5803,
+ "forks": 33671,
"defaultBranch": "2nd-ed"
},
{
@@ -278,11 +278,11 @@
"repo": "CyC2018/CS-Notes",
"description": ":books: 技术面试必备基础知识、Leetcode、计算机操作系统、计算机网络、系统设计",
"createdAt": "2018-02-13T14:56:24Z",
- "updatedAt": "2025-10-27T07:19:57Z",
+ "updatedAt": "2025-10-28T11:56:57Z",
"pushedAt": "2024-08-21T09:40:10Z",
- "stars": 182646,
- "watchers": 5252,
- "forks": 51251,
+ "stars": 182661,
+ "watchers": 5249,
+ "forks": 51249,
"defaultBranch": "master"
},
{
@@ -291,11 +291,11 @@
"repo": "ohmyzsh/ohmyzsh",
"description": "🙃 A delightful community-driven (with 2,400+ contributors) framework for managing your zsh configuration. Includes 300+ optional plugins (rails, git, macOS, hub, docker, homebrew, node, php, python, etc), 140+ themes to spice up your morning, and an auto-update tool that makes it easy to keep up with the latest updates from the community.",
"createdAt": "2009-08-28T18:15:37Z",
- "updatedAt": "2025-10-27T07:25:29Z",
- "pushedAt": "2025-10-26T13:17:47Z",
- "stars": 182297,
- "watchers": 2618,
- "forks": 26259,
+ "updatedAt": "2025-10-28T12:39:19Z",
+ "pushedAt": "2025-10-27T18:37:07Z",
+ "stars": 182331,
+ "watchers": 2620,
+ "forks": 26261,
"defaultBranch": "master"
},
{
@@ -304,11 +304,11 @@
"repo": "Significant-Gravitas/AutoGPT",
"description": "AutoGPT is the vision of accessible AI for everyone, to use and to build on. Our mission is to provide the tools, so that you can focus on what matters.",
"createdAt": "2023-03-16T09:21:07Z",
- "updatedAt": "2025-10-27T07:34:44Z",
- "pushedAt": "2025-10-27T00:10:36Z",
- "stars": 179292,
+ "updatedAt": "2025-10-28T12:01:03Z",
+ "pushedAt": "2025-10-28T11:50:06Z",
+ "stars": 179337,
"watchers": 1547,
- "forks": 46077,
+ "forks": 46094,
"defaultBranch": "master"
},
{
@@ -317,11 +317,11 @@
"repo": "microsoft/vscode",
"description": "Visual Studio Code",
"createdAt": "2015-09-03T20:23:38Z",
- "updatedAt": "2025-10-27T07:26:11Z",
- "pushedAt": "2025-10-27T07:29:25Z",
- "stars": 177925,
- "watchers": 3364,
- "forks": 35788,
+ "updatedAt": "2025-10-28T12:22:53Z",
+ "pushedAt": "2025-10-28T12:33:55Z",
+ "stars": 177962,
+ "watchers": 3366,
+ "forks": 35810,
"defaultBranch": "main"
},
{
@@ -330,11 +330,11 @@
"repo": "jackfrued/Python-100-Days",
"description": "Python - 100天从新手到大师",
"createdAt": "2018-03-01T16:05:52Z",
- "updatedAt": "2025-10-27T07:26:50Z",
+ "updatedAt": "2025-10-28T12:40:38Z",
"pushedAt": "2025-03-28T10:29:23Z",
- "stars": 173752,
+ "stars": 173818,
"watchers": 6098,
- "forks": 54771,
+ "forks": 54782,
"defaultBranch": "master"
},
{
@@ -343,11 +343,11 @@
"repo": "twbs/bootstrap",
"description": "The most popular HTML, CSS, and JavaScript framework for developing responsive, mobile first projects on the web.",
"createdAt": "2011-07-29T21:19:00Z",
- "updatedAt": "2025-10-27T07:25:34Z",
- "pushedAt": "2025-10-26T18:41:31Z",
- "stars": 173599,
- "watchers": 6681,
- "forks": 79156,
+ "updatedAt": "2025-10-28T12:25:19Z",
+ "pushedAt": "2025-10-28T10:02:33Z",
+ "stars": 173612,
+ "watchers": 6680,
+ "forks": 79159,
"defaultBranch": "main"
},
{
@@ -356,11 +356,11 @@
"repo": "flutter/flutter",
"description": "Flutter makes it easy and fast to build beautiful apps for mobile and beyond",
"createdAt": "2015-03-06T22:54:58Z",
- "updatedAt": "2025-10-27T07:31:00Z",
- "pushedAt": "2025-10-27T05:33:32Z",
- "stars": 173546,
+ "updatedAt": "2025-10-28T12:35:50Z",
+ "pushedAt": "2025-10-28T12:35:51Z",
+ "stars": 173572,
"watchers": 3481,
- "forks": 29414,
+ "forks": 29419,
"defaultBranch": "master"
},
{
@@ -369,11 +369,11 @@
"repo": "github/gitignore",
"description": "A collection of useful .gitignore templates",
"createdAt": "2010-11-08T20:17:14Z",
- "updatedAt": "2025-10-27T07:34:35Z",
+ "updatedAt": "2025-10-28T12:36:17Z",
"pushedAt": "2025-09-10T18:42:03Z",
- "stars": 170298,
- "watchers": 3366,
- "forks": 82998,
+ "stars": 170327,
+ "watchers": 3367,
+ "forks": 82996,
"defaultBranch": "main"
},
{
@@ -382,11 +382,11 @@
"repo": "jlevy/the-art-of-command-line",
"description": "Master the command line, in one page",
"createdAt": "2015-05-20T15:11:03Z",
- "updatedAt": "2025-10-27T07:26:07Z",
+ "updatedAt": "2025-10-28T10:16:58Z",
"pushedAt": "2024-06-25T18:13:44Z",
- "stars": 158582,
+ "stars": 158603,
"watchers": 2812,
- "forks": 14754,
+ "forks": 14753,
"defaultBranch": "master"
},
{
@@ -395,11 +395,11 @@
"repo": "AUTOMATIC1111/stable-diffusion-webui",
"description": "Stable Diffusion web UI",
"createdAt": "2022-08-22T14:05:26Z",
- "updatedAt": "2025-10-27T07:49:02Z",
+ "updatedAt": "2025-10-28T12:41:21Z",
"pushedAt": "2025-10-07T20:06:10Z",
- "stars": 157565,
- "watchers": 1154,
- "forks": 29246,
+ "stars": 157629,
+ "watchers": 1156,
+ "forks": 29254,
"defaultBranch": "master"
},
{
@@ -408,11 +408,11 @@
"repo": "avelino/awesome-go",
"description": "A curated list of awesome Go frameworks, libraries and software",
"createdAt": "2014-07-06T13:42:15Z",
- "updatedAt": "2025-10-27T07:49:36Z",
+ "updatedAt": "2025-10-28T12:41:20Z",
"pushedAt": "2025-10-22T12:15:14Z",
- "stars": 155801,
- "watchers": 2818,
- "forks": 12706,
+ "stars": 155912,
+ "watchers": 2820,
+ "forks": 12712,
"defaultBranch": "main"
},
{
@@ -421,11 +421,11 @@
"repo": "ollama/ollama",
"description": "Get up and running with OpenAI gpt-oss, DeepSeek-R1, Gemma 3 and other models.",
"createdAt": "2023-06-26T19:39:32Z",
- "updatedAt": "2025-10-27T07:43:05Z",
- "pushedAt": "2025-10-27T01:25:05Z",
- "stars": 154808,
- "watchers": 877,
- "forks": 13467,
+ "updatedAt": "2025-10-28T12:05:06Z",
+ "pushedAt": "2025-10-28T08:16:13Z",
+ "stars": 154883,
+ "watchers": 876,
+ "forks": 13480,
"defaultBranch": "main"
},
{
@@ -434,11 +434,11 @@
"repo": "massgravel/Microsoft-Activation-Scripts",
"description": "Open-source Windows and Office activator featuring HWID, Ohook, TSforge, KMS38, and Online KMS activation methods, along with advanced troubleshooting.",
"createdAt": "2020-01-12T23:03:34Z",
- "updatedAt": "2025-10-27T07:44:35Z",
+ "updatedAt": "2025-10-28T12:40:24Z",
"pushedAt": "2025-09-30T22:22:59Z",
- "stars": 153864,
+ "stars": 154022,
"watchers": 1319,
- "forks": 14861,
+ "forks": 14869,
"defaultBranch": "master"
},
{
@@ -447,11 +447,11 @@
"repo": "Snailclimb/JavaGuide",
"description": "「Java学习+面试指南」一份涵盖大部分 Java 程序员所需要掌握的核心知识。准备 Java 面试,首选 JavaGuide!",
"createdAt": "2018-05-07T13:27:00Z",
- "updatedAt": "2025-10-27T07:25:13Z",
- "pushedAt": "2025-10-20T08:53:33Z",
- "stars": 152308,
- "watchers": 4470,
- "forks": 46020,
+ "updatedAt": "2025-10-28T12:01:53Z",
+ "pushedAt": "2025-10-27T11:09:05Z",
+ "stars": 152325,
+ "watchers": 4469,
+ "forks": 46021,
"defaultBranch": "main"
},
{
@@ -460,11 +460,11 @@
"repo": "n8n-io/n8n",
"description": "Fair-code workflow automation platform with native AI capabilities. Combine visual building with custom code, self-host or cloud, 400+ integrations.",
"createdAt": "2019-06-22T09:24:21Z",
- "updatedAt": "2025-10-27T07:48:50Z",
- "pushedAt": "2025-10-27T07:12:52Z",
- "stars": 151975,
- "watchers": 880,
- "forks": 48459,
+ "updatedAt": "2025-10-28T12:41:23Z",
+ "pushedAt": "2025-10-28T12:34:50Z",
+ "stars": 152300,
+ "watchers": 889,
+ "forks": 48578,
"defaultBranch": "master"
},
{
@@ -473,11 +473,11 @@
"repo": "huggingface/transformers",
"description": "🤗 Transformers: the model-definition framework for state-of-the-art machine learning models in text, vision, audio, and multimodal models, for both inference and training. ",
"createdAt": "2018-10-29T13:56:00Z",
- "updatedAt": "2025-10-27T07:45:24Z",
- "pushedAt": "2025-10-25T16:31:22Z",
- "stars": 151659,
- "watchers": 1166,
- "forks": 30955,
+ "updatedAt": "2025-10-28T12:41:10Z",
+ "pushedAt": "2025-10-28T12:38:18Z",
+ "stars": 151745,
+ "watchers": 1167,
+ "forks": 30971,
"defaultBranch": "main"
},
{
@@ -486,11 +486,11 @@
"repo": "airbnb/javascript",
"description": "JavaScript Style Guide",
"createdAt": "2012-11-01T23:13:50Z",
- "updatedAt": "2025-10-27T06:50:33Z",
+ "updatedAt": "2025-10-28T11:07:36Z",
"pushedAt": "2025-09-17T18:12:44Z",
- "stars": 147687,
- "watchers": 3705,
- "forks": 26797,
+ "stars": 147700,
+ "watchers": 3702,
+ "forks": 26795,
"defaultBranch": "master"
},
{
@@ -499,24 +499,37 @@
"repo": "ytdl-org/youtube-dl",
"description": "Command-line program to download videos from YouTube.com and other video sites",
"createdAt": "2010-10-31T14:35:07Z",
- "updatedAt": "2025-10-27T07:30:15Z",
+ "updatedAt": "2025-10-28T12:01:08Z",
"pushedAt": "2025-10-18T10:02:28Z",
- "stars": 138545,
+ "stars": 138581,
"watchers": 2160,
"forks": 10527,
"defaultBranch": "master"
},
+ {
+ "id": 599320067,
+ "name": "langflow",
+ "repo": "langflow-ai/langflow",
+ "description": "Langflow is a powerful tool for building and deploying AI-powered agents and workflows.",
+ "createdAt": "2023-02-08T22:28:03Z",
+ "updatedAt": "2025-10-28T12:04:14Z",
+ "pushedAt": "2025-10-28T11:44:40Z",
+ "stars": 136336,
+ "watchers": 454,
+ "forks": 7859,
+ "defaultBranch": "main"
+ },
{
"id": 574523116,
"name": "awesome-chatgpt-prompts",
"repo": "f/awesome-chatgpt-prompts",
"description": "This repo includes ChatGPT prompt curation to use ChatGPT and other LLM tools better.",
"createdAt": "2022-12-05T13:54:13Z",
- "updatedAt": "2025-10-27T07:42:24Z",
+ "updatedAt": "2025-10-28T12:32:02Z",
"pushedAt": "2025-10-14T17:23:13Z",
- "stars": 135794,
- "watchers": 1562,
- "forks": 18073,
+ "stars": 135843,
+ "watchers": 1563,
+ "forks": 18078,
"defaultBranch": "main"
},
{
@@ -525,37 +538,24 @@
"repo": "vercel/next.js",
"description": "The React Framework",
"createdAt": "2016-10-05T23:32:51Z",
- "updatedAt": "2025-10-27T07:38:47Z",
- "pushedAt": "2025-10-27T07:02:37Z",
- "stars": 135306,
- "watchers": 1497,
- "forks": 29680,
+ "updatedAt": "2025-10-28T12:19:30Z",
+ "pushedAt": "2025-10-28T12:22:48Z",
+ "stars": 135333,
+ "watchers": 1495,
+ "forks": 29693,
"defaultBranch": "canary"
},
- {
- "id": 599320067,
- "name": "langflow",
- "repo": "langflow-ai/langflow",
- "description": "Langflow is a powerful tool for building and deploying AI-powered agents and workflows.",
- "createdAt": "2023-02-08T22:28:03Z",
- "updatedAt": "2025-10-27T07:22:05Z",
- "pushedAt": "2025-10-27T00:28:51Z",
- "stars": 134904,
- "watchers": 453,
- "forks": 7853,
- "defaultBranch": "main"
- },
{
"id": 307260205,
"name": "yt-dlp",
"repo": "yt-dlp/yt-dlp",
"description": "A feature-rich command-line audio/video downloader",
"createdAt": "2020-10-26T04:22:55Z",
- "updatedAt": "2025-10-27T07:35:17Z",
- "pushedAt": "2025-10-25T22:47:00Z",
- "stars": 132793,
- "watchers": 675,
- "forks": 10659,
+ "updatedAt": "2025-10-28T12:38:42Z",
+ "pushedAt": "2025-10-27T23:21:38Z",
+ "stars": 132949,
+ "watchers": 678,
+ "forks": 10668,
"defaultBranch": "master"
},
{
@@ -564,10 +564,10 @@
"repo": "521xueweihan/HelloGitHub",
"description": ":octocat: 分享 GitHub 上有趣、入门级的开源项目。Share interesting, entry-level open source projects on GitHub.",
"createdAt": "2016-05-04T06:24:11Z",
- "updatedAt": "2025-10-27T07:49:37Z",
- "pushedAt": "2025-09-28T02:00:22Z",
- "stars": 132228,
- "watchers": 4182,
+ "updatedAt": "2025-10-28T12:13:38Z",
+ "pushedAt": "2025-10-28T00:14:25Z",
+ "stars": 132365,
+ "watchers": 4187,
"forks": 10822,
"defaultBranch": "master"
},
@@ -577,11 +577,11 @@
"repo": "yangshun/tech-interview-handbook",
"description": "💯 Curated coding interview preparation materials for busy software engineers",
"createdAt": "2016-07-05T05:00:48Z",
- "updatedAt": "2025-10-27T07:26:22Z",
+ "updatedAt": "2025-10-28T09:33:23Z",
"pushedAt": "2025-08-27T00:17:33Z",
- "stars": 131399,
+ "stars": 131430,
"watchers": 2182,
- "forks": 15942,
+ "forks": 15945,
"defaultBranch": "main"
},
{
@@ -590,11 +590,11 @@
"repo": "golang/go",
"description": "The Go programming language",
"createdAt": "2014-08-19T04:33:40Z",
- "updatedAt": "2025-10-27T07:25:58Z",
- "pushedAt": "2025-10-27T04:49:52Z",
- "stars": 130538,
- "watchers": 3346,
- "forks": 18415,
+ "updatedAt": "2025-10-28T11:52:10Z",
+ "pushedAt": "2025-10-28T06:29:46Z",
+ "stars": 130554,
+ "watchers": 3347,
+ "forks": 18419,
"defaultBranch": "master"
},
{
@@ -603,11 +603,11 @@
"repo": "Genymobile/scrcpy",
"description": "Display and control your Android device",
"createdAt": "2017-11-21T18:00:27Z",
- "updatedAt": "2025-10-27T07:30:24Z",
- "pushedAt": "2025-10-26T10:52:03Z",
- "stars": 130238,
- "watchers": 1321,
- "forks": 12191,
+ "updatedAt": "2025-10-28T12:05:50Z",
+ "pushedAt": "2025-10-27T08:59:41Z",
+ "stars": 130304,
+ "watchers": 1322,
+ "forks": 12194,
"defaultBranch": "master"
},
{
@@ -616,11 +616,11 @@
"repo": "labuladong/fucking-algorithm",
"description": "刷算法全靠套路,认准 labuladong 就够了!English version supported! Crack LeetCode, not only how, but also why. ",
"createdAt": "2020-02-19T09:01:23Z",
- "updatedAt": "2025-10-27T07:27:20Z",
+ "updatedAt": "2025-10-28T08:35:53Z",
"pushedAt": "2025-10-08T04:06:00Z",
- "stars": 129651,
+ "stars": 129669,
"watchers": 2283,
- "forks": 23450,
+ "forks": 23452,
"defaultBranch": "master"
},
{
@@ -629,11 +629,11 @@
"repo": "Chalarangelo/30-seconds-of-code",
"description": "Coding articles to level up your development skills",
"createdAt": "2017-11-29T17:35:03Z",
- "updatedAt": "2025-10-27T07:26:47Z",
+ "updatedAt": "2025-10-28T09:14:02Z",
"pushedAt": "2025-10-22T12:51:11Z",
- "stars": 125630,
+ "stars": 125639,
"watchers": 2594,
- "forks": 12358,
+ "forks": 12362,
"defaultBranch": "master"
},
{
@@ -642,11 +642,11 @@
"repo": "microsoft/PowerToys",
"description": "Microsoft PowerToys is a collection of utilities that help you customize Windows and streamline everyday tasks",
"createdAt": "2019-05-01T17:44:02Z",
- "updatedAt": "2025-10-27T07:50:46Z",
- "pushedAt": "2025-10-27T02:44:52Z",
- "stars": 125223,
- "watchers": 1164,
- "forks": 7451,
+ "updatedAt": "2025-10-28T12:21:07Z",
+ "pushedAt": "2025-10-28T10:55:13Z",
+ "stars": 125271,
+ "watchers": 1166,
+ "forks": 7454,
"defaultBranch": "main"
},
{
@@ -655,11 +655,11 @@
"repo": "facebook/react-native",
"description": "A framework for building native applications using React",
"createdAt": "2015-01-09T18:10:16Z",
- "updatedAt": "2025-10-27T07:20:37Z",
- "pushedAt": "2025-10-27T06:53:57Z",
- "stars": 124320,
+ "updatedAt": "2025-10-28T12:36:00Z",
+ "pushedAt": "2025-10-28T12:25:56Z",
+ "stars": 124334,
"watchers": 3563,
- "forks": 24914,
+ "forks": 24916,
"defaultBranch": "main"
},
{
@@ -668,37 +668,37 @@
"repo": "electron/electron",
"description": ":electron: Build cross-platform desktop apps with JavaScript, HTML, and CSS",
"createdAt": "2013-04-12T01:47:36Z",
- "updatedAt": "2025-10-27T07:25:42Z",
- "pushedAt": "2025-10-27T06:46:57Z",
- "stars": 118841,
+ "updatedAt": "2025-10-28T11:35:46Z",
+ "pushedAt": "2025-10-28T09:28:32Z",
+ "stars": 118860,
"watchers": 2801,
- "forks": 16578,
+ "forks": 16584,
"defaultBranch": "main"
},
- {
- "id": 20580498,
- "name": "kubernetes",
- "repo": "kubernetes/kubernetes",
- "description": "Production-Grade Container Scheduling and Management",
- "createdAt": "2014-06-06T22:56:04Z",
- "updatedAt": "2025-10-27T07:31:13Z",
- "pushedAt": "2025-10-26T22:21:34Z",
- "stars": 118226,
- "watchers": 3189,
- "forks": 41578,
- "defaultBranch": "master"
- },
{
"id": 552661142,
"name": "langchain",
"repo": "langchain-ai/langchain",
"description": "🦜🔗 Build context-aware reasoning applications",
"createdAt": "2022-10-17T02:58:36Z",
- "updatedAt": "2025-10-27T07:37:09Z",
- "pushedAt": "2025-10-27T07:39:14Z",
- "stars": 118140,
- "watchers": 775,
- "forks": 19453,
+ "updatedAt": "2025-10-28T12:37:33Z",
+ "pushedAt": "2025-10-27T23:47:43Z",
+ "stars": 118261,
+ "watchers": 776,
+ "forks": 19476,
+ "defaultBranch": "master"
+ },
+ {
+ "id": 20580498,
+ "name": "kubernetes",
+ "repo": "kubernetes/kubernetes",
+ "description": "Production-Grade Container Scheduling and Management",
+ "createdAt": "2014-06-06T22:56:04Z",
+ "updatedAt": "2025-10-28T12:19:38Z",
+ "pushedAt": "2025-10-28T10:29:37Z",
+ "stars": 118246,
+ "watchers": 3189,
+ "forks": 41587,
"defaultBranch": "master"
},
{
@@ -707,10 +707,10 @@
"repo": "krahets/hello-algo",
"description": "《Hello 算法》:动画图解、一键运行的数据结构与算法教程。支持 Python, Java, C++, C, C#, JS, Go, Swift, Rust, Ruby, Kotlin, TS, Dart 代码。简体版和繁体版同步更新,English version in translation",
"createdAt": "2022-11-04T11:08:34Z",
- "updatedAt": "2025-10-27T07:28:05Z",
+ "updatedAt": "2025-10-28T12:30:36Z",
"pushedAt": "2025-10-16T21:33:36Z",
- "stars": 118081,
- "watchers": 582,
+ "stars": 118105,
+ "watchers": 583,
"forks": 14500,
"defaultBranch": "main"
},
@@ -720,11 +720,11 @@
"repo": "langgenius/dify",
"description": "Production-ready platform for agentic workflow development.",
"createdAt": "2023-04-12T07:40:24Z",
- "updatedAt": "2025-10-27T07:45:31Z",
- "pushedAt": "2025-10-27T07:48:43Z",
- "stars": 117359,
- "watchers": 697,
- "forks": 18125,
+ "updatedAt": "2025-10-28T12:18:46Z",
+ "pushedAt": "2025-10-28T10:48:12Z",
+ "stars": 117486,
+ "watchers": 698,
+ "forks": 18151,
"defaultBranch": "main"
},
{
@@ -733,10 +733,10 @@
"repo": "justjavac/free-programming-books-zh_CN",
"description": ":books: 免费的计算机编程类中文书籍,欢迎投稿",
"createdAt": "2013-11-04T01:59:19Z",
- "updatedAt": "2025-10-27T07:25:46Z",
+ "updatedAt": "2025-10-28T09:19:09Z",
"pushedAt": "2024-07-15T08:55:20Z",
- "stars": 115537,
- "watchers": 5860,
+ "stars": 115543,
+ "watchers": 5859,
"forks": 28362,
"defaultBranch": "main"
},
@@ -746,11 +746,11 @@
"repo": "ripienaar/free-for-dev",
"description": "A list of SaaS, PaaS and IaaS offerings that have free tiers of interest to devops and infradev",
"createdAt": "2015-03-18T21:06:26Z",
- "updatedAt": "2025-10-27T07:26:05Z",
+ "updatedAt": "2025-10-28T11:38:56Z",
"pushedAt": "2025-10-23T04:49:00Z",
- "stars": 114093,
- "watchers": 1734,
- "forks": 11683,
+ "stars": 114128,
+ "watchers": 1735,
+ "forks": 11684,
"defaultBranch": "master"
},
{
@@ -759,11 +759,11 @@
"repo": "nodejs/node",
"description": "Node.js JavaScript runtime ✨🐢🚀✨",
"createdAt": "2014-11-26T19:57:11Z",
- "updatedAt": "2025-10-27T07:38:07Z",
- "pushedAt": "2025-10-27T01:02:07Z",
- "stars": 113974,
- "watchers": 2964,
- "forks": 33571,
+ "updatedAt": "2025-10-28T12:34:32Z",
+ "pushedAt": "2025-10-28T11:29:04Z",
+ "stars": 114019,
+ "watchers": 2963,
+ "forks": 33580,
"defaultBranch": "main"
},
{
@@ -772,24 +772,11 @@
"repo": "open-webui/open-webui",
"description": "User-friendly AI Interface (Supports Ollama, OpenAI API, ...)",
"createdAt": "2023-10-06T22:08:27Z",
- "updatedAt": "2025-10-27T07:32:58Z",
- "pushedAt": "2025-10-27T05:20:59Z",
- "stars": 113474,
- "watchers": 516,
- "forks": 15764,
- "defaultBranch": "main"
- },
- {
- "id": 943149,
- "name": "d3",
- "repo": "d3/d3",
- "description": "Bring data to life with SVG, Canvas and HTML. :bar_chart::chart_with_upwards_trend::tada:",
- "createdAt": "2010-09-27T17:22:42Z",
- "updatedAt": "2025-10-27T07:25:31Z",
- "pushedAt": "2025-07-27T11:30:40Z",
- "stars": 111683,
- "watchers": 3558,
- "forks": 22851,
+ "updatedAt": "2025-10-28T12:22:47Z",
+ "pushedAt": "2025-10-28T08:46:37Z",
+ "stars": 113575,
+ "watchers": 515,
+ "forks": 15783,
"defaultBranch": "main"
},
{
@@ -798,11 +785,24 @@
"repo": "DigitalPlatDev/FreeDomain",
"description": "DigitalPlat FreeDomain: Free Domain For Everyone",
"createdAt": "2024-05-30T13:23:00Z",
- "updatedAt": "2025-10-27T07:49:47Z",
+ "updatedAt": "2025-10-28T12:40:49Z",
"pushedAt": "2025-09-25T12:12:01Z",
- "stars": 111350,
+ "stars": 111985,
"watchers": 120,
- "forks": 2066,
+ "forks": 2068,
+ "defaultBranch": "main"
+ },
+ {
+ "id": 943149,
+ "name": "d3",
+ "repo": "d3/d3",
+ "description": "Bring data to life with SVG, Canvas and HTML. :bar_chart::chart_with_upwards_trend::tada:",
+ "createdAt": "2010-09-27T17:22:42Z",
+ "updatedAt": "2025-10-28T09:47:08Z",
+ "pushedAt": "2025-07-27T11:30:40Z",
+ "stars": 111693,
+ "watchers": 3558,
+ "forks": 22850,
"defaultBranch": "main"
},
{
@@ -811,11 +811,11 @@
"repo": "excalidraw/excalidraw",
"description": "Virtual whiteboard for sketching hand-drawn like diagrams",
"createdAt": "2020-01-02T01:04:43Z",
- "updatedAt": "2025-10-27T07:49:00Z",
- "pushedAt": "2025-10-27T06:42:25Z",
- "stars": 109225,
+ "updatedAt": "2025-10-28T12:38:34Z",
+ "pushedAt": "2025-10-28T11:43:31Z",
+ "stars": 109315,
"watchers": 467,
- "forks": 11332,
+ "forks": 11345,
"defaultBranch": "master"
},
{
@@ -824,11 +824,11 @@
"repo": "mrdoob/three.js",
"description": "JavaScript 3D Library.",
"createdAt": "2010-03-23T18:58:01Z",
- "updatedAt": "2025-10-27T07:25:30Z",
- "pushedAt": "2025-10-26T17:25:47Z",
- "stars": 109123,
- "watchers": 2517,
- "forks": 36051,
+ "updatedAt": "2025-10-28T12:07:59Z",
+ "pushedAt": "2025-10-28T12:13:11Z",
+ "stars": 109143,
+ "watchers": 2518,
+ "forks": 36054,
"defaultBranch": "dev"
},
{
@@ -837,11 +837,11 @@
"repo": "axios/axios",
"description": "Promise based HTTP client for the browser and node.js",
"createdAt": "2014-08-18T22:30:27Z",
- "updatedAt": "2025-10-27T05:22:18Z",
- "pushedAt": "2025-10-26T22:46:40Z",
- "stars": 108017,
+ "updatedAt": "2025-10-28T12:10:56Z",
+ "pushedAt": "2025-10-27T19:08:10Z",
+ "stars": 108032,
"watchers": 1169,
- "forks": 11366,
+ "forks": 11371,
"defaultBranch": "v1.x"
},
{
@@ -850,11 +850,11 @@
"repo": "rust-lang/rust",
"description": "Empowering everyone to build reliable and efficient software.",
"createdAt": "2010-06-16T20:39:03Z",
- "updatedAt": "2025-10-27T06:39:34Z",
- "pushedAt": "2025-10-27T07:25:41Z",
- "stars": 107453,
- "watchers": 1467,
- "forks": 13897,
+ "updatedAt": "2025-10-28T12:40:15Z",
+ "pushedAt": "2025-10-28T11:12:51Z",
+ "stars": 107478,
+ "watchers": 1468,
+ "forks": 13900,
"defaultBranch": "master"
},
{
@@ -863,11 +863,11 @@
"repo": "microsoft/TypeScript",
"description": "TypeScript is a superset of JavaScript that compiles to clean JavaScript output.",
"createdAt": "2014-06-17T15:28:39Z",
- "updatedAt": "2025-10-27T07:20:39Z",
- "pushedAt": "2025-10-27T00:06:54Z",
- "stars": 106530,
+ "updatedAt": "2025-10-28T12:19:23Z",
+ "pushedAt": "2025-10-27T23:52:12Z",
+ "stars": 106557,
"watchers": 2148,
- "forks": 13084,
+ "forks": 13086,
"defaultBranch": "main"
},
{
@@ -876,11 +876,11 @@
"repo": "denoland/deno",
"description": "A modern runtime for JavaScript and TypeScript.",
"createdAt": "2018-05-15T01:34:26Z",
- "updatedAt": "2025-10-27T07:14:57Z",
- "pushedAt": "2025-10-24T23:41:20Z",
- "stars": 104915,
+ "updatedAt": "2025-10-28T12:27:16Z",
+ "pushedAt": "2025-10-28T09:10:45Z",
+ "stars": 104939,
"watchers": 1398,
- "forks": 5753,
+ "forks": 5754,
"defaultBranch": "main"
},
{
@@ -889,11 +889,11 @@
"repo": "goldbergyoni/nodebestpractices",
"description": ":white_check_mark: The Node.js best practices list (July 2024)",
"createdAt": "2017-09-15T08:33:19Z",
- "updatedAt": "2025-10-27T07:26:43Z",
+ "updatedAt": "2025-10-28T11:50:28Z",
"pushedAt": "2025-04-15T21:52:42Z",
- "stars": 104439,
+ "stars": 104455,
"watchers": 1944,
- "forks": 10627,
+ "forks": 10625,
"defaultBranch": "master"
},
{
@@ -902,11 +902,11 @@
"repo": "facebook/create-react-app",
"description": "Set up a modern web app by running one command.",
"createdAt": "2016-07-17T14:55:11Z",
- "updatedAt": "2025-10-27T07:26:24Z",
+ "updatedAt": "2025-10-28T12:35:24Z",
"pushedAt": "2025-02-15T01:32:11Z",
- "stars": 103811,
- "watchers": 1892,
- "forks": 27146,
+ "stars": 103813,
+ "watchers": 1891,
+ "forks": 27148,
"defaultBranch": "main"
},
{
@@ -915,11 +915,11 @@
"repo": "GrowingGit/GitHub-Chinese-Top-Charts",
"description": ":cn: GitHub中文排行榜,各语言分设「软件 | 资料」榜单,精准定位中文好项目。各取所需,高效学习。",
"createdAt": "2019-09-05T03:01:56Z",
- "updatedAt": "2025-10-27T06:04:01Z",
+ "updatedAt": "2025-10-28T10:36:09Z",
"pushedAt": "2024-10-12T06:51:36Z",
- "stars": 103336,
+ "stars": 103358,
"watchers": 2607,
- "forks": 13364,
+ "forks": 13363,
"defaultBranch": "master"
},
{
@@ -928,11 +928,11 @@
"repo": "godotengine/godot",
"description": "Godot Engine – Multi-platform 2D and 3D game engine",
"createdAt": "2014-01-04T16:05:36Z",
- "updatedAt": "2025-10-27T07:16:51Z",
- "pushedAt": "2025-10-25T20:48:20Z",
- "stars": 102604,
+ "updatedAt": "2025-10-28T11:39:26Z",
+ "pushedAt": "2025-10-28T08:43:09Z",
+ "stars": 102655,
"watchers": 1493,
- "forks": 23450,
+ "forks": 23457,
"defaultBranch": "master"
},
{
@@ -941,11 +941,11 @@
"repo": "rustdesk/rustdesk",
"description": "An open-source remote desktop application designed for self-hosting, as an alternative to TeamViewer.",
"createdAt": "2020-09-28T15:36:08Z",
- "updatedAt": "2025-10-27T07:42:29Z",
- "pushedAt": "2025-10-26T13:28:57Z",
- "stars": 101456,
+ "updatedAt": "2025-10-28T12:27:03Z",
+ "pushedAt": "2025-10-28T12:25:33Z",
+ "stars": 101531,
"watchers": 548,
- "forks": 14837,
+ "forks": 14850,
"defaultBranch": "master"
},
{
@@ -954,11 +954,11 @@
"repo": "microsoft/generative-ai-for-beginners",
"description": "21 Lessons, Get Started Building with Generative AI ",
"createdAt": "2023-06-19T16:28:59Z",
- "updatedAt": "2025-10-27T07:38:12Z",
+ "updatedAt": "2025-10-28T12:25:17Z",
"pushedAt": "2025-10-27T03:19:39Z",
- "stars": 100935,
- "watchers": 889,
- "forks": 53478,
+ "stars": 101010,
+ "watchers": 887,
+ "forks": 53526,
"defaultBranch": "main"
},
{
@@ -967,9 +967,9 @@
"repo": "microsoft/terminal",
"description": "The new Windows Terminal and the original Windows console host, all in the same place!",
"createdAt": "2017-08-11T18:38:22Z",
- "updatedAt": "2025-10-27T05:40:24Z",
- "pushedAt": "2025-10-22T01:31:33Z",
- "stars": 100726,
+ "updatedAt": "2025-10-28T12:08:57Z",
+ "pushedAt": "2025-10-28T03:04:50Z",
+ "stars": 100746,
"watchers": 1334,
"forks": 8879,
"defaultBranch": "main"
@@ -980,11 +980,11 @@
"repo": "fatedier/frp",
"description": "A fast reverse proxy to help you expose a local server behind a NAT or firewall to the internet.",
"createdAt": "2015-12-21T15:24:59Z",
- "updatedAt": "2025-10-27T07:00:25Z",
- "pushedAt": "2025-10-17T02:53:43Z",
- "stars": 100015,
- "watchers": 1563,
- "forks": 14562,
+ "updatedAt": "2025-10-28T11:57:26Z",
+ "pushedAt": "2025-10-28T09:52:35Z",
+ "stars": 100048,
+ "watchers": 1564,
+ "forks": 14567,
"defaultBranch": "dev"
},
{
@@ -993,11 +993,11 @@
"repo": "deepseek-ai/DeepSeek-V3",
"description": null,
"createdAt": "2024-12-26T09:52:40Z",
- "updatedAt": "2025-10-27T07:28:30Z",
+ "updatedAt": "2025-10-28T12:11:53Z",
"pushedAt": "2025-08-28T03:24:37Z",
- "stars": 99981,
- "watchers": 750,
- "forks": 16309,
+ "stars": 100020,
+ "watchers": 752,
+ "forks": 16313,
"defaultBranch": "main"
},
{
@@ -1006,11 +1006,11 @@
"repo": "Hack-with-Github/Awesome-Hacking",
"description": "A collection of various awesome lists for hackers, pentesters and security researchers",
"createdAt": "2016-03-30T15:47:10Z",
- "updatedAt": "2025-10-27T07:49:40Z",
+ "updatedAt": "2025-10-28T12:11:25Z",
"pushedAt": "2025-01-18T01:48:02Z",
- "stars": 99684,
- "watchers": 3931,
- "forks": 9634,
+ "stars": 99746,
+ "watchers": 3932,
+ "forks": 9633,
"defaultBranch": "master"
},
{
@@ -1019,9 +1019,9 @@
"repo": "papers-we-love/papers-we-love",
"description": "Papers from the computer science community to read and discuss.",
"createdAt": "2013-12-15T14:31:41Z",
- "updatedAt": "2025-10-27T07:49:42Z",
+ "updatedAt": "2025-10-28T12:35:57Z",
"pushedAt": "2025-10-10T15:35:14Z",
- "stars": 99626,
+ "stars": 99660,
"watchers": 3159,
"forks": 6144,
"defaultBranch": "main"
@@ -1032,11 +1032,11 @@
"repo": "angular/angular",
"description": "Deliver web apps with confidence 🚀",
"createdAt": "2014-09-18T16:12:01Z",
- "updatedAt": "2025-10-27T07:05:22Z",
- "pushedAt": "2025-10-24T19:28:33Z",
- "stars": 99167,
+ "updatedAt": "2025-10-28T11:07:05Z",
+ "pushedAt": "2025-10-28T10:04:30Z",
+ "stars": 99174,
"watchers": 2980,
- "forks": 26724,
+ "forks": 26730,
"defaultBranch": "main"
},
{
@@ -1045,11 +1045,11 @@
"repo": "shadcn-ui/ui",
"description": "A set of beautifully-designed, accessible components and a code distribution platform. Works with your favorite frameworks. Open Source. Open Code.",
"createdAt": "2023-01-04T12:43:27Z",
- "updatedAt": "2025-10-27T07:34:00Z",
- "pushedAt": "2025-10-27T07:18:39Z",
- "stars": 98464,
- "watchers": 306,
- "forks": 7031,
+ "updatedAt": "2025-10-28T12:32:30Z",
+ "pushedAt": "2025-10-28T12:41:17Z",
+ "stars": 98552,
+ "watchers": 307,
+ "forks": 7046,
"defaultBranch": "main"
},
{
@@ -1058,11 +1058,11 @@
"repo": "tauri-apps/tauri",
"description": "Build smaller, faster, and more secure desktop and mobile applications with a web frontend.",
"createdAt": "2019-07-13T09:09:37Z",
- "updatedAt": "2025-10-27T07:27:10Z",
- "pushedAt": "2025-10-26T13:55:16Z",
- "stars": 98199,
+ "updatedAt": "2025-10-28T12:32:07Z",
+ "pushedAt": "2025-10-28T10:29:35Z",
+ "stars": 98262,
"watchers": 530,
- "forks": 3133,
+ "forks": 3139,
"defaultBranch": "dev"
},
{
@@ -1071,11 +1071,11 @@
"repo": "iptv-org/iptv",
"description": "Collection of publicly available IPTV channels from all over the world",
"createdAt": "2018-11-14T22:00:57Z",
- "updatedAt": "2025-10-27T07:13:48Z",
- "pushedAt": "2025-10-27T00:13:17Z",
- "stars": 98051,
- "watchers": 1950,
- "forks": 4195,
+ "updatedAt": "2025-10-28T12:32:18Z",
+ "pushedAt": "2025-10-28T00:11:46Z",
+ "stars": 98083,
+ "watchers": 1952,
+ "forks": 4199,
"defaultBranch": "master"
},
{
@@ -1084,9 +1084,9 @@
"repo": "mui/material-ui",
"description": "Material UI: Comprehensive React component library that implements Google's Material Design. Free forever.",
"createdAt": "2014-08-18T19:11:54Z",
- "updatedAt": "2025-10-27T07:25:58Z",
- "pushedAt": "2025-10-27T07:11:45Z",
- "stars": 96875,
+ "updatedAt": "2025-10-28T08:02:20Z",
+ "pushedAt": "2025-10-28T06:08:34Z",
+ "stars": 96887,
"watchers": 1312,
"forks": 32696,
"defaultBranch": "master"
@@ -1097,11 +1097,11 @@
"repo": "ant-design/ant-design",
"description": "An enterprise-class UI design language and React UI library",
"createdAt": "2015-04-24T15:37:24Z",
- "updatedAt": "2025-10-27T07:19:39Z",
- "pushedAt": "2025-10-27T07:44:37Z",
- "stars": 96467,
+ "updatedAt": "2025-10-28T11:00:38Z",
+ "pushedAt": "2025-10-28T10:52:44Z",
+ "stars": 96472,
"watchers": 236,
- "forks": 53873,
+ "forks": 53890,
"defaultBranch": "master"
},
{
@@ -1110,11 +1110,11 @@
"repo": "Anduin2017/HowToCook",
"description": "程序员在家做饭方法指南。Programmer's guide about how to cook at home (Simplified Chinese only).",
"createdAt": "2020-02-29T10:43:49Z",
- "updatedAt": "2025-10-27T07:31:17Z",
- "pushedAt": "2025-10-23T12:40:47Z",
- "stars": 95393,
+ "updatedAt": "2025-10-28T12:35:03Z",
+ "pushedAt": "2025-10-28T11:30:11Z",
+ "stars": 95425,
"watchers": 488,
- "forks": 10650,
+ "forks": 10651,
"defaultBranch": "master"
},
{
@@ -1123,9 +1123,9 @@
"repo": "nvbn/thefuck",
"description": "Magnificent app which corrects your previous console command.",
"createdAt": "2015-04-08T15:08:04Z",
- "updatedAt": "2025-10-27T07:26:06Z",
+ "updatedAt": "2025-10-28T12:34:25Z",
"pushedAt": "2024-07-19T14:56:13Z",
- "stars": 94482,
+ "stars": 94497,
"watchers": 825,
"forks": 3792,
"defaultBranch": "master"
@@ -1136,11 +1136,11 @@
"repo": "pytorch/pytorch",
"description": "Tensors and Dynamic neural networks in Python with strong GPU acceleration",
"createdAt": "2016-08-13T05:26:41Z",
- "updatedAt": "2025-10-27T07:51:08Z",
- "pushedAt": "2025-10-27T07:51:03Z",
- "stars": 94273,
- "watchers": 1771,
- "forks": 25671,
+ "updatedAt": "2025-10-28T12:25:28Z",
+ "pushedAt": "2025-10-28T12:40:19Z",
+ "stars": 94326,
+ "watchers": 1770,
+ "forks": 25678,
"defaultBranch": "main"
},
{
@@ -1149,11 +1149,11 @@
"repo": "ryanmcdermott/clean-code-javascript",
"description": "Clean Code concepts adapted for JavaScript",
"createdAt": "2016-11-25T22:25:41Z",
- "updatedAt": "2025-10-27T03:36:56Z",
+ "updatedAt": "2025-10-28T08:55:17Z",
"pushedAt": "2024-07-29T07:24:37Z",
- "stars": 93960,
+ "stars": 93959,
"watchers": 1744,
- "forks": 12496,
+ "forks": 12495,
"defaultBranch": "master"
},
{
@@ -1162,11 +1162,11 @@
"repo": "mtdvio/every-programmer-should-know",
"description": "A collection of (mostly) technical things every software developer should know about",
"createdAt": "2017-08-24T13:18:26Z",
- "updatedAt": "2025-10-27T07:26:42Z",
+ "updatedAt": "2025-10-28T11:51:44Z",
"pushedAt": "2025-10-22T15:21:18Z",
- "stars": 93814,
+ "stars": 93832,
"watchers": 2011,
- "forks": 8436,
+ "forks": 8437,
"defaultBranch": "master"
},
{
@@ -1175,11 +1175,11 @@
"repo": "neovim/neovim",
"description": "Vim-fork focused on extensibility and usability",
"createdAt": "2014-01-31T13:39:22Z",
- "updatedAt": "2025-10-27T07:30:43Z",
- "pushedAt": "2025-10-27T05:15:23Z",
- "stars": 93731,
- "watchers": 971,
- "forks": 6378,
+ "updatedAt": "2025-10-28T12:38:30Z",
+ "pushedAt": "2025-10-28T08:45:46Z",
+ "stars": 93768,
+ "watchers": 972,
+ "forks": 6376,
"defaultBranch": "master"
},
{
@@ -1188,11 +1188,11 @@
"repo": "x1xhlol/system-prompts-and-models-of-ai-tools",
"description": "FULL Augment Code, Claude Code, Cluely, CodeBuddy, Comet, Cursor, Devin AI, Junie, Kiro, Leap.new, Lovable, Manus Agent Tools, NotionAI, Orchids.app, Perplexity, Poke, Qoder, Replit, Same.dev, Trae, Traycer AI, VSCode Agent, Warp.dev, Windsurf, Xcode, Z.ai Code, dia & v0. (And other Open Sourced) System Prompts, Internal Tools & AI Models",
"createdAt": "2025-03-05T16:38:29Z",
- "updatedAt": "2025-10-27T07:37:40Z",
+ "updatedAt": "2025-10-28T12:37:42Z",
"pushedAt": "2025-10-19T18:44:24Z",
- "stars": 93282,
+ "stars": 93450,
"watchers": 1183,
- "forks": 25228,
+ "forks": 25250,
"defaultBranch": "main"
},
{
@@ -1201,11 +1201,11 @@
"repo": "iluwatar/java-design-patterns",
"description": "Design patterns implemented in Java",
"createdAt": "2014-08-09T16:45:18Z",
- "updatedAt": "2025-10-27T07:35:54Z",
+ "updatedAt": "2025-10-28T11:55:32Z",
"pushedAt": "2025-10-21T21:30:34Z",
- "stars": 93215,
+ "stars": 93230,
"watchers": 3717,
- "forks": 27309,
+ "forks": 27312,
"defaultBranch": "master"
},
{
@@ -1214,9 +1214,9 @@
"repo": "puppeteer/puppeteer",
"description": "JavaScript API for Chrome and Firefox",
"createdAt": "2017-05-09T22:16:13Z",
- "updatedAt": "2025-10-27T07:31:12Z",
- "pushedAt": "2025-10-26T04:03:55Z",
- "stars": 92724,
+ "updatedAt": "2025-10-28T11:55:21Z",
+ "pushedAt": "2025-10-28T11:35:29Z",
+ "stars": 92732,
"watchers": 1184,
"forks": 9314,
"defaultBranch": "main"
@@ -1227,11 +1227,11 @@
"repo": "microsoft/Web-Dev-For-Beginners",
"description": "24 Lessons, 12 Weeks, Get Started as a Web Developer",
"createdAt": "2020-11-10T02:44:00Z",
- "updatedAt": "2025-10-27T07:27:35Z",
- "pushedAt": "2025-10-25T00:47:36Z",
- "stars": 92476,
+ "updatedAt": "2025-10-28T12:11:24Z",
+ "pushedAt": "2025-10-27T13:01:13Z",
+ "stars": 92494,
"watchers": 2690,
- "forks": 14330,
+ "forks": 14334,
"defaultBranch": "main"
},
{
@@ -1240,11 +1240,11 @@
"repo": "comfyanonymous/ComfyUI",
"description": "The most powerful and modular diffusion model GUI, api and backend with a graph/nodes interface.",
"createdAt": "2023-01-17T03:15:56Z",
- "updatedAt": "2025-10-27T07:46:53Z",
- "pushedAt": "2025-10-27T00:23:05Z",
- "stars": 92036,
- "watchers": 614,
- "forks": 10341,
+ "updatedAt": "2025-10-28T12:38:44Z",
+ "pushedAt": "2025-10-28T08:45:49Z",
+ "stars": 92150,
+ "watchers": 615,
+ "forks": 10367,
"defaultBranch": "master"
},
{
@@ -1253,11 +1253,11 @@
"repo": "jaywcjlove/awesome-mac",
"description": " Now we have become very big, Different from the original idea. Collect premium software in various categories.",
"createdAt": "2016-07-17T15:33:47Z",
- "updatedAt": "2025-10-27T07:50:40Z",
- "pushedAt": "2025-10-25T04:02:03Z",
- "stars": 91815,
+ "updatedAt": "2025-10-28T12:29:52Z",
+ "pushedAt": "2025-10-27T17:27:24Z",
+ "stars": 91942,
"watchers": 1517,
- "forks": 6947,
+ "forks": 6956,
"defaultBranch": "master"
},
{
@@ -1266,11 +1266,11 @@
"repo": "deepseek-ai/DeepSeek-R1",
"description": null,
"createdAt": "2025-01-20T11:57:28Z",
- "updatedAt": "2025-10-27T06:56:07Z",
+ "updatedAt": "2025-10-28T12:33:45Z",
"pushedAt": "2025-06-27T08:35:54Z",
- "stars": 91380,
+ "stars": 91406,
"watchers": 607,
- "forks": 11766,
+ "forks": 11768,
"defaultBranch": "main"
},
{
@@ -1279,11 +1279,11 @@
"repo": "fastapi/fastapi",
"description": "FastAPI framework, high performance, easy to learn, fast to code, ready for production",
"createdAt": "2018-12-08T08:21:47Z",
- "updatedAt": "2025-10-27T07:49:54Z",
- "pushedAt": "2025-10-23T20:55:59Z",
- "stars": 91203,
- "watchers": 723,
- "forks": 8123,
+ "updatedAt": "2025-10-28T11:31:45Z",
+ "pushedAt": "2025-10-28T07:50:29Z",
+ "stars": 91252,
+ "watchers": 721,
+ "forks": 8135,
"defaultBranch": "master"
},
{
@@ -1292,11 +1292,11 @@
"repo": "tailwindlabs/tailwindcss",
"description": "A utility-first CSS framework for rapid UI development.",
"createdAt": "2017-10-06T14:59:14Z",
- "updatedAt": "2025-10-27T07:48:03Z",
- "pushedAt": "2025-10-24T11:53:16Z",
- "stars": 90800,
+ "updatedAt": "2025-10-28T12:25:13Z",
+ "pushedAt": "2025-10-28T12:25:08Z",
+ "stars": 90816,
"watchers": 615,
- "forks": 4771,
+ "forks": 4766,
"defaultBranch": "main"
}
]
diff --git a/benchmarks/package.json b/benchmarks/package.json
index 1b071c8..d4316a6 100644
--- a/benchmarks/package.json
+++ b/benchmarks/package.json
@@ -5,7 +5,7 @@
"scripts": {
"benchmark:token-efficiency": "tsx scripts/token-efficiency-benchmark.ts",
"benchmark:accuracy": "tsx --env-file=.env scripts/accuracy-benchmark.ts",
- "fetch-github-data": "tsx scripts/fetch-github-data.ts",
+ "fetch:github-repos": "tsx scripts/fetch-github-repos.ts",
"test": "vitest"
},
"devDependencies": {
@@ -14,14 +14,16 @@
"@ai-sdk/openai": "^2.0.53",
"@ai-sdk/provider": "^2.0.0",
"@antfu/eslint-config": "^6.1.0",
+ "@clack/prompts": "^0.11.0",
"@faker-js/faker": "^10.1.0",
"ai": "^5.0.80",
- "consola": "^3.4.2",
"csv-stringify": "^6.6.0",
"fast-xml-parser": "^5.3.0",
"gpt-tokenizer": "^3.2.0",
"ofetch": "^1.4.1",
"p-map": "^7.0.3",
+ "p-queue": "^9.0.0",
+ "unstorage": "^1.17.1",
"yaml": "^2.8.1"
}
}
diff --git a/benchmarks/results/accuracy/models/claude-haiku-4-5-20251001 b/benchmarks/results/accuracy/models/claude-haiku-4-5-20251001
new file mode 100644
index 0000000..16cc232
--- /dev/null
+++ b/benchmarks/results/accuracy/models/claude-haiku-4-5-20251001
@@ -0,0 +1 @@
+[{"questionId":"q1","format":"json","model":"claude-haiku-4-5-20251001","expected":"56176","actual":"56176","isCorrect":true,"inputTokens":7870,"outputTokens":6,"latencyMs":1186.5200829999999},{"questionId":"q1","format":"toon","model":"claude-haiku-4-5-20251001","expected":"56176","actual":"56176","isCorrect":true,"inputTokens":2982,"outputTokens":6,"latencyMs":1549.690209},{"questionId":"q1","format":"csv","model":"claude-haiku-4-5-20251001","expected":"56176","actual":"56176","isCorrect":true,"inputTokens":2856,"outputTokens":6,"latencyMs":1782.079041},{"questionId":"q1","format":"xml","model":"claude-haiku-4-5-20251001","expected":"56176","actual":"56176","isCorrect":true,"inputTokens":9360,"outputTokens":6,"latencyMs":1753.305208},{"questionId":"q1","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"56176","actual":"56176","isCorrect":true,"inputTokens":5760,"outputTokens":6,"latencyMs":1592.0202080000001},{"questionId":"q2","format":"json","model":"claude-haiku-4-5-20251001","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":7869,"outputTokens":4,"latencyMs":1181.639291},{"questionId":"q2","format":"toon","model":"claude-haiku-4-5-20251001","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":2981,"outputTokens":4,"latencyMs":1178.6765409999998},{"questionId":"q2","format":"csv","model":"claude-haiku-4-5-20251001","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":2855,"outputTokens":4,"latencyMs":961.8413330000003},{"questionId":"q2","format":"xml","model":"claude-haiku-4-5-20251001","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":9359,"outputTokens":4,"latencyMs":1386.3486249999999},{"questionId":"q2","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":5759,"outputTokens":4,"latencyMs":1084.399458},{"questionId":"q3","format":"json","model":"claude-haiku-4-5-20251001","expect
ed":"lorenza.kunze@yahoo.com","actual":"lorenza.kunze@yahoo.com","isCorrect":true,"inputTokens":7874,"outputTokens":12,"latencyMs":2820.245208},{"questionId":"q3","format":"toon","model":"claude-haiku-4-5-20251001","expected":"lorenza.kunze@yahoo.com","actual":"lorenza.kunze@yahoo.com","isCorrect":true,"inputTokens":2986,"outputTokens":12,"latencyMs":946.8178749999997},{"questionId":"q3","format":"csv","model":"claude-haiku-4-5-20251001","expected":"lorenza.kunze@yahoo.com","actual":"lorenza.kunze@yahoo.com","isCorrect":true,"inputTokens":2860,"outputTokens":12,"latencyMs":839.2516249999999},{"questionId":"q3","format":"xml","model":"claude-haiku-4-5-20251001","expected":"lorenza.kunze@yahoo.com","actual":"lorenza.kunze@yahoo.com","isCorrect":true,"inputTokens":9364,"outputTokens":12,"latencyMs":2712.0417500000003},{"questionId":"q3","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"lorenza.kunze@yahoo.com","actual":"lorenza.kunze@yahoo.com","isCorrect":true,"inputTokens":5764,"outputTokens":12,"latencyMs":1343.5052500000002},{"questionId":"q4","format":"json","model":"claude-haiku-4-5-20251001","expected":"22","actual":"22","isCorrect":true,"inputTokens":7872,"outputTokens":5,"latencyMs":1078.4747500000003},{"questionId":"q4","format":"toon","model":"claude-haiku-4-5-20251001","expected":"22","actual":"22","isCorrect":true,"inputTokens":2984,"outputTokens":5,"latencyMs":1318.797125},{"questionId":"q4","format":"csv","model":"claude-haiku-4-5-20251001","expected":"22","actual":"22","isCorrect":true,"inputTokens":2858,"outputTokens":5,"latencyMs":937.7086669999999},{"questionId":"q4","format":"xml","model":"claude-haiku-4-5-20251001","expected":"22","actual":"22","isCorrect":true,"inputTokens":9362,"outputTokens":5,"latencyMs":1816.421792},{"questionId":"q4","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"22","actual":"22","isCorrect":true,"inputTokens":5762,"outputTokens":5,"latencyMs":1193.6218329999992},{"questionId":"q5","format":"js
on","model":"claude-haiku-4-5-20251001","expected":"no","actual":"No","isCorrect":true,"inputTokens":7867,"outputTokens":4,"latencyMs":1114.0669579999994},{"questionId":"q5","format":"toon","model":"claude-haiku-4-5-20251001","expected":"no","actual":"No","isCorrect":true,"inputTokens":2979,"outputTokens":4,"latencyMs":939.7355420000004},{"questionId":"q5","format":"csv","model":"claude-haiku-4-5-20251001","expected":"no","actual":"No","isCorrect":true,"inputTokens":2853,"outputTokens":4,"latencyMs":1360.3842089999998},{"questionId":"q5","format":"xml","model":"claude-haiku-4-5-20251001","expected":"no","actual":"No","isCorrect":true,"inputTokens":9357,"outputTokens":4,"latencyMs":1128.1693330000007},{"questionId":"q5","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"no","actual":"No","isCorrect":true,"inputTokens":5757,"outputTokens":4,"latencyMs":1098.7498749999995},{"questionId":"q6","format":"json","model":"claude-haiku-4-5-20251001","expected":"133081","actual":"133081","isCorrect":true,"inputTokens":7870,"outputTokens":6,"latencyMs":976.1237920000003},{"questionId":"q6","format":"toon","model":"claude-haiku-4-5-20251001","expected":"133081","actual":"133081","isCorrect":true,"inputTokens":2982,"outputTokens":6,"latencyMs":976.5635000000002},{"questionId":"q6","format":"csv","model":"claude-haiku-4-5-20251001","expected":"133081","actual":"133081","isCorrect":true,"inputTokens":2856,"outputTokens":6,"latencyMs":996.869584},{"questionId":"q6","format":"xml","model":"claude-haiku-4-5-20251001","expected":"133081","actual":"133081","isCorrect":true,"inputTokens":9360,"outputTokens":6,"latencyMs":1103.3886669999993},{"questionId":"q6","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"133081","actual":"133081","isCorrect":true,"inputTokens":5760,"outputTokens":6,"latencyMs":1013.2469170000004},{"questionId":"q7","format":"json","model":"claude-haiku-4-5-20251001","expected":"Engineering","actual":"Engineering","isCorrect":true,"inputToke
ns":7870,"outputTokens":4,"latencyMs":1250.1044579999998},{"questionId":"q7","format":"toon","model":"claude-haiku-4-5-20251001","expected":"Engineering","actual":"Engineering","isCorrect":true,"inputTokens":2982,"outputTokens":4,"latencyMs":1005.4357920000002},{"questionId":"q7","format":"csv","model":"claude-haiku-4-5-20251001","expected":"Engineering","actual":"Engineering","isCorrect":true,"inputTokens":2856,"outputTokens":4,"latencyMs":1037.9372080000012},{"questionId":"q7","format":"xml","model":"claude-haiku-4-5-20251001","expected":"Engineering","actual":"Engineering","isCorrect":true,"inputTokens":9360,"outputTokens":4,"latencyMs":1462.1290829999998},{"questionId":"q7","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"Engineering","actual":"Engineering","isCorrect":true,"inputTokens":5760,"outputTokens":4,"latencyMs":929.5629590000008},{"questionId":"q8","format":"json","model":"claude-haiku-4-5-20251001","expected":"delpha.russel@gmail.com","actual":"delpha.russel@gmail.com","isCorrect":true,"inputTokens":7872,"outputTokens":12,"latencyMs":1013.5330840000006},{"questionId":"q8","format":"toon","model":"claude-haiku-4-5-20251001","expected":"delpha.russel@gmail.com","actual":"delpha.russel@gmail.com","isCorrect":true,"inputTokens":2984,"outputTokens":12,"latencyMs":1235.7225830000007},{"questionId":"q8","format":"csv","model":"claude-haiku-4-5-20251001","expected":"delpha.russel@gmail.com","actual":"delpha.russel@gmail.com","isCorrect":true,"inputTokens":2858,"outputTokens":12,"latencyMs":913.2110410000005},{"questionId":"q8","format":"xml","model":"claude-haiku-4-5-20251001","expected":"delpha.russel@gmail.com","actual":"delpha.russel@gmail.com","isCorrect":true,"inputTokens":9362,"outputTokens":12,"latencyMs":1207.9759169999998},{"questionId":"q8","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"delpha.russel@gmail.com","actual":"delpha.russel@gmail.com","isCorrect":true,"inputTokens":5762,"outputTokens":12,"latencyMs":1318.65
575},{"questionId":"q9","format":"json","model":"claude-haiku-4-5-20251001","expected":"5","actual":"5","isCorrect":true,"inputTokens":7872,"outputTokens":5,"latencyMs":1529.1206249999977},{"questionId":"q9","format":"toon","model":"claude-haiku-4-5-20251001","expected":"5","actual":"5","isCorrect":true,"inputTokens":2984,"outputTokens":5,"latencyMs":871.3193330000013},{"questionId":"q9","format":"csv","model":"claude-haiku-4-5-20251001","expected":"5","actual":"5","isCorrect":true,"inputTokens":2858,"outputTokens":5,"latencyMs":1737.0914170000015},{"questionId":"q9","format":"xml","model":"claude-haiku-4-5-20251001","expected":"5","actual":"5","isCorrect":true,"inputTokens":9362,"outputTokens":5,"latencyMs":1219.0179590000007},{"questionId":"q9","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"5","actual":"5","isCorrect":true,"inputTokens":5762,"outputTokens":5,"latencyMs":1005.3616249999977},{"questionId":"q10","format":"json","model":"claude-haiku-4-5-20251001","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":7869,"outputTokens":4,"latencyMs":1087.0020829999994},{"questionId":"q10","format":"toon","model":"claude-haiku-4-5-20251001","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":2981,"outputTokens":4,"latencyMs":1483.961625},{"questionId":"q10","format":"csv","model":"claude-haiku-4-5-20251001","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":2855,"outputTokens":4,"latencyMs":872.3541670000013},{"questionId":"q10","format":"xml","model":"claude-haiku-4-5-20251001","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":9359,"outputTokens":4,"latencyMs":1126.5349999999999},{"questionId":"q10","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":5759,"outputTokens":4,"latencyMs":1077.6859160000022},{"questionId":"q11","format":"json","model":"claude-haiku-4-5-20251001","expected":"109064","actual":"109064","isCorrect":true,"inputTokens":78
69,"outputTokens":6,"latencyMs":1471.1526250000024},{"questionId":"q11","format":"toon","model":"claude-haiku-4-5-20251001","expected":"109064","actual":"109064","isCorrect":true,"inputTokens":2981,"outputTokens":6,"latencyMs":1113.5857079999987},{"questionId":"q11","format":"csv","model":"claude-haiku-4-5-20251001","expected":"109064","actual":"109064","isCorrect":true,"inputTokens":2855,"outputTokens":6,"latencyMs":1265.7001249999958},{"questionId":"q11","format":"xml","model":"claude-haiku-4-5-20251001","expected":"109064","actual":"109064","isCorrect":true,"inputTokens":9359,"outputTokens":6,"latencyMs":1241.400333000005},{"questionId":"q11","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"109064","actual":"109064","isCorrect":true,"inputTokens":5759,"outputTokens":6,"latencyMs":1096.3039170000047},{"questionId":"q12","format":"json","model":"claude-haiku-4-5-20251001","expected":"Operations","actual":"Operations","isCorrect":true,"inputTokens":7866,"outputTokens":4,"latencyMs":1085.2027090000047},{"questionId":"q12","format":"toon","model":"claude-haiku-4-5-20251001","expected":"Operations","actual":"Operations","isCorrect":true,"inputTokens":2978,"outputTokens":4,"latencyMs":1074.2748330000031},{"questionId":"q12","format":"csv","model":"claude-haiku-4-5-20251001","expected":"Operations","actual":"Operations","isCorrect":true,"inputTokens":2852,"outputTokens":4,"latencyMs":1060.0096249999988},{"questionId":"q12","format":"xml","model":"claude-haiku-4-5-20251001","expected":"Operations","actual":"Operations","isCorrect":true,"inputTokens":9356,"outputTokens":4,"latencyMs":990.6856249999983},{"questionId":"q12","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"Operations","actual":"Operations","isCorrect":true,"inputTokens":5756,"outputTokens":4,"latencyMs":1093.7282919999998},{"questionId":"q13","format":"json","model":"claude-haiku-4-5-20251001","expected":"henderson70@yahoo.com","actual":"henderson70@yahoo.com","isCorrect":true,"i
nputTokens":7869,"outputTokens":9,"latencyMs":1545.3704580000049},{"questionId":"q13","format":"toon","model":"claude-haiku-4-5-20251001","expected":"henderson70@yahoo.com","actual":"henderson70@yahoo.com","isCorrect":true,"inputTokens":2981,"outputTokens":9,"latencyMs":1492.468332999997},{"questionId":"q13","format":"csv","model":"claude-haiku-4-5-20251001","expected":"henderson70@yahoo.com","actual":"henderson70@yahoo.com","isCorrect":true,"inputTokens":2855,"outputTokens":9,"latencyMs":914.2506659999999},{"questionId":"q13","format":"xml","model":"claude-haiku-4-5-20251001","expected":"henderson70@yahoo.com","actual":"henderson70@yahoo.com","isCorrect":true,"inputTokens":9359,"outputTokens":9,"latencyMs":1158.3694999999934},{"questionId":"q13","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"henderson70@yahoo.com","actual":"henderson70@yahoo.com","isCorrect":true,"inputTokens":5759,"outputTokens":9,"latencyMs":967.9013750000013},{"questionId":"q14","format":"json","model":"claude-haiku-4-5-20251001","expected":"23","actual":"23","isCorrect":true,"inputTokens":7870,"outputTokens":5,"latencyMs":1064.4307919999992},{"questionId":"q14","format":"toon","model":"claude-haiku-4-5-20251001","expected":"23","actual":"23","isCorrect":true,"inputTokens":2982,"outputTokens":5,"latencyMs":1047.1003329999949},{"questionId":"q14","format":"csv","model":"claude-haiku-4-5-20251001","expected":"23","actual":"23","isCorrect":true,"inputTokens":2856,"outputTokens":5,"latencyMs":886.0778750000027},{"questionId":"q14","format":"xml","model":"claude-haiku-4-5-20251001","expected":"23","actual":"23","isCorrect":true,"inputTokens":9360,"outputTokens":5,"latencyMs":1139.3237919999956},{"questionId":"q14","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"23","actual":"23","isCorrect":true,"inputTokens":5760,"outputTokens":5,"latencyMs":1569.7935409999918},{"questionId":"q15","format":"json","model":"claude-haiku-4-5-20251001","expected":"yes","actual":"Yes","is
Correct":true,"inputTokens":7867,"outputTokens":4,"latencyMs":1291.6255419999943},{"questionId":"q15","format":"toon","model":"claude-haiku-4-5-20251001","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":2979,"outputTokens":4,"latencyMs":1056.2912080000096},{"questionId":"q15","format":"csv","model":"claude-haiku-4-5-20251001","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":2853,"outputTokens":4,"latencyMs":1008.6230829999986},{"questionId":"q15","format":"xml","model":"claude-haiku-4-5-20251001","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":9357,"outputTokens":4,"latencyMs":1359.699041999993},{"questionId":"q15","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":5757,"outputTokens":4,"latencyMs":1213.9625419999938},{"questionId":"q16","format":"json","model":"claude-haiku-4-5-20251001","expected":"89436","actual":"89436","isCorrect":true,"inputTokens":7870,"outputTokens":6,"latencyMs":1165.3891250000015},{"questionId":"q16","format":"toon","model":"claude-haiku-4-5-20251001","expected":"89436","actual":"89436","isCorrect":true,"inputTokens":2982,"outputTokens":6,"latencyMs":6645.897125000003},{"questionId":"q16","format":"csv","model":"claude-haiku-4-5-20251001","expected":"89436","actual":"89436","isCorrect":true,"inputTokens":2856,"outputTokens":6,"latencyMs":1145.3135829999956},{"questionId":"q16","format":"xml","model":"claude-haiku-4-5-20251001","expected":"89436","actual":"89436","isCorrect":true,"inputTokens":9360,"outputTokens":6,"latencyMs":1262.2502920000115},{"questionId":"q16","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"89436","actual":"89436","isCorrect":true,"inputTokens":5760,"outputTokens":6,"latencyMs":1027.6706660000054},{"questionId":"q17","format":"json","model":"claude-haiku-4-5-20251001","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":7872,"outputTokens":4,"latencyMs":1145.8684579999972},{"
questionId":"q17","format":"toon","model":"claude-haiku-4-5-20251001","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":2984,"outputTokens":4,"latencyMs":990.2954160000081},{"questionId":"q17","format":"csv","model":"claude-haiku-4-5-20251001","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":2858,"outputTokens":4,"latencyMs":875.828416999997},{"questionId":"q17","format":"xml","model":"claude-haiku-4-5-20251001","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":9362,"outputTokens":4,"latencyMs":1851.1875},{"questionId":"q17","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":5762,"outputTokens":4,"latencyMs":1215.2204999999958},{"questionId":"q18","format":"json","model":"claude-haiku-4-5-20251001","expected":"kelvin54@yahoo.com","actual":"kelvin54@yahoo.com","isCorrect":true,"inputTokens":7871,"outputTokens":10,"latencyMs":1288.0537919999915},{"questionId":"q18","format":"toon","model":"claude-haiku-4-5-20251001","expected":"kelvin54@yahoo.com","actual":"kelvin54@yahoo.com","isCorrect":true,"inputTokens":2983,"outputTokens":10,"latencyMs":1943.9848330000095},{"questionId":"q18","format":"csv","model":"claude-haiku-4-5-20251001","expected":"kelvin54@yahoo.com","actual":"kelvin54@yahoo.com","isCorrect":true,"inputTokens":2857,"outputTokens":10,"latencyMs":1131.0214169999963},{"questionId":"q18","format":"xml","model":"claude-haiku-4-5-20251001","expected":"kelvin54@yahoo.com","actual":"kelvin54@yahoo.com","isCorrect":true,"inputTokens":9361,"outputTokens":10,"latencyMs":1305.6347090000054},{"questionId":"q18","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"kelvin54@yahoo.com","actual":"kelvin54@yahoo.com","isCorrect":true,"inputTokens":5761,"outputTokens":10,"latencyMs":1566.2271670000046},{"questionId":"q19","format":"json","model":"claude-haiku-4-5-20251001","expected":"4","actual":"4","isCorrect":tr
ue,"inputTokens":7874,"outputTokens":5,"latencyMs":1408.747875000001},{"questionId":"q19","format":"toon","model":"claude-haiku-4-5-20251001","expected":"4","actual":"4","isCorrect":true,"inputTokens":2986,"outputTokens":5,"latencyMs":1232.375125000006},{"questionId":"q19","format":"csv","model":"claude-haiku-4-5-20251001","expected":"4","actual":"4","isCorrect":true,"inputTokens":2860,"outputTokens":5,"latencyMs":15343.73120899999},{"questionId":"q19","format":"xml","model":"claude-haiku-4-5-20251001","expected":"4","actual":"4","isCorrect":true,"inputTokens":9364,"outputTokens":5,"latencyMs":1321.6012499999924},{"questionId":"q19","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"4","actual":"4","isCorrect":true,"inputTokens":5764,"outputTokens":5,"latencyMs":1017.6581250000017},{"questionId":"q20","format":"json","model":"claude-haiku-4-5-20251001","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":7867,"outputTokens":4,"latencyMs":1332.0981669999892},{"questionId":"q20","format":"toon","model":"claude-haiku-4-5-20251001","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":2979,"outputTokens":4,"latencyMs":1177.5915830000013},{"questionId":"q20","format":"csv","model":"claude-haiku-4-5-20251001","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":2853,"outputTokens":4,"latencyMs":1177.6698340000003},{"questionId":"q20","format":"xml","model":"claude-haiku-4-5-20251001","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":9357,"outputTokens":4,"latencyMs":1062.9883330000011},{"questionId":"q20","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":5757,"outputTokens":4,"latencyMs":1464.3220000000001},{"questionId":"q21","format":"json","model":"claude-haiku-4-5-20251001","expected":"17","actual":"15","isCorrect":false,"inputTokens":7865,"outputTokens":5,"latencyMs":1493.3682079999999},{"questionId":"q21","format":"toon","model":"claude-haiku-4-5-2
0251001","expected":"17","actual":"16","isCorrect":false,"inputTokens":2977,"outputTokens":5,"latencyMs":1097.149790999989},{"questionId":"q21","format":"csv","model":"claude-haiku-4-5-20251001","expected":"17","actual":"15","isCorrect":false,"inputTokens":2851,"outputTokens":5,"latencyMs":1152.1389590000035},{"questionId":"q21","format":"xml","model":"claude-haiku-4-5-20251001","expected":"17","actual":"15","isCorrect":false,"inputTokens":9355,"outputTokens":5,"latencyMs":1219.1671250000072},{"questionId":"q21","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"17","actual":"15","isCorrect":false,"inputTokens":5755,"outputTokens":5,"latencyMs":1360.0843339999992},{"questionId":"q22","format":"json","model":"claude-haiku-4-5-20251001","expected":"17","actual":"15","isCorrect":false,"inputTokens":7865,"outputTokens":5,"latencyMs":1166.0337080000027},{"questionId":"q22","format":"toon","model":"claude-haiku-4-5-20251001","expected":"17","actual":"14","isCorrect":false,"inputTokens":2977,"outputTokens":5,"latencyMs":1691.5424579999963},{"questionId":"q22","format":"csv","model":"claude-haiku-4-5-20251001","expected":"17","actual":"15","isCorrect":false,"inputTokens":2851,"outputTokens":5,"latencyMs":1431.4153340000048},{"questionId":"q22","format":"xml","model":"claude-haiku-4-5-20251001","expected":"17","actual":"15","isCorrect":false,"inputTokens":9355,"outputTokens":5,"latencyMs":1154.9970829999947},{"questionId":"q22","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"17","actual":"15","isCorrect":false,"inputTokens":5755,"outputTokens":5,"latencyMs":941.9446660000103},{"questionId":"q23","format":"json","model":"claude-haiku-4-5-20251001","expected":"17","actual":"15","isCorrect":false,"inputTokens":7865,"outputTokens":5,"latencyMs":1896.4274999999907},{"questionId":"q23","format":"toon","model":"claude-haiku-4-5-20251001","expected":"17","actual":"15","isCorrect":false,"inputTokens":2977,"outputTokens":5,"latencyMs":1242.7352919999976},{
"questionId":"q23","format":"csv","model":"claude-haiku-4-5-20251001","expected":"17","actual":"15","isCorrect":false,"inputTokens":2851,"outputTokens":5,"latencyMs":959.4865830000053},{"questionId":"q23","format":"xml","model":"claude-haiku-4-5-20251001","expected":"17","actual":"15","isCorrect":false,"inputTokens":9355,"outputTokens":5,"latencyMs":946.2465830000001},{"questionId":"q23","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"17","actual":"15","isCorrect":false,"inputTokens":5755,"outputTokens":5,"latencyMs":1183.5531249999913},{"questionId":"q24","format":"json","model":"claude-haiku-4-5-20251001","expected":"17","actual":"15","isCorrect":false,"inputTokens":7865,"outputTokens":5,"latencyMs":1538.283208000008},{"questionId":"q24","format":"toon","model":"claude-haiku-4-5-20251001","expected":"17","actual":"15","isCorrect":false,"inputTokens":2977,"outputTokens":5,"latencyMs":1080.8083329999936},{"questionId":"q24","format":"csv","model":"claude-haiku-4-5-20251001","expected":"17","actual":"15","isCorrect":false,"inputTokens":2851,"outputTokens":5,"latencyMs":1045.9360000000015},{"questionId":"q24","format":"xml","model":"claude-haiku-4-5-20251001","expected":"17","actual":"15","isCorrect":false,"inputTokens":9355,"outputTokens":5,"latencyMs":1093.3812079999916},{"questionId":"q24","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"17","actual":"15","isCorrect":false,"inputTokens":5755,"outputTokens":5,"latencyMs":1018.5912499999977},{"questionId":"q25","format":"json","model":"claude-haiku-4-5-20251001","expected":"16","actual":"12","isCorrect":false,"inputTokens":7865,"outputTokens":5,"latencyMs":1213.112832999992},{"questionId":"q25","format":"toon","model":"claude-haiku-4-5-20251001","expected":"16","actual":"15","isCorrect":false,"inputTokens":2977,"outputTokens":5,"latencyMs":1048.4280830000062},{"questionId":"q25","format":"csv","model":"claude-haiku-4-5-20251001","expected":"16","actual":"15","isCorrect":false,"inputToke
ns":2851,"outputTokens":5,"latencyMs":1070.1919170000037},{"questionId":"q25","format":"xml","model":"claude-haiku-4-5-20251001","expected":"16","actual":"12","isCorrect":false,"inputTokens":9355,"outputTokens":5,"latencyMs":1166.1562910000066},{"questionId":"q25","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"16","actual":"15","isCorrect":false,"inputTokens":5755,"outputTokens":5,"latencyMs":862.2464579999796},{"questionId":"q26","format":"json","model":"claude-haiku-4-5-20251001","expected":"16","actual":"10","isCorrect":false,"inputTokens":7865,"outputTokens":5,"latencyMs":1598.766666999989},{"questionId":"q26","format":"toon","model":"claude-haiku-4-5-20251001","expected":"16","actual":"12","isCorrect":false,"inputTokens":2977,"outputTokens":5,"latencyMs":853.0929170000018},{"questionId":"q26","format":"csv","model":"claude-haiku-4-5-20251001","expected":"16","actual":"15","isCorrect":false,"inputTokens":2851,"outputTokens":5,"latencyMs":872.8500000000058},{"questionId":"q26","format":"xml","model":"claude-haiku-4-5-20251001","expected":"16","actual":"10","isCorrect":false,"inputTokens":9355,"outputTokens":5,"latencyMs":1258.3260829999927},{"questionId":"q26","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"16","actual":"12","isCorrect":false,"inputTokens":5755,"outputTokens":5,"latencyMs":1102.5700840000063},{"questionId":"q27","format":"json","model":"claude-haiku-4-5-20251001","expected":"91","actual":"89","isCorrect":false,"inputTokens":7870,"outputTokens":5,"latencyMs":1381.9074580000015},{"questionId":"q27","format":"toon","model":"claude-haiku-4-5-20251001","expected":"91","actual":"85","isCorrect":false,"inputTokens":2982,"outputTokens":5,"latencyMs":867.2984169999836},{"questionId":"q27","format":"csv","model":"claude-haiku-4-5-20251001","expected":"91","actual":"78","isCorrect":false,"inputTokens":2856,"outputTokens":5,"latencyMs":810.5503749999916},{"questionId":"q27","format":"xml","model":"claude-haiku-4-5-20251001","
expected":"91","actual":"89","isCorrect":false,"inputTokens":9360,"outputTokens":5,"latencyMs":2055.4735829999845},{"questionId":"q27","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"91","actual":"89","isCorrect":false,"inputTokens":5760,"outputTokens":5,"latencyMs":1134.9885420000064},{"questionId":"q28","format":"json","model":"claude-haiku-4-5-20251001","expected":"67","actual":"57","isCorrect":false,"inputTokens":7870,"outputTokens":5,"latencyMs":1497.8240410000144},{"questionId":"q28","format":"toon","model":"claude-haiku-4-5-20251001","expected":"67","actual":"42","isCorrect":false,"inputTokens":2982,"outputTokens":5,"latencyMs":1041.6871659999888},{"questionId":"q28","format":"csv","model":"claude-haiku-4-5-20251001","expected":"67","actual":"42","isCorrect":false,"inputTokens":2856,"outputTokens":5,"latencyMs":1134.1471669999883},{"questionId":"q28","format":"xml","model":"claude-haiku-4-5-20251001","expected":"67","actual":"57","isCorrect":false,"inputTokens":9360,"outputTokens":5,"latencyMs":1027.211540999997},{"questionId":"q28","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"67","actual":"54","isCorrect":false,"inputTokens":5760,"outputTokens":5,"latencyMs":1031.266249999986},{"questionId":"q29","format":"json","model":"claude-haiku-4-5-20251001","expected":"41","actual":"32","isCorrect":false,"inputTokens":7870,"outputTokens":5,"latencyMs":1564.15516699999},{"questionId":"q29","format":"toon","model":"claude-haiku-4-5-20251001","expected":"41","actual":"27","isCorrect":false,"inputTokens":2982,"outputTokens":5,"latencyMs":1029.4010420000122},{"questionId":"q29","format":"csv","model":"claude-haiku-4-5-20251001","expected":"41","actual":"27","isCorrect":false,"inputTokens":2856,"outputTokens":5,"latencyMs":1028.4109579999931},{"questionId":"q29","format":"xml","model":"claude-haiku-4-5-20251001","expected":"41","actual":"31","isCorrect":false,"inputTokens":9360,"outputTokens":5,"latencyMs":2108.232167000009},{"questionId":
"q29","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"41","actual":"31","isCorrect":false,"inputTokens":5760,"outputTokens":5,"latencyMs":874.9285410000011},{"questionId":"q30","format":"json","model":"claude-haiku-4-5-20251001","expected":"26","actual":"20","isCorrect":false,"inputTokens":7870,"outputTokens":5,"latencyMs":998.8196670000034},{"questionId":"q30","format":"toon","model":"claude-haiku-4-5-20251001","expected":"26","actual":"16","isCorrect":false,"inputTokens":2982,"outputTokens":5,"latencyMs":1129.3641669999924},{"questionId":"q30","format":"csv","model":"claude-haiku-4-5-20251001","expected":"26","actual":"16","isCorrect":false,"inputTokens":2856,"outputTokens":5,"latencyMs":986.5463340000133},{"questionId":"q30","format":"xml","model":"claude-haiku-4-5-20251001","expected":"26","actual":"23","isCorrect":false,"inputTokens":9360,"outputTokens":5,"latencyMs":1208.9404170000053},{"questionId":"q30","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"26","actual":"20","isCorrect":false,"inputTokens":5760,"outputTokens":5,"latencyMs":1296.2784589999937},{"questionId":"q31","format":"json","model":"claude-haiku-4-5-20251001","expected":"100","actual":"100","isCorrect":true,"inputTokens":7866,"outputTokens":5,"latencyMs":1082.367750000005},{"questionId":"q31","format":"toon","model":"claude-haiku-4-5-20251001","expected":"100","actual":"100","isCorrect":true,"inputTokens":2978,"outputTokens":5,"latencyMs":1029.9465830000117},{"questionId":"q31","format":"csv","model":"claude-haiku-4-5-20251001","expected":"100","actual":"100","isCorrect":true,"inputTokens":2852,"outputTokens":5,"latencyMs":964.783958999993},{"questionId":"q31","format":"xml","model":"claude-haiku-4-5-20251001","expected":"100","actual":"100","isCorrect":true,"inputTokens":9356,"outputTokens":5,"latencyMs":1144.9031670000113},{"questionId":"q31","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"100","actual":"100","isCorrect":true,"inputTokens":5756,
"outputTokens":5,"latencyMs":1221.6876249999914},{"questionId":"q32","format":"json","model":"claude-haiku-4-5-20251001","expected":"96503","actual":"$99,945.47","isCorrect":false,"inputTokens":7867,"outputTokens":9,"latencyMs":1412.950708999997},{"questionId":"q32","format":"toon","model":"claude-haiku-4-5-20251001","expected":"96503","actual":"98,767.41","isCorrect":false,"inputTokens":2979,"outputTokens":9,"latencyMs":951.2037919999857},{"questionId":"q32","format":"csv","model":"claude-haiku-4-5-20251001","expected":"96503","actual":"98,767.29","isCorrect":false,"inputTokens":2853,"outputTokens":9,"latencyMs":1418.869875000004},{"questionId":"q32","format":"xml","model":"claude-haiku-4-5-20251001","expected":"96503","actual":"99,945.67","isCorrect":false,"inputTokens":9357,"outputTokens":9,"latencyMs":1328.8714580000087},{"questionId":"q32","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"96503","actual":"100,889.57","isCorrect":false,"inputTokens":5757,"outputTokens":9,"latencyMs":1629.578165999992},{"questionId":"q33","format":"json","model":"claude-haiku-4-5-20251001","expected":"78","actual":"81","isCorrect":false,"inputTokens":7864,"outputTokens":5,"latencyMs":1490.021458000003},{"questionId":"q33","format":"toon","model":"claude-haiku-4-5-20251001","expected":"78","actual":"78","isCorrect":true,"inputTokens":2976,"outputTokens":5,"latencyMs":1234.7220830000006},{"questionId":"q33","format":"csv","model":"claude-haiku-4-5-20251001","expected":"78","actual":"73","isCorrect":false,"inputTokens":2850,"outputTokens":5,"latencyMs":1048.0889169999864},{"questionId":"q33","format":"xml","model":"claude-haiku-4-5-20251001","expected":"78","actual":"78","isCorrect":true,"inputTokens":9354,"outputTokens":5,"latencyMs":1041.6995839999872},{"questionId":"q33","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"78","actual":"73","isCorrect":false,"inputTokens":5754,"outputTokens":5,"latencyMs":1220.4587089999986},{"questionId":"q34","format":"
json","model":"claude-haiku-4-5-20251001","expected":"22","actual":"15","isCorrect":false,"inputTokens":7864,"outputTokens":5,"latencyMs":1041.8724590000056},{"questionId":"q34","format":"toon","model":"claude-haiku-4-5-20251001","expected":"22","actual":"17","isCorrect":false,"inputTokens":2976,"outputTokens":5,"latencyMs":868.8058749999909},{"questionId":"q34","format":"csv","model":"claude-haiku-4-5-20251001","expected":"22","actual":"20","isCorrect":false,"inputTokens":2850,"outputTokens":5,"latencyMs":1096.7095409999893},{"questionId":"q34","format":"xml","model":"claude-haiku-4-5-20251001","expected":"22","actual":"15","isCorrect":false,"inputTokens":9354,"outputTokens":5,"latencyMs":1311.4117499999993},{"questionId":"q34","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"22","actual":"16","isCorrect":false,"inputTokens":5754,"outputTokens":5,"latencyMs":920.101708000002},{"questionId":"q35","format":"json","model":"claude-haiku-4-5-20251001","expected":"12","actual":"9","isCorrect":false,"inputTokens":7872,"outputTokens":5,"latencyMs":1390.3867909999972},{"questionId":"q35","format":"toon","model":"claude-haiku-4-5-20251001","expected":"12","actual":"9","isCorrect":false,"inputTokens":2984,"outputTokens":5,"latencyMs":956.030666000006},{"questionId":"q35","format":"csv","model":"claude-haiku-4-5-20251001","expected":"12","actual":"10","isCorrect":false,"inputTokens":2858,"outputTokens":5,"latencyMs":917.8577079999959},{"questionId":"q35","format":"xml","model":"claude-haiku-4-5-20251001","expected":"12","actual":"11","isCorrect":false,"inputTokens":9362,"outputTokens":5,"latencyMs":1165.3076660000079},{"questionId":"q35","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"12","actual":"10","isCorrect":false,"inputTokens":5762,"outputTokens":5,"latencyMs":1244.3233749999781},{"questionId":"q36","format":"json","model":"claude-haiku-4-5-20251001","expected":"11","actual":"7","isCorrect":false,"inputTokens":7872,"outputTokens":5,"latenc
yMs":975.3720840000024},{"questionId":"q36","format":"toon","model":"claude-haiku-4-5-20251001","expected":"11","actual":"7","isCorrect":false,"inputTokens":2984,"outputTokens":5,"latencyMs":909.2385829999985},{"questionId":"q36","format":"csv","model":"claude-haiku-4-5-20251001","expected":"11","actual":"8","isCorrect":false,"inputTokens":2858,"outputTokens":5,"latencyMs":986.5588329999882},{"questionId":"q36","format":"xml","model":"claude-haiku-4-5-20251001","expected":"11","actual":"8","isCorrect":false,"inputTokens":9362,"outputTokens":5,"latencyMs":1210.949082999985},{"questionId":"q36","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"11","actual":"8","isCorrect":false,"inputTokens":5762,"outputTokens":5,"latencyMs":1132.0805830000027},{"questionId":"q37","format":"json","model":"claude-haiku-4-5-20251001","expected":"11","actual":"8","isCorrect":false,"inputTokens":7872,"outputTokens":5,"latencyMs":1822.2138749999867},{"questionId":"q37","format":"toon","model":"claude-haiku-4-5-20251001","expected":"11","actual":"7","isCorrect":false,"inputTokens":2984,"outputTokens":5,"latencyMs":802.9654169999994},{"questionId":"q37","format":"csv","model":"claude-haiku-4-5-20251001","expected":"11","actual":"8","isCorrect":false,"inputTokens":2858,"outputTokens":5,"latencyMs":942.9198750000214},{"questionId":"q37","format":"xml","model":"claude-haiku-4-5-20251001","expected":"11","actual":"9","isCorrect":false,"inputTokens":9362,"outputTokens":5,"latencyMs":1467.981999999989},{"questionId":"q37","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"11","actual":"9","isCorrect":false,"inputTokens":5762,"outputTokens":5,"latencyMs":1029.5724589999882},{"questionId":"q38","format":"json","model":"claude-haiku-4-5-20251001","expected":"12","actual":"7","isCorrect":false,"inputTokens":7872,"outputTokens":5,"latencyMs":1234.3802500000165},{"questionId":"q38","format":"toon","model":"claude-haiku-4-5-20251001","expected":"12","actual":"6","isCorrect":fal
se,"inputTokens":2984,"outputTokens":5,"latencyMs":855.1812079999945},{"questionId":"q38","format":"csv","model":"claude-haiku-4-5-20251001","expected":"12","actual":"7","isCorrect":false,"inputTokens":2858,"outputTokens":5,"latencyMs":1078.424875000026},{"questionId":"q38","format":"xml","model":"claude-haiku-4-5-20251001","expected":"12","actual":"8","isCorrect":false,"inputTokens":9362,"outputTokens":5,"latencyMs":1168.164334000001},{"questionId":"q38","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"12","actual":"8","isCorrect":false,"inputTokens":5762,"outputTokens":5,"latencyMs":1084.7320839999884},{"questionId":"q39","format":"json","model":"claude-haiku-4-5-20251001","expected":"11","actual":"7","isCorrect":false,"inputTokens":7872,"outputTokens":5,"latencyMs":928.001540999976},{"questionId":"q39","format":"toon","model":"claude-haiku-4-5-20251001","expected":"11","actual":"8","isCorrect":false,"inputTokens":2984,"outputTokens":5,"latencyMs":1456.4904170000227},{"questionId":"q39","format":"csv","model":"claude-haiku-4-5-20251001","expected":"11","actual":"8","isCorrect":false,"inputTokens":2858,"outputTokens":5,"latencyMs":1239.191125000012},{"questionId":"q39","format":"xml","model":"claude-haiku-4-5-20251001","expected":"11","actual":"7","isCorrect":false,"inputTokens":9362,"outputTokens":5,"latencyMs":1170.1835409999767},{"questionId":"q39","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"11","actual":"8","isCorrect":false,"inputTokens":5762,"outputTokens":5,"latencyMs":1126.3050839999923},{"questionId":"q40","format":"json","model":"claude-haiku-4-5-20251001","expected":"10","actual":"7","isCorrect":false,"inputTokens":7872,"outputTokens":5,"latencyMs":918.451166999992},{"questionId":"q40","format":"toon","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":2984,"outputTokens":5,"latencyMs":817.9093330000178},{"questionId":"q40","format":"csv","model":"claude-haiku-4-5-20251001","
expected":"10","actual":"8","isCorrect":false,"inputTokens":2858,"outputTokens":5,"latencyMs":1003.6671669999778},{"questionId":"q40","format":"xml","model":"claude-haiku-4-5-20251001","expected":"10","actual":"7","isCorrect":false,"inputTokens":9362,"outputTokens":5,"latencyMs":966.2543329999899},{"questionId":"q40","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":5762,"outputTokens":5,"latencyMs":934.2230000000272},{"questionId":"q41","format":"json","model":"claude-haiku-4-5-20251001","expected":"63","actual":"72","isCorrect":false,"inputTokens":7872,"outputTokens":5,"latencyMs":1121.4930000000168},{"questionId":"q41","format":"toon","model":"claude-haiku-4-5-20251001","expected":"63","actual":"62","isCorrect":false,"inputTokens":2984,"outputTokens":5,"latencyMs":1429.5526669999817},{"questionId":"q41","format":"csv","model":"claude-haiku-4-5-20251001","expected":"63","actual":"62","isCorrect":false,"inputTokens":2858,"outputTokens":5,"latencyMs":1052.8274580000143},{"questionId":"q41","format":"xml","model":"claude-haiku-4-5-20251001","expected":"63","actual":"67","isCorrect":false,"inputTokens":9362,"outputTokens":5,"latencyMs":1366.8131669999857},{"questionId":"q41","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"63","actual":"62","isCorrect":false,"inputTokens":5762,"outputTokens":5,"latencyMs":1374.8300410000084},{"questionId":"q42","format":"json","model":"claude-haiku-4-5-20251001","expected":"53","actual":"54","isCorrect":false,"inputTokens":7872,"outputTokens":5,"latencyMs":1131.5352909999783},{"questionId":"q42","format":"toon","model":"claude-haiku-4-5-20251001","expected":"53","actual":"42","isCorrect":false,"inputTokens":2984,"outputTokens":5,"latencyMs":1013.7067920000118},{"questionId":"q42","format":"csv","model":"claude-haiku-4-5-20251001","expected":"53","actual":"42","isCorrect":false,"inputTokens":2858,"outputTokens":5,"latencyMs":1538.445458000002},{"questionId":"
q42","format":"xml","model":"claude-haiku-4-5-20251001","expected":"53","actual":"54","isCorrect":false,"inputTokens":9362,"outputTokens":5,"latencyMs":1219.700749999989},{"questionId":"q42","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"53","actual":"47","isCorrect":false,"inputTokens":5762,"outputTokens":5,"latencyMs":1558.5241660000174},{"questionId":"q43","format":"json","model":"claude-haiku-4-5-20251001","expected":"39","actual":"31","isCorrect":false,"inputTokens":7872,"outputTokens":5,"latencyMs":1105.588958999986},{"questionId":"q43","format":"toon","model":"claude-haiku-4-5-20251001","expected":"39","actual":"32","isCorrect":false,"inputTokens":2984,"outputTokens":5,"latencyMs":1671.423999999999},{"questionId":"q43","format":"csv","model":"claude-haiku-4-5-20251001","expected":"39","actual":"31","isCorrect":false,"inputTokens":2858,"outputTokens":5,"latencyMs":1471.0894580000022},{"questionId":"q43","format":"xml","model":"claude-haiku-4-5-20251001","expected":"39","actual":"31","isCorrect":false,"inputTokens":9362,"outputTokens":5,"latencyMs":1021.8086670000048},{"questionId":"q43","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"39","actual":"31","isCorrect":false,"inputTokens":5762,"outputTokens":5,"latencyMs":3236.29237499999},{"questionId":"q44","format":"json","model":"claude-haiku-4-5-20251001","expected":"16","actual":"13","isCorrect":false,"inputTokens":7872,"outputTokens":5,"latencyMs":1345.137875000015},{"questionId":"q44","format":"toon","model":"claude-haiku-4-5-20251001","expected":"16","actual":"12","isCorrect":false,"inputTokens":2984,"outputTokens":5,"latencyMs":1173.6619169999904},{"questionId":"q44","format":"csv","model":"claude-haiku-4-5-20251001","expected":"16","actual":"12","isCorrect":false,"inputTokens":2858,"outputTokens":5,"latencyMs":1064.5449589999916},{"questionId":"q44","format":"xml","model":"claude-haiku-4-5-20251001","expected":"16","actual":"15","isCorrect":false,"inputTokens":9362,"output
Tokens":5,"latencyMs":1174.7825829999929},{"questionId":"q44","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"16","actual":"13","isCorrect":false,"inputTokens":5762,"outputTokens":5,"latencyMs":891.3968750000058},{"questionId":"q45","format":"json","model":"claude-haiku-4-5-20251001","expected":"11","actual":"7","isCorrect":false,"inputTokens":7873,"outputTokens":5,"latencyMs":1206.6643329999933},{"questionId":"q45","format":"toon","model":"claude-haiku-4-5-20251001","expected":"11","actual":"9","isCorrect":false,"inputTokens":2985,"outputTokens":5,"latencyMs":868.8163749999949},{"questionId":"q45","format":"csv","model":"claude-haiku-4-5-20251001","expected":"11","actual":"10","isCorrect":false,"inputTokens":2859,"outputTokens":5,"latencyMs":1792.9049579999992},{"questionId":"q45","format":"xml","model":"claude-haiku-4-5-20251001","expected":"11","actual":"9","isCorrect":false,"inputTokens":9363,"outputTokens":5,"latencyMs":1148.437707999983},{"questionId":"q45","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"11","actual":"9","isCorrect":false,"inputTokens":5763,"outputTokens":5,"latencyMs":1029.0542910000077},{"questionId":"q46","format":"json","model":"claude-haiku-4-5-20251001","expected":"8","actual":"7","isCorrect":false,"inputTokens":7873,"outputTokens":5,"latencyMs":1053.060541999992},{"questionId":"q46","format":"toon","model":"claude-haiku-4-5-20251001","expected":"8","actual":"8","isCorrect":true,"inputTokens":2985,"outputTokens":5,"latencyMs":879.3562079999829},{"questionId":"q46","format":"csv","model":"claude-haiku-4-5-20251001","expected":"8","actual":"8","isCorrect":true,"inputTokens":2859,"outputTokens":5,"latencyMs":799.1071669999801},{"questionId":"q46","format":"xml","model":"claude-haiku-4-5-20251001","expected":"8","actual":"8","isCorrect":true,"inputTokens":9363,"outputTokens":5,"latencyMs":929.9557909999858},{"questionId":"q46","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"8","actual":"8","isC
orrect":true,"inputTokens":5763,"outputTokens":5,"latencyMs":1211.8399999999965},{"questionId":"q47","format":"json","model":"claude-haiku-4-5-20251001","expected":"15","actual":"8","isCorrect":false,"inputTokens":7873,"outputTokens":5,"latencyMs":1152.2910409999895},{"questionId":"q47","format":"toon","model":"claude-haiku-4-5-20251001","expected":"15","actual":"8","isCorrect":false,"inputTokens":2985,"outputTokens":5,"latencyMs":1118.5688750000263},{"questionId":"q47","format":"csv","model":"claude-haiku-4-5-20251001","expected":"15","actual":"8","isCorrect":false,"inputTokens":2859,"outputTokens":5,"latencyMs":1046.1623750000144},{"questionId":"q47","format":"xml","model":"claude-haiku-4-5-20251001","expected":"15","actual":"9","isCorrect":false,"inputTokens":9363,"outputTokens":5,"latencyMs":1176.6967500000028},{"questionId":"q47","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"15","actual":"9","isCorrect":false,"inputTokens":5763,"outputTokens":5,"latencyMs":1394.3756660000072},{"questionId":"q48","format":"json","model":"claude-haiku-4-5-20251001","expected":"12","actual":"11","isCorrect":false,"inputTokens":7866,"outputTokens":5,"latencyMs":1269.7089170000108},{"questionId":"q48","format":"toon","model":"claude-haiku-4-5-20251001","expected":"12","actual":"13","isCorrect":false,"inputTokens":2978,"outputTokens":5,"latencyMs":953.8776249999937},{"questionId":"q48","format":"csv","model":"claude-haiku-4-5-20251001","expected":"12","actual":"13","isCorrect":false,"inputTokens":2852,"outputTokens":5,"latencyMs":1155.8576659999962},{"questionId":"q48","format":"xml","model":"claude-haiku-4-5-20251001","expected":"12","actual":"13","isCorrect":false,"inputTokens":9356,"outputTokens":5,"latencyMs":1102.9915830000245},{"questionId":"q48","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"12","actual":"13","isCorrect":false,"inputTokens":5756,"outputTokens":5,"latencyMs":1021.1875},{"questionId":"q49","format":"json","model":"claude-haiku-
4-5-20251001","expected":"11","actual":"11","isCorrect":true,"inputTokens":7866,"outputTokens":5,"latencyMs":1216.4407080000092},{"questionId":"q49","format":"toon","model":"claude-haiku-4-5-20251001","expected":"11","actual":"13","isCorrect":false,"inputTokens":2978,"outputTokens":5,"latencyMs":887.568707999977},{"questionId":"q49","format":"csv","model":"claude-haiku-4-5-20251001","expected":"11","actual":"13","isCorrect":false,"inputTokens":2852,"outputTokens":5,"latencyMs":1225.3905000000086},{"questionId":"q49","format":"xml","model":"claude-haiku-4-5-20251001","expected":"11","actual":"13","isCorrect":false,"inputTokens":9356,"outputTokens":5,"latencyMs":1190.3869160000177},{"questionId":"q49","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"11","actual":"13","isCorrect":false,"inputTokens":5756,"outputTokens":5,"latencyMs":1719.5950000000012},{"questionId":"q50","format":"json","model":"claude-haiku-4-5-20251001","expected":"14","actual":"11","isCorrect":false,"inputTokens":7866,"outputTokens":5,"latencyMs":1738.0369169999904},{"questionId":"q50","format":"toon","model":"claude-haiku-4-5-20251001","expected":"14","actual":"13","isCorrect":false,"inputTokens":2978,"outputTokens":5,"latencyMs":1201.107250000001},{"questionId":"q50","format":"csv","model":"claude-haiku-4-5-20251001","expected":"14","actual":"11","isCorrect":false,"inputTokens":2852,"outputTokens":5,"latencyMs":907.3295829999843},{"questionId":"q50","format":"xml","model":"claude-haiku-4-5-20251001","expected":"14","actual":"13","isCorrect":false,"inputTokens":9356,"outputTokens":5,"latencyMs":1436.143000000011},{"questionId":"q50","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"14","actual":"13","isCorrect":false,"inputTokens":5756,"outputTokens":5,"latencyMs":1883.2225000000035},{"questionId":"q51","format":"json","model":"claude-haiku-4-5-20251001","expected":"96.17","actual":"96.17","isCorrect":true,"inputTokens":11906,"outputTokens":7,"latencyMs":1635.838040999
9518},{"questionId":"q51","format":"toon","model":"claude-haiku-4-5-20251001","expected":"96.17","actual":"96.17","isCorrect":true,"inputTokens":6992,"outputTokens":7,"latencyMs":1462.3234999999986},{"questionId":"q51","format":"csv","model":"claude-haiku-4-5-20251001","expected":"96.17","actual":"96.17","isCorrect":true,"inputTokens":8413,"outputTokens":7,"latencyMs":1300.2047499999753},{"questionId":"q51","format":"xml","model":"claude-haiku-4-5-20251001","expected":"96.17","actual":"96.17","isCorrect":true,"inputTokens":13379,"outputTokens":7,"latencyMs":1363.68604099995},{"questionId":"q51","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"96.17","actual":"96.17","isCorrect":true,"inputTokens":8384,"outputTokens":7,"latencyMs":1285.6198749999749},{"questionId":"q52","format":"json","model":"claude-haiku-4-5-20251001","expected":"shipped","actual":"shipped","isCorrect":true,"inputTokens":11906,"outputTokens":4,"latencyMs":1320.7241669999785},{"questionId":"q52","format":"toon","model":"claude-haiku-4-5-20251001","expected":"shipped","actual":"shipped","isCorrect":true,"inputTokens":6992,"outputTokens":4,"latencyMs":1318.163332999975},{"questionId":"q52","format":"csv","model":"claude-haiku-4-5-20251001","expected":"shipped","actual":"shipped","isCorrect":true,"inputTokens":8413,"outputTokens":4,"latencyMs":1116.9945000000298},{"questionId":"q52","format":"xml","model":"claude-haiku-4-5-20251001","expected":"shipped","actual":"shipped","isCorrect":true,"inputTokens":13379,"outputTokens":4,"latencyMs":1260.0658749999711},{"questionId":"q52","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"shipped","actual":"shipped","isCorrect":true,"inputTokens":8384,"outputTokens":4,"latencyMs":1247.195166999998},{"questionId":"q53","format":"json","model":"claude-haiku-4-5-20251001","expected":"599.39","actual":"599.39","isCorrect":true,"inputTokens":11906,"outputTokens":7,"latencyMs":1423.6057919999585},{"questionId":"q53","format":"toon","model":"c
laude-haiku-4-5-20251001","expected":"599.39","actual":"599.39","isCorrect":true,"inputTokens":6992,"outputTokens":7,"latencyMs":967.5251250000438},{"questionId":"q53","format":"csv","model":"claude-haiku-4-5-20251001","expected":"599.39","actual":"599.39","isCorrect":true,"inputTokens":8413,"outputTokens":7,"latencyMs":1428.2470829999656},{"questionId":"q53","format":"xml","model":"claude-haiku-4-5-20251001","expected":"599.39","actual":"599.39","isCorrect":true,"inputTokens":13379,"outputTokens":7,"latencyMs":2811.347000000009},{"questionId":"q53","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"599.39","actual":"599.39","isCorrect":true,"inputTokens":8384,"outputTokens":7,"latencyMs":1272.7220830000006},{"questionId":"q54","format":"json","model":"claude-haiku-4-5-20251001","expected":"processing","actual":"processing","isCorrect":true,"inputTokens":11906,"outputTokens":4,"latencyMs":949.0859169999603},{"questionId":"q54","format":"toon","model":"claude-haiku-4-5-20251001","expected":"processing","actual":"processing","isCorrect":true,"inputTokens":6992,"outputTokens":4,"latencyMs":1904.324083000014},{"questionId":"q54","format":"csv","model":"claude-haiku-4-5-20251001","expected":"processing","actual":"processing","isCorrect":true,"inputTokens":8413,"outputTokens":4,"latencyMs":1281.7254589999793},{"questionId":"q54","format":"xml","model":"claude-haiku-4-5-20251001","expected":"processing","actual":"processing","isCorrect":true,"inputTokens":13379,"outputTokens":4,"latencyMs":1348.516834000009},{"questionId":"q54","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"processing","actual":"processing","isCorrect":true,"inputTokens":8384,"outputTokens":4,"latencyMs":998.9706670000451},{"questionId":"q55","format":"json","model":"claude-haiku-4-5-20251001","expected":"528.71","actual":"528.71","isCorrect":true,"inputTokens":11906,"outputTokens":7,"latencyMs":1281.9556659999653},{"questionId":"q55","format":"toon","model":"claude-haiku-4-5-
20251001","expected":"528.71","actual":"528.71","isCorrect":true,"inputTokens":6992,"outputTokens":7,"latencyMs":1224.4750420000055},{"questionId":"q55","format":"csv","model":"claude-haiku-4-5-20251001","expected":"528.71","actual":"528.71","isCorrect":true,"inputTokens":8413,"outputTokens":7,"latencyMs":1189.8885829999927},{"questionId":"q55","format":"xml","model":"claude-haiku-4-5-20251001","expected":"528.71","actual":"528.71","isCorrect":true,"inputTokens":13379,"outputTokens":7,"latencyMs":1609.6960830000462},{"questionId":"q55","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"528.71","actual":"528.71","isCorrect":true,"inputTokens":8384,"outputTokens":7,"latencyMs":1235.1579999999958},{"questionId":"q56","format":"json","model":"claude-haiku-4-5-20251001","expected":"pending","actual":"pending","isCorrect":true,"inputTokens":11906,"outputTokens":4,"latencyMs":966.7857499999809},{"questionId":"q56","format":"toon","model":"claude-haiku-4-5-20251001","expected":"pending","actual":"pending","isCorrect":true,"inputTokens":6992,"outputTokens":4,"latencyMs":1000.152041000023},{"questionId":"q56","format":"csv","model":"claude-haiku-4-5-20251001","expected":"pending","actual":"pending","isCorrect":true,"inputTokens":8413,"outputTokens":4,"latencyMs":1027.027666000009},{"questionId":"q56","format":"xml","model":"claude-haiku-4-5-20251001","expected":"pending","actual":"pending","isCorrect":true,"inputTokens":13379,"outputTokens":4,"latencyMs":1105.3194580000127},{"questionId":"q56","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"pending","actual":"pending","isCorrect":true,"inputTokens":8384,"outputTokens":4,"latencyMs":1311.3249579999829},{"questionId":"q57","format":"json","model":"claude-haiku-4-5-20251001","expected":"1687.82","actual":"1687.82","isCorrect":true,"inputTokens":11906,"outputTokens":8,"latencyMs":1438.7089160000323},{"questionId":"q57","format":"toon","model":"claude-haiku-4-5-20251001","expected":"1687.82","actual":"
1687.82","isCorrect":true,"inputTokens":6992,"outputTokens":8,"latencyMs":1449.8874160000123},{"questionId":"q57","format":"csv","model":"claude-haiku-4-5-20251001","expected":"1687.82","actual":"1687.82","isCorrect":true,"inputTokens":8413,"outputTokens":8,"latencyMs":1366.5002499999828},{"questionId":"q57","format":"xml","model":"claude-haiku-4-5-20251001","expected":"1687.82","actual":"1687.82","isCorrect":true,"inputTokens":13379,"outputTokens":8,"latencyMs":2524.170374999987},{"questionId":"q57","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"1687.82","actual":"1687.82","isCorrect":true,"inputTokens":8384,"outputTokens":8,"latencyMs":1157.0967080000555},{"questionId":"q58","format":"json","model":"claude-haiku-4-5-20251001","expected":"cancelled","actual":"cancelled","isCorrect":true,"inputTokens":11906,"outputTokens":4,"latencyMs":1084.7611669999897},{"questionId":"q58","format":"toon","model":"claude-haiku-4-5-20251001","expected":"cancelled","actual":"cancelled","isCorrect":true,"inputTokens":6992,"outputTokens":4,"latencyMs":1079.5932080000057},{"questionId":"q58","format":"csv","model":"claude-haiku-4-5-20251001","expected":"cancelled","actual":"cancelled","isCorrect":true,"inputTokens":8413,"outputTokens":4,"latencyMs":1254.7550829999964},{"questionId":"q58","format":"xml","model":"claude-haiku-4-5-20251001","expected":"cancelled","actual":"cancelled","isCorrect":true,"inputTokens":13379,"outputTokens":4,"latencyMs":987.0553330000257},{"questionId":"q58","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"cancelled","actual":"cancelled","isCorrect":true,"inputTokens":8384,"outputTokens":4,"latencyMs":1188.3672910000314},{"questionId":"q59","format":"json","model":"claude-haiku-4-5-20251001","expected":"Dr. Courtney Satterfield","actual":"Dr. 
Courtney Satterfield","isCorrect":true,"inputTokens":11907,"outputTokens":12,"latencyMs":1428.2384999999776},{"questionId":"q59","format":"toon","model":"claude-haiku-4-5-20251001","expected":"Dr. Courtney Satterfield","actual":"Dr. Courtney Satterfield","isCorrect":true,"inputTokens":6993,"outputTokens":12,"latencyMs":1056.711834000016},{"questionId":"q59","format":"csv","model":"claude-haiku-4-5-20251001","expected":"Dr. Courtney Satterfield","actual":"Dr. Courtney Satterfield","isCorrect":true,"inputTokens":8414,"outputTokens":12,"latencyMs":1128.8130419999943},{"questionId":"q59","format":"xml","model":"claude-haiku-4-5-20251001","expected":"Dr. Courtney Satterfield","actual":"Dr. Courtney Satterfield","isCorrect":true,"inputTokens":13380,"outputTokens":12,"latencyMs":1333.8827909999527},{"questionId":"q59","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"Dr. Courtney Satterfield","actual":"Dr. Courtney Satterfield","isCorrect":true,"inputTokens":8385,"outputTokens":12,"latencyMs":1013.9923749999725},{"questionId":"q60","format":"json","model":"claude-haiku-4-5-20251001","expected":"lukas71@gmail.com","actual":"lukas71@gmail.com","isCorrect":true,"inputTokens":11907,"outputTokens":10,"latencyMs":1022.9114169999957},{"questionId":"q60","format":"toon","model":"claude-haiku-4-5-20251001","expected":"lukas71@gmail.com","actual":"lukas71@gmail.com","isCorrect":true,"inputTokens":6993,"outputTokens":10,"latencyMs":1086.3135830000392},{"questionId":"q60","format":"csv","model":"claude-haiku-4-5-20251001","expected":"lukas71@gmail.com","actual":"lukas71@gmail.com","isCorrect":true,"inputTokens":8414,"outputTokens":10,"latencyMs":2558.027624999988},{"questionId":"q60","format":"xml","model":"claude-haiku-4-5-20251001","expected":"lukas71@gmail.com","actual":"lukas71@gmail.com","isCorrect":true,"inputTokens":13380,"outputTokens":10,"latencyMs":1433.093125000014},{"questionId":"q60","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"lukas71@gma
il.com","actual":"lukas71@gmail.com","isCorrect":true,"inputTokens":8385,"outputTokens":10,"latencyMs":1455.1029160000035},{"questionId":"q61","format":"json","model":"claude-haiku-4-5-20251001","expected":"2025-08-05","actual":"2025-08-05","isCorrect":true,"inputTokens":11907,"outputTokens":10,"latencyMs":1239.5240420000046},{"questionId":"q61","format":"toon","model":"claude-haiku-4-5-20251001","expected":"2025-08-05","actual":"2025-08-05","isCorrect":true,"inputTokens":6993,"outputTokens":10,"latencyMs":1192.919125000015},{"questionId":"q61","format":"csv","model":"claude-haiku-4-5-20251001","expected":"2025-08-05","actual":"2025-08-05","isCorrect":true,"inputTokens":8414,"outputTokens":10,"latencyMs":2287.9099999999744},{"questionId":"q61","format":"xml","model":"claude-haiku-4-5-20251001","expected":"2025-08-05","actual":"2025-08-05","isCorrect":true,"inputTokens":13380,"outputTokens":10,"latencyMs":1177.0685829999857},{"questionId":"q61","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"2025-08-05","actual":"2025-08-05","isCorrect":true,"inputTokens":8385,"outputTokens":10,"latencyMs":1656.1915000000154},{"questionId":"q62","format":"json","model":"claude-haiku-4-5-20251001","expected":"3","actual":"3","isCorrect":true,"inputTokens":11906,"outputTokens":5,"latencyMs":1299.5080830000225},{"questionId":"q62","format":"toon","model":"claude-haiku-4-5-20251001","expected":"3","actual":"3","isCorrect":true,"inputTokens":6992,"outputTokens":5,"latencyMs":1090.596291999973},{"questionId":"q62","format":"csv","model":"claude-haiku-4-5-20251001","expected":"3","actual":"10","isCorrect":false,"inputTokens":8413,"outputTokens":5,"latencyMs":1349.8798749999842},{"questionId":"q62","format":"xml","model":"claude-haiku-4-5-20251001","expected":"3","actual":"3","isCorrect":true,"inputTokens":13379,"outputTokens":5,"latencyMs":2257.2797920000157},{"questionId":"q62","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"3","actual":"3","isCorrect":true,
"inputTokens":8384,"outputTokens":5,"latencyMs":1284.0681660000118},{"questionId":"q63","format":"json","model":"claude-haiku-4-5-20251001","expected":"Maxine Zemlak","actual":"Maxine Zemlak","isCorrect":true,"inputTokens":11907,"outputTokens":10,"latencyMs":1803.687208999996},{"questionId":"q63","format":"toon","model":"claude-haiku-4-5-20251001","expected":"Maxine Zemlak","actual":"Maxine Zemlak","isCorrect":true,"inputTokens":6993,"outputTokens":10,"latencyMs":1240.0325420000008},{"questionId":"q63","format":"csv","model":"claude-haiku-4-5-20251001","expected":"Maxine Zemlak","actual":"Maxine Zemlak","isCorrect":true,"inputTokens":8414,"outputTokens":10,"latencyMs":2587.3457499999786},{"questionId":"q63","format":"xml","model":"claude-haiku-4-5-20251001","expected":"Maxine Zemlak","actual":"Maxine Zemlak","isCorrect":true,"inputTokens":13380,"outputTokens":10,"latencyMs":1334.2720830000471},{"questionId":"q63","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"Maxine Zemlak","actual":"Maxine 
Zemlak","isCorrect":true,"inputTokens":8385,"outputTokens":10,"latencyMs":1024.188709000009},{"questionId":"q64","format":"json","model":"claude-haiku-4-5-20251001","expected":"brenden2@hotmail.com","actual":"brenden2@hotmail.com","isCorrect":true,"inputTokens":11907,"outputTokens":11,"latencyMs":1094.068333000003},{"questionId":"q64","format":"toon","model":"claude-haiku-4-5-20251001","expected":"brenden2@hotmail.com","actual":"brenden2@hotmail.com","isCorrect":true,"inputTokens":6993,"outputTokens":11,"latencyMs":1474.1317079999717},{"questionId":"q64","format":"csv","model":"claude-haiku-4-5-20251001","expected":"brenden2@hotmail.com","actual":"brenden2@hotmail.com","isCorrect":true,"inputTokens":8414,"outputTokens":11,"latencyMs":1711.936250000028},{"questionId":"q64","format":"xml","model":"claude-haiku-4-5-20251001","expected":"brenden2@hotmail.com","actual":"brenden2@hotmail.com","isCorrect":true,"inputTokens":13380,"outputTokens":11,"latencyMs":1194.2019169999985},{"questionId":"q64","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"brenden2@hotmail.com","actual":"brenden2@hotmail.com","isCorrect":true,"inputTokens":8385,"outputTokens":11,"latencyMs":1781.72537499998},{"questionId":"q65","format":"json","model":"claude-haiku-4-5-20251001","expected":"2025-08-29","actual":"2025-08-29","isCorrect":true,"inputTokens":11907,"outputTokens":10,"latencyMs":1261.6439169999794},{"questionId":"q65","format":"toon","model":"claude-haiku-4-5-20251001","expected":"2025-08-29","actual":"2025-08-29","isCorrect":true,"inputTokens":6993,"outputTokens":10,"latencyMs":1146.6620419999817},{"questionId":"q65","format":"csv","model":"claude-haiku-4-5-20251001","expected":"2025-08-29","actual":"2025-08-29","isCorrect":true,"inputTokens":8414,"outputTokens":10,"latencyMs":1086.1719579999917},{"questionId":"q65","format":"xml","model":"claude-haiku-4-5-20251001","expected":"2025-08-29","actual":"2025-08-29","isCorrect":true,"inputTokens":13380,"outputTokens":10,"latenc
yMs":1026.9118749999907},{"questionId":"q65","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"2025-08-29","actual":"2025-08-29","isCorrect":true,"inputTokens":8385,"outputTokens":10,"latencyMs":1348.9515419999952},{"questionId":"q66","format":"json","model":"claude-haiku-4-5-20251001","expected":"4","actual":"4","isCorrect":true,"inputTokens":11906,"outputTokens":5,"latencyMs":1245.4214999999967},{"questionId":"q66","format":"toon","model":"claude-haiku-4-5-20251001","expected":"4","actual":"4","isCorrect":true,"inputTokens":6992,"outputTokens":5,"latencyMs":1375.6594160000095},{"questionId":"q66","format":"csv","model":"claude-haiku-4-5-20251001","expected":"4","actual":"11","isCorrect":false,"inputTokens":8413,"outputTokens":5,"latencyMs":1127.8399999999674},{"questionId":"q66","format":"xml","model":"claude-haiku-4-5-20251001","expected":"4","actual":"4","isCorrect":true,"inputTokens":13379,"outputTokens":5,"latencyMs":1797.8928749999614},{"questionId":"q66","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"4","actual":"11","isCorrect":false,"inputTokens":8384,"outputTokens":5,"latencyMs":1221.247250000015},{"questionId":"q67","format":"json","model":"claude-haiku-4-5-20251001","expected":"Claudia Cruickshank DVM","actual":"Claudia Cruickshank DVM","isCorrect":true,"inputTokens":11907,"outputTokens":13,"latencyMs":1563.5079999999725},{"questionId":"q67","format":"toon","model":"claude-haiku-4-5-20251001","expected":"Claudia Cruickshank DVM","actual":"Claudia Cruickshank DVM","isCorrect":true,"inputTokens":6993,"outputTokens":13,"latencyMs":1365.2586250000168},{"questionId":"q67","format":"csv","model":"claude-haiku-4-5-20251001","expected":"Claudia Cruickshank DVM","actual":"Claudia Cruickshank DVM","isCorrect":true,"inputTokens":8414,"outputTokens":13,"latencyMs":2501.536124999984},{"questionId":"q67","format":"xml","model":"claude-haiku-4-5-20251001","expected":"Claudia Cruickshank DVM","actual":"Claudia Cruickshank 
DVM","isCorrect":true,"inputTokens":13380,"outputTokens":13,"latencyMs":1196.730499999947},{"questionId":"q67","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"Claudia Cruickshank DVM","actual":"Claudia Cruickshank DVM","isCorrect":true,"inputTokens":8385,"outputTokens":13,"latencyMs":1234.3808749999735},{"questionId":"q68","format":"json","model":"claude-haiku-4-5-20251001","expected":"freeda.maggio74@gmail.com","actual":"freeda.maggio74@gmail.com","isCorrect":true,"inputTokens":11907,"outputTokens":12,"latencyMs":1494.3334160000086},{"questionId":"q68","format":"toon","model":"claude-haiku-4-5-20251001","expected":"freeda.maggio74@gmail.com","actual":"freeda.maggio74@gmail.com","isCorrect":true,"inputTokens":6993,"outputTokens":12,"latencyMs":1203.2858750000014},{"questionId":"q68","format":"csv","model":"claude-haiku-4-5-20251001","expected":"freeda.maggio74@gmail.com","actual":"freeda.maggio74@gmail.com","isCorrect":true,"inputTokens":8414,"outputTokens":12,"latencyMs":1266.0875000000233},{"questionId":"q68","format":"xml","model":"claude-haiku-4-5-20251001","expected":"freeda.maggio74@gmail.com","actual":"freeda.maggio74@gmail.com","isCorrect":true,"inputTokens":13380,"outputTokens":12,"latencyMs":1870.719374999986},{"questionId":"q68","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"freeda.maggio74@gmail.com","actual":"freeda.maggio74@gmail.com","isCorrect":true,"inputTokens":8385,"outputTokens":12,"latencyMs":1235.286999999953},{"questionId":"q69","format":"json","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":11902,"outputTokens":5,"latencyMs":1025.6273330000113},{"questionId":"q69","format":"toon","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":6988,"outputTokens":5,"latencyMs":1604.1600000000326},{"questionId":"q69","format":"csv","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":8409
,"outputTokens":5,"latencyMs":1097.4308339999989},{"questionId":"q69","format":"xml","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":13375,"outputTokens":5,"latencyMs":1692.614540999988},{"questionId":"q69","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":8380,"outputTokens":5,"latencyMs":1406.6492500000168},{"questionId":"q70","format":"json","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":11902,"outputTokens":5,"latencyMs":1389.2959169999813},{"questionId":"q70","format":"toon","model":"claude-haiku-4-5-20251001","expected":"10","actual":"7","isCorrect":false,"inputTokens":6988,"outputTokens":5,"latencyMs":1044.9547080000048},{"questionId":"q70","format":"csv","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":8409,"outputTokens":5,"latencyMs":1066.8629579999833},{"questionId":"q70","format":"xml","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":13375,"outputTokens":5,"latencyMs":1056.8368330000085},{"questionId":"q70","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":8380,"outputTokens":5,"latencyMs":1682.136999999988},{"questionId":"q71","format":"json","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":11902,"outputTokens":5,"latencyMs":1143.9599589999998},{"questionId":"q71","format":"toon","model":"claude-haiku-4-5-20251001","expected":"10","actual":"7","isCorrect":false,"inputTokens":6988,"outputTokens":5,"latencyMs":1235.7126249999856},{"questionId":"q71","format":"csv","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":8409,"outputTokens":5,"latencyMs":1744.9372919999878},{"questionId":"q71","format":"xml","model":"claude-haiku-4-5-20251001","expected":
"10","actual":"8","isCorrect":false,"inputTokens":13375,"outputTokens":5,"latencyMs":1740.2846250000293},{"questionId":"q71","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":8380,"outputTokens":5,"latencyMs":1173.4387500000303},{"questionId":"q72","format":"json","model":"claude-haiku-4-5-20251001","expected":"10","actual":"10","isCorrect":true,"inputTokens":11902,"outputTokens":5,"latencyMs":1230.0939590000198},{"questionId":"q72","format":"toon","model":"claude-haiku-4-5-20251001","expected":"10","actual":"10","isCorrect":true,"inputTokens":6988,"outputTokens":5,"latencyMs":1167.9674999999697},{"questionId":"q72","format":"csv","model":"claude-haiku-4-5-20251001","expected":"10","actual":"9","isCorrect":false,"inputTokens":8409,"outputTokens":5,"latencyMs":1145.0992090000072},{"questionId":"q72","format":"xml","model":"claude-haiku-4-5-20251001","expected":"10","actual":"10","isCorrect":true,"inputTokens":13375,"outputTokens":5,"latencyMs":1309.3867919999757},{"questionId":"q72","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"10","actual":"10","isCorrect":true,"inputTokens":8380,"outputTokens":5,"latencyMs":1130.8090420000372},{"questionId":"q73","format":"json","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":11902,"outputTokens":5,"latencyMs":1025.846499999985},{"questionId":"q73","format":"toon","model":"claude-haiku-4-5-20251001","expected":"10","actual":"10","isCorrect":true,"inputTokens":6988,"outputTokens":5,"latencyMs":1016.2780839999905},{"questionId":"q73","format":"csv","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":8409,"outputTokens":5,"latencyMs":983.272042000026},{"questionId":"q73","format":"xml","model":"claude-haiku-4-5-20251001","expected":"10","actual":"10","isCorrect":true,"inputTokens":13375,"outputTokens":5,"latencyMs":1961.9339580000378},{"questionId":"q73","format"
:"yaml","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":8380,"outputTokens":5,"latencyMs":1024.795999999973},{"questionId":"q74","format":"json","model":"claude-haiku-4-5-20251001","expected":"42342.25","actual":"48,945.47","isCorrect":false,"inputTokens":11902,"outputTokens":9,"latencyMs":1060.843041999964},{"questionId":"q74","format":"toon","model":"claude-haiku-4-5-20251001","expected":"42342.25","actual":"41,847.47","isCorrect":false,"inputTokens":6988,"outputTokens":9,"latencyMs":1463.711667000025},{"questionId":"q74","format":"csv","model":"claude-haiku-4-5-20251001","expected":"42342.25","actual":"48,847.47","isCorrect":false,"inputTokens":8409,"outputTokens":9,"latencyMs":1293.4600830000127},{"questionId":"q74","format":"xml","model":"claude-haiku-4-5-20251001","expected":"42342.25","actual":"47,847.47","isCorrect":false,"inputTokens":13375,"outputTokens":9,"latencyMs":1255.9750409999979},{"questionId":"q74","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"42342.25","actual":"42,847.47","isCorrect":false,"inputTokens":8380,"outputTokens":9,"latencyMs":3193.0889579999493},{"questionId":"q75","format":"json","model":"claude-haiku-4-5-20251001","expected":"846.85","actual":"896.77","isCorrect":false,"inputTokens":11900,"outputTokens":7,"latencyMs":2870.627749999985},{"questionId":"q75","format":"toon","model":"claude-haiku-4-5-20251001","expected":"846.85","actual":"896.77","isCorrect":false,"inputTokens":6986,"outputTokens":7,"latencyMs":1042.68741699995},{"questionId":"q75","format":"csv","model":"claude-haiku-4-5-20251001","expected":"846.85","actual":"896.77","isCorrect":false,"inputTokens":8407,"outputTokens":7,"latencyMs":1113.4863750000368},{"questionId":"q75","format":"xml","model":"claude-haiku-4-5-20251001","expected":"846.85","actual":"896.77","isCorrect":false,"inputTokens":13373,"outputTokens":7,"latencyMs":1367.4225420000148},{"questionId":"q75","format":"yaml","model":"claude-haiku-
4-5-20251001","expected":"846.85","actual":"896.77","isCorrect":false,"inputTokens":8378,"outputTokens":7,"latencyMs":1130.3429170000018},{"questionId":"q76","format":"json","model":"claude-haiku-4-5-20251001","expected":"50","actual":"50","isCorrect":true,"inputTokens":11901,"outputTokens":5,"latencyMs":1733.58391700004},{"questionId":"q76","format":"toon","model":"claude-haiku-4-5-20251001","expected":"50","actual":"50","isCorrect":true,"inputTokens":6987,"outputTokens":5,"latencyMs":1229.5445829999517},{"questionId":"q76","format":"csv","model":"claude-haiku-4-5-20251001","expected":"50","actual":"50","isCorrect":true,"inputTokens":8408,"outputTokens":5,"latencyMs":975.3047079999815},{"questionId":"q76","format":"xml","model":"claude-haiku-4-5-20251001","expected":"50","actual":"50","isCorrect":true,"inputTokens":13374,"outputTokens":5,"latencyMs":1486.6707079999615},{"questionId":"q76","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"50","actual":"50","isCorrect":true,"inputTokens":8379,"outputTokens":5,"latencyMs":1136.8541670000413},{"questionId":"q77","format":"json","model":"claude-haiku-4-5-20251001","expected":"1936.06","actual":"1936.06","isCorrect":true,"inputTokens":11900,"outputTokens":8,"latencyMs":2364.3018750000047},{"questionId":"q77","format":"toon","model":"claude-haiku-4-5-20251001","expected":"1936.06","actual":"1936.06","isCorrect":true,"inputTokens":6986,"outputTokens":8,"latencyMs":991.6684590000077},{"questionId":"q77","format":"csv","model":"claude-haiku-4-5-20251001","expected":"1936.06","actual":"1936.06","isCorrect":true,"inputTokens":8407,"outputTokens":8,"latencyMs":1419.3213749999995},{"questionId":"q77","format":"xml","model":"claude-haiku-4-5-20251001","expected":"1936.06","actual":"1936.06","isCorrect":true,"inputTokens":13373,"outputTokens":8,"latencyMs":1045.5306670000427},{"questionId":"q77","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"1936.06","actual":"1936.06","isCorrect":true,"inputTokens":
8378,"outputTokens":8,"latencyMs":1314.1944169999915},{"questionId":"q78","format":"json","model":"claude-haiku-4-5-20251001","expected":"44","actual":"48","isCorrect":false,"inputTokens":11904,"outputTokens":5,"latencyMs":1207.4549579999875},{"questionId":"q78","format":"toon","model":"claude-haiku-4-5-20251001","expected":"44","actual":"47","isCorrect":false,"inputTokens":6990,"outputTokens":5,"latencyMs":1112.302416999999},{"questionId":"q78","format":"csv","model":"claude-haiku-4-5-20251001","expected":"44","actual":"48","isCorrect":false,"inputTokens":8411,"outputTokens":5,"latencyMs":983.4457079999847},{"questionId":"q78","format":"xml","model":"claude-haiku-4-5-20251001","expected":"44","actual":"45","isCorrect":false,"inputTokens":13377,"outputTokens":5,"latencyMs":1604.4271249999874},{"questionId":"q78","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"44","actual":"45","isCorrect":false,"inputTokens":8382,"outputTokens":5,"latencyMs":1013.8412499999977},{"questionId":"q79","format":"json","model":"claude-haiku-4-5-20251001","expected":"39","actual":"38","isCorrect":false,"inputTokens":11904,"outputTokens":5,"latencyMs":1012.8118749999558},{"questionId":"q79","format":"toon","model":"claude-haiku-4-5-20251001","expected":"39","actual":"38","isCorrect":false,"inputTokens":6990,"outputTokens":5,"latencyMs":1249.0495830000145},{"questionId":"q79","format":"csv","model":"claude-haiku-4-5-20251001","expected":"39","actual":"38","isCorrect":false,"inputTokens":8411,"outputTokens":5,"latencyMs":1280.9340840000077},{"questionId":"q79","format":"xml","model":"claude-haiku-4-5-20251001","expected":"39","actual":"38","isCorrect":false,"inputTokens":13377,"outputTokens":5,"latencyMs":1348.8392920000479},{"questionId":"q79","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"39","actual":"35","isCorrect":false,"inputTokens":8382,"outputTokens":5,"latencyMs":933.0613339999691},{"questionId":"q80","format":"json","model":"claude-haiku-4-5-2025100
1","expected":"32","actual":"28","isCorrect":false,"inputTokens":11904,"outputTokens":5,"latencyMs":1132.9682909999974},{"questionId":"q80","format":"toon","model":"claude-haiku-4-5-20251001","expected":"32","actual":"26","isCorrect":false,"inputTokens":6990,"outputTokens":5,"latencyMs":938.9205829999992},{"questionId":"q80","format":"csv","model":"claude-haiku-4-5-20251001","expected":"32","actual":"28","isCorrect":false,"inputTokens":8411,"outputTokens":5,"latencyMs":1145.2839169999934},{"questionId":"q80","format":"xml","model":"claude-haiku-4-5-20251001","expected":"32","actual":"28","isCorrect":false,"inputTokens":13377,"outputTokens":5,"latencyMs":1417.7863329999964},{"questionId":"q80","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"32","actual":"26","isCorrect":false,"inputTokens":8382,"outputTokens":5,"latencyMs":1141.3652910000528},{"questionId":"q81","format":"json","model":"claude-haiku-4-5-20251001","expected":"7","actual":"7","isCorrect":true,"inputTokens":11908,"outputTokens":5,"latencyMs":1311.5260830000043},{"questionId":"q81","format":"toon","model":"claude-haiku-4-5-20251001","expected":"7","actual":"6","isCorrect":false,"inputTokens":6994,"outputTokens":5,"latencyMs":1227.5837919999612},{"questionId":"q81","format":"csv","model":"claude-haiku-4-5-20251001","expected":"7","actual":"5","isCorrect":false,"inputTokens":8415,"outputTokens":5,"latencyMs":1229.9872500000056},{"questionId":"q81","format":"xml","model":"claude-haiku-4-5-20251001","expected":"7","actual":"6","isCorrect":false,"inputTokens":13381,"outputTokens":5,"latencyMs":1362.0046250000014},{"questionId":"q81","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"7","actual":"6","isCorrect":false,"inputTokens":8386,"outputTokens":5,"latencyMs":1368.8997080000117},{"questionId":"q82","format":"json","model":"claude-haiku-4-5-20251001","expected":"8","actual":"8","isCorrect":true,"inputTokens":11908,"outputTokens":5,"latencyMs":1256.9955419999897},{"questionId":"
q82","format":"toon","model":"claude-haiku-4-5-20251001","expected":"8","actual":"7","isCorrect":false,"inputTokens":6994,"outputTokens":5,"latencyMs":1216.614750000008},{"questionId":"q82","format":"csv","model":"claude-haiku-4-5-20251001","expected":"8","actual":"8","isCorrect":true,"inputTokens":8415,"outputTokens":5,"latencyMs":1676.1089999999967},{"questionId":"q82","format":"xml","model":"claude-haiku-4-5-20251001","expected":"8","actual":"7","isCorrect":false,"inputTokens":13381,"outputTokens":5,"latencyMs":2740.838999999978},{"questionId":"q82","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"8","actual":"8","isCorrect":true,"inputTokens":8386,"outputTokens":5,"latencyMs":1226.125166999991},{"questionId":"q83","format":"json","model":"claude-haiku-4-5-20251001","expected":"7","actual":"7","isCorrect":true,"inputTokens":11908,"outputTokens":5,"latencyMs":1207.845417000004},{"questionId":"q83","format":"toon","model":"claude-haiku-4-5-20251001","expected":"7","actual":"6","isCorrect":false,"inputTokens":6994,"outputTokens":5,"latencyMs":1088.3862080000108},{"questionId":"q83","format":"csv","model":"claude-haiku-4-5-20251001","expected":"7","actual":"6","isCorrect":false,"inputTokens":8415,"outputTokens":5,"latencyMs":1003.1597499999916},{"questionId":"q83","format":"xml","model":"claude-haiku-4-5-20251001","expected":"7","actual":"6","isCorrect":false,"inputTokens":13381,"outputTokens":5,"latencyMs":2110.9697499999893},{"questionId":"q83","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"7","actual":"6","isCorrect":false,"inputTokens":8386,"outputTokens":5,"latencyMs":1119.378417},{"questionId":"q84","format":"json","model":"claude-haiku-4-5-20251001","expected":"9","actual":"10","isCorrect":false,"inputTokens":11908,"outputTokens":5,"latencyMs":1909.2988340000156},{"questionId":"q84","format":"toon","model":"claude-haiku-4-5-20251001","expected":"9","actual":"10","isCorrect":false,"inputTokens":6994,"outputTokens":5,"latencyMs":1
260.9599579999922},{"questionId":"q84","format":"csv","model":"claude-haiku-4-5-20251001","expected":"9","actual":"8","isCorrect":false,"inputTokens":8415,"outputTokens":5,"latencyMs":1050.2036669999943},{"questionId":"q84","format":"xml","model":"claude-haiku-4-5-20251001","expected":"9","actual":"8","isCorrect":false,"inputTokens":13381,"outputTokens":5,"latencyMs":1403.5872919999529},{"questionId":"q84","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"9","actual":"8","isCorrect":false,"inputTokens":8386,"outputTokens":5,"latencyMs":1009.1102079999982},{"questionId":"q85","format":"json","model":"claude-haiku-4-5-20251001","expected":"9","actual":"8","isCorrect":false,"inputTokens":11908,"outputTokens":5,"latencyMs":1200.3068329999805},{"questionId":"q85","format":"toon","model":"claude-haiku-4-5-20251001","expected":"9","actual":"7","isCorrect":false,"inputTokens":6994,"outputTokens":5,"latencyMs":1069.2687920000171},{"questionId":"q85","format":"csv","model":"claude-haiku-4-5-20251001","expected":"9","actual":"8","isCorrect":false,"inputTokens":8415,"outputTokens":5,"latencyMs":1632.8915840000263},{"questionId":"q85","format":"xml","model":"claude-haiku-4-5-20251001","expected":"9","actual":"8","isCorrect":false,"inputTokens":13381,"outputTokens":5,"latencyMs":1120.5287920000264},{"questionId":"q85","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"9","actual":"8","isCorrect":false,"inputTokens":8386,"outputTokens":5,"latencyMs":1330.5777920000255},{"questionId":"q86","format":"json","model":"claude-haiku-4-5-20251001","expected":"6","actual":"4","isCorrect":false,"inputTokens":11909,"outputTokens":5,"latencyMs":1782.4344580000034},{"questionId":"q86","format":"toon","model":"claude-haiku-4-5-20251001","expected":"6","actual":"4","isCorrect":false,"inputTokens":6995,"outputTokens":5,"latencyMs":1164.7007500000182},{"questionId":"q86","format":"csv","model":"claude-haiku-4-5-20251001","expected":"6","actual":"4","isCorrect":false,"inp
utTokens":8416,"outputTokens":5,"latencyMs":1224.4592089999933},{"questionId":"q86","format":"xml","model":"claude-haiku-4-5-20251001","expected":"6","actual":"4","isCorrect":false,"inputTokens":13382,"outputTokens":5,"latencyMs":1514.1512909999583},{"questionId":"q86","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"6","actual":"4","isCorrect":false,"inputTokens":8387,"outputTokens":5,"latencyMs":1117.8646660000086},{"questionId":"q87","format":"json","model":"claude-haiku-4-5-20251001","expected":"5","actual":"6","isCorrect":false,"inputTokens":11909,"outputTokens":5,"latencyMs":1134.309083},{"questionId":"q87","format":"toon","model":"claude-haiku-4-5-20251001","expected":"5","actual":"6","isCorrect":false,"inputTokens":6995,"outputTokens":5,"latencyMs":1096.8503330000094},{"questionId":"q87","format":"csv","model":"claude-haiku-4-5-20251001","expected":"5","actual":"6","isCorrect":false,"inputTokens":8416,"outputTokens":5,"latencyMs":846.0002919999533},{"questionId":"q87","format":"xml","model":"claude-haiku-4-5-20251001","expected":"5","actual":"6","isCorrect":false,"inputTokens":13382,"outputTokens":5,"latencyMs":1287.6327499999898},{"questionId":"q87","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"5","actual":"6","isCorrect":false,"inputTokens":8387,"outputTokens":5,"latencyMs":1072.791790999996},{"questionId":"q88","format":"json","model":"claude-haiku-4-5-20251001","expected":"4","actual":"6","isCorrect":false,"inputTokens":11909,"outputTokens":5,"latencyMs":1307.440540999989},{"questionId":"q88","format":"toon","model":"claude-haiku-4-5-20251001","expected":"4","actual":"5","isCorrect":false,"inputTokens":6995,"outputTokens":5,"latencyMs":1369.2558750000317},{"questionId":"q88","format":"csv","model":"claude-haiku-4-5-20251001","expected":"4","actual":"6","isCorrect":false,"inputTokens":8416,"outputTokens":5,"latencyMs":998.0497919999762},{"questionId":"q88","format":"xml","model":"claude-haiku-4-5-20251001","expected":"4","
actual":"6","isCorrect":false,"inputTokens":13382,"outputTokens":5,"latencyMs":1198.8790840000147},{"questionId":"q88","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"4","actual":"5","isCorrect":false,"inputTokens":8387,"outputTokens":5,"latencyMs":1184.4371249999967},{"questionId":"q89","format":"json","model":"claude-haiku-4-5-20251001","expected":"27","actual":"23","isCorrect":false,"inputTokens":11912,"outputTokens":5,"latencyMs":954.9974999999977},{"questionId":"q89","format":"toon","model":"claude-haiku-4-5-20251001","expected":"27","actual":"23","isCorrect":false,"inputTokens":6998,"outputTokens":5,"latencyMs":1305.583958000003},{"questionId":"q89","format":"csv","model":"claude-haiku-4-5-20251001","expected":"27","actual":"28","isCorrect":false,"inputTokens":8419,"outputTokens":5,"latencyMs":3619.6967080000322},{"questionId":"q89","format":"xml","model":"claude-haiku-4-5-20251001","expected":"27","actual":"23","isCorrect":false,"inputTokens":13385,"outputTokens":5,"latencyMs":1133.0662920000032},{"questionId":"q89","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"27","actual":"28","isCorrect":false,"inputTokens":8390,"outputTokens":5,"latencyMs":1168.7010840000003},{"questionId":"q90","format":"json","model":"claude-haiku-4-5-20251001","expected":"27","actual":"18","isCorrect":false,"inputTokens":11912,"outputTokens":5,"latencyMs":1415.6329159999732},{"questionId":"q90","format":"toon","model":"claude-haiku-4-5-20251001","expected":"27","actual":"20","isCorrect":false,"inputTokens":6998,"outputTokens":5,"latencyMs":1187.728833000001},{"questionId":"q90","format":"csv","model":"claude-haiku-4-5-20251001","expected":"27","actual":"28","isCorrect":false,"inputTokens":8419,"outputTokens":5,"latencyMs":1990.4743749999907},{"questionId":"q90","format":"xml","model":"claude-haiku-4-5-20251001","expected":"27","actual":"23","isCorrect":false,"inputTokens":13385,"outputTokens":5,"latencyMs":1293.9459589999751},{"questionId":"q90","forma
t":"yaml","model":"claude-haiku-4-5-20251001","expected":"27","actual":"23","isCorrect":false,"inputTokens":8390,"outputTokens":5,"latencyMs":1904.210625000007},{"questionId":"q91","format":"json","model":"claude-haiku-4-5-20251001","expected":"6975","actual":"6975","isCorrect":true,"inputTokens":4080,"outputTokens":6,"latencyMs":966.691041000071},{"questionId":"q91","format":"toon","model":"claude-haiku-4-5-20251001","expected":"6975","actual":"6975","isCorrect":true,"inputTokens":1509,"outputTokens":6,"latencyMs":1408.0366249999497},{"questionId":"q91","format":"csv","model":"claude-haiku-4-5-20251001","expected":"6975","actual":"6975","isCorrect":true,"inputTokens":1445,"outputTokens":6,"latencyMs":1189.9256670000032},{"questionId":"q91","format":"xml","model":"claude-haiku-4-5-20251001","expected":"6975","actual":"6975","isCorrect":true,"inputTokens":4787,"outputTokens":6,"latencyMs":1387.4771669999463},{"questionId":"q91","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"6975","actual":"6975","isCorrect":true,"inputTokens":3110,"outputTokens":6,"latencyMs":1201.9924589999719},{"questionId":"q92","format":"json","model":"claude-haiku-4-5-20251001","expected":"6686.23","actual":"6686.23","isCorrect":true,"inputTokens":4079,"outputTokens":8,"latencyMs":1164.4929999999003},{"questionId":"q92","format":"toon","model":"claude-haiku-4-5-20251001","expected":"6686.23","actual":"6686.23","isCorrect":true,"inputTokens":1508,"outputTokens":8,"latencyMs":1148.0573749999749},{"questionId":"q92","format":"csv","model":"claude-haiku-4-5-20251001","expected":"6686.23","actual":"6686.23","isCorrect":true,"inputTokens":1444,"outputTokens":8,"latencyMs":1003.7817920000525},{"questionId":"q92","format":"xml","model":"claude-haiku-4-5-20251001","expected":"6686.23","actual":"6686.23","isCorrect":true,"inputTokens":4786,"outputTokens":8,"latencyMs":1258.9658749999944},{"questionId":"q92","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"6686.23","actual":
"6686.23","isCorrect":true,"inputTokens":3109,"outputTokens":8,"latencyMs":1838.9517500000075},{"questionId":"q93","format":"json","model":"claude-haiku-4-5-20251001","expected":"33","actual":"33","isCorrect":true,"inputTokens":4080,"outputTokens":5,"latencyMs":842.0457089999691},{"questionId":"q93","format":"toon","model":"claude-haiku-4-5-20251001","expected":"33","actual":"33","isCorrect":true,"inputTokens":1509,"outputTokens":5,"latencyMs":1149.8674580000807},{"questionId":"q93","format":"csv","model":"claude-haiku-4-5-20251001","expected":"33","actual":"33","isCorrect":true,"inputTokens":1445,"outputTokens":5,"latencyMs":1598.7321249999804},{"questionId":"q93","format":"xml","model":"claude-haiku-4-5-20251001","expected":"33","actual":"33","isCorrect":true,"inputTokens":4787,"outputTokens":5,"latencyMs":1007.8998330000322},{"questionId":"q93","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"33","actual":"33","isCorrect":true,"inputTokens":3110,"outputTokens":5,"latencyMs":975.626290999935},{"questionId":"q94","format":"json","model":"claude-haiku-4-5-20251001","expected":"377","actual":"377","isCorrect":true,"inputTokens":4080,"outputTokens":5,"latencyMs":1935.050750000053},{"questionId":"q94","format":"toon","model":"claude-haiku-4-5-20251001","expected":"377","actual":"377","isCorrect":true,"inputTokens":1509,"outputTokens":5,"latencyMs":1020.1467499999562},{"questionId":"q94","format":"csv","model":"claude-haiku-4-5-20251001","expected":"377","actual":"377","isCorrect":true,"inputTokens":1445,"outputTokens":5,"latencyMs":850.6589590000222},{"questionId":"q94","format":"xml","model":"claude-haiku-4-5-20251001","expected":"377","actual":"377","isCorrect":true,"inputTokens":4787,"outputTokens":5,"latencyMs":1021.7720419999678},{"questionId":"q94","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"377","actual":"377","isCorrect":true,"inputTokens":3110,"outputTokens":5,"latencyMs":1033.235958000063},{"questionId":"q95","format":"json"
,"model":"claude-haiku-4-5-20251001","expected":"0.44","actual":"0.44","isCorrect":true,"inputTokens":4080,"outputTokens":7,"latencyMs":1355.466875000042},{"questionId":"q95","format":"toon","model":"claude-haiku-4-5-20251001","expected":"0.44","actual":"0.44","isCorrect":true,"inputTokens":1509,"outputTokens":7,"latencyMs":938.1584160000784},{"questionId":"q95","format":"csv","model":"claude-haiku-4-5-20251001","expected":"0.44","actual":"0.44","isCorrect":true,"inputTokens":1445,"outputTokens":7,"latencyMs":903.1191660000477},{"questionId":"q95","format":"xml","model":"claude-haiku-4-5-20251001","expected":"0.44","actual":"0.44","isCorrect":true,"inputTokens":4787,"outputTokens":7,"latencyMs":1082.7665829999605},{"questionId":"q95","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"0.44","actual":"0.44","isCorrect":true,"inputTokens":3110,"outputTokens":7,"latencyMs":861.1040839999914},{"questionId":"q96","format":"json","model":"claude-haiku-4-5-20251001","expected":"7621","actual":"7621","isCorrect":true,"inputTokens":4080,"outputTokens":6,"latencyMs":1177.3398329999764},{"questionId":"q96","format":"toon","model":"claude-haiku-4-5-20251001","expected":"7621","actual":"7621","isCorrect":true,"inputTokens":1509,"outputTokens":6,"latencyMs":859.1606250000186},{"questionId":"q96","format":"csv","model":"claude-haiku-4-5-20251001","expected":"7621","actual":"7621","isCorrect":true,"inputTokens":1445,"outputTokens":6,"latencyMs":820.2049999999581},{"questionId":"q96","format":"xml","model":"claude-haiku-4-5-20251001","expected":"7621","actual":"7621","isCorrect":true,"inputTokens":4787,"outputTokens":6,"latencyMs":1193.239540999988},{"questionId":"q96","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"7621","actual":"7621","isCorrect":true,"inputTokens":3110,"outputTokens":6,"latencyMs":1072.1601249999367},{"questionId":"q97","format":"json","model":"claude-haiku-4-5-20251001","expected":"1827.12","actual":"1827.12","isCorrect":true,"inputT
okens":4079,"outputTokens":8,"latencyMs":1062.7770000000019},{"questionId":"q97","format":"toon","model":"claude-haiku-4-5-20251001","expected":"1827.12","actual":"1827.12","isCorrect":true,"inputTokens":1508,"outputTokens":8,"latencyMs":1007.438625000068},{"questionId":"q97","format":"csv","model":"claude-haiku-4-5-20251001","expected":"1827.12","actual":"1827.12","isCorrect":true,"inputTokens":1444,"outputTokens":8,"latencyMs":767.1123329999391},{"questionId":"q97","format":"xml","model":"claude-haiku-4-5-20251001","expected":"1827.12","actual":"1827.12","isCorrect":true,"inputTokens":4786,"outputTokens":8,"latencyMs":1300.158374999999},{"questionId":"q97","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"1827.12","actual":"1827.12","isCorrect":true,"inputTokens":3109,"outputTokens":8,"latencyMs":2612.5782500000205},{"questionId":"q98","format":"json","model":"claude-haiku-4-5-20251001","expected":"44","actual":"44","isCorrect":true,"inputTokens":4080,"outputTokens":5,"latencyMs":957.9554159999825},{"questionId":"q98","format":"toon","model":"claude-haiku-4-5-20251001","expected":"44","actual":"44","isCorrect":true,"inputTokens":1509,"outputTokens":5,"latencyMs":1026.5866669999668},{"questionId":"q98","format":"csv","model":"claude-haiku-4-5-20251001","expected":"44","actual":"44","isCorrect":true,"inputTokens":1445,"outputTokens":5,"latencyMs":1030.1116670001065},{"questionId":"q98","format":"xml","model":"claude-haiku-4-5-20251001","expected":"44","actual":"44","isCorrect":true,"inputTokens":4787,"outputTokens":5,"latencyMs":1137.4142920000013},{"questionId":"q98","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"44","actual":"44","isCorrect":true,"inputTokens":3110,"outputTokens":5,"latencyMs":1181.4943340000464},{"questionId":"q99","format":"json","model":"claude-haiku-4-5-20251001","expected":"411","actual":"411","isCorrect":true,"inputTokens":4080,"outputTokens":5,"latencyMs":962.8050420000218},{"questionId":"q99","format":"toon",
"model":"claude-haiku-4-5-20251001","expected":"411","actual":"411","isCorrect":true,"inputTokens":1509,"outputTokens":5,"latencyMs":1434.658291000058},{"questionId":"q99","format":"csv","model":"claude-haiku-4-5-20251001","expected":"411","actual":"411","isCorrect":true,"inputTokens":1445,"outputTokens":5,"latencyMs":949.6485830000602},{"questionId":"q99","format":"xml","model":"claude-haiku-4-5-20251001","expected":"411","actual":"411","isCorrect":true,"inputTokens":4787,"outputTokens":5,"latencyMs":2933.7991250000196},{"questionId":"q99","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"411","actual":"411","isCorrect":true,"inputTokens":3110,"outputTokens":5,"latencyMs":1033.5851669999538},{"questionId":"q100","format":"json","model":"claude-haiku-4-5-20251001","expected":"0.48","actual":"0.48","isCorrect":true,"inputTokens":4080,"outputTokens":7,"latencyMs":1852.7236249999842},{"questionId":"q100","format":"toon","model":"claude-haiku-4-5-20251001","expected":"0.48","actual":"0.48","isCorrect":true,"inputTokens":1509,"outputTokens":7,"latencyMs":1438.7002499999944},{"questionId":"q100","format":"csv","model":"claude-haiku-4-5-20251001","expected":"0.48","actual":"0.48","isCorrect":true,"inputTokens":1445,"outputTokens":7,"latencyMs":2276.738374999957},{"questionId":"q100","format":"xml","model":"claude-haiku-4-5-20251001","expected":"0.48","actual":"0.48","isCorrect":true,"inputTokens":4787,"outputTokens":7,"latencyMs":1117.7498749999795},{"questionId":"q100","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"0.48","actual":"0.48","isCorrect":true,"inputTokens":3110,"outputTokens":7,"latencyMs":942.2236669999547},{"questionId":"q101","format":"json","model":"claude-haiku-4-5-20251001","expected":"4696","actual":"4696","isCorrect":true,"inputTokens":4080,"outputTokens":6,"latencyMs":1163.0916669999715},{"questionId":"q101","format":"toon","model":"claude-haiku-4-5-20251001","expected":"4696","actual":"4696","isCorrect":true,"inputTokens
":1509,"outputTokens":6,"latencyMs":974.8354159999872},{"questionId":"q101","format":"csv","model":"claude-haiku-4-5-20251001","expected":"4696","actual":"4696","isCorrect":true,"inputTokens":1445,"outputTokens":6,"latencyMs":1171.1146670000162},{"questionId":"q101","format":"xml","model":"claude-haiku-4-5-20251001","expected":"4696","actual":"4696","isCorrect":true,"inputTokens":4787,"outputTokens":6,"latencyMs":7297.294666999951},{"questionId":"q101","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"4696","actual":"4696","isCorrect":true,"inputTokens":3110,"outputTokens":6,"latencyMs":1055.473040999961},{"questionId":"q102","format":"json","model":"claude-haiku-4-5-20251001","expected":"4211.6","actual":"4211.6","isCorrect":true,"inputTokens":4079,"outputTokens":8,"latencyMs":1217.7702499999432},{"questionId":"q102","format":"toon","model":"claude-haiku-4-5-20251001","expected":"4211.6","actual":"4211.6","isCorrect":true,"inputTokens":1508,"outputTokens":8,"latencyMs":1113.1937919999473},{"questionId":"q102","format":"csv","model":"claude-haiku-4-5-20251001","expected":"4211.6","actual":"4211.6","isCorrect":true,"inputTokens":1444,"outputTokens":8,"latencyMs":845.7578750000102},{"questionId":"q102","format":"xml","model":"claude-haiku-4-5-20251001","expected":"4211.6","actual":"4211.6","isCorrect":true,"inputTokens":4786,"outputTokens":8,"latencyMs":953.4664170000469},{"questionId":"q102","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"4211.6","actual":"4211.6","isCorrect":true,"inputTokens":3109,"outputTokens":8,"latencyMs":1226.1685419999994},{"questionId":"q103","format":"json","model":"claude-haiku-4-5-20251001","expected":"23","actual":"23","isCorrect":true,"inputTokens":4080,"outputTokens":5,"latencyMs":1375.7582079999847},{"questionId":"q103","format":"toon","model":"claude-haiku-4-5-20251001","expected":"23","actual":"23","isCorrect":true,"inputTokens":1509,"outputTokens":5,"latencyMs":1024.673499999917},{"questionId":"q103","
format":"csv","model":"claude-haiku-4-5-20251001","expected":"23","actual":"23","isCorrect":true,"inputTokens":1445,"outputTokens":5,"latencyMs":1326.7837499999441},{"questionId":"q103","format":"xml","model":"claude-haiku-4-5-20251001","expected":"23","actual":"23","isCorrect":true,"inputTokens":4787,"outputTokens":5,"latencyMs":989.4393749999581},{"questionId":"q103","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"23","actual":"23","isCorrect":true,"inputTokens":3110,"outputTokens":5,"latencyMs":1197.8059580000117},{"questionId":"q104","format":"json","model":"claude-haiku-4-5-20251001","expected":"344498","actual":"188,945","isCorrect":false,"inputTokens":4077,"outputTokens":7,"latencyMs":1179.029916999978},{"questionId":"q104","format":"toon","model":"claude-haiku-4-5-20251001","expected":"344498","actual":"337,045","isCorrect":false,"inputTokens":1506,"outputTokens":7,"latencyMs":1021.6037090000464},{"questionId":"q104","format":"csv","model":"claude-haiku-4-5-20251001","expected":"344498","actual":"372,915","isCorrect":false,"inputTokens":1442,"outputTokens":7,"latencyMs":958.3143329999875},{"questionId":"q104","format":"xml","model":"claude-haiku-4-5-20251001","expected":"344498","actual":"372,089","isCorrect":false,"inputTokens":4784,"outputTokens":7,"latencyMs":1432.9507079999894},{"questionId":"q104","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"344498","actual":"181,854","isCorrect":false,"inputTokens":3107,"outputTokens":7,"latencyMs":1362.514582999982},{"questionId":"q105","format":"json","model":"claude-haiku-4-5-20251001","expected":"312818.50","actual":"287,745.89","isCorrect":false,"inputTokens":4075,"outputTokens":9,"latencyMs":970.4799999999814},{"questionId":"q105","format":"toon","model":"claude-haiku-4-5-20251001","expected":"312818.50","actual":"487,891.45","isCorrect":false,"inputTokens":1504,"outputTokens":9,"latencyMs":1335.0773749999935},{"questionId":"q105","format":"csv","model":"claude-haiku-4-5-2025100
1","expected":"312818.50","actual":"487,891.89","isCorrect":false,"inputTokens":1440,"outputTokens":9,"latencyMs":1190.091582999914},{"questionId":"q105","format":"xml","model":"claude-haiku-4-5-20251001","expected":"312818.50","actual":"381,847.89","isCorrect":false,"inputTokens":4782,"outputTokens":9,"latencyMs":1371.9182080000173},{"questionId":"q105","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"312818.50","actual":"381,847.89","isCorrect":false,"inputTokens":3105,"outputTokens":9,"latencyMs":1102.1025839999784},{"questionId":"q106","format":"json","model":"claude-haiku-4-5-20251001","expected":"1811","actual":"1,234","isCorrect":false,"inputTokens":4078,"outputTokens":7,"latencyMs":914.9406250000466},{"questionId":"q106","format":"toon","model":"claude-haiku-4-5-20251001","expected":"1811","actual":"1,945","isCorrect":false,"inputTokens":1507,"outputTokens":7,"latencyMs":930.0820420000236},{"questionId":"q106","format":"csv","model":"claude-haiku-4-5-20251001","expected":"1811","actual":"1,945","isCorrect":false,"inputTokens":1443,"outputTokens":7,"latencyMs":842.8804590000072},{"questionId":"q106","format":"xml","model":"claude-haiku-4-5-20251001","expected":"1811","actual":"1,532","isCorrect":false,"inputTokens":4785,"outputTokens":7,"latencyMs":1126.376624999917},{"questionId":"q106","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"1811","actual":"1,454","isCorrect":false,"inputTokens":3108,"outputTokens":7,"latencyMs":1120.671209000051},{"questionId":"q107","format":"json","model":"claude-haiku-4-5-20251001","expected":"5742","actual":"5,829.03","isCorrect":false,"inputTokens":4076,"outputTokens":9,"latencyMs":1008.0357920000097},{"questionId":"q107","format":"toon","model":"claude-haiku-4-5-20251001","expected":"5742","actual":"5,754.43","isCorrect":false,"inputTokens":1505,"outputTokens":9,"latencyMs":831.6888749999925},{"questionId":"q107","format":"csv","model":"claude-haiku-4-5-20251001","expected":"5742","actual":"5,66
4.43","isCorrect":false,"inputTokens":1441,"outputTokens":9,"latencyMs":1103.4740000000456},{"questionId":"q107","format":"xml","model":"claude-haiku-4-5-20251001","expected":"5742","actual":"5,747.77","isCorrect":false,"inputTokens":4783,"outputTokens":9,"latencyMs":1104.498750000028},{"questionId":"q107","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"5742","actual":"5,747.27","isCorrect":false,"inputTokens":3106,"outputTokens":9,"latencyMs":1256.2283329999773},{"questionId":"q108","format":"json","model":"claude-haiku-4-5-20251001","expected":"5213.64","actual":"5,454.47","isCorrect":false,"inputTokens":4074,"outputTokens":9,"latencyMs":1964.8543749999953},{"questionId":"q108","format":"toon","model":"claude-haiku-4-5-20251001","expected":"5213.64","actual":"5,447.89","isCorrect":false,"inputTokens":1503,"outputTokens":9,"latencyMs":1086.201457999996},{"questionId":"q108","format":"csv","model":"claude-haiku-4-5-20251001","expected":"5213.64","actual":"5,447.89","isCorrect":false,"inputTokens":1439,"outputTokens":9,"latencyMs":919.8054999999003},{"questionId":"q108","format":"xml","model":"claude-haiku-4-5-20251001","expected":"5213.64","actual":"5,617.89","isCorrect":false,"inputTokens":4781,"outputTokens":9,"latencyMs":1272.7681669999147},{"questionId":"q108","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"5213.64","actual":"5,547.89","isCorrect":false,"inputTokens":3104,"outputTokens":9,"latencyMs":1216.066833000048},{"questionId":"q109","format":"json","model":"claude-haiku-4-5-20251001","expected":"30","actual":"30.03","isCorrect":false,"inputTokens":4077,"outputTokens":7,"latencyMs":1112.5203749999637},{"questionId":"q109","format":"toon","model":"claude-haiku-4-5-20251001","expected":"30","actual":"29.03","isCorrect":false,"inputTokens":1506,"outputTokens":7,"latencyMs":837.4687089999206},{"questionId":"q109","format":"csv","model":"claude-haiku-4-5-20251001","expected":"30","actual":"29.03","isCorrect":false,"inputTokens":1
442,"outputTokens":7,"latencyMs":1033.9513329999754},{"questionId":"q109","format":"xml","model":"claude-haiku-4-5-20251001","expected":"30","actual":"30.97","isCorrect":false,"inputTokens":4784,"outputTokens":7,"latencyMs":1142.1755419999827},{"questionId":"q109","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"30","actual":"30.03","isCorrect":false,"inputTokens":3107,"outputTokens":7,"latencyMs":1164.888874999946},{"questionId":"q110","format":"json","model":"claude-haiku-4-5-20251001","expected":"60","actual":"60","isCorrect":true,"inputTokens":4076,"outputTokens":5,"latencyMs":1306.8095830000238},{"questionId":"q110","format":"toon","model":"claude-haiku-4-5-20251001","expected":"60","actual":"60","isCorrect":true,"inputTokens":1505,"outputTokens":5,"latencyMs":1123.1779999999562},{"questionId":"q110","format":"csv","model":"claude-haiku-4-5-20251001","expected":"60","actual":"59","isCorrect":false,"inputTokens":1441,"outputTokens":5,"latencyMs":1082.1129169999622},{"questionId":"q110","format":"xml","model":"claude-haiku-4-5-20251001","expected":"60","actual":"60","isCorrect":true,"inputTokens":4783,"outputTokens":5,"latencyMs":1125.1292080000276},{"questionId":"q110","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"60","actual":"60","isCorrect":true,"inputTokens":3106,"outputTokens":5,"latencyMs":1031.2063339999877},{"questionId":"q111","format":"json","model":"claude-haiku-4-5-20251001","expected":"7944","actual":"7944","isCorrect":true,"inputTokens":4079,"outputTokens":6,"latencyMs":1331.731792000006},{"questionId":"q111","format":"toon","model":"claude-haiku-4-5-20251001","expected":"7944","actual":"7944","isCorrect":true,"inputTokens":1508,"outputTokens":6,"latencyMs":983.5512500000186},{"questionId":"q111","format":"csv","model":"claude-haiku-4-5-20251001","expected":"7944","actual":"7944","isCorrect":true,"inputTokens":1444,"outputTokens":6,"latencyMs":987.8445839999476},{"questionId":"q111","format":"xml","model":"claude-ha
iku-4-5-20251001","expected":"7944","actual":"7944","isCorrect":true,"inputTokens":4786,"outputTokens":6,"latencyMs":1018.8985829999438},{"questionId":"q111","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"7944","actual":"7944","isCorrect":true,"inputTokens":3109,"outputTokens":6,"latencyMs":1297.6886670000385},{"questionId":"q112","format":"json","model":"claude-haiku-4-5-20251001","expected":"42","actual":"42","isCorrect":true,"inputTokens":4078,"outputTokens":5,"latencyMs":1014.283916000044},{"questionId":"q112","format":"toon","model":"claude-haiku-4-5-20251001","expected":"42","actual":"42","isCorrect":true,"inputTokens":1507,"outputTokens":5,"latencyMs":930.3799580000341},{"questionId":"q112","format":"csv","model":"claude-haiku-4-5-20251001","expected":"42","actual":"42","isCorrect":true,"inputTokens":1443,"outputTokens":5,"latencyMs":1603.363000000012},{"questionId":"q112","format":"xml","model":"claude-haiku-4-5-20251001","expected":"42","actual":"54","isCorrect":false,"inputTokens":4785,"outputTokens":5,"latencyMs":1133.8424580000574},{"questionId":"q112","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"42","actual":"47","isCorrect":false,"inputTokens":3108,"outputTokens":5,"latencyMs":1165.5733330000658},{"questionId":"q113","format":"json","model":"claude-haiku-4-5-20251001","expected":"11","actual":"11","isCorrect":true,"inputTokens":4078,"outputTokens":5,"latencyMs":1068.3057090000948},{"questionId":"q113","format":"toon","model":"claude-haiku-4-5-20251001","expected":"11","actual":"12","isCorrect":false,"inputTokens":1507,"outputTokens":5,"latencyMs":942.7892089999514},{"questionId":"q113","format":"csv","model":"claude-haiku-4-5-20251001","expected":"11","actual":"11","isCorrect":true,"inputTokens":1443,"outputTokens":5,"latencyMs":1161.5375420000637},{"questionId":"q113","format":"xml","model":"claude-haiku-4-5-20251001","expected":"11","actual":"11","isCorrect":true,"inputTokens":4785,"outputTokens":5,"latencyMs":874.
3780829999596},{"questionId":"q113","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"11","actual":"11","isCorrect":true,"inputTokens":3108,"outputTokens":5,"latencyMs":1122.203792000073},{"questionId":"q114","format":"json","model":"claude-haiku-4-5-20251001","expected":"26","actual":"21","isCorrect":false,"inputTokens":4086,"outputTokens":5,"latencyMs":1037.751209000009},{"questionId":"q114","format":"toon","model":"claude-haiku-4-5-20251001","expected":"26","actual":"21","isCorrect":false,"inputTokens":1515,"outputTokens":5,"latencyMs":1911.0047079999931},{"questionId":"q114","format":"csv","model":"claude-haiku-4-5-20251001","expected":"26","actual":"16","isCorrect":false,"inputTokens":1451,"outputTokens":5,"latencyMs":1773.1767909999471},{"questionId":"q114","format":"xml","model":"claude-haiku-4-5-20251001","expected":"26","actual":"21","isCorrect":false,"inputTokens":4793,"outputTokens":5,"latencyMs":970.3540000000503},{"questionId":"q114","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"26","actual":"21","isCorrect":false,"inputTokens":3116,"outputTokens":5,"latencyMs":1061.5327089999337},{"questionId":"q115","format":"json","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":4086,"outputTokens":5,"latencyMs":1067.997207999928},{"questionId":"q115","format":"toon","model":"claude-haiku-4-5-20251001","expected":"10","actual":"11","isCorrect":false,"inputTokens":1515,"outputTokens":5,"latencyMs":945.7236660000635},{"questionId":"q115","format":"csv","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":1451,"outputTokens":5,"latencyMs":871.7425000000512},{"questionId":"q115","format":"xml","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorrect":false,"inputTokens":4793,"outputTokens":5,"latencyMs":1623.2126670000143},{"questionId":"q115","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"10","actual":"8","isCorr
ect":false,"inputTokens":3116,"outputTokens":5,"latencyMs":1221.2374999999302},{"questionId":"q116","format":"json","model":"claude-haiku-4-5-20251001","expected":"28","actual":"38","isCorrect":false,"inputTokens":4084,"outputTokens":5,"latencyMs":961.6007919999538},{"questionId":"q116","format":"toon","model":"claude-haiku-4-5-20251001","expected":"28","actual":"38","isCorrect":false,"inputTokens":1513,"outputTokens":5,"latencyMs":1009.7077920000302},{"questionId":"q116","format":"csv","model":"claude-haiku-4-5-20251001","expected":"28","actual":"38","isCorrect":false,"inputTokens":1449,"outputTokens":5,"latencyMs":889.7579579999438},{"questionId":"q116","format":"xml","model":"claude-haiku-4-5-20251001","expected":"28","actual":"47","isCorrect":false,"inputTokens":4791,"outputTokens":5,"latencyMs":940.1912920000032},{"questionId":"q116","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"28","actual":"47","isCorrect":false,"inputTokens":3114,"outputTokens":5,"latencyMs":1380.3190829999512},{"questionId":"q117","format":"json","model":"claude-haiku-4-5-20251001","expected":"28","actual":"38","isCorrect":false,"inputTokens":4085,"outputTokens":5,"latencyMs":1178.6390830000164},{"questionId":"q117","format":"toon","model":"claude-haiku-4-5-20251001","expected":"28","actual":"38","isCorrect":false,"inputTokens":1514,"outputTokens":5,"latencyMs":979.2308750000084},{"questionId":"q117","format":"csv","model":"claude-haiku-4-5-20251001","expected":"28","actual":"32","isCorrect":false,"inputTokens":1450,"outputTokens":5,"latencyMs":1285.2594999999274},{"questionId":"q117","format":"xml","model":"claude-haiku-4-5-20251001","expected":"28","actual":"38","isCorrect":false,"inputTokens":4792,"outputTokens":5,"latencyMs":1599.2140420000069},{"questionId":"q117","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"28","actual":"31","isCorrect":false,"inputTokens":3115,"outputTokens":5,"latencyMs":843.9171670000069},{"questionId":"q118","format":"json","mo
del":"claude-haiku-4-5-20251001","expected":"28","actual":"31","isCorrect":false,"inputTokens":4085,"outputTokens":5,"latencyMs":954.6101250000065},{"questionId":"q118","format":"toon","model":"claude-haiku-4-5-20251001","expected":"28","actual":"31","isCorrect":false,"inputTokens":1514,"outputTokens":5,"latencyMs":1018.4348750000354},{"questionId":"q118","format":"csv","model":"claude-haiku-4-5-20251001","expected":"28","actual":"31","isCorrect":false,"inputTokens":1450,"outputTokens":5,"latencyMs":852.1251659999834},{"questionId":"q118","format":"xml","model":"claude-haiku-4-5-20251001","expected":"28","actual":"31","isCorrect":false,"inputTokens":4792,"outputTokens":5,"latencyMs":2087.4253749999916},{"questionId":"q118","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"28","actual":"28","isCorrect":true,"inputTokens":3115,"outputTokens":5,"latencyMs":944.5495840000222},{"questionId":"q119","format":"json","model":"claude-haiku-4-5-20251001","expected":"26","actual":"28","isCorrect":false,"inputTokens":4085,"outputTokens":5,"latencyMs":1652.4109170000302},{"questionId":"q119","format":"toon","model":"claude-haiku-4-5-20251001","expected":"26","actual":"28","isCorrect":false,"inputTokens":1514,"outputTokens":5,"latencyMs":976.6800829999847},{"questionId":"q119","format":"csv","model":"claude-haiku-4-5-20251001","expected":"26","actual":"28","isCorrect":false,"inputTokens":1450,"outputTokens":5,"latencyMs":1491.4936669999734},{"questionId":"q119","format":"xml","model":"claude-haiku-4-5-20251001","expected":"26","actual":"28","isCorrect":false,"inputTokens":4792,"outputTokens":5,"latencyMs":1371.1184580000117},{"questionId":"q119","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"26","actual":"28","isCorrect":false,"inputTokens":3115,"outputTokens":5,"latencyMs":1324.189624999999},{"questionId":"q120","format":"json","model":"claude-haiku-4-5-20251001","expected":"25","actual":"23","isCorrect":false,"inputTokens":4085,"outputTokens":5,"la
tencyMs":942.2454170000274},{"questionId":"q120","format":"toon","model":"claude-haiku-4-5-20251001","expected":"25","actual":"23","isCorrect":false,"inputTokens":1514,"outputTokens":5,"latencyMs":1260.2844170000171},{"questionId":"q120","format":"csv","model":"claude-haiku-4-5-20251001","expected":"25","actual":"23","isCorrect":false,"inputTokens":1450,"outputTokens":5,"latencyMs":944.7768330000108},{"questionId":"q120","format":"xml","model":"claude-haiku-4-5-20251001","expected":"25","actual":"23","isCorrect":false,"inputTokens":4792,"outputTokens":5,"latencyMs":1230.1708749999525},{"questionId":"q120","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"25","actual":"21","isCorrect":false,"inputTokens":3115,"outputTokens":5,"latencyMs":1508.8189169999678},{"questionId":"q121","format":"json","model":"claude-haiku-4-5-20251001","expected":"35","actual":"28","isCorrect":false,"inputTokens":4085,"outputTokens":5,"latencyMs":1279.0861249999143},{"questionId":"q121","format":"toon","model":"claude-haiku-4-5-20251001","expected":"35","actual":"28","isCorrect":false,"inputTokens":1514,"outputTokens":5,"latencyMs":978.6895419999491},{"questionId":"q121","format":"csv","model":"claude-haiku-4-5-20251001","expected":"35","actual":"28","isCorrect":false,"inputTokens":1450,"outputTokens":5,"latencyMs":942.8243329999968},{"questionId":"q121","format":"xml","model":"claude-haiku-4-5-20251001","expected":"35","actual":"28","isCorrect":false,"inputTokens":4792,"outputTokens":5,"latencyMs":1253.9298750000307},{"questionId":"q121","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"35","actual":"23","isCorrect":false,"inputTokens":3115,"outputTokens":5,"latencyMs":1233.954207999981},{"questionId":"q122","format":"json","model":"claude-haiku-4-5-20251001","expected":"12","actual":"16","isCorrect":false,"inputTokens":4085,"outputTokens":5,"latencyMs":1070.5387909999117},{"questionId":"q122","format":"toon","model":"claude-haiku-4-5-20251001","expected":"12","
actual":"16","isCorrect":false,"inputTokens":1514,"outputTokens":5,"latencyMs":1305.5910829999484},{"questionId":"q122","format":"csv","model":"claude-haiku-4-5-20251001","expected":"12","actual":"16","isCorrect":false,"inputTokens":1450,"outputTokens":5,"latencyMs":1129.7932080000173},{"questionId":"q122","format":"xml","model":"claude-haiku-4-5-20251001","expected":"12","actual":"16","isCorrect":false,"inputTokens":4792,"outputTokens":5,"latencyMs":1081.527166999993},{"questionId":"q122","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"12","actual":"16","isCorrect":false,"inputTokens":3115,"outputTokens":5,"latencyMs":1592.1528750000289},{"questionId":"q123","format":"json","model":"claude-haiku-4-5-20251001","expected":"32","actual":"23","isCorrect":false,"inputTokens":4087,"outputTokens":5,"latencyMs":1264.7742500000168},{"questionId":"q123","format":"toon","model":"claude-haiku-4-5-20251001","expected":"32","actual":"23","isCorrect":false,"inputTokens":1516,"outputTokens":5,"latencyMs":915.9127500000177},{"questionId":"q123","format":"csv","model":"claude-haiku-4-5-20251001","expected":"32","actual":"23","isCorrect":false,"inputTokens":1452,"outputTokens":5,"latencyMs":1125.8347920000087},{"questionId":"q123","format":"xml","model":"claude-haiku-4-5-20251001","expected":"32","actual":"28","isCorrect":false,"inputTokens":4794,"outputTokens":5,"latencyMs":1094.7175419999985},{"questionId":"q123","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"32","actual":"24","isCorrect":false,"inputTokens":3117,"outputTokens":5,"latencyMs":1588.160125000053},{"questionId":"q124","format":"json","model":"claude-haiku-4-5-20251001","expected":"32","actual":"21","isCorrect":false,"inputTokens":4087,"outputTokens":5,"latencyMs":981.7973749999655},{"questionId":"q124","format":"toon","model":"claude-haiku-4-5-20251001","expected":"32","actual":"23","isCorrect":false,"inputTokens":1516,"outputTokens":5,"latencyMs":1005.2419160000281},{"questionId":"q124
","format":"csv","model":"claude-haiku-4-5-20251001","expected":"32","actual":"23","isCorrect":false,"inputTokens":1452,"outputTokens":5,"latencyMs":816.0188329999801},{"questionId":"q124","format":"xml","model":"claude-haiku-4-5-20251001","expected":"32","actual":"24","isCorrect":false,"inputTokens":4794,"outputTokens":5,"latencyMs":921.7119579999708},{"questionId":"q124","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"32","actual":"23","isCorrect":false,"inputTokens":3117,"outputTokens":5,"latencyMs":916.4639159999788},{"questionId":"q125","format":"json","model":"claude-haiku-4-5-20251001","expected":"430886","actual":"430886","isCorrect":true,"inputTokens":17413,"outputTokens":6,"latencyMs":1757.5094999999274},{"questionId":"q125","format":"toon","model":"claude-haiku-4-5-20251001","expected":"430886","actual":"430886","isCorrect":true,"inputTokens":9283,"outputTokens":6,"latencyMs":1323.139458999969},{"questionId":"q125","format":"csv","model":"claude-haiku-4-5-20251001","expected":"430886","actual":"430886","isCorrect":true,"inputTokens":9129,"outputTokens":6,"latencyMs":1236.9662089999765},{"questionId":"q125","format":"xml","model":"claude-haiku-4-5-20251001","expected":"430886","actual":"430886","isCorrect":true,"inputTokens":19808,"outputTokens":6,"latencyMs":1801.3106670000125},{"questionId":"q125","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"430886","actual":"430886","isCorrect":true,"inputTokens":14487,"outputTokens":6,"latencyMs":1270.4778340000194},{"questionId":"q126","format":"json","model":"claude-haiku-4-5-20251001","expected":"52904","actual":"52904","isCorrect":true,"inputTokens":17412,"outputTokens":6,"latencyMs":1610.644250000012},{"questionId":"q126","format":"toon","model":"claude-haiku-4-5-20251001","expected":"52904","actual":"52904","isCorrect":true,"inputTokens":9282,"outputTokens":6,"latencyMs":1106.4444999999832},{"questionId":"q126","format":"csv","model":"claude-haiku-4-5-20251001","expected":"52904
","actual":"52904","isCorrect":true,"inputTokens":9128,"outputTokens":6,"latencyMs":1073.171915999963},{"questionId":"q126","format":"xml","model":"claude-haiku-4-5-20251001","expected":"52904","actual":"52904","isCorrect":true,"inputTokens":19807,"outputTokens":6,"latencyMs":1363.1502920000348},{"questionId":"q126","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"52904","actual":"52904","isCorrect":true,"inputTokens":14486,"outputTokens":6,"latencyMs":1531.886334000039},{"questionId":"q127","format":"json","model":"claude-haiku-4-5-20251001","expected":"vuejs","actual":"vuejs","isCorrect":true,"inputTokens":17406,"outputTokens":5,"latencyMs":1481.9807080000173},{"questionId":"q127","format":"toon","model":"claude-haiku-4-5-20251001","expected":"vuejs","actual":"vuejs","isCorrect":true,"inputTokens":9276,"outputTokens":5,"latencyMs":1439.4899580000201},{"questionId":"q127","format":"csv","model":"claude-haiku-4-5-20251001","expected":"vuejs","actual":"vuejs","isCorrect":true,"inputTokens":9122,"outputTokens":5,"latencyMs":1075.8941249999916},{"questionId":"q127","format":"xml","model":"claude-haiku-4-5-20251001","expected":"vuejs","actual":"vuejs","isCorrect":true,"inputTokens":19801,"outputTokens":5,"latencyMs":1804.6294579999521},{"questionId":"q127","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"vuejs","actual":"vuejs","isCorrect":true,"inputTokens":14480,"outputTokens":5,"latencyMs":1128.1809170000488},{"questionId":"q128","format":"json","model":"claude-haiku-4-5-20251001","expected":"master","actual":"master","isCorrect":true,"inputTokens":17412,"outputTokens":4,"latencyMs":2321.9810419999994},{"questionId":"q128","format":"toon","model":"claude-haiku-4-5-20251001","expected":"master","actual":"master","isCorrect":true,"inputTokens":9282,"outputTokens":4,"latencyMs":1095.6793750000652},{"questionId":"q128","format":"csv","model":"claude-haiku-4-5-20251001","expected":"master","actual":"master","isCorrect":true,"inputTokens":9128
,"outputTokens":4,"latencyMs":1123.5810000000056},{"questionId":"q128","format":"xml","model":"claude-haiku-4-5-20251001","expected":"master","actual":"master","isCorrect":true,"inputTokens":19807,"outputTokens":4,"latencyMs":1915.6970420000143},{"questionId":"q128","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"master","actual":"master","isCorrect":true,"inputTokens":14486,"outputTokens":4,"latencyMs":1280.3700000001118},{"questionId":"q129","format":"json","model":"claude-haiku-4-5-20251001","expected":"3367","actual":"3367","isCorrect":true,"inputTokens":17408,"outputTokens":6,"latencyMs":1841.4765409999527},{"questionId":"q129","format":"toon","model":"claude-haiku-4-5-20251001","expected":"3367","actual":"3367","isCorrect":true,"inputTokens":9278,"outputTokens":6,"latencyMs":1724.5},{"questionId":"q129","format":"csv","model":"claude-haiku-4-5-20251001","expected":"3367","actual":"3367","isCorrect":true,"inputTokens":9124,"outputTokens":6,"latencyMs":1324.3665000000037},{"questionId":"q129","format":"xml","model":"claude-haiku-4-5-20251001","expected":"3367","actual":"3367","isCorrect":true,"inputTokens":19803,"outputTokens":6,"latencyMs":1276.29679199995},{"questionId":"q129","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"3367","actual":"3367","isCorrect":true,"inputTokens":14482,"outputTokens":6,"latencyMs":1254.8337499999907},{"questionId":"q130","format":"json","model":"claude-haiku-4-5-20251001","expected":"152300","actual":"152300","isCorrect":true,"inputTokens":17411,"outputTokens":6,"latencyMs":1997.4082499999786},{"questionId":"q130","format":"toon","model":"claude-haiku-4-5-20251001","expected":"152300","actual":"152300","isCorrect":true,"inputTokens":9281,"outputTokens":6,"latencyMs":1242.5879999999888},{"questionId":"q130","format":"csv","model":"claude-haiku-4-5-20251001","expected":"152300","actual":"152300","isCorrect":true,"inputTokens":9127,"outputTokens":6,"latencyMs":1228.3447919999016},{"questionId":"q130","
format":"xml","model":"claude-haiku-4-5-20251001","expected":"152300","actual":"152300","isCorrect":true,"inputTokens":19806,"outputTokens":6,"latencyMs":1267.2903330000117},{"questionId":"q130","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"152300","actual":"152300","isCorrect":true,"inputTokens":14485,"outputTokens":6,"latencyMs":1685.3249589999905},{"questionId":"q131","format":"json","model":"claude-haiku-4-5-20251001","expected":"10668","actual":"10668","isCorrect":true,"inputTokens":17414,"outputTokens":6,"latencyMs":1660.2588329999708},{"questionId":"q131","format":"toon","model":"claude-haiku-4-5-20251001","expected":"10668","actual":"10668","isCorrect":true,"inputTokens":9284,"outputTokens":6,"latencyMs":1181.217667000019},{"questionId":"q131","format":"csv","model":"claude-haiku-4-5-20251001","expected":"10668","actual":"10668","isCorrect":true,"inputTokens":9130,"outputTokens":6,"latencyMs":1296.1214160000673},{"questionId":"q131","format":"xml","model":"claude-haiku-4-5-20251001","expected":"10668","actual":"10668","isCorrect":true,"inputTokens":19809,"outputTokens":6,"latencyMs":1589.8863750000019},{"questionId":"q131","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"10668","actual":"10668","isCorrect":true,"inputTokens":14488,"outputTokens":6,"latencyMs":1194.8416669999715},{"questionId":"q132","format":"json","model":"claude-haiku-4-5-20251001","expected":"microsoft","actual":"microsoft","isCorrect":true,"inputTokens":17407,"outputTokens":4,"latencyMs":1341.5547500000102},{"questionId":"q132","format":"toon","model":"claude-haiku-4-5-20251001","expected":"microsoft","actual":"microsoft","isCorrect":true,"inputTokens":9277,"outputTokens":4,"latencyMs":1047.8982909999322},{"questionId":"q132","format":"csv","model":"claude-haiku-4-5-20251001","expected":"microsoft","actual":"microsoft","isCorrect":true,"inputTokens":9123,"outputTokens":4,"latencyMs":1093.3853330001002},{"questionId":"q132","format":"xml","model":"claude-h
aiku-4-5-20251001","expected":"microsoft","actual":"microsoft","isCorrect":true,"inputTokens":19802,"outputTokens":4,"latencyMs":2046.4855419999221},{"questionId":"q132","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"microsoft","actual":"microsoft","isCorrect":true,"inputTokens":14481,"outputTokens":4,"latencyMs":1340.346375000081},{"questionId":"q133","format":"json","model":"claude-haiku-4-5-20251001","expected":"main","actual":"main","isCorrect":true,"inputTokens":17416,"outputTokens":4,"latencyMs":1905.6404169999296},{"questionId":"q133","format":"toon","model":"claude-haiku-4-5-20251001","expected":"main","actual":"main","isCorrect":true,"inputTokens":9286,"outputTokens":4,"latencyMs":1128.3076669999864},{"questionId":"q133","format":"csv","model":"claude-haiku-4-5-20251001","expected":"main","actual":"main","isCorrect":true,"inputTokens":9132,"outputTokens":4,"latencyMs":1012.4312499999069},{"questionId":"q133","format":"xml","model":"claude-haiku-4-5-20251001","expected":"main","actual":"main","isCorrect":true,"inputTokens":19811,"outputTokens":4,"latencyMs":1368.231625000015},{"questionId":"q133","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"main","actual":"main","isCorrect":true,"inputTokens":14490,"outputTokens":4,"latencyMs":1183.767749999999},{"questionId":"q134","format":"json","model":"claude-haiku-4-5-20251001","expected":"2518","actual":"2518","isCorrect":true,"inputTokens":17411,"outputTokens":6,"latencyMs":1331.795458000037},{"questionId":"q134","format":"toon","model":"claude-haiku-4-5-20251001","expected":"2518","actual":"2518","isCorrect":true,"inputTokens":9281,"outputTokens":6,"latencyMs":1453.8466250000056},{"questionId":"q134","format":"csv","model":"claude-haiku-4-5-20251001","expected":"2518","actual":"2518","isCorrect":true,"inputTokens":9127,"outputTokens":6,"latencyMs":1312.924875000026},{"questionId":"q134","format":"xml","model":"claude-haiku-4-5-20251001","expected":"2518","actual":"2518","isCorrect
":true,"inputTokens":19806,"outputTokens":6,"latencyMs":1201.1182500000577},{"questionId":"q134","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"2518","actual":"2518","isCorrect":true,"inputTokens":14485,"outputTokens":6,"latencyMs":1659.4385000000475},{"questionId":"q135","format":"json","model":"claude-haiku-4-5-20251001","expected":"103358","actual":"103358","isCorrect":true,"inputTokens":17415,"outputTokens":6,"latencyMs":1293.18512499996},{"questionId":"q135","format":"toon","model":"claude-haiku-4-5-20251001","expected":"103358","actual":"103358","isCorrect":true,"inputTokens":9285,"outputTokens":6,"latencyMs":1522.920792000019},{"questionId":"q135","format":"csv","model":"claude-haiku-4-5-20251001","expected":"103358","actual":"103358","isCorrect":true,"inputTokens":9131,"outputTokens":6,"latencyMs":1273.0050419999752},{"questionId":"q135","format":"xml","model":"claude-haiku-4-5-20251001","expected":"103358","actual":"103358","isCorrect":true,"inputTokens":19810,"outputTokens":6,"latencyMs":1367.845083000022},{"questionId":"q135","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"103358","actual":"103358","isCorrect":true,"inputTokens":14489,"outputTokens":6,"latencyMs":2344.700625000056},{"questionId":"q136","format":"json","model":"claude-haiku-4-5-20251001","expected":"15413563","actual":"13,847,892","isCorrect":false,"inputTokens":17407,"outputTokens":9,"latencyMs":1640.2172919999575},{"questionId":"q136","format":"toon","model":"claude-haiku-4-5-20251001","expected":"15413563","actual":"13,847,892","isCorrect":false,"inputTokens":9277,"outputTokens":9,"latencyMs":1354.5127499999944},{"questionId":"q136","format":"csv","model":"claude-haiku-4-5-20251001","expected":"15413563","actual":"13,847,892","isCorrect":false,"inputTokens":9123,"outputTokens":9,"latencyMs":1036.5388749999693},{"questionId":"q136","format":"xml","model":"claude-haiku-4-5-20251001","expected":"15413563","actual":"10,847,892","isCorrect":false,"inputTokens
":19802,"outputTokens":9,"latencyMs":2171.9067090000026},{"questionId":"q136","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"15413563","actual":"13,847,892","isCorrect":false,"inputTokens":14481,"outputTokens":9,"latencyMs":1321.8417090000585},{"questionId":"q137","format":"json","model":"claude-haiku-4-5-20251001","expected":"100","actual":"100","isCorrect":true,"inputTokens":17404,"outputTokens":5,"latencyMs":1235.2757909999928},{"questionId":"q137","format":"toon","model":"claude-haiku-4-5-20251001","expected":"100","actual":"100","isCorrect":true,"inputTokens":9274,"outputTokens":5,"latencyMs":1192.2904169999529},{"questionId":"q137","format":"csv","model":"claude-haiku-4-5-20251001","expected":"100","actual":"100","isCorrect":true,"inputTokens":9120,"outputTokens":5,"latencyMs":1018.3940829999046},{"questionId":"q137","format":"xml","model":"claude-haiku-4-5-20251001","expected":"100","actual":"100","isCorrect":true,"inputTokens":19799,"outputTokens":5,"latencyMs":2186.387667000061},{"questionId":"q137","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"100","actual":"100","isCorrect":true,"inputTokens":14478,"outputTokens":5,"latencyMs":3161.3915419999976},{"questionId":"q138","format":"json","model":"claude-haiku-4-5-20251001","expected":"154136","actual":"135,892.77","isCorrect":false,"inputTokens":17406,"outputTokens":9,"latencyMs":1567.4401249999646},{"questionId":"q138","format":"toon","model":"claude-haiku-4-5-20251001","expected":"154136","actual":"130,892.57","isCorrect":false,"inputTokens":9276,"outputTokens":9,"latencyMs":1101.0900409999304},{"questionId":"q138","format":"csv","model":"claude-haiku-4-5-20251001","expected":"154136","actual":"127,892.68","isCorrect":false,"inputTokens":9122,"outputTokens":9,"latencyMs":1027.186958999955},{"questionId":"q138","format":"xml","model":"claude-haiku-4-5-20251001","expected":"154136","actual":"130,847.5","isCorrect":false,"inputTokens":19801,"outputTokens":9,"latencyMs":1782.99
01250000112},{"questionId":"q138","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"154136","actual":"135,892.45","isCorrect":false,"inputTokens":14480,"outputTokens":9,"latencyMs":1482.1781669999473},{"questionId":"q139","format":"json","model":"claude-haiku-4-5-20251001","expected":"77","actual":"100","isCorrect":false,"inputTokens":17408,"outputTokens":5,"latencyMs":1734.8811249999562},{"questionId":"q139","format":"toon","model":"claude-haiku-4-5-20251001","expected":"77","actual":"50","isCorrect":false,"inputTokens":9278,"outputTokens":5,"latencyMs":1636.998917000019},{"questionId":"q139","format":"csv","model":"claude-haiku-4-5-20251001","expected":"77","actual":"76","isCorrect":false,"inputTokens":9124,"outputTokens":5,"latencyMs":1434.1073750000214},{"questionId":"q139","format":"xml","model":"claude-haiku-4-5-20251001","expected":"77","actual":"100","isCorrect":false,"inputTokens":19803,"outputTokens":5,"latencyMs":2013.3797079999931},{"questionId":"q139","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"77","actual":"100","isCorrect":false,"inputTokens":14482,"outputTokens":5,"latencyMs":1298.3174579999177},{"questionId":"q140","format":"json","model":"claude-haiku-4-5-20251001","expected":"37","actual":"50","isCorrect":false,"inputTokens":17408,"outputTokens":5,"latencyMs":1763.653833999997},{"questionId":"q140","format":"toon","model":"claude-haiku-4-5-20251001","expected":"37","actual":"42","isCorrect":false,"inputTokens":9278,"outputTokens":5,"latencyMs":886.0935000000754},{"questionId":"q140","format":"csv","model":"claude-haiku-4-5-20251001","expected":"37","actual":"42","isCorrect":false,"inputTokens":9124,"outputTokens":5,"latencyMs":1195.078959000064},{"questionId":"q140","format":"xml","model":"claude-haiku-4-5-20251001","expected":"37","actual":"51","isCorrect":false,"inputTokens":19803,"outputTokens":5,"latencyMs":1942.1469580000266},{"questionId":"q140","format":"yaml","model":"claude-haiku-4-5-20251001","expected":
"37","actual":"51","isCorrect":false,"inputTokens":14482,"outputTokens":5,"latencyMs":1655.136708999984},{"questionId":"q141","format":"json","model":"claude-haiku-4-5-20251001","expected":"16","actual":"10","isCorrect":false,"inputTokens":17408,"outputTokens":5,"latencyMs":1551.5969580000965},{"questionId":"q141","format":"toon","model":"claude-haiku-4-5-20251001","expected":"16","actual":"20","isCorrect":false,"inputTokens":9278,"outputTokens":5,"latencyMs":1230.4397909999825},{"questionId":"q141","format":"csv","model":"claude-haiku-4-5-20251001","expected":"16","actual":"20","isCorrect":false,"inputTokens":9124,"outputTokens":5,"latencyMs":1354.754083000007},{"questionId":"q141","format":"xml","model":"claude-haiku-4-5-20251001","expected":"16","actual":"13","isCorrect":false,"inputTokens":19803,"outputTokens":5,"latencyMs":1468.8152919999557},{"questionId":"q141","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"16","actual":"12","isCorrect":false,"inputTokens":14482,"outputTokens":5,"latencyMs":1795.4159999999683},{"questionId":"q142","format":"json","model":"claude-haiku-4-5-20251001","expected":"49","actual":"42","isCorrect":false,"inputTokens":17409,"outputTokens":5,"latencyMs":2050.5715830000117},{"questionId":"q142","format":"toon","model":"claude-haiku-4-5-20251001","expected":"49","actual":"42","isCorrect":false,"inputTokens":9279,"outputTokens":5,"latencyMs":1536.8359170000767},{"questionId":"q142","format":"csv","model":"claude-haiku-4-5-20251001","expected":"49","actual":"42","isCorrect":false,"inputTokens":9125,"outputTokens":5,"latencyMs":1075.0399169999873},{"questionId":"q142","format":"xml","model":"claude-haiku-4-5-20251001","expected":"49","actual":"23","isCorrect":false,"inputTokens":19804,"outputTokens":5,"latencyMs":2060.679582999903},{"questionId":"q142","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"49","actual":"47","isCorrect":false,"inputTokens":14483,"outputTokens":5,"latencyMs":1512.578416000004},{"ques
tionId":"q143","format":"json","model":"claude-haiku-4-5-20251001","expected":"23","actual":"15","isCorrect":false,"inputTokens":17409,"outputTokens":5,"latencyMs":1603.0117079999764},{"questionId":"q143","format":"toon","model":"claude-haiku-4-5-20251001","expected":"23","actual":"23","isCorrect":true,"inputTokens":9279,"outputTokens":5,"latencyMs":932.1319999999832},{"questionId":"q143","format":"csv","model":"claude-haiku-4-5-20251001","expected":"23","actual":"23","isCorrect":true,"inputTokens":9125,"outputTokens":5,"latencyMs":1119.546457999968},{"questionId":"q143","format":"xml","model":"claude-haiku-4-5-20251001","expected":"23","actual":"15","isCorrect":false,"inputTokens":19804,"outputTokens":5,"latencyMs":1855.873874999932},{"questionId":"q143","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"23","actual":"23","isCorrect":true,"inputTokens":14483,"outputTokens":5,"latencyMs":1346.200542000006},{"questionId":"q144","format":"json","model":"claude-haiku-4-5-20251001","expected":"11","actual":"15","isCorrect":false,"inputTokens":17409,"outputTokens":5,"latencyMs":1313.7401250000112},{"questionId":"q144","format":"toon","model":"claude-haiku-4-5-20251001","expected":"11","actual":"15","isCorrect":false,"inputTokens":9279,"outputTokens":5,"latencyMs":1216.9471250000643},{"questionId":"q144","format":"csv","model":"claude-haiku-4-5-20251001","expected":"11","actual":"15","isCorrect":false,"inputTokens":9125,"outputTokens":5,"latencyMs":2675.973708000034},{"questionId":"q144","format":"xml","model":"claude-haiku-4-5-20251001","expected":"11","actual":"15","isCorrect":false,"inputTokens":19804,"outputTokens":5,"latencyMs":1732.7645000000484},{"questionId":"q144","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"11","actual":"18","isCorrect":false,"inputTokens":14483,"outputTokens":5,"latencyMs":1207.5245829999913},{"questionId":"q145","format":"json","model":"claude-haiku-4-5-20251001","expected":"19","actual":"23","isCorrect":false,"
inputTokens":17409,"outputTokens":5,"latencyMs":1931.5875839999644},{"questionId":"q145","format":"toon","model":"claude-haiku-4-5-20251001","expected":"19","actual":"42","isCorrect":false,"inputTokens":9279,"outputTokens":5,"latencyMs":1143.0983749999432},{"questionId":"q145","format":"csv","model":"claude-haiku-4-5-20251001","expected":"19","actual":"42","isCorrect":false,"inputTokens":9125,"outputTokens":5,"latencyMs":1161.1778749999357},{"questionId":"q145","format":"xml","model":"claude-haiku-4-5-20251001","expected":"19","actual":"23","isCorrect":false,"inputTokens":19804,"outputTokens":5,"latencyMs":1992.0309170000255},{"questionId":"q145","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"19","actual":"23","isCorrect":false,"inputTokens":14483,"outputTokens":5,"latencyMs":1624.3433749999385},{"questionId":"q146","format":"json","model":"claude-haiku-4-5-20251001","expected":"4","actual":"8","isCorrect":false,"inputTokens":17409,"outputTokens":5,"latencyMs":1244.4897079999791},{"questionId":"q146","format":"toon","model":"claude-haiku-4-5-20251001","expected":"4","actual":"12","isCorrect":false,"inputTokens":9279,"outputTokens":5,"latencyMs":1116.217874999973},{"questionId":"q146","format":"csv","model":"claude-haiku-4-5-20251001","expected":"4","actual":"15","isCorrect":false,"inputTokens":9125,"outputTokens":5,"latencyMs":1347.8542919999454},{"questionId":"q146","format":"xml","model":"claude-haiku-4-5-20251001","expected":"4","actual":"10","isCorrect":false,"inputTokens":19804,"outputTokens":5,"latencyMs":1345.9365420000395},{"questionId":"q146","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"4","actual":"12","isCorrect":false,"inputTokens":14483,"outputTokens":5,"latencyMs":1275.2107500000857},{"questionId":"q147","format":"json","model":"claude-haiku-4-5-20251001","expected":"41","actual":"28","isCorrect":false,"inputTokens":17408,"outputTokens":5,"latencyMs":1309.2654999999795},{"questionId":"q147","format":"toon","model":"c
laude-haiku-4-5-20251001","expected":"41","actual":"23","isCorrect":false,"inputTokens":9278,"outputTokens":5,"latencyMs":1037.830040999921},{"questionId":"q147","format":"csv","model":"claude-haiku-4-5-20251001","expected":"41","actual":"23","isCorrect":false,"inputTokens":9124,"outputTokens":5,"latencyMs":1813.4298749999143},{"questionId":"q147","format":"xml","model":"claude-haiku-4-5-20251001","expected":"41","actual":"28","isCorrect":false,"inputTokens":19803,"outputTokens":5,"latencyMs":1228.6765000000596},{"questionId":"q147","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"41","actual":"28","isCorrect":false,"inputTokens":14482,"outputTokens":5,"latencyMs":1356.6192080000183},{"questionId":"q148","format":"json","model":"claude-haiku-4-5-20251001","expected":"53","actual":"52","isCorrect":false,"inputTokens":17408,"outputTokens":5,"latencyMs":1681.810375000001},{"questionId":"q148","format":"toon","model":"claude-haiku-4-5-20251001","expected":"53","actual":"47","isCorrect":false,"inputTokens":9278,"outputTokens":5,"latencyMs":1372.5026670000516},{"questionId":"q148","format":"csv","model":"claude-haiku-4-5-20251001","expected":"53","actual":"47","isCorrect":false,"inputTokens":9124,"outputTokens":5,"latencyMs":1770.892165999976},{"questionId":"q148","format":"xml","model":"claude-haiku-4-5-20251001","expected":"53","actual":"57","isCorrect":false,"inputTokens":19803,"outputTokens":5,"latencyMs":1472.7908750000643},{"questionId":"q148","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"53","actual":"57","isCorrect":false,"inputTokens":14482,"outputTokens":5,"latencyMs":1264.1357919999864},{"questionId":"q149","format":"json","model":"claude-haiku-4-5-20251001","expected":"57","actual":"8","isCorrect":false,"inputTokens":17417,"outputTokens":5,"latencyMs":1498.7070420000236},{"questionId":"q149","format":"toon","model":"claude-haiku-4-5-20251001","expected":"57","actual":"12","isCorrect":false,"inputTokens":9287,"outputTokens":5,"l
atencyMs":1178.2916670000413},{"questionId":"q149","format":"csv","model":"claude-haiku-4-5-20251001","expected":"57","actual":"15","isCorrect":false,"inputTokens":9133,"outputTokens":5,"latencyMs":2447.5320830000564},{"questionId":"q149","format":"xml","model":"claude-haiku-4-5-20251001","expected":"57","actual":"15","isCorrect":false,"inputTokens":19812,"outputTokens":5,"latencyMs":2358.798291999963},{"questionId":"q149","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"57","actual":"15","isCorrect":false,"inputTokens":14491,"outputTokens":5,"latencyMs":1143.7707090000622},{"questionId":"q150","format":"json","model":"claude-haiku-4-5-20251001","expected":"43","actual":"28","isCorrect":false,"inputTokens":17417,"outputTokens":5,"latencyMs":1979.3522499999963},{"questionId":"q150","format":"toon","model":"claude-haiku-4-5-20251001","expected":"43","actual":"23","isCorrect":false,"inputTokens":9287,"outputTokens":5,"latencyMs":1125.8160419999622},{"questionId":"q150","format":"csv","model":"claude-haiku-4-5-20251001","expected":"43","actual":"32","isCorrect":false,"inputTokens":9133,"outputTokens":5,"latencyMs":1046.8282500000205},{"questionId":"q150","format":"xml","model":"claude-haiku-4-5-20251001","expected":"43","actual":"15","isCorrect":false,"inputTokens":19812,"outputTokens":5,"latencyMs":2005.297542000073},{"questionId":"q150","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"43","actual":"31","isCorrect":false,"inputTokens":14491,"outputTokens":5,"latencyMs":1454.232500000042},{"questionId":"q151","format":"json","model":"claude-haiku-4-5-20251001","expected":"25","actual":"12","isCorrect":false,"inputTokens":17417,"outputTokens":5,"latencyMs":1366.8175839999458},{"questionId":"q151","format":"toon","model":"claude-haiku-4-5-20251001","expected":"25","actual":"12","isCorrect":false,"inputTokens":9287,"outputTokens":5,"latencyMs":1168.4122079999652},{"questionId":"q151","format":"csv","model":"claude-haiku-4-5-20251001","expected
":"25","actual":"15","isCorrect":false,"inputTokens":9133,"outputTokens":5,"latencyMs":1255.723915999988},{"questionId":"q151","format":"xml","model":"claude-haiku-4-5-20251001","expected":"25","actual":"12","isCorrect":false,"inputTokens":19812,"outputTokens":5,"latencyMs":2307.2180830000434},{"questionId":"q151","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"25","actual":"15","isCorrect":false,"inputTokens":14491,"outputTokens":5,"latencyMs":1441.204832999967},{"questionId":"q152","format":"json","model":"claude-haiku-4-5-20251001","expected":"6","actual":"3","isCorrect":false,"inputTokens":17417,"outputTokens":5,"latencyMs":1794.0238339999923},{"questionId":"q152","format":"toon","model":"claude-haiku-4-5-20251001","expected":"6","actual":"5","isCorrect":false,"inputTokens":9287,"outputTokens":5,"latencyMs":1161.9437499999767},{"questionId":"q152","format":"csv","model":"claude-haiku-4-5-20251001","expected":"6","actual":"5","isCorrect":false,"inputTokens":9133,"outputTokens":5,"latencyMs":1660.2390840000007},{"questionId":"q152","format":"xml","model":"claude-haiku-4-5-20251001","expected":"6","actual":"3","isCorrect":false,"inputTokens":19812,"outputTokens":5,"latencyMs":1695.8838750000577},{"questionId":"q152","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"6","actual":"3","isCorrect":false,"inputTokens":14491,"outputTokens":5,"latencyMs":1330.4464579999913},{"questionId":"q153","format":"json","model":"claude-haiku-4-5-20251001","expected":"6","actual":"8","isCorrect":false,"inputTokens":17417,"outputTokens":5,"latencyMs":1165.8539170000004},{"questionId":"q153","format":"toon","model":"claude-haiku-4-5-20251001","expected":"6","actual":"12","isCorrect":false,"inputTokens":9287,"outputTokens":5,"latencyMs":1219.5936660000589},{"questionId":"q153","format":"csv","model":"claude-haiku-4-5-20251001","expected":"6","actual":"15","isCorrect":false,"inputTokens":9133,"outputTokens":5,"latencyMs":1039.9310840000398},{"questionId":"q1
53","format":"xml","model":"claude-haiku-4-5-20251001","expected":"6","actual":"12","isCorrect":false,"inputTokens":19812,"outputTokens":5,"latencyMs":1449.7293749999953},{"questionId":"q153","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"6","actual":"12","isCorrect":false,"inputTokens":14491,"outputTokens":5,"latencyMs":1152.4769170000218},{"questionId":"q154","format":"json","model":"claude-haiku-4-5-20251001","expected":"1","actual":"4","isCorrect":false,"inputTokens":17417,"outputTokens":5,"latencyMs":1336.482541999896},{"questionId":"q154","format":"toon","model":"claude-haiku-4-5-20251001","expected":"1","actual":"8","isCorrect":false,"inputTokens":9287,"outputTokens":5,"latencyMs":977.0528329999652},{"questionId":"q154","format":"csv","model":"claude-haiku-4-5-20251001","expected":"1","actual":"3","isCorrect":false,"inputTokens":9133,"outputTokens":5,"latencyMs":902.812249999959},{"questionId":"q154","format":"xml","model":"claude-haiku-4-5-20251001","expected":"1","actual":"5","isCorrect":false,"inputTokens":19812,"outputTokens":5,"latencyMs":1500.1625420000637},{"questionId":"q154","format":"yaml","model":"claude-haiku-4-5-20251001","expected":"1","actual":"4","isCorrect":false,"inputTokens":14491,"outputTokens":5,"latencyMs":1256.6253750000615}]
\ No newline at end of file
diff --git a/benchmarks/results/accuracy/models/gemini-2.5-flash b/benchmarks/results/accuracy/models/gemini-2.5-flash
new file mode 100644
index 0000000..ae973fc
--- /dev/null
+++ b/benchmarks/results/accuracy/models/gemini-2.5-flash
@@ -0,0 +1 @@
+[{"questionId":"q1","format":"json","model":"gemini-2.5-flash","expected":"56176","actual":"56176","isCorrect":true,"inputTokens":7919,"outputTokens":5,"latencyMs":1580.8025420000013},{"questionId":"q1","format":"toon","model":"gemini-2.5-flash","expected":"56176","actual":"56176","isCorrect":true,"inputTokens":3328,"outputTokens":5,"latencyMs":2867.703125},{"questionId":"q1","format":"csv","model":"gemini-2.5-flash","expected":"56176","actual":"56176","isCorrect":true,"inputTokens":3202,"outputTokens":5,"latencyMs":2539.5996659999983},{"questionId":"q1","format":"xml","model":"gemini-2.5-flash","expected":"56176","actual":"56176","isCorrect":true,"inputTokens":9108,"outputTokens":5,"latencyMs":2307.775125},{"questionId":"q1","format":"yaml","model":"gemini-2.5-flash","expected":"56176","actual":"56176","isCorrect":true,"inputTokens":5754,"outputTokens":5,"latencyMs":1417.2890419999985},{"questionId":"q2","format":"json","model":"gemini-2.5-flash","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":7919,"outputTokens":1,"latencyMs":1897.2195829999982},{"questionId":"q2","format":"toon","model":"gemini-2.5-flash","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":3328,"outputTokens":1,"latencyMs":2282.129084000002},{"questionId":"q2","format":"csv","model":"gemini-2.5-flash","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":3202,"outputTokens":1,"latencyMs":2625.893000000002},{"questionId":"q2","format":"xml","model":"gemini-2.5-flash","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":9108,"outputTokens":1,"latencyMs":1890.3851250000007},{"questionId":"q2","format":"yaml","model":"gemini-2.5-flash","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":5754,"outputTokens":1,"latencyMs":1308.6859579999982},{"questionId":"q3","format":"json","model":"gemini-2.5-flash","expected":"lorenza.kunze@yahoo.com","actual":"lorenza.kunze@yahoo.com","isCorrect":tr
ue,"inputTokens":7922,"outputTokens":10,"latencyMs":1049.5787920000002},{"questionId":"q3","format":"toon","model":"gemini-2.5-flash","expected":"lorenza.kunze@yahoo.com","actual":"lorenza.kunze@yahoo.com","isCorrect":true,"inputTokens":3331,"outputTokens":10,"latencyMs":1592.791291999998},{"questionId":"q3","format":"csv","model":"gemini-2.5-flash","expected":"lorenza.kunze@yahoo.com","actual":"lorenza.kunze@yahoo.com","isCorrect":true,"inputTokens":3205,"outputTokens":10,"latencyMs":1410.463166999998},{"questionId":"q3","format":"xml","model":"gemini-2.5-flash","expected":"lorenza.kunze@yahoo.com","actual":"lorenza.kunze@yahoo.com","isCorrect":true,"inputTokens":9111,"outputTokens":10,"latencyMs":2125.4789169999967},{"questionId":"q3","format":"yaml","model":"gemini-2.5-flash","expected":"lorenza.kunze@yahoo.com","actual":"lorenza.kunze@yahoo.com","isCorrect":true,"inputTokens":5757,"outputTokens":10,"latencyMs":847.513041000002},{"questionId":"q4","format":"json","model":"gemini-2.5-flash","expected":"22","actual":"22","isCorrect":true,"inputTokens":7922,"outputTokens":2,"latencyMs":1198.0414170000004},{"questionId":"q4","format":"toon","model":"gemini-2.5-flash","expected":"22","actual":"22","isCorrect":true,"inputTokens":3331,"outputTokens":2,"latencyMs":1986.697957999997},{"questionId":"q4","format":"csv","model":"gemini-2.5-flash","expected":"22","actual":"22","isCorrect":true,"inputTokens":3205,"outputTokens":2,"latencyMs":2343.3647090000013},{"questionId":"q4","format":"xml","model":"gemini-2.5-flash","expected":"22","actual":"22","isCorrect":true,"inputTokens":9111,"outputTokens":2,"latencyMs":1192.213541000001},{"questionId":"q4","format":"yaml","model":"gemini-2.5-flash","expected":"22","actual":"22","isCorrect":true,"inputTokens":5757,"outputTokens":2,"latencyMs":2922.936999999998},{"questionId":"q5","format":"json","model":"gemini-2.5-flash","expected":"no","actual":"No","isCorrect":true,"inputTokens":7917,"outputTokens":1,"latencyMs":2140.204707999997
5},{"questionId":"q5","format":"toon","model":"gemini-2.5-flash","expected":"no","actual":"false","isCorrect":true,"inputTokens":3326,"outputTokens":1,"latencyMs":1629.2954580000005},{"questionId":"q5","format":"csv","model":"gemini-2.5-flash","expected":"no","actual":"No","isCorrect":true,"inputTokens":3200,"outputTokens":1,"latencyMs":2408.795250000003},{"questionId":"q5","format":"xml","model":"gemini-2.5-flash","expected":"no","actual":"false","isCorrect":true,"inputTokens":9106,"outputTokens":1,"latencyMs":1480.3997920000002},{"questionId":"q5","format":"yaml","model":"gemini-2.5-flash","expected":"no","actual":"false","isCorrect":true,"inputTokens":5752,"outputTokens":1,"latencyMs":1532.7541250000031},{"questionId":"q6","format":"json","model":"gemini-2.5-flash","expected":"133081","actual":"133081","isCorrect":true,"inputTokens":7918,"outputTokens":6,"latencyMs":1475.3783750000002},{"questionId":"q6","format":"toon","model":"gemini-2.5-flash","expected":"133081","actual":"133081","isCorrect":true,"inputTokens":3327,"outputTokens":6,"latencyMs":1995.656415999998},{"questionId":"q6","format":"csv","model":"gemini-2.5-flash","expected":"133081","actual":"133081","isCorrect":true,"inputTokens":3201,"outputTokens":6,"latencyMs":1931.7989579999994},{"questionId":"q6","format":"xml","model":"gemini-2.5-flash","expected":"133081","actual":"133081","isCorrect":true,"inputTokens":9107,"outputTokens":6,"latencyMs":3334.5547499999957},{"questionId":"q6","format":"yaml","model":"gemini-2.5-flash","expected":"133081","actual":"133081","isCorrect":true,"inputTokens":5753,"outputTokens":6,"latencyMs":2780.8960829999996},{"questionId":"q7","format":"json","model":"gemini-2.5-flash","expected":"Engineering","actual":"Engineering","isCorrect":true,"inputTokens":7918,"outputTokens":1,"latencyMs":1574.4490840000071},{"questionId":"q7","format":"toon","model":"gemini-2.5-flash","expected":"Engineering","actual":"Engineering","isCorrect":true,"inputTokens":3327,"outputTokens":1,"la
tencyMs":2043.5142909999995},{"questionId":"q7","format":"csv","model":"gemini-2.5-flash","expected":"Engineering","actual":"Engineering","isCorrect":true,"inputTokens":3201,"outputTokens":1,"latencyMs":1822.5701659999904},{"questionId":"q7","format":"xml","model":"gemini-2.5-flash","expected":"Engineering","actual":"Engineering","isCorrect":true,"inputTokens":9107,"outputTokens":1,"latencyMs":3047.141208000001},{"questionId":"q7","format":"yaml","model":"gemini-2.5-flash","expected":"Engineering","actual":"Engineering","isCorrect":true,"inputTokens":5753,"outputTokens":1,"latencyMs":2789.5003339999967},{"questionId":"q8","format":"json","model":"gemini-2.5-flash","expected":"delpha.russel@gmail.com","actual":"delpha.russel@gmail.com","isCorrect":true,"inputTokens":7921,"outputTokens":9,"latencyMs":1268.8674579999933},{"questionId":"q8","format":"toon","model":"gemini-2.5-flash","expected":"delpha.russel@gmail.com","actual":"delpha.russel@gmail.com","isCorrect":true,"inputTokens":3330,"outputTokens":9,"latencyMs":2008.609540999998},{"questionId":"q8","format":"csv","model":"gemini-2.5-flash","expected":"delpha.russel@gmail.com","actual":"delpha.russel@gmail.com","isCorrect":true,"inputTokens":3204,"outputTokens":9,"latencyMs":2185.920166000011},{"questionId":"q8","format":"xml","model":"gemini-2.5-flash","expected":"delpha.russel@gmail.com","actual":"delpha.russel@gmail.com","isCorrect":true,"inputTokens":9110,"outputTokens":9,"latencyMs":1712.0016249999899},{"questionId":"q8","format":"yaml","model":"gemini-2.5-flash","expected":"delpha.russel@gmail.com","actual":"delpha.russel@gmail.com","isCorrect":true,"inputTokens":5756,"outputTokens":9,"latencyMs":1570.740249999988},{"questionId":"q9","format":"json","model":"gemini-2.5-flash","expected":"5","actual":"5","isCorrect":true,"inputTokens":7922,"outputTokens":1,"latencyMs":2304.852708999999},{"questionId":"q9","format":"toon","model":"gemini-2.5-flash","expected":"5","actual":"5","isCorrect":true,"inputTokens":3331
,"outputTokens":1,"latencyMs":2141.2800410000054},{"questionId":"q9","format":"csv","model":"gemini-2.5-flash","expected":"5","actual":"5","isCorrect":true,"inputTokens":3205,"outputTokens":1,"latencyMs":1798.9757500000123},{"questionId":"q9","format":"xml","model":"gemini-2.5-flash","expected":"5","actual":"5","isCorrect":true,"inputTokens":9111,"outputTokens":1,"latencyMs":1668.8819579999981},{"questionId":"q9","format":"yaml","model":"gemini-2.5-flash","expected":"5","actual":"5","isCorrect":true,"inputTokens":5757,"outputTokens":1,"latencyMs":1157.8254999999917},{"questionId":"q10","format":"json","model":"gemini-2.5-flash","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":7919,"outputTokens":1,"latencyMs":1796.344082999989},{"questionId":"q10","format":"toon","model":"gemini-2.5-flash","expected":"yes","actual":"true","isCorrect":true,"inputTokens":3328,"outputTokens":1,"latencyMs":1993.1539579999953},{"questionId":"q10","format":"csv","model":"gemini-2.5-flash","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":3202,"outputTokens":1,"latencyMs":2372.922833000004},{"questionId":"q10","format":"xml","model":"gemini-2.5-flash","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":9108,"outputTokens":1,"latencyMs":2659.180124999999},{"questionId":"q10","format":"yaml","model":"gemini-2.5-flash","expected":"yes","actual":"true","isCorrect":true,"inputTokens":5754,"outputTokens":1,"latencyMs":1727.6837499999965},{"questionId":"q11","format":"json","model":"gemini-2.5-flash","expected":"109064","actual":"109064","isCorrect":true,"inputTokens":7920,"outputTokens":6,"latencyMs":2257.4947910000046},{"questionId":"q11","format":"toon","model":"gemini-2.5-flash","expected":"109064","actual":"109064","isCorrect":true,"inputTokens":3329,"outputTokens":6,"latencyMs":3003.6368340000045},{"questionId":"q11","format":"csv","model":"gemini-2.5-flash","expected":"109064","actual":"109064","isCorrect":true,"inputTokens":3203,"outputTokens":6,"late
ncyMs":2079.012874999986},{"questionId":"q11","format":"xml","model":"gemini-2.5-flash","expected":"109064","actual":"109064","isCorrect":true,"inputTokens":9109,"outputTokens":6,"latencyMs":3172.8161669999827},{"questionId":"q11","format":"yaml","model":"gemini-2.5-flash","expected":"109064","actual":"109064","isCorrect":true,"inputTokens":5755,"outputTokens":6,"latencyMs":1663.5617499999935},{"questionId":"q12","format":"json","model":"gemini-2.5-flash","expected":"Operations","actual":"Operations","isCorrect":true,"inputTokens":7918,"outputTokens":1,"latencyMs":1486.9462919999787},{"questionId":"q12","format":"toon","model":"gemini-2.5-flash","expected":"Operations","actual":"Operations","isCorrect":true,"inputTokens":3327,"outputTokens":1,"latencyMs":2064.915875000006},{"questionId":"q12","format":"csv","model":"gemini-2.5-flash","expected":"Operations","actual":"Operations","isCorrect":true,"inputTokens":3201,"outputTokens":1,"latencyMs":1782.1797079999815},{"questionId":"q12","format":"xml","model":"gemini-2.5-flash","expected":"Operations","actual":"Operations","isCorrect":true,"inputTokens":9107,"outputTokens":1,"latencyMs":2877.4236670000246},{"questionId":"q12","format":"yaml","model":"gemini-2.5-flash","expected":"Operations","actual":"Operations","isCorrect":true,"inputTokens":5753,"outputTokens":1,"latencyMs":1450.604833999998},{"questionId":"q13","format":"json","model":"gemini-2.5-flash","expected":"henderson70@yahoo.com","actual":"henderson70@yahoo.com","isCorrect":true,"inputTokens":7919,"outputTokens":8,"latencyMs":1789.2915410000132},{"questionId":"q13","format":"toon","model":"gemini-2.5-flash","expected":"henderson70@yahoo.com","actual":"henderson70@yahoo.com","isCorrect":true,"inputTokens":3328,"outputTokens":8,"latencyMs":1828.2109579999815},{"questionId":"q13","format":"csv","model":"gemini-2.5-flash","expected":"henderson70@yahoo.com","actual":"henderson70@yahoo.com","isCorrect":true,"inputTokens":3202,"outputTokens":8,"latencyMs":2274.02216
60000097},{"questionId":"q13","format":"xml","model":"gemini-2.5-flash","expected":"henderson70@yahoo.com","actual":"henderson70@yahoo.com","isCorrect":true,"inputTokens":9108,"outputTokens":8,"latencyMs":1551.072165999998},{"questionId":"q13","format":"yaml","model":"gemini-2.5-flash","expected":"henderson70@yahoo.com","actual":"henderson70@yahoo.com","isCorrect":true,"inputTokens":5754,"outputTokens":8,"latencyMs":1516.6754580000124},{"questionId":"q14","format":"json","model":"gemini-2.5-flash","expected":"23","actual":"23","isCorrect":true,"inputTokens":7921,"outputTokens":2,"latencyMs":987.642125000013},{"questionId":"q14","format":"toon","model":"gemini-2.5-flash","expected":"23","actual":"23","isCorrect":true,"inputTokens":3330,"outputTokens":2,"latencyMs":1881.2354589999886},{"questionId":"q14","format":"csv","model":"gemini-2.5-flash","expected":"23","actual":"23","isCorrect":true,"inputTokens":3204,"outputTokens":2,"latencyMs":2056.3289579999982},{"questionId":"q14","format":"xml","model":"gemini-2.5-flash","expected":"23","actual":"23","isCorrect":true,"inputTokens":9110,"outputTokens":2,"latencyMs":2817.1784579999803},{"questionId":"q14","format":"yaml","model":"gemini-2.5-flash","expected":"23","actual":"23","isCorrect":true,"inputTokens":5756,"outputTokens":2,"latencyMs":2021.1640830000106},{"questionId":"q15","format":"json","model":"gemini-2.5-flash","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":7918,"outputTokens":1,"latencyMs":2359.317041000002},{"questionId":"q15","format":"toon","model":"gemini-2.5-flash","expected":"yes","actual":"true","isCorrect":true,"inputTokens":3327,"outputTokens":1,"latencyMs":1603.3399579999968},{"questionId":"q15","format":"csv","model":"gemini-2.5-flash","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":3201,"outputTokens":1,"latencyMs":2077.369040999998},{"questionId":"q15","format":"xml","model":"gemini-2.5-flash","expected":"yes","actual":"true","isCorrect":true,"inputTokens":9107,"outp
utTokens":1,"latencyMs":1973.6123329999973},{"questionId":"q15","format":"yaml","model":"gemini-2.5-flash","expected":"yes","actual":"true","isCorrect":true,"inputTokens":5753,"outputTokens":1,"latencyMs":2905.178583000001},{"questionId":"q16","format":"json","model":"gemini-2.5-flash","expected":"89436","actual":"89436","isCorrect":true,"inputTokens":7920,"outputTokens":5,"latencyMs":1644.398125000007},{"questionId":"q16","format":"toon","model":"gemini-2.5-flash","expected":"89436","actual":"89436","isCorrect":true,"inputTokens":3329,"outputTokens":5,"latencyMs":2492.454417000001},{"questionId":"q16","format":"csv","model":"gemini-2.5-flash","expected":"89436","actual":"89436","isCorrect":true,"inputTokens":3203,"outputTokens":5,"latencyMs":2201.9482499999867},{"questionId":"q16","format":"xml","model":"gemini-2.5-flash","expected":"89436","actual":"89436","isCorrect":true,"inputTokens":9109,"outputTokens":5,"latencyMs":2547.3975420000206},{"questionId":"q16","format":"yaml","model":"gemini-2.5-flash","expected":"89436","actual":"89436","isCorrect":true,"inputTokens":5755,"outputTokens":5,"latencyMs":2486.4542919999803},{"questionId":"q17","format":"json","model":"gemini-2.5-flash","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":7922,"outputTokens":1,"latencyMs":1902.1616669999785},{"questionId":"q17","format":"toon","model":"gemini-2.5-flash","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":3331,"outputTokens":1,"latencyMs":2365.476708000002},{"questionId":"q17","format":"csv","model":"gemini-2.5-flash","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":3205,"outputTokens":1,"latencyMs":1896.7083750000165},{"questionId":"q17","format":"xml","model":"gemini-2.5-flash","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":9111,"outputTokens":1,"latencyMs":1359.7857920000097},{"questionId":"q17","format":"yaml","model":"gemini-2.5-flash","expected":"Marketing","actual":
"Marketing","isCorrect":true,"inputTokens":5757,"outputTokens":1,"latencyMs":1431.5845419999969},{"questionId":"q18","format":"json","model":"gemini-2.5-flash","expected":"kelvin54@yahoo.com","actual":"kelvin54@yahoo.com","isCorrect":true,"inputTokens":7920,"outputTokens":8,"latencyMs":759.5840420000022},{"questionId":"q18","format":"toon","model":"gemini-2.5-flash","expected":"kelvin54@yahoo.com","actual":"kelvin54@yahoo.com","isCorrect":true,"inputTokens":3329,"outputTokens":8,"latencyMs":2147.4177079999936},{"questionId":"q18","format":"csv","model":"gemini-2.5-flash","expected":"kelvin54@yahoo.com","actual":"kelvin54@yahoo.com","isCorrect":true,"inputTokens":3203,"outputTokens":8,"latencyMs":2092.43862500001},{"questionId":"q18","format":"xml","model":"gemini-2.5-flash","expected":"kelvin54@yahoo.com","actual":"kelvin54@yahoo.com","isCorrect":true,"inputTokens":9109,"outputTokens":8,"latencyMs":718.7299160000111},{"questionId":"q18","format":"yaml","model":"gemini-2.5-flash","expected":"kelvin54@yahoo.com","actual":"kelvin54@yahoo.com","isCorrect":true,"inputTokens":5755,"outputTokens":8,"latencyMs":1636.8577919999952},{"questionId":"q19","format":"json","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":7922,"outputTokens":1,"latencyMs":1192.3323749999981},{"questionId":"q19","format":"toon","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":3331,"outputTokens":1,"latencyMs":2263.597584000003},{"questionId":"q19","format":"csv","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":3205,"outputTokens":1,"latencyMs":2004.369584},{"questionId":"q19","format":"xml","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":9111,"outputTokens":1,"latencyMs":2294.790500000003},{"questionId":"q19","format":"yaml","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":5757,"outputTokens":1,"latencyMs":969.157417
0000094},{"questionId":"q20","format":"json","model":"gemini-2.5-flash","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":7918,"outputTokens":1,"latencyMs":2640.4897919999785},{"questionId":"q20","format":"toon","model":"gemini-2.5-flash","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":3327,"outputTokens":1,"latencyMs":1759.7768330000108},{"questionId":"q20","format":"csv","model":"gemini-2.5-flash","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":3201,"outputTokens":1,"latencyMs":2334.4565000000002},{"questionId":"q20","format":"xml","model":"gemini-2.5-flash","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":9107,"outputTokens":1,"latencyMs":2929.883332999976},{"questionId":"q20","format":"yaml","model":"gemini-2.5-flash","expected":"yes","actual":"true","isCorrect":true,"inputTokens":5753,"outputTokens":1,"latencyMs":1895.0872079999826},{"questionId":"q21","format":"json","model":"gemini-2.5-flash","expected":"17","actual":"15","isCorrect":false,"inputTokens":7917,"outputTokens":2,"latencyMs":1336.9982920000039},{"questionId":"q21","format":"toon","model":"gemini-2.5-flash","expected":"17","actual":"15","isCorrect":false,"inputTokens":3326,"outputTokens":2,"latencyMs":1298.859541999991},{"questionId":"q21","format":"csv","model":"gemini-2.5-flash","expected":"17","actual":"17","isCorrect":true,"inputTokens":3200,"outputTokens":2,"latencyMs":3435.683584000013},{"questionId":"q21","format":"xml","model":"gemini-2.5-flash","expected":"17","actual":"17","isCorrect":true,"inputTokens":9106,"outputTokens":2,"latencyMs":7876.424250000011},{"questionId":"q21","format":"yaml","model":"gemini-2.5-flash","expected":"17","actual":"13","isCorrect":false,"inputTokens":5752,"outputTokens":2,"latencyMs":1269.880999999994},{"questionId":"q22","format":"json","model":"gemini-2.5-flash","expected":"17","actual":"15","isCorrect":false,"inputTokens":7917,"outputTokens":2,"latencyMs":1446.125},{"questionId":"q22","format":"toon","
model":"gemini-2.5-flash","expected":"17","actual":"14","isCorrect":false,"inputTokens":3326,"outputTokens":2,"latencyMs":1207.7055830000027},{"questionId":"q22","format":"csv","model":"gemini-2.5-flash","expected":"17","actual":"15","isCorrect":false,"inputTokens":3200,"outputTokens":2,"latencyMs":1461.5985829999845},{"questionId":"q22","format":"xml","model":"gemini-2.5-flash","expected":"17","actual":"17","isCorrect":true,"inputTokens":9106,"outputTokens":2,"latencyMs":7204.429625000019},{"questionId":"q22","format":"yaml","model":"gemini-2.5-flash","expected":"17","actual":"15","isCorrect":false,"inputTokens":5752,"outputTokens":2,"latencyMs":1322.6869590000133},{"questionId":"q23","format":"json","model":"gemini-2.5-flash","expected":"17","actual":"20","isCorrect":false,"inputTokens":7917,"outputTokens":2,"latencyMs":1226.6717910000007},{"questionId":"q23","format":"toon","model":"gemini-2.5-flash","expected":"17","actual":"13","isCorrect":false,"inputTokens":3326,"outputTokens":2,"latencyMs":1516.0925829999906},{"questionId":"q23","format":"csv","model":"gemini-2.5-flash","expected":"17","actual":"17","isCorrect":true,"inputTokens":3200,"outputTokens":2,"latencyMs":5486.268249999994},{"questionId":"q23","format":"xml","model":"gemini-2.5-flash","expected":"17","actual":"17","isCorrect":true,"inputTokens":9106,"outputTokens":2,"latencyMs":6651.016333000007},{"questionId":"q23","format":"yaml","model":"gemini-2.5-flash","expected":"17","actual":"15","isCorrect":false,"inputTokens":5752,"outputTokens":2,"latencyMs":828.4747079999943},{"questionId":"q24","format":"json","model":"gemini-2.5-flash","expected":"17","actual":"12","isCorrect":false,"inputTokens":7917,"outputTokens":2,"latencyMs":1042.7079169999924},{"questionId":"q24","format":"toon","model":"gemini-2.5-flash","expected":"17","actual":"12","isCorrect":false,"inputTokens":3326,"outputTokens":2,"latencyMs":1212.9243750000023},{"questionId":"q24","format":"csv","model":"gemini-2.5-flash","expected":"17","
actual":"12","isCorrect":false,"inputTokens":3200,"outputTokens":2,"latencyMs":1512.6335419999668},{"questionId":"q24","format":"xml","model":"gemini-2.5-flash","expected":"17","actual":"10","isCorrect":false,"inputTokens":9106,"outputTokens":2,"latencyMs":912.212332999974},{"questionId":"q24","format":"yaml","model":"gemini-2.5-flash","expected":"17","actual":"13","isCorrect":false,"inputTokens":5752,"outputTokens":2,"latencyMs":959.9500000000116},{"questionId":"q25","format":"json","model":"gemini-2.5-flash","expected":"16","actual":"15","isCorrect":false,"inputTokens":7917,"outputTokens":2,"latencyMs":1017.4402499999851},{"questionId":"q25","format":"toon","model":"gemini-2.5-flash","expected":"16","actual":"12","isCorrect":false,"inputTokens":3326,"outputTokens":2,"latencyMs":962.3897500000312},{"questionId":"q25","format":"csv","model":"gemini-2.5-flash","expected":"16","actual":"16","isCorrect":true,"inputTokens":3200,"outputTokens":2,"latencyMs":1976.8645840000245},{"questionId":"q25","format":"xml","model":"gemini-2.5-flash","expected":"16","actual":"16","isCorrect":true,"inputTokens":9106,"outputTokens":2,"latencyMs":2326.5566249999683},{"questionId":"q25","format":"yaml","model":"gemini-2.5-flash","expected":"16","actual":"10","isCorrect":false,"inputTokens":5752,"outputTokens":2,"latencyMs":880.8229170000413},{"questionId":"q26","format":"json","model":"gemini-2.5-flash","expected":"16","actual":"14","isCorrect":false,"inputTokens":7917,"outputTokens":2,"latencyMs":1804.8319999999949},{"questionId":"q26","format":"toon","model":"gemini-2.5-flash","expected":"16","actual":"16","isCorrect":true,"inputTokens":3326,"outputTokens":2,"latencyMs":1348.0182920000516},{"questionId":"q26","format":"csv","model":"gemini-2.5-flash","expected":"16","actual":"16","isCorrect":true,"inputTokens":3200,"outputTokens":2,"latencyMs":3684.9718749999884},{"questionId":"q26","format":"xml","model":"gemini-2.5-flash","expected":"16","actual":"16","isCorrect":true,"inputTokens":9
106,"outputTokens":2,"latencyMs":2922.235667000001},{"questionId":"q26","format":"yaml","model":"gemini-2.5-flash","expected":"16","actual":"16","isCorrect":true,"inputTokens":5752,"outputTokens":2,"latencyMs":1175.5659999999916},{"questionId":"q27","format":"json","model":"gemini-2.5-flash","expected":"91","actual":"90","isCorrect":false,"inputTokens":7925,"outputTokens":2,"latencyMs":1788.3809580000234},{"questionId":"q27","format":"toon","model":"gemini-2.5-flash","expected":"91","actual":"91","isCorrect":true,"inputTokens":3334,"outputTokens":2,"latencyMs":13952.248708},{"questionId":"q27","format":"csv","model":"gemini-2.5-flash","expected":"91","actual":"91","isCorrect":true,"inputTokens":3208,"outputTokens":2,"latencyMs":31483.87050000002},{"questionId":"q27","format":"xml","model":"gemini-2.5-flash","expected":"91","actual":"91","isCorrect":true,"inputTokens":9114,"outputTokens":2,"latencyMs":19297.29920900002},{"questionId":"q27","format":"yaml","model":"gemini-2.5-flash","expected":"91","actual":"88","isCorrect":false,"inputTokens":5760,"outputTokens":2,"latencyMs":1269.508291999984},{"questionId":"q28","format":"json","model":"gemini-2.5-flash","expected":"67","actual":"69","isCorrect":false,"inputTokens":7925,"outputTokens":2,"latencyMs":953.9526249999763},{"questionId":"q28","format":"toon","model":"gemini-2.5-flash","expected":"67","actual":"67","isCorrect":true,"inputTokens":3334,"outputTokens":2,"latencyMs":12068.155332999944},{"questionId":"q28","format":"csv","model":"gemini-2.5-flash","expected":"67","actual":"67","isCorrect":true,"inputTokens":3208,"outputTokens":2,"latencyMs":2511.149374999979},{"questionId":"q28","format":"xml","model":"gemini-2.5-flash","expected":"67","actual":"67","isCorrect":true,"inputTokens":9114,"outputTokens":2,"latencyMs":17922.917750000022},{"questionId":"q28","format":"yaml","model":"gemini-2.5-flash","expected":"67","actual":"69","isCorrect":false,"inputTokens":5760,"outputTokens":2,"latencyMs":1026.073458000028},{"
questionId":"q29","format":"json","model":"gemini-2.5-flash","expected":"41","actual":"43","isCorrect":false,"inputTokens":7926,"outputTokens":2,"latencyMs":1400.5033750000293},{"questionId":"q29","format":"toon","model":"gemini-2.5-flash","expected":"41","actual":"37","isCorrect":false,"inputTokens":3335,"outputTokens":2,"latencyMs":9504.683958000038},{"questionId":"q29","format":"csv","model":"gemini-2.5-flash","expected":"41","actual":"41","isCorrect":true,"inputTokens":3209,"outputTokens":2,"latencyMs":31964.216459000017},{"questionId":"q29","format":"xml","model":"gemini-2.5-flash","expected":"41","actual":"41","isCorrect":true,"inputTokens":9115,"outputTokens":2,"latencyMs":16441.10595899995},{"questionId":"q29","format":"yaml","model":"gemini-2.5-flash","expected":"41","actual":"40","isCorrect":false,"inputTokens":5761,"outputTokens":2,"latencyMs":811.9641249999986},{"questionId":"q30","format":"json","model":"gemini-2.5-flash","expected":"26","actual":"30","isCorrect":false,"inputTokens":7926,"outputTokens":2,"latencyMs":1412.7924169999897},{"questionId":"q30","format":"toon","model":"gemini-2.5-flash","expected":"26","actual":"26","isCorrect":true,"inputTokens":3335,"outputTokens":2,"latencyMs":10758.259375000023},{"questionId":"q30","format":"csv","model":"gemini-2.5-flash","expected":"26","actual":"26","isCorrect":true,"inputTokens":3209,"outputTokens":2,"latencyMs":24627.311499999953},{"questionId":"q30","format":"xml","model":"gemini-2.5-flash","expected":"26","actual":"26","isCorrect":true,"inputTokens":9115,"outputTokens":2,"latencyMs":13618.046457999968},{"questionId":"q30","format":"yaml","model":"gemini-2.5-flash","expected":"26","actual":"30","isCorrect":false,"inputTokens":5761,"outputTokens":2,"latencyMs":794.5352500000154},{"questionId":"q31","format":"json","model":"gemini-2.5-flash","expected":"100","actual":"100","isCorrect":true,"inputTokens":7918,"outputTokens":3,"latencyMs":2013.5282910000533},{"questionId":"q31","format":"toon","model":"
gemini-2.5-flash","expected":"100","actual":"100","isCorrect":true,"inputTokens":3327,"outputTokens":3,"latencyMs":2192.8402500000084},{"questionId":"q31","format":"csv","model":"gemini-2.5-flash","expected":"100","actual":"100","isCorrect":true,"inputTokens":3201,"outputTokens":3,"latencyMs":2184.3993330000085},{"questionId":"q31","format":"xml","model":"gemini-2.5-flash","expected":"100","actual":"100","isCorrect":true,"inputTokens":9107,"outputTokens":3,"latencyMs":6393.259415999986},{"questionId":"q31","format":"yaml","model":"gemini-2.5-flash","expected":"100","actual":"100","isCorrect":true,"inputTokens":5753,"outputTokens":3,"latencyMs":1999.2310840000282},{"questionId":"q32","format":"json","model":"gemini-2.5-flash","expected":"96503","actual":"99999.99","isCorrect":false,"inputTokens":7919,"outputTokens":8,"latencyMs":54535.416708000004},{"questionId":"q32","format":"toon","model":"gemini-2.5-flash","expected":"96503","actual":"99999.99","isCorrect":false,"inputTokens":3328,"outputTokens":8,"latencyMs":16747.45308399998},{"questionId":"q32","format":"csv","model":"gemini-2.5-flash","expected":"96503","actual":"97900","isCorrect":false,"inputTokens":3202,"outputTokens":5,"latencyMs":11352.681582999998},{"questionId":"q32","format":"xml","model":"gemini-2.5-flash","expected":"96503","actual":"99999.99","isCorrect":false,"inputTokens":9108,"outputTokens":8,"latencyMs":61491.59616699995},{"questionId":"q32","format":"yaml","model":"gemini-2.5-flash","expected":"96503","actual":"98999.00","isCorrect":false,"inputTokens":5754,"outputTokens":8,"latencyMs":11206.51449999999},{"questionId":"q33","format":"json","model":"gemini-2.5-flash","expected":"78","actual":"78","isCorrect":true,"inputTokens":7916,"outputTokens":2,"latencyMs":11325.233707999985},{"questionId":"q33","format":"toon","model":"gemini-2.5-flash","expected":"78","actual":"78","isCorrect":true,"inputTokens":3325,"outputTokens":2,"latencyMs":11204.934750000015},{"questionId":"q33","format":"csv","mode
l":"gemini-2.5-flash","expected":"78","actual":"78","isCorrect":true,"inputTokens":3199,"outputTokens":2,"latencyMs":6446.144792000006},{"questionId":"q33","format":"xml","model":"gemini-2.5-flash","expected":"78","actual":"78","isCorrect":true,"inputTokens":9105,"outputTokens":2,"latencyMs":10947.451666999958},{"questionId":"q33","format":"yaml","model":"gemini-2.5-flash","expected":"78","actual":"78","isCorrect":true,"inputTokens":5751,"outputTokens":2,"latencyMs":12029.519084000029},{"questionId":"q34","format":"json","model":"gemini-2.5-flash","expected":"22","actual":"22","isCorrect":true,"inputTokens":7916,"outputTokens":2,"latencyMs":7387.169915999984},{"questionId":"q34","format":"toon","model":"gemini-2.5-flash","expected":"22","actual":"22","isCorrect":true,"inputTokens":3325,"outputTokens":2,"latencyMs":6514.137707999966},{"questionId":"q34","format":"csv","model":"gemini-2.5-flash","expected":"22","actual":"22","isCorrect":true,"inputTokens":3199,"outputTokens":2,"latencyMs":4721.180499999959},{"questionId":"q34","format":"xml","model":"gemini-2.5-flash","expected":"22","actual":"22","isCorrect":true,"inputTokens":9105,"outputTokens":2,"latencyMs":8046.83820899995},{"questionId":"q34","format":"yaml","model":"gemini-2.5-flash","expected":"22","actual":"22","isCorrect":true,"inputTokens":5751,"outputTokens":2,"latencyMs":8248.850250000018},{"questionId":"q35","format":"json","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":7927,"outputTokens":2,"latencyMs":4517.738541000057},{"questionId":"q35","format":"toon","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":3336,"outputTokens":2,"latencyMs":3835.7736659999937},{"questionId":"q35","format":"csv","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":3210,"outputTokens":2,"latencyMs":7546.540417000011},{"questionId":"q35","format":"xml","model":"gemini-2.5-flash","expected":"12","actual":"12","isC
orrect":true,"inputTokens":9116,"outputTokens":2,"latencyMs":4546.85616700002},{"questionId":"q35","format":"yaml","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":5762,"outputTokens":2,"latencyMs":4353.9369169999845},{"questionId":"q36","format":"json","model":"gemini-2.5-flash","expected":"11","actual":"10","isCorrect":false,"inputTokens":7927,"outputTokens":2,"latencyMs":4675.304791000031},{"questionId":"q36","format":"toon","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":3336,"outputTokens":2,"latencyMs":5651.744499999972},{"questionId":"q36","format":"csv","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":3210,"outputTokens":2,"latencyMs":15644.225792000012},{"questionId":"q36","format":"xml","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":9116,"outputTokens":2,"latencyMs":15097.416624999954},{"questionId":"q36","format":"yaml","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":5762,"outputTokens":2,"latencyMs":15382.696792000032},{"questionId":"q37","format":"json","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":7927,"outputTokens":2,"latencyMs":5676.599832999986},{"questionId":"q37","format":"toon","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":3336,"outputTokens":2,"latencyMs":7216.915082999971},{"questionId":"q37","format":"csv","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":3210,"outputTokens":2,"latencyMs":29343.455083999957},{"questionId":"q37","format":"xml","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":9116,"outputTokens":2,"latencyMs":5380.1327920000185},{"questionId":"q37","format":"yaml","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":5762,"outputTokens":2,"l
atencyMs":4498.144790999999},{"questionId":"q38","format":"json","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":7927,"outputTokens":2,"latencyMs":7617.367333000002},{"questionId":"q38","format":"toon","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":3336,"outputTokens":2,"latencyMs":5833.487250000006},{"questionId":"q38","format":"csv","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":3210,"outputTokens":2,"latencyMs":9072.47454200004},{"questionId":"q38","format":"xml","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":9116,"outputTokens":2,"latencyMs":4756.099082999979},{"questionId":"q38","format":"yaml","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":5762,"outputTokens":2,"latencyMs":4903.0258330000215},{"questionId":"q39","format":"json","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":7927,"outputTokens":2,"latencyMs":4578.6231670000125},{"questionId":"q39","format":"toon","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":3336,"outputTokens":2,"latencyMs":5856.308624999947},{"questionId":"q39","format":"csv","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":3210,"outputTokens":2,"latencyMs":5556.784915999975},{"questionId":"q39","format":"xml","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":9116,"outputTokens":2,"latencyMs":4543.255791000032},{"questionId":"q39","format":"yaml","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":5762,"outputTokens":2,"latencyMs":4021.18316700001},{"questionId":"q40","format":"json","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":7927,"outputTokens":2,"latencyMs":5298.731874999998},{"questionId":"q40","format"
:"toon","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":3336,"outputTokens":2,"latencyMs":4127.492541000014},{"questionId":"q40","format":"csv","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":3210,"outputTokens":2,"latencyMs":4782.160832999973},{"questionId":"q40","format":"xml","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":9116,"outputTokens":2,"latencyMs":4922.576082999993},{"questionId":"q40","format":"yaml","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":5762,"outputTokens":2,"latencyMs":5238.765000000014},{"questionId":"q41","format":"json","model":"gemini-2.5-flash","expected":"63","actual":"63","isCorrect":true,"inputTokens":7923,"outputTokens":2,"latencyMs":17932.786165999947},{"questionId":"q41","format":"toon","model":"gemini-2.5-flash","expected":"63","actual":"63","isCorrect":true,"inputTokens":3332,"outputTokens":2,"latencyMs":18237.46525000001},{"questionId":"q41","format":"csv","model":"gemini-2.5-flash","expected":"63","actual":"63","isCorrect":true,"inputTokens":3206,"outputTokens":2,"latencyMs":19649.450375000015},{"questionId":"q41","format":"xml","model":"gemini-2.5-flash","expected":"63","actual":"63","isCorrect":true,"inputTokens":9112,"outputTokens":2,"latencyMs":23083.841791999992},{"questionId":"q41","format":"yaml","model":"gemini-2.5-flash","expected":"63","actual":"63","isCorrect":true,"inputTokens":5758,"outputTokens":2,"latencyMs":20296.590874999994},{"questionId":"q42","format":"json","model":"gemini-2.5-flash","expected":"53","actual":"50","isCorrect":false,"inputTokens":7924,"outputTokens":2,"latencyMs":1903.3749170000083},{"questionId":"q42","format":"toon","model":"gemini-2.5-flash","expected":"53","actual":"53","isCorrect":true,"inputTokens":3333,"outputTokens":2,"latencyMs":18509.53691700002},{"questionId":"q42","format":"csv","model":"gemini-2.5-flash","expected":"53","
actual":"53","isCorrect":true,"inputTokens":3207,"outputTokens":2,"latencyMs":46038.66166599997},{"questionId":"q42","format":"xml","model":"gemini-2.5-flash","expected":"53","actual":"53","isCorrect":true,"inputTokens":9113,"outputTokens":2,"latencyMs":22486.269542000024},{"questionId":"q42","format":"yaml","model":"gemini-2.5-flash","expected":"53","actual":"53","isCorrect":true,"inputTokens":5759,"outputTokens":2,"latencyMs":15517.338458999991},{"questionId":"q43","format":"json","model":"gemini-2.5-flash","expected":"39","actual":"39","isCorrect":true,"inputTokens":7924,"outputTokens":2,"latencyMs":18121.96908400004},{"questionId":"q43","format":"toon","model":"gemini-2.5-flash","expected":"39","actual":"39","isCorrect":true,"inputTokens":3333,"outputTokens":2,"latencyMs":18681.138999999966},{"questionId":"q43","format":"csv","model":"gemini-2.5-flash","expected":"39","actual":"39","isCorrect":true,"inputTokens":3207,"outputTokens":2,"latencyMs":20632.628624999954},{"questionId":"q43","format":"xml","model":"gemini-2.5-flash","expected":"39","actual":"39","isCorrect":true,"inputTokens":9113,"outputTokens":2,"latencyMs":16490.62816699996},{"questionId":"q43","format":"yaml","model":"gemini-2.5-flash","expected":"39","actual":"39","isCorrect":true,"inputTokens":5759,"outputTokens":2,"latencyMs":17733.74337499996},{"questionId":"q44","format":"json","model":"gemini-2.5-flash","expected":"16","actual":"16","isCorrect":true,"inputTokens":7924,"outputTokens":2,"latencyMs":16902.36629199999},{"questionId":"q44","format":"toon","model":"gemini-2.5-flash","expected":"16","actual":"16","isCorrect":true,"inputTokens":3333,"outputTokens":2,"latencyMs":16215.257083000033},{"questionId":"q44","format":"csv","model":"gemini-2.5-flash","expected":"16","actual":"16","isCorrect":true,"inputTokens":3207,"outputTokens":2,"latencyMs":17489.787332999986},{"questionId":"q44","format":"xml","model":"gemini-2.5-flash","expected":"16","actual":"16","isCorrect":true,"inputTokens":9113,"ou
tputTokens":2,"latencyMs":16175.736791000061},{"questionId":"q44","format":"yaml","model":"gemini-2.5-flash","expected":"16","actual":"16","isCorrect":true,"inputTokens":5759,"outputTokens":2,"latencyMs":16000.474625000032},{"questionId":"q45","format":"json","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":7925,"outputTokens":2,"latencyMs":4217.764250000007},{"questionId":"q45","format":"toon","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":3334,"outputTokens":2,"latencyMs":5123.092916999944},{"questionId":"q45","format":"csv","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":3208,"outputTokens":2,"latencyMs":25969.879791999934},{"questionId":"q45","format":"xml","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":9114,"outputTokens":2,"latencyMs":3050.6897079999326},{"questionId":"q45","format":"yaml","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":5760,"outputTokens":2,"latencyMs":4602.8282500000205},{"questionId":"q46","format":"json","model":"gemini-2.5-flash","expected":"8","actual":"8","isCorrect":true,"inputTokens":7925,"outputTokens":1,"latencyMs":8172.519958000048},{"questionId":"q46","format":"toon","model":"gemini-2.5-flash","expected":"8","actual":"8","isCorrect":true,"inputTokens":3334,"outputTokens":1,"latencyMs":7867.166792000062},{"questionId":"q46","format":"csv","model":"gemini-2.5-flash","expected":"8","actual":"10","isCorrect":false,"inputTokens":3208,"outputTokens":2,"latencyMs":1507.8348329999717},{"questionId":"q46","format":"xml","model":"gemini-2.5-flash","expected":"8","actual":"8","isCorrect":true,"inputTokens":9114,"outputTokens":1,"latencyMs":13500.896540999995},{"questionId":"q46","format":"yaml","model":"gemini-2.5-flash","expected":"8","actual":"10","isCorrect":false,"inputTokens":5760,"outputTokens":2,"latencyMs":1825.4723330000415},{"question
Id":"q47","format":"json","model":"gemini-2.5-flash","expected":"15","actual":"15","isCorrect":true,"inputTokens":7925,"outputTokens":2,"latencyMs":4594.185750000062},{"questionId":"q47","format":"toon","model":"gemini-2.5-flash","expected":"15","actual":"15","isCorrect":true,"inputTokens":3334,"outputTokens":2,"latencyMs":5401.057666999986},{"questionId":"q47","format":"csv","model":"gemini-2.5-flash","expected":"15","actual":"16","isCorrect":false,"inputTokens":3208,"outputTokens":2,"latencyMs":26074.14720799995},{"questionId":"q47","format":"xml","model":"gemini-2.5-flash","expected":"15","actual":"15","isCorrect":true,"inputTokens":9114,"outputTokens":2,"latencyMs":14245.247207999928},{"questionId":"q47","format":"yaml","model":"gemini-2.5-flash","expected":"15","actual":"14","isCorrect":false,"inputTokens":5760,"outputTokens":2,"latencyMs":3870.017416999908},{"questionId":"q48","format":"json","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":7918,"outputTokens":2,"latencyMs":3650.8690829999978},{"questionId":"q48","format":"toon","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":3327,"outputTokens":2,"latencyMs":3402.7862920000916},{"questionId":"q48","format":"csv","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":3201,"outputTokens":2,"latencyMs":3921.5722079999978},{"questionId":"q48","format":"xml","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":9107,"outputTokens":2,"latencyMs":3262.6539999999804},{"questionId":"q48","format":"yaml","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":5753,"outputTokens":2,"latencyMs":4057.5767090000445},{"questionId":"q49","format":"json","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":7918,"outputTokens":2,"latencyMs":3481.189250000054},{"questionId":"q49","format":"toon","model":"gemini-2.5-fla
sh","expected":"11","actual":"11","isCorrect":true,"inputTokens":3327,"outputTokens":2,"latencyMs":15453.611749999924},{"questionId":"q49","format":"csv","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":3201,"outputTokens":2,"latencyMs":14601.671291999985},{"questionId":"q49","format":"xml","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":9107,"outputTokens":2,"latencyMs":9624.921040999936},{"questionId":"q49","format":"yaml","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":5753,"outputTokens":2,"latencyMs":10109.80458399991},{"questionId":"q50","format":"json","model":"gemini-2.5-flash","expected":"14","actual":"14","isCorrect":true,"inputTokens":7918,"outputTokens":2,"latencyMs":4497.873082999955},{"questionId":"q50","format":"toon","model":"gemini-2.5-flash","expected":"14","actual":"14","isCorrect":true,"inputTokens":3327,"outputTokens":2,"latencyMs":5359.112917000079},{"questionId":"q50","format":"csv","model":"gemini-2.5-flash","expected":"14","actual":"14","isCorrect":true,"inputTokens":3201,"outputTokens":2,"latencyMs":4170.251584000071},{"questionId":"q50","format":"xml","model":"gemini-2.5-flash","expected":"14","actual":"14","isCorrect":true,"inputTokens":9107,"outputTokens":2,"latencyMs":2887.9759579999372},{"questionId":"q50","format":"yaml","model":"gemini-2.5-flash","expected":"14","actual":"14","isCorrect":true,"inputTokens":5753,"outputTokens":2,"latencyMs":1299.5940420000115},{"questionId":"q51","format":"json","model":"gemini-2.5-flash","expected":"96.17","actual":"96.17","isCorrect":true,"inputTokens":12112,"outputTokens":5,"latencyMs":1580.6155839998974},{"questionId":"q51","format":"toon","model":"gemini-2.5-flash","expected":"96.17","actual":"96.17","isCorrect":true,"inputTokens":7200,"outputTokens":5,"latencyMs":1518.9282910000766},{"questionId":"q51","format":"csv","model":"gemini-2.5-flash","expected":"96.17","actual":"96.17"
,"isCorrect":true,"inputTokens":7837,"outputTokens":5,"latencyMs":2111.833250000025},{"questionId":"q51","format":"xml","model":"gemini-2.5-flash","expected":"96.17","actual":"96.17","isCorrect":true,"inputTokens":13450,"outputTokens":5,"latencyMs":1920.7146250000224},{"questionId":"q51","format":"yaml","model":"gemini-2.5-flash","expected":"96.17","actual":"96.17","isCorrect":true,"inputTokens":8426,"outputTokens":5,"latencyMs":1506.8597079999745},{"questionId":"q52","format":"json","model":"gemini-2.5-flash","expected":"shipped","actual":"shipped","isCorrect":true,"inputTokens":12112,"outputTokens":2,"latencyMs":2831.41875000007},{"questionId":"q52","format":"toon","model":"gemini-2.5-flash","expected":"shipped","actual":"shipped","isCorrect":true,"inputTokens":7200,"outputTokens":2,"latencyMs":1911.1040419999044},{"questionId":"q52","format":"csv","model":"gemini-2.5-flash","expected":"shipped","actual":"shipped","isCorrect":true,"inputTokens":7837,"outputTokens":2,"latencyMs":2831.7014169999165},{"questionId":"q52","format":"xml","model":"gemini-2.5-flash","expected":"shipped","actual":"shipped","isCorrect":true,"inputTokens":13450,"outputTokens":2,"latencyMs":1906.5173339999747},{"questionId":"q52","format":"yaml","model":"gemini-2.5-flash","expected":"shipped","actual":"shipped","isCorrect":true,"inputTokens":8426,"outputTokens":2,"latencyMs":1743.2425410000142},{"questionId":"q53","format":"json","model":"gemini-2.5-flash","expected":"599.39","actual":"599.39","isCorrect":true,"inputTokens":12112,"outputTokens":6,"latencyMs":2252.346375000081},{"questionId":"q53","format":"toon","model":"gemini-2.5-flash","expected":"599.39","actual":"599.39","isCorrect":true,"inputTokens":7200,"outputTokens":6,"latencyMs":1312.1469170000637},{"questionId":"q53","format":"csv","model":"gemini-2.5-flash","expected":"599.39","actual":"599.39","isCorrect":true,"inputTokens":7837,"outputTokens":6,"latencyMs":1317.630625000107},{"questionId":"q53","format":"xml","model":"gemini-2.
5-flash","expected":"599.39","actual":"599.39","isCorrect":true,"inputTokens":13450,"outputTokens":6,"latencyMs":2282.068250000011},{"questionId":"q53","format":"yaml","model":"gemini-2.5-flash","expected":"599.39","actual":"599.39","isCorrect":true,"inputTokens":8426,"outputTokens":6,"latencyMs":1511.6479169999948},{"questionId":"q54","format":"json","model":"gemini-2.5-flash","expected":"processing","actual":"processing","isCorrect":true,"inputTokens":12112,"outputTokens":1,"latencyMs":2419.9388749999925},{"questionId":"q54","format":"toon","model":"gemini-2.5-flash","expected":"processing","actual":"processing","isCorrect":true,"inputTokens":7200,"outputTokens":1,"latencyMs":880.9003749999683},{"questionId":"q54","format":"csv","model":"gemini-2.5-flash","expected":"processing","actual":"processing","isCorrect":true,"inputTokens":7837,"outputTokens":1,"latencyMs":2055.271083999891},{"questionId":"q54","format":"xml","model":"gemini-2.5-flash","expected":"processing","actual":"processing","isCorrect":true,"inputTokens":13450,"outputTokens":1,"latencyMs":1624.9449160000077},{"questionId":"q54","format":"yaml","model":"gemini-2.5-flash","expected":"processing","actual":"processing","isCorrect":true,"inputTokens":8426,"outputTokens":1,"latencyMs":2154.905582999927},{"questionId":"q55","format":"json","model":"gemini-2.5-flash","expected":"528.71","actual":"528.71","isCorrect":true,"inputTokens":12112,"outputTokens":6,"latencyMs":1692.3707080000313},{"questionId":"q55","format":"toon","model":"gemini-2.5-flash","expected":"528.71","actual":"528.71","isCorrect":true,"inputTokens":7200,"outputTokens":6,"latencyMs":1140.3244580000173},{"questionId":"q55","format":"csv","model":"gemini-2.5-flash","expected":"528.71","actual":"528.71","isCorrect":true,"inputTokens":7837,"outputTokens":6,"latencyMs":1543.924457999994},{"questionId":"q55","format":"xml","model":"gemini-2.5-flash","expected":"528.71","actual":"528.71","isCorrect":true,"inputTokens":13450,"outputTokens":6,"lat
encyMs":1619.371291999938},{"questionId":"q55","format":"yaml","model":"gemini-2.5-flash","expected":"528.71","actual":"528.71","isCorrect":true,"inputTokens":8426,"outputTokens":6,"latencyMs":1429.8673750000307},{"questionId":"q56","format":"json","model":"gemini-2.5-flash","expected":"pending","actual":"pending","isCorrect":true,"inputTokens":12112,"outputTokens":1,"latencyMs":1333.9458340000128},{"questionId":"q56","format":"toon","model":"gemini-2.5-flash","expected":"pending","actual":"pending","isCorrect":true,"inputTokens":7200,"outputTokens":1,"latencyMs":1329.0306669999845},{"questionId":"q56","format":"csv","model":"gemini-2.5-flash","expected":"pending","actual":"pending","isCorrect":true,"inputTokens":7837,"outputTokens":1,"latencyMs":1847.7373340000631},{"questionId":"q56","format":"xml","model":"gemini-2.5-flash","expected":"pending","actual":"pending","isCorrect":true,"inputTokens":13450,"outputTokens":1,"latencyMs":1537.011083999998},{"questionId":"q56","format":"yaml","model":"gemini-2.5-flash","expected":"pending","actual":"pending","isCorrect":true,"inputTokens":8426,"outputTokens":1,"latencyMs":2268.63049999997},{"questionId":"q57","format":"json","model":"gemini-2.5-flash","expected":"1687.82","actual":"1687.82","isCorrect":true,"inputTokens":12112,"outputTokens":7,"latencyMs":1520.7294589999365},{"questionId":"q57","format":"toon","model":"gemini-2.5-flash","expected":"1687.82","actual":"1687.82","isCorrect":true,"inputTokens":7200,"outputTokens":7,"latencyMs":1522.8311669999966},{"questionId":"q57","format":"csv","model":"gemini-2.5-flash","expected":"1687.82","actual":"1687.82","isCorrect":true,"inputTokens":7837,"outputTokens":7,"latencyMs":3810.9223340000026},{"questionId":"q57","format":"xml","model":"gemini-2.5-flash","expected":"1687.82","actual":"1687.82","isCorrect":true,"inputTokens":13450,"outputTokens":7,"latencyMs":1851.185999999987},{"questionId":"q57","format":"yaml","model":"gemini-2.5-flash","expected":"1687.82","actual":"1687.
82","isCorrect":true,"inputTokens":8426,"outputTokens":7,"latencyMs":1594.5812090000836},{"questionId":"q58","format":"json","model":"gemini-2.5-flash","expected":"cancelled","actual":"cancelled","isCorrect":true,"inputTokens":12112,"outputTokens":1,"latencyMs":1031.888292000047},{"questionId":"q58","format":"toon","model":"gemini-2.5-flash","expected":"cancelled","actual":"cancelled","isCorrect":true,"inputTokens":7200,"outputTokens":1,"latencyMs":1181.2981669999426},{"questionId":"q58","format":"csv","model":"gemini-2.5-flash","expected":"cancelled","actual":"cancelled","isCorrect":true,"inputTokens":7837,"outputTokens":1,"latencyMs":3704.4069169999566},{"questionId":"q58","format":"xml","model":"gemini-2.5-flash","expected":"cancelled","actual":"cancelled","isCorrect":true,"inputTokens":13450,"outputTokens":1,"latencyMs":1588.0394999999553},{"questionId":"q58","format":"yaml","model":"gemini-2.5-flash","expected":"cancelled","actual":"cancelled","isCorrect":true,"inputTokens":8426,"outputTokens":1,"latencyMs":1285.1504160000477},{"questionId":"q59","format":"json","model":"gemini-2.5-flash","expected":"Dr. Courtney Satterfield","actual":"Dr. Courtney Satterfield","isCorrect":true,"inputTokens":12113,"outputTokens":6,"latencyMs":2122.7909169999184},{"questionId":"q59","format":"toon","model":"gemini-2.5-flash","expected":"Dr. Courtney Satterfield","actual":"Dr. Courtney Satterfield","isCorrect":true,"inputTokens":7201,"outputTokens":6,"latencyMs":1173.7912090000464},{"questionId":"q59","format":"csv","model":"gemini-2.5-flash","expected":"Dr. Courtney Satterfield","actual":"Dr. Courtney Satterfield","isCorrect":true,"inputTokens":7838,"outputTokens":6,"latencyMs":1374.1822079999838},{"questionId":"q59","format":"xml","model":"gemini-2.5-flash","expected":"Dr. Courtney Satterfield","actual":"Dr. 
Courtney Satterfield","isCorrect":true,"inputTokens":13451,"outputTokens":6,"latencyMs":2022.621999999974},{"questionId":"q59","format":"yaml","model":"gemini-2.5-flash","expected":"Dr. Courtney Satterfield","actual":"Dr. Courtney Satterfield","isCorrect":true,"inputTokens":8427,"outputTokens":6,"latencyMs":1497.3771669999696},{"questionId":"q60","format":"json","model":"gemini-2.5-flash","expected":"lukas71@gmail.com","actual":"lukas71@gmail.com","isCorrect":true,"inputTokens":12113,"outputTokens":8,"latencyMs":1955.5582500000019},{"questionId":"q60","format":"toon","model":"gemini-2.5-flash","expected":"lukas71@gmail.com","actual":"lukas71@gmail.com","isCorrect":true,"inputTokens":7201,"outputTokens":8,"latencyMs":2123.2758750000503},{"questionId":"q60","format":"csv","model":"gemini-2.5-flash","expected":"lukas71@gmail.com","actual":"lukas71@gmail.com","isCorrect":true,"inputTokens":7838,"outputTokens":8,"latencyMs":2950.0285419999855},{"questionId":"q60","format":"xml","model":"gemini-2.5-flash","expected":"lukas71@gmail.com","actual":"lukas71@gmail.com","isCorrect":true,"inputTokens":13451,"outputTokens":8,"latencyMs":1589.0043750000186},{"questionId":"q60","format":"yaml","model":"gemini-2.5-flash","expected":"lukas71@gmail.com","actual":"lukas71@gmail.com","isCorrect":true,"inputTokens":8427,"outputTokens":8,"latencyMs":1577.6151669999817},{"questionId":"q61","format":"json","model":"gemini-2.5-flash","expected":"2025-08-05","actual":"2025-08-05","isCorrect":true,"inputTokens":12113,"outputTokens":10,"latencyMs":2240.8980000000447},{"questionId":"q61","format":"toon","model":"gemini-2.5-flash","expected":"2025-08-05","actual":"2025-08-05","isCorrect":true,"inputTokens":7201,"outputTokens":10,"latencyMs":1635.6801249999553},{"questionId":"q61","format":"csv","model":"gemini-2.5-flash","expected":"2025-08-05","actual":"2025-08-05","isCorrect":true,"inputTokens":7838,"outputTokens":10,"latencyMs":2960.435999999987},{"questionId":"q61","format":"xml","model":"gem
ini-2.5-flash","expected":"2025-08-05","actual":"2025-08-05","isCorrect":true,"inputTokens":13451,"outputTokens":10,"latencyMs":1940.5866249999963},{"questionId":"q61","format":"yaml","model":"gemini-2.5-flash","expected":"2025-08-05","actual":"2025-08-05","isCorrect":true,"inputTokens":8427,"outputTokens":10,"latencyMs":1743.3361250000307},{"questionId":"q62","format":"json","model":"gemini-2.5-flash","expected":"3","actual":"3","isCorrect":true,"inputTokens":12112,"outputTokens":1,"latencyMs":4297.939708000049},{"questionId":"q62","format":"toon","model":"gemini-2.5-flash","expected":"3","actual":"3","isCorrect":true,"inputTokens":7200,"outputTokens":1,"latencyMs":3427.203709000023},{"questionId":"q62","format":"csv","model":"gemini-2.5-flash","expected":"3","actual":"3","isCorrect":true,"inputTokens":7837,"outputTokens":1,"latencyMs":5200.015832999954},{"questionId":"q62","format":"xml","model":"gemini-2.5-flash","expected":"3","actual":"3","isCorrect":true,"inputTokens":13450,"outputTokens":1,"latencyMs":1738.2826249999925},{"questionId":"q62","format":"yaml","model":"gemini-2.5-flash","expected":"3","actual":"3","isCorrect":true,"inputTokens":8426,"outputTokens":1,"latencyMs":3041.4050419999985},{"questionId":"q63","format":"json","model":"gemini-2.5-flash","expected":"Maxine Zemlak","actual":"Maxine Zemlak","isCorrect":true,"inputTokens":12113,"outputTokens":4,"latencyMs":1406.3583330000984},{"questionId":"q63","format":"toon","model":"gemini-2.5-flash","expected":"Maxine Zemlak","actual":"Maxine Zemlak","isCorrect":true,"inputTokens":7201,"outputTokens":4,"latencyMs":2469.178959000041},{"questionId":"q63","format":"csv","model":"gemini-2.5-flash","expected":"Maxine Zemlak","actual":"Maxine Zemlak","isCorrect":true,"inputTokens":7838,"outputTokens":4,"latencyMs":3794.797957999981},{"questionId":"q63","format":"xml","model":"gemini-2.5-flash","expected":"Maxine Zemlak","actual":"Maxine 
Zemlak","isCorrect":true,"inputTokens":13451,"outputTokens":4,"latencyMs":1535.8477499999572},{"questionId":"q63","format":"yaml","model":"gemini-2.5-flash","expected":"Maxine Zemlak","actual":"Maxine Zemlak","isCorrect":true,"inputTokens":8427,"outputTokens":4,"latencyMs":1466.450542000006},{"questionId":"q64","format":"json","model":"gemini-2.5-flash","expected":"brenden2@hotmail.com","actual":"brenden2@hotmail.com","isCorrect":true,"inputTokens":12113,"outputTokens":7,"latencyMs":1096.1007920000702},{"questionId":"q64","format":"toon","model":"gemini-2.5-flash","expected":"brenden2@hotmail.com","actual":"brenden2@hotmail.com","isCorrect":true,"inputTokens":7201,"outputTokens":7,"latencyMs":1037.7984169999836},{"questionId":"q64","format":"csv","model":"gemini-2.5-flash","expected":"brenden2@hotmail.com","actual":"brenden2@hotmail.com","isCorrect":true,"inputTokens":7838,"outputTokens":7,"latencyMs":2832.733583000023},{"questionId":"q64","format":"xml","model":"gemini-2.5-flash","expected":"brenden2@hotmail.com","actual":"brenden2@hotmail.com","isCorrect":true,"inputTokens":13451,"outputTokens":7,"latencyMs":1464.4541660000104},{"questionId":"q64","format":"yaml","model":"gemini-2.5-flash","expected":"brenden2@hotmail.com","actual":"brenden2@hotmail.com","isCorrect":true,"inputTokens":8427,"outputTokens":7,"latencyMs":2447.7934589999495},{"questionId":"q65","format":"json","model":"gemini-2.5-flash","expected":"2025-08-29","actual":"2025-08-29","isCorrect":true,"inputTokens":12113,"outputTokens":10,"latencyMs":2158.4444580000127},{"questionId":"q65","format":"toon","model":"gemini-2.5-flash","expected":"2025-08-29","actual":"2025-08-29","isCorrect":true,"inputTokens":7201,"outputTokens":10,"latencyMs":1097.8160840000492},{"questionId":"q65","format":"csv","model":"gemini-2.5-flash","expected":"2025-08-29","actual":"2025-08-29","isCorrect":true,"inputTokens":7838,"outputTokens":10,"latencyMs":1540.9169160000747},{"questionId":"q65","format":"xml","model":"gemini-2.
5-flash","expected":"2025-08-29","actual":"2025-08-29","isCorrect":true,"inputTokens":13451,"outputTokens":10,"latencyMs":1510.326083999942},{"questionId":"q65","format":"yaml","model":"gemini-2.5-flash","expected":"2025-08-29","actual":"2025-08-29","isCorrect":true,"inputTokens":8427,"outputTokens":10,"latencyMs":1447.6384580000304},{"questionId":"q66","format":"json","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":12112,"outputTokens":1,"latencyMs":2109.4598329999717},{"questionId":"q66","format":"toon","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":7200,"outputTokens":1,"latencyMs":5819.463708000025},{"questionId":"q66","format":"csv","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":7837,"outputTokens":1,"latencyMs":6597.990292000002},{"questionId":"q66","format":"xml","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":13450,"outputTokens":1,"latencyMs":5828.342791999923},{"questionId":"q66","format":"yaml","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":8426,"outputTokens":1,"latencyMs":6629.443208999932},{"questionId":"q67","format":"json","model":"gemini-2.5-flash","expected":"Claudia Cruickshank DVM","actual":"Claudia Cruickshank DVM","isCorrect":true,"inputTokens":12113,"outputTokens":6,"latencyMs":1820.452125000069},{"questionId":"q67","format":"toon","model":"gemini-2.5-flash","expected":"Claudia Cruickshank DVM","actual":"Claudia Cruickshank DVM","isCorrect":true,"inputTokens":7201,"outputTokens":6,"latencyMs":1411.4405840000836},{"questionId":"q67","format":"csv","model":"gemini-2.5-flash","expected":"Claudia Cruickshank DVM","actual":"Claudia Cruickshank DVM","isCorrect":true,"inputTokens":7838,"outputTokens":6,"latencyMs":6544.777208000072},{"questionId":"q67","format":"xml","model":"gemini-2.5-flash","expected":"Claudia Cruickshank DVM","actual":"Claudia Cruickshank 
DVM","isCorrect":true,"inputTokens":13451,"outputTokens":6,"latencyMs":2089.7659590000985},{"questionId":"q67","format":"yaml","model":"gemini-2.5-flash","expected":"Claudia Cruickshank DVM","actual":"Claudia Cruickshank DVM","isCorrect":true,"inputTokens":8427,"outputTokens":6,"latencyMs":2527.2939999999944},{"questionId":"q68","format":"json","model":"gemini-2.5-flash","expected":"freeda.maggio74@gmail.com","actual":"freeda.maggio74@gmail.com","isCorrect":true,"inputTokens":12113,"outputTokens":11,"latencyMs":1281.5512500000186},{"questionId":"q68","format":"toon","model":"gemini-2.5-flash","expected":"freeda.maggio74@gmail.com","actual":"freeda.maggio74@gmail.com","isCorrect":true,"inputTokens":7201,"outputTokens":11,"latencyMs":1171.5145000000484},{"questionId":"q68","format":"csv","model":"gemini-2.5-flash","expected":"freeda.maggio74@gmail.com","actual":"freeda.maggio74@gmail.com","isCorrect":true,"inputTokens":7838,"outputTokens":11,"latencyMs":4651.4649169999175},{"questionId":"q68","format":"xml","model":"gemini-2.5-flash","expected":"freeda.maggio74@gmail.com","actual":"freeda.maggio74@gmail.com","isCorrect":true,"inputTokens":13451,"outputTokens":11,"latencyMs":1957.409083000035},{"questionId":"q68","format":"yaml","model":"gemini-2.5-flash","expected":"freeda.maggio74@gmail.com","actual":"freeda.maggio74@gmail.com","isCorrect":true,"inputTokens":8427,"outputTokens":11,"latencyMs":2760.401208000025},{"questionId":"q69","format":"json","model":"gemini-2.5-flash","expected":"10","actual":"9","isCorrect":false,"inputTokens":12107,"outputTokens":1,"latencyMs":1127.5047499999637},{"questionId":"q69","format":"toon","model":"gemini-2.5-flash","expected":"10","actual":"9","isCorrect":false,"inputTokens":7195,"outputTokens":1,"latencyMs":1147.059499999974},{"questionId":"q69","format":"csv","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":7832,"outputTokens":2,"latencyMs":5244.638167000026},{"questionId":"q69","format":"xml"
,"model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":13445,"outputTokens":2,"latencyMs":6068.009208999923},{"questionId":"q69","format":"yaml","model":"gemini-2.5-flash","expected":"10","actual":"9","isCorrect":false,"inputTokens":8421,"outputTokens":1,"latencyMs":1392.0223750000587},{"questionId":"q70","format":"json","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":12107,"outputTokens":2,"latencyMs":1112.2232919998933},{"questionId":"q70","format":"toon","model":"gemini-2.5-flash","expected":"10","actual":"9","isCorrect":false,"inputTokens":7195,"outputTokens":1,"latencyMs":1650.2696250000736},{"questionId":"q70","format":"csv","model":"gemini-2.5-flash","expected":"10","actual":"8","isCorrect":false,"inputTokens":7832,"outputTokens":1,"latencyMs":1103.8838329999708},{"questionId":"q70","format":"xml","model":"gemini-2.5-flash","expected":"10","actual":"8","isCorrect":false,"inputTokens":13445,"outputTokens":1,"latencyMs":993.2644169999985},{"questionId":"q70","format":"yaml","model":"gemini-2.5-flash","expected":"10","actual":"8","isCorrect":false,"inputTokens":8421,"outputTokens":1,"latencyMs":1329.4368330000434},{"questionId":"q71","format":"json","model":"gemini-2.5-flash","expected":"10","actual":"8","isCorrect":false,"inputTokens":12108,"outputTokens":1,"latencyMs":1239.109625000041},{"questionId":"q71","format":"toon","model":"gemini-2.5-flash","expected":"10","actual":"8","isCorrect":false,"inputTokens":7196,"outputTokens":1,"latencyMs":1144.627582999994},{"questionId":"q71","format":"csv","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":7833,"outputTokens":2,"latencyMs":5118.194791999995},{"questionId":"q71","format":"xml","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":13446,"outputTokens":2,"latencyMs":7063.755917000002},{"questionId":"q71","format":"yaml","model":"gemini-2.5-flash","expected":"10","ac
tual":"8","isCorrect":false,"inputTokens":8422,"outputTokens":1,"latencyMs":1482.218916999991},{"questionId":"q72","format":"json","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":12107,"outputTokens":2,"latencyMs":1285.560165999923},{"questionId":"q72","format":"toon","model":"gemini-2.5-flash","expected":"10","actual":"8","isCorrect":false,"inputTokens":7195,"outputTokens":1,"latencyMs":1398.859749999945},{"questionId":"q72","format":"csv","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":7832,"outputTokens":2,"latencyMs":5587.871250000084},{"questionId":"q72","format":"xml","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":13445,"outputTokens":2,"latencyMs":1693.6959160000551},{"questionId":"q72","format":"yaml","model":"gemini-2.5-flash","expected":"10","actual":"8","isCorrect":false,"inputTokens":8421,"outputTokens":1,"latencyMs":1455.0037920000032},{"questionId":"q73","format":"json","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":12107,"outputTokens":2,"latencyMs":876.3439159999834},{"questionId":"q73","format":"toon","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":7195,"outputTokens":2,"latencyMs":1309.8970840000547},{"questionId":"q73","format":"csv","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":7832,"outputTokens":2,"latencyMs":4566.827583000064},{"questionId":"q73","format":"xml","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":13445,"outputTokens":2,"latencyMs":5599.446041999967},{"questionId":"q73","format":"yaml","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":8421,"outputTokens":2,"latencyMs":1638.1083749999525},{"questionId":"q74","format":"json","model":"gemini-2.5-flash","expected":"42342.25","actual":"40000.00","isCorrect":false,"inputTo
kens":12108,"outputTokens":8,"latencyMs":6786.59600000002},{"questionId":"q74","format":"toon","model":"gemini-2.5-flash","expected":"42342.25","actual":"40000.00","isCorrect":false,"inputTokens":7196,"outputTokens":8,"latencyMs":11153.984750000061},{"questionId":"q74","format":"csv","model":"gemini-2.5-flash","expected":"42342.25","actual":"42342.25","isCorrect":true,"inputTokens":7833,"outputTokens":8,"latencyMs":12097.733207999961},{"questionId":"q74","format":"xml","model":"gemini-2.5-flash","expected":"42342.25","actual":"40000.00","isCorrect":false,"inputTokens":13446,"outputTokens":8,"latencyMs":13054.01570800005},{"questionId":"q74","format":"yaml","model":"gemini-2.5-flash","expected":"42342.25","actual":"40000.00","isCorrect":false,"inputTokens":8422,"outputTokens":8,"latencyMs":7117.1237919999985},{"questionId":"q75","format":"json","model":"gemini-2.5-flash","expected":"846.85","actual":"760.00","isCorrect":false,"inputTokens":12106,"outputTokens":6,"latencyMs":7815.606207999983},{"questionId":"q75","format":"toon","model":"gemini-2.5-flash","expected":"846.85","actual":"880.00","isCorrect":false,"inputTokens":7194,"outputTokens":6,"latencyMs":7651.42879200005},{"questionId":"q75","format":"csv","model":"gemini-2.5-flash","expected":"846.85","actual":"818.00","isCorrect":false,"inputTokens":7831,"outputTokens":6,"latencyMs":7845.991957999999},{"questionId":"q75","format":"xml","model":"gemini-2.5-flash","expected":"846.85","actual":"818.00","isCorrect":false,"inputTokens":13444,"outputTokens":6,"latencyMs":8046.708541000029},{"questionId":"q75","format":"yaml","model":"gemini-2.5-flash","expected":"846.85","actual":"880.00","isCorrect":false,"inputTokens":8420,"outputTokens":6,"latencyMs":6857.372042000061},{"questionId":"q76","format":"json","model":"gemini-2.5-flash","expected":"50","actual":"50","isCorrect":true,"inputTokens":12107,"outputTokens":2,"latencyMs":1547.6712499998976},{"questionId":"q76","format":"toon","model":"gemini-2.5-flash","expected
":"50","actual":"50","isCorrect":true,"inputTokens":7195,"outputTokens":2,"latencyMs":4993.441125000012},{"questionId":"q76","format":"csv","model":"gemini-2.5-flash","expected":"50","actual":"50","isCorrect":true,"inputTokens":7832,"outputTokens":2,"latencyMs":4831.411333000055},{"questionId":"q76","format":"xml","model":"gemini-2.5-flash","expected":"50","actual":"50","isCorrect":true,"inputTokens":13445,"outputTokens":2,"latencyMs":4229.298875000095},{"questionId":"q76","format":"yaml","model":"gemini-2.5-flash","expected":"50","actual":"50","isCorrect":true,"inputTokens":8421,"outputTokens":2,"latencyMs":4456.570707999985},{"questionId":"q77","format":"json","model":"gemini-2.5-flash","expected":"1936.06","actual":"1936.06","isCorrect":true,"inputTokens":12106,"outputTokens":7,"latencyMs":1837.4729999999981},{"questionId":"q77","format":"toon","model":"gemini-2.5-flash","expected":"1936.06","actual":"1936.06","isCorrect":true,"inputTokens":7194,"outputTokens":7,"latencyMs":9932.015125000034},{"questionId":"q77","format":"csv","model":"gemini-2.5-flash","expected":"1936.06","actual":"1936.06","isCorrect":true,"inputTokens":7831,"outputTokens":7,"latencyMs":4192.567958},{"questionId":"q77","format":"xml","model":"gemini-2.5-flash","expected":"1936.06","actual":"1936.06","isCorrect":true,"inputTokens":13444,"outputTokens":7,"latencyMs":9929.228666000068},{"questionId":"q77","format":"yaml","model":"gemini-2.5-flash","expected":"1936.06","actual":"1936.06","isCorrect":true,"inputTokens":8420,"outputTokens":7,"latencyMs":2070.0963340000017},{"questionId":"q78","format":"json","model":"gemini-2.5-flash","expected":"44","actual":"45","isCorrect":false,"inputTokens":12112,"outputTokens":2,"latencyMs":1614.5701250000857},{"questionId":"q78","format":"toon","model":"gemini-2.5-flash","expected":"44","actual":"44","isCorrect":true,"inputTokens":7200,"outputTokens":2,"latencyMs":11900.045374999987},{"questionId":"q78","format":"csv","model":"gemini-2.5-flash","expected":"44
","actual":"46","isCorrect":false,"inputTokens":7837,"outputTokens":2,"latencyMs":10515.212583000073},{"questionId":"q78","format":"xml","model":"gemini-2.5-flash","expected":"44","actual":"44","isCorrect":true,"inputTokens":13450,"outputTokens":2,"latencyMs":10646.541958999936},{"questionId":"q78","format":"yaml","model":"gemini-2.5-flash","expected":"44","actual":"47","isCorrect":false,"inputTokens":8426,"outputTokens":2,"latencyMs":1157.676416000002},{"questionId":"q79","format":"json","model":"gemini-2.5-flash","expected":"39","actual":"44","isCorrect":false,"inputTokens":12112,"outputTokens":2,"latencyMs":1300.4189999999944},{"questionId":"q79","format":"toon","model":"gemini-2.5-flash","expected":"39","actual":"39","isCorrect":true,"inputTokens":7200,"outputTokens":2,"latencyMs":13056.209207999986},{"questionId":"q79","format":"csv","model":"gemini-2.5-flash","expected":"39","actual":"39","isCorrect":true,"inputTokens":7837,"outputTokens":2,"latencyMs":9840.96316699998},{"questionId":"q79","format":"xml","model":"gemini-2.5-flash","expected":"39","actual":"39","isCorrect":true,"inputTokens":13450,"outputTokens":2,"latencyMs":11177.12179200002},{"questionId":"q79","format":"yaml","model":"gemini-2.5-flash","expected":"39","actual":"45","isCorrect":false,"inputTokens":8426,"outputTokens":2,"latencyMs":1516.951457999996},{"questionId":"q80","format":"json","model":"gemini-2.5-flash","expected":"32","actual":"37","isCorrect":false,"inputTokens":12112,"outputTokens":2,"latencyMs":1542.5964580000145},{"questionId":"q80","format":"toon","model":"gemini-2.5-flash","expected":"32","actual":"32","isCorrect":true,"inputTokens":7200,"outputTokens":2,"latencyMs":12602.12175000005},{"questionId":"q80","format":"csv","model":"gemini-2.5-flash","expected":"32","actual":"32","isCorrect":true,"inputTokens":7837,"outputTokens":2,"latencyMs":14156.886625000043},{"questionId":"q80","format":"xml","model":"gemini-2.5-flash","expected":"32","actual":"32","isCorrect":true,"inputToken
s":13450,"outputTokens":2,"latencyMs":12611.637791000074},{"questionId":"q80","format":"yaml","model":"gemini-2.5-flash","expected":"32","actual":"39","isCorrect":false,"inputTokens":8426,"outputTokens":2,"latencyMs":1496.0962500000605},{"questionId":"q81","format":"json","model":"gemini-2.5-flash","expected":"7","actual":"7","isCorrect":true,"inputTokens":12116,"outputTokens":1,"latencyMs":8519.22125000006},{"questionId":"q81","format":"toon","model":"gemini-2.5-flash","expected":"7","actual":"7","isCorrect":true,"inputTokens":7204,"outputTokens":1,"latencyMs":4010.6855839999625},{"questionId":"q81","format":"csv","model":"gemini-2.5-flash","expected":"7","actual":"7","isCorrect":true,"inputTokens":7841,"outputTokens":1,"latencyMs":9536.400208999985},{"questionId":"q81","format":"xml","model":"gemini-2.5-flash","expected":"7","actual":"7","isCorrect":true,"inputTokens":13454,"outputTokens":1,"latencyMs":9025.786416999996},{"questionId":"q81","format":"yaml","model":"gemini-2.5-flash","expected":"7","actual":"7","isCorrect":true,"inputTokens":8430,"outputTokens":1,"latencyMs":10038.979999999981},{"questionId":"q82","format":"json","model":"gemini-2.5-flash","expected":"8","actual":"8","isCorrect":true,"inputTokens":12116,"outputTokens":1,"latencyMs":8537.993917000014},{"questionId":"q82","format":"toon","model":"gemini-2.5-flash","expected":"8","actual":"8","isCorrect":true,"inputTokens":7204,"outputTokens":1,"latencyMs":10071.29091599991},{"questionId":"q82","format":"csv","model":"gemini-2.5-flash","expected":"8","actual":"8","isCorrect":true,"inputTokens":7841,"outputTokens":1,"latencyMs":12189.26320799999},{"questionId":"q82","format":"xml","model":"gemini-2.5-flash","expected":"8","actual":"8","isCorrect":true,"inputTokens":13454,"outputTokens":1,"latencyMs":11379.792999999947},{"questionId":"q82","format":"yaml","model":"gemini-2.5-flash","expected":"8","actual":"8","isCorrect":true,"inputTokens":8430,"outputTokens":1,"latencyMs":12900.413124999963},{"question
Id":"q83","format":"json","model":"gemini-2.5-flash","expected":"7","actual":"7","isCorrect":true,"inputTokens":12117,"outputTokens":1,"latencyMs":8908.346542000072},{"questionId":"q83","format":"toon","model":"gemini-2.5-flash","expected":"7","actual":"7","isCorrect":true,"inputTokens":7205,"outputTokens":1,"latencyMs":10641.954917000025},{"questionId":"q83","format":"csv","model":"gemini-2.5-flash","expected":"7","actual":"7","isCorrect":true,"inputTokens":7842,"outputTokens":1,"latencyMs":11153.965333000058},{"questionId":"q83","format":"xml","model":"gemini-2.5-flash","expected":"7","actual":"7","isCorrect":true,"inputTokens":13455,"outputTokens":1,"latencyMs":11720.401084000012},{"questionId":"q83","format":"yaml","model":"gemini-2.5-flash","expected":"7","actual":"7","isCorrect":true,"inputTokens":8431,"outputTokens":1,"latencyMs":8965.704250000068},{"questionId":"q84","format":"json","model":"gemini-2.5-flash","expected":"9","actual":"9","isCorrect":true,"inputTokens":12116,"outputTokens":1,"latencyMs":8100.68283299997},{"questionId":"q84","format":"toon","model":"gemini-2.5-flash","expected":"9","actual":"9","isCorrect":true,"inputTokens":7204,"outputTokens":1,"latencyMs":10738.115916999988},{"questionId":"q84","format":"csv","model":"gemini-2.5-flash","expected":"9","actual":"9","isCorrect":true,"inputTokens":7841,"outputTokens":1,"latencyMs":9616.537042000098},{"questionId":"q84","format":"xml","model":"gemini-2.5-flash","expected":"9","actual":"9","isCorrect":true,"inputTokens":13454,"outputTokens":1,"latencyMs":8715.376374999993},{"questionId":"q84","format":"yaml","model":"gemini-2.5-flash","expected":"9","actual":"9","isCorrect":true,"inputTokens":8430,"outputTokens":1,"latencyMs":8968.319125000038},{"questionId":"q85","format":"json","model":"gemini-2.5-flash","expected":"9","actual":"9","isCorrect":true,"inputTokens":12116,"outputTokens":1,"latencyMs":3657.576834000065},{"questionId":"q85","format":"toon","model":"gemini-2.5-flash","expected":"9","ac
tual":"9","isCorrect":true,"inputTokens":7204,"outputTokens":1,"latencyMs":3798.152667000075},{"questionId":"q85","format":"csv","model":"gemini-2.5-flash","expected":"9","actual":"9","isCorrect":true,"inputTokens":7841,"outputTokens":1,"latencyMs":8911.00645799993},{"questionId":"q85","format":"xml","model":"gemini-2.5-flash","expected":"9","actual":"9","isCorrect":true,"inputTokens":13454,"outputTokens":1,"latencyMs":3588.322167000035},{"questionId":"q85","format":"yaml","model":"gemini-2.5-flash","expected":"9","actual":"9","isCorrect":true,"inputTokens":8430,"outputTokens":1,"latencyMs":3582.1919999999227},{"questionId":"q86","format":"json","model":"gemini-2.5-flash","expected":"6","actual":"6","isCorrect":true,"inputTokens":12114,"outputTokens":1,"latencyMs":8035.885791999986},{"questionId":"q86","format":"toon","model":"gemini-2.5-flash","expected":"6","actual":"6","isCorrect":true,"inputTokens":7202,"outputTokens":1,"latencyMs":6373.998417000053},{"questionId":"q86","format":"csv","model":"gemini-2.5-flash","expected":"6","actual":"6","isCorrect":true,"inputTokens":7839,"outputTokens":1,"latencyMs":11387.333958999952},{"questionId":"q86","format":"xml","model":"gemini-2.5-flash","expected":"6","actual":"6","isCorrect":true,"inputTokens":13452,"outputTokens":1,"latencyMs":8192.381083999993},{"questionId":"q86","format":"yaml","model":"gemini-2.5-flash","expected":"6","actual":"6","isCorrect":true,"inputTokens":8428,"outputTokens":1,"latencyMs":8422.579041999998},{"questionId":"q87","format":"json","model":"gemini-2.5-flash","expected":"5","actual":"5","isCorrect":true,"inputTokens":12114,"outputTokens":1,"latencyMs":7488.218916999991},{"questionId":"q87","format":"toon","model":"gemini-2.5-flash","expected":"5","actual":"5","isCorrect":true,"inputTokens":7202,"outputTokens":1,"latencyMs":7654.077000000048},{"questionId":"q87","format":"csv","model":"gemini-2.5-flash","expected":"5","actual":"5","isCorrect":true,"inputTokens":7839,"outputTokens":1,"latencyMs":
9652.224541999982},{"questionId":"q87","format":"xml","model":"gemini-2.5-flash","expected":"5","actual":"5","isCorrect":true,"inputTokens":13452,"outputTokens":1,"latencyMs":10837.234832999995},{"questionId":"q87","format":"yaml","model":"gemini-2.5-flash","expected":"5","actual":"5","isCorrect":true,"inputTokens":8428,"outputTokens":1,"latencyMs":8923.641416999977},{"questionId":"q88","format":"json","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":12115,"outputTokens":1,"latencyMs":7700.8786250001285},{"questionId":"q88","format":"toon","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":7203,"outputTokens":1,"latencyMs":6492.92962499999},{"questionId":"q88","format":"csv","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":7840,"outputTokens":1,"latencyMs":8064.505417000037},{"questionId":"q88","format":"xml","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":13453,"outputTokens":1,"latencyMs":9730.972582999966},{"questionId":"q88","format":"yaml","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":8429,"outputTokens":1,"latencyMs":8484.045791000011},{"questionId":"q89","format":"json","model":"gemini-2.5-flash","expected":"27","actual":"27","isCorrect":true,"inputTokens":12118,"outputTokens":2,"latencyMs":12580.379917000071},{"questionId":"q89","format":"toon","model":"gemini-2.5-flash","expected":"27","actual":"27","isCorrect":true,"inputTokens":7206,"outputTokens":2,"latencyMs":14257.416209000046},{"questionId":"q89","format":"csv","model":"gemini-2.5-flash","expected":"27","actual":"27","isCorrect":true,"inputTokens":7843,"outputTokens":2,"latencyMs":13104.01683400001},{"questionId":"q89","format":"xml","model":"gemini-2.5-flash","expected":"27","actual":"27","isCorrect":true,"inputTokens":13456,"outputTokens":2,"latencyMs":11685.098832999938},{"questionId":"q89","format":"yaml","model":
"gemini-2.5-flash","expected":"27","actual":"27","isCorrect":true,"inputTokens":8432,"outputTokens":2,"latencyMs":11544.163832999999},{"questionId":"q90","format":"json","model":"gemini-2.5-flash","expected":"27","actual":"27","isCorrect":true,"inputTokens":12118,"outputTokens":2,"latencyMs":12309.37549999985},{"questionId":"q90","format":"toon","model":"gemini-2.5-flash","expected":"27","actual":"27","isCorrect":true,"inputTokens":7206,"outputTokens":2,"latencyMs":13411.21887500002},{"questionId":"q90","format":"csv","model":"gemini-2.5-flash","expected":"27","actual":"27","isCorrect":true,"inputTokens":7843,"outputTokens":2,"latencyMs":21766.089167000027},{"questionId":"q90","format":"xml","model":"gemini-2.5-flash","expected":"27","actual":"27","isCorrect":true,"inputTokens":13456,"outputTokens":2,"latencyMs":22480.616332999896},{"questionId":"q90","format":"yaml","model":"gemini-2.5-flash","expected":"27","actual":"27","isCorrect":true,"inputTokens":8432,"outputTokens":2,"latencyMs":20677.540166000137},{"questionId":"q91","format":"json","model":"gemini-2.5-flash","expected":"6975","actual":"6975","isCorrect":true,"inputTokens":4784,"outputTokens":4,"latencyMs":1884.4282500001136},{"questionId":"q91","format":"toon","model":"gemini-2.5-flash","expected":"6975","actual":"6975","isCorrect":true,"inputTokens":2271,"outputTokens":4,"latencyMs":1899.283250000095},{"questionId":"q91","format":"csv","model":"gemini-2.5-flash","expected":"6975","actual":"6975","isCorrect":true,"inputTokens":2208,"outputTokens":4,"latencyMs":2297.7423749999143},{"questionId":"q91","format":"xml","model":"gemini-2.5-flash","expected":"6975","actual":"6975","isCorrect":true,"inputTokens":5431,"outputTokens":4,"latencyMs":968.8400830000173},{"questionId":"q91","format":"yaml","model":"gemini-2.5-flash","expected":"6975","actual":"6975","isCorrect":true,"inputTokens":3814,"outputTokens":4,"latencyMs":2076.0675409999676},{"questionId":"q92","format":"json","model":"gemini-2.5-flash","expected
":"6686.23","actual":"6686.23","isCorrect":true,"inputTokens":4783,"outputTokens":7,"latencyMs":2072.2511670000385},{"questionId":"q92","format":"toon","model":"gemini-2.5-flash","expected":"6686.23","actual":"6686.23","isCorrect":true,"inputTokens":2270,"outputTokens":7,"latencyMs":1872.952959000133},{"questionId":"q92","format":"csv","model":"gemini-2.5-flash","expected":"6686.23","actual":"6686.23","isCorrect":true,"inputTokens":2207,"outputTokens":7,"latencyMs":2176.5222089998424},{"questionId":"q92","format":"xml","model":"gemini-2.5-flash","expected":"6686.23","actual":"6686.23","isCorrect":true,"inputTokens":5430,"outputTokens":7,"latencyMs":2234.8793750000186},{"questionId":"q92","format":"yaml","model":"gemini-2.5-flash","expected":"6686.23","actual":"6686.23","isCorrect":true,"inputTokens":3813,"outputTokens":7,"latencyMs":1461.9923750001471},{"questionId":"q93","format":"json","model":"gemini-2.5-flash","expected":"33","actual":"33","isCorrect":true,"inputTokens":4784,"outputTokens":2,"latencyMs":2470.3572080000304},{"questionId":"q93","format":"toon","model":"gemini-2.5-flash","expected":"33","actual":"33","isCorrect":true,"inputTokens":2271,"outputTokens":2,"latencyMs":2151.329374999972},{"questionId":"q93","format":"csv","model":"gemini-2.5-flash","expected":"33","actual":"33","isCorrect":true,"inputTokens":2208,"outputTokens":2,"latencyMs":2219.444042000221},{"questionId":"q93","format":"xml","model":"gemini-2.5-flash","expected":"33","actual":"33","isCorrect":true,"inputTokens":5431,"outputTokens":2,"latencyMs":2449.41345800017},{"questionId":"q93","format":"yaml","model":"gemini-2.5-flash","expected":"33","actual":"33","isCorrect":true,"inputTokens":3814,"outputTokens":2,"latencyMs":3201.408041999908},{"questionId":"q94","format":"json","model":"gemini-2.5-flash","expected":"377","actual":"377","isCorrect":true,"inputTokens":4784,"outputTokens":3,"latencyMs":1606.6140829999931},{"questionId":"q94","format":"toon","model":"gemini-2.5-flash","expected
":"377","actual":"377","isCorrect":true,"inputTokens":2271,"outputTokens":3,"latencyMs":1728.7225830000825},{"questionId":"q94","format":"csv","model":"gemini-2.5-flash","expected":"377","actual":"377","isCorrect":true,"inputTokens":2208,"outputTokens":3,"latencyMs":1581.8261250001378},{"questionId":"q94","format":"xml","model":"gemini-2.5-flash","expected":"377","actual":"377","isCorrect":true,"inputTokens":5431,"outputTokens":3,"latencyMs":2475.9921249998733},{"questionId":"q94","format":"yaml","model":"gemini-2.5-flash","expected":"377","actual":"377","isCorrect":true,"inputTokens":3814,"outputTokens":3,"latencyMs":1658.4720830000006},{"questionId":"q95","format":"json","model":"gemini-2.5-flash","expected":"0.44","actual":"0.44","isCorrect":true,"inputTokens":4784,"outputTokens":4,"latencyMs":2754.985958999954},{"questionId":"q95","format":"toon","model":"gemini-2.5-flash","expected":"0.44","actual":"0.44","isCorrect":true,"inputTokens":2271,"outputTokens":4,"latencyMs":1857.8507499999832},{"questionId":"q95","format":"csv","model":"gemini-2.5-flash","expected":"0.44","actual":"0.44","isCorrect":true,"inputTokens":2208,"outputTokens":4,"latencyMs":1455.5252499999478},{"questionId":"q95","format":"xml","model":"gemini-2.5-flash","expected":"0.44","actual":"0.44","isCorrect":true,"inputTokens":5431,"outputTokens":4,"latencyMs":1482.518625000026},{"questionId":"q95","format":"yaml","model":"gemini-2.5-flash","expected":"0.44","actual":"0.44","isCorrect":true,"inputTokens":3814,"outputTokens":4,"latencyMs":3781.1114159999415},{"questionId":"q96","format":"json","model":"gemini-2.5-flash","expected":"7621","actual":"7621","isCorrect":true,"inputTokens":4784,"outputTokens":4,"latencyMs":3417.233917000005},{"questionId":"q96","format":"toon","model":"gemini-2.5-flash","expected":"7621","actual":"7621","isCorrect":true,"inputTokens":2271,"outputTokens":4,"latencyMs":2638.337291000178},{"questionId":"q96","format":"csv","model":"gemini-2.5-flash","expected":"7621","actua
l":"7621","isCorrect":true,"inputTokens":2208,"outputTokens":4,"latencyMs":2681.343833999941},{"questionId":"q96","format":"xml","model":"gemini-2.5-flash","expected":"7621","actual":"7621","isCorrect":true,"inputTokens":5431,"outputTokens":4,"latencyMs":1367.1620420000982},{"questionId":"q96","format":"yaml","model":"gemini-2.5-flash","expected":"7621","actual":"7621","isCorrect":true,"inputTokens":3814,"outputTokens":4,"latencyMs":1875.5785420001484},{"questionId":"q97","format":"json","model":"gemini-2.5-flash","expected":"1827.12","actual":"1827.12","isCorrect":true,"inputTokens":4783,"outputTokens":7,"latencyMs":3925.148833999876},{"questionId":"q97","format":"toon","model":"gemini-2.5-flash","expected":"1827.12","actual":"1827.12","isCorrect":true,"inputTokens":2270,"outputTokens":7,"latencyMs":3521.515625},{"questionId":"q97","format":"csv","model":"gemini-2.5-flash","expected":"1827.12","actual":"1827.12","isCorrect":true,"inputTokens":2207,"outputTokens":7,"latencyMs":2912.267790999962},{"questionId":"q97","format":"xml","model":"gemini-2.5-flash","expected":"1827.12","actual":"1827.12","isCorrect":true,"inputTokens":5430,"outputTokens":7,"latencyMs":2489.6105840001255},{"questionId":"q97","format":"yaml","model":"gemini-2.5-flash","expected":"1827.12","actual":"1827.12","isCorrect":true,"inputTokens":3813,"outputTokens":7,"latencyMs":2989.579958999995},{"questionId":"q98","format":"json","model":"gemini-2.5-flash","expected":"44","actual":"44","isCorrect":true,"inputTokens":4784,"outputTokens":2,"latencyMs":1105.9603749997914},{"questionId":"q98","format":"toon","model":"gemini-2.5-flash","expected":"44","actual":"44","isCorrect":true,"inputTokens":2271,"outputTokens":2,"latencyMs":3957.9318329999223},{"questionId":"q98","format":"csv","model":"gemini-2.5-flash","expected":"44","actual":"44","isCorrect":true,"inputTokens":2208,"outputTokens":2,"latencyMs":1753.311042000074},{"questionId":"q98","format":"xml","model":"gemini-2.5-flash","expected":"44","actu
al":"44","isCorrect":true,"inputTokens":5431,"outputTokens":2,"latencyMs":3162.177583000157},{"questionId":"q98","format":"yaml","model":"gemini-2.5-flash","expected":"44","actual":"44","isCorrect":true,"inputTokens":3814,"outputTokens":2,"latencyMs":2062.8894579999615},{"questionId":"q99","format":"json","model":"gemini-2.5-flash","expected":"411","actual":"411","isCorrect":true,"inputTokens":4784,"outputTokens":3,"latencyMs":3126.8291659997776},{"questionId":"q99","format":"toon","model":"gemini-2.5-flash","expected":"411","actual":"411","isCorrect":true,"inputTokens":2271,"outputTokens":3,"latencyMs":1691.642875000136},{"questionId":"q99","format":"csv","model":"gemini-2.5-flash","expected":"411","actual":"411","isCorrect":true,"inputTokens":2208,"outputTokens":3,"latencyMs":1893.749042000156},{"questionId":"q99","format":"xml","model":"gemini-2.5-flash","expected":"411","actual":"411","isCorrect":true,"inputTokens":5431,"outputTokens":3,"latencyMs":1321.3778750000056},{"questionId":"q99","format":"yaml","model":"gemini-2.5-flash","expected":"411","actual":"411","isCorrect":true,"inputTokens":3814,"outputTokens":3,"latencyMs":1689.7997500000056},{"questionId":"q100","format":"json","model":"gemini-2.5-flash","expected":"0.48","actual":"0.48","isCorrect":true,"inputTokens":4784,"outputTokens":4,"latencyMs":2584.5812080001924},{"questionId":"q100","format":"toon","model":"gemini-2.5-flash","expected":"0.48","actual":"0.48","isCorrect":true,"inputTokens":2271,"outputTokens":4,"latencyMs":1487.264124999987},{"questionId":"q100","format":"csv","model":"gemini-2.5-flash","expected":"0.48","actual":"0.48","isCorrect":true,"inputTokens":2208,"outputTokens":4,"latencyMs":2019.6948329999577},{"questionId":"q100","format":"xml","model":"gemini-2.5-flash","expected":"0.48","actual":"0.48","isCorrect":true,"inputTokens":5431,"outputTokens":4,"latencyMs":1538.7900419998914},{"questionId":"q100","format":"yaml","model":"gemini-2.5-flash","expected":"0.48","actual":"0.48","isCor
rect":true,"inputTokens":3814,"outputTokens":4,"latencyMs":2006.4605000000447},{"questionId":"q101","format":"json","model":"gemini-2.5-flash","expected":"4696","actual":"4696","isCorrect":true,"inputTokens":4784,"outputTokens":4,"latencyMs":3271.396790999919},{"questionId":"q101","format":"toon","model":"gemini-2.5-flash","expected":"4696","actual":"4696","isCorrect":true,"inputTokens":2271,"outputTokens":4,"latencyMs":2130.3102090000175},{"questionId":"q101","format":"csv","model":"gemini-2.5-flash","expected":"4696","actual":"4696","isCorrect":true,"inputTokens":2208,"outputTokens":4,"latencyMs":2857.2320000000764},{"questionId":"q101","format":"xml","model":"gemini-2.5-flash","expected":"4696","actual":"4696","isCorrect":true,"inputTokens":5431,"outputTokens":4,"latencyMs":2408.035957999993},{"questionId":"q101","format":"yaml","model":"gemini-2.5-flash","expected":"4696","actual":"4696","isCorrect":true,"inputTokens":3814,"outputTokens":4,"latencyMs":2431.956540999934},{"questionId":"q102","format":"json","model":"gemini-2.5-flash","expected":"4211.6","actual":"4211.6","isCorrect":true,"inputTokens":4783,"outputTokens":6,"latencyMs":2324.771875000093},{"questionId":"q102","format":"toon","model":"gemini-2.5-flash","expected":"4211.6","actual":"4211.6","isCorrect":true,"inputTokens":2270,"outputTokens":6,"latencyMs":2848.8026660000905},{"questionId":"q102","format":"csv","model":"gemini-2.5-flash","expected":"4211.6","actual":"4211.6","isCorrect":true,"inputTokens":2207,"outputTokens":6,"latencyMs":2672.785249999957},{"questionId":"q102","format":"xml","model":"gemini-2.5-flash","expected":"4211.6","actual":"4211.6","isCorrect":true,"inputTokens":5430,"outputTokens":6,"latencyMs":3891.5064170002006},{"questionId":"q102","format":"yaml","model":"gemini-2.5-flash","expected":"4211.6","actual":"4211.6","isCorrect":true,"inputTokens":3813,"outputTokens":6,"latencyMs":3577.0068339998834},{"questionId":"q103","format":"json","model":"gemini-2.5-flash","expected":"23",
"actual":"23","isCorrect":true,"inputTokens":4784,"outputTokens":2,"latencyMs":3073.2702500000596},{"questionId":"q103","format":"toon","model":"gemini-2.5-flash","expected":"23","actual":"23","isCorrect":true,"inputTokens":2271,"outputTokens":2,"latencyMs":2305.5147919999436},{"questionId":"q103","format":"csv","model":"gemini-2.5-flash","expected":"23","actual":"23","isCorrect":true,"inputTokens":2208,"outputTokens":2,"latencyMs":2397.551499999827},{"questionId":"q103","format":"xml","model":"gemini-2.5-flash","expected":"23","actual":"23","isCorrect":true,"inputTokens":5431,"outputTokens":2,"latencyMs":1539.2414589999244},{"questionId":"q103","format":"yaml","model":"gemini-2.5-flash","expected":"23","actual":"23","isCorrect":true,"inputTokens":3814,"outputTokens":2,"latencyMs":2904.9358749999665},{"questionId":"q104","format":"json","model":"gemini-2.5-flash","expected":"344498","actual":"344498","isCorrect":true,"inputTokens":4777,"outputTokens":6,"latencyMs":11590.23233300005},{"questionId":"q104","format":"toon","model":"gemini-2.5-flash","expected":"344498","actual":"349999","isCorrect":false,"inputTokens":2264,"outputTokens":6,"latencyMs":4674.995833999943},{"questionId":"q104","format":"csv","model":"gemini-2.5-flash","expected":"344498","actual":"349900","isCorrect":false,"inputTokens":2201,"outputTokens":6,"latencyMs":4361.276749999961},{"questionId":"q104","format":"xml","model":"gemini-2.5-flash","expected":"344498","actual":"336900","isCorrect":false,"inputTokens":5424,"outputTokens":6,"latencyMs":5789.307541999966},{"questionId":"q104","format":"yaml","model":"gemini-2.5-flash","expected":"344498","actual":"339000","isCorrect":false,"inputTokens":3807,"outputTokens":6,"latencyMs":5932.7984170001},{"questionId":"q105","format":"json","model":"gemini-2.5-flash","expected":"312818.50","actual":"300000.00","isCorrect":false,"inputTokens":4775,"outputTokens":9,"latencyMs":33710.32349999994},{"questionId":"q105","format":"toon","model":"gemini-2.5-flash","
expected":"312818.50","actual":"312818.5","isCorrect":true,"inputTokens":2262,"outputTokens":8,"latencyMs":15737.890790999867},{"questionId":"q105","format":"csv","model":"gemini-2.5-flash","expected":"312818.50","actual":"340000.00","isCorrect":false,"inputTokens":2199,"outputTokens":9,"latencyMs":22115.961250000168},{"questionId":"q105","format":"xml","model":"gemini-2.5-flash","expected":"312818.50","actual":"300000.00","isCorrect":false,"inputTokens":5422,"outputTokens":9,"latencyMs":10489.203625000082},{"questionId":"q105","format":"yaml","model":"gemini-2.5-flash","expected":"312818.50","actual":"369000.00","isCorrect":false,"inputTokens":3805,"outputTokens":9,"latencyMs":1270.1144169999752},{"questionId":"q106","format":"json","model":"gemini-2.5-flash","expected":"1811","actual":"1811","isCorrect":true,"inputTokens":4777,"outputTokens":4,"latencyMs":6748.849709000206},{"questionId":"q106","format":"toon","model":"gemini-2.5-flash","expected":"1811","actual":"1811","isCorrect":true,"inputTokens":2264,"outputTokens":4,"latencyMs":6604.943542000139},{"questionId":"q106","format":"csv","model":"gemini-2.5-flash","expected":"1811","actual":"1811","isCorrect":true,"inputTokens":2201,"outputTokens":4,"latencyMs":8311.442624999909},{"questionId":"q106","format":"xml","model":"gemini-2.5-flash","expected":"1811","actual":"1811","isCorrect":true,"inputTokens":5424,"outputTokens":4,"latencyMs":10314.444957999978},{"questionId":"q106","format":"yaml","model":"gemini-2.5-flash","expected":"1811","actual":"1560","isCorrect":false,"inputTokens":3807,"outputTokens":4,"latencyMs":1681.1095419998746},{"questionId":"q107","format":"json","model":"gemini-2.5-flash","expected":"5742","actual":"5765","isCorrect":false,"inputTokens":4776,"outputTokens":4,"latencyMs":5464.193916999968},{"questionId":"q107","format":"toon","model":"gemini-2.5-flash","expected":"5742","actual":"5750","isCorrect":false,"inputTokens":2263,"outputTokens":4,"latencyMs":5266.248625000007},{"questionId":"q
107","format":"csv","model":"gemini-2.5-flash","expected":"5742","actual":"5766.67","isCorrect":false,"inputTokens":2200,"outputTokens":7,"latencyMs":21249.982249999885},{"questionId":"q107","format":"xml","model":"gemini-2.5-flash","expected":"5742","actual":"5650","isCorrect":false,"inputTokens":5423,"outputTokens":4,"latencyMs":7413.202000000048},{"questionId":"q107","format":"yaml","model":"gemini-2.5-flash","expected":"5742","actual":"5733.33","isCorrect":false,"inputTokens":3806,"outputTokens":7,"latencyMs":7655.432166999904},{"questionId":"q108","format":"json","model":"gemini-2.5-flash","expected":"5213.64","actual":"5006.33","isCorrect":false,"inputTokens":4774,"outputTokens":7,"latencyMs":74010.11516599986},{"questionId":"q108","format":"toon","model":"gemini-2.5-flash","expected":"5213.64","actual":"4983.33","isCorrect":false,"inputTokens":2261,"outputTokens":7,"latencyMs":6770.947459000163},{"questionId":"q108","format":"csv","model":"gemini-2.5-flash","expected":"5213.64","actual":"5000.00","isCorrect":false,"inputTokens":2198,"outputTokens":7,"latencyMs":7813.699665999971},{"questionId":"q108","format":"xml","model":"gemini-2.5-flash","expected":"5213.64","actual":"5000.00","isCorrect":false,"inputTokens":5421,"outputTokens":7,"latencyMs":11781.737875000108},{"questionId":"q108","format":"yaml","model":"gemini-2.5-flash","expected":"5213.64","actual":"5333.33","isCorrect":false,"inputTokens":3804,"outputTokens":7,"latencyMs":8382.133042000001},{"questionId":"q109","format":"json","model":"gemini-2.5-flash","expected":"30","actual":"31.67","isCorrect":false,"inputTokens":4776,"outputTokens":5,"latencyMs":5201.766916000051},{"questionId":"q109","format":"toon","model":"gemini-2.5-flash","expected":"30","actual":"31.67","isCorrect":false,"inputTokens":2263,"outputTokens":5,"latencyMs":5553.813208000036},{"questionId":"q109","format":"csv","model":"gemini-2.5-flash","expected":"30","actual":"30.0","isCorrect":true,"inputTokens":2200,"outputTokens":4,"laten
cyMs":4134.351959000109},{"questionId":"q109","format":"xml","model":"gemini-2.5-flash","expected":"30","actual":"29.51","isCorrect":false,"inputTokens":5423,"outputTokens":5,"latencyMs":5047.897291000001},{"questionId":"q109","format":"yaml","model":"gemini-2.5-flash","expected":"30","actual":"30","isCorrect":true,"inputTokens":3806,"outputTokens":2,"latencyMs":5670.675124999834},{"questionId":"q110","format":"json","model":"gemini-2.5-flash","expected":"60","actual":"60","isCorrect":true,"inputTokens":4776,"outputTokens":2,"latencyMs":2104.9321250000503},{"questionId":"q110","format":"toon","model":"gemini-2.5-flash","expected":"60","actual":"60","isCorrect":true,"inputTokens":2263,"outputTokens":2,"latencyMs":6210.12729199999},{"questionId":"q110","format":"csv","model":"gemini-2.5-flash","expected":"60","actual":"60","isCorrect":true,"inputTokens":2200,"outputTokens":2,"latencyMs":1984.4038329999894},{"questionId":"q110","format":"xml","model":"gemini-2.5-flash","expected":"60","actual":"60","isCorrect":true,"inputTokens":5423,"outputTokens":2,"latencyMs":5691.6853749998845},{"questionId":"q110","format":"yaml","model":"gemini-2.5-flash","expected":"60","actual":"60","isCorrect":true,"inputTokens":3806,"outputTokens":2,"latencyMs":1864.0662499999162},{"questionId":"q111","format":"json","model":"gemini-2.5-flash","expected":"7944","actual":"7944","isCorrect":true,"inputTokens":4779,"outputTokens":4,"latencyMs":9066.991750000045},{"questionId":"q111","format":"toon","model":"gemini-2.5-flash","expected":"7944","actual":"7944","isCorrect":true,"inputTokens":2266,"outputTokens":4,"latencyMs":8553.131667000009},{"questionId":"q111","format":"csv","model":"gemini-2.5-flash","expected":"7944","actual":"7944","isCorrect":true,"inputTokens":2203,"outputTokens":4,"latencyMs":10600.450124999974},{"questionId":"q111","format":"xml","model":"gemini-2.5-flash","expected":"7944","actual":"7944","isCorrect":true,"inputTokens":5426,"outputTokens":4,"latencyMs":1282.993208999978
4},{"questionId":"q111","format":"yaml","model":"gemini-2.5-flash","expected":"7944","actual":"7944","isCorrect":true,"inputTokens":3809,"outputTokens":4,"latencyMs":7867.858832999831},{"questionId":"q112","format":"json","model":"gemini-2.5-flash","expected":"42","actual":"42","isCorrect":true,"inputTokens":4779,"outputTokens":2,"latencyMs":13188.195958000142},{"questionId":"q112","format":"toon","model":"gemini-2.5-flash","expected":"42","actual":"42","isCorrect":true,"inputTokens":2266,"outputTokens":2,"latencyMs":13456.709874999942},{"questionId":"q112","format":"csv","model":"gemini-2.5-flash","expected":"42","actual":"42","isCorrect":true,"inputTokens":2203,"outputTokens":2,"latencyMs":24068.5377499999},{"questionId":"q112","format":"xml","model":"gemini-2.5-flash","expected":"42","actual":"42","isCorrect":true,"inputTokens":5426,"outputTokens":2,"latencyMs":11309.083583000116},{"questionId":"q112","format":"yaml","model":"gemini-2.5-flash","expected":"42","actual":"49","isCorrect":false,"inputTokens":3809,"outputTokens":2,"latencyMs":1223.1014159999322},{"questionId":"q113","format":"json","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":4779,"outputTokens":2,"latencyMs":12378.184833999956},{"questionId":"q113","format":"toon","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":2266,"outputTokens":2,"latencyMs":13815.962708000094},{"questionId":"q113","format":"csv","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":2203,"outputTokens":2,"latencyMs":9263.92879200005},{"questionId":"q113","format":"xml","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":5426,"outputTokens":2,"latencyMs":10745.384291999973},{"questionId":"q113","format":"yaml","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":3809,"outputTokens":2,"latencyMs":9530.699459000025},{"questionId":"q114","format":"js
on","model":"gemini-2.5-flash","expected":"26","actual":"26","isCorrect":true,"inputTokens":4786,"outputTokens":2,"latencyMs":13924.308707999997},{"questionId":"q114","format":"toon","model":"gemini-2.5-flash","expected":"26","actual":"26","isCorrect":true,"inputTokens":2273,"outputTokens":2,"latencyMs":21940.77291699988},{"questionId":"q114","format":"csv","model":"gemini-2.5-flash","expected":"26","actual":"26","isCorrect":true,"inputTokens":2210,"outputTokens":2,"latencyMs":22778.159292000113},{"questionId":"q114","format":"xml","model":"gemini-2.5-flash","expected":"26","actual":"26","isCorrect":true,"inputTokens":5433,"outputTokens":2,"latencyMs":12164.395791999996},{"questionId":"q114","format":"yaml","model":"gemini-2.5-flash","expected":"26","actual":"26","isCorrect":true,"inputTokens":3816,"outputTokens":2,"latencyMs":13955.144666999811},{"questionId":"q115","format":"json","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":4786,"outputTokens":2,"latencyMs":14885.290749999927},{"questionId":"q115","format":"toon","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":2273,"outputTokens":2,"latencyMs":13939.909167000093},{"questionId":"q115","format":"csv","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":2210,"outputTokens":2,"latencyMs":33331.57250000001},{"questionId":"q115","format":"xml","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":5433,"outputTokens":2,"latencyMs":24307.091291000135},{"questionId":"q115","format":"yaml","model":"gemini-2.5-flash","expected":"10","actual":"10","isCorrect":true,"inputTokens":3816,"outputTokens":2,"latencyMs":10310.174541000044},{"questionId":"q116","format":"json","model":"gemini-2.5-flash","expected":"28","actual":"28","isCorrect":true,"inputTokens":4787,"outputTokens":2,"latencyMs":17880.37845900003},{"questionId":"q116","format":"toon","model":"gemini-2.5-flash","expect
ed":"28","actual":"28","isCorrect":true,"inputTokens":2274,"outputTokens":2,"latencyMs":17058.976334000006},{"questionId":"q116","format":"csv","model":"gemini-2.5-flash","expected":"28","actual":"28","isCorrect":true,"inputTokens":2211,"outputTokens":2,"latencyMs":23610.25208300003},{"questionId":"q116","format":"xml","model":"gemini-2.5-flash","expected":"28","actual":"28","isCorrect":true,"inputTokens":5434,"outputTokens":2,"latencyMs":17141.34670799994},{"questionId":"q116","format":"yaml","model":"gemini-2.5-flash","expected":"28","actual":"28","isCorrect":true,"inputTokens":3817,"outputTokens":2,"latencyMs":12650.736999999965},{"questionId":"q117","format":"json","model":"gemini-2.5-flash","expected":"28","actual":"28","isCorrect":true,"inputTokens":4788,"outputTokens":2,"latencyMs":17873.479165999917},{"questionId":"q117","format":"toon","model":"gemini-2.5-flash","expected":"28","actual":"28","isCorrect":true,"inputTokens":2275,"outputTokens":2,"latencyMs":12653.337415999966},{"questionId":"q117","format":"csv","model":"gemini-2.5-flash","expected":"28","actual":"28","isCorrect":true,"inputTokens":2212,"outputTokens":2,"latencyMs":24524.716874999925},{"questionId":"q117","format":"xml","model":"gemini-2.5-flash","expected":"28","actual":"28","isCorrect":true,"inputTokens":5435,"outputTokens":2,"latencyMs":20913.757875000127},{"questionId":"q117","format":"yaml","model":"gemini-2.5-flash","expected":"28","actual":"28","isCorrect":true,"inputTokens":3818,"outputTokens":2,"latencyMs":15534.652999999933},{"questionId":"q118","format":"json","model":"gemini-2.5-flash","expected":"28","actual":"28","isCorrect":true,"inputTokens":4788,"outputTokens":2,"latencyMs":13329.5030840002},{"questionId":"q118","format":"toon","model":"gemini-2.5-flash","expected":"28","actual":"28","isCorrect":true,"inputTokens":2275,"outputTokens":2,"latencyMs":13304.806832999922},{"questionId":"q118","format":"csv","model":"gemini-2.5-flash","expected":"28","actual":"28","isCorrect":true,
"inputTokens":2212,"outputTokens":2,"latencyMs":28126.68212500005},{"questionId":"q118","format":"xml","model":"gemini-2.5-flash","expected":"28","actual":"28","isCorrect":true,"inputTokens":5435,"outputTokens":2,"latencyMs":14084.49116700003},{"questionId":"q118","format":"yaml","model":"gemini-2.5-flash","expected":"28","actual":"28","isCorrect":true,"inputTokens":3818,"outputTokens":2,"latencyMs":15115.027541999938},{"questionId":"q119","format":"json","model":"gemini-2.5-flash","expected":"26","actual":"26","isCorrect":true,"inputTokens":4788,"outputTokens":2,"latencyMs":15886.747209000168},{"questionId":"q119","format":"toon","model":"gemini-2.5-flash","expected":"26","actual":"26","isCorrect":true,"inputTokens":2275,"outputTokens":2,"latencyMs":23947.59491600003},{"questionId":"q119","format":"csv","model":"gemini-2.5-flash","expected":"26","actual":"26","isCorrect":true,"inputTokens":2212,"outputTokens":2,"latencyMs":25522.67729100003},{"questionId":"q119","format":"xml","model":"gemini-2.5-flash","expected":"26","actual":"26","isCorrect":true,"inputTokens":5435,"outputTokens":2,"latencyMs":14566.303583000088},{"questionId":"q119","format":"yaml","model":"gemini-2.5-flash","expected":"26","actual":"26","isCorrect":true,"inputTokens":3818,"outputTokens":2,"latencyMs":16233.330832999898},{"questionId":"q120","format":"json","model":"gemini-2.5-flash","expected":"25","actual":"25","isCorrect":true,"inputTokens":4788,"outputTokens":2,"latencyMs":13494.469167000148},{"questionId":"q120","format":"toon","model":"gemini-2.5-flash","expected":"25","actual":"25","isCorrect":true,"inputTokens":2275,"outputTokens":2,"latencyMs":25922.211250000168},{"questionId":"q120","format":"csv","model":"gemini-2.5-flash","expected":"25","actual":"25","isCorrect":true,"inputTokens":2212,"outputTokens":2,"latencyMs":25628.55637500016},{"questionId":"q120","format":"xml","model":"gemini-2.5-flash","expected":"25","actual":"25","isCorrect":true,"inputTokens":5435,"outputTokens":2,"late
ncyMs":13339.714542000089},{"questionId":"q120","format":"yaml","model":"gemini-2.5-flash","expected":"25","actual":"25","isCorrect":true,"inputTokens":3818,"outputTokens":2,"latencyMs":15300.699999999953},{"questionId":"q121","format":"json","model":"gemini-2.5-flash","expected":"35","actual":"35","isCorrect":true,"inputTokens":4785,"outputTokens":2,"latencyMs":17373.429875000147},{"questionId":"q121","format":"toon","model":"gemini-2.5-flash","expected":"35","actual":"35","isCorrect":true,"inputTokens":2272,"outputTokens":2,"latencyMs":15758.771500000032},{"questionId":"q121","format":"csv","model":"gemini-2.5-flash","expected":"35","actual":"35","isCorrect":true,"inputTokens":2209,"outputTokens":2,"latencyMs":26820.46316600009},{"questionId":"q121","format":"xml","model":"gemini-2.5-flash","expected":"35","actual":"35","isCorrect":true,"inputTokens":5432,"outputTokens":2,"latencyMs":14443.888499999885},{"questionId":"q121","format":"yaml","model":"gemini-2.5-flash","expected":"35","actual":"35","isCorrect":true,"inputTokens":3815,"outputTokens":2,"latencyMs":13739.48695800011},{"questionId":"q122","format":"json","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":4785,"outputTokens":2,"latencyMs":12447.412625000114},{"questionId":"q122","format":"toon","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":2272,"outputTokens":2,"latencyMs":11950.244582999963},{"questionId":"q122","format":"csv","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":2209,"outputTokens":2,"latencyMs":23693.09270899999},{"questionId":"q122","format":"xml","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":5432,"outputTokens":2,"latencyMs":14626.785333999898},{"questionId":"q122","format":"yaml","model":"gemini-2.5-flash","expected":"12","actual":"12","isCorrect":true,"inputTokens":3815,"outputTokens":2,"latencyMs":11062.256375000114},{"questionI
d":"q123","format":"json","model":"gemini-2.5-flash","expected":"32","actual":"32","isCorrect":true,"inputTokens":4788,"outputTokens":2,"latencyMs":19030.404666000046},{"questionId":"q123","format":"toon","model":"gemini-2.5-flash","expected":"32","actual":"32","isCorrect":true,"inputTokens":2275,"outputTokens":2,"latencyMs":18230.727500000037},{"questionId":"q123","format":"csv","model":"gemini-2.5-flash","expected":"32","actual":"32","isCorrect":true,"inputTokens":2212,"outputTokens":2,"latencyMs":26825.848833000055},{"questionId":"q123","format":"xml","model":"gemini-2.5-flash","expected":"32","actual":"32","isCorrect":true,"inputTokens":5435,"outputTokens":2,"latencyMs":17138.20162500022},{"questionId":"q123","format":"yaml","model":"gemini-2.5-flash","expected":"32","actual":"32","isCorrect":true,"inputTokens":3818,"outputTokens":2,"latencyMs":15069.584500000114},{"questionId":"q124","format":"json","model":"gemini-2.5-flash","expected":"32","actual":"32","isCorrect":true,"inputTokens":4788,"outputTokens":2,"latencyMs":15737.726000000024},{"questionId":"q124","format":"toon","model":"gemini-2.5-flash","expected":"32","actual":"32","isCorrect":true,"inputTokens":2275,"outputTokens":2,"latencyMs":17389.034041000064},{"questionId":"q124","format":"csv","model":"gemini-2.5-flash","expected":"32","actual":"32","isCorrect":true,"inputTokens":2212,"outputTokens":2,"latencyMs":23912.79212499992},{"questionId":"q124","format":"xml","model":"gemini-2.5-flash","expected":"32","actual":"32","isCorrect":true,"inputTokens":5435,"outputTokens":2,"latencyMs":18955.741374999983},{"questionId":"q124","format":"yaml","model":"gemini-2.5-flash","expected":"32","actual":"32","isCorrect":true,"inputTokens":3818,"outputTokens":2,"latencyMs":16964.132499999832},{"questionId":"q125","format":"json","model":"gemini-2.5-flash","expected":"430886","actual":"430886","isCorrect":true,"inputTokens":19994,"outputTokens":6,"latencyMs":1331.826624999987},{"questionId":"q125","format":"toon","mo
del":"gemini-2.5-flash","expected":"430886","actual":"430886","isCorrect":true,"inputTokens":12340,"outputTokens":6,"latencyMs":2754.3997500000987},{"questionId":"q125","format":"csv","model":"gemini-2.5-flash","expected":"430886","actual":"430886","isCorrect":true,"inputTokens":12210,"outputTokens":6,"latencyMs":3215.705832999898},{"questionId":"q125","format":"xml","model":"gemini-2.5-flash","expected":"430886","actual":"430886","isCorrect":true,"inputTokens":21884,"outputTokens":6,"latencyMs":1381.106374999974},{"questionId":"q125","format":"yaml","model":"gemini-2.5-flash","expected":"430886","actual":"430886","isCorrect":true,"inputTokens":17079,"outputTokens":6,"latencyMs":1221.7535840000492},{"questionId":"q126","format":"json","model":"gemini-2.5-flash","expected":"52904","actual":"52904","isCorrect":true,"inputTokens":19997,"outputTokens":5,"latencyMs":2801.957332999911},{"questionId":"q126","format":"toon","model":"gemini-2.5-flash","expected":"52904","actual":"52904","isCorrect":true,"inputTokens":12343,"outputTokens":5,"latencyMs":2992.598999999929},{"questionId":"q126","format":"csv","model":"gemini-2.5-flash","expected":"52904","actual":"52904","isCorrect":true,"inputTokens":12213,"outputTokens":5,"latencyMs":2849.9589169998653},{"questionId":"q126","format":"xml","model":"gemini-2.5-flash","expected":"52904","actual":"52904","isCorrect":true,"inputTokens":21887,"outputTokens":5,"latencyMs":2156.555333999917},{"questionId":"q126","format":"yaml","model":"gemini-2.5-flash","expected":"52904","actual":"52904","isCorrect":true,"inputTokens":17082,"outputTokens":5,"latencyMs":3880.61549999984},{"questionId":"q127","format":"json","model":"gemini-2.5-flash","expected":"vuejs","actual":"vuejs","isCorrect":true,"inputTokens":19991,"outputTokens":2,"latencyMs":1875.6967079997994},{"questionId":"q127","format":"toon","model":"gemini-2.5-flash","expected":"vuejs","actual":"vuejs","isCorrect":true,"inputTokens":12337,"outputTokens":2,"latencyMs":2374.536707999883
},{"questionId":"q127","format":"csv","model":"gemini-2.5-flash","expected":"vuejs","actual":"vuejs","isCorrect":true,"inputTokens":12207,"outputTokens":2,"latencyMs":1791.5004999998491},{"questionId":"q127","format":"xml","model":"gemini-2.5-flash","expected":"vuejs","actual":"vuejs","isCorrect":true,"inputTokens":21881,"outputTokens":2,"latencyMs":1916.017875000136},{"questionId":"q127","format":"yaml","model":"gemini-2.5-flash","expected":"vuejs","actual":"vuejs","isCorrect":true,"inputTokens":17076,"outputTokens":2,"latencyMs":3739.0447500001173},{"questionId":"q128","format":"json","model":"gemini-2.5-flash","expected":"master","actual":"master","isCorrect":true,"inputTokens":19998,"outputTokens":1,"latencyMs":1537.8033340000547},{"questionId":"q128","format":"toon","model":"gemini-2.5-flash","expected":"master","actual":"master","isCorrect":true,"inputTokens":12344,"outputTokens":1,"latencyMs":2379.3255419998895},{"questionId":"q128","format":"csv","model":"gemini-2.5-flash","expected":"master","actual":"master","isCorrect":true,"inputTokens":12214,"outputTokens":1,"latencyMs":2604.7411670000292},{"questionId":"q128","format":"xml","model":"gemini-2.5-flash","expected":"master","actual":"master","isCorrect":true,"inputTokens":21888,"outputTokens":1,"latencyMs":1947.968707999913},{"questionId":"q128","format":"yaml","model":"gemini-2.5-flash","expected":"master","actual":"master","isCorrect":true,"inputTokens":17083,"outputTokens":1,"latencyMs":2905.215084000025},{"questionId":"q129","format":"json","model":"gemini-2.5-flash","expected":"3367","actual":"3367","isCorrect":true,"inputTokens":19990,"outputTokens":4,"latencyMs":2690.9189580001403},{"questionId":"q129","format":"toon","model":"gemini-2.5-flash","expected":"3367","actual":"3367","isCorrect":true,"inputTokens":12336,"outputTokens":4,"latencyMs":2632.326791000087},{"questionId":"q129","format":"csv","model":"gemini-2.5-flash","expected":"3367","actual":"3367","isCorrect":true,"inputTokens":12206,"outpu
tTokens":4,"latencyMs":3994.474958000006},{"questionId":"q129","format":"xml","model":"gemini-2.5-flash","expected":"3367","actual":"3367","isCorrect":true,"inputTokens":21880,"outputTokens":4,"latencyMs":1225.6673749999609},{"questionId":"q129","format":"yaml","model":"gemini-2.5-flash","expected":"3367","actual":"3367","isCorrect":true,"inputTokens":17075,"outputTokens":4,"latencyMs":1695.0257920001168},{"questionId":"q130","format":"json","model":"gemini-2.5-flash","expected":"152300","actual":"152300","isCorrect":true,"inputTokens":19996,"outputTokens":6,"latencyMs":2950.3700000001118},{"questionId":"q130","format":"toon","model":"gemini-2.5-flash","expected":"152300","actual":"152300","isCorrect":true,"inputTokens":12342,"outputTokens":6,"latencyMs":2860.1882090000436},{"questionId":"q130","format":"csv","model":"gemini-2.5-flash","expected":"152300","actual":"152300","isCorrect":true,"inputTokens":12212,"outputTokens":6,"latencyMs":2719.343499999959},{"questionId":"q130","format":"xml","model":"gemini-2.5-flash","expected":"152300","actual":"152300","isCorrect":true,"inputTokens":21886,"outputTokens":6,"latencyMs":1880.9960000000428},{"questionId":"q130","format":"yaml","model":"gemini-2.5-flash","expected":"152300","actual":"152300","isCorrect":true,"inputTokens":17081,"outputTokens":6,"latencyMs":3259.226666999981},{"questionId":"q131","format":"json","model":"gemini-2.5-flash","expected":"10668","actual":"10668","isCorrect":true,"inputTokens":19996,"outputTokens":5,"latencyMs":3616.540916000027},{"questionId":"q131","format":"toon","model":"gemini-2.5-flash","expected":"10668","actual":"10668","isCorrect":true,"inputTokens":12342,"outputTokens":5,"latencyMs":3384.6490829999093},{"questionId":"q131","format":"csv","model":"gemini-2.5-flash","expected":"10668","actual":"10668","isCorrect":true,"inputTokens":12212,"outputTokens":5,"latencyMs":3324.5547080000397},{"questionId":"q131","format":"xml","model":"gemini-2.5-flash","expected":"10668","actual":"10668",
"isCorrect":true,"inputTokens":21886,"outputTokens":5,"latencyMs":1404.578916999977},{"questionId":"q131","format":"yaml","model":"gemini-2.5-flash","expected":"10668","actual":"10668","isCorrect":true,"inputTokens":17081,"outputTokens":5,"latencyMs":3033.9148750000168},{"questionId":"q132","format":"json","model":"gemini-2.5-flash","expected":"microsoft","actual":"microsoft","isCorrect":true,"inputTokens":19991,"outputTokens":1,"latencyMs":3094.6301249999087},{"questionId":"q132","format":"toon","model":"gemini-2.5-flash","expected":"microsoft","actual":"microsoft","isCorrect":true,"inputTokens":12337,"outputTokens":1,"latencyMs":1962.830874999985},{"questionId":"q132","format":"csv","model":"gemini-2.5-flash","expected":"microsoft","actual":"microsoft","isCorrect":true,"inputTokens":12207,"outputTokens":1,"latencyMs":1890.7381249999162},{"questionId":"q132","format":"xml","model":"gemini-2.5-flash","expected":"microsoft","actual":"microsoft","isCorrect":true,"inputTokens":21881,"outputTokens":1,"latencyMs":1801.1806250000373},{"questionId":"q132","format":"yaml","model":"gemini-2.5-flash","expected":"microsoft","actual":"microsoft","isCorrect":true,"inputTokens":17076,"outputTokens":1,"latencyMs":1827.8805840001442},{"questionId":"q133","format":"json","model":"gemini-2.5-flash","expected":"main","actual":"main","isCorrect":true,"inputTokens":20000,"outputTokens":1,"latencyMs":1384.7649999998976},{"questionId":"q133","format":"toon","model":"gemini-2.5-flash","expected":"main","actual":"main","isCorrect":true,"inputTokens":12346,"outputTokens":1,"latencyMs":2846.0689999999013},{"questionId":"q133","format":"csv","model":"gemini-2.5-flash","expected":"main","actual":"main","isCorrect":true,"inputTokens":12216,"outputTokens":1,"latencyMs":2928.758332999889},{"questionId":"q133","format":"xml","model":"gemini-2.5-flash","expected":"main","actual":"main","isCorrect":true,"inputTokens":21890,"outputTokens":1,"latencyMs":1256.0088329999708},{"questionId":"q133","format"
:"yaml","model":"gemini-2.5-flash","expected":"main","actual":"main","isCorrect":true,"inputTokens":17085,"outputTokens":1,"latencyMs":1302.6011670001317},{"questionId":"q134","format":"json","model":"gemini-2.5-flash","expected":"2518","actual":"2518","isCorrect":true,"inputTokens":19994,"outputTokens":4,"latencyMs":2087.42662500008},{"questionId":"q134","format":"toon","model":"gemini-2.5-flash","expected":"2518","actual":"2518","isCorrect":true,"inputTokens":12340,"outputTokens":4,"latencyMs":2163.39154099999},{"questionId":"q134","format":"csv","model":"gemini-2.5-flash","expected":"2518","actual":"2518","isCorrect":true,"inputTokens":12210,"outputTokens":4,"latencyMs":2486.9835000000894},{"questionId":"q134","format":"xml","model":"gemini-2.5-flash","expected":"2518","actual":"2518","isCorrect":true,"inputTokens":21884,"outputTokens":4,"latencyMs":1301.170916999923},{"questionId":"q134","format":"yaml","model":"gemini-2.5-flash","expected":"2518","actual":"2518","isCorrect":true,"inputTokens":17079,"outputTokens":4,"latencyMs":2038.1757920000236},{"questionId":"q135","format":"json","model":"gemini-2.5-flash","expected":"103358","actual":"103358","isCorrect":true,"inputTokens":19997,"outputTokens":6,"latencyMs":1919.6549170000944},{"questionId":"q135","format":"toon","model":"gemini-2.5-flash","expected":"103358","actual":"103358","isCorrect":true,"inputTokens":12343,"outputTokens":6,"latencyMs":3052.263125000056},{"questionId":"q135","format":"csv","model":"gemini-2.5-flash","expected":"103358","actual":"103358","isCorrect":true,"inputTokens":12213,"outputTokens":6,"latencyMs":2680.76866699988},{"questionId":"q135","format":"xml","model":"gemini-2.5-flash","expected":"103358","actual":"103358","isCorrect":true,"inputTokens":21887,"outputTokens":6,"latencyMs":1712.8050000001676},{"questionId":"q135","format":"yaml","model":"gemini-2.5-flash","expected":"103358","actual":"103358","isCorrect":true,"inputTokens":17082,"outputTokens":6,"latencyMs":1682.245165999978
8},{"questionId":"q136","format":"json","model":"gemini-2.5-flash","expected":"15413563","actual":"10990000","isCorrect":false,"inputTokens":19992,"outputTokens":8,"latencyMs":1728.4418339999393},{"questionId":"q136","format":"toon","model":"gemini-2.5-flash","expected":"15413563","actual":"12800000","isCorrect":false,"inputTokens":12338,"outputTokens":8,"latencyMs":20421.481582999928},{"questionId":"q136","format":"csv","model":"gemini-2.5-flash","expected":"15413563","actual":"12990000","isCorrect":false,"inputTokens":12208,"outputTokens":8,"latencyMs":81750.67420800007},{"questionId":"q136","format":"xml","model":"gemini-2.5-flash","expected":"15413563","actual":"10999999","isCorrect":false,"inputTokens":21882,"outputTokens":8,"latencyMs":2435.5069170000497},{"questionId":"q136","format":"yaml","model":"gemini-2.5-flash","expected":"15413563","actual":"10990000","isCorrect":false,"inputTokens":17077,"outputTokens":8,"latencyMs":1923.7230409998447},{"questionId":"q137","format":"json","model":"gemini-2.5-flash","expected":"100","actual":"60","isCorrect":false,"inputTokens":19989,"outputTokens":2,"latencyMs":1710.2884579999372},{"questionId":"q137","format":"toon","model":"gemini-2.5-flash","expected":"100","actual":"100","isCorrect":true,"inputTokens":12335,"outputTokens":3,"latencyMs":13404.761375000002},{"questionId":"q137","format":"csv","model":"gemini-2.5-flash","expected":"100","actual":"100","isCorrect":true,"inputTokens":12205,"outputTokens":3,"latencyMs":6119.993791999994},{"questionId":"q137","format":"xml","model":"gemini-2.5-flash","expected":"100","actual":"60","isCorrect":false,"inputTokens":21879,"outputTokens":2,"latencyMs":1758.3802920000162},{"questionId":"q137","format":"yaml","model":"gemini-2.5-flash","expected":"100","actual":"60","isCorrect":false,"inputTokens":17074,"outputTokens":2,"latencyMs":2015.8025829999242},{"questionId":"q138","format":"json","model":"gemini-2.5-flash","expected":"154136","actual":"169000","isCorrect":false,"inputTo
kens":19991,"outputTokens":6,"latencyMs":923.6561660000589},{"questionId":"q138","format":"toon","model":"gemini-2.5-flash","expected":"154136","actual":"149999.99","isCorrect":false,"inputTokens":12337,"outputTokens":9,"latencyMs":16272.457915999927},{"questionId":"q138","format":"csv","model":"gemini-2.5-flash","expected":"154136","actual":"144333","isCorrect":false,"inputTokens":12207,"outputTokens":6,"latencyMs":11209.916749999858},{"questionId":"q138","format":"xml","model":"gemini-2.5-flash","expected":"154136","actual":"166666.66","isCorrect":false,"inputTokens":21881,"outputTokens":9,"latencyMs":135129.45999999996},{"questionId":"q138","format":"yaml","model":"gemini-2.5-flash","expected":"154136","actual":"149999.99","isCorrect":false,"inputTokens":17076,"outputTokens":9,"latencyMs":20043.56337499991},{"questionId":"q139","format":"json","model":"gemini-2.5-flash","expected":"77","actual":"54","isCorrect":false,"inputTokens":19996,"outputTokens":2,"latencyMs":1299.9022079999559},{"questionId":"q139","format":"toon","model":"gemini-2.5-flash","expected":"77","actual":"77","isCorrect":true,"inputTokens":12342,"outputTokens":2,"latencyMs":21103.826624999987},{"questionId":"q139","format":"csv","model":"gemini-2.5-flash","expected":"77","actual":"77","isCorrect":true,"inputTokens":12212,"outputTokens":2,"latencyMs":15810.050041000126},{"questionId":"q139","format":"xml","model":"gemini-2.5-flash","expected":"77","actual":"77","isCorrect":true,"inputTokens":21886,"outputTokens":2,"latencyMs":16842.129749999847},{"questionId":"q139","format":"yaml","model":"gemini-2.5-flash","expected":"77","actual":"48","isCorrect":false,"inputTokens":17081,"outputTokens":2,"latencyMs":1482.9725420000032},{"questionId":"q140","format":"json","model":"gemini-2.5-flash","expected":"37","actual":"37","isCorrect":true,"inputTokens":19996,"outputTokens":2,"latencyMs":1171.0767920000944},{"questionId":"q140","format":"toon","model":"gemini-2.5-flash","expected":"37","actual":"37","isC
orrect":true,"inputTokens":12342,"outputTokens":2,"latencyMs":8308.945374999894},{"questionId":"q140","format":"csv","model":"gemini-2.5-flash","expected":"37","actual":"37","isCorrect":true,"inputTokens":12212,"outputTokens":2,"latencyMs":88574.87341700005},{"questionId":"q140","format":"xml","model":"gemini-2.5-flash","expected":"37","actual":"37","isCorrect":true,"inputTokens":21886,"outputTokens":2,"latencyMs":17164.141541999998},{"questionId":"q140","format":"yaml","model":"gemini-2.5-flash","expected":"37","actual":"31","isCorrect":false,"inputTokens":17081,"outputTokens":2,"latencyMs":1420.0008750001434},{"questionId":"q141","format":"json","model":"gemini-2.5-flash","expected":"16","actual":"16","isCorrect":true,"inputTokens":19996,"outputTokens":2,"latencyMs":1639.8862499999814},{"questionId":"q141","format":"toon","model":"gemini-2.5-flash","expected":"16","actual":"16","isCorrect":true,"inputTokens":12342,"outputTokens":2,"latencyMs":5415.8904999999795},{"questionId":"q141","format":"csv","model":"gemini-2.5-flash","expected":"16","actual":"16","isCorrect":true,"inputTokens":12212,"outputTokens":2,"latencyMs":15452.510082999943},{"questionId":"q141","format":"xml","model":"gemini-2.5-flash","expected":"16","actual":"16","isCorrect":true,"inputTokens":21886,"outputTokens":2,"latencyMs":18354.517375000054},{"questionId":"q141","format":"yaml","model":"gemini-2.5-flash","expected":"16","actual":"15","isCorrect":false,"inputTokens":17081,"outputTokens":2,"latencyMs":1621.2825830001384},{"questionId":"q142","format":"json","model":"gemini-2.5-flash","expected":"49","actual":"45","isCorrect":false,"inputTokens":19995,"outputTokens":2,"latencyMs":1736.1554170001764},{"questionId":"q142","format":"toon","model":"gemini-2.5-flash","expected":"49","actual":"49","isCorrect":true,"inputTokens":12341,"outputTokens":2,"latencyMs":16870.034584000008},{"questionId":"q142","format":"csv","model":"gemini-2.5-flash","expected":"49","actual":"49","isCorrect":true,"inputToken
s":12211,"outputTokens":2,"latencyMs":16379.907332999865},{"questionId":"q142","format":"xml","model":"gemini-2.5-flash","expected":"49","actual":"49","isCorrect":true,"inputTokens":21885,"outputTokens":2,"latencyMs":25082.777166999876},{"questionId":"q142","format":"yaml","model":"gemini-2.5-flash","expected":"49","actual":"49","isCorrect":true,"inputTokens":17080,"outputTokens":2,"latencyMs":19932.96479200013},{"questionId":"q143","format":"json","model":"gemini-2.5-flash","expected":"23","actual":"23","isCorrect":true,"inputTokens":19995,"outputTokens":2,"latencyMs":13783.272125000134},{"questionId":"q143","format":"toon","model":"gemini-2.5-flash","expected":"23","actual":"23","isCorrect":true,"inputTokens":12341,"outputTokens":2,"latencyMs":13312.562875000061},{"questionId":"q143","format":"csv","model":"gemini-2.5-flash","expected":"23","actual":"23","isCorrect":true,"inputTokens":12211,"outputTokens":2,"latencyMs":27375.345000000205},{"questionId":"q143","format":"xml","model":"gemini-2.5-flash","expected":"23","actual":"23","isCorrect":true,"inputTokens":21885,"outputTokens":2,"latencyMs":17887.27408299991},{"questionId":"q143","format":"yaml","model":"gemini-2.5-flash","expected":"23","actual":"25","isCorrect":false,"inputTokens":17080,"outputTokens":2,"latencyMs":10026.456541999942},{"questionId":"q144","format":"json","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":19995,"outputTokens":2,"latencyMs":14357.982542000012},{"questionId":"q144","format":"toon","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":12341,"outputTokens":2,"latencyMs":13512.807207999984},{"questionId":"q144","format":"csv","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":12211,"outputTokens":2,"latencyMs":99156.41708399984},{"questionId":"q144","format":"xml","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":21885,"outputTokens":2,"
latencyMs":15279.87195800012},{"questionId":"q144","format":"yaml","model":"gemini-2.5-flash","expected":"11","actual":"11","isCorrect":true,"inputTokens":17080,"outputTokens":2,"latencyMs":16611.06645900011},{"questionId":"q145","format":"json","model":"gemini-2.5-flash","expected":"19","actual":"12","isCorrect":false,"inputTokens":19994,"outputTokens":2,"latencyMs":1430.3512500000652},{"questionId":"q145","format":"toon","model":"gemini-2.5-flash","expected":"19","actual":"19","isCorrect":true,"inputTokens":12340,"outputTokens":2,"latencyMs":25215.23679200001},{"questionId":"q145","format":"csv","model":"gemini-2.5-flash","expected":"19","actual":"18","isCorrect":false,"inputTokens":12210,"outputTokens":2,"latencyMs":13470.632208999945},{"questionId":"q145","format":"xml","model":"gemini-2.5-flash","expected":"19","actual":"19","isCorrect":true,"inputTokens":21884,"outputTokens":2,"latencyMs":17431.872999999905},{"questionId":"q145","format":"yaml","model":"gemini-2.5-flash","expected":"19","actual":"19","isCorrect":true,"inputTokens":17079,"outputTokens":2,"latencyMs":14449.361540999962},{"questionId":"q146","format":"json","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":19994,"outputTokens":1,"latencyMs":19472.050208},{"questionId":"q146","format":"toon","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":12340,"outputTokens":1,"latencyMs":15048.826249999925},{"questionId":"q146","format":"csv","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":12210,"outputTokens":1,"latencyMs":87955.33499999996},{"questionId":"q146","format":"xml","model":"gemini-2.5-flash","expected":"4","actual":"4","isCorrect":true,"inputTokens":21884,"outputTokens":1,"latencyMs":9616.131832999876},{"questionId":"q146","format":"yaml","model":"gemini-2.5-flash","expected":"4","actual":"9","isCorrect":false,"inputTokens":17079,"outputTokens":1,"latencyMs":1725.5754590001889},{"questionI
d":"q147","format":"json","model":"gemini-2.5-flash","expected":"41","actual":"30","isCorrect":false,"inputTokens":19993,"outputTokens":2,"latencyMs":1532.5546249998733},{"questionId":"q147","format":"toon","model":"gemini-2.5-flash","expected":"41","actual":"41","isCorrect":true,"inputTokens":12339,"outputTokens":2,"latencyMs":12422.034999999916},{"questionId":"q147","format":"csv","model":"gemini-2.5-flash","expected":"41","actual":"30","isCorrect":false,"inputTokens":12209,"outputTokens":2,"latencyMs":1742.5015419998672},{"questionId":"q147","format":"xml","model":"gemini-2.5-flash","expected":"41","actual":"30","isCorrect":false,"inputTokens":21883,"outputTokens":2,"latencyMs":1362.4468750001397},{"questionId":"q147","format":"yaml","model":"gemini-2.5-flash","expected":"41","actual":"30","isCorrect":false,"inputTokens":17078,"outputTokens":2,"latencyMs":1193.8830829998478},{"questionId":"q148","format":"json","model":"gemini-2.5-flash","expected":"53","actual":"30","isCorrect":false,"inputTokens":19993,"outputTokens":2,"latencyMs":1353.424875000026},{"questionId":"q148","format":"toon","model":"gemini-2.5-flash","expected":"53","actual":"50","isCorrect":false,"inputTokens":12339,"outputTokens":2,"latencyMs":1249.1261670000385},{"questionId":"q148","format":"csv","model":"gemini-2.5-flash","expected":"53","actual":"45","isCorrect":false,"inputTokens":12209,"outputTokens":2,"latencyMs":1972.7094580000266},{"questionId":"q148","format":"xml","model":"gemini-2.5-flash","expected":"53","actual":"40","isCorrect":false,"inputTokens":21883,"outputTokens":2,"latencyMs":1756.154957999941},{"questionId":"q148","format":"yaml","model":"gemini-2.5-flash","expected":"53","actual":"40","isCorrect":false,"inputTokens":17078,"outputTokens":2,"latencyMs":1228.0062500000931},{"questionId":"q149","format":"json","model":"gemini-2.5-flash","expected":"57","actual":"57","isCorrect":true,"inputTokens":20005,"outputTokens":2,"latencyMs":47274.037834000075},{"questionId":"q149","format
":"toon","model":"gemini-2.5-flash","expected":"57","actual":"57","isCorrect":true,"inputTokens":12351,"outputTokens":2,"latencyMs":45877.21466599987},{"questionId":"q149","format":"csv","model":"gemini-2.5-flash","expected":"57","actual":"57","isCorrect":true,"inputTokens":12221,"outputTokens":2,"latencyMs":26619.96366700018},{"questionId":"q149","format":"xml","model":"gemini-2.5-flash","expected":"57","actual":"57","isCorrect":true,"inputTokens":21895,"outputTokens":2,"latencyMs":51367.91837499989},{"questionId":"q149","format":"yaml","model":"gemini-2.5-flash","expected":"57","actual":"57","isCorrect":true,"inputTokens":17090,"outputTokens":2,"latencyMs":53637.45275000017},{"questionId":"q150","format":"json","model":"gemini-2.5-flash","expected":"43","actual":"43","isCorrect":true,"inputTokens":20006,"outputTokens":2,"latencyMs":25561.847749999957},{"questionId":"q150","format":"toon","model":"gemini-2.5-flash","expected":"43","actual":"43","isCorrect":true,"inputTokens":12352,"outputTokens":2,"latencyMs":26499.081000000006},{"questionId":"q150","format":"csv","model":"gemini-2.5-flash","expected":"43","actual":"43","isCorrect":true,"inputTokens":12222,"outputTokens":2,"latencyMs":25068.995084000053},{"questionId":"q150","format":"xml","model":"gemini-2.5-flash","expected":"43","actual":"43","isCorrect":true,"inputTokens":21896,"outputTokens":2,"latencyMs":25324.510249999817},{"questionId":"q150","format":"yaml","model":"gemini-2.5-flash","expected":"43","actual":"43","isCorrect":true,"inputTokens":17091,"outputTokens":2,"latencyMs":22352.21083300002},{"questionId":"q151","format":"json","model":"gemini-2.5-flash","expected":"25","actual":"25","isCorrect":true,"inputTokens":20006,"outputTokens":2,"latencyMs":25705.24091699999},{"questionId":"q151","format":"toon","model":"gemini-2.5-flash","expected":"25","actual":"25","isCorrect":true,"inputTokens":12352,"outputTokens":2,"latencyMs":22850.758625000017},{"questionId":"q151","format":"csv","model":"gemini-2.5-fl
ash","expected":"25","actual":"25","isCorrect":true,"inputTokens":12222,"outputTokens":2,"latencyMs":100242.82016699994},{"questionId":"q151","format":"xml","model":"gemini-2.5-flash","expected":"25","actual":"25","isCorrect":true,"inputTokens":21896,"outputTokens":2,"latencyMs":22053.189250000054},{"questionId":"q151","format":"yaml","model":"gemini-2.5-flash","expected":"25","actual":"25","isCorrect":true,"inputTokens":17091,"outputTokens":2,"latencyMs":24186.479082999984},{"questionId":"q152","format":"json","model":"gemini-2.5-flash","expected":"6","actual":"6","isCorrect":true,"inputTokens":20006,"outputTokens":1,"latencyMs":7077.461167000001},{"questionId":"q152","format":"toon","model":"gemini-2.5-flash","expected":"6","actual":"6","isCorrect":true,"inputTokens":12352,"outputTokens":1,"latencyMs":12635.33349999995},{"questionId":"q152","format":"csv","model":"gemini-2.5-flash","expected":"6","actual":"6","isCorrect":true,"inputTokens":12222,"outputTokens":1,"latencyMs":25134.92562500015},{"questionId":"q152","format":"xml","model":"gemini-2.5-flash","expected":"6","actual":"6","isCorrect":true,"inputTokens":21896,"outputTokens":1,"latencyMs":24282.145332999993},{"questionId":"q152","format":"yaml","model":"gemini-2.5-flash","expected":"6","actual":"6","isCorrect":true,"inputTokens":17091,"outputTokens":1,"latencyMs":18622.97499999986},{"questionId":"q153","format":"json","model":"gemini-2.5-flash","expected":"6","actual":"6","isCorrect":true,"inputTokens":20005,"outputTokens":1,"latencyMs":23772.71641699993},{"questionId":"q153","format":"toon","model":"gemini-2.5-flash","expected":"6","actual":"6","isCorrect":true,"inputTokens":12351,"outputTokens":1,"latencyMs":27072.890291000018},{"questionId":"q153","format":"csv","model":"gemini-2.5-flash","expected":"6","actual":"6","isCorrect":true,"inputTokens":12221,"outputTokens":1,"latencyMs":36290.79170900001},{"questionId":"q153","format":"xml","model":"gemini-2.5-flash","expected":"6","actual":"6","isCorrect":tr
ue,"inputTokens":21895,"outputTokens":1,"latencyMs":21293.912832999835},{"questionId":"q153","format":"yaml","model":"gemini-2.5-flash","expected":"6","actual":"6","isCorrect":true,"inputTokens":17090,"outputTokens":1,"latencyMs":26820.58583300002},{"questionId":"q154","format":"json","model":"gemini-2.5-flash","expected":"1","actual":"1","isCorrect":true,"inputTokens":20005,"outputTokens":1,"latencyMs":11658.803499999922},{"questionId":"q154","format":"toon","model":"gemini-2.5-flash","expected":"1","actual":"1","isCorrect":true,"inputTokens":12351,"outputTokens":1,"latencyMs":17849.99624999985},{"questionId":"q154","format":"csv","model":"gemini-2.5-flash","expected":"1","actual":"1","isCorrect":true,"inputTokens":12221,"outputTokens":1,"latencyMs":35389.41012499994},{"questionId":"q154","format":"xml","model":"gemini-2.5-flash","expected":"1","actual":"1","isCorrect":true,"inputTokens":21895,"outputTokens":1,"latencyMs":14284.633541000076},{"questionId":"q154","format":"yaml","model":"gemini-2.5-flash","expected":"1","actual":"1","isCorrect":true,"inputTokens":17090,"outputTokens":1,"latencyMs":7885.770458000014}]
\ No newline at end of file
diff --git a/benchmarks/results/accuracy/models/gpt-5-nano b/benchmarks/results/accuracy/models/gpt-5-nano
new file mode 100644
index 0000000..9741bd9
--- /dev/null
+++ b/benchmarks/results/accuracy/models/gpt-5-nano
@@ -0,0 +1 @@
+[{"questionId":"q1","format":"json","model":"gpt-5-nano","expected":"56176","actual":"56176","isCorrect":true,"inputTokens":6390,"outputTokens":72,"latencyMs":2286.895917},{"questionId":"q1","format":"toon","model":"gpt-5-nano","expected":"56176","actual":"56176","isCorrect":true,"inputTokens":2527,"outputTokens":72,"latencyMs":2080.2120830000003},{"questionId":"q1","format":"csv","model":"gpt-5-nano","expected":"56176","actual":"56176","isCorrect":true,"inputTokens":2381,"outputTokens":72,"latencyMs":2368.424333000001},{"questionId":"q1","format":"xml","model":"gpt-5-nano","expected":"56176","actual":"56176","isCorrect":true,"inputTokens":7357,"outputTokens":136,"latencyMs":3603.5194579999998},{"questionId":"q1","format":"yaml","model":"gpt-5-nano","expected":"56176","actual":"56176","isCorrect":true,"inputTokens":5012,"outputTokens":72,"latencyMs":2783.105292},{"questionId":"q2","format":"json","model":"gpt-5-nano","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":6390,"outputTokens":71,"latencyMs":3301.9922080000006},{"questionId":"q2","format":"toon","model":"gpt-5-nano","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":2527,"outputTokens":71,"latencyMs":2579.989917000001},{"questionId":"q2","format":"csv","model":"gpt-5-nano","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":2381,"outputTokens":71,"latencyMs":2071.0654589999995},{"questionId":"q2","format":"xml","model":"gpt-5-nano","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":7357,"outputTokens":135,"latencyMs":2436.658125000001},{"questionId":"q2","format":"yaml","model":"gpt-5-nano","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":5012,"outputTokens":71,"latencyMs":3412.149292},{"questionId":"q3","format":"json","model":"gpt-5-nano","expected":"lorenza.kunze@yahoo.com","actual":"lorenza.kunze@yahoo.com","isCorrect":true,"inputTokens":6392,"outputTokens":140,"latencyMs":5299.2903
75},{"questionId":"q3","format":"toon","model":"gpt-5-nano","expected":"lorenza.kunze@yahoo.com","actual":"lorenza.kunze@yahoo.com","isCorrect":true,"inputTokens":2529,"outputTokens":76,"latencyMs":2528.3222079999996},{"questionId":"q3","format":"csv","model":"gpt-5-nano","expected":"lorenza.kunze@yahoo.com","actual":"lorenza.kunze@yahoo.com","isCorrect":true,"inputTokens":2383,"outputTokens":140,"latencyMs":3022.2497079999994},{"questionId":"q3","format":"xml","model":"gpt-5-nano","expected":"lorenza.kunze@yahoo.com","actual":"lorenza.kunze@yahoo.com","isCorrect":true,"inputTokens":7359,"outputTokens":204,"latencyMs":3238.962124999999},{"questionId":"q3","format":"yaml","model":"gpt-5-nano","expected":"lorenza.kunze@yahoo.com","actual":"lorenza.kunze@yahoo.com","isCorrect":true,"inputTokens":5014,"outputTokens":140,"latencyMs":2557.434041999999},{"questionId":"q4","format":"json","model":"gpt-5-nano","expected":"22","actual":"22","isCorrect":true,"inputTokens":6392,"outputTokens":71,"latencyMs":3143.1138339999998},{"questionId":"q4","format":"toon","model":"gpt-5-nano","expected":"22","actual":"22","isCorrect":true,"inputTokens":2529,"outputTokens":71,"latencyMs":2368.6757910000006},{"questionId":"q4","format":"csv","model":"gpt-5-nano","expected":"22","actual":"22","isCorrect":true,"inputTokens":2383,"outputTokens":71,"latencyMs":2801.3656659999997},{"questionId":"q4","format":"xml","model":"gpt-5-nano","expected":"22","actual":"22","isCorrect":true,"inputTokens":7359,"outputTokens":199,"latencyMs":3047.416791999999},{"questionId":"q4","format":"yaml","model":"gpt-5-nano","expected":"22","actual":"22","isCorrect":true,"inputTokens":5014,"outputTokens":71,"latencyMs":3128.1965420000006},{"questionId":"q5","format":"json","model":"gpt-5-nano","expected":"no","actual":"No","isCorrect":true,"inputTokens":6388,"outputTokens":199,"latencyMs":2972.251875},{"questionId":"q5","format":"toon","model":"gpt-5-nano","expected":"no","actual":"false","isCorrect":true,"inputToken
s":2525,"outputTokens":327,"latencyMs":4319.013167000001},{"questionId":"q5","format":"csv","model":"gpt-5-nano","expected":"no","actual":"No","isCorrect":true,"inputTokens":2379,"outputTokens":647,"latencyMs":8503.479375},{"questionId":"q5","format":"xml","model":"gpt-5-nano","expected":"no","actual":"false","isCorrect":true,"inputTokens":7355,"outputTokens":263,"latencyMs":3787.869708},{"questionId":"q5","format":"yaml","model":"gpt-5-nano","expected":"no","actual":"false","isCorrect":true,"inputTokens":5010,"outputTokens":327,"latencyMs":4215.784416999999},{"questionId":"q6","format":"json","model":"gpt-5-nano","expected":"133081","actual":"133081","isCorrect":true,"inputTokens":6389,"outputTokens":72,"latencyMs":2708.392833},{"questionId":"q6","format":"toon","model":"gpt-5-nano","expected":"133081","actual":"133081","isCorrect":true,"inputTokens":2526,"outputTokens":72,"latencyMs":2004.1692079999993},{"questionId":"q6","format":"csv","model":"gpt-5-nano","expected":"133081","actual":"133081","isCorrect":true,"inputTokens":2380,"outputTokens":136,"latencyMs":2530.7687079999996},{"questionId":"q6","format":"xml","model":"gpt-5-nano","expected":"133081","actual":"133081","isCorrect":true,"inputTokens":7356,"outputTokens":136,"latencyMs":2244.525791},{"questionId":"q6","format":"yaml","model":"gpt-5-nano","expected":"133081","actual":"133081","isCorrect":true,"inputTokens":5011,"outputTokens":136,"latencyMs":2472.8984170000003},{"questionId":"q7","format":"json","model":"gpt-5-nano","expected":"Engineering","actual":"Engineering","isCorrect":true,"inputTokens":6390,"outputTokens":7,"latencyMs":1896.0880000000016},{"questionId":"q7","format":"toon","model":"gpt-5-nano","expected":"Engineering","actual":"Engineering","isCorrect":true,"inputTokens":2527,"outputTokens":71,"latencyMs":2263.058832999999},{"questionId":"q7","format":"csv","model":"gpt-5-nano","expected":"Engineering","actual":"Engineering","isCorrect":true,"inputTokens":2381,"outputTokens":71,"latencyMs":
2069.6880410000012},{"questionId":"q7","format":"xml","model":"gpt-5-nano","expected":"Engineering","actual":"Engineering","isCorrect":true,"inputTokens":7357,"outputTokens":135,"latencyMs":2421.5882500000007},{"questionId":"q7","format":"yaml","model":"gpt-5-nano","expected":"Engineering","actual":"Engineering","isCorrect":true,"inputTokens":5012,"outputTokens":71,"latencyMs":2004.3543750000008},{"questionId":"q8","format":"json","model":"gpt-5-nano","expected":"delpha.russel@gmail.com","actual":"delpha.russel@gmail.com","isCorrect":true,"inputTokens":6391,"outputTokens":77,"latencyMs":2184.6345},{"questionId":"q8","format":"toon","model":"gpt-5-nano","expected":"delpha.russel@gmail.com","actual":"delpha.russel@gmail.com","isCorrect":true,"inputTokens":2528,"outputTokens":141,"latencyMs":3463.506875000001},{"questionId":"q8","format":"csv","model":"gpt-5-nano","expected":"delpha.russel@gmail.com","actual":"delpha.russel@gmail.com","isCorrect":true,"inputTokens":2382,"outputTokens":141,"latencyMs":2491.552375000001},{"questionId":"q8","format":"xml","model":"gpt-5-nano","expected":"delpha.russel@gmail.com","actual":"delpha.russel@gmail.com","isCorrect":true,"inputTokens":7358,"outputTokens":141,"latencyMs":2773.072124999999},{"questionId":"q8","format":"yaml","model":"gpt-5-nano","expected":"delpha.russel@gmail.com","actual":"delpha.russel@gmail.com","isCorrect":true,"inputTokens":5013,"outputTokens":77,"latencyMs":3364.1551249999993},{"questionId":"q9","format":"json","model":"gpt-5-nano","expected":"5","actual":"5","isCorrect":true,"inputTokens":6393,"outputTokens":263,"latencyMs":8407.982375000001},{"questionId":"q9","format":"toon","model":"gpt-5-nano","expected":"5","actual":"5","isCorrect":true,"inputTokens":2530,"outputTokens":135,"latencyMs":3553.8328330000004},{"questionId":"q9","format":"csv","model":"gpt-5-nano","expected":"5","actual":"5","isCorrect":true,"inputTokens":2384,"outputTokens":455,"latencyMs":8108.107749999997},{"questionId":"q9","format":"xm
l","model":"gpt-5-nano","expected":"5","actual":"5","isCorrect":true,"inputTokens":7360,"outputTokens":199,"latencyMs":7765.270042},{"questionId":"q9","format":"yaml","model":"gpt-5-nano","expected":"5","actual":"5","isCorrect":true,"inputTokens":5015,"outputTokens":135,"latencyMs":2643.0200829999994},{"questionId":"q10","format":"json","model":"gpt-5-nano","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":6390,"outputTokens":135,"latencyMs":3184.6242919999986},{"questionId":"q10","format":"toon","model":"gpt-5-nano","expected":"yes","actual":"true","isCorrect":true,"inputTokens":2527,"outputTokens":263,"latencyMs":3714.655332999999},{"questionId":"q10","format":"csv","model":"gpt-5-nano","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":2381,"outputTokens":135,"latencyMs":2412.2727080000004},{"questionId":"q10","format":"xml","model":"gpt-5-nano","expected":"yes","actual":"true","isCorrect":true,"inputTokens":7357,"outputTokens":391,"latencyMs":4610.666667000001},{"questionId":"q10","format":"yaml","model":"gpt-5-nano","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":5012,"outputTokens":135,"latencyMs":2337.4404170000016},{"questionId":"q11","format":"json","model":"gpt-5-nano","expected":"109064","actual":"109064","isCorrect":true,"inputTokens":6390,"outputTokens":264,"latencyMs":3417.8135},{"questionId":"q11","format":"toon","model":"gpt-5-nano","expected":"109064","actual":"109064","isCorrect":true,"inputTokens":2527,"outputTokens":200,"latencyMs":2675.2862499999974},{"questionId":"q11","format":"csv","model":"gpt-5-nano","expected":"109064","actual":"109064","isCorrect":true,"inputTokens":2381,"outputTokens":136,"latencyMs":4805.800959},{"questionId":"q11","format":"xml","model":"gpt-5-nano","expected":"109064","actual":"109064","isCorrect":true,"inputTokens":7357,"outputTokens":136,"latencyMs":2649.596416999997},{"questionId":"q11","format":"yaml","model":"gpt-5-nano","expected":"109064","actual":"109064","isCorrect":tru
e,"inputTokens":5012,"outputTokens":72,"latencyMs":2322.7847089999996},{"questionId":"q12","format":"json","model":"gpt-5-nano","expected":"Operations","actual":"Operations","isCorrect":true,"inputTokens":6388,"outputTokens":263,"latencyMs":4128.735457999999},{"questionId":"q12","format":"toon","model":"gpt-5-nano","expected":"Operations","actual":"Operations","isCorrect":true,"inputTokens":2525,"outputTokens":71,"latencyMs":2963.8491250000006},{"questionId":"q12","format":"csv","model":"gpt-5-nano","expected":"Operations","actual":"Operations","isCorrect":true,"inputTokens":2379,"outputTokens":71,"latencyMs":3226.5830000000024},{"questionId":"q12","format":"xml","model":"gpt-5-nano","expected":"Operations","actual":"Operations","isCorrect":true,"inputTokens":7355,"outputTokens":135,"latencyMs":3400.928915999997},{"questionId":"q12","format":"yaml","model":"gpt-5-nano","expected":"Operations","actual":"Operations","isCorrect":true,"inputTokens":5010,"outputTokens":711,"latencyMs":8393.479792000002},{"questionId":"q13","format":"json","model":"gpt-5-nano","expected":"henderson70@yahoo.com","actual":"henderson70@yahoo.com","isCorrect":true,"inputTokens":6389,"outputTokens":75,"latencyMs":2083.456707999998},{"questionId":"q13","format":"toon","model":"gpt-5-nano","expected":"henderson70@yahoo.com","actual":"henderson70@yahoo.com","isCorrect":true,"inputTokens":2526,"outputTokens":75,"latencyMs":2282.820208000001},{"questionId":"q13","format":"csv","model":"gpt-5-nano","expected":"henderson70@yahoo.com","actual":"henderson70@yahoo.com","isCorrect":true,"inputTokens":2380,"outputTokens":139,"latencyMs":1922.3527920000015},{"questionId":"q13","format":"xml","model":"gpt-5-nano","expected":"henderson70@yahoo.com","actual":"henderson70@yahoo.com","isCorrect":true,"inputTokens":7356,"outputTokens":139,"latencyMs":1967.7009160000016},{"questionId":"q13","format":"yaml","model":"gpt-5-nano","expected":"henderson70@yahoo.com","actual":"henderson70@yahoo.com","isCorrect":true,"i
nputTokens":5011,"outputTokens":75,"latencyMs":2097.907542000001},{"questionId":"q14","format":"json","model":"gpt-5-nano","expected":"23","actual":"23","isCorrect":true,"inputTokens":6391,"outputTokens":135,"latencyMs":3816.0825000000004},{"questionId":"q14","format":"toon","model":"gpt-5-nano","expected":"23","actual":"23","isCorrect":true,"inputTokens":2528,"outputTokens":71,"latencyMs":1841.6428339999984},{"questionId":"q14","format":"csv","model":"gpt-5-nano","expected":"23","actual":"23","isCorrect":true,"inputTokens":2382,"outputTokens":135,"latencyMs":2661.6788750000014},{"questionId":"q14","format":"xml","model":"gpt-5-nano","expected":"23","actual":"23","isCorrect":true,"inputTokens":7358,"outputTokens":135,"latencyMs":3028.1100410000035},{"questionId":"q14","format":"yaml","model":"gpt-5-nano","expected":"23","actual":"23","isCorrect":true,"inputTokens":5013,"outputTokens":135,"latencyMs":2456.2266249999993},{"questionId":"q15","format":"json","model":"gpt-5-nano","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":6388,"outputTokens":135,"latencyMs":2595.8724580000016},{"questionId":"q15","format":"toon","model":"gpt-5-nano","expected":"yes","actual":"true","isCorrect":true,"inputTokens":2525,"outputTokens":199,"latencyMs":3002.6034579999978},{"questionId":"q15","format":"csv","model":"gpt-5-nano","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":2379,"outputTokens":263,"latencyMs":3817.756000000001},{"questionId":"q15","format":"xml","model":"gpt-5-nano","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":7355,"outputTokens":455,"latencyMs":4972.323082999999},{"questionId":"q15","format":"yaml","model":"gpt-5-nano","expected":"yes","actual":"true","isCorrect":true,"inputTokens":5010,"outputTokens":327,"latencyMs":7745.852374999999},{"questionId":"q16","format":"json","model":"gpt-5-nano","expected":"89436","actual":"89436","isCorrect":true,"inputTokens":6389,"outputTokens":72,"latencyMs":2094.709333999999},{"questionId
":"q16","format":"toon","model":"gpt-5-nano","expected":"89436","actual":"89436","isCorrect":true,"inputTokens":2526,"outputTokens":72,"latencyMs":3989.400916999999},{"questionId":"q16","format":"csv","model":"gpt-5-nano","expected":"89436","actual":"89436","isCorrect":true,"inputTokens":2380,"outputTokens":72,"latencyMs":1999.0430420000012},{"questionId":"q16","format":"xml","model":"gpt-5-nano","expected":"89436","actual":"89436","isCorrect":true,"inputTokens":7356,"outputTokens":136,"latencyMs":3469.017167000002},{"questionId":"q16","format":"yaml","model":"gpt-5-nano","expected":"89436","actual":"89436","isCorrect":true,"inputTokens":5011,"outputTokens":136,"latencyMs":2959.3207089999996},{"questionId":"q17","format":"json","model":"gpt-5-nano","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":6392,"outputTokens":135,"latencyMs":4353.834665999999},{"questionId":"q17","format":"toon","model":"gpt-5-nano","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":2529,"outputTokens":135,"latencyMs":2734.705167},{"questionId":"q17","format":"csv","model":"gpt-5-nano","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":2383,"outputTokens":135,"latencyMs":5485.966791999999},{"questionId":"q17","format":"xml","model":"gpt-5-nano","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":7359,"outputTokens":135,"latencyMs":3338.3081660000025},{"questionId":"q17","format":"yaml","model":"gpt-5-nano","expected":"Marketing","actual":"Marketing","isCorrect":true,"inputTokens":5014,"outputTokens":135,"latencyMs":2317.4250410000022},{"questionId":"q18","format":"json","model":"gpt-5-nano","expected":"kelvin54@yahoo.com","actual":"kelvin54@yahoo.com","isCorrect":true,"inputTokens":6390,"outputTokens":139,"latencyMs":2934.8541250000017},{"questionId":"q18","format":"toon","model":"gpt-5-nano","expected":"kelvin54@yahoo.com","actual":"kelvin54@yahoo.com","isCorrect":true,"inputTokens":2527,"outputT
okens":75,"latencyMs":2196.355125000002},{"questionId":"q18","format":"csv","model":"gpt-5-nano","expected":"kelvin54@yahoo.com","actual":"kelvin54@yahoo.com","isCorrect":true,"inputTokens":2381,"outputTokens":75,"latencyMs":2179.3174580000014},{"questionId":"q18","format":"xml","model":"gpt-5-nano","expected":"kelvin54@yahoo.com","actual":"kelvin54@yahoo.com","isCorrect":true,"inputTokens":7357,"outputTokens":203,"latencyMs":2986.970416},{"questionId":"q18","format":"yaml","model":"gpt-5-nano","expected":"kelvin54@yahoo.com","actual":"kelvin54@yahoo.com","isCorrect":true,"inputTokens":5012,"outputTokens":139,"latencyMs":2035.5609160000022},{"questionId":"q19","format":"json","model":"gpt-5-nano","expected":"4","actual":"4","isCorrect":true,"inputTokens":6392,"outputTokens":135,"latencyMs":2827.9320420000004},{"questionId":"q19","format":"toon","model":"gpt-5-nano","expected":"4","actual":"4","isCorrect":true,"inputTokens":2529,"outputTokens":71,"latencyMs":2052.042333999998},{"questionId":"q19","format":"csv","model":"gpt-5-nano","expected":"4","actual":"4","isCorrect":true,"inputTokens":2383,"outputTokens":135,"latencyMs":2475.6582089999974},{"questionId":"q19","format":"xml","model":"gpt-5-nano","expected":"4","actual":"4","isCorrect":true,"inputTokens":7359,"outputTokens":199,"latencyMs":5298.210291999996},{"questionId":"q19","format":"yaml","model":"gpt-5-nano","expected":"4","actual":"4","isCorrect":true,"inputTokens":5014,"outputTokens":71,"latencyMs":2479.8611249999994},{"questionId":"q20","format":"json","model":"gpt-5-nano","expected":"yes","actual":"true","isCorrect":true,"inputTokens":6388,"outputTokens":199,"latencyMs":3099.4663340000043},{"questionId":"q20","format":"toon","model":"gpt-5-nano","expected":"yes","actual":"true","isCorrect":true,"inputTokens":2525,"outputTokens":327,"latencyMs":4842.604750000002},{"questionId":"q20","format":"csv","model":"gpt-5-nano","expected":"yes","actual":"Yes","isCorrect":true,"inputTokens":2379,"outputTokens":135,"
latencyMs":2375.8693330000024},{"questionId":"q20","format":"xml","model":"gpt-5-nano","expected":"yes","actual":"true","isCorrect":true,"inputTokens":7355,"outputTokens":199,"latencyMs":3211.3723340000033},{"questionId":"q20","format":"yaml","model":"gpt-5-nano","expected":"yes","actual":"true","isCorrect":true,"inputTokens":5010,"outputTokens":135,"latencyMs":3330.7180000000008},{"questionId":"q21","format":"json","model":"gpt-5-nano","expected":"17","actual":"17","isCorrect":true,"inputTokens":6387,"outputTokens":647,"latencyMs":7148.650417000001},{"questionId":"q21","format":"toon","model":"gpt-5-nano","expected":"17","actual":"17","isCorrect":true,"inputTokens":2524,"outputTokens":1607,"latencyMs":15327.959125000001},{"questionId":"q21","format":"csv","model":"gpt-5-nano","expected":"17","actual":"17","isCorrect":true,"inputTokens":2378,"outputTokens":967,"latencyMs":10992.290750000004},{"questionId":"q21","format":"xml","model":"gpt-5-nano","expected":"17","actual":"17","isCorrect":true,"inputTokens":7354,"outputTokens":1031,"latencyMs":9394.927084000003},{"questionId":"q21","format":"yaml","model":"gpt-5-nano","expected":"17","actual":"17","isCorrect":true,"inputTokens":5009,"outputTokens":903,"latencyMs":10763.375417000003},{"questionId":"q22","format":"json","model":"gpt-5-nano","expected":"17","actual":"17","isCorrect":true,"inputTokens":6387,"outputTokens":391,"latencyMs":4349.884417000001},{"questionId":"q22","format":"toon","model":"gpt-5-nano","expected":"17","actual":"17","isCorrect":true,"inputTokens":2524,"outputTokens":1095,"latencyMs":9809.553958000004},{"questionId":"q22","format":"csv","model":"gpt-5-nano","expected":"17","actual":"17","isCorrect":true,"inputTokens":2378,"outputTokens":1031,"latencyMs":9584.158749999995},{"questionId":"q22","format":"xml","model":"gpt-5-nano","expected":"17","actual":"17","isCorrect":true,"inputTokens":7354,"outputTokens":519,"latencyMs":5500.127124999999},{"questionId":"q22","format":"yaml","model":"gpt-5-nano"
,"expected":"17","actual":"17","isCorrect":true,"inputTokens":5009,"outputTokens":839,"latencyMs":8069.941374999995},{"questionId":"q23","format":"json","model":"gpt-5-nano","expected":"17","actual":"17","isCorrect":true,"inputTokens":6387,"outputTokens":647,"latencyMs":6670.407958000003},{"questionId":"q23","format":"toon","model":"gpt-5-nano","expected":"17","actual":"17","isCorrect":true,"inputTokens":2524,"outputTokens":1031,"latencyMs":9428.577291000001},{"questionId":"q23","format":"csv","model":"gpt-5-nano","expected":"17","actual":"17","isCorrect":true,"inputTokens":2378,"outputTokens":647,"latencyMs":6800.205249999999},{"questionId":"q23","format":"xml","model":"gpt-5-nano","expected":"17","actual":"17","isCorrect":true,"inputTokens":7354,"outputTokens":903,"latencyMs":9085.086500000005},{"questionId":"q23","format":"yaml","model":"gpt-5-nano","expected":"17","actual":"17","isCorrect":true,"inputTokens":5009,"outputTokens":1031,"latencyMs":10963.525583000002},{"questionId":"q24","format":"json","model":"gpt-5-nano","expected":"17","actual":"17","isCorrect":true,"inputTokens":6387,"outputTokens":647,"latencyMs":6168.287916999994},{"questionId":"q24","format":"toon","model":"gpt-5-nano","expected":"17","actual":"17","isCorrect":true,"inputTokens":2524,"outputTokens":455,"latencyMs":5222.8764999999985},{"questionId":"q24","format":"csv","model":"gpt-5-nano","expected":"17","actual":"17","isCorrect":true,"inputTokens":2378,"outputTokens":967,"latencyMs":9628.338166000001},{"questionId":"q24","format":"xml","model":"gpt-5-nano","expected":"17","actual":"17","isCorrect":true,"inputTokens":7354,"outputTokens":967,"latencyMs":8964.717292000001},{"questionId":"q24","format":"yaml","model":"gpt-5-nano","expected":"17","actual":"16","isCorrect":false,"inputTokens":5009,"outputTokens":583,"latencyMs":5695.531999999999},{"questionId":"q25","format":"json","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":6387,"outputTokens":455,"latencyMs
":4840.368499999997},{"questionId":"q25","format":"toon","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":2524,"outputTokens":711,"latencyMs":8578.791709000005},{"questionId":"q25","format":"csv","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":2378,"outputTokens":1479,"latencyMs":15123.943},{"questionId":"q25","format":"xml","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":7354,"outputTokens":967,"latencyMs":9384.946625000004},{"questionId":"q25","format":"yaml","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":5009,"outputTokens":839,"latencyMs":10944.875042},{"questionId":"q26","format":"json","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":6387,"outputTokens":647,"latencyMs":6869.418540999999},{"questionId":"q26","format":"toon","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":2524,"outputTokens":455,"latencyMs":4867.974583000003},{"questionId":"q26","format":"csv","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":2378,"outputTokens":1543,"latencyMs":14267.353582999996},{"questionId":"q26","format":"xml","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":7354,"outputTokens":647,"latencyMs":7383.540583000002},{"questionId":"q26","format":"yaml","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":5009,"outputTokens":775,"latencyMs":14026.064583},{"questionId":"q27","format":"json","model":"gpt-5-nano","expected":"91","actual":"91","isCorrect":true,"inputTokens":6392,"outputTokens":2119,"latencyMs":19840.095458000003},{"questionId":"q27","format":"toon","model":"gpt-5-nano","expected":"91","actual":"91","isCorrect":true,"inputTokens":2529,"outputTokens":1863,"latencyMs":19581.628542},{"questionId":"q27","format":"csv","model":"gpt-5-nano","expected":"91","actual":"91","isCorrect"
:true,"inputTokens":2383,"outputTokens":1863,"latencyMs":17144.098332999994},{"questionId":"q27","format":"xml","model":"gpt-5-nano","expected":"91","actual":"91","isCorrect":true,"inputTokens":7359,"outputTokens":2375,"latencyMs":22871.458750000005},{"questionId":"q27","format":"yaml","model":"gpt-5-nano","expected":"91","actual":"91","isCorrect":true,"inputTokens":5014,"outputTokens":2503,"latencyMs":24232.040250000005},{"questionId":"q28","format":"json","model":"gpt-5-nano","expected":"67","actual":"68","isCorrect":false,"inputTokens":6392,"outputTokens":2695,"latencyMs":22657.479165999997},{"questionId":"q28","format":"toon","model":"gpt-5-nano","expected":"67","actual":"67","isCorrect":true,"inputTokens":2529,"outputTokens":1607,"latencyMs":13011.923875},{"questionId":"q28","format":"csv","model":"gpt-5-nano","expected":"67","actual":"64","isCorrect":false,"inputTokens":2383,"outputTokens":1927,"latencyMs":17143.839250000005},{"questionId":"q28","format":"xml","model":"gpt-5-nano","expected":"67","actual":"66","isCorrect":false,"inputTokens":7359,"outputTokens":2119,"latencyMs":19857.302667000004},{"questionId":"q28","format":"yaml","model":"gpt-5-nano","expected":"67","actual":"67","isCorrect":true,"inputTokens":5014,"outputTokens":1799,"latencyMs":17493.660707999996},{"questionId":"q29","format":"json","model":"gpt-5-nano","expected":"41","actual":"41","isCorrect":true,"inputTokens":6392,"outputTokens":1543,"latencyMs":13661.939208000003},{"questionId":"q29","format":"toon","model":"gpt-5-nano","expected":"41","actual":"41","isCorrect":true,"inputTokens":2529,"outputTokens":1415,"latencyMs":13394.808249999995},{"questionId":"q29","format":"csv","model":"gpt-5-nano","expected":"41","actual":"41","isCorrect":true,"inputTokens":2383,"outputTokens":1863,"latencyMs":16580.891334000007},{"questionId":"q29","format":"xml","model":"gpt-5-nano","expected":"41","actual":"41","isCorrect":true,"inputTokens":7359,"outputTokens":1543,"latencyMs":14548.037708000003},{"ques
tionId":"q29","format":"yaml","model":"gpt-5-nano","expected":"41","actual":"41","isCorrect":true,"inputTokens":5014,"outputTokens":1671,"latencyMs":14537.892209000005},{"questionId":"q30","format":"json","model":"gpt-5-nano","expected":"26","actual":"26","isCorrect":true,"inputTokens":6392,"outputTokens":1159,"latencyMs":11617.139958},{"questionId":"q30","format":"toon","model":"gpt-5-nano","expected":"26","actual":"26","isCorrect":true,"inputTokens":2529,"outputTokens":1671,"latencyMs":17613.913875},{"questionId":"q30","format":"csv","model":"gpt-5-nano","expected":"26","actual":"26","isCorrect":true,"inputTokens":2383,"outputTokens":1287,"latencyMs":9721.494916999996},{"questionId":"q30","format":"xml","model":"gpt-5-nano","expected":"26","actual":"26","isCorrect":true,"inputTokens":7359,"outputTokens":1543,"latencyMs":14938.151124999997},{"questionId":"q30","format":"yaml","model":"gpt-5-nano","expected":"26","actual":"26","isCorrect":true,"inputTokens":5014,"outputTokens":1543,"latencyMs":15495.643333},{"questionId":"q31","format":"json","model":"gpt-5-nano","expected":"100","actual":"100","isCorrect":true,"inputTokens":6388,"outputTokens":71,"latencyMs":1938.4781660000008},{"questionId":"q31","format":"toon","model":"gpt-5-nano","expected":"100","actual":"100","isCorrect":true,"inputTokens":2525,"outputTokens":71,"latencyMs":2216.999291},{"questionId":"q31","format":"csv","model":"gpt-5-nano","expected":"100","actual":"100","isCorrect":true,"inputTokens":2379,"outputTokens":135,"latencyMs":2948.765041999999},{"questionId":"q31","format":"xml","model":"gpt-5-nano","expected":"100","actual":"100","isCorrect":true,"inputTokens":7355,"outputTokens":135,"latencyMs":3050.746583999993},{"questionId":"q31","format":"yaml","model":"gpt-5-nano","expected":"100","actual":"100","isCorrect":true,"inputTokens":5010,"outputTokens":71,"latencyMs":1862.2132500000007},{"questionId":"q32","format":"json","model":"gpt-5-nano","expected":"96503","actual":"96503.32","isCorrect":fal
se,"inputTokens":6389,"outputTokens":4426,"latencyMs":38802.84825000001},{"questionId":"q32","format":"toon","model":"gpt-5-nano","expected":"96503","actual":"96503.32","isCorrect":false,"inputTokens":2526,"outputTokens":4874,"latencyMs":39527.296707999994},{"questionId":"q32","format":"csv","model":"gpt-5-nano","expected":"96503","actual":"96503.32","isCorrect":false,"inputTokens":2380,"outputTokens":3466,"latencyMs":31568.755042000004},{"questionId":"q32","format":"xml","model":"gpt-5-nano","expected":"96503","actual":"97075.91","isCorrect":false,"inputTokens":7356,"outputTokens":7946,"latencyMs":71846.78920900001},{"questionId":"q32","format":"yaml","model":"gpt-5-nano","expected":"96503","actual":"96503.32","isCorrect":false,"inputTokens":5011,"outputTokens":3210,"latencyMs":29167.25637500001},{"questionId":"q33","format":"json","model":"gpt-5-nano","expected":"78","actual":"78","isCorrect":true,"inputTokens":6386,"outputTokens":3079,"latencyMs":27806.129750000007},{"questionId":"q33","format":"toon","model":"gpt-5-nano","expected":"78","actual":"78","isCorrect":true,"inputTokens":2523,"outputTokens":1287,"latencyMs":11461.352291999996},{"questionId":"q33","format":"csv","model":"gpt-5-nano","expected":"78","actual":"79","isCorrect":false,"inputTokens":2377,"outputTokens":3079,"latencyMs":28779.471042000005},{"questionId":"q33","format":"xml","model":"gpt-5-nano","expected":"78","actual":"77","isCorrect":false,"inputTokens":7353,"outputTokens":1095,"latencyMs":11862.612083},{"questionId":"q33","format":"yaml","model":"gpt-5-nano","expected":"78","actual":"78","isCorrect":true,"inputTokens":5008,"outputTokens":1671,"latencyMs":16546.208209000004},{"questionId":"q34","format":"json","model":"gpt-5-nano","expected":"22","actual":"22","isCorrect":true,"inputTokens":6386,"outputTokens":1287,"latencyMs":12874.773583000002},{"questionId":"q34","format":"toon","model":"gpt-5-nano","expected":"22","actual":"22","isCorrect":true,"inputTokens":2523,"outputTokens":967,"late
ncyMs":10128.866540999996},{"questionId":"q34","format":"csv","model":"gpt-5-nano","expected":"22","actual":"22","isCorrect":true,"inputTokens":2377,"outputTokens":2567,"latencyMs":21328.398541999995},{"questionId":"q34","format":"xml","model":"gpt-5-nano","expected":"22","actual":"22","isCorrect":true,"inputTokens":7353,"outputTokens":1223,"latencyMs":10527.548834000001},{"questionId":"q34","format":"yaml","model":"gpt-5-nano","expected":"22","actual":"22","isCorrect":true,"inputTokens":5008,"outputTokens":1159,"latencyMs":10514.372334},{"questionId":"q35","format":"json","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":6394,"outputTokens":1671,"latencyMs":16690.553291999997},{"questionId":"q35","format":"toon","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":2531,"outputTokens":1543,"latencyMs":18155.74162500001},{"questionId":"q35","format":"csv","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":2385,"outputTokens":2247,"latencyMs":19133.287500000006},{"questionId":"q35","format":"xml","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":7361,"outputTokens":1287,"latencyMs":14527.046083000008},{"questionId":"q35","format":"yaml","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":5016,"outputTokens":1031,"latencyMs":11708.512457999997},{"questionId":"q36","format":"json","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":6394,"outputTokens":1223,"latencyMs":15122.788833999992},{"questionId":"q36","format":"toon","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":2531,"outputTokens":839,"latencyMs":10423.516166000001},{"questionId":"q36","format":"csv","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":2385,"outputTokens":1735,"latencyMs":15695.157332999996},{"questionId":"q36","format":"xml","model":"gpt-5-nano","expe
cted":"11","actual":"11","isCorrect":true,"inputTokens":7361,"outputTokens":1415,"latencyMs":13899.425959},{"questionId":"q36","format":"yaml","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":5016,"outputTokens":1159,"latencyMs":11514.098790999997},{"questionId":"q37","format":"json","model":"gpt-5-nano","expected":"11","actual":"10","isCorrect":false,"inputTokens":6394,"outputTokens":1287,"latencyMs":13009.814249999996},{"questionId":"q37","format":"toon","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":2531,"outputTokens":1543,"latencyMs":15871.650417000012},{"questionId":"q37","format":"csv","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":2385,"outputTokens":967,"latencyMs":9735.937375000009},{"questionId":"q37","format":"xml","model":"gpt-5-nano","expected":"11","actual":"10","isCorrect":false,"inputTokens":7361,"outputTokens":2055,"latencyMs":20242.728875},{"questionId":"q37","format":"yaml","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":5016,"outputTokens":1671,"latencyMs":16286.792667000002},{"questionId":"q38","format":"json","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":6394,"outputTokens":2183,"latencyMs":21316.307375000004},{"questionId":"q38","format":"toon","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":2531,"outputTokens":2439,"latencyMs":24585.012208999993},{"questionId":"q38","format":"csv","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":2385,"outputTokens":1287,"latencyMs":15640.584124999994},{"questionId":"q38","format":"xml","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":7361,"outputTokens":1415,"latencyMs":13889.092875000002},{"questionId":"q38","format":"yaml","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":5016,"outputTokens":1159,"latencyMs
":11549.750583000001},{"questionId":"q39","format":"json","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":6394,"outputTokens":583,"latencyMs":6874.004750000007},{"questionId":"q39","format":"toon","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":2531,"outputTokens":1031,"latencyMs":14618.748875000005},{"questionId":"q39","format":"csv","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":2385,"outputTokens":2055,"latencyMs":42524.04500000001},{"questionId":"q39","format":"xml","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":7361,"outputTokens":1095,"latencyMs":10262.768083999996},{"questionId":"q39","format":"yaml","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":5016,"outputTokens":1415,"latencyMs":13156.821458000006},{"questionId":"q40","format":"json","model":"gpt-5-nano","expected":"10","actual":"8","isCorrect":false,"inputTokens":6394,"outputTokens":1351,"latencyMs":15696.610916999998},{"questionId":"q40","format":"toon","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":2531,"outputTokens":1415,"latencyMs":15140.198166000002},{"questionId":"q40","format":"csv","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":2385,"outputTokens":1351,"latencyMs":20472.353375000006},{"questionId":"q40","format":"xml","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":7361,"outputTokens":2183,"latencyMs":23243.26454100001},{"questionId":"q40","format":"yaml","model":"gpt-5-nano","expected":"10","actual":"9","isCorrect":false,"inputTokens":5016,"outputTokens":1543,"latencyMs":17841.989625000002},{"questionId":"q41","format":"json","model":"gpt-5-nano","expected":"63","actual":"63","isCorrect":true,"inputTokens":6393,"outputTokens":2695,"latencyMs":23743.842332999993},{"questionId":"q41","format":"toon","model":"gpt-5-nano","exp
ected":"63","actual":"63","isCorrect":true,"inputTokens":2530,"outputTokens":2631,"latencyMs":27546.34533299999},{"questionId":"q41","format":"csv","model":"gpt-5-nano","expected":"63","actual":"65","isCorrect":false,"inputTokens":2384,"outputTokens":4039,"latencyMs":42146.063124999986},{"questionId":"q41","format":"xml","model":"gpt-5-nano","expected":"63","actual":"63","isCorrect":true,"inputTokens":7360,"outputTokens":4871,"latencyMs":37767.37599999999},{"questionId":"q41","format":"yaml","model":"gpt-5-nano","expected":"63","actual":"63","isCorrect":true,"inputTokens":5015,"outputTokens":2439,"latencyMs":25553.095333999998},{"questionId":"q42","format":"json","model":"gpt-5-nano","expected":"53","actual":"53","isCorrect":true,"inputTokens":6393,"outputTokens":2375,"latencyMs":22442.41562500001},{"questionId":"q42","format":"toon","model":"gpt-5-nano","expected":"53","actual":"53","isCorrect":true,"inputTokens":2530,"outputTokens":2567,"latencyMs":25569.86658300001},{"questionId":"q42","format":"csv","model":"gpt-5-nano","expected":"53","actual":"54","isCorrect":false,"inputTokens":2384,"outputTokens":3079,"latencyMs":25882.737875000006},{"questionId":"q42","format":"xml","model":"gpt-5-nano","expected":"53","actual":"53","isCorrect":true,"inputTokens":7360,"outputTokens":2695,"latencyMs":28840.383208000014},{"questionId":"q42","format":"yaml","model":"gpt-5-nano","expected":"53","actual":"53","isCorrect":true,"inputTokens":5015,"outputTokens":2439,"latencyMs":25784.016457999984},{"questionId":"q43","format":"json","model":"gpt-5-nano","expected":"39","actual":"39","isCorrect":true,"inputTokens":6393,"outputTokens":2183,"latencyMs":20179.226250000007},{"questionId":"q43","format":"toon","model":"gpt-5-nano","expected":"39","actual":"39","isCorrect":true,"inputTokens":2530,"outputTokens":2567,"latencyMs":25615.354624999993},{"questionId":"q43","format":"csv","model":"gpt-5-nano","expected":"39","actual":"39","isCorrect":true,"inputTokens":2384,"outputTokens":3335,
"latencyMs":25506.231792000006},{"questionId":"q43","format":"xml","model":"gpt-5-nano","expected":"39","actual":"39","isCorrect":true,"inputTokens":7360,"outputTokens":2503,"latencyMs":19985.533500000005},{"questionId":"q43","format":"yaml","model":"gpt-5-nano","expected":"39","actual":"39","isCorrect":true,"inputTokens":5015,"outputTokens":2631,"latencyMs":22299.584208},{"questionId":"q44","format":"json","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":6393,"outputTokens":2311,"latencyMs":24179.406917},{"questionId":"q44","format":"toon","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":2530,"outputTokens":2375,"latencyMs":19273.154207999993},{"questionId":"q44","format":"csv","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":2384,"outputTokens":2567,"latencyMs":25190.493749999994},{"questionId":"q44","format":"xml","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":7360,"outputTokens":1991,"latencyMs":16888.5325},{"questionId":"q44","format":"yaml","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":5015,"outputTokens":1799,"latencyMs":16030.792166999978},{"questionId":"q45","format":"json","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":6394,"outputTokens":1287,"latencyMs":12991.277834000008},{"questionId":"q45","format":"toon","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":2531,"outputTokens":1351,"latencyMs":13087.36054199998},{"questionId":"q45","format":"csv","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":2385,"outputTokens":1991,"latencyMs":18446.735167000006},{"questionId":"q45","format":"xml","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":7361,"outputTokens":1607,"latencyMs":10865.307417000004},{"questionId":"q45","format":"yaml","model":"gpt-5-nano","expected":"
11","actual":"11","isCorrect":true,"inputTokens":5016,"outputTokens":1159,"latencyMs":14611.011917000025},{"questionId":"q46","format":"json","model":"gpt-5-nano","expected":"8","actual":"8","isCorrect":true,"inputTokens":6394,"outputTokens":967,"latencyMs":7963.411249999976},{"questionId":"q46","format":"toon","model":"gpt-5-nano","expected":"8","actual":"8","isCorrect":true,"inputTokens":2531,"outputTokens":1287,"latencyMs":13350.95537499999},{"questionId":"q46","format":"csv","model":"gpt-5-nano","expected":"8","actual":"8","isCorrect":true,"inputTokens":2385,"outputTokens":1223,"latencyMs":12941.291666999983},{"questionId":"q46","format":"xml","model":"gpt-5-nano","expected":"8","actual":"7","isCorrect":false,"inputTokens":7361,"outputTokens":2055,"latencyMs":19121.181458999985},{"questionId":"q46","format":"yaml","model":"gpt-5-nano","expected":"8","actual":"8","isCorrect":true,"inputTokens":5016,"outputTokens":839,"latencyMs":11689.81270899999},{"questionId":"q47","format":"json","model":"gpt-5-nano","expected":"15","actual":"15","isCorrect":true,"inputTokens":6394,"outputTokens":1735,"latencyMs":12743.134749999997},{"questionId":"q47","format":"toon","model":"gpt-5-nano","expected":"15","actual":"15","isCorrect":true,"inputTokens":2531,"outputTokens":775,"latencyMs":8221.038832999999},{"questionId":"q47","format":"csv","model":"gpt-5-nano","expected":"15","actual":"15","isCorrect":true,"inputTokens":2385,"outputTokens":775,"latencyMs":8083.291667000012},{"questionId":"q47","format":"xml","model":"gpt-5-nano","expected":"15","actual":"15","isCorrect":true,"inputTokens":7361,"outputTokens":1031,"latencyMs":12041.053416999988},{"questionId":"q47","format":"yaml","model":"gpt-5-nano","expected":"15","actual":"14","isCorrect":false,"inputTokens":5016,"outputTokens":1159,"latencyMs":12225.70216700001},{"questionId":"q48","format":"json","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":6388,"outputTokens":1031,"latencyMs":10024.21587
4999994},{"questionId":"q48","format":"toon","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":2525,"outputTokens":1287,"latencyMs":12129.384416000015},{"questionId":"q48","format":"csv","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":2379,"outputTokens":3335,"latencyMs":17532.385958},{"questionId":"q48","format":"xml","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":7355,"outputTokens":1287,"latencyMs":14163.686583000002},{"questionId":"q48","format":"yaml","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":5010,"outputTokens":1031,"latencyMs":10721.844666999998},{"questionId":"q49","format":"json","model":"gpt-5-nano","expected":"11","actual":"10","isCorrect":false,"inputTokens":6388,"outputTokens":647,"latencyMs":10175.196499999991},{"questionId":"q49","format":"toon","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":2525,"outputTokens":1095,"latencyMs":8359.25920900001},{"questionId":"q49","format":"csv","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":2379,"outputTokens":1671,"latencyMs":19210.797334000003},{"questionId":"q49","format":"xml","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":7355,"outputTokens":839,"latencyMs":9501.392666999978},{"questionId":"q49","format":"yaml","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":5010,"outputTokens":903,"latencyMs":10511.122625000018},{"questionId":"q50","format":"json","model":"gpt-5-nano","expected":"14","actual":"14","isCorrect":true,"inputTokens":6388,"outputTokens":839,"latencyMs":7034.4220000000205},{"questionId":"q50","format":"toon","model":"gpt-5-nano","expected":"14","actual":"14","isCorrect":true,"inputTokens":2525,"outputTokens":903,"latencyMs":9088.10166700001},{"questionId":"q50","format":"csv","model":"gpt-5-nano","expected":"14","actual":"14"
,"isCorrect":true,"inputTokens":2379,"outputTokens":1479,"latencyMs":13106.483208999998},{"questionId":"q50","format":"xml","model":"gpt-5-nano","expected":"14","actual":"14","isCorrect":true,"inputTokens":7355,"outputTokens":1223,"latencyMs":12101.726083999994},{"questionId":"q50","format":"yaml","model":"gpt-5-nano","expected":"14","actual":"14","isCorrect":true,"inputTokens":5010,"outputTokens":1799,"latencyMs":17414.184500000003},{"questionId":"q51","format":"json","model":"gpt-5-nano","expected":"96.17","actual":"96.17","isCorrect":true,"inputTokens":9738,"outputTokens":73,"latencyMs":3038.685334000009},{"questionId":"q51","format":"toon","model":"gpt-5-nano","expected":"96.17","actual":"96.17","isCorrect":true,"inputTokens":6012,"outputTokens":73,"latencyMs":2160.960334000003},{"questionId":"q51","format":"csv","model":"gpt-5-nano","expected":"96.17","actual":"96.17","isCorrect":true,"inputTokens":6780,"outputTokens":137,"latencyMs":2365.200749999989},{"questionId":"q51","format":"xml","model":"gpt-5-nano","expected":"96.17","actual":"96.17","isCorrect":true,"inputTokens":11036,"outputTokens":137,"latencyMs":1989.758124999993},{"questionId":"q51","format":"yaml","model":"gpt-5-nano","expected":"96.17","actual":"96.17","isCorrect":true,"inputTokens":7372,"outputTokens":137,"latencyMs":2664.6698329999927},{"questionId":"q52","format":"json","model":"gpt-5-nano","expected":"shipped","actual":"shipped","isCorrect":true,"inputTokens":9738,"outputTokens":136,"latencyMs":3202.772499999992},{"questionId":"q52","format":"toon","model":"gpt-5-nano","expected":"shipped","actual":"shipped","isCorrect":true,"inputTokens":6012,"outputTokens":136,"latencyMs":2305.1638749999984},{"questionId":"q52","format":"csv","model":"gpt-5-nano","expected":"shipped","actual":"shipped","isCorrect":true,"inputTokens":6780,"outputTokens":200,"latencyMs":3069.4525830000057},{"questionId":"q52","format":"xml","model":"gpt-5-nano","expected":"shipped","actual":"shipped","isCorrect":true,"input
Tokens":11036,"outputTokens":200,"latencyMs":2304.95974999998},{"questionId":"q52","format":"yaml","model":"gpt-5-nano","expected":"shipped","actual":"shipped","isCorrect":true,"inputTokens":7372,"outputTokens":136,"latencyMs":2077.190957999992},{"questionId":"q53","format":"json","model":"gpt-5-nano","expected":"599.39","actual":"599.39","isCorrect":true,"inputTokens":9738,"outputTokens":265,"latencyMs":3420.801832999976},{"questionId":"q53","format":"toon","model":"gpt-5-nano","expected":"599.39","actual":"599.39","isCorrect":true,"inputTokens":6012,"outputTokens":201,"latencyMs":2733.4107920000097},{"questionId":"q53","format":"csv","model":"gpt-5-nano","expected":"599.39","actual":"599.39","isCorrect":true,"inputTokens":6780,"outputTokens":265,"latencyMs":3371.902375000005},{"questionId":"q53","format":"xml","model":"gpt-5-nano","expected":"599.39","actual":"599.39","isCorrect":true,"inputTokens":11036,"outputTokens":329,"latencyMs":2736.295167000004},{"questionId":"q53","format":"yaml","model":"gpt-5-nano","expected":"599.39","actual":"599.39","isCorrect":true,"inputTokens":7372,"outputTokens":201,"latencyMs":3164.7157080000034},{"questionId":"q54","format":"json","model":"gpt-5-nano","expected":"processing","actual":"processing","isCorrect":true,"inputTokens":9738,"outputTokens":199,"latencyMs":2172.5717090000107},{"questionId":"q54","format":"toon","model":"gpt-5-nano","expected":"processing","actual":"processing","isCorrect":true,"inputTokens":6012,"outputTokens":135,"latencyMs":2345.0319169999857},{"questionId":"q54","format":"csv","model":"gpt-5-nano","expected":"processing","actual":"processing","isCorrect":true,"inputTokens":6780,"outputTokens":135,"latencyMs":3713.114291999984},{"questionId":"q54","format":"xml","model":"gpt-5-nano","expected":"processing","actual":"processing","isCorrect":true,"inputTokens":11036,"outputTokens":391,"latencyMs":6108.840708000003},{"questionId":"q54","format":"yaml","model":"gpt-5-nano","expected":"processing","actual":"
processing","isCorrect":true,"inputTokens":7372,"outputTokens":199,"latencyMs":3263.157750000013},{"questionId":"q55","format":"json","model":"gpt-5-nano","expected":"528.71","actual":"528.71","isCorrect":true,"inputTokens":9738,"outputTokens":265,"latencyMs":3599.183208000002},{"questionId":"q55","format":"toon","model":"gpt-5-nano","expected":"528.71","actual":"528.71","isCorrect":true,"inputTokens":6012,"outputTokens":265,"latencyMs":3806.1117080000113},{"questionId":"q55","format":"csv","model":"gpt-5-nano","expected":"528.71","actual":"528.71","isCorrect":true,"inputTokens":6780,"outputTokens":137,"latencyMs":2482.1311250000144},{"questionId":"q55","format":"xml","model":"gpt-5-nano","expected":"528.71","actual":"528.71","isCorrect":true,"inputTokens":11036,"outputTokens":457,"latencyMs":4714.9086669999815},{"questionId":"q55","format":"yaml","model":"gpt-5-nano","expected":"528.71","actual":"528.71","isCorrect":true,"inputTokens":7372,"outputTokens":265,"latencyMs":3542.246542000008},{"questionId":"q56","format":"json","model":"gpt-5-nano","expected":"pending","actual":"pending","isCorrect":true,"inputTokens":9738,"outputTokens":199,"latencyMs":4117.672166999982},{"questionId":"q56","format":"toon","model":"gpt-5-nano","expected":"pending","actual":"pending","isCorrect":true,"inputTokens":6012,"outputTokens":263,"latencyMs":3441.915166999999},{"questionId":"q56","format":"csv","model":"gpt-5-nano","expected":"pending","actual":"pending","isCorrect":true,"inputTokens":6780,"outputTokens":263,"latencyMs":8454.847999999998},{"questionId":"q56","format":"xml","model":"gpt-5-nano","expected":"pending","actual":"pending","isCorrect":true,"inputTokens":11036,"outputTokens":199,"latencyMs":2997.5},{"questionId":"q56","format":"yaml","model":"gpt-5-nano","expected":"pending","actual":"pending","isCorrect":true,"inputTokens":7372,"outputTokens":199,"latencyMs":3116.710000000021},{"questionId":"q57","format":"json","model":"gpt-5-nano","expected":"1687.82","actual":"1687
.82","isCorrect":true,"inputTokens":9738,"outputTokens":266,"latencyMs":3084.641333000007},{"questionId":"q57","format":"toon","model":"gpt-5-nano","expected":"1687.82","actual":"1687.82","isCorrect":true,"inputTokens":6012,"outputTokens":202,"latencyMs":3517.2125410000153},{"questionId":"q57","format":"csv","model":"gpt-5-nano","expected":"1687.82","actual":"1687.82","isCorrect":true,"inputTokens":6780,"outputTokens":394,"latencyMs":2861.477082999976},{"questionId":"q57","format":"xml","model":"gpt-5-nano","expected":"1687.82","actual":"1687.82","isCorrect":true,"inputTokens":11036,"outputTokens":330,"latencyMs":4378.942290999985},{"questionId":"q57","format":"yaml","model":"gpt-5-nano","expected":"1687.82","actual":"1687.82","isCorrect":true,"inputTokens":7372,"outputTokens":266,"latencyMs":3748.4990410000028},{"questionId":"q58","format":"json","model":"gpt-5-nano","expected":"cancelled","actual":"cancelled","isCorrect":true,"inputTokens":9738,"outputTokens":136,"latencyMs":2310.9124590000138},{"questionId":"q58","format":"toon","model":"gpt-5-nano","expected":"cancelled","actual":"cancelled","isCorrect":true,"inputTokens":6012,"outputTokens":392,"latencyMs":5970.874375000014},{"questionId":"q58","format":"csv","model":"gpt-5-nano","expected":"cancelled","actual":"cancelled","isCorrect":true,"inputTokens":6780,"outputTokens":264,"latencyMs":4393.402040999994},{"questionId":"q58","format":"xml","model":"gpt-5-nano","expected":"cancelled","actual":"cancelled","isCorrect":true,"inputTokens":11036,"outputTokens":200,"latencyMs":3243.2633340000175},{"questionId":"q58","format":"yaml","model":"gpt-5-nano","expected":"cancelled","actual":"cancelled","isCorrect":true,"inputTokens":7372,"outputTokens":200,"latencyMs":2769.2004580000066},{"questionId":"q59","format":"json","model":"gpt-5-nano","expected":"Dr. Courtney Satterfield","actual":"Dr. 
Courtney Satterfield","isCorrect":true,"inputTokens":9739,"outputTokens":76,"latencyMs":2105.4266669999924},{"questionId":"q59","format":"toon","model":"gpt-5-nano","expected":"Dr. Courtney Satterfield","actual":"Dr. Courtney Satterfield","isCorrect":true,"inputTokens":6013,"outputTokens":140,"latencyMs":2265.2000830000034},{"questionId":"q59","format":"csv","model":"gpt-5-nano","expected":"Dr. Courtney Satterfield","actual":"Dr. Courtney Satterfield","isCorrect":true,"inputTokens":6781,"outputTokens":140,"latencyMs":2353.731375000003},{"questionId":"q59","format":"xml","model":"gpt-5-nano","expected":"Dr. Courtney Satterfield","actual":"Dr. Courtney Satterfield","isCorrect":true,"inputTokens":11037,"outputTokens":140,"latencyMs":2614.9170830000076},{"questionId":"q59","format":"yaml","model":"gpt-5-nano","expected":"Dr. Courtney Satterfield","actual":"Dr. Courtney Satterfield","isCorrect":true,"inputTokens":7373,"outputTokens":140,"latencyMs":3472.885209},{"questionId":"q60","format":"json","model":"gpt-5-nano","expected":"lukas71@gmail.com","actual":"lukas71@gmail.com","isCorrect":true,"inputTokens":9739,"outputTokens":139,"latencyMs":2373.651208999974},{"questionId":"q60","format":"toon","model":"gpt-5-nano","expected":"lukas71@gmail.com","actual":"lukas71@gmail.com","isCorrect":true,"inputTokens":6013,"outputTokens":139,"latencyMs":2132.121083999984},{"questionId":"q60","format":"csv","model":"gpt-5-nano","expected":"lukas71@gmail.com","actual":"lukas71@gmail.com","isCorrect":true,"inputTokens":6781,"outputTokens":267,"latencyMs":3185.6174170000013},{"questionId":"q60","format":"xml","model":"gpt-5-nano","expected":"lukas71@gmail.com","actual":"lukas71@gmail.com","isCorrect":true,"inputTokens":11037,"outputTokens":203,"latencyMs":3214.6773329999996},{"questionId":"q60","format":"yaml","model":"gpt-5-nano","expected":"lukas71@gmail.com","actual":"lukas71@gmail.com","isCorrect":true,"inputTokens":7373,"outputTokens":139,"latencyMs":1703.899000000005},{"questionId"
:"q61","format":"json","model":"gpt-5-nano","expected":"2025-08-05","actual":"2025-08-05","isCorrect":true,"inputTokens":9739,"outputTokens":204,"latencyMs":3408.625457999995},{"questionId":"q61","format":"toon","model":"gpt-5-nano","expected":"2025-08-05","actual":"2025-08-05","isCorrect":true,"inputTokens":6013,"outputTokens":76,"latencyMs":1742.614750000008},{"questionId":"q61","format":"csv","model":"gpt-5-nano","expected":"2025-08-05","actual":"2025-08-05","isCorrect":true,"inputTokens":6781,"outputTokens":268,"latencyMs":4062.0994579999824},{"questionId":"q61","format":"xml","model":"gpt-5-nano","expected":"2025-08-05","actual":"2025-08-05","isCorrect":true,"inputTokens":11037,"outputTokens":332,"latencyMs":4329.766915999993},{"questionId":"q61","format":"yaml","model":"gpt-5-nano","expected":"2025-08-05","actual":"2025-08-05","isCorrect":true,"inputTokens":7373,"outputTokens":140,"latencyMs":2656.797082999983},{"questionId":"q62","format":"json","model":"gpt-5-nano","expected":"3","actual":"3","isCorrect":true,"inputTokens":9738,"outputTokens":327,"latencyMs":4221.204874999996},{"questionId":"q62","format":"toon","model":"gpt-5-nano","expected":"3","actual":"3","isCorrect":true,"inputTokens":6012,"outputTokens":711,"latencyMs":7848.512791999994},{"questionId":"q62","format":"csv","model":"gpt-5-nano","expected":"3","actual":"3","isCorrect":true,"inputTokens":6780,"outputTokens":903,"latencyMs":8287.347917000006},{"questionId":"q62","format":"xml","model":"gpt-5-nano","expected":"3","actual":"10","isCorrect":false,"inputTokens":11036,"outputTokens":647,"latencyMs":6944.630499999999},{"questionId":"q62","format":"yaml","model":"gpt-5-nano","expected":"3","actual":"10","isCorrect":false,"inputTokens":7372,"outputTokens":327,"latencyMs":3122.6620419999817},{"questionId":"q63","format":"json","model":"gpt-5-nano","expected":"Maxine Zemlak","actual":"Maxine 
Zemlak","isCorrect":true,"inputTokens":9739,"outputTokens":138,"latencyMs":4663.652958999999},{"questionId":"q63","format":"toon","model":"gpt-5-nano","expected":"Maxine Zemlak","actual":"Maxine Zemlak","isCorrect":true,"inputTokens":6013,"outputTokens":138,"latencyMs":3369.3136670000094},{"questionId":"q63","format":"csv","model":"gpt-5-nano","expected":"Maxine Zemlak","actual":"Maxine Zemlak","isCorrect":true,"inputTokens":6781,"outputTokens":266,"latencyMs":3798.8209999999963},{"questionId":"q63","format":"xml","model":"gpt-5-nano","expected":"Maxine Zemlak","actual":"Maxine Zemlak","isCorrect":true,"inputTokens":11037,"outputTokens":202,"latencyMs":3454.3941669999913},{"questionId":"q63","format":"yaml","model":"gpt-5-nano","expected":"Maxine Zemlak","actual":"Maxine Zemlak","isCorrect":true,"inputTokens":7373,"outputTokens":138,"latencyMs":4146.146832999977},{"questionId":"q64","format":"json","model":"gpt-5-nano","expected":"brenden2@hotmail.com","actual":"brenden2@hotmail.com","isCorrect":true,"inputTokens":9739,"outputTokens":267,"latencyMs":5647.55133300001},{"questionId":"q64","format":"toon","model":"gpt-5-nano","expected":"brenden2@hotmail.com","actual":"brenden2@hotmail.com","isCorrect":true,"inputTokens":6013,"outputTokens":203,"latencyMs":3010.75991600001},{"questionId":"q64","format":"csv","model":"gpt-5-nano","expected":"brenden2@hotmail.com","actual":"brenden2@hotmail.com","isCorrect":true,"inputTokens":6781,"outputTokens":267,"latencyMs":2115.998583000008},{"questionId":"q64","format":"xml","model":"gpt-5-nano","expected":"brenden2@hotmail.com","actual":"brenden2@hotmail.com","isCorrect":true,"inputTokens":11037,"outputTokens":331,"latencyMs":4380.475833000004},{"questionId":"q64","format":"yaml","model":"gpt-5-nano","expected":"brenden2@hotmail.com","actual":"brenden2@hotmail.com","isCorrect":true,"inputTokens":7373,"outputTokens":139,"latencyMs":2166.7608330000076},{"questionId":"q65","format":"json","model":"gpt-5-nano","expected":"2025-08-29",
"actual":"2025-08-29","isCorrect":true,"inputTokens":9739,"outputTokens":332,"latencyMs":3944.2122079999826},{"questionId":"q65","format":"toon","model":"gpt-5-nano","expected":"2025-08-29","actual":"2025-08-29","isCorrect":true,"inputTokens":6013,"outputTokens":268,"latencyMs":3732.385457999975},{"questionId":"q65","format":"csv","model":"gpt-5-nano","expected":"2025-08-29","actual":"2025-08-29","isCorrect":true,"inputTokens":6781,"outputTokens":396,"latencyMs":2841.6518329999817},{"questionId":"q65","format":"xml","model":"gpt-5-nano","expected":"2025-08-29","actual":"2025-08-29","isCorrect":true,"inputTokens":11037,"outputTokens":140,"latencyMs":2206.0024580000027},{"questionId":"q65","format":"yaml","model":"gpt-5-nano","expected":"2025-08-29","actual":"2025-08-29","isCorrect":true,"inputTokens":7373,"outputTokens":140,"latencyMs":2291.364208999992},{"questionId":"q66","format":"json","model":"gpt-5-nano","expected":"4","actual":"2","isCorrect":false,"inputTokens":9738,"outputTokens":519,"latencyMs":4125.47641599999},{"questionId":"q66","format":"toon","model":"gpt-5-nano","expected":"4","actual":"4","isCorrect":true,"inputTokens":6012,"outputTokens":711,"latencyMs":8088.974500000011},{"questionId":"q66","format":"csv","model":"gpt-5-nano","expected":"4","actual":"11","isCorrect":false,"inputTokens":6780,"outputTokens":519,"latencyMs":6321.9130000000005},{"questionId":"q66","format":"xml","model":"gpt-5-nano","expected":"4","actual":"11","isCorrect":false,"inputTokens":11036,"outputTokens":583,"latencyMs":4200.701750000007},{"questionId":"q66","format":"yaml","model":"gpt-5-nano","expected":"4","actual":"4","isCorrect":true,"inputTokens":7372,"outputTokens":583,"latencyMs":6297.599625000003},{"questionId":"q67","format":"json","model":"gpt-5-nano","expected":"Claudia Cruickshank DVM","actual":"Claudia Cruickshank 
DVM","isCorrect":true,"inputTokens":9739,"outputTokens":143,"latencyMs":2912.834124999994},{"questionId":"q67","format":"toon","model":"gpt-5-nano","expected":"Claudia Cruickshank DVM","actual":"Claudia Cruickshank DVM","isCorrect":true,"inputTokens":6013,"outputTokens":143,"latencyMs":2421.850584},{"questionId":"q67","format":"csv","model":"gpt-5-nano","expected":"Claudia Cruickshank DVM","actual":"Claudia Cruickshank DVM","isCorrect":true,"inputTokens":6781,"outputTokens":207,"latencyMs":3116.5822500000068},{"questionId":"q67","format":"xml","model":"gpt-5-nano","expected":"Claudia Cruickshank DVM","actual":"Claudia Cruickshank DVM","isCorrect":true,"inputTokens":11037,"outputTokens":207,"latencyMs":3218.9356249999837},{"questionId":"q67","format":"yaml","model":"gpt-5-nano","expected":"Claudia Cruickshank DVM","actual":"Claudia Cruickshank DVM","isCorrect":true,"inputTokens":7373,"outputTokens":335,"latencyMs":5959.828000000009},{"questionId":"q68","format":"json","model":"gpt-5-nano","expected":"freeda.maggio74@gmail.com","actual":"freeda.maggio74@gmail.com","isCorrect":true,"inputTokens":9739,"outputTokens":205,"latencyMs":2961.5506250000035},{"questionId":"q68","format":"toon","model":"gpt-5-nano","expected":"freeda.maggio74@gmail.com","actual":"freeda.maggio74@gmail.com","isCorrect":true,"inputTokens":6013,"outputTokens":141,"latencyMs":2043.8920419999922},{"questionId":"q68","format":"csv","model":"gpt-5-nano","expected":"freeda.maggio74@gmail.com","actual":"freeda.maggio74@gmail.com","isCorrect":true,"inputTokens":6781,"outputTokens":333,"latencyMs":3585.4907080000266},{"questionId":"q68","format":"xml","model":"gpt-5-nano","expected":"freeda.maggio74@gmail.com","actual":"freeda.maggio74@gmail.com","isCorrect":true,"inputTokens":11037,"outputTokens":141,"latencyMs":3028.4967079999915},{"questionId":"q68","format":"yaml","model":"gpt-5-nano","expected":"freeda.maggio74@gmail.com","actual":"freeda.maggio74@gmail.com","isCorrect":true,"inputTokens":7373,"outpu
tTokens":141,"latencyMs":2843.2516249999753},{"questionId":"q69","format":"json","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":9735,"outputTokens":903,"latencyMs":15920.834208999993},{"questionId":"q69","format":"toon","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":6009,"outputTokens":583,"latencyMs":6311.494167000026},{"questionId":"q69","format":"csv","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":6777,"outputTokens":1159,"latencyMs":11771.282832999976},{"questionId":"q69","format":"xml","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":11033,"outputTokens":647,"latencyMs":4768.233042000007},{"questionId":"q69","format":"yaml","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":7369,"outputTokens":711,"latencyMs":12148.621790999983},{"questionId":"q70","format":"json","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":9735,"outputTokens":647,"latencyMs":7048.331458000001},{"questionId":"q70","format":"toon","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":6009,"outputTokens":519,"latencyMs":11328.925374999992},{"questionId":"q70","format":"csv","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":6777,"outputTokens":583,"latencyMs":6098.344834000018},{"questionId":"q70","format":"xml","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":11033,"outputTokens":903,"latencyMs":9603.738207999995},{"questionId":"q70","format":"yaml","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":7369,"outputTokens":903,"latencyMs":14231.113124999974},{"questionId":"q71","format":"json","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":9736,"outputTokens":775,"latencyMs":7369.550875000015},{"questionId":"q71","format":"toon","model":"gpt
-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":6010,"outputTokens":583,"latencyMs":6731.325707999989},{"questionId":"q71","format":"csv","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":6778,"outputTokens":583,"latencyMs":7837.276500000007},{"questionId":"q71","format":"xml","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":11034,"outputTokens":519,"latencyMs":6240.008499999996},{"questionId":"q71","format":"yaml","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":7370,"outputTokens":583,"latencyMs":5608.396291000012},{"questionId":"q72","format":"json","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":9736,"outputTokens":775,"latencyMs":6498.46762499999},{"questionId":"q72","format":"toon","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":6010,"outputTokens":455,"latencyMs":5201.593292000005},{"questionId":"q72","format":"csv","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":6778,"outputTokens":839,"latencyMs":8005.897459},{"questionId":"q72","format":"xml","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":11034,"outputTokens":903,"latencyMs":8286.163332999975},{"questionId":"q72","format":"yaml","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":7370,"outputTokens":583,"latencyMs":3667.8866249999846},{"questionId":"q73","format":"json","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":9736,"outputTokens":711,"latencyMs":4776.384583000006},{"questionId":"q73","format":"toon","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":6010,"outputTokens":711,"latencyMs":9609.80254199999},{"questionId":"q73","format":"csv","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":6778,"outputTokens":583,"latencyMs":5
845.595000000001},{"questionId":"q73","format":"xml","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":11034,"outputTokens":1031,"latencyMs":11357.896833000006},{"questionId":"q73","format":"yaml","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":7370,"outputTokens":519,"latencyMs":5951.586875000008},{"questionId":"q74","format":"json","model":"gpt-5-nano","expected":"42342.25","actual":"42342.25","isCorrect":true,"inputTokens":9736,"outputTokens":1802,"latencyMs":19755.418540999992},{"questionId":"q74","format":"toon","model":"gpt-5-nano","expected":"42342.25","actual":"42342.25","isCorrect":true,"inputTokens":6010,"outputTokens":3978,"latencyMs":33005.04820800002},{"questionId":"q74","format":"csv","model":"gpt-5-nano","expected":"42342.25","actual":"42342.25","isCorrect":true,"inputTokens":6778,"outputTokens":2826,"latencyMs":15668.504250000027},{"questionId":"q74","format":"xml","model":"gpt-5-nano","expected":"42342.25","actual":"42342.25","isCorrect":true,"inputTokens":11034,"outputTokens":2442,"latencyMs":21508.00350000002},{"questionId":"q74","format":"yaml","model":"gpt-5-nano","expected":"42342.25","actual":"49193.23","isCorrect":false,"inputTokens":7370,"outputTokens":2378,"latencyMs":22421.654666999995},{"questionId":"q75","format":"json","model":"gpt-5-nano","expected":"846.85","actual":"846.85","isCorrect":true,"inputTokens":9734,"outputTokens":2569,"latencyMs":14351.128457999992},{"questionId":"q75","format":"toon","model":"gpt-5-nano","expected":"846.85","actual":"846.85","isCorrect":true,"inputTokens":6008,"outputTokens":2889,"latencyMs":26770.479124999983},{"questionId":"q75","format":"csv","model":"gpt-5-nano","expected":"846.85","actual":"846.85","isCorrect":true,"inputTokens":6776,"outputTokens":1865,"latencyMs":15913.093415999989},{"questionId":"q75","format":"xml","model":"gpt-5-nano","expected":"846.85","actual":"846.85","isCorrect":true,"inputTokens":11032,"outputTokens":1545,"l
atencyMs":13416.188874999993},{"questionId":"q75","format":"yaml","model":"gpt-5-nano","expected":"846.85","actual":"846.85","isCorrect":true,"inputTokens":7368,"outputTokens":2569,"latencyMs":23448.192584000004},{"questionId":"q76","format":"json","model":"gpt-5-nano","expected":"50","actual":"50","isCorrect":true,"inputTokens":9735,"outputTokens":135,"latencyMs":5185.920958000002},{"questionId":"q76","format":"toon","model":"gpt-5-nano","expected":"50","actual":"50","isCorrect":true,"inputTokens":6009,"outputTokens":71,"latencyMs":2111.6935419999936},{"questionId":"q76","format":"csv","model":"gpt-5-nano","expected":"50","actual":"50","isCorrect":true,"inputTokens":6777,"outputTokens":71,"latencyMs":2160.7785420000146},{"questionId":"q76","format":"xml","model":"gpt-5-nano","expected":"50","actual":"50","isCorrect":true,"inputTokens":11033,"outputTokens":455,"latencyMs":4898.41833299998},{"questionId":"q76","format":"yaml","model":"gpt-5-nano","expected":"50","actual":"50","isCorrect":true,"inputTokens":7369,"outputTokens":263,"latencyMs":3025.5673750000133},{"questionId":"q77","format":"json","model":"gpt-5-nano","expected":"1936.06","actual":"1936.06","isCorrect":true,"inputTokens":9734,"outputTokens":1098,"latencyMs":11276.571957999986},{"questionId":"q77","format":"toon","model":"gpt-5-nano","expected":"1936.06","actual":"1936.06","isCorrect":true,"inputTokens":6008,"outputTokens":1034,"latencyMs":11671.425916999986},{"questionId":"q77","format":"csv","model":"gpt-5-nano","expected":"1936.06","actual":"1936.06","isCorrect":true,"inputTokens":6776,"outputTokens":842,"latencyMs":7802.907333999989},{"questionId":"q77","format":"xml","model":"gpt-5-nano","expected":"1936.06","actual":"1936.06","isCorrect":true,"inputTokens":11032,"outputTokens":1098,"latencyMs":7163.344249999995},{"questionId":"q77","format":"yaml","model":"gpt-5-nano","expected":"1936.06","actual":"1936.06","isCorrect":true,"inputTokens":7368,"outputTokens":1354,"latencyMs":13101.612166000006},{"
questionId":"q78","format":"json","model":"gpt-5-nano","expected":"44","actual":"44","isCorrect":true,"inputTokens":9738,"outputTokens":2759,"latencyMs":25064.072875000013},{"questionId":"q78","format":"toon","model":"gpt-5-nano","expected":"44","actual":"44","isCorrect":true,"inputTokens":6012,"outputTokens":1351,"latencyMs":6848.968334000005},{"questionId":"q78","format":"csv","model":"gpt-5-nano","expected":"44","actual":"46","isCorrect":false,"inputTokens":6780,"outputTokens":1415,"latencyMs":14459.32925000001},{"questionId":"q78","format":"xml","model":"gpt-5-nano","expected":"44","actual":"44","isCorrect":true,"inputTokens":11036,"outputTokens":1607,"latencyMs":16181.164584000013},{"questionId":"q78","format":"yaml","model":"gpt-5-nano","expected":"44","actual":"44","isCorrect":true,"inputTokens":7372,"outputTokens":1671,"latencyMs":13176.961833000008},{"questionId":"q79","format":"json","model":"gpt-5-nano","expected":"39","actual":"39","isCorrect":true,"inputTokens":9738,"outputTokens":1735,"latencyMs":16681.134500000015},{"questionId":"q79","format":"toon","model":"gpt-5-nano","expected":"39","actual":"39","isCorrect":true,"inputTokens":6012,"outputTokens":1031,"latencyMs":9764.626374999993},{"questionId":"q79","format":"csv","model":"gpt-5-nano","expected":"39","actual":"39","isCorrect":true,"inputTokens":6780,"outputTokens":1671,"latencyMs":15794.334374999977},{"questionId":"q79","format":"xml","model":"gpt-5-nano","expected":"39","actual":"39","isCorrect":true,"inputTokens":11036,"outputTokens":967,"latencyMs":9426.200333999994},{"questionId":"q79","format":"yaml","model":"gpt-5-nano","expected":"39","actual":"39","isCorrect":true,"inputTokens":7372,"outputTokens":1351,"latencyMs":13007.724125000008},{"questionId":"q80","format":"json","model":"gpt-5-nano","expected":"32","actual":"32","isCorrect":true,"inputTokens":9738,"outputTokens":1863,"latencyMs":19127.65849999999},{"questionId":"q80","format":"toon","model":"gpt-5-nano","expected":"32","actual":"3
2","isCorrect":true,"inputTokens":6012,"outputTokens":1543,"latencyMs":16356.698375000007},{"questionId":"q80","format":"csv","model":"gpt-5-nano","expected":"32","actual":"32","isCorrect":true,"inputTokens":6780,"outputTokens":1543,"latencyMs":11483.868124999979},{"questionId":"q80","format":"xml","model":"gpt-5-nano","expected":"32","actual":"32","isCorrect":true,"inputTokens":11036,"outputTokens":1351,"latencyMs":15123.078042000008},{"questionId":"q80","format":"yaml","model":"gpt-5-nano","expected":"32","actual":"31","isCorrect":false,"inputTokens":7372,"outputTokens":1287,"latencyMs":6462.253124999988},{"questionId":"q81","format":"json","model":"gpt-5-nano","expected":"7","actual":"7","isCorrect":true,"inputTokens":9742,"outputTokens":903,"latencyMs":8882.427333},{"questionId":"q81","format":"toon","model":"gpt-5-nano","expected":"7","actual":"7","isCorrect":true,"inputTokens":6016,"outputTokens":711,"latencyMs":12324.765500000009},{"questionId":"q81","format":"csv","model":"gpt-5-nano","expected":"7","actual":"7","isCorrect":true,"inputTokens":6784,"outputTokens":583,"latencyMs":6280.893833000009},{"questionId":"q81","format":"xml","model":"gpt-5-nano","expected":"7","actual":"7","isCorrect":true,"inputTokens":11040,"outputTokens":1543,"latencyMs":15681.051708000014},{"questionId":"q81","format":"yaml","model":"gpt-5-nano","expected":"7","actual":"7","isCorrect":true,"inputTokens":7376,"outputTokens":775,"latencyMs":7663.829792000004},{"questionId":"q82","format":"json","model":"gpt-5-nano","expected":"8","actual":"8","isCorrect":true,"inputTokens":9742,"outputTokens":775,"latencyMs":6653.681707999989},{"questionId":"q82","format":"toon","model":"gpt-5-nano","expected":"8","actual":"8","isCorrect":true,"inputTokens":6016,"outputTokens":775,"latencyMs":7786.641916000022},{"questionId":"q82","format":"csv","model":"gpt-5-nano","expected":"8","actual":"8","isCorrect":true,"inputTokens":6784,"outputTokens":1159,"latencyMs":10789.47875000001},{"questionId":"q82","
format":"xml","model":"gpt-5-nano","expected":"8","actual":"8","isCorrect":true,"inputTokens":11040,"outputTokens":1223,"latencyMs":8007.263500000001},{"questionId":"q82","format":"yaml","model":"gpt-5-nano","expected":"8","actual":"8","isCorrect":true,"inputTokens":7376,"outputTokens":711,"latencyMs":4364.750958999997},{"questionId":"q83","format":"json","model":"gpt-5-nano","expected":"7","actual":"7","isCorrect":true,"inputTokens":9743,"outputTokens":711,"latencyMs":8292.946624999982},{"questionId":"q83","format":"toon","model":"gpt-5-nano","expected":"7","actual":"7","isCorrect":true,"inputTokens":6017,"outputTokens":839,"latencyMs":4682.287000000011},{"questionId":"q83","format":"csv","model":"gpt-5-nano","expected":"7","actual":"7","isCorrect":true,"inputTokens":6785,"outputTokens":775,"latencyMs":6971.318999999989},{"questionId":"q83","format":"xml","model":"gpt-5-nano","expected":"7","actual":"7","isCorrect":true,"inputTokens":11041,"outputTokens":1287,"latencyMs":7167.556458000006},{"questionId":"q83","format":"yaml","model":"gpt-5-nano","expected":"7","actual":"7","isCorrect":true,"inputTokens":7377,"outputTokens":1095,"latencyMs":10502.716707999993},{"questionId":"q84","format":"json","model":"gpt-5-nano","expected":"9","actual":"9","isCorrect":true,"inputTokens":9743,"outputTokens":775,"latencyMs":8604.063166999986},{"questionId":"q84","format":"toon","model":"gpt-5-nano","expected":"9","actual":"9","isCorrect":true,"inputTokens":6017,"outputTokens":839,"latencyMs":7962.534583000001},{"questionId":"q84","format":"csv","model":"gpt-5-nano","expected":"9","actual":"9","isCorrect":true,"inputTokens":6785,"outputTokens":775,"latencyMs":7521.391000000003},{"questionId":"q84","format":"xml","model":"gpt-5-nano","expected":"9","actual":"9","isCorrect":true,"inputTokens":11041,"outputTokens":1479,"latencyMs":13763.949292000005},{"questionId":"q84","format":"yaml","model":"gpt-5-nano","expected":"9","actual":"9","isCorrect":true,"inputTokens":7377,"outputTokens":
775,"latencyMs":7821.052334000007},{"questionId":"q85","format":"json","model":"gpt-5-nano","expected":"9","actual":"9","isCorrect":true,"inputTokens":9743,"outputTokens":839,"latencyMs":9474.105582999997},{"questionId":"q85","format":"toon","model":"gpt-5-nano","expected":"9","actual":"9","isCorrect":true,"inputTokens":6017,"outputTokens":775,"latencyMs":7121.427082999988},{"questionId":"q85","format":"csv","model":"gpt-5-nano","expected":"9","actual":"9","isCorrect":true,"inputTokens":6785,"outputTokens":1031,"latencyMs":11699.078667000023},{"questionId":"q85","format":"xml","model":"gpt-5-nano","expected":"9","actual":"9","isCorrect":true,"inputTokens":11041,"outputTokens":1223,"latencyMs":13459.754665999993},{"questionId":"q85","format":"yaml","model":"gpt-5-nano","expected":"9","actual":"9","isCorrect":true,"inputTokens":7377,"outputTokens":583,"latencyMs":5726.723750000005},{"questionId":"q86","format":"json","model":"gpt-5-nano","expected":"6","actual":"6","isCorrect":true,"inputTokens":9742,"outputTokens":839,"latencyMs":8881.83374999999},{"questionId":"q86","format":"toon","model":"gpt-5-nano","expected":"6","actual":"6","isCorrect":true,"inputTokens":6016,"outputTokens":1095,"latencyMs":10383.40737499998},{"questionId":"q86","format":"csv","model":"gpt-5-nano","expected":"6","actual":"6","isCorrect":true,"inputTokens":6784,"outputTokens":2247,"latencyMs":18668.413291999983},{"questionId":"q86","format":"xml","model":"gpt-5-nano","expected":"6","actual":"8","isCorrect":false,"inputTokens":11040,"outputTokens":2311,"latencyMs":18610.611999999994},{"questionId":"q86","format":"yaml","model":"gpt-5-nano","expected":"6","actual":"6","isCorrect":true,"inputTokens":7376,"outputTokens":1223,"latencyMs":11865.399291000009},{"questionId":"q87","format":"json","model":"gpt-5-nano","expected":"5","actual":"4","isCorrect":false,"inputTokens":9742,"outputTokens":1095,"latencyMs":6300.196458999999},{"questionId":"q87","format":"toon","model":"gpt-5-nano","expected":"5","
actual":"5","isCorrect":true,"inputTokens":6016,"outputTokens":647,"latencyMs":7462.632207999995},{"questionId":"q87","format":"csv","model":"gpt-5-nano","expected":"5","actual":"10","isCorrect":false,"inputTokens":6784,"outputTokens":1479,"latencyMs":14004.076541999995},{"questionId":"q87","format":"xml","model":"gpt-5-nano","expected":"5","actual":"5","isCorrect":true,"inputTokens":11040,"outputTokens":2375,"latencyMs":14972.963541999983},{"questionId":"q87","format":"yaml","model":"gpt-5-nano","expected":"5","actual":"5","isCorrect":true,"inputTokens":7376,"outputTokens":1479,"latencyMs":10234.670041999983},{"questionId":"q88","format":"json","model":"gpt-5-nano","expected":"4","actual":"4","isCorrect":true,"inputTokens":9743,"outputTokens":967,"latencyMs":5632.503333000001},{"questionId":"q88","format":"toon","model":"gpt-5-nano","expected":"4","actual":"4","isCorrect":true,"inputTokens":6017,"outputTokens":711,"latencyMs":6941.359792000003},{"questionId":"q88","format":"csv","model":"gpt-5-nano","expected":"4","actual":"10","isCorrect":false,"inputTokens":6785,"outputTokens":2311,"latencyMs":22497.30016600003},{"questionId":"q88","format":"xml","model":"gpt-5-nano","expected":"4","actual":"4","isCorrect":true,"inputTokens":11041,"outputTokens":1863,"latencyMs":16778.21416600002},{"questionId":"q88","format":"yaml","model":"gpt-5-nano","expected":"4","actual":"4","isCorrect":true,"inputTokens":7377,"outputTokens":1031,"latencyMs":9174.879209000006},{"questionId":"q89","format":"json","model":"gpt-5-nano","expected":"27","actual":"27","isCorrect":true,"inputTokens":9744,"outputTokens":2439,"latencyMs":15790.312208000018},{"questionId":"q89","format":"toon","model":"gpt-5-nano","expected":"27","actual":"27","isCorrect":true,"inputTokens":6018,"outputTokens":3527,"latencyMs":20750.796208999993},{"questionId":"q89","format":"csv","model":"gpt-5-nano","expected":"27","actual":"27","isCorrect":true,"inputTokens":6786,"outputTokens":6087,"latencyMs":32360.92216700001},
{"questionId":"q89","format":"xml","model":"gpt-5-nano","expected":"27","actual":"39","isCorrect":false,"inputTokens":11042,"outputTokens":5127,"latencyMs":51062.804458},{"questionId":"q89","format":"yaml","model":"gpt-5-nano","expected":"27","actual":"27","isCorrect":true,"inputTokens":7378,"outputTokens":3463,"latencyMs":30908.412584000005},{"questionId":"q90","format":"json","model":"gpt-5-nano","expected":"27","actual":"27","isCorrect":true,"inputTokens":9744,"outputTokens":1671,"latencyMs":15720.804125000024},{"questionId":"q90","format":"toon","model":"gpt-5-nano","expected":"27","actual":"27","isCorrect":true,"inputTokens":6018,"outputTokens":1927,"latencyMs":13024.252500000031},{"questionId":"q90","format":"csv","model":"gpt-5-nano","expected":"27","actual":"37","isCorrect":false,"inputTokens":6786,"outputTokens":6471,"latencyMs":58691.74404199998},{"questionId":"q90","format":"xml","model":"gpt-5-nano","expected":"27","actual":"28","isCorrect":false,"inputTokens":11042,"outputTokens":5511,"latencyMs":48244.214707999985},{"questionId":"q90","format":"yaml","model":"gpt-5-nano","expected":"27","actual":"27","isCorrect":true,"inputTokens":7378,"outputTokens":3399,"latencyMs":21301.83387500001},{"questionId":"q91","format":"json","model":"gpt-5-nano","expected":"6975","actual":"6975","isCorrect":true,"inputTokens":3712,"outputTokens":72,"latencyMs":3106.625208000012},{"questionId":"q91","format":"toon","model":"gpt-5-nano","expected":"6975","actual":"6975","isCorrect":true,"inputTokens":1563,"outputTokens":136,"latencyMs":2173.4088749999937},{"questionId":"q91","format":"csv","model":"gpt-5-nano","expected":"6975","actual":"6975","isCorrect":true,"inputTokens":1441,"outputTokens":136,"latencyMs":2207.5250000000233},{"questionId":"q91","format":"xml","model":"gpt-5-nano","expected":"6975","actual":"6975","isCorrect":true,"inputTokens":4423,"outputTokens":136,"latencyMs":2563.4236670000246},{"questionId":"q91","format":"yaml","model":"gpt-5-nano","expected":"6975
","actual":"6975","isCorrect":true,"inputTokens":2985,"outputTokens":72,"latencyMs":2004.1497499999823},{"questionId":"q92","format":"json","model":"gpt-5-nano","expected":"6686.23","actual":"6686.23","isCorrect":true,"inputTokens":3711,"outputTokens":138,"latencyMs":2035.1270420000073},{"questionId":"q92","format":"toon","model":"gpt-5-nano","expected":"6686.23","actual":"6686.23","isCorrect":true,"inputTokens":1562,"outputTokens":138,"latencyMs":4099.307708000008},{"questionId":"q92","format":"csv","model":"gpt-5-nano","expected":"6686.23","actual":"6686.23","isCorrect":true,"inputTokens":1440,"outputTokens":138,"latencyMs":4950.298874999979},{"questionId":"q92","format":"xml","model":"gpt-5-nano","expected":"6686.23","actual":"6686.23","isCorrect":true,"inputTokens":4422,"outputTokens":74,"latencyMs":2060.0328749999753},{"questionId":"q92","format":"yaml","model":"gpt-5-nano","expected":"6686.23","actual":"6686.23","isCorrect":true,"inputTokens":2984,"outputTokens":138,"latencyMs":8157.924708999984},{"questionId":"q93","format":"json","model":"gpt-5-nano","expected":"33","actual":"33","isCorrect":true,"inputTokens":3712,"outputTokens":71,"latencyMs":3266.4955840000184},{"questionId":"q93","format":"toon","model":"gpt-5-nano","expected":"33","actual":"33","isCorrect":true,"inputTokens":1563,"outputTokens":135,"latencyMs":2373.3992499999586},{"questionId":"q93","format":"csv","model":"gpt-5-nano","expected":"33","actual":"33","isCorrect":true,"inputTokens":1441,"outputTokens":71,"latencyMs":1805.440333000035},{"questionId":"q93","format":"xml","model":"gpt-5-nano","expected":"33","actual":"33","isCorrect":true,"inputTokens":4423,"outputTokens":135,"latencyMs":2269.6386250000214},{"questionId":"q93","format":"yaml","model":"gpt-5-nano","expected":"33","actual":"33","isCorrect":true,"inputTokens":2985,"outputTokens":71,"latencyMs":2004.672957999981},{"questionId":"q94","format":"json","model":"gpt-5-nano","expected":"377","actual":"377","isCorrect":true,"inputTokens"
:3712,"outputTokens":71,"latencyMs":1775.2346249999828},{"questionId":"q94","format":"toon","model":"gpt-5-nano","expected":"377","actual":"377","isCorrect":true,"inputTokens":1563,"outputTokens":455,"latencyMs":6650.344334000023},{"questionId":"q94","format":"csv","model":"gpt-5-nano","expected":"377","actual":"377","isCorrect":true,"inputTokens":1441,"outputTokens":71,"latencyMs":2128.058707999997},{"questionId":"q94","format":"xml","model":"gpt-5-nano","expected":"377","actual":"377","isCorrect":true,"inputTokens":4423,"outputTokens":199,"latencyMs":4896.449458000017},{"questionId":"q94","format":"yaml","model":"gpt-5-nano","expected":"377","actual":"377","isCorrect":true,"inputTokens":2985,"outputTokens":71,"latencyMs":1800.050500000012},{"questionId":"q95","format":"json","model":"gpt-5-nano","expected":"0.44","actual":"0.44","isCorrect":true,"inputTokens":3712,"outputTokens":201,"latencyMs":2280.200375000015},{"questionId":"q95","format":"toon","model":"gpt-5-nano","expected":"0.44","actual":"0.44","isCorrect":true,"inputTokens":1563,"outputTokens":201,"latencyMs":2640.3976660000044},{"questionId":"q95","format":"csv","model":"gpt-5-nano","expected":"0.44","actual":"0.44","isCorrect":true,"inputTokens":1441,"outputTokens":137,"latencyMs":2159.501833999995},{"questionId":"q95","format":"xml","model":"gpt-5-nano","expected":"0.44","actual":"0.44","isCorrect":true,"inputTokens":4423,"outputTokens":201,"latencyMs":2729.7381250000326},{"questionId":"q95","format":"yaml","model":"gpt-5-nano","expected":"0.44","actual":"0.44","isCorrect":true,"inputTokens":2985,"outputTokens":137,"latencyMs":2862.6320000000414},{"questionId":"q96","format":"json","model":"gpt-5-nano","expected":"7621","actual":"7621","isCorrect":true,"inputTokens":3712,"outputTokens":136,"latencyMs":1977.3999170000316},{"questionId":"q96","format":"toon","model":"gpt-5-nano","expected":"7621","actual":"7621","isCorrect":true,"inputTokens":1563,"outputTokens":136,"latencyMs":1777.6621659999946},{"ques
tionId":"q96","format":"csv","model":"gpt-5-nano","expected":"7621","actual":"7621","isCorrect":true,"inputTokens":1441,"outputTokens":200,"latencyMs":2808.3442500000237},{"questionId":"q96","format":"xml","model":"gpt-5-nano","expected":"7621","actual":"7621","isCorrect":true,"inputTokens":4423,"outputTokens":136,"latencyMs":2431.7366250000196},{"questionId":"q96","format":"yaml","model":"gpt-5-nano","expected":"7621","actual":"7621","isCorrect":true,"inputTokens":2985,"outputTokens":264,"latencyMs":3476.1824170000036},{"questionId":"q97","format":"json","model":"gpt-5-nano","expected":"1827.12","actual":"1827.12","isCorrect":true,"inputTokens":3711,"outputTokens":266,"latencyMs":2816.1715420000255},{"questionId":"q97","format":"toon","model":"gpt-5-nano","expected":"1827.12","actual":"1827.12","isCorrect":true,"inputTokens":1562,"outputTokens":138,"latencyMs":4694.5291669999715},{"questionId":"q97","format":"csv","model":"gpt-5-nano","expected":"1827.12","actual":"1827.12","isCorrect":true,"inputTokens":1440,"outputTokens":74,"latencyMs":1895.272500000021},{"questionId":"q97","format":"xml","model":"gpt-5-nano","expected":"1827.12","actual":"1827.12","isCorrect":true,"inputTokens":4422,"outputTokens":74,"latencyMs":1696.6640840000473},{"questionId":"q97","format":"yaml","model":"gpt-5-nano","expected":"1827.12","actual":"1827.12","isCorrect":true,"inputTokens":2984,"outputTokens":138,"latencyMs":2057.5235840000096},{"questionId":"q98","format":"json","model":"gpt-5-nano","expected":"44","actual":"44","isCorrect":true,"inputTokens":3712,"outputTokens":71,"latencyMs":2102.908333999978},{"questionId":"q98","format":"toon","model":"gpt-5-nano","expected":"44","actual":"44","isCorrect":true,"inputTokens":1563,"outputTokens":135,"latencyMs":2015.2874580000062},{"questionId":"q98","format":"csv","model":"gpt-5-nano","expected":"44","actual":"44","isCorrect":true,"inputTokens":1441,"outputTokens":391,"latencyMs":4864.857958999986},{"questionId":"q98","format":"xml","model
":"gpt-5-nano","expected":"44","actual":"44","isCorrect":true,"inputTokens":4423,"outputTokens":263,"latencyMs":2451.5397079999675},{"questionId":"q98","format":"yaml","model":"gpt-5-nano","expected":"44","actual":"44","isCorrect":true,"inputTokens":2985,"outputTokens":327,"latencyMs":3204.123082999955},{"questionId":"q99","format":"json","model":"gpt-5-nano","expected":"411","actual":"411","isCorrect":true,"inputTokens":3712,"outputTokens":327,"latencyMs":7051.611250000016},{"questionId":"q99","format":"toon","model":"gpt-5-nano","expected":"411","actual":"411","isCorrect":true,"inputTokens":1563,"outputTokens":199,"latencyMs":2742.971750000026},{"questionId":"q99","format":"csv","model":"gpt-5-nano","expected":"411","actual":"411","isCorrect":true,"inputTokens":1441,"outputTokens":455,"latencyMs":3407.819332999992},{"questionId":"q99","format":"xml","model":"gpt-5-nano","expected":"411","actual":"411","isCorrect":true,"inputTokens":4423,"outputTokens":135,"latencyMs":1760.8075839999947},{"questionId":"q99","format":"yaml","model":"gpt-5-nano","expected":"411","actual":"411","isCorrect":true,"inputTokens":2985,"outputTokens":199,"latencyMs":2489.05237499997},{"questionId":"q100","format":"json","model":"gpt-5-nano","expected":"0.48","actual":"0.48","isCorrect":true,"inputTokens":3712,"outputTokens":201,"latencyMs":2107.64816599997},{"questionId":"q100","format":"toon","model":"gpt-5-nano","expected":"0.48","actual":"0.48","isCorrect":true,"inputTokens":1563,"outputTokens":201,"latencyMs":2152.561207999999},{"questionId":"q100","format":"csv","model":"gpt-5-nano","expected":"0.48","actual":"0.48","isCorrect":true,"inputTokens":1441,"outputTokens":201,"latencyMs":2159.094207999995},{"questionId":"q100","format":"xml","model":"gpt-5-nano","expected":"0.48","actual":"0.48","isCorrect":true,"inputTokens":4423,"outputTokens":585,"latencyMs":8331.69312499999},{"questionId":"q100","format":"yaml","model":"gpt-5-nano","expected":"0.48","actual":"0.48","isCorrect":true,"inpu
tTokens":2985,"outputTokens":137,"latencyMs":3471.0555000000168},{"questionId":"q101","format":"json","model":"gpt-5-nano","expected":"4696","actual":"4696","isCorrect":true,"inputTokens":3712,"outputTokens":136,"latencyMs":2086.7599580000388},{"questionId":"q101","format":"toon","model":"gpt-5-nano","expected":"4696","actual":"4696","isCorrect":true,"inputTokens":1563,"outputTokens":328,"latencyMs":4088.7678339999984},{"questionId":"q101","format":"csv","model":"gpt-5-nano","expected":"4696","actual":"4696","isCorrect":true,"inputTokens":1441,"outputTokens":136,"latencyMs":2125.830750000023},{"questionId":"q101","format":"xml","model":"gpt-5-nano","expected":"4696","actual":"4696","isCorrect":true,"inputTokens":4423,"outputTokens":200,"latencyMs":1937.344999999972},{"questionId":"q101","format":"yaml","model":"gpt-5-nano","expected":"4696","actual":"4696","isCorrect":true,"inputTokens":2985,"outputTokens":136,"latencyMs":1919.3835419999668},{"questionId":"q102","format":"json","model":"gpt-5-nano","expected":"4211.6","actual":"4211.6","isCorrect":true,"inputTokens":3711,"outputTokens":202,"latencyMs":1896.579999999958},{"questionId":"q102","format":"toon","model":"gpt-5-nano","expected":"4211.6","actual":"4211.6","isCorrect":true,"inputTokens":1562,"outputTokens":74,"latencyMs":1951.8673749999725},{"questionId":"q102","format":"csv","model":"gpt-5-nano","expected":"4211.6","actual":"4211.6","isCorrect":true,"inputTokens":1440,"outputTokens":138,"latencyMs":1431.4333340000012},{"questionId":"q102","format":"xml","model":"gpt-5-nano","expected":"4211.6","actual":"4211.6","isCorrect":true,"inputTokens":4422,"outputTokens":266,"latencyMs":2830.9484999999986},{"questionId":"q102","format":"yaml","model":"gpt-5-nano","expected":"4211.6","actual":"4211.6","isCorrect":true,"inputTokens":2984,"outputTokens":202,"latencyMs":2569.2954589999863},{"questionId":"q103","format":"json","model":"gpt-5-nano","expected":"23","actual":"23","isCorrect":true,"inputTokens":3712,"outputTo
kens":135,"latencyMs":3055.4072909999522},{"questionId":"q103","format":"toon","model":"gpt-5-nano","expected":"23","actual":"23","isCorrect":true,"inputTokens":1563,"outputTokens":327,"latencyMs":6063.461208000022},{"questionId":"q103","format":"csv","model":"gpt-5-nano","expected":"23","actual":"23","isCorrect":true,"inputTokens":1441,"outputTokens":135,"latencyMs":1830.0050420000334},{"questionId":"q103","format":"xml","model":"gpt-5-nano","expected":"23","actual":"23","isCorrect":true,"inputTokens":4423,"outputTokens":135,"latencyMs":2073.8175000000047},{"questionId":"q103","format":"yaml","model":"gpt-5-nano","expected":"23","actual":"23","isCorrect":true,"inputTokens":2985,"outputTokens":135,"latencyMs":2024.4842910000007},{"questionId":"q104","format":"json","model":"gpt-5-nano","expected":"344498","actual":"344498","isCorrect":true,"inputTokens":3709,"outputTokens":1864,"latencyMs":20743.07024999999},{"questionId":"q104","format":"toon","model":"gpt-5-nano","expected":"344498","actual":"344498","isCorrect":true,"inputTokens":1560,"outputTokens":2376,"latencyMs":19158.497167000023},{"questionId":"q104","format":"csv","model":"gpt-5-nano","expected":"344498","actual":"344498","isCorrect":true,"inputTokens":1438,"outputTokens":4040,"latencyMs":33570.64941699995},{"questionId":"q104","format":"xml","model":"gpt-5-nano","expected":"344498","actual":"344498","isCorrect":true,"inputTokens":4420,"outputTokens":2120,"latencyMs":18319.398874999955},{"questionId":"q104","format":"yaml","model":"gpt-5-nano","expected":"344498","actual":"344498","isCorrect":true,"inputTokens":2982,"outputTokens":2504,"latencyMs":17486.27891699999},{"questionId":"q105","format":"json","model":"gpt-5-nano","expected":"312818.50","actual":"312818.50","isCorrect":true,"inputTokens":3707,"outputTokens":4042,"latencyMs":27715.301083000028},{"questionId":"q105","format":"toon","model":"gpt-5-nano","expected":"312818.50","actual":"312818.50","isCorrect":true,"inputTokens":1558,"outputTokens":263
4,"latencyMs":17378.131166999985},{"questionId":"q105","format":"csv","model":"gpt-5-nano","expected":"312818.50","actual":"312718.50","isCorrect":false,"inputTokens":1436,"outputTokens":2954,"latencyMs":29288.556417000014},{"questionId":"q105","format":"xml","model":"gpt-5-nano","expected":"312818.50","actual":"312818.50","isCorrect":true,"inputTokens":4418,"outputTokens":7690,"latencyMs":46754.21683300001},{"questionId":"q105","format":"yaml","model":"gpt-5-nano","expected":"312818.50","actual":"312818.50","isCorrect":true,"inputTokens":2980,"outputTokens":3786,"latencyMs":33383.13175},{"questionId":"q106","format":"json","model":"gpt-5-nano","expected":"1811","actual":"1811","isCorrect":true,"inputTokens":3709,"outputTokens":1544,"latencyMs":13894.615500000014},{"questionId":"q106","format":"toon","model":"gpt-5-nano","expected":"1811","actual":"1811","isCorrect":true,"inputTokens":1560,"outputTokens":1928,"latencyMs":12648.721375000023},{"questionId":"q106","format":"csv","model":"gpt-5-nano","expected":"1811","actual":"1811","isCorrect":true,"inputTokens":1438,"outputTokens":1928,"latencyMs":18158.010540999996},{"questionId":"q106","format":"xml","model":"gpt-5-nano","expected":"1811","actual":"1811","isCorrect":true,"inputTokens":4420,"outputTokens":2568,"latencyMs":15732.940917},{"questionId":"q106","format":"yaml","model":"gpt-5-nano","expected":"1811","actual":"1811","isCorrect":true,"inputTokens":2982,"outputTokens":1288,"latencyMs":10955.163375000004},{"questionId":"q107","format":"json","model":"gpt-5-nano","expected":"5742","actual":"5741.63","isCorrect":false,"inputTokens":3708,"outputTokens":2826,"latencyMs":26201.144542000024},{"questionId":"q107","format":"toon","model":"gpt-5-nano","expected":"5742","actual":"5741.63","isCorrect":false,"inputTokens":1559,"outputTokens":3594,"latencyMs":35990.964875000005},{"questionId":"q107","format":"csv","model":"gpt-5-nano","expected":"5742","actual":"5741.63","isCorrect":true,"inputTokens":1437,"outputTokens":
2890,"latencyMs":23745.996999999974},{"questionId":"q107","format":"xml","model":"gpt-5-nano","expected":"5742","actual":"5741.63","isCorrect":false,"inputTokens":4419,"outputTokens":2762,"latencyMs":23545.925084000046},{"questionId":"q107","format":"yaml","model":"gpt-5-nano","expected":"5742","actual":"5741.63","isCorrect":false,"inputTokens":2981,"outputTokens":2122,"latencyMs":18222.963749999995},{"questionId":"q108","format":"json","model":"gpt-5-nano","expected":"5213.64","actual":"5401.64","isCorrect":false,"inputTokens":3706,"outputTokens":1866,"latencyMs":10071.50433299999},{"questionId":"q108","format":"toon","model":"gpt-5-nano","expected":"5213.64","actual":"5213.64","isCorrect":true,"inputTokens":1557,"outputTokens":5066,"latencyMs":36936.507458999986},{"questionId":"q108","format":"csv","model":"gpt-5-nano","expected":"5213.64","actual":"5211.98","isCorrect":false,"inputTokens":1435,"outputTokens":3722,"latencyMs":31915.33920799999},{"questionId":"q108","format":"xml","model":"gpt-5-nano","expected":"5213.64","actual":"5213.64","isCorrect":true,"inputTokens":4417,"outputTokens":4042,"latencyMs":22632.840333},{"questionId":"q108","format":"yaml","model":"gpt-5-nano","expected":"5213.64","actual":"5213.64","isCorrect":true,"inputTokens":2979,"outputTokens":3850,"latencyMs":18988.701457999996},{"questionId":"q109","format":"json","model":"gpt-5-nano","expected":"30","actual":"30.18","isCorrect":false,"inputTokens":3708,"outputTokens":1353,"latencyMs":13138.985000000044},{"questionId":"q109","format":"toon","model":"gpt-5-nano","expected":"30","actual":"30.1833333333","isCorrect":false,"inputTokens":1559,"outputTokens":1996,"latencyMs":26100.575125000032},{"questionId":"q109","format":"csv","model":"gpt-5-nano","expected":"30","actual":"30.18","isCorrect":false,"inputTokens":1437,"outputTokens":2697,"latencyMs":24620.171333000006},{"questionId":"q109","format":"xml","model":"gpt-5-nano","expected":"30","actual":"30.18","isCorrect":false,"inputTokens":4419,
"outputTokens":2825,"latencyMs":18780.89512500004},{"questionId":"q109","format":"yaml","model":"gpt-5-nano","expected":"30","actual":"30.1833333333","isCorrect":false,"inputTokens":2981,"outputTokens":3084,"latencyMs":30253.369750000013},{"questionId":"q110","format":"json","model":"gpt-5-nano","expected":"60","actual":"60","isCorrect":true,"inputTokens":3708,"outputTokens":391,"latencyMs":4351.089999999967},{"questionId":"q110","format":"toon","model":"gpt-5-nano","expected":"60","actual":"60","isCorrect":true,"inputTokens":1559,"outputTokens":327,"latencyMs":3603.5555000000168},{"questionId":"q110","format":"csv","model":"gpt-5-nano","expected":"60","actual":"60","isCorrect":true,"inputTokens":1437,"outputTokens":263,"latencyMs":3470.5262499999953},{"questionId":"q110","format":"xml","model":"gpt-5-nano","expected":"60","actual":"60","isCorrect":true,"inputTokens":4419,"outputTokens":263,"latencyMs":3301.9788749999716},{"questionId":"q110","format":"yaml","model":"gpt-5-nano","expected":"60","actual":"60","isCorrect":true,"inputTokens":2981,"outputTokens":391,"latencyMs":5403.282624999993},{"questionId":"q111","format":"json","model":"gpt-5-nano","expected":"7944","actual":"7944","isCorrect":true,"inputTokens":3711,"outputTokens":520,"latencyMs":3600.170083999983},{"questionId":"q111","format":"toon","model":"gpt-5-nano","expected":"7944","actual":"7944","isCorrect":true,"inputTokens":1562,"outputTokens":712,"latencyMs":7900.946333000029},{"questionId":"q111","format":"csv","model":"gpt-5-nano","expected":"7944","actual":"7944","isCorrect":true,"inputTokens":1440,"outputTokens":648,"latencyMs":7093.944542000012},{"questionId":"q111","format":"xml","model":"gpt-5-nano","expected":"7944","actual":"7944","isCorrect":true,"inputTokens":4422,"outputTokens":1352,"latencyMs":12142.23683400004},{"questionId":"q111","format":"yaml","model":"gpt-5-nano","expected":"7944","actual":"7944","isCorrect":true,"inputTokens":2984,"outputTokens":776,"latencyMs":9676.69750000001},{"
questionId":"q112","format":"json","model":"gpt-5-nano","expected":"42","actual":"42","isCorrect":true,"inputTokens":3709,"outputTokens":2823,"latencyMs":26626.55466700002},{"questionId":"q112","format":"toon","model":"gpt-5-nano","expected":"42","actual":"42","isCorrect":true,"inputTokens":1560,"outputTokens":1479,"latencyMs":11620.979290999996},{"questionId":"q112","format":"csv","model":"gpt-5-nano","expected":"42","actual":"42","isCorrect":true,"inputTokens":1438,"outputTokens":1799,"latencyMs":17816.583874999953},{"questionId":"q112","format":"xml","model":"gpt-5-nano","expected":"42","actual":"42","isCorrect":true,"inputTokens":4420,"outputTokens":1991,"latencyMs":11608.117665999976},{"questionId":"q112","format":"yaml","model":"gpt-5-nano","expected":"42","actual":"42","isCorrect":true,"inputTokens":2982,"outputTokens":1927,"latencyMs":17976.007583},{"questionId":"q113","format":"json","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":3709,"outputTokens":1351,"latencyMs":7504.992665999976},{"questionId":"q113","format":"toon","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":1560,"outputTokens":1287,"latencyMs":8089.16079200001},{"questionId":"q113","format":"csv","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":1438,"outputTokens":1159,"latencyMs":8368.544332999969},{"questionId":"q113","format":"xml","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":4420,"outputTokens":1351,"latencyMs":9879.407125000027},{"questionId":"q113","format":"yaml","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":2982,"outputTokens":1415,"latencyMs":8203.717042000033},{"questionId":"q114","format":"json","model":"gpt-5-nano","expected":"26","actual":"26","isCorrect":true,"inputTokens":3715,"outputTokens":2503,"latencyMs":14300.482250000001},{"questionId":"q114","format":"toon","model":"gpt-5-nano","expected":"26","actual":
"26","isCorrect":true,"inputTokens":1566,"outputTokens":1863,"latencyMs":19860.288916999998},{"questionId":"q114","format":"csv","model":"gpt-5-nano","expected":"26","actual":"26","isCorrect":true,"inputTokens":1444,"outputTokens":3207,"latencyMs":17891.136750000005},{"questionId":"q114","format":"xml","model":"gpt-5-nano","expected":"26","actual":"26","isCorrect":true,"inputTokens":4426,"outputTokens":2183,"latencyMs":23856.857374999963},{"questionId":"q114","format":"yaml","model":"gpt-5-nano","expected":"26","actual":"27","isCorrect":false,"inputTokens":2988,"outputTokens":1863,"latencyMs":15280.603833000001},{"questionId":"q115","format":"json","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":3715,"outputTokens":1287,"latencyMs":14521.147874999966},{"questionId":"q115","format":"toon","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":1566,"outputTokens":1671,"latencyMs":19639.551666999992},{"questionId":"q115","format":"csv","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":1444,"outputTokens":1415,"latencyMs":8054.100792000012},{"questionId":"q115","format":"xml","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":4426,"outputTokens":1799,"latencyMs":18204.095917000028},{"questionId":"q115","format":"yaml","model":"gpt-5-nano","expected":"10","actual":"10","isCorrect":true,"inputTokens":2988,"outputTokens":1415,"latencyMs":13753.654209},{"questionId":"q116","format":"json","model":"gpt-5-nano","expected":"28","actual":"28","isCorrect":true,"inputTokens":3715,"outputTokens":1863,"latencyMs":11231.150665999972},{"questionId":"q116","format":"toon","model":"gpt-5-nano","expected":"28","actual":"28","isCorrect":true,"inputTokens":1566,"outputTokens":2247,"latencyMs":16329.248583000037},{"questionId":"q116","format":"csv","model":"gpt-5-nano","expected":"28","actual":"28","isCorrect":true,"inputTokens":1444,"outputTokens":1671,"latencyMs":15
908.416999999958},{"questionId":"q116","format":"xml","model":"gpt-5-nano","expected":"28","actual":"28","isCorrect":true,"inputTokens":4426,"outputTokens":3015,"latencyMs":32053.260583999974},{"questionId":"q116","format":"yaml","model":"gpt-5-nano","expected":"28","actual":"28","isCorrect":true,"inputTokens":2988,"outputTokens":1991,"latencyMs":15593.033584000019},{"questionId":"q117","format":"json","model":"gpt-5-nano","expected":"28","actual":"28","isCorrect":true,"inputTokens":3716,"outputTokens":2247,"latencyMs":22851.75224999996},{"questionId":"q117","format":"toon","model":"gpt-5-nano","expected":"28","actual":"28","isCorrect":true,"inputTokens":1567,"outputTokens":2119,"latencyMs":20895.994542},{"questionId":"q117","format":"csv","model":"gpt-5-nano","expected":"28","actual":"28","isCorrect":true,"inputTokens":1445,"outputTokens":2119,"latencyMs":13167.545125000004},{"questionId":"q117","format":"xml","model":"gpt-5-nano","expected":"28","actual":"28","isCorrect":true,"inputTokens":4427,"outputTokens":2247,"latencyMs":26842.44524999999},{"questionId":"q117","format":"yaml","model":"gpt-5-nano","expected":"28","actual":"28","isCorrect":true,"inputTokens":2989,"outputTokens":2119,"latencyMs":14630.024459000037},{"questionId":"q118","format":"json","model":"gpt-5-nano","expected":"28","actual":"28","isCorrect":true,"inputTokens":3716,"outputTokens":2439,"latencyMs":13179.716833000013},{"questionId":"q118","format":"toon","model":"gpt-5-nano","expected":"28","actual":"28","isCorrect":true,"inputTokens":1567,"outputTokens":2311,"latencyMs":12719.164832999988},{"questionId":"q118","format":"csv","model":"gpt-5-nano","expected":"28","actual":"28","isCorrect":true,"inputTokens":1445,"outputTokens":2695,"latencyMs":13549.327916999988},{"questionId":"q118","format":"xml","model":"gpt-5-nano","expected":"28","actual":"28","isCorrect":true,"inputTokens":4427,"outputTokens":2183,"latencyMs":23100.062124999997},{"questionId":"q118","format":"yaml","model":"gpt-5-nano","
expected":"28","actual":"28","isCorrect":true,"inputTokens":2989,"outputTokens":3719,"latencyMs":18481.843249999976},{"questionId":"q119","format":"json","model":"gpt-5-nano","expected":"26","actual":"26","isCorrect":true,"inputTokens":3716,"outputTokens":1863,"latencyMs":20471.790625000023},{"questionId":"q119","format":"toon","model":"gpt-5-nano","expected":"26","actual":"26","isCorrect":true,"inputTokens":1567,"outputTokens":2887,"latencyMs":14944.100292000046},{"questionId":"q119","format":"csv","model":"gpt-5-nano","expected":"26","actual":"26","isCorrect":true,"inputTokens":1445,"outputTokens":1863,"latencyMs":11682.023917000042},{"questionId":"q119","format":"xml","model":"gpt-5-nano","expected":"26","actual":"26","isCorrect":true,"inputTokens":4427,"outputTokens":4615,"latencyMs":26417.39070799999},{"questionId":"q119","format":"yaml","model":"gpt-5-nano","expected":"26","actual":"26","isCorrect":true,"inputTokens":2989,"outputTokens":3719,"latencyMs":32432.48529099999},{"questionId":"q120","format":"json","model":"gpt-5-nano","expected":"25","actual":"25","isCorrect":true,"inputTokens":3716,"outputTokens":1927,"latencyMs":11993.960625000007},{"questionId":"q120","format":"toon","model":"gpt-5-nano","expected":"25","actual":"25","isCorrect":true,"inputTokens":1567,"outputTokens":1479,"latencyMs":19029.13149999996},{"questionId":"q120","format":"csv","model":"gpt-5-nano","expected":"25","actual":"25","isCorrect":true,"inputTokens":1445,"outputTokens":2119,"latencyMs":18643.346375000023},{"questionId":"q120","format":"xml","model":"gpt-5-nano","expected":"25","actual":"25","isCorrect":true,"inputTokens":4427,"outputTokens":3399,"latencyMs":25141.51587500004},{"questionId":"q120","format":"yaml","model":"gpt-5-nano","expected":"25","actual":"25","isCorrect":true,"inputTokens":2989,"outputTokens":3527,"latencyMs":29699.743499999982},{"questionId":"q121","format":"json","model":"gpt-5-nano","expected":"35","actual":"35","isCorrect":true,"inputTokens":3714,"output
Tokens":3271,"latencyMs":21694.63841699995},{"questionId":"q121","format":"toon","model":"gpt-5-nano","expected":"35","actual":"35","isCorrect":true,"inputTokens":1565,"outputTokens":2183,"latencyMs":15088.209875},{"questionId":"q121","format":"csv","model":"gpt-5-nano","expected":"35","actual":"35","isCorrect":true,"inputTokens":1443,"outputTokens":2119,"latencyMs":22566.850584},{"questionId":"q121","format":"xml","model":"gpt-5-nano","expected":"35","actual":"36","isCorrect":false,"inputTokens":4425,"outputTokens":2055,"latencyMs":14981.830625000002},{"questionId":"q121","format":"yaml","model":"gpt-5-nano","expected":"35","actual":"35","isCorrect":true,"inputTokens":2987,"outputTokens":2631,"latencyMs":15605.16954100004},{"questionId":"q122","format":"json","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":3714,"outputTokens":1607,"latencyMs":10281.626707999967},{"questionId":"q122","format":"toon","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":1565,"outputTokens":1799,"latencyMs":10508.287624999997},{"questionId":"q122","format":"csv","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":1443,"outputTokens":1543,"latencyMs":13406.946959000023},{"questionId":"q122","format":"xml","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":4425,"outputTokens":1735,"latencyMs":19976.58150000003},{"questionId":"q122","format":"yaml","model":"gpt-5-nano","expected":"12","actual":"12","isCorrect":true,"inputTokens":2987,"outputTokens":1223,"latencyMs":7062.52566699998},{"questionId":"q123","format":"json","model":"gpt-5-nano","expected":"32","actual":"32","isCorrect":true,"inputTokens":3718,"outputTokens":2439,"latencyMs":22098.910083999974},{"questionId":"q123","format":"toon","model":"gpt-5-nano","expected":"32","actual":"32","isCorrect":true,"inputTokens":1569,"outputTokens":2119,"latencyMs":24676.80033300002},{"questionId":"q123","format":"csv","mod
el":"gpt-5-nano","expected":"32","actual":"32","isCorrect":true,"inputTokens":1447,"outputTokens":4615,"latencyMs":47459.09729100001},{"questionId":"q123","format":"xml","model":"gpt-5-nano","expected":"32","actual":"32","isCorrect":true,"inputTokens":4429,"outputTokens":2951,"latencyMs":28576.63483300002},{"questionId":"q123","format":"yaml","model":"gpt-5-nano","expected":"32","actual":"32","isCorrect":true,"inputTokens":2991,"outputTokens":3463,"latencyMs":22446.219542000035},{"questionId":"q124","format":"json","model":"gpt-5-nano","expected":"32","actual":"32","isCorrect":true,"inputTokens":3718,"outputTokens":2503,"latencyMs":23901.167791999993},{"questionId":"q124","format":"toon","model":"gpt-5-nano","expected":"32","actual":"32","isCorrect":true,"inputTokens":1569,"outputTokens":2503,"latencyMs":17261.147666999954},{"questionId":"q124","format":"csv","model":"gpt-5-nano","expected":"32","actual":"32","isCorrect":true,"inputTokens":1447,"outputTokens":3143,"latencyMs":21859.215417},{"questionId":"q124","format":"xml","model":"gpt-5-nano","expected":"32","actual":"32","isCorrect":true,"inputTokens":4429,"outputTokens":2439,"latencyMs":25891.808375000022},{"questionId":"q124","format":"yaml","model":"gpt-5-nano","expected":"32","actual":"32","isCorrect":true,"inputTokens":2991,"outputTokens":2311,"latencyMs":14862.04933400004},{"questionId":"q125","format":"json","model":"gpt-5-nano","expected":"430886","actual":"430886","isCorrect":true,"inputTokens":15189,"outputTokens":264,"latencyMs":6322.1847919999855},{"questionId":"q125","format":"toon","model":"gpt-5-nano","expected":"430886","actual":"430886","isCorrect":true,"inputTokens":8790,"outputTokens":456,"latencyMs":8064.342333999986},{"questionId":"q125","format":"csv","model":"gpt-5-nano","expected":"430886","actual":"430886","isCorrect":true,"inputTokens":8558,"outputTokens":136,"latencyMs":2674.229042000021},{"questionId":"q125","format":"xml","model":"gpt-5-nano","expected":"430886","actual":"430886","is
Correct":true,"inputTokens":17140,"outputTokens":200,"latencyMs":2699.6745830000145},{"questionId":"q125","format":"yaml","model":"gpt-5-nano","expected":"430886","actual":"430886","isCorrect":true,"inputTokens":13173,"outputTokens":72,"latencyMs":2387.1922920000507},{"questionId":"q126","format":"json","model":"gpt-5-nano","expected":"52904","actual":"52904","isCorrect":true,"inputTokens":15191,"outputTokens":264,"latencyMs":10286.163166999992},{"questionId":"q126","format":"toon","model":"gpt-5-nano","expected":"52904","actual":"52904","isCorrect":true,"inputTokens":8792,"outputTokens":264,"latencyMs":3193.7972500000033},{"questionId":"q126","format":"csv","model":"gpt-5-nano","expected":"52904","actual":"52904","isCorrect":true,"inputTokens":8560,"outputTokens":392,"latencyMs":4903.814499999979},{"questionId":"q126","format":"xml","model":"gpt-5-nano","expected":"52904","actual":"52904","isCorrect":true,"inputTokens":17142,"outputTokens":328,"latencyMs":7727.762624999974},{"questionId":"q126","format":"yaml","model":"gpt-5-nano","expected":"52904","actual":"52904","isCorrect":true,"inputTokens":13175,"outputTokens":264,"latencyMs":3304.836208999972},{"questionId":"q127","format":"json","model":"gpt-5-nano","expected":"vuejs","actual":"vuejs","isCorrect":true,"inputTokens":15186,"outputTokens":264,"latencyMs":4128.396166999999},{"questionId":"q127","format":"toon","model":"gpt-5-nano","expected":"vuejs","actual":"vuejs","isCorrect":true,"inputTokens":8787,"outputTokens":200,"latencyMs":3041.617041999998},{"questionId":"q127","format":"csv","model":"gpt-5-nano","expected":"vuejs","actual":"vuejs","isCorrect":true,"inputTokens":8555,"outputTokens":264,"latencyMs":3009.996334000025},{"questionId":"q127","format":"xml","model":"gpt-5-nano","expected":"vuejs","actual":"vuejs","isCorrect":true,"inputTokens":17137,"outputTokens":264,"latencyMs":4328.163291999954},{"questionId":"q127","format":"yaml","model":"gpt-5-nano","expected":"vuejs","actual":"vuejs","isCorrect":tru
e,"inputTokens":13170,"outputTokens":200,"latencyMs":3640.219541999977},{"questionId":"q128","format":"json","model":"gpt-5-nano","expected":"master","actual":"master","isCorrect":true,"inputTokens":15192,"outputTokens":199,"latencyMs":3064.0731659999583},{"questionId":"q128","format":"toon","model":"gpt-5-nano","expected":"master","actual":"master","isCorrect":true,"inputTokens":8793,"outputTokens":199,"latencyMs":2740.8643750000047},{"questionId":"q128","format":"csv","model":"gpt-5-nano","expected":"master","actual":"master","isCorrect":true,"inputTokens":8561,"outputTokens":199,"latencyMs":2650.438375000027},{"questionId":"q128","format":"xml","model":"gpt-5-nano","expected":"master","actual":"master","isCorrect":true,"inputTokens":17143,"outputTokens":263,"latencyMs":3573.017332999967},{"questionId":"q128","format":"yaml","model":"gpt-5-nano","expected":"master","actual":"master","isCorrect":true,"inputTokens":13176,"outputTokens":263,"latencyMs":12294.340290999971},{"questionId":"q129","format":"json","model":"gpt-5-nano","expected":"3367","actual":"3367","isCorrect":true,"inputTokens":15186,"outputTokens":264,"latencyMs":4325.090625000012},{"questionId":"q129","format":"toon","model":"gpt-5-nano","expected":"3367","actual":"3367","isCorrect":true,"inputTokens":8787,"outputTokens":328,"latencyMs":3983.594792000018},{"questionId":"q129","format":"csv","model":"gpt-5-nano","expected":"3367","actual":"3367","isCorrect":true,"inputTokens":8555,"outputTokens":328,"latencyMs":5348.565249999985},{"questionId":"q129","format":"xml","model":"gpt-5-nano","expected":"3367","actual":"3367","isCorrect":true,"inputTokens":17137,"outputTokens":264,"latencyMs":5281.116500000004},{"questionId":"q129","format":"yaml","model":"gpt-5-nano","expected":"3367","actual":"3367","isCorrect":true,"inputTokens":13170,"outputTokens":200,"latencyMs":4126.400749999972},{"questionId":"q130","format":"json","model":"gpt-5-nano","expected":"152300","actual":"152300","isCorrect":true,"inputToke
ns":15191,"outputTokens":264,"latencyMs":6962.222958999977},{"questionId":"q130","format":"toon","model":"gpt-5-nano","expected":"152300","actual":"152300","isCorrect":true,"inputTokens":8792,"outputTokens":392,"latencyMs":3621.357874999987},{"questionId":"q130","format":"csv","model":"gpt-5-nano","expected":"152300","actual":"152300","isCorrect":true,"inputTokens":8560,"outputTokens":648,"latencyMs":4996.8137080000015},{"questionId":"q130","format":"xml","model":"gpt-5-nano","expected":"152300","actual":"152300","isCorrect":true,"inputTokens":17142,"outputTokens":264,"latencyMs":6185.885999999999},{"questionId":"q130","format":"yaml","model":"gpt-5-nano","expected":"152300","actual":"152300","isCorrect":true,"inputTokens":13175,"outputTokens":200,"latencyMs":3915.9683339999756},{"questionId":"q131","format":"json","model":"gpt-5-nano","expected":"10668","actual":"10668","isCorrect":true,"inputTokens":15190,"outputTokens":264,"latencyMs":3767.001791000017},{"questionId":"q131","format":"toon","model":"gpt-5-nano","expected":"10668","actual":"10668","isCorrect":true,"inputTokens":8791,"outputTokens":392,"latencyMs":3681.7863329999964},{"questionId":"q131","format":"csv","model":"gpt-5-nano","expected":"10668","actual":"10668","isCorrect":true,"inputTokens":8559,"outputTokens":200,"latencyMs":3517.4621669999906},{"questionId":"q131","format":"xml","model":"gpt-5-nano","expected":"10668","actual":"10668","isCorrect":true,"inputTokens":17141,"outputTokens":264,"latencyMs":3491.596833000018},{"questionId":"q131","format":"yaml","model":"gpt-5-nano","expected":"10668","actual":"10668","isCorrect":true,"inputTokens":13174,"outputTokens":200,"latencyMs":3151.8418749999837},{"questionId":"q132","format":"json","model":"gpt-5-nano","expected":"microsoft","actual":"microsoft","isCorrect":true,"inputTokens":15188,"outputTokens":136,"latencyMs":3227.4194580000476},{"questionId":"q132","format":"toon","model":"gpt-5-nano","expected":"microsoft","actual":"microsoft","isCorrect":tr
ue,"inputTokens":8789,"outputTokens":328,"latencyMs":4768.578541999974},{"questionId":"q132","format":"csv","model":"gpt-5-nano","expected":"microsoft","actual":"microsoft","isCorrect":true,"inputTokens":8557,"outputTokens":200,"latencyMs":3007.484666000004},{"questionId":"q132","format":"xml","model":"gpt-5-nano","expected":"microsoft","actual":"microsoft","isCorrect":true,"inputTokens":17139,"outputTokens":136,"latencyMs":3115.9872920000344},{"questionId":"q132","format":"yaml","model":"gpt-5-nano","expected":"microsoft","actual":"microsoft","isCorrect":true,"inputTokens":13172,"outputTokens":200,"latencyMs":3243.9305000000168},{"questionId":"q133","format":"json","model":"gpt-5-nano","expected":"main","actual":"main","isCorrect":true,"inputTokens":15194,"outputTokens":263,"latencyMs":3193.990583000006},{"questionId":"q133","format":"toon","model":"gpt-5-nano","expected":"main","actual":"main","isCorrect":true,"inputTokens":8795,"outputTokens":327,"latencyMs":4803.676875000005},{"questionId":"q133","format":"csv","model":"gpt-5-nano","expected":"main","actual":"main","isCorrect":true,"inputTokens":8563,"outputTokens":135,"latencyMs":3888.2197910000104},{"questionId":"q133","format":"xml","model":"gpt-5-nano","expected":"main","actual":"main","isCorrect":true,"inputTokens":17145,"outputTokens":327,"latencyMs":4009.9150000000373},{"questionId":"q133","format":"yaml","model":"gpt-5-nano","expected":"main","actual":"main","isCorrect":true,"inputTokens":13178,"outputTokens":135,"latencyMs":2496.18658400001},{"questionId":"q134","format":"json","model":"gpt-5-nano","expected":"2518","actual":"2518","isCorrect":true,"inputTokens":15189,"outputTokens":264,"latencyMs":3411.0290000000386},{"questionId":"q134","format":"toon","model":"gpt-5-nano","expected":"2518","actual":"2518","isCorrect":true,"inputTokens":8790,"outputTokens":392,"latencyMs":3882.483082999999},{"questionId":"q134","format":"csv","model":"gpt-5-nano","expected":"2518","actual":"2518","isCorrect":true,"inp
utTokens":8558,"outputTokens":392,"latencyMs":7564.845540999959},{"questionId":"q134","format":"xml","model":"gpt-5-nano","expected":"2518","actual":"2518","isCorrect":true,"inputTokens":17140,"outputTokens":328,"latencyMs":4407.723416999972},{"questionId":"q134","format":"yaml","model":"gpt-5-nano","expected":"2518","actual":"2518","isCorrect":true,"inputTokens":13173,"outputTokens":200,"latencyMs":2888.3398330000346},{"questionId":"q135","format":"json","model":"gpt-5-nano","expected":"103358","actual":"103358","isCorrect":true,"inputTokens":15194,"outputTokens":264,"latencyMs":2919.5192080000415},{"questionId":"q135","format":"toon","model":"gpt-5-nano","expected":"103358","actual":"103358","isCorrect":true,"inputTokens":8795,"outputTokens":904,"latencyMs":7213.861749999982},{"questionId":"q135","format":"csv","model":"gpt-5-nano","expected":"103358","actual":"103358","isCorrect":true,"inputTokens":8563,"outputTokens":392,"latencyMs":6935.310915999988},{"questionId":"q135","format":"xml","model":"gpt-5-nano","expected":"103358","actual":"103358","isCorrect":true,"inputTokens":17145,"outputTokens":392,"latencyMs":11060.892332999967},{"questionId":"q135","format":"yaml","model":"gpt-5-nano","expected":"103358","actual":"103358","isCorrect":true,"inputTokens":13178,"outputTokens":264,"latencyMs":3446.359790999966},{"questionId":"q136","format":"json","model":"gpt-5-nano","expected":"15413563","actual":"13178919","isCorrect":false,"inputTokens":15188,"outputTokens":4297,"latencyMs":31770.805915999983},{"questionId":"q136","format":"toon","model":"gpt-5-nano","expected":"15413563","actual":"15413563","isCorrect":true,"inputTokens":8789,"outputTokens":13705,"latencyMs":144553.23929200007},{"questionId":"q136","format":"csv","model":"gpt-5-nano","expected":"15413563","actual":"15413563","isCorrect":true,"inputTokens":8557,"outputTokens":4361,"latencyMs":37849.367791},{"questionId":"q136","format":"xml","model":"gpt-5-nano","expected":"15413563","actual":"11144871","isCo
rrect":false,"inputTokens":17139,"outputTokens":3529,"latencyMs":25391.98550000001},{"questionId":"q136","format":"yaml","model":"gpt-5-nano","expected":"15413563","actual":"15413563","isCorrect":true,"inputTokens":13172,"outputTokens":4361,"latencyMs":30101.182541000016},{"questionId":"q137","format":"json","model":"gpt-5-nano","expected":"100","actual":"100","isCorrect":true,"inputTokens":15185,"outputTokens":1351,"latencyMs":10567.556249999965},{"questionId":"q137","format":"toon","model":"gpt-5-nano","expected":"100","actual":"100","isCorrect":true,"inputTokens":8786,"outputTokens":199,"latencyMs":6884.175707999966},{"questionId":"q137","format":"csv","model":"gpt-5-nano","expected":"100","actual":"100","isCorrect":true,"inputTokens":8554,"outputTokens":1287,"latencyMs":10417.314499999979},{"questionId":"q137","format":"xml","model":"gpt-5-nano","expected":"100","actual":"100","isCorrect":true,"inputTokens":17136,"outputTokens":1351,"latencyMs":11468.357833000016},{"questionId":"q137","format":"yaml","model":"gpt-5-nano","expected":"100","actual":"90","isCorrect":false,"inputTokens":13169,"outputTokens":1287,"latencyMs":16153.891917},{"questionId":"q138","format":"json","model":"gpt-5-nano","expected":"154136","actual":"161831.35","isCorrect":false,"inputTokens":15187,"outputTokens":7114,"latencyMs":59323.559499999974},{"questionId":"q138","format":"toon","model":"gpt-5-nano","expected":"154136","actual":"154135.63","isCorrect":true,"inputTokens":8788,"outputTokens":10378,"latencyMs":80909.49466600001},{"questionId":"q138","format":"csv","model":"gpt-5-nano","expected":"154136","actual":"154135.63","isCorrect":true,"inputTokens":8556,"outputTokens":6154,"latencyMs":39523.52633299999},{"questionId":"q138","format":"xml","model":"gpt-5-nano","expected":"154136","actual":"176493.34","isCorrect":false,"inputTokens":17138,"outputTokens":4746,"latencyMs":33491.533999999985},{"questionId":"q138","format":"yaml","model":"gpt-5-nano","expected":"154136","actual":"148742.
2972972973","isCorrect":false,"inputTokens":13171,"outputTokens":5645,"latencyMs":35528.64245799999},{"questionId":"q139","format":"json","model":"gpt-5-nano","expected":"77","actual":"65","isCorrect":false,"inputTokens":15188,"outputTokens":2375,"latencyMs":17078.73887499998},{"questionId":"q139","format":"toon","model":"gpt-5-nano","expected":"77","actual":"77","isCorrect":true,"inputTokens":8789,"outputTokens":4679,"latencyMs":43084.070791999984},{"questionId":"q139","format":"csv","model":"gpt-5-nano","expected":"77","actual":"77","isCorrect":true,"inputTokens":8557,"outputTokens":3079,"latencyMs":32706.273708999972},{"questionId":"q139","format":"xml","model":"gpt-5-nano","expected":"77","actual":"64","isCorrect":false,"inputTokens":17139,"outputTokens":5191,"latencyMs":33100.169166000036},{"questionId":"q139","format":"yaml","model":"gpt-5-nano","expected":"77","actual":"64","isCorrect":false,"inputTokens":15068,"outputTokens":7120,"latencyMs":64101.707042000024},{"questionId":"q140","format":"json","model":"gpt-5-nano","expected":"37","actual":"37","isCorrect":true,"inputTokens":15188,"outputTokens":2759,"latencyMs":19231.35716700001},{"questionId":"q140","format":"toon","model":"gpt-5-nano","expected":"37","actual":"37","isCorrect":true,"inputTokens":8789,"outputTokens":2055,"latencyMs":14256.53354100004},{"questionId":"q140","format":"csv","model":"gpt-5-nano","expected":"37","actual":"37","isCorrect":true,"inputTokens":8557,"outputTokens":3399,"latencyMs":22249.686209000007},{"questionId":"q140","format":"xml","model":"gpt-5-nano","expected":"37","actual":"37","isCorrect":true,"inputTokens":17139,"outputTokens":2631,"latencyMs":17393.795042000012},{"questionId":"q140","format":"yaml","model":"gpt-5-nano","expected":"37","actual":"37","isCorrect":true,"inputTokens":13172,"outputTokens":2695,"latencyMs":29277.75575000001},{"questionId":"q141","format":"json","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":15188,"outputTokens
":1991,"latencyMs":21917.801875000005},{"questionId":"q141","format":"toon","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":8789,"outputTokens":3015,"latencyMs":31398.578958},{"questionId":"q141","format":"csv","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":8557,"outputTokens":2247,"latencyMs":28072.916000000027},{"questionId":"q141","format":"xml","model":"gpt-5-nano","expected":"16","actual":"14","isCorrect":false,"inputTokens":17139,"outputTokens":1415,"latencyMs":10279.029707999958},{"questionId":"q141","format":"yaml","model":"gpt-5-nano","expected":"16","actual":"16","isCorrect":true,"inputTokens":13172,"outputTokens":2119,"latencyMs":13705.472291999962},{"questionId":"q142","format":"json","model":"gpt-5-nano","expected":"49","actual":"37","isCorrect":false,"inputTokens":15188,"outputTokens":2247,"latencyMs":17690.96520799998},{"questionId":"q142","format":"toon","model":"gpt-5-nano","expected":"49","actual":"41","isCorrect":false,"inputTokens":8789,"outputTokens":2247,"latencyMs":13964.716707999993},{"questionId":"q142","format":"csv","model":"gpt-5-nano","expected":"49","actual":"49","isCorrect":true,"inputTokens":8557,"outputTokens":4359,"latencyMs":24992.431709000026},{"questionId":"q142","format":"xml","model":"gpt-5-nano","expected":"49","actual":"38","isCorrect":false,"inputTokens":17139,"outputTokens":1863,"latencyMs":16201.765582999971},{"questionId":"q142","format":"yaml","model":"gpt-5-nano","expected":"49","actual":"49","isCorrect":true,"inputTokens":13172,"outputTokens":3463,"latencyMs":24024.559666000016},{"questionId":"q143","format":"json","model":"gpt-5-nano","expected":"23","actual":"22","isCorrect":false,"inputTokens":15188,"outputTokens":3271,"latencyMs":23265.205874999985},{"questionId":"q143","format":"toon","model":"gpt-5-nano","expected":"23","actual":"23","isCorrect":true,"inputTokens":8789,"outputTokens":4807,"latencyMs":31048.19095899997},{"questionId":"q143","form
at":"csv","model":"gpt-5-nano","expected":"23","actual":"23","isCorrect":true,"inputTokens":8557,"outputTokens":3911,"latencyMs":28614.549457999994},{"questionId":"q143","format":"xml","model":"gpt-5-nano","expected":"23","actual":"23","isCorrect":true,"inputTokens":17139,"outputTokens":7367,"latencyMs":50101.432875},{"questionId":"q143","format":"yaml","model":"gpt-5-nano","expected":"23","actual":"23","isCorrect":true,"inputTokens":13172,"outputTokens":6151,"latencyMs":37457.83145900001},{"questionId":"q144","format":"json","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":15188,"outputTokens":5639,"latencyMs":43804.95516700001},{"questionId":"q144","format":"toon","model":"gpt-5-nano","expected":"11","actual":"9","isCorrect":false,"inputTokens":8789,"outputTokens":2055,"latencyMs":14880.539583000005},{"questionId":"q144","format":"csv","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":8557,"outputTokens":3399,"latencyMs":35436.010875000036},{"questionId":"q144","format":"xml","model":"gpt-5-nano","expected":"11","actual":"10","isCorrect":false,"inputTokens":17139,"outputTokens":1863,"latencyMs":15287.124416999985},{"questionId":"q144","format":"yaml","model":"gpt-5-nano","expected":"11","actual":"11","isCorrect":true,"inputTokens":13172,"outputTokens":4359,"latencyMs":45950.23566699994},{"questionId":"q145","format":"json","model":"gpt-5-nano","expected":"19","actual":"19","isCorrect":true,"inputTokens":15188,"outputTokens":4551,"latencyMs":41888.297667000035},{"questionId":"q145","format":"toon","model":"gpt-5-nano","expected":"19","actual":"19","isCorrect":true,"inputTokens":8789,"outputTokens":10183,"latencyMs":62461.11354200001},{"questionId":"q145","format":"csv","model":"gpt-5-nano","expected":"19","actual":"19","isCorrect":true,"inputTokens":8557,"outputTokens":3143,"latencyMs":33699.921250000014},{"questionId":"q145","format":"xml","model":"gpt-5-nano","expected":"19","actual":"19","isCorrect
":true,"inputTokens":17139,"outputTokens":8711,"latencyMs":75960.46604100004},{"questionId":"q145","format":"yaml","model":"gpt-5-nano","expected":"19","actual":"20","isCorrect":false,"inputTokens":13172,"outputTokens":6215,"latencyMs":40243.47724999994},{"questionId":"q146","format":"json","model":"gpt-5-nano","expected":"4","actual":"4","isCorrect":true,"inputTokens":15188,"outputTokens":3527,"latencyMs":20281.348042000027},{"questionId":"q146","format":"toon","model":"gpt-5-nano","expected":"4","actual":"4","isCorrect":true,"inputTokens":8789,"outputTokens":3655,"latencyMs":38010.426833000034},{"questionId":"q146","format":"csv","model":"gpt-5-nano","expected":"4","actual":"4","isCorrect":true,"inputTokens":8557,"outputTokens":10183,"latencyMs":67830.84658300004},{"questionId":"q146","format":"xml","model":"gpt-5-nano","expected":"4","actual":"4","isCorrect":true,"inputTokens":17139,"outputTokens":2183,"latencyMs":14748.326416999975},{"questionId":"q146","format":"yaml","model":"gpt-5-nano","expected":"4","actual":"4","isCorrect":true,"inputTokens":13172,"outputTokens":3399,"latencyMs":31304.879791999992},{"questionId":"q147","format":"json","model":"gpt-5-nano","expected":"41","actual":"35","isCorrect":false,"inputTokens":15189,"outputTokens":2439,"latencyMs":19051.980291999993},{"questionId":"q147","format":"toon","model":"gpt-5-nano","expected":"41","actual":"37","isCorrect":false,"inputTokens":8790,"outputTokens":3527,"latencyMs":22412.309667000023},{"questionId":"q147","format":"csv","model":"gpt-5-nano","expected":"41","actual":"40","isCorrect":false,"inputTokens":8558,"outputTokens":3015,"latencyMs":30923.15724999999},{"questionId":"q147","format":"xml","model":"gpt-5-nano","expected":"41","actual":"54","isCorrect":false,"inputTokens":17140,"outputTokens":8519,"latencyMs":85140.38},{"questionId":"q147","format":"yaml","model":"gpt-5-nano","expected":"41","actual":"67","isCorrect":false,"inputTokens":13173,"outputTokens":10119,"latencyMs":54918.44337499997}
,{"questionId":"q148","format":"json","model":"gpt-5-nano","expected":"53","actual":"57","isCorrect":false,"inputTokens":15189,"outputTokens":2631,"latencyMs":14920.208584000007},{"questionId":"q148","format":"toon","model":"gpt-5-nano","expected":"53","actual":"53","isCorrect":true,"inputTokens":8790,"outputTokens":2567,"latencyMs":15358.285791000002},{"questionId":"q148","format":"csv","model":"gpt-5-nano","expected":"53","actual":"63","isCorrect":false,"inputTokens":8558,"outputTokens":2567,"latencyMs":16075.23654199997},{"questionId":"q148","format":"xml","model":"gpt-5-nano","expected":"53","actual":"48","isCorrect":false,"inputTokens":17140,"outputTokens":5063,"latencyMs":27688.752749999985},{"questionId":"q148","format":"yaml","model":"gpt-5-nano","expected":"53","actual":"53","isCorrect":true,"inputTokens":13173,"outputTokens":6151,"latencyMs":42171.99545899994},{"questionId":"q149","format":"json","model":"gpt-5-nano","expected":"57","actual":"57","isCorrect":true,"inputTokens":15195,"outputTokens":16391,"latencyMs":149480.339417},{"questionId":"q149","format":"toon","model":"gpt-5-nano","expected":"57","actual":"57","isCorrect":true,"inputTokens":8796,"outputTokens":10503,"latencyMs":72016.71129100001},{"questionId":"q149","format":"csv","model":"gpt-5-nano","expected":"57","actual":"57","isCorrect":true,"inputTokens":8564,"outputTokens":4807,"latencyMs":44379.204958999995},{"questionId":"q149","format":"xml","model":"gpt-5-nano","expected":"57","actual":"46","isCorrect":false,"inputTokens":17146,"outputTokens":9863,"latencyMs":54558.46879099996},{"questionId":"q149","format":"yaml","model":"gpt-5-nano","expected":"57","actual":"56","isCorrect":false,"inputTokens":13179,"outputTokens":12167,"latencyMs":66911.90370799997},{"questionId":"q150","format":"json","model":"gpt-5-nano","expected":"43","actual":"44","isCorrect":false,"inputTokens":15195,"outputTokens":3399,"latencyMs":21145.205332999933},{"questionId":"q150","format":"toon","model":"gpt-5-nano","ex
pected":"43","actual":"43","isCorrect":true,"inputTokens":8796,"outputTokens":7623,"latencyMs":47413.88270800002},{"questionId":"q150","format":"csv","model":"gpt-5-nano","expected":"43","actual":"43","isCorrect":true,"inputTokens":8564,"outputTokens":3527,"latencyMs":19383.114291999955},{"questionId":"q150","format":"xml","model":"gpt-5-nano","expected":"43","actual":"37","isCorrect":false,"inputTokens":17146,"outputTokens":3655,"latencyMs":21911.847582999966},{"questionId":"q150","format":"yaml","model":"gpt-5-nano","expected":"43","actual":"40","isCorrect":false,"inputTokens":13179,"outputTokens":3015,"latencyMs":17307.11975000007},{"questionId":"q151","format":"json","model":"gpt-5-nano","expected":"25","actual":"30","isCorrect":false,"inputTokens":15195,"outputTokens":6471,"latencyMs":46284.17358299997},{"questionId":"q151","format":"toon","model":"gpt-5-nano","expected":"25","actual":"25","isCorrect":true,"inputTokens":8796,"outputTokens":5127,"latencyMs":30523.704790999996},{"questionId":"q151","format":"csv","model":"gpt-5-nano","expected":"25","actual":"25","isCorrect":true,"inputTokens":8564,"outputTokens":3975,"latencyMs":22902.60674999992},{"questionId":"q151","format":"xml","model":"gpt-5-nano","expected":"25","actual":"23","isCorrect":false,"inputTokens":17146,"outputTokens":7175,"latencyMs":40170.364291999955},{"questionId":"q151","format":"yaml","model":"gpt-5-nano","expected":"25","actual":"25","isCorrect":true,"inputTokens":13179,"outputTokens":3783,"latencyMs":24397.443499999936},{"questionId":"q152","format":"json","model":"gpt-5-nano","expected":"6","actual":"6","isCorrect":true,"inputTokens":15195,"outputTokens":1927,"latencyMs":11596.635458000004},{"questionId":"q152","format":"toon","model":"gpt-5-nano","expected":"6","actual":"6","isCorrect":true,"inputTokens":8796,"outputTokens":3591,"latencyMs":34839.70533299993},{"questionId":"q152","format":"csv","model":"gpt-5-nano","expected":"6","actual":"6","isCorrect":true,"inputTokens":8564,"output
Tokens":2631,"latencyMs":16677.432333000004},{"questionId":"q152","format":"xml","model":"gpt-5-nano","expected":"6","actual":"6","isCorrect":true,"inputTokens":17146,"outputTokens":1863,"latencyMs":15458.170166000025},{"questionId":"q152","format":"yaml","model":"gpt-5-nano","expected":"6","actual":"6","isCorrect":true,"inputTokens":13179,"outputTokens":1863,"latencyMs":11414.568250000011},{"questionId":"q153","format":"json","model":"gpt-5-nano","expected":"6","actual":"6","isCorrect":true,"inputTokens":15195,"outputTokens":1543,"latencyMs":9937.089749999926},{"questionId":"q153","format":"toon","model":"gpt-5-nano","expected":"6","actual":"6","isCorrect":true,"inputTokens":8796,"outputTokens":5383,"latencyMs":29585.493583000032},{"questionId":"q153","format":"csv","model":"gpt-5-nano","expected":"6","actual":"6","isCorrect":true,"inputTokens":8564,"outputTokens":4679,"latencyMs":57854.73316600011},{"questionId":"q153","format":"xml","model":"gpt-5-nano","expected":"6","actual":"5","isCorrect":false,"inputTokens":17146,"outputTokens":3335,"latencyMs":24867.77470800001},{"questionId":"q153","format":"yaml","model":"gpt-5-nano","expected":"6","actual":"6","isCorrect":true,"inputTokens":13179,"outputTokens":2951,"latencyMs":18378.52354199998},{"questionId":"q154","format":"json","model":"gpt-5-nano","expected":"1","actual":"1","isCorrect":true,"inputTokens":15195,"outputTokens":2375,"latencyMs":13409.657957999967},{"questionId":"q154","format":"toon","model":"gpt-5-nano","expected":"1","actual":"1","isCorrect":true,"inputTokens":8796,"outputTokens":3527,"latencyMs":37057.010750000016},{"questionId":"q154","format":"csv","model":"gpt-5-nano","expected":"1","actual":"1","isCorrect":true,"inputTokens":8564,"outputTokens":2695,"latencyMs":19144.693792000064},{"questionId":"q154","format":"xml","model":"gpt-5-nano","expected":"1","actual":"1","isCorrect":true,"inputTokens":17146,"outputTokens":2375,"latencyMs":13506.978166999994},{"questionId":"q154","format":"yaml","mode
l":"gpt-5-nano","expected":"1","actual":"1","isCorrect":true,"inputTokens":13179,"outputTokens":2055,"latencyMs":16356.974416999961}]
\ No newline at end of file
diff --git a/benchmarks/results/accuracy/raw-results.json b/benchmarks/results/accuracy/raw-results.json
deleted file mode 100644
index 45e5806..0000000
--- a/benchmarks/results/accuracy/raw-results.json
+++ /dev/null
@@ -1,26237 +0,0 @@
-[
- {
- "questionId": "q1",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "56176",
- "actual": "56176",
- "isCorrect": true,
- "inputTokens": 6390,
- "outputTokens": 136,
- "latencyMs": 1973.9505419999998
- },
- {
- "questionId": "q1",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "56176",
- "actual": "56176",
- "isCorrect": true,
- "inputTokens": 7870,
- "outputTokens": 6,
- "latencyMs": 1337.454
- },
- {
- "questionId": "q1",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "56176",
- "actual": "56176",
- "isCorrect": true,
- "inputTokens": 7908,
- "outputTokens": 5,
- "latencyMs": 2219.8078330000003
- },
- {
- "questionId": "q1",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "56176",
- "actual": "56176",
- "isCorrect": true,
- "inputTokens": 2527,
- "outputTokens": 72,
- "latencyMs": 2159.820958
- },
- {
- "questionId": "q1",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "56176",
- "actual": "56176",
- "isCorrect": true,
- "inputTokens": 2982,
- "outputTokens": 6,
- "latencyMs": 1456.8202079999999
- },
- {
- "questionId": "q1",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "56176",
- "actual": "56176",
- "isCorrect": true,
- "inputTokens": 3317,
- "outputTokens": 5,
- "latencyMs": 2502.1313750000004
- },
- {
- "questionId": "q1",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "56176",
- "actual": "56176",
- "isCorrect": true,
- "inputTokens": 2381,
- "outputTokens": 72,
- "latencyMs": 2189.1171249999998
- },
- {
- "questionId": "q1",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "56176",
- "actual": "56176",
- "isCorrect": true,
- "inputTokens": 2856,
- "outputTokens": 6,
- "latencyMs": 1251.8321250000001
- },
- {
- "questionId": "q1",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "56176",
- "actual": "56176",
- "isCorrect": true,
- "inputTokens": 3191,
- "outputTokens": 5,
- "latencyMs": 2795.7488749999998
- },
- {
- "questionId": "q1",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "56176",
- "actual": "56176",
- "isCorrect": true,
- "inputTokens": 7357,
- "outputTokens": 136,
- "latencyMs": 13798.979167
- },
- {
- "questionId": "q1",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "56176",
- "actual": "56176",
- "isCorrect": true,
- "inputTokens": 9360,
- "outputTokens": 6,
- "latencyMs": 1484.293458
- },
- {
- "questionId": "q1",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "56176",
- "actual": "56176",
- "isCorrect": true,
- "inputTokens": 9097,
- "outputTokens": 5,
- "latencyMs": 2323.462083
- },
- {
- "questionId": "q1",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "56176",
- "actual": "56176",
- "isCorrect": true,
- "inputTokens": 5012,
- "outputTokens": 8,
- "latencyMs": 2319.068875
- },
- {
- "questionId": "q1",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "56176",
- "actual": "56176",
- "isCorrect": true,
- "inputTokens": 5760,
- "outputTokens": 6,
- "latencyMs": 1252.173292
- },
- {
- "questionId": "q1",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "56176",
- "actual": "56176",
- "isCorrect": true,
- "inputTokens": 5743,
- "outputTokens": 5,
- "latencyMs": 1856.926
- },
- {
- "questionId": "q2",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 6390,
- "outputTokens": 71,
- "latencyMs": 2500.574542
- },
- {
- "questionId": "q2",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7869,
- "outputTokens": 4,
- "latencyMs": 1249.101917
- },
- {
- "questionId": "q2",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7908,
- "outputTokens": 1,
- "latencyMs": 1744.0090420000001
- },
- {
- "questionId": "q2",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2527,
- "outputTokens": 71,
- "latencyMs": 2319.50975
- },
- {
- "questionId": "q2",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2981,
- "outputTokens": 4,
- "latencyMs": 1258.086833
- },
- {
- "questionId": "q2",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3317,
- "outputTokens": 1,
- "latencyMs": 1847.8221249999997
- },
- {
- "questionId": "q2",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2381,
- "outputTokens": 71,
- "latencyMs": 4817.745874999999
- },
- {
- "questionId": "q2",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2855,
- "outputTokens": 4,
- "latencyMs": 1024.5234999999998
- },
- {
- "questionId": "q2",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3191,
- "outputTokens": 1,
- "latencyMs": 1336.0151660000001
- },
- {
- "questionId": "q2",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7357,
- "outputTokens": 135,
- "latencyMs": 4109.140791
- },
- {
- "questionId": "q2",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9359,
- "outputTokens": 4,
- "latencyMs": 1267.7541249999995
- },
- {
- "questionId": "q2",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9097,
- "outputTokens": 1,
- "latencyMs": 1808.7597920000007
- },
- {
- "questionId": "q2",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5012,
- "outputTokens": 71,
- "latencyMs": 4865.839082999999
- },
- {
- "questionId": "q2",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5759,
- "outputTokens": 4,
- "latencyMs": 1018.2179999999998
- },
- {
- "questionId": "q2",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5743,
- "outputTokens": 1,
- "latencyMs": 2534.4780839999994
- },
- {
- "questionId": "q3",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "lorenza.kunze@yahoo.com",
- "actual": "lorenza.kunze@yahoo.com",
- "isCorrect": true,
- "inputTokens": 6392,
- "outputTokens": 204,
- "latencyMs": 3778.0985
- },
- {
- "questionId": "q3",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "lorenza.kunze@yahoo.com",
- "actual": "lorenza.kunze@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7874,
- "outputTokens": 12,
- "latencyMs": 1190.655541
- },
- {
- "questionId": "q3",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "lorenza.kunze@yahoo.com",
- "actual": "lorenza.kunze@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7911,
- "outputTokens": 10,
- "latencyMs": 1595.469916
- },
- {
- "questionId": "q3",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "lorenza.kunze@yahoo.com",
- "actual": "lorenza.kunze@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2529,
- "outputTokens": 76,
- "latencyMs": 4163.945208000001
- },
- {
- "questionId": "q3",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "lorenza.kunze@yahoo.com",
- "actual": "lorenza.kunze@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2986,
- "outputTokens": 12,
- "latencyMs": 892.92875
- },
- {
- "questionId": "q3",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "lorenza.kunze@yahoo.com",
- "actual": "lorenza.kunze@yahoo.com",
- "isCorrect": true,
- "inputTokens": 3320,
- "outputTokens": 10,
- "latencyMs": 1780.4322919999995
- },
- {
- "questionId": "q3",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "lorenza.kunze@yahoo.com",
- "actual": "lorenza.kunze@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2383,
- "outputTokens": 76,
- "latencyMs": 3440.4715000000006
- },
- {
- "questionId": "q3",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "lorenza.kunze@yahoo.com",
- "actual": "lorenza.kunze@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2860,
- "outputTokens": 12,
- "latencyMs": 1312.3002079999997
- },
- {
- "questionId": "q3",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "lorenza.kunze@yahoo.com",
- "actual": "lorenza.kunze@yahoo.com",
- "isCorrect": true,
- "inputTokens": 3194,
- "outputTokens": 10,
- "latencyMs": 1560.3538330000001
- },
- {
- "questionId": "q3",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "lorenza.kunze@yahoo.com",
- "actual": "lorenza.kunze@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7359,
- "outputTokens": 76,
- "latencyMs": 3440.5599999999995
- },
- {
- "questionId": "q3",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "lorenza.kunze@yahoo.com",
- "actual": "lorenza.kunze@yahoo.com",
- "isCorrect": true,
- "inputTokens": 9364,
- "outputTokens": 12,
- "latencyMs": 1354.2122089999993
- },
- {
- "questionId": "q3",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "lorenza.kunze@yahoo.com",
- "actual": "lorenza.kunze@yahoo.com",
- "isCorrect": true,
- "inputTokens": 9100,
- "outputTokens": 10,
- "latencyMs": 1389.2405829999998
- },
- {
- "questionId": "q3",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "lorenza.kunze@yahoo.com",
- "actual": "lorenza.kunze@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5014,
- "outputTokens": 76,
- "latencyMs": 2048.7699159999993
- },
- {
- "questionId": "q3",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "lorenza.kunze@yahoo.com",
- "actual": "lorenza.kunze@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5764,
- "outputTokens": 12,
- "latencyMs": 1123.4172500000004
- },
- {
- "questionId": "q3",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "lorenza.kunze@yahoo.com",
- "actual": "lorenza.kunze@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5746,
- "outputTokens": 10,
- "latencyMs": 1638.1436670000003
- },
- {
- "questionId": "q4",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "117381",
- "actual": "117381",
- "isCorrect": true,
- "inputTokens": 6390,
- "outputTokens": 72,
- "latencyMs": 2966.8363329999993
- },
- {
- "questionId": "q4",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "117381",
- "actual": "117381",
- "isCorrect": true,
- "inputTokens": 7870,
- "outputTokens": 6,
- "latencyMs": 1323.5372910000006
- },
- {
- "questionId": "q4",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "117381",
- "actual": "117381",
- "isCorrect": true,
- "inputTokens": 7909,
- "outputTokens": 6,
- "latencyMs": 1860.8958750000002
- },
- {
- "questionId": "q4",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "117381",
- "actual": "117381",
- "isCorrect": true,
- "inputTokens": 2527,
- "outputTokens": 136,
- "latencyMs": 6895.250208000001
- },
- {
- "questionId": "q4",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "117381",
- "actual": "117381",
- "isCorrect": true,
- "inputTokens": 2982,
- "outputTokens": 6,
- "latencyMs": 1020.296542
- },
- {
- "questionId": "q4",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "117381",
- "actual": "117381",
- "isCorrect": true,
- "inputTokens": 3318,
- "outputTokens": 6,
- "latencyMs": 2481.260875
- },
- {
- "questionId": "q4",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "117381",
- "actual": "117381",
- "isCorrect": true,
- "inputTokens": 2381,
- "outputTokens": 200,
- "latencyMs": 2689.2119999999995
- },
- {
- "questionId": "q4",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "117381",
- "actual": "117381",
- "isCorrect": true,
- "inputTokens": 2856,
- "outputTokens": 6,
- "latencyMs": 1194.3670409999995
- },
- {
- "questionId": "q4",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "117381",
- "actual": "117381",
- "isCorrect": true,
- "inputTokens": 3192,
- "outputTokens": 6,
- "latencyMs": 1743.3429579999993
- },
- {
- "questionId": "q4",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "117381",
- "actual": "117381",
- "isCorrect": true,
- "inputTokens": 7357,
- "outputTokens": 72,
- "latencyMs": 5788.955082999999
- },
- {
- "questionId": "q4",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "117381",
- "actual": "117381",
- "isCorrect": true,
- "inputTokens": 9360,
- "outputTokens": 6,
- "latencyMs": 1222.5617920000004
- },
- {
- "questionId": "q4",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "117381",
- "actual": "117381",
- "isCorrect": true,
- "inputTokens": 9098,
- "outputTokens": 6,
- "latencyMs": 1692.9171670000014
- },
- {
- "questionId": "q4",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "117381",
- "actual": "117381",
- "isCorrect": true,
- "inputTokens": 5012,
- "outputTokens": 72,
- "latencyMs": 6426.231709
- },
- {
- "questionId": "q4",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "117381",
- "actual": "117381",
- "isCorrect": true,
- "inputTokens": 5760,
- "outputTokens": 6,
- "latencyMs": 1159.4893339999999
- },
- {
- "questionId": "q4",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "117381",
- "actual": "117381",
- "isCorrect": true,
- "inputTokens": 5744,
- "outputTokens": 6,
- "latencyMs": 2415.9878329999992
- },
- {
- "questionId": "q5",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 6389,
- "outputTokens": 71,
- "latencyMs": 2950.774625
- },
- {
- "questionId": "q5",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7868,
- "outputTokens": 4,
- "latencyMs": 1003.6548750000002
- },
- {
- "questionId": "q5",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7907,
- "outputTokens": 1,
- "latencyMs": 1209.7468329999992
- },
- {
- "questionId": "q5",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2526,
- "outputTokens": 71,
- "latencyMs": 3026.993291999999
- },
- {
- "questionId": "q5",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2980,
- "outputTokens": 4,
- "latencyMs": 981.8320000000003
- },
- {
- "questionId": "q5",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3316,
- "outputTokens": 1,
- "latencyMs": 2011.3852089999982
- },
- {
- "questionId": "q5",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2380,
- "outputTokens": 135,
- "latencyMs": 4215.294709
- },
- {
- "questionId": "q5",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2854,
- "outputTokens": 4,
- "latencyMs": 906.2993340000012
- },
- {
- "questionId": "q5",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3190,
- "outputTokens": 1,
- "latencyMs": 1666.1483749999989
- },
- {
- "questionId": "q5",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7356,
- "outputTokens": 135,
- "latencyMs": 4311.166333000001
- },
- {
- "questionId": "q5",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9358,
- "outputTokens": 4,
- "latencyMs": 1072.923917
- },
- {
- "questionId": "q5",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9096,
- "outputTokens": 1,
- "latencyMs": 2526.938041999998
- },
- {
- "questionId": "q5",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5011,
- "outputTokens": 135,
- "latencyMs": 3970.2666659999995
- },
- {
- "questionId": "q5",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5758,
- "outputTokens": 4,
- "latencyMs": 1364.8737079999992
- },
- {
- "questionId": "q5",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5742,
- "outputTokens": 1,
- "latencyMs": 3125.6591660000013
- },
- {
- "questionId": "q6",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "jayda60@hotmail.com",
- "actual": "jayda60@hotmail.com",
- "isCorrect": true,
- "inputTokens": 6390,
- "outputTokens": 139,
- "latencyMs": 3116.8453340000015
- },
- {
- "questionId": "q6",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "jayda60@hotmail.com",
- "actual": "jayda60@hotmail.com",
- "isCorrect": true,
- "inputTokens": 7871,
- "outputTokens": 11,
- "latencyMs": 1065.8984999999993
- },
- {
- "questionId": "q6",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "jayda60@hotmail.com",
- "actual": "jayda60@hotmail.com",
- "isCorrect": true,
- "inputTokens": 7908,
- "outputTokens": 8,
- "latencyMs": 2190.0096250000024
- },
- {
- "questionId": "q6",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "jayda60@hotmail.com",
- "actual": "jayda60@hotmail.com",
- "isCorrect": true,
- "inputTokens": 2527,
- "outputTokens": 75,
- "latencyMs": 2661.1630829999995
- },
- {
- "questionId": "q6",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "jayda60@hotmail.com",
- "actual": "jayda60@hotmail.com",
- "isCorrect": true,
- "inputTokens": 2983,
- "outputTokens": 11,
- "latencyMs": 990.5193749999999
- },
- {
- "questionId": "q6",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "jayda60@hotmail.com",
- "actual": "jayda60@hotmail.com",
- "isCorrect": true,
- "inputTokens": 3317,
- "outputTokens": 8,
- "latencyMs": 1937.4020420000015
- },
- {
- "questionId": "q6",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "jayda60@hotmail.com",
- "actual": "jayda60@hotmail.com",
- "isCorrect": true,
- "inputTokens": 2381,
- "outputTokens": 139,
- "latencyMs": 3740.6538750000036
- },
- {
- "questionId": "q6",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "jayda60@hotmail.com",
- "actual": "jayda60@hotmail.com",
- "isCorrect": true,
- "inputTokens": 2857,
- "outputTokens": 11,
- "latencyMs": 1033.1626250000008
- },
- {
- "questionId": "q6",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "jayda60@hotmail.com",
- "actual": "jayda60@hotmail.com",
- "isCorrect": true,
- "inputTokens": 3191,
- "outputTokens": 8,
- "latencyMs": 1733.0828340000007
- },
- {
- "questionId": "q6",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "jayda60@hotmail.com",
- "actual": "jayda60@hotmail.com",
- "isCorrect": true,
- "inputTokens": 7357,
- "outputTokens": 139,
- "latencyMs": 3042.367707999998
- },
- {
- "questionId": "q6",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "jayda60@hotmail.com",
- "actual": "jayda60@hotmail.com",
- "isCorrect": true,
- "inputTokens": 9361,
- "outputTokens": 11,
- "latencyMs": 1472.3534580000014
- },
- {
- "questionId": "q6",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "jayda60@hotmail.com",
- "actual": "jayda60@hotmail.com",
- "isCorrect": true,
- "inputTokens": 9097,
- "outputTokens": 8,
- "latencyMs": 1953.7035419999993
- },
- {
- "questionId": "q6",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "jayda60@hotmail.com",
- "actual": "jayda60@hotmail.com",
- "isCorrect": true,
- "inputTokens": 5012,
- "outputTokens": 75,
- "latencyMs": 2179.8505829999995
- },
- {
- "questionId": "q6",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "jayda60@hotmail.com",
- "actual": "jayda60@hotmail.com",
- "isCorrect": true,
- "inputTokens": 5761,
- "outputTokens": 11,
- "latencyMs": 1714.971625000002
- },
- {
- "questionId": "q6",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "jayda60@hotmail.com",
- "actual": "jayda60@hotmail.com",
- "isCorrect": true,
- "inputTokens": 5743,
- "outputTokens": 8,
- "latencyMs": 2170.373334
- },
- {
- "questionId": "q7",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "92971",
- "actual": "92971",
- "isCorrect": true,
- "inputTokens": 6390,
- "outputTokens": 72,
- "latencyMs": 3005.6769590000004
- },
- {
- "questionId": "q7",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "92971",
- "actual": "92971",
- "isCorrect": true,
- "inputTokens": 7870,
- "outputTokens": 6,
- "latencyMs": 2070.191666999999
- },
- {
- "questionId": "q7",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "92971",
- "actual": "92971",
- "isCorrect": true,
- "inputTokens": 7907,
- "outputTokens": 5,
- "latencyMs": 1338.8482500000027
- },
- {
- "questionId": "q7",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "92971",
- "actual": "92971",
- "isCorrect": true,
- "inputTokens": 2527,
- "outputTokens": 136,
- "latencyMs": 2615.7999579999996
- },
- {
- "questionId": "q7",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "92971",
- "actual": "92971",
- "isCorrect": true,
- "inputTokens": 2982,
- "outputTokens": 6,
- "latencyMs": 1124.058917000002
- },
- {
- "questionId": "q7",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "92971",
- "actual": "92971",
- "isCorrect": true,
- "inputTokens": 3316,
- "outputTokens": 5,
- "latencyMs": 2317.5837079999983
- },
- {
- "questionId": "q7",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "92971",
- "actual": "92971",
- "isCorrect": true,
- "inputTokens": 2381,
- "outputTokens": 72,
- "latencyMs": 9505.310291999998
- },
- {
- "questionId": "q7",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "92971",
- "actual": "92971",
- "isCorrect": true,
- "inputTokens": 2856,
- "outputTokens": 6,
- "latencyMs": 895.9319159999977
- },
- {
- "questionId": "q7",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "92971",
- "actual": "92971",
- "isCorrect": true,
- "inputTokens": 3190,
- "outputTokens": 5,
- "latencyMs": 1462.6939160000002
- },
- {
- "questionId": "q7",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "92971",
- "actual": "92971",
- "isCorrect": true,
- "inputTokens": 7357,
- "outputTokens": 136,
- "latencyMs": 2529.6767499999987
- },
- {
- "questionId": "q7",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "92971",
- "actual": "92971",
- "isCorrect": true,
- "inputTokens": 9360,
- "outputTokens": 6,
- "latencyMs": 1144.4980419999993
- },
- {
- "questionId": "q7",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "92971",
- "actual": "92971",
- "isCorrect": true,
- "inputTokens": 9096,
- "outputTokens": 5,
- "latencyMs": 3182.1694160000006
- },
- {
- "questionId": "q7",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "92971",
- "actual": "92971",
- "isCorrect": true,
- "inputTokens": 5012,
- "outputTokens": 72,
- "latencyMs": 2789.477584
- },
- {
- "questionId": "q7",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "92971",
- "actual": "92971",
- "isCorrect": true,
- "inputTokens": 5760,
- "outputTokens": 6,
- "latencyMs": 1023.4829170000012
- },
- {
- "questionId": "q7",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "92971",
- "actual": "92971",
- "isCorrect": true,
- "inputTokens": 5742,
- "outputTokens": 5,
- "latencyMs": 3741.309666000001
- },
- {
- "questionId": "q8",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 6390,
- "outputTokens": 199,
- "latencyMs": 2646.0443330000016
- },
- {
- "questionId": "q8",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7871,
- "outputTokens": 4,
- "latencyMs": 1147.7947499999973
- },
- {
- "questionId": "q8",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7909,
- "outputTokens": 1,
- "latencyMs": 2658.0985
- },
- {
- "questionId": "q8",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2527,
- "outputTokens": 71,
- "latencyMs": 3748.428749999999
- },
- {
- "questionId": "q8",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2983,
- "outputTokens": 4,
- "latencyMs": 876.6897919999974
- },
- {
- "questionId": "q8",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3318,
- "outputTokens": 1,
- "latencyMs": 3812.920249999999
- },
- {
- "questionId": "q8",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2381,
- "outputTokens": 71,
- "latencyMs": 6820.9698750000025
- },
- {
- "questionId": "q8",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2857,
- "outputTokens": 4,
- "latencyMs": 997.5997500000012
- },
- {
- "questionId": "q8",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3192,
- "outputTokens": 1,
- "latencyMs": 1829.7533750000002
- },
- {
- "questionId": "q8",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7357,
- "outputTokens": 135,
- "latencyMs": 6256.235125000003
- },
- {
- "questionId": "q8",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9361,
- "outputTokens": 4,
- "latencyMs": 1280.0348330000015
- },
- {
- "questionId": "q8",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9098,
- "outputTokens": 1,
- "latencyMs": 3024.0259170000027
- },
- {
- "questionId": "q8",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5012,
- "outputTokens": 71,
- "latencyMs": 3522.8339579999993
- },
- {
- "questionId": "q8",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5761,
- "outputTokens": 4,
- "latencyMs": 1134.9532080000026
- },
- {
- "questionId": "q8",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5744,
- "outputTokens": 1,
- "latencyMs": 3095.1540000000023
- },
- {
- "questionId": "q9",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "terrance.hansen@yahoo.com",
- "actual": "terrance.hansen@yahoo.com",
- "isCorrect": true,
- "inputTokens": 6392,
- "outputTokens": 140,
- "latencyMs": 2087.950582999998
- },
- {
- "questionId": "q9",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "terrance.hansen@yahoo.com",
- "actual": "terrance.hansen@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7871,
- "outputTokens": 11,
- "latencyMs": 1115.425166000001
- },
- {
- "questionId": "q9",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "terrance.hansen@yahoo.com",
- "actual": "terrance.hansen@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7910,
- "outputTokens": 9,
- "latencyMs": 1841.3965420000022
- },
- {
- "questionId": "q9",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "terrance.hansen@yahoo.com",
- "actual": "terrance.hansen@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2529,
- "outputTokens": 204,
- "latencyMs": 4039.2035830000023
- },
- {
- "questionId": "q9",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "terrance.hansen@yahoo.com",
- "actual": "terrance.hansen@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2983,
- "outputTokens": 11,
- "latencyMs": 1254.9832079999978
- },
- {
- "questionId": "q9",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "terrance.hansen@yahoo.com",
- "actual": "terrance.hansen@yahoo.com",
- "isCorrect": true,
- "inputTokens": 3319,
- "outputTokens": 9,
- "latencyMs": 2190.8811249999962
- },
- {
- "questionId": "q9",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "terrance.hansen@yahoo.com",
- "actual": "terrance.hansen@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2383,
- "outputTokens": 140,
- "latencyMs": 3403.9012079999957
- },
- {
- "questionId": "q9",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "terrance.hansen@yahoo.com",
- "actual": "terrance.hansen@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2857,
- "outputTokens": 11,
- "latencyMs": 1323.0636660000018
- },
- {
- "questionId": "q9",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "terrance.hansen@yahoo.com",
- "actual": "terrance.hansen@yahoo.com",
- "isCorrect": true,
- "inputTokens": 3193,
- "outputTokens": 9,
- "latencyMs": 1047.0718749999942
- },
- {
- "questionId": "q9",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "terrance.hansen@yahoo.com",
- "actual": "terrance.hansen@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7359,
- "outputTokens": 140,
- "latencyMs": 3498.7119999999995
- },
- {
- "questionId": "q9",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "terrance.hansen@yahoo.com",
- "actual": "terrance.hansen@yahoo.com",
- "isCorrect": true,
- "inputTokens": 9361,
- "outputTokens": 11,
- "latencyMs": 1830.5542919999934
- },
- {
- "questionId": "q9",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "terrance.hansen@yahoo.com",
- "actual": "terrance.hansen@yahoo.com",
- "isCorrect": true,
- "inputTokens": 9099,
- "outputTokens": 9,
- "latencyMs": 2052.039208999995
- },
- {
- "questionId": "q9",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "terrance.hansen@yahoo.com",
- "actual": "terrance.hansen@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5014,
- "outputTokens": 140,
- "latencyMs": 2254.0641659999965
- },
- {
- "questionId": "q9",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "terrance.hansen@yahoo.com",
- "actual": "terrance.hansen@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5761,
- "outputTokens": 11,
- "latencyMs": 1279.8175830000037
- },
- {
- "questionId": "q9",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "terrance.hansen@yahoo.com",
- "actual": "terrance.hansen@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5745,
- "outputTokens": 9,
- "latencyMs": 2624.0571249999994
- },
- {
- "questionId": "q10",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "107744",
- "actual": "107744",
- "isCorrect": true,
- "inputTokens": 6391,
- "outputTokens": 72,
- "latencyMs": 3316.716124999999
- },
- {
- "questionId": "q10",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "107744",
- "actual": "107744",
- "isCorrect": true,
- "inputTokens": 7870,
- "outputTokens": 6,
- "latencyMs": 1078.8857919999937
- },
- {
- "questionId": "q10",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "107744",
- "actual": "107744",
- "isCorrect": true,
- "inputTokens": 7909,
- "outputTokens": 6,
- "latencyMs": 1426.163416000003
- },
- {
- "questionId": "q10",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "107744",
- "actual": "107744",
- "isCorrect": true,
- "inputTokens": 2528,
- "outputTokens": 136,
- "latencyMs": 3091.0714579999985
- },
- {
- "questionId": "q10",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "107744",
- "actual": "107744",
- "isCorrect": true,
- "inputTokens": 2982,
- "outputTokens": 6,
- "latencyMs": 1171.1557079999984
- },
- {
- "questionId": "q10",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "107744",
- "actual": "107744",
- "isCorrect": true,
- "inputTokens": 3318,
- "outputTokens": 6,
- "latencyMs": 2722.0316250000033
- },
- {
- "questionId": "q10",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "107744",
- "actual": "107744",
- "isCorrect": true,
- "inputTokens": 2382,
- "outputTokens": 72,
- "latencyMs": 3280.0853329999954
- },
- {
- "questionId": "q10",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "107744",
- "actual": "107744",
- "isCorrect": true,
- "inputTokens": 2856,
- "outputTokens": 6,
- "latencyMs": 937.3515409999964
- },
- {
- "questionId": "q10",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "107744",
- "actual": "107744",
- "isCorrect": true,
- "inputTokens": 3192,
- "outputTokens": 6,
- "latencyMs": 1638.423999999999
- },
- {
- "questionId": "q10",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "107744",
- "actual": "107744",
- "isCorrect": true,
- "inputTokens": 7358,
- "outputTokens": 136,
- "latencyMs": 15425.220833
- },
- {
- "questionId": "q10",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "107744",
- "actual": "107744",
- "isCorrect": true,
- "inputTokens": 9360,
- "outputTokens": 6,
- "latencyMs": 1195.8543749999953
- },
- {
- "questionId": "q10",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "107744",
- "actual": "107744",
- "isCorrect": true,
- "inputTokens": 9098,
- "outputTokens": 6,
- "latencyMs": 2432.2206250000017
- },
- {
- "questionId": "q10",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "107744",
- "actual": "107744",
- "isCorrect": true,
- "inputTokens": 5013,
- "outputTokens": 72,
- "latencyMs": 2047.1201250000013
- },
- {
- "questionId": "q10",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "107744",
- "actual": "107744",
- "isCorrect": true,
- "inputTokens": 5760,
- "outputTokens": 6,
- "latencyMs": 1617.048625000003
- },
- {
- "questionId": "q10",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "107744",
- "actual": "107744",
- "isCorrect": true,
- "inputTokens": 5744,
- "outputTokens": 6,
- "latencyMs": 1548.9360000000015
- },
- {
- "questionId": "q11",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 6390,
- "outputTokens": 71,
- "latencyMs": 3741.5673339999994
- },
- {
- "questionId": "q11",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7869,
- "outputTokens": 4,
- "latencyMs": 1189.5477079999982
- },
- {
- "questionId": "q11",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7909,
- "outputTokens": 1,
- "latencyMs": 1194.6662920000017
- },
- {
- "questionId": "q11",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2527,
- "outputTokens": 135,
- "latencyMs": 2947.4346250000017
- },
- {
- "questionId": "q11",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2981,
- "outputTokens": 4,
- "latencyMs": 944.1087090000001
- },
- {
- "questionId": "q11",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3318,
- "outputTokens": 1,
- "latencyMs": 2017.044041999994
- },
- {
- "questionId": "q11",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2381,
- "outputTokens": 71,
- "latencyMs": 4068.897624999998
- },
- {
- "questionId": "q11",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2855,
- "outputTokens": 4,
- "latencyMs": 1092.8982499999984
- },
- {
- "questionId": "q11",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3192,
- "outputTokens": 1,
- "latencyMs": 2148.519874999998
- },
- {
- "questionId": "q11",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7357,
- "outputTokens": 135,
- "latencyMs": 3025.696167000002
- },
- {
- "questionId": "q11",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9359,
- "outputTokens": 4,
- "latencyMs": 1069.479542000001
- },
- {
- "questionId": "q11",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9098,
- "outputTokens": 1,
- "latencyMs": 2595.035582999997
- },
- {
- "questionId": "q11",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5012,
- "outputTokens": 71,
- "latencyMs": 2200.230208000001
- },
- {
- "questionId": "q11",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5759,
- "outputTokens": 4,
- "latencyMs": 1226.070749999999
- },
- {
- "questionId": "q11",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5744,
- "outputTokens": 1,
- "latencyMs": 2045.9056249999994
- },
- {
- "questionId": "q12",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "allan21@gmail.com",
- "actual": "allan21@gmail.com",
- "isCorrect": true,
- "inputTokens": 6389,
- "outputTokens": 266,
- "latencyMs": 5672.897708000004
- },
- {
- "questionId": "q12",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "allan21@gmail.com",
- "actual": "allan21@gmail.com",
- "isCorrect": true,
- "inputTokens": 7867,
- "outputTokens": 9,
- "latencyMs": 1745.323000000004
- },
- {
- "questionId": "q12",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "allan21@gmail.com",
- "actual": "allan21@gmail.com",
- "isCorrect": true,
- "inputTokens": 7908,
- "outputTokens": 8,
- "latencyMs": 1877.5404999999955
- },
- {
- "questionId": "q12",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "allan21@gmail.com",
- "actual": "allan21@gmail.com",
- "isCorrect": true,
- "inputTokens": 2526,
- "outputTokens": 74,
- "latencyMs": 5317.909041999999
- },
- {
- "questionId": "q12",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "allan21@gmail.com",
- "actual": "allan21@gmail.com",
- "isCorrect": true,
- "inputTokens": 2979,
- "outputTokens": 9,
- "latencyMs": 916.7109169999967
- },
- {
- "questionId": "q12",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "allan21@gmail.com",
- "actual": "allan21@gmail.com",
- "isCorrect": true,
- "inputTokens": 3317,
- "outputTokens": 8,
- "latencyMs": 2401.305290999997
- },
- {
- "questionId": "q12",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "allan21@gmail.com",
- "actual": "allan21@gmail.com",
- "isCorrect": true,
- "inputTokens": 2380,
- "outputTokens": 74,
- "latencyMs": 3016.4596669999955
- },
- {
- "questionId": "q12",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "allan21@gmail.com",
- "actual": "allan21@gmail.com",
- "isCorrect": true,
- "inputTokens": 2853,
- "outputTokens": 9,
- "latencyMs": 1233.9625830000004
- },
- {
- "questionId": "q12",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "allan21@gmail.com",
- "actual": "allan21@gmail.com",
- "isCorrect": true,
- "inputTokens": 3191,
- "outputTokens": 8,
- "latencyMs": 2000.6465000000026
- },
- {
- "questionId": "q12",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "allan21@gmail.com",
- "actual": "allan21@gmail.com",
- "isCorrect": true,
- "inputTokens": 7356,
- "outputTokens": 138,
- "latencyMs": 6270.167416999997
- },
- {
- "questionId": "q12",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "allan21@gmail.com",
- "actual": "allan21@gmail.com",
- "isCorrect": true,
- "inputTokens": 9357,
- "outputTokens": 9,
- "latencyMs": 2332.7022089999955
- },
- {
- "questionId": "q12",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "allan21@gmail.com",
- "actual": "allan21@gmail.com",
- "isCorrect": true,
- "inputTokens": 9097,
- "outputTokens": 8,
- "latencyMs": 1986.9040000000023
- },
- {
- "questionId": "q12",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "allan21@gmail.com",
- "actual": "allan21@gmail.com",
- "isCorrect": true,
- "inputTokens": 5011,
- "outputTokens": 74,
- "latencyMs": 3294.769625000001
- },
- {
- "questionId": "q12",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "allan21@gmail.com",
- "actual": "allan21@gmail.com",
- "isCorrect": true,
- "inputTokens": 5757,
- "outputTokens": 9,
- "latencyMs": 1028.5119580000028
- },
- {
- "questionId": "q12",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "allan21@gmail.com",
- "actual": "allan21@gmail.com",
- "isCorrect": true,
- "inputTokens": 5743,
- "outputTokens": 8,
- "latencyMs": 1788.622083000002
- },
- {
- "questionId": "q13",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "145843",
- "actual": "145843",
- "isCorrect": true,
- "inputTokens": 6388,
- "outputTokens": 72,
- "latencyMs": 2426.662333
- },
- {
- "questionId": "q13",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "145843",
- "actual": "145843",
- "isCorrect": true,
- "inputTokens": 7868,
- "outputTokens": 6,
- "latencyMs": 1199.7499580000003
- },
- {
- "questionId": "q13",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "145843",
- "actual": "145843",
- "isCorrect": true,
- "inputTokens": 7907,
- "outputTokens": 6,
- "latencyMs": 2230.200499999999
- },
- {
- "questionId": "q13",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "145843",
- "actual": "145843",
- "isCorrect": true,
- "inputTokens": 2525,
- "outputTokens": 72,
- "latencyMs": 2973.9408330000006
- },
- {
- "questionId": "q13",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "145843",
- "actual": "145843",
- "isCorrect": true,
- "inputTokens": 2980,
- "outputTokens": 6,
- "latencyMs": 1759.8231249999953
- },
- {
- "questionId": "q13",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "145843",
- "actual": "145843",
- "isCorrect": true,
- "inputTokens": 3316,
- "outputTokens": 6,
- "latencyMs": 3236.040165999999
- },
- {
- "questionId": "q13",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "145843",
- "actual": "145843",
- "isCorrect": true,
- "inputTokens": 2379,
- "outputTokens": 72,
- "latencyMs": 2829.9307920000065
- },
- {
- "questionId": "q13",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "145843",
- "actual": "145843",
- "isCorrect": true,
- "inputTokens": 2854,
- "outputTokens": 6,
- "latencyMs": 905.942667000003
- },
- {
- "questionId": "q13",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "145843",
- "actual": "145843",
- "isCorrect": true,
- "inputTokens": 3190,
- "outputTokens": 6,
- "latencyMs": 1492.0838749999966
- },
- {
- "questionId": "q13",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "145843",
- "actual": "145843",
- "isCorrect": true,
- "inputTokens": 7355,
- "outputTokens": 136,
- "latencyMs": 3018.9516250000015
- },
- {
- "questionId": "q13",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "145843",
- "actual": "145843",
- "isCorrect": true,
- "inputTokens": 9358,
- "outputTokens": 6,
- "latencyMs": 1010.1432910000003
- },
- {
- "questionId": "q13",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "145843",
- "actual": "145843",
- "isCorrect": true,
- "inputTokens": 9096,
- "outputTokens": 6,
- "latencyMs": 2475.971083000004
- },
- {
- "questionId": "q13",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "145843",
- "actual": "145843",
- "isCorrect": true,
- "inputTokens": 5010,
- "outputTokens": 72,
- "latencyMs": 2322.1169999999984
- },
- {
- "questionId": "q13",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "145843",
- "actual": "145843",
- "isCorrect": true,
- "inputTokens": 5758,
- "outputTokens": 6,
- "latencyMs": 993.6942500000005
- },
- {
- "questionId": "q13",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "145843",
- "actual": "145843",
- "isCorrect": true,
- "inputTokens": 5742,
- "outputTokens": 6,
- "latencyMs": 2137.871124999998
- },
- {
- "questionId": "q14",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 6389,
- "outputTokens": 71,
- "latencyMs": 2223.1494999999995
- },
- {
- "questionId": "q14",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7868,
- "outputTokens": 4,
- "latencyMs": 1101.960708999999
- },
- {
- "questionId": "q14",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7908,
- "outputTokens": 1,
- "latencyMs": 1264.4358330000032
- },
- {
- "questionId": "q14",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2526,
- "outputTokens": 71,
- "latencyMs": 3117.289082999996
- },
- {
- "questionId": "q14",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2980,
- "outputTokens": 4,
- "latencyMs": 975.8156250000029
- },
- {
- "questionId": "q14",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3317,
- "outputTokens": 1,
- "latencyMs": 2076.140041999999
- },
- {
- "questionId": "q14",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2380,
- "outputTokens": 71,
- "latencyMs": 3522.6094999999987
- },
- {
- "questionId": "q14",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2854,
- "outputTokens": 4,
- "latencyMs": 749.1067079999993
- },
- {
- "questionId": "q14",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3191,
- "outputTokens": 1,
- "latencyMs": 2162.154208
- },
- {
- "questionId": "q14",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7356,
- "outputTokens": 135,
- "latencyMs": 15105.717249999994
- },
- {
- "questionId": "q14",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9358,
- "outputTokens": 4,
- "latencyMs": 1518.0794160000005
- },
- {
- "questionId": "q14",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9097,
- "outputTokens": 1,
- "latencyMs": 2634.745458999998
- },
- {
- "questionId": "q14",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5011,
- "outputTokens": 71,
- "latencyMs": 2809.990375000001
- },
- {
- "questionId": "q14",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5758,
- "outputTokens": 4,
- "latencyMs": 2328.9382079999996
- },
- {
- "questionId": "q14",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5743,
- "outputTokens": 1,
- "latencyMs": 2122.7864169999957
- },
- {
- "questionId": "q15",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "alexandria61@gmail.com",
- "actual": "alexandria61@gmail.com",
- "isCorrect": true,
- "inputTokens": 6390,
- "outputTokens": 140,
- "latencyMs": 2744.6706660000054
- },
- {
- "questionId": "q15",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "alexandria61@gmail.com",
- "actual": "alexandria61@gmail.com",
- "isCorrect": true,
- "inputTokens": 7869,
- "outputTokens": 9,
- "latencyMs": 1389.9784999999974
- },
- {
- "questionId": "q15",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "alexandria61@gmail.com",
- "actual": "alexandria61@gmail.com",
- "isCorrect": true,
- "inputTokens": 7909,
- "outputTokens": 8,
- "latencyMs": 1310.762625000003
- },
- {
- "questionId": "q15",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "alexandria61@gmail.com",
- "actual": "alexandria61@gmail.com",
- "isCorrect": true,
- "inputTokens": 2527,
- "outputTokens": 204,
- "latencyMs": 5402.840416999999
- },
- {
- "questionId": "q15",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "alexandria61@gmail.com",
- "actual": "alexandria61@gmail.com",
- "isCorrect": true,
- "inputTokens": 2981,
- "outputTokens": 9,
- "latencyMs": 1480.7467909999978
- },
- {
- "questionId": "q15",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "alexandria61@gmail.com",
- "actual": "alexandria61@gmail.com",
- "isCorrect": true,
- "inputTokens": 3318,
- "outputTokens": 8,
- "latencyMs": 1741.1184169999979
- },
- {
- "questionId": "q15",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "alexandria61@gmail.com",
- "actual": "alexandria61@gmail.com",
- "isCorrect": true,
- "inputTokens": 2381,
- "outputTokens": 140,
- "latencyMs": 2192.0577909999993
- },
- {
- "questionId": "q15",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "alexandria61@gmail.com",
- "actual": "alexandria61@gmail.com",
- "isCorrect": true,
- "inputTokens": 2855,
- "outputTokens": 9,
- "latencyMs": 1052.5672919999997
- },
- {
- "questionId": "q15",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "alexandria61@gmail.com",
- "actual": "alexandria61@gmail.com",
- "isCorrect": true,
- "inputTokens": 3192,
- "outputTokens": 8,
- "latencyMs": 2969.6880840000013
- },
- {
- "questionId": "q15",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "alexandria61@gmail.com",
- "actual": "alexandria61@gmail.com",
- "isCorrect": true,
- "inputTokens": 7357,
- "outputTokens": 140,
- "latencyMs": 4902.5039590000015
- },
- {
- "questionId": "q15",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "alexandria61@gmail.com",
- "actual": "alexandria61@gmail.com",
- "isCorrect": true,
- "inputTokens": 9359,
- "outputTokens": 9,
- "latencyMs": 1337.9500409999964
- },
- {
- "questionId": "q15",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "alexandria61@gmail.com",
- "actual": "alexandria61@gmail.com",
- "isCorrect": true,
- "inputTokens": 9098,
- "outputTokens": 8,
- "latencyMs": 988.1449579999971
- },
- {
- "questionId": "q15",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "alexandria61@gmail.com",
- "actual": "alexandria61@gmail.com",
- "isCorrect": true,
- "inputTokens": 5012,
- "outputTokens": 140,
- "latencyMs": 5435.804457999999
- },
- {
- "questionId": "q15",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "alexandria61@gmail.com",
- "actual": "alexandria61@gmail.com",
- "isCorrect": true,
- "inputTokens": 5759,
- "outputTokens": 9,
- "latencyMs": 1164.0297080000018
- },
- {
- "questionId": "q15",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "alexandria61@gmail.com",
- "actual": "alexandria61@gmail.com",
- "isCorrect": true,
- "inputTokens": 5744,
- "outputTokens": 8,
- "latencyMs": 1684.5642079999961
- },
- {
- "questionId": "q16",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "89436",
- "actual": "89436",
- "isCorrect": true,
- "inputTokens": 6389,
- "outputTokens": 72,
- "latencyMs": 2137.3070000000007
- },
- {
- "questionId": "q16",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "89436",
- "actual": "89436",
- "isCorrect": true,
- "inputTokens": 7870,
- "outputTokens": 6,
- "latencyMs": 1353.1784169999955
- },
- {
- "questionId": "q16",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "89436",
- "actual": "89436",
- "isCorrect": true,
- "inputTokens": 7909,
- "outputTokens": 5,
- "latencyMs": 2152.076667000001
- },
- {
- "questionId": "q16",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "89436",
- "actual": "89436",
- "isCorrect": true,
- "inputTokens": 2526,
- "outputTokens": 72,
- "latencyMs": 9838.444999999992
- },
- {
- "questionId": "q16",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "89436",
- "actual": "89436",
- "isCorrect": true,
- "inputTokens": 2982,
- "outputTokens": 6,
- "latencyMs": 1011.8612080000021
- },
- {
- "questionId": "q16",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "89436",
- "actual": "89436",
- "isCorrect": true,
- "inputTokens": 3318,
- "outputTokens": 5,
- "latencyMs": 2380.466207999998
- },
- {
- "questionId": "q16",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "89436",
- "actual": "89436",
- "isCorrect": true,
- "inputTokens": 2380,
- "outputTokens": 72,
- "latencyMs": 2358.7515829999975
- },
- {
- "questionId": "q16",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "89436",
- "actual": "89436",
- "isCorrect": true,
- "inputTokens": 2856,
- "outputTokens": 6,
- "latencyMs": 1073.5187089999963
- },
- {
- "questionId": "q16",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "89436",
- "actual": "89436",
- "isCorrect": true,
- "inputTokens": 3192,
- "outputTokens": 5,
- "latencyMs": 1808.9837499999994
- },
- {
- "questionId": "q16",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "89436",
- "actual": "89436",
- "isCorrect": true,
- "inputTokens": 7356,
- "outputTokens": 200,
- "latencyMs": 3657.137167000008
- },
- {
- "questionId": "q16",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "89436",
- "actual": "89436",
- "isCorrect": true,
- "inputTokens": 9360,
- "outputTokens": 6,
- "latencyMs": 1216.3329169999997
- },
- {
- "questionId": "q16",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "89436",
- "actual": "89436",
- "isCorrect": true,
- "inputTokens": 9098,
- "outputTokens": 5,
- "latencyMs": 2347.6749580000032
- },
- {
- "questionId": "q16",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "89436",
- "actual": "89436",
- "isCorrect": true,
- "inputTokens": 5011,
- "outputTokens": 136,
- "latencyMs": 2985.761999999995
- },
- {
- "questionId": "q16",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "89436",
- "actual": "89436",
- "isCorrect": true,
- "inputTokens": 5760,
- "outputTokens": 6,
- "latencyMs": 1062.5013749999998
- },
- {
- "questionId": "q16",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "89436",
- "actual": "89436",
- "isCorrect": true,
- "inputTokens": 5744,
- "outputTokens": 5,
- "latencyMs": 2942.199041999993
- },
- {
- "questionId": "q17",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 6392,
- "outputTokens": 71,
- "latencyMs": 2072.9703750000044
- },
- {
- "questionId": "q17",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7872,
- "outputTokens": 4,
- "latencyMs": 1143.0027499999997
- },
- {
- "questionId": "q17",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7911,
- "outputTokens": 1,
- "latencyMs": 2339.718792000007
- },
- {
- "questionId": "q17",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2529,
- "outputTokens": 135,
- "latencyMs": 2721.8648749999993
- },
- {
- "questionId": "q17",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2984,
- "outputTokens": 4,
- "latencyMs": 1106.3964160000032
- },
- {
- "questionId": "q17",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3320,
- "outputTokens": 1,
- "latencyMs": 2453.6342910000094
- },
- {
- "questionId": "q17",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2383,
- "outputTokens": 135,
- "latencyMs": 2526.1070829999953
- },
- {
- "questionId": "q17",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2858,
- "outputTokens": 4,
- "latencyMs": 963.8103339999943
- },
- {
- "questionId": "q17",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3194,
- "outputTokens": 1,
- "latencyMs": 1213.7454580000049
- },
- {
- "questionId": "q17",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7359,
- "outputTokens": 199,
- "latencyMs": 3451.3691249999974
- },
- {
- "questionId": "q17",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9362,
- "outputTokens": 4,
- "latencyMs": 1054.2650409999915
- },
- {
- "questionId": "q17",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9100,
- "outputTokens": 1,
- "latencyMs": 1712.7362089999951
- },
- {
- "questionId": "q17",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5014,
- "outputTokens": 199,
- "latencyMs": 4517.758332999991
- },
- {
- "questionId": "q17",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5762,
- "outputTokens": 4,
- "latencyMs": 1036.0673749999987
- },
- {
- "questionId": "q17",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5746,
- "outputTokens": 1,
- "latencyMs": 2099.134084000005
- },
- {
- "questionId": "q18",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "kelvin54@yahoo.com",
- "actual": "kelvin54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 6390,
- "outputTokens": 139,
- "latencyMs": 3450.1222080000007
- },
- {
- "questionId": "q18",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "kelvin54@yahoo.com",
- "actual": "kelvin54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7871,
- "outputTokens": 10,
- "latencyMs": 2320.022790999996
- },
- {
- "questionId": "q18",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "kelvin54@yahoo.com",
- "actual": "kelvin54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7909,
- "outputTokens": 8,
- "latencyMs": 1058.7114589999983
- },
- {
- "questionId": "q18",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "kelvin54@yahoo.com",
- "actual": "kelvin54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2527,
- "outputTokens": 75,
- "latencyMs": 3345.744040999998
- },
- {
- "questionId": "q18",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "kelvin54@yahoo.com",
- "actual": "kelvin54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2983,
- "outputTokens": 10,
- "latencyMs": 1209.7132500000007
- },
- {
- "questionId": "q18",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "kelvin54@yahoo.com",
- "actual": "kelvin54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 3318,
- "outputTokens": 8,
- "latencyMs": 1716.227457999994
- },
- {
- "questionId": "q18",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "kelvin54@yahoo.com",
- "actual": "kelvin54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2381,
- "outputTokens": 139,
- "latencyMs": 3093.9495000000024
- },
- {
- "questionId": "q18",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "kelvin54@yahoo.com",
- "actual": "kelvin54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2857,
- "outputTokens": 10,
- "latencyMs": 1311.3692500000034
- },
- {
- "questionId": "q18",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "kelvin54@yahoo.com",
- "actual": "kelvin54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 3192,
- "outputTokens": 8,
- "latencyMs": 794.0660829999979
- },
- {
- "questionId": "q18",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "kelvin54@yahoo.com",
- "actual": "kelvin54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7357,
- "outputTokens": 459,
- "latencyMs": 5397.067582999996
- },
- {
- "questionId": "q18",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "kelvin54@yahoo.com",
- "actual": "kelvin54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 9361,
- "outputTokens": 10,
- "latencyMs": 1179.005124999996
- },
- {
- "questionId": "q18",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "kelvin54@yahoo.com",
- "actual": "kelvin54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 9098,
- "outputTokens": 8,
- "latencyMs": 3390.3811669999996
- },
- {
- "questionId": "q18",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "kelvin54@yahoo.com",
- "actual": "kelvin54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5012,
- "outputTokens": 75,
- "latencyMs": 3942.734500000006
- },
- {
- "questionId": "q18",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "kelvin54@yahoo.com",
- "actual": "kelvin54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5761,
- "outputTokens": 10,
- "latencyMs": 1198.2199580000015
- },
- {
- "questionId": "q18",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "kelvin54@yahoo.com",
- "actual": "kelvin54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5744,
- "outputTokens": 8,
- "latencyMs": 1988.9680829999998
- },
- {
- "questionId": "q19",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "143365",
- "actual": "143365",
- "isCorrect": true,
- "inputTokens": 6390,
- "outputTokens": 200,
- "latencyMs": 2964.017540999994
- },
- {
- "questionId": "q19",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "143365",
- "actual": "143365",
- "isCorrect": true,
- "inputTokens": 7872,
- "outputTokens": 6,
- "latencyMs": 1171.257249999995
- },
- {
- "questionId": "q19",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "143365",
- "actual": "143365",
- "isCorrect": true,
- "inputTokens": 7909,
- "outputTokens": 6,
- "latencyMs": 1304.4575840000034
- },
- {
- "questionId": "q19",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "143365",
- "actual": "143365",
- "isCorrect": true,
- "inputTokens": 2527,
- "outputTokens": 72,
- "latencyMs": 3056.008249999999
- },
- {
- "questionId": "q19",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "143365",
- "actual": "143365",
- "isCorrect": true,
- "inputTokens": 2984,
- "outputTokens": 6,
- "latencyMs": 873.7801659999968
- },
- {
- "questionId": "q19",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "143365",
- "actual": "143365",
- "isCorrect": true,
- "inputTokens": 3318,
- "outputTokens": 6,
- "latencyMs": 1536.4943750000093
- },
- {
- "questionId": "q19",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "143365",
- "actual": "143365",
- "isCorrect": true,
- "inputTokens": 2381,
- "outputTokens": 328,
- "latencyMs": 3966.832792000001
- },
- {
- "questionId": "q19",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "143365",
- "actual": "143365",
- "isCorrect": true,
- "inputTokens": 2858,
- "outputTokens": 6,
- "latencyMs": 1072.791458000007
- },
- {
- "questionId": "q19",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "143365",
- "actual": "143365",
- "isCorrect": true,
- "inputTokens": 3192,
- "outputTokens": 6,
- "latencyMs": 1334.2349169999943
- },
- {
- "questionId": "q19",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "143365",
- "actual": "143365",
- "isCorrect": true,
- "inputTokens": 7357,
- "outputTokens": 136,
- "latencyMs": 2824.245167000001
- },
- {
- "questionId": "q19",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "143365",
- "actual": "143365",
- "isCorrect": true,
- "inputTokens": 9362,
- "outputTokens": 6,
- "latencyMs": 1156.3476669999945
- },
- {
- "questionId": "q19",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "143365",
- "actual": "143365",
- "isCorrect": true,
- "inputTokens": 9098,
- "outputTokens": 6,
- "latencyMs": 2503.603999999992
- },
- {
- "questionId": "q19",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "143365",
- "actual": "143365",
- "isCorrect": true,
- "inputTokens": 5012,
- "outputTokens": 72,
- "latencyMs": 1988.6155419999996
- },
- {
- "questionId": "q19",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "143365",
- "actual": "143365",
- "isCorrect": true,
- "inputTokens": 5762,
- "outputTokens": 6,
- "latencyMs": 2019.264417000013
- },
- {
- "questionId": "q19",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "143365",
- "actual": "143365",
- "isCorrect": true,
- "inputTokens": 5744,
- "outputTokens": 6,
- "latencyMs": 2120.657042000006
- },
- {
- "questionId": "q20",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 6389,
- "outputTokens": 71,
- "latencyMs": 2674.240417000008
- },
- {
- "questionId": "q20",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7868,
- "outputTokens": 4,
- "latencyMs": 985.5821250000008
- },
- {
- "questionId": "q20",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7908,
- "outputTokens": 1,
- "latencyMs": 1005.9853749999893
- },
- {
- "questionId": "q20",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2526,
- "outputTokens": 71,
- "latencyMs": 2337.429165999987
- },
- {
- "questionId": "q20",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2980,
- "outputTokens": 4,
- "latencyMs": 1671.3083750000078
- },
- {
- "questionId": "q20",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3317,
- "outputTokens": 1,
- "latencyMs": 1858.936124999993
- },
- {
- "questionId": "q20",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2380,
- "outputTokens": 71,
- "latencyMs": 1797.8257500000036
- },
- {
- "questionId": "q20",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2854,
- "outputTokens": 4,
- "latencyMs": 1014.9593339999992
- },
- {
- "questionId": "q20",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3191,
- "outputTokens": 1,
- "latencyMs": 1534.200667000012
- },
- {
- "questionId": "q20",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7356,
- "outputTokens": 135,
- "latencyMs": 3340.923125000001
- },
- {
- "questionId": "q20",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9358,
- "outputTokens": 4,
- "latencyMs": 1555.2516250000044
- },
- {
- "questionId": "q20",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9097,
- "outputTokens": 1,
- "latencyMs": 2945.7507919999916
- },
- {
- "questionId": "q20",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5011,
- "outputTokens": 71,
- "latencyMs": 3605.196708999996
- },
- {
- "questionId": "q20",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5758,
- "outputTokens": 4,
- "latencyMs": 1068.8147920000047
- },
- {
- "questionId": "q20",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5743,
- "outputTokens": 1,
- "latencyMs": 2330.3333749999874
- },
- {
- "questionId": "q21",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "dean19@gmail.com",
- "actual": "dean19@gmail.com",
- "isCorrect": true,
- "inputTokens": 6393,
- "outputTokens": 75,
- "latencyMs": 2723.754000000001
- },
- {
- "questionId": "q21",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "dean19@gmail.com",
- "actual": "dean19@gmail.com",
- "isCorrect": true,
- "inputTokens": 7876,
- "outputTokens": 9,
- "latencyMs": 1170.7758329999924
- },
- {
- "questionId": "q21",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "dean19@gmail.com",
- "actual": "dean19@gmail.com",
- "isCorrect": true,
- "inputTokens": 7912,
- "outputTokens": 7,
- "latencyMs": 2132.3265829999873
- },
- {
- "questionId": "q21",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "dean19@gmail.com",
- "actual": "dean19@gmail.com",
- "isCorrect": true,
- "inputTokens": 2530,
- "outputTokens": 139,
- "latencyMs": 3074.613540999999
- },
- {
- "questionId": "q21",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "dean19@gmail.com",
- "actual": "dean19@gmail.com",
- "isCorrect": true,
- "inputTokens": 2988,
- "outputTokens": 9,
- "latencyMs": 887.1294170000037
- },
- {
- "questionId": "q21",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "dean19@gmail.com",
- "actual": "dean19@gmail.com",
- "isCorrect": true,
- "inputTokens": 3321,
- "outputTokens": 7,
- "latencyMs": 1689.1039579999924
- },
- {
- "questionId": "q21",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "dean19@gmail.com",
- "actual": "dean19@gmail.com",
- "isCorrect": true,
- "inputTokens": 2384,
- "outputTokens": 75,
- "latencyMs": 2337.622915999993
- },
- {
- "questionId": "q21",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "dean19@gmail.com",
- "actual": "dean19@gmail.com",
- "isCorrect": true,
- "inputTokens": 2862,
- "outputTokens": 9,
- "latencyMs": 951.0157920000056
- },
- {
- "questionId": "q21",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "dean19@gmail.com",
- "actual": "dean19@gmail.com",
- "isCorrect": true,
- "inputTokens": 3195,
- "outputTokens": 7,
- "latencyMs": 2195.647125000003
- },
- {
- "questionId": "q21",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "dean19@gmail.com",
- "actual": "dean19@gmail.com",
- "isCorrect": true,
- "inputTokens": 7360,
- "outputTokens": 75,
- "latencyMs": 2328.1204169999983
- },
- {
- "questionId": "q21",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "dean19@gmail.com",
- "actual": "dean19@gmail.com",
- "isCorrect": true,
- "inputTokens": 9366,
- "outputTokens": 9,
- "latencyMs": 1225.2067499999976
- },
- {
- "questionId": "q21",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "dean19@gmail.com",
- "actual": "dean19@gmail.com",
- "isCorrect": true,
- "inputTokens": 9101,
- "outputTokens": 7,
- "latencyMs": 1613.4727500000008
- },
- {
- "questionId": "q21",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "dean19@gmail.com",
- "actual": "dean19@gmail.com",
- "isCorrect": true,
- "inputTokens": 5015,
- "outputTokens": 75,
- "latencyMs": 2482.4477909999987
- },
- {
- "questionId": "q21",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "dean19@gmail.com",
- "actual": "dean19@gmail.com",
- "isCorrect": true,
- "inputTokens": 5766,
- "outputTokens": 9,
- "latencyMs": 1235.0746250000084
- },
- {
- "questionId": "q21",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "dean19@gmail.com",
- "actual": "dean19@gmail.com",
- "isCorrect": true,
- "inputTokens": 5747,
- "outputTokens": 7,
- "latencyMs": 4278.624791999988
- },
- {
- "questionId": "q22",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "111314",
- "actual": "111314",
- "isCorrect": true,
- "inputTokens": 6391,
- "outputTokens": 136,
- "latencyMs": 2741.065750000009
- },
- {
- "questionId": "q22",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "111314",
- "actual": "111314",
- "isCorrect": true,
- "inputTokens": 7871,
- "outputTokens": 6,
- "latencyMs": 1172.1854580000072
- },
- {
- "questionId": "q22",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "111314",
- "actual": "111314",
- "isCorrect": true,
- "inputTokens": 7909,
- "outputTokens": 6,
- "latencyMs": 1184.0355000000127
- },
- {
- "questionId": "q22",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "111314",
- "actual": "111314",
- "isCorrect": true,
- "inputTokens": 2528,
- "outputTokens": 136,
- "latencyMs": 6348.677542000005
- },
- {
- "questionId": "q22",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "111314",
- "actual": "111314",
- "isCorrect": true,
- "inputTokens": 2983,
- "outputTokens": 6,
- "latencyMs": 964.3882920000033
- },
- {
- "questionId": "q22",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "111314",
- "actual": "111314",
- "isCorrect": true,
- "inputTokens": 3318,
- "outputTokens": 6,
- "latencyMs": 1484.964082999999
- },
- {
- "questionId": "q22",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "111314",
- "actual": "111314",
- "isCorrect": true,
- "inputTokens": 2382,
- "outputTokens": 72,
- "latencyMs": 23689.366624999995
- },
- {
- "questionId": "q22",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "111314",
- "actual": "111314",
- "isCorrect": true,
- "inputTokens": 2857,
- "outputTokens": 6,
- "latencyMs": 1258.0295830000105
- },
- {
- "questionId": "q22",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "111314",
- "actual": "111314",
- "isCorrect": true,
- "inputTokens": 3192,
- "outputTokens": 6,
- "latencyMs": 18510.087583
- },
- {
- "questionId": "q22",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "111314",
- "actual": "111314",
- "isCorrect": true,
- "inputTokens": 7358,
- "outputTokens": 136,
- "latencyMs": 2856.495458000005
- },
- {
- "questionId": "q22",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "111314",
- "actual": "111314",
- "isCorrect": true,
- "inputTokens": 9361,
- "outputTokens": 6,
- "latencyMs": 1031.8081669999956
- },
- {
- "questionId": "q22",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "111314",
- "actual": "111314",
- "isCorrect": true,
- "inputTokens": 9098,
- "outputTokens": 6,
- "latencyMs": 2408.5496249999997
- },
- {
- "questionId": "q22",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "111314",
- "actual": "111314",
- "isCorrect": true,
- "inputTokens": 5013,
- "outputTokens": 72,
- "latencyMs": 2405.9946670000063
- },
- {
- "questionId": "q22",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "111314",
- "actual": "111314",
- "isCorrect": true,
- "inputTokens": 5761,
- "outputTokens": 6,
- "latencyMs": 1855.128291999994
- },
- {
- "questionId": "q22",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "111314",
- "actual": "111314",
- "isCorrect": true,
- "inputTokens": 5744,
- "outputTokens": 6,
- "latencyMs": 14026.715166000009
- },
- {
- "questionId": "q23",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 6388,
- "outputTokens": 71,
- "latencyMs": 2613.9667920000065
- },
- {
- "questionId": "q23",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7868,
- "outputTokens": 4,
- "latencyMs": 914.9832499999902
- },
- {
- "questionId": "q23",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7907,
- "outputTokens": 1,
- "latencyMs": 17605.488457999993
- },
- {
- "questionId": "q23",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2525,
- "outputTokens": 455,
- "latencyMs": 5491.203125
- },
- {
- "questionId": "q23",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2980,
- "outputTokens": 4,
- "latencyMs": 1559.9341249999998
- },
- {
- "questionId": "q23",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3316,
- "outputTokens": 1,
- "latencyMs": 12204.927791999988
- },
- {
- "questionId": "q23",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2379,
- "outputTokens": 71,
- "latencyMs": 4993.148166999992
- },
- {
- "questionId": "q23",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2854,
- "outputTokens": 4,
- "latencyMs": 1479.5367499999993
- },
- {
- "questionId": "q23",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3190,
- "outputTokens": 1,
- "latencyMs": 2016.5271659999999
- },
- {
- "questionId": "q23",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7355,
- "outputTokens": 135,
- "latencyMs": 3785.880541999999
- },
- {
- "questionId": "q23",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9358,
- "outputTokens": 4,
- "latencyMs": 1170.9521249999962
- },
- {
- "questionId": "q23",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9096,
- "outputTokens": 1,
- "latencyMs": 2376.3025000000052
- },
- {
- "questionId": "q23",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5010,
- "outputTokens": 71,
- "latencyMs": 12974.991708999994
- },
- {
- "questionId": "q23",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5758,
- "outputTokens": 4,
- "latencyMs": 1062.6410830000095
- },
- {
- "questionId": "q23",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5742,
- "outputTokens": 1,
- "latencyMs": 2375.1459170000016
- },
- {
- "questionId": "q24",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "laurel54@yahoo.com",
- "actual": "laurel54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 6390,
- "outputTokens": 331,
- "latencyMs": 7831.431874999995
- },
- {
- "questionId": "q24",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "laurel54@yahoo.com",
- "actual": "laurel54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7869,
- "outputTokens": 10,
- "latencyMs": 1169.4948749999894
- },
- {
- "questionId": "q24",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "laurel54@yahoo.com",
- "actual": "laurel54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7908,
- "outputTokens": 8,
- "latencyMs": 6873.670041000005
- },
- {
- "questionId": "q24",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "laurel54@yahoo.com",
- "actual": "laurel54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2527,
- "outputTokens": 139,
- "latencyMs": 2733.310750000004
- },
- {
- "questionId": "q24",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "laurel54@yahoo.com",
- "actual": "laurel54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2981,
- "outputTokens": 10,
- "latencyMs": 1465.5957500000077
- },
- {
- "questionId": "q24",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "laurel54@yahoo.com",
- "actual": "laurel54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 3317,
- "outputTokens": 8,
- "latencyMs": 12162.723041999998
- },
- {
- "questionId": "q24",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "laurel54@yahoo.com",
- "actual": "laurel54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2381,
- "outputTokens": 203,
- "latencyMs": 2401.237958999991
- },
- {
- "questionId": "q24",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "laurel54@yahoo.com",
- "actual": "laurel54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2855,
- "outputTokens": 10,
- "latencyMs": 976.5733749999927
- },
- {
- "questionId": "q24",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "laurel54@yahoo.com",
- "actual": "laurel54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 3191,
- "outputTokens": 8,
- "latencyMs": 1773.305250000005
- },
- {
- "questionId": "q24",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "laurel54@yahoo.com",
- "actual": "laurel54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7357,
- "outputTokens": 395,
- "latencyMs": 6293.676041999992
- },
- {
- "questionId": "q24",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "laurel54@yahoo.com",
- "actual": "laurel54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 9359,
- "outputTokens": 10,
- "latencyMs": 1263.188875000007
- },
- {
- "questionId": "q24",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "laurel54@yahoo.com",
- "actual": "laurel54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 9097,
- "outputTokens": 8,
- "latencyMs": 1866.224624999988
- },
- {
- "questionId": "q24",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "laurel54@yahoo.com",
- "actual": "laurel54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5012,
- "outputTokens": 75,
- "latencyMs": 1734.0090409999975
- },
- {
- "questionId": "q24",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "laurel54@yahoo.com",
- "actual": "laurel54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5759,
- "outputTokens": 10,
- "latencyMs": 1076.4865419999987
- },
- {
- "questionId": "q24",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "laurel54@yahoo.com",
- "actual": "laurel54@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5743,
- "outputTokens": 8,
- "latencyMs": 1799.7341250000027
- },
- {
- "questionId": "q25",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "89553",
- "actual": "89553",
- "isCorrect": true,
- "inputTokens": 6391,
- "outputTokens": 136,
- "latencyMs": 4268.888999999996
- },
- {
- "questionId": "q25",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "89553",
- "actual": "89553",
- "isCorrect": true,
- "inputTokens": 7873,
- "outputTokens": 6,
- "latencyMs": 1100.426707999999
- },
- {
- "questionId": "q25",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "89553",
- "actual": "89553",
- "isCorrect": true,
- "inputTokens": 7910,
- "outputTokens": 5,
- "latencyMs": 905.148000000001
- },
- {
- "questionId": "q25",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "89553",
- "actual": "89553",
- "isCorrect": true,
- "inputTokens": 2528,
- "outputTokens": 72,
- "latencyMs": 3470.1760000000068
- },
- {
- "questionId": "q25",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "89553",
- "actual": "89553",
- "isCorrect": true,
- "inputTokens": 2985,
- "outputTokens": 6,
- "latencyMs": 1239.0414170000004
- },
- {
- "questionId": "q25",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "89553",
- "actual": "89553",
- "isCorrect": true,
- "inputTokens": 3319,
- "outputTokens": 5,
- "latencyMs": 3012.1026249999995
- },
- {
- "questionId": "q25",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "89553",
- "actual": "89553",
- "isCorrect": true,
- "inputTokens": 2382,
- "outputTokens": 72,
- "latencyMs": 4932.565208
- },
- {
- "questionId": "q25",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "89553",
- "actual": "89553",
- "isCorrect": true,
- "inputTokens": 2859,
- "outputTokens": 6,
- "latencyMs": 923.8483330000017
- },
- {
- "questionId": "q25",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "89553",
- "actual": "89553",
- "isCorrect": true,
- "inputTokens": 3193,
- "outputTokens": 5,
- "latencyMs": 1677.830792000008
- },
- {
- "questionId": "q25",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "89553",
- "actual": "89553",
- "isCorrect": true,
- "inputTokens": 7358,
- "outputTokens": 200,
- "latencyMs": 4701.415708
- },
- {
- "questionId": "q25",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "89553",
- "actual": "89553",
- "isCorrect": true,
- "inputTokens": 9363,
- "outputTokens": 6,
- "latencyMs": 1366.9058340000047
- },
- {
- "questionId": "q25",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "89553",
- "actual": "89553",
- "isCorrect": true,
- "inputTokens": 9099,
- "outputTokens": 5,
- "latencyMs": 1693.0314170000056
- },
- {
- "questionId": "q25",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "89553",
- "actual": "89553",
- "isCorrect": true,
- "inputTokens": 5013,
- "outputTokens": 136,
- "latencyMs": 5666.829292000009
- },
- {
- "questionId": "q25",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "89553",
- "actual": "89553",
- "isCorrect": true,
- "inputTokens": 5763,
- "outputTokens": 6,
- "latencyMs": 1181.8469999999943
- },
- {
- "questionId": "q25",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "89553",
- "actual": "89553",
- "isCorrect": true,
- "inputTokens": 5745,
- "outputTokens": 5,
- "latencyMs": 2083.4975829999894
- },
- {
- "questionId": "q26",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 6388,
- "outputTokens": 71,
- "latencyMs": 2986.76112499999
- },
- {
- "questionId": "q26",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7866,
- "outputTokens": 4,
- "latencyMs": 1736.9273340000072
- },
- {
- "questionId": "q26",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7907,
- "outputTokens": 1,
- "latencyMs": 1777.5319579999923
- },
- {
- "questionId": "q26",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2525,
- "outputTokens": 71,
- "latencyMs": 2717.0237919999927
- },
- {
- "questionId": "q26",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2978,
- "outputTokens": 4,
- "latencyMs": 874.0303339999955
- },
- {
- "questionId": "q26",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3316,
- "outputTokens": 1,
- "latencyMs": 5675.357959000001
- },
- {
- "questionId": "q26",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2379,
- "outputTokens": 71,
- "latencyMs": 3198.773958000005
- },
- {
- "questionId": "q26",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2852,
- "outputTokens": 4,
- "latencyMs": 1085.409707999992
- },
- {
- "questionId": "q26",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3190,
- "outputTokens": 1,
- "latencyMs": 1932.898749999993
- },
- {
- "questionId": "q26",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7355,
- "outputTokens": 135,
- "latencyMs": 4096.534249999997
- },
- {
- "questionId": "q26",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9356,
- "outputTokens": 4,
- "latencyMs": 1258.4983749999956
- },
- {
- "questionId": "q26",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9096,
- "outputTokens": 1,
- "latencyMs": 2413.0945409999986
- },
- {
- "questionId": "q26",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5010,
- "outputTokens": 71,
- "latencyMs": 3148.736499999999
- },
- {
- "questionId": "q26",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5756,
- "outputTokens": 4,
- "latencyMs": 1131.4892499999987
- },
- {
- "questionId": "q26",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5742,
- "outputTokens": 1,
- "latencyMs": 1526.3339579999883
- },
- {
- "questionId": "q27",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "jayme.kertzmann77@gmail.com",
- "actual": "jayme.kertzmann77@gmail.com",
- "isCorrect": true,
- "inputTokens": 6391,
- "outputTokens": 142,
- "latencyMs": 2969.5719580000004
- },
- {
- "questionId": "q27",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "jayme.kertzmann77@gmail.com",
- "actual": "jayme.kertzmann77@gmail.com",
- "isCorrect": true,
- "inputTokens": 7871,
- "outputTokens": 14,
- "latencyMs": 2196.764500000005
- },
- {
- "questionId": "q27",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "jayme.kertzmann77@gmail.com",
- "actual": "jayme.kertzmann77@gmail.com",
- "isCorrect": true,
- "inputTokens": 7910,
- "outputTokens": 12,
- "latencyMs": 1040.4618750000081
- },
- {
- "questionId": "q27",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "jayme.kertzmann77@gmail.com",
- "actual": "jayme.kertzmann77@gmail.com",
- "isCorrect": true,
- "inputTokens": 2528,
- "outputTokens": 78,
- "latencyMs": 3091.4898329999996
- },
- {
- "questionId": "q27",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "jayme.kertzmann77@gmail.com",
- "actual": "jayme.kertzmann77@gmail.com",
- "isCorrect": true,
- "inputTokens": 2983,
- "outputTokens": 14,
- "latencyMs": 1001.9885000000068
- },
- {
- "questionId": "q27",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "jayme.kertzmann77@gmail.com",
- "actual": "jayme.kertzmann77@gmail.com",
- "isCorrect": true,
- "inputTokens": 3319,
- "outputTokens": 12,
- "latencyMs": 3467.2665410000045
- },
- {
- "questionId": "q27",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "jayme.kertzmann77@gmail.com",
- "actual": "jayme.kertzmann77@gmail.com",
- "isCorrect": true,
- "inputTokens": 2382,
- "outputTokens": 78,
- "latencyMs": 5917.028874999989
- },
- {
- "questionId": "q27",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "jayme.kertzmann77@gmail.com",
- "actual": "jayme.kertzmann77@gmail.com",
- "isCorrect": true,
- "inputTokens": 2857,
- "outputTokens": 14,
- "latencyMs": 1305.7503750000033
- },
- {
- "questionId": "q27",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "jayme.kertzmann77@gmail.com",
- "actual": "jayme.kertzmann77@gmail.com",
- "isCorrect": true,
- "inputTokens": 3193,
- "outputTokens": 12,
- "latencyMs": 2613.1883329999982
- },
- {
- "questionId": "q27",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "jayme.kertzmann77@gmail.com",
- "actual": "jayme.kertzmann77@gmail.com",
- "isCorrect": true,
- "inputTokens": 7358,
- "outputTokens": 142,
- "latencyMs": 2786.5942090000026
- },
- {
- "questionId": "q27",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "jayme.kertzmann77@gmail.com",
- "actual": "jayme.kertzmann77@gmail.com",
- "isCorrect": true,
- "inputTokens": 9361,
- "outputTokens": 14,
- "latencyMs": 2270.722458999997
- },
- {
- "questionId": "q27",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "jayme.kertzmann77@gmail.com",
- "actual": "jayme.kertzmann77@gmail.com",
- "isCorrect": true,
- "inputTokens": 9099,
- "outputTokens": 12,
- "latencyMs": 1157.144708000007
- },
- {
- "questionId": "q27",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "jayme.kertzmann77@gmail.com",
- "actual": "jayme.kertzmann77@gmail.com",
- "isCorrect": true,
- "inputTokens": 5013,
- "outputTokens": 142,
- "latencyMs": 3469.4895829999878
- },
- {
- "questionId": "q27",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "jayme.kertzmann77@gmail.com",
- "actual": "jayme.kertzmann77@gmail.com",
- "isCorrect": true,
- "inputTokens": 5761,
- "outputTokens": 14,
- "latencyMs": 1359.8917079999956
- },
- {
- "questionId": "q27",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "jayme.kertzmann77@gmail.com",
- "actual": "jayme.kertzmann77@gmail.com",
- "isCorrect": true,
- "inputTokens": 5745,
- "outputTokens": 12,
- "latencyMs": 2318.6192080000037
- },
- {
- "questionId": "q28",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "104053",
- "actual": "104053",
- "isCorrect": true,
- "inputTokens": 6390,
- "outputTokens": 136,
- "latencyMs": 4774.099707999994
- },
- {
- "questionId": "q28",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "104053",
- "actual": "104053",
- "isCorrect": true,
- "inputTokens": 7871,
- "outputTokens": 6,
- "latencyMs": 1098.6865830000024
- },
- {
- "questionId": "q28",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "104053",
- "actual": "104053",
- "isCorrect": true,
- "inputTokens": 7909,
- "outputTokens": 6,
- "latencyMs": 1239.2771659999999
- },
- {
- "questionId": "q28",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "104053",
- "actual": "104053",
- "isCorrect": true,
- "inputTokens": 2527,
- "outputTokens": 136,
- "latencyMs": 5861.847667000009
- },
- {
- "questionId": "q28",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "104053",
- "actual": "104053",
- "isCorrect": true,
- "inputTokens": 2983,
- "outputTokens": 6,
- "latencyMs": 1297.473874999996
- },
- {
- "questionId": "q28",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "104053",
- "actual": "104053",
- "isCorrect": true,
- "inputTokens": 3318,
- "outputTokens": 6,
- "latencyMs": 1698.9040830000013
- },
- {
- "questionId": "q28",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "104053",
- "actual": "104053",
- "isCorrect": true,
- "inputTokens": 2381,
- "outputTokens": 72,
- "latencyMs": 7521.450750000004
- },
- {
- "questionId": "q28",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "104053",
- "actual": "104053",
- "isCorrect": true,
- "inputTokens": 2857,
- "outputTokens": 6,
- "latencyMs": 989.1705420000071
- },
- {
- "questionId": "q28",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "104053",
- "actual": "104053",
- "isCorrect": true,
- "inputTokens": 3192,
- "outputTokens": 6,
- "latencyMs": 1598.6000829999975
- },
- {
- "questionId": "q28",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "104053",
- "actual": "104053",
- "isCorrect": true,
- "inputTokens": 7357,
- "outputTokens": 136,
- "latencyMs": 4121.990666000012
- },
- {
- "questionId": "q28",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "104053",
- "actual": "104053",
- "isCorrect": true,
- "inputTokens": 9361,
- "outputTokens": 6,
- "latencyMs": 1153.3577499999956
- },
- {
- "questionId": "q28",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "104053",
- "actual": "104053",
- "isCorrect": true,
- "inputTokens": 9098,
- "outputTokens": 6,
- "latencyMs": 5119.164292000001
- },
- {
- "questionId": "q28",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "104053",
- "actual": "104053",
- "isCorrect": true,
- "inputTokens": 5012,
- "outputTokens": 136,
- "latencyMs": 5101.831541000007
- },
- {
- "questionId": "q28",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "104053",
- "actual": "104053",
- "isCorrect": true,
- "inputTokens": 5761,
- "outputTokens": 6,
- "latencyMs": 1048.2691250000062
- },
- {
- "questionId": "q28",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "104053",
- "actual": "104053",
- "isCorrect": true,
- "inputTokens": 5744,
- "outputTokens": 6,
- "latencyMs": 2109.3487500000047
- },
- {
- "questionId": "q29",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 6391,
- "outputTokens": 135,
- "latencyMs": 3792.2222499999916
- },
- {
- "questionId": "q29",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7872,
- "outputTokens": 4,
- "latencyMs": 1203.301084000006
- },
- {
- "questionId": "q29",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7910,
- "outputTokens": 1,
- "latencyMs": 1963.9974580000126
- },
- {
- "questionId": "q29",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2528,
- "outputTokens": 135,
- "latencyMs": 3127.7867909999914
- },
- {
- "questionId": "q29",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2984,
- "outputTokens": 4,
- "latencyMs": 1192.564333000002
- },
- {
- "questionId": "q29",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3319,
- "outputTokens": 1,
- "latencyMs": 2034.2360419999895
- },
- {
- "questionId": "q29",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2382,
- "outputTokens": 71,
- "latencyMs": 2648.283917000008
- },
- {
- "questionId": "q29",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2858,
- "outputTokens": 4,
- "latencyMs": 902.732290999993
- },
- {
- "questionId": "q29",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3193,
- "outputTokens": 1,
- "latencyMs": 2174.387124999994
- },
- {
- "questionId": "q29",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7358,
- "outputTokens": 71,
- "latencyMs": 2300.0212080000056
- },
- {
- "questionId": "q29",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9362,
- "outputTokens": 4,
- "latencyMs": 963.8994999999995
- },
- {
- "questionId": "q29",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9099,
- "outputTokens": 1,
- "latencyMs": 4195.405083000005
- },
- {
- "questionId": "q29",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5013,
- "outputTokens": 135,
- "latencyMs": 3398.262333999999
- },
- {
- "questionId": "q29",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5762,
- "outputTokens": 4,
- "latencyMs": 1032.8332079999964
- },
- {
- "questionId": "q29",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5745,
- "outputTokens": 1,
- "latencyMs": 2265.614916999999
- },
- {
- "questionId": "q30",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "carley.bauch@yahoo.com",
- "actual": "carley.bauch@yahoo.com",
- "isCorrect": true,
- "inputTokens": 6390,
- "outputTokens": 76,
- "latencyMs": 2575.189624999999
- },
- {
- "questionId": "q30",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "carley.bauch@yahoo.com",
- "actual": "carley.bauch@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7869,
- "outputTokens": 12,
- "latencyMs": 1003.463208000001
- },
- {
- "questionId": "q30",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "carley.bauch@yahoo.com",
- "actual": "carley.bauch@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7909,
- "outputTokens": 9,
- "latencyMs": 1218.547916999989
- },
- {
- "questionId": "q30",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "carley.bauch@yahoo.com",
- "actual": "carley.bauch@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2527,
- "outputTokens": 76,
- "latencyMs": 17850.385834000015
- },
- {
- "questionId": "q30",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "carley.bauch@yahoo.com",
- "actual": "carley.bauch@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2981,
- "outputTokens": 12,
- "latencyMs": 1060.4747919999936
- },
- {
- "questionId": "q30",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "carley.bauch@yahoo.com",
- "actual": "carley.bauch@yahoo.com",
- "isCorrect": true,
- "inputTokens": 3318,
- "outputTokens": 9,
- "latencyMs": 2927.220583000002
- },
- {
- "questionId": "q30",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "carley.bauch@yahoo.com",
- "actual": "carley.bauch@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2381,
- "outputTokens": 140,
- "latencyMs": 2492.920542000007
- },
- {
- "questionId": "q30",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "carley.bauch@yahoo.com",
- "actual": "carley.bauch@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2855,
- "outputTokens": 12,
- "latencyMs": 1167.4384590000118
- },
- {
- "questionId": "q30",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "carley.bauch@yahoo.com",
- "actual": "carley.bauch@yahoo.com",
- "isCorrect": true,
- "inputTokens": 3192,
- "outputTokens": 9,
- "latencyMs": 1760.1724159999867
- },
- {
- "questionId": "q30",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "carley.bauch@yahoo.com",
- "actual": "carley.bauch@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7357,
- "outputTokens": 76,
- "latencyMs": 2586.2806249999994
- },
- {
- "questionId": "q30",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "carley.bauch@yahoo.com",
- "actual": "carley.bauch@yahoo.com",
- "isCorrect": true,
- "inputTokens": 9359,
- "outputTokens": 12,
- "latencyMs": 1827.6337499999936
- },
- {
- "questionId": "q30",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "carley.bauch@yahoo.com",
- "actual": "carley.bauch@yahoo.com",
- "isCorrect": true,
- "inputTokens": 9098,
- "outputTokens": 9,
- "latencyMs": 1985.0590000000084
- },
- {
- "questionId": "q30",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "carley.bauch@yahoo.com",
- "actual": "carley.bauch@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5012,
- "outputTokens": 76,
- "latencyMs": 2150.4795000000013
- },
- {
- "questionId": "q30",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "carley.bauch@yahoo.com",
- "actual": "carley.bauch@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5759,
- "outputTokens": 12,
- "latencyMs": 1151.3658339999965
- },
- {
- "questionId": "q30",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "carley.bauch@yahoo.com",
- "actual": "carley.bauch@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5744,
- "outputTokens": 9,
- "latencyMs": 2104.947874999998
- },
- {
- "questionId": "q31",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "142029",
- "actual": "142029",
- "isCorrect": true,
- "inputTokens": 6393,
- "outputTokens": 136,
- "latencyMs": 2204.857333000007
- },
- {
- "questionId": "q31",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "142029",
- "actual": "142029",
- "isCorrect": true,
- "inputTokens": 7874,
- "outputTokens": 6,
- "latencyMs": 1366.9736249999987
- },
- {
- "questionId": "q31",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "142029",
- "actual": "142029",
- "isCorrect": true,
- "inputTokens": 7911,
- "outputTokens": 6,
- "latencyMs": 1108.5303330000024
- },
- {
- "questionId": "q31",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "142029",
- "actual": "142029",
- "isCorrect": true,
- "inputTokens": 2530,
- "outputTokens": 136,
- "latencyMs": 2809.3447089999972
- },
- {
- "questionId": "q31",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "142029",
- "actual": "142029",
- "isCorrect": true,
- "inputTokens": 2986,
- "outputTokens": 6,
- "latencyMs": 985.2792080000072
- },
- {
- "questionId": "q31",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "142029",
- "actual": "142029",
- "isCorrect": true,
- "inputTokens": 3320,
- "outputTokens": 6,
- "latencyMs": 1869.5062499999913
- },
- {
- "questionId": "q31",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "142029",
- "actual": "142029",
- "isCorrect": true,
- "inputTokens": 2384,
- "outputTokens": 136,
- "latencyMs": 2816.2447910000046
- },
- {
- "questionId": "q31",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "142029",
- "actual": "142029",
- "isCorrect": true,
- "inputTokens": 2860,
- "outputTokens": 6,
- "latencyMs": 1038.263666999992
- },
- {
- "questionId": "q31",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "142029",
- "actual": "142029",
- "isCorrect": true,
- "inputTokens": 3194,
- "outputTokens": 6,
- "latencyMs": 1011.8830000000016
- },
- {
- "questionId": "q31",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "142029",
- "actual": "142029",
- "isCorrect": true,
- "inputTokens": 7360,
- "outputTokens": 200,
- "latencyMs": 2650.324915999983
- },
- {
- "questionId": "q31",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "142029",
- "actual": "142029",
- "isCorrect": true,
- "inputTokens": 9364,
- "outputTokens": 6,
- "latencyMs": 1139.189167000004
- },
- {
- "questionId": "q31",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "142029",
- "actual": "142029",
- "isCorrect": true,
- "inputTokens": 9100,
- "outputTokens": 6,
- "latencyMs": 1773.4112920000043
- },
- {
- "questionId": "q31",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "142029",
- "actual": "142029",
- "isCorrect": true,
- "inputTokens": 5015,
- "outputTokens": 136,
- "latencyMs": 2481.3391249999986
- },
- {
- "questionId": "q31",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "142029",
- "actual": "142029",
- "isCorrect": true,
- "inputTokens": 5764,
- "outputTokens": 6,
- "latencyMs": 1290.1707079999906
- },
- {
- "questionId": "q31",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "142029",
- "actual": "142029",
- "isCorrect": true,
- "inputTokens": 5746,
- "outputTokens": 6,
- "latencyMs": 2289.944292
- },
- {
- "questionId": "q32",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 6389,
- "outputTokens": 135,
- "latencyMs": 4142.8067919999885
- },
- {
- "questionId": "q32",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7869,
- "outputTokens": 4,
- "latencyMs": 1067.801999999996
- },
- {
- "questionId": "q32",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7908,
- "outputTokens": 1,
- "latencyMs": 1057.6598330000124
- },
- {
- "questionId": "q32",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2526,
- "outputTokens": 135,
- "latencyMs": 2198.369875000004
- },
- {
- "questionId": "q32",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2981,
- "outputTokens": 4,
- "latencyMs": 1228.235249999998
- },
- {
- "questionId": "q32",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3317,
- "outputTokens": 1,
- "latencyMs": 2113.6464160000032
- },
- {
- "questionId": "q32",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2380,
- "outputTokens": 135,
- "latencyMs": 2331.9615420000046
- },
- {
- "questionId": "q32",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2855,
- "outputTokens": 4,
- "latencyMs": 1010.4068330000155
- },
- {
- "questionId": "q32",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3191,
- "outputTokens": 1,
- "latencyMs": 1529.0002080000122
- },
- {
- "questionId": "q32",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7356,
- "outputTokens": 199,
- "latencyMs": 4986.682375000004
- },
- {
- "questionId": "q32",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9359,
- "outputTokens": 4,
- "latencyMs": 1295.2261669999862
- },
- {
- "questionId": "q32",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9097,
- "outputTokens": 1,
- "latencyMs": 2608.518458000006
- },
- {
- "questionId": "q32",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5011,
- "outputTokens": 71,
- "latencyMs": 1683.7294159999874
- },
- {
- "questionId": "q32",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5759,
- "outputTokens": 4,
- "latencyMs": 1466.112374999997
- },
- {
- "questionId": "q32",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5743,
- "outputTokens": 1,
- "latencyMs": 2186.13829100001
- },
- {
- "questionId": "q33",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "cheyenne_skiles@hotmail.com",
- "actual": "cheyenne_skiles@hotmail.com",
- "isCorrect": true,
- "inputTokens": 6393,
- "outputTokens": 204,
- "latencyMs": 4101.640291000018
- },
- {
- "questionId": "q33",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "cheyenne_skiles@hotmail.com",
- "actual": "cheyenne_skiles@hotmail.com",
- "isCorrect": true,
- "inputTokens": 7872,
- "outputTokens": 14,
- "latencyMs": 1355.6347499999974
- },
- {
- "questionId": "q33",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "cheyenne_skiles@hotmail.com",
- "actual": "cheyenne_skiles@hotmail.com",
- "isCorrect": true,
- "inputTokens": 7911,
- "outputTokens": 9,
- "latencyMs": 1218.3612080000166
- },
- {
- "questionId": "q33",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "cheyenne_skiles@hotmail.com",
- "actual": "cheyenne_skiles@hotmail.com",
- "isCorrect": true,
- "inputTokens": 2530,
- "outputTokens": 140,
- "latencyMs": 2800.1185839999816
- },
- {
- "questionId": "q33",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "cheyenne_skiles@hotmail.com",
- "actual": "cheyenne_skiles@hotmail.com",
- "isCorrect": true,
- "inputTokens": 2984,
- "outputTokens": 14,
- "latencyMs": 1477.837124999991
- },
- {
- "questionId": "q33",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "cheyenne_skiles@hotmail.com",
- "actual": "cheyenne_skiles@hotmail.com",
- "isCorrect": true,
- "inputTokens": 3320,
- "outputTokens": 9,
- "latencyMs": 1545.5144169999985
- },
- {
- "questionId": "q33",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "cheyenne_skiles@hotmail.com",
- "actual": "cheyenne_skiles@hotmail.com",
- "isCorrect": true,
- "inputTokens": 2384,
- "outputTokens": 76,
- "latencyMs": 3839.476958000014
- },
- {
- "questionId": "q33",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "cheyenne_skiles@hotmail.com",
- "actual": "cheyenne_skiles@hotmail.com",
- "isCorrect": true,
- "inputTokens": 2858,
- "outputTokens": 14,
- "latencyMs": 1138.701000000001
- },
- {
- "questionId": "q33",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "cheyenne_skiles@hotmail.com",
- "actual": "cheyenne_skiles@hotmail.com",
- "isCorrect": true,
- "inputTokens": 3194,
- "outputTokens": 9,
- "latencyMs": 928.7706250000047
- },
- {
- "questionId": "q33",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "cheyenne_skiles@hotmail.com",
- "actual": "cheyenne_skiles@hotmail.com",
- "isCorrect": true,
- "inputTokens": 7360,
- "outputTokens": 140,
- "latencyMs": 2666.2794580000045
- },
- {
- "questionId": "q33",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "cheyenne_skiles@hotmail.com",
- "actual": "cheyenne_skiles@hotmail.com",
- "isCorrect": true,
- "inputTokens": 9362,
- "outputTokens": 14,
- "latencyMs": 2169.680166999984
- },
- {
- "questionId": "q33",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "cheyenne_skiles@hotmail.com",
- "actual": "cheyenne_skiles@hotmail.com",
- "isCorrect": true,
- "inputTokens": 9100,
- "outputTokens": 9,
- "latencyMs": 1705.846458999993
- },
- {
- "questionId": "q33",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "cheyenne_skiles@hotmail.com",
- "actual": "cheyenne_skiles@hotmail.com",
- "isCorrect": true,
- "inputTokens": 5015,
- "outputTokens": 76,
- "latencyMs": 2263.530958999996
- },
- {
- "questionId": "q33",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "cheyenne_skiles@hotmail.com",
- "actual": "cheyenne_skiles@hotmail.com",
- "isCorrect": true,
- "inputTokens": 5762,
- "outputTokens": 14,
- "latencyMs": 1402.7602079999924
- },
- {
- "questionId": "q33",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "cheyenne_skiles@hotmail.com",
- "actual": "cheyenne_skiles@hotmail.com",
- "isCorrect": true,
- "inputTokens": 5746,
- "outputTokens": 9,
- "latencyMs": 2376.068292000011
- },
- {
- "questionId": "q34",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "84650",
- "actual": "84650",
- "isCorrect": true,
- "inputTokens": 6391,
- "outputTokens": 72,
- "latencyMs": 2438.071291
- },
- {
- "questionId": "q34",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "84650",
- "actual": "84650",
- "isCorrect": true,
- "inputTokens": 7871,
- "outputTokens": 6,
- "latencyMs": 1119.892125000013
- },
- {
- "questionId": "q34",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "84650",
- "actual": "84650",
- "isCorrect": true,
- "inputTokens": 7910,
- "outputTokens": 5,
- "latencyMs": 1219.9752500000177
- },
- {
- "questionId": "q34",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "84650",
- "actual": "84650",
- "isCorrect": true,
- "inputTokens": 2528,
- "outputTokens": 136,
- "latencyMs": 3074.212375000003
- },
- {
- "questionId": "q34",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "84650",
- "actual": "84650",
- "isCorrect": true,
- "inputTokens": 2983,
- "outputTokens": 6,
- "latencyMs": 1182.489499999996
- },
- {
- "questionId": "q34",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "84650",
- "actual": "84650",
- "isCorrect": true,
- "inputTokens": 3319,
- "outputTokens": 5,
- "latencyMs": 2366.0734999999986
- },
- {
- "questionId": "q34",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "84650",
- "actual": "84650",
- "isCorrect": true,
- "inputTokens": 2382,
- "outputTokens": 72,
- "latencyMs": 3682.4087500000023
- },
- {
- "questionId": "q34",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "84650",
- "actual": "84650",
- "isCorrect": true,
- "inputTokens": 2857,
- "outputTokens": 6,
- "latencyMs": 865.8139159999846
- },
- {
- "questionId": "q34",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "84650",
- "actual": "84650",
- "isCorrect": true,
- "inputTokens": 3193,
- "outputTokens": 5,
- "latencyMs": 1594.2567079999717
- },
- {
- "questionId": "q34",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "84650",
- "actual": "84650",
- "isCorrect": true,
- "inputTokens": 7358,
- "outputTokens": 200,
- "latencyMs": 9620.968290999997
- },
- {
- "questionId": "q34",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "84650",
- "actual": "84650",
- "isCorrect": true,
- "inputTokens": 9361,
- "outputTokens": 6,
- "latencyMs": 1066.5026659999858
- },
- {
- "questionId": "q34",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "84650",
- "actual": "84650",
- "isCorrect": true,
- "inputTokens": 9099,
- "outputTokens": 5,
- "latencyMs": 2701.866624999995
- },
- {
- "questionId": "q34",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "84650",
- "actual": "84650",
- "isCorrect": true,
- "inputTokens": 5013,
- "outputTokens": 136,
- "latencyMs": 3559.778957999981
- },
- {
- "questionId": "q34",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "84650",
- "actual": "84650",
- "isCorrect": true,
- "inputTokens": 5761,
- "outputTokens": 6,
- "latencyMs": 1008.4788750000007
- },
- {
- "questionId": "q34",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "84650",
- "actual": "84650",
- "isCorrect": true,
- "inputTokens": 5745,
- "outputTokens": 5,
- "latencyMs": 1889.822375000018
- },
- {
- "questionId": "q35",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 6390,
- "outputTokens": 71,
- "latencyMs": 3083.3981669999775
- },
- {
- "questionId": "q35",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7871,
- "outputTokens": 4,
- "latencyMs": 1060.2027909999888
- },
- {
- "questionId": "q35",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7909,
- "outputTokens": 1,
- "latencyMs": 1432.9026670000167
- },
- {
- "questionId": "q35",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2527,
- "outputTokens": 71,
- "latencyMs": 2827.286916000012
- },
- {
- "questionId": "q35",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2983,
- "outputTokens": 4,
- "latencyMs": 1606.289208000002
- },
- {
- "questionId": "q35",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3318,
- "outputTokens": 1,
- "latencyMs": 1781.2257079999836
- },
- {
- "questionId": "q35",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2381,
- "outputTokens": 135,
- "latencyMs": 2855.722792000015
- },
- {
- "questionId": "q35",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2857,
- "outputTokens": 4,
- "latencyMs": 1140.299874999997
- },
- {
- "questionId": "q35",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3192,
- "outputTokens": 1,
- "latencyMs": 2195.365832999989
- },
- {
- "questionId": "q35",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7357,
- "outputTokens": 135,
- "latencyMs": 2904.48324999999
- },
- {
- "questionId": "q35",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9361,
- "outputTokens": 4,
- "latencyMs": 1264.2794160000049
- },
- {
- "questionId": "q35",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9098,
- "outputTokens": 1,
- "latencyMs": 3598.464708000014
- },
- {
- "questionId": "q35",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5012,
- "outputTokens": 71,
- "latencyMs": 2646.219666000019
- },
- {
- "questionId": "q35",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5761,
- "outputTokens": 4,
- "latencyMs": 1090.8027500000026
- },
- {
- "questionId": "q35",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5744,
- "outputTokens": 1,
- "latencyMs": 2322.022082999989
- },
- {
- "questionId": "q36",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "macey.gottlieb5@yahoo.com",
- "actual": "macey.gottlieb5@yahoo.com",
- "isCorrect": true,
- "inputTokens": 6389,
- "outputTokens": 78,
- "latencyMs": 2498.7566669999796
- },
- {
- "questionId": "q36",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "macey.gottlieb5@yahoo.com",
- "actual": "macey.gottlieb5@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7869,
- "outputTokens": 14,
- "latencyMs": 1563.026332999987
- },
- {
- "questionId": "q36",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "macey.gottlieb5@yahoo.com",
- "actual": "macey.gottlieb5@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7908,
- "outputTokens": 11,
- "latencyMs": 1062.8037919999915
- },
- {
- "questionId": "q36",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "macey.gottlieb5@yahoo.com",
- "actual": "macey.gottlieb5@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2526,
- "outputTokens": 590,
- "latencyMs": 9420.16175
- },
- {
- "questionId": "q36",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "macey.gottlieb5@yahoo.com",
- "actual": "macey.gottlieb5@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2981,
- "outputTokens": 14,
- "latencyMs": 1038.3448750000098
- },
- {
- "questionId": "q36",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "macey.gottlieb5@yahoo.com",
- "actual": "macey.gottlieb5@yahoo.com",
- "isCorrect": true,
- "inputTokens": 3317,
- "outputTokens": 11,
- "latencyMs": 3468.648833000014
- },
- {
- "questionId": "q36",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "macey.gottlieb5@yahoo.com",
- "actual": "macey.gottlieb5@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2380,
- "outputTokens": 142,
- "latencyMs": 3061.706208000018
- },
- {
- "questionId": "q36",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "macey.gottlieb5@yahoo.com",
- "actual": "macey.gottlieb5@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2855,
- "outputTokens": 14,
- "latencyMs": 1053.0741669999843
- },
- {
- "questionId": "q36",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "macey.gottlieb5@yahoo.com",
- "actual": "macey.gottlieb5@yahoo.com",
- "isCorrect": true,
- "inputTokens": 3191,
- "outputTokens": 11,
- "latencyMs": 1576.9219160000212
- },
- {
- "questionId": "q36",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "macey.gottlieb5@yahoo.com",
- "actual": "macey.gottlieb5@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7356,
- "outputTokens": 78,
- "latencyMs": 1889.579624999984
- },
- {
- "questionId": "q36",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "macey.gottlieb5@yahoo.com",
- "actual": "macey.gottlieb5@yahoo.com",
- "isCorrect": true,
- "inputTokens": 9359,
- "outputTokens": 14,
- "latencyMs": 1520.9462920000078
- },
- {
- "questionId": "q36",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "macey.gottlieb5@yahoo.com",
- "actual": "macey.gottlieb5@yahoo.com",
- "isCorrect": true,
- "inputTokens": 9097,
- "outputTokens": 11,
- "latencyMs": 1917.4184999999998
- },
- {
- "questionId": "q36",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "macey.gottlieb5@yahoo.com",
- "actual": "macey.gottlieb5@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5011,
- "outputTokens": 142,
- "latencyMs": 4630.122166999994
- },
- {
- "questionId": "q36",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "macey.gottlieb5@yahoo.com",
- "actual": "macey.gottlieb5@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5759,
- "outputTokens": 14,
- "latencyMs": 1646.354083000013
- },
- {
- "questionId": "q36",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "macey.gottlieb5@yahoo.com",
- "actual": "macey.gottlieb5@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5743,
- "outputTokens": 11,
- "latencyMs": 2197.673375000013
- },
- {
- "questionId": "q37",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "89773",
- "actual": "89773",
- "isCorrect": true,
- "inputTokens": 6389,
- "outputTokens": 72,
- "latencyMs": 3646.0600829999894
- },
- {
- "questionId": "q37",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "89773",
- "actual": "89773",
- "isCorrect": true,
- "inputTokens": 7868,
- "outputTokens": 6,
- "latencyMs": 1356.2343330000003
- },
- {
- "questionId": "q37",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "89773",
- "actual": "89773",
- "isCorrect": true,
- "inputTokens": 7908,
- "outputTokens": 5,
- "latencyMs": 735.1860419999866
- },
- {
- "questionId": "q37",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "89773",
- "actual": "89773",
- "isCorrect": true,
- "inputTokens": 2526,
- "outputTokens": 136,
- "latencyMs": 2701.791499999992
- },
- {
- "questionId": "q37",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "89773",
- "actual": "89773",
- "isCorrect": true,
- "inputTokens": 2980,
- "outputTokens": 6,
- "latencyMs": 1259.3909169999824
- },
- {
- "questionId": "q37",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "89773",
- "actual": "89773",
- "isCorrect": true,
- "inputTokens": 3317,
- "outputTokens": 5,
- "latencyMs": 1960.7033339999907
- },
- {
- "questionId": "q37",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "89773",
- "actual": "89773",
- "isCorrect": true,
- "inputTokens": 2380,
- "outputTokens": 72,
- "latencyMs": 5573.357083999988
- },
- {
- "questionId": "q37",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "89773",
- "actual": "89773",
- "isCorrect": true,
- "inputTokens": 2854,
- "outputTokens": 6,
- "latencyMs": 1284.3673750000016
- },
- {
- "questionId": "q37",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "89773",
- "actual": "89773",
- "isCorrect": true,
- "inputTokens": 3191,
- "outputTokens": 5,
- "latencyMs": 2050.5506659999955
- },
- {
- "questionId": "q37",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "89773",
- "actual": "89773",
- "isCorrect": true,
- "inputTokens": 7356,
- "outputTokens": 136,
- "latencyMs": 3253.602791000012
- },
- {
- "questionId": "q37",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "89773",
- "actual": "89773",
- "isCorrect": true,
- "inputTokens": 9358,
- "outputTokens": 6,
- "latencyMs": 1146.329166999989
- },
- {
- "questionId": "q37",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "89773",
- "actual": "89773",
- "isCorrect": true,
- "inputTokens": 9097,
- "outputTokens": 5,
- "latencyMs": 2395.673125000001
- },
- {
- "questionId": "q37",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "89773",
- "actual": "89773",
- "isCorrect": true,
- "inputTokens": 5011,
- "outputTokens": 72,
- "latencyMs": 2913.434957999998
- },
- {
- "questionId": "q37",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "89773",
- "actual": "89773",
- "isCorrect": true,
- "inputTokens": 5758,
- "outputTokens": 6,
- "latencyMs": 2243.595874999999
- },
- {
- "questionId": "q37",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "89773",
- "actual": "89773",
- "isCorrect": true,
- "inputTokens": 5743,
- "outputTokens": 5,
- "latencyMs": 1839.661374999996
- },
- {
- "questionId": "q38",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 6389,
- "outputTokens": 135,
- "latencyMs": 2779.79579199999
- },
- {
- "questionId": "q38",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7868,
- "outputTokens": 4,
- "latencyMs": 1133.7338750000054
- },
- {
- "questionId": "q38",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7908,
- "outputTokens": 1,
- "latencyMs": 774.6977079999924
- },
- {
- "questionId": "q38",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2526,
- "outputTokens": 71,
- "latencyMs": 4311.999750000017
- },
- {
- "questionId": "q38",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2980,
- "outputTokens": 4,
- "latencyMs": 2223.9427499999874
- },
- {
- "questionId": "q38",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3317,
- "outputTokens": 1,
- "latencyMs": 2975.953125
- },
- {
- "questionId": "q38",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2380,
- "outputTokens": 71,
- "latencyMs": 4617.852291999996
- },
- {
- "questionId": "q38",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 2854,
- "outputTokens": 4,
- "latencyMs": 1096.2197500000184
- },
- {
- "questionId": "q38",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 3191,
- "outputTokens": 1,
- "latencyMs": 2754.3287919999857
- },
- {
- "questionId": "q38",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 7356,
- "outputTokens": 135,
- "latencyMs": 3539.3821250000037
- },
- {
- "questionId": "q38",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9358,
- "outputTokens": 4,
- "latencyMs": 1369.516082999995
- },
- {
- "questionId": "q38",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 9097,
- "outputTokens": 1,
- "latencyMs": 2677.958791000012
- },
- {
- "questionId": "q38",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5011,
- "outputTokens": 71,
- "latencyMs": 2209.974041999987
- },
- {
- "questionId": "q38",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5758,
- "outputTokens": 4,
- "latencyMs": 1352.3056670000078
- },
- {
- "questionId": "q38",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Marketing",
- "actual": "Marketing",
- "isCorrect": true,
- "inputTokens": 5743,
- "outputTokens": 1,
- "latencyMs": 2126.258208000014
- },
- {
- "questionId": "q39",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "georgianna_renner@yahoo.com",
- "actual": "georgianna_renner@yahoo.com",
- "isCorrect": true,
- "inputTokens": 6389,
- "outputTokens": 207,
- "latencyMs": 3999.7677079999994
- },
- {
- "questionId": "q39",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "georgianna_renner@yahoo.com",
- "actual": "georgianna_renner@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7869,
- "outputTokens": 13,
- "latencyMs": 1170.8554579999764
- },
- {
- "questionId": "q39",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "georgianna_renner@yahoo.com",
- "actual": "georgianna_renner@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7908,
- "outputTokens": 10,
- "latencyMs": 1278.5721670000057
- },
- {
- "questionId": "q39",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "georgianna_renner@yahoo.com",
- "actual": "georgianna_renner@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2526,
- "outputTokens": 143,
- "latencyMs": 3334.013791000005
- },
- {
- "questionId": "q39",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "georgianna_renner@yahoo.com",
- "actual": "georgianna_renner@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2981,
- "outputTokens": 13,
- "latencyMs": 1115.4245419999934
- },
- {
- "questionId": "q39",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "georgianna_renner@yahoo.com",
- "actual": "georgianna_renner@yahoo.com",
- "isCorrect": true,
- "inputTokens": 3317,
- "outputTokens": 10,
- "latencyMs": 2555.918707999983
- },
- {
- "questionId": "q39",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "georgianna_renner@yahoo.com",
- "actual": "georgianna_renner@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2380,
- "outputTokens": 143,
- "latencyMs": 2100.1043329999957
- },
- {
- "questionId": "q39",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "georgianna_renner@yahoo.com",
- "actual": "georgianna_renner@yahoo.com",
- "isCorrect": true,
- "inputTokens": 2855,
- "outputTokens": 13,
- "latencyMs": 1298.810999999987
- },
- {
- "questionId": "q39",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "georgianna_renner@yahoo.com",
- "actual": "georgianna_renner@yahoo.com",
- "isCorrect": true,
- "inputTokens": 3191,
- "outputTokens": 10,
- "latencyMs": 1940.2669170000008
- },
- {
- "questionId": "q39",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "georgianna_renner@yahoo.com",
- "actual": "georgianna_renner@yahoo.com",
- "isCorrect": true,
- "inputTokens": 7356,
- "outputTokens": 143,
- "latencyMs": 2666.5189580000006
- },
- {
- "questionId": "q39",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "georgianna_renner@yahoo.com",
- "actual": "georgianna_renner@yahoo.com",
- "isCorrect": true,
- "inputTokens": 9359,
- "outputTokens": 13,
- "latencyMs": 1611.7814170000202
- },
- {
- "questionId": "q39",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "georgianna_renner@yahoo.com",
- "actual": "georgianna_renner@yahoo.com",
- "isCorrect": true,
- "inputTokens": 9097,
- "outputTokens": 10,
- "latencyMs": 1709.3350419999915
- },
- {
- "questionId": "q39",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "georgianna_renner@yahoo.com",
- "actual": "georgianna_renner@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5011,
- "outputTokens": 143,
- "latencyMs": 4774.929042000003
- },
- {
- "questionId": "q39",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "georgianna_renner@yahoo.com",
- "actual": "georgianna_renner@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5759,
- "outputTokens": 13,
- "latencyMs": 1369.8504160000011
- },
- {
- "questionId": "q39",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "georgianna_renner@yahoo.com",
- "actual": "georgianna_renner@yahoo.com",
- "isCorrect": true,
- "inputTokens": 5743,
- "outputTokens": 10,
- "latencyMs": 3123.9857920000213
- },
- {
- "questionId": "q40",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "49741",
- "actual": "49741",
- "isCorrect": true,
- "inputTokens": 6390,
- "outputTokens": 72,
- "latencyMs": 2700.2800830000197
- },
- {
- "questionId": "q40",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "49741",
- "actual": "49741",
- "isCorrect": true,
- "inputTokens": 7871,
- "outputTokens": 6,
- "latencyMs": 1145.983292000019
- },
- {
- "questionId": "q40",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "49741",
- "actual": "49741",
- "isCorrect": true,
- "inputTokens": 7909,
- "outputTokens": 5,
- "latencyMs": 952.1742089999898
- },
- {
- "questionId": "q40",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "49741",
- "actual": "49741",
- "isCorrect": true,
- "inputTokens": 2527,
- "outputTokens": 72,
- "latencyMs": 2220.3111250000075
- },
- {
- "questionId": "q40",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "49741",
- "actual": "49741",
- "isCorrect": true,
- "inputTokens": 2983,
- "outputTokens": 6,
- "latencyMs": 981.9718339999963
- },
- {
- "questionId": "q40",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "49741",
- "actual": "49741",
- "isCorrect": true,
- "inputTokens": 3318,
- "outputTokens": 5,
- "latencyMs": 2079.9035830000066
- },
- {
- "questionId": "q40",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "49741",
- "actual": "49741",
- "isCorrect": true,
- "inputTokens": 2381,
- "outputTokens": 136,
- "latencyMs": 2519.2579590000096
- },
- {
- "questionId": "q40",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "49741",
- "actual": "144426",
- "isCorrect": false,
- "inputTokens": 2857,
- "outputTokens": 6,
- "latencyMs": 942.0043329999899
- },
- {
- "questionId": "q40",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "49741",
- "actual": "49741",
- "isCorrect": true,
- "inputTokens": 3192,
- "outputTokens": 5,
- "latencyMs": 1683.0637080000015
- },
- {
- "questionId": "q40",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "49741",
- "actual": "49741",
- "isCorrect": true,
- "inputTokens": 7357,
- "outputTokens": 72,
- "latencyMs": 2190.1603750000068
- },
- {
- "questionId": "q40",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "49741",
- "actual": "49741",
- "isCorrect": true,
- "inputTokens": 9361,
- "outputTokens": 6,
- "latencyMs": 1771.8361250000016
- },
- {
- "questionId": "q40",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "49741",
- "actual": "49741",
- "isCorrect": true,
- "inputTokens": 9098,
- "outputTokens": 5,
- "latencyMs": 2376.372875000001
- },
- {
- "questionId": "q40",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "49741",
- "actual": "49741",
- "isCorrect": true,
- "inputTokens": 5012,
- "outputTokens": 72,
- "latencyMs": 2355.175791000016
- },
- {
- "questionId": "q40",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "49741",
- "actual": "49741",
- "isCorrect": true,
- "inputTokens": 5761,
- "outputTokens": 6,
- "latencyMs": 1192.191541999986
- },
- {
- "questionId": "q40",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "49741",
- "actual": "49741",
- "isCorrect": true,
- "inputTokens": 5744,
- "outputTokens": 5,
- "latencyMs": 2328.137166999979
- },
- {
- "questionId": "q41",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 6387,
- "outputTokens": 775,
- "latencyMs": 11132.566209000011
- },
- {
- "questionId": "q41",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 7865,
- "outputTokens": 5,
- "latencyMs": 1048.9463749999995
- },
- {
- "questionId": "q41",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "13",
- "isCorrect": false,
- "inputTokens": 7906,
- "outputTokens": 2,
- "latencyMs": 954.9381670000148
- },
- {
- "questionId": "q41",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 2524,
- "outputTokens": 583,
- "latencyMs": 5343.168333000009
- },
- {
- "questionId": "q41",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 2977,
- "outputTokens": 5,
- "latencyMs": 929.4576249999809
- },
- {
- "questionId": "q41",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 3315,
- "outputTokens": 2,
- "latencyMs": 1230.1574160000018
- },
- {
- "questionId": "q41",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 2378,
- "outputTokens": 1415,
- "latencyMs": 16158.150375000027
- },
- {
- "questionId": "q41",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 2851,
- "outputTokens": 5,
- "latencyMs": 932.4995000000054
- },
- {
- "questionId": "q41",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "14",
- "isCorrect": false,
- "inputTokens": 3189,
- "outputTokens": 2,
- "latencyMs": 1859.355958
- },
- {
- "questionId": "q41",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 7354,
- "outputTokens": 903,
- "latencyMs": 11415.376208000001
- },
- {
- "questionId": "q41",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 9355,
- "outputTokens": 5,
- "latencyMs": 1198.3916249999893
- },
- {
- "questionId": "q41",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 9095,
- "outputTokens": 2,
- "latencyMs": 3497.0485409999965
- },
- {
- "questionId": "q41",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 5009,
- "outputTokens": 1031,
- "latencyMs": 10859.450207999995
- },
- {
- "questionId": "q41",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 5755,
- "outputTokens": 5,
- "latencyMs": 2038.0866250000254
- },
- {
- "questionId": "q41",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "13",
- "isCorrect": false,
- "inputTokens": 5741,
- "outputTokens": 2,
- "latencyMs": 1642.4759159999958
- },
- {
- "questionId": "q42",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 6387,
- "outputTokens": 1031,
- "latencyMs": 11081.197666000022
- },
- {
- "questionId": "q42",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 7865,
- "outputTokens": 5,
- "latencyMs": 1095.9497919999994
- },
- {
- "questionId": "q42",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 7906,
- "outputTokens": 2,
- "latencyMs": 1309.7017500000075
- },
- {
- "questionId": "q42",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 2524,
- "outputTokens": 711,
- "latencyMs": 9064.612916999991
- },
- {
- "questionId": "q42",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "14",
- "isCorrect": false,
- "inputTokens": 2977,
- "outputTokens": 5,
- "latencyMs": 1045.4045000000042
- },
- {
- "questionId": "q42",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 3315,
- "outputTokens": 2,
- "latencyMs": 2056.116624999995
- },
- {
- "questionId": "q42",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 2378,
- "outputTokens": 967,
- "latencyMs": 8423.070084000006
- },
- {
- "questionId": "q42",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 2851,
- "outputTokens": 5,
- "latencyMs": 901.4683749999967
- },
- {
- "questionId": "q42",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "14",
- "isCorrect": false,
- "inputTokens": 3189,
- "outputTokens": 2,
- "latencyMs": 2192.902625000017
- },
- {
- "questionId": "q42",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 7354,
- "outputTokens": 647,
- "latencyMs": 9821.846875000017
- },
- {
- "questionId": "q42",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 9355,
- "outputTokens": 5,
- "latencyMs": 1586.0259169999918
- },
- {
- "questionId": "q42",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 9095,
- "outputTokens": 2,
- "latencyMs": 9515.369042000006
- },
- {
- "questionId": "q42",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 5009,
- "outputTokens": 455,
- "latencyMs": 5076.419125000015
- },
- {
- "questionId": "q42",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 5755,
- "outputTokens": 5,
- "latencyMs": 1472.8408340000024
- },
- {
- "questionId": "q42",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 5741,
- "outputTokens": 2,
- "latencyMs": 865.6228749999718
- },
- {
- "questionId": "q43",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 6387,
- "outputTokens": 775,
- "latencyMs": 8729.67633300001
- },
- {
- "questionId": "q43",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 7865,
- "outputTokens": 5,
- "latencyMs": 1217.0473749999946
- },
- {
- "questionId": "q43",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 7906,
- "outputTokens": 2,
- "latencyMs": 1158.2075419999892
- },
- {
- "questionId": "q43",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 2524,
- "outputTokens": 775,
- "latencyMs": 6998.693750000006
- },
- {
- "questionId": "q43",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 2977,
- "outputTokens": 5,
- "latencyMs": 1640.0182080000232
- },
- {
- "questionId": "q43",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "14",
- "isCorrect": false,
- "inputTokens": 3315,
- "outputTokens": 2,
- "latencyMs": 947.1101670000062
- },
- {
- "questionId": "q43",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 2378,
- "outputTokens": 583,
- "latencyMs": 13248.978291000007
- },
- {
- "questionId": "q43",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 2851,
- "outputTokens": 5,
- "latencyMs": 836.4533340000198
- },
- {
- "questionId": "q43",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 3189,
- "outputTokens": 2,
- "latencyMs": 818.1433329999854
- },
- {
- "questionId": "q43",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 7354,
- "outputTokens": 1095,
- "latencyMs": 9890.235916000005
- },
- {
- "questionId": "q43",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 9355,
- "outputTokens": 5,
- "latencyMs": 1320.4134170000034
- },
- {
- "questionId": "q43",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 9095,
- "outputTokens": 2,
- "latencyMs": 4225.577166000003
- },
- {
- "questionId": "q43",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 5009,
- "outputTokens": 1031,
- "latencyMs": 13344.171333000006
- },
- {
- "questionId": "q43",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 5755,
- "outputTokens": 5,
- "latencyMs": 863.8359160000109
- },
- {
- "questionId": "q43",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 5741,
- "outputTokens": 2,
- "latencyMs": 1194.4381250000151
- },
- {
- "questionId": "q44",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 6387,
- "outputTokens": 455,
- "latencyMs": 5239.934833000007
- },
- {
- "questionId": "q44",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 7865,
- "outputTokens": 5,
- "latencyMs": 1124.6063330000034
- },
- {
- "questionId": "q44",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "14",
- "isCorrect": false,
- "inputTokens": 7906,
- "outputTokens": 2,
- "latencyMs": 1525.701040999993
- },
- {
- "questionId": "q44",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 2524,
- "outputTokens": 519,
- "latencyMs": 6195.039833999996
- },
- {
- "questionId": "q44",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 2977,
- "outputTokens": 5,
- "latencyMs": 891.0962500000023
- },
- {
- "questionId": "q44",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "13",
- "isCorrect": false,
- "inputTokens": 3315,
- "outputTokens": 2,
- "latencyMs": 1322.2949580000131
- },
- {
- "questionId": "q44",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 2378,
- "outputTokens": 1543,
- "latencyMs": 16353.942624999996
- },
- {
- "questionId": "q44",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 2851,
- "outputTokens": 5,
- "latencyMs": 861.9590829999943
- },
- {
- "questionId": "q44",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "13",
- "isCorrect": false,
- "inputTokens": 3189,
- "outputTokens": 2,
- "latencyMs": 912.1500829999859
- },
- {
- "questionId": "q44",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 7354,
- "outputTokens": 519,
- "latencyMs": 6838.317749999987
- },
- {
- "questionId": "q44",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 9355,
- "outputTokens": 5,
- "latencyMs": 1875.6236249999783
- },
- {
- "questionId": "q44",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "13",
- "isCorrect": false,
- "inputTokens": 9095,
- "outputTokens": 2,
- "latencyMs": 1482.7477500000095
- },
- {
- "questionId": "q44",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 5009,
- "outputTokens": 1223,
- "latencyMs": 13887.709959
- },
- {
- "questionId": "q44",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "17",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 5755,
- "outputTokens": 5,
- "latencyMs": 1135.573457999999
- },
- {
- "questionId": "q44",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "17",
- "actual": "17",
- "isCorrect": true,
- "inputTokens": 5741,
- "outputTokens": 2,
- "latencyMs": 1063.958209000004
- },
- {
- "questionId": "q45",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "16",
- "actual": "16",
- "isCorrect": true,
- "inputTokens": 6387,
- "outputTokens": 903,
- "latencyMs": 11372.731792000006
- },
- {
- "questionId": "q45",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "16",
- "actual": "12",
- "isCorrect": false,
- "inputTokens": 7865,
- "outputTokens": 5,
- "latencyMs": 1085.2727500000037
- },
- {
- "questionId": "q45",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "16",
- "actual": "14",
- "isCorrect": false,
- "inputTokens": 7906,
- "outputTokens": 2,
- "latencyMs": 788.761582999985
- },
- {
- "questionId": "q45",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "16",
- "actual": "16",
- "isCorrect": true,
- "inputTokens": 2524,
- "outputTokens": 775,
- "latencyMs": 9670.953584000003
- },
- {
- "questionId": "q45",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "16",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 2977,
- "outputTokens": 5,
- "latencyMs": 1307.5495419999934
- },
- {
- "questionId": "q45",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "16",
- "actual": "17",
- "isCorrect": false,
- "inputTokens": 3315,
- "outputTokens": 2,
- "latencyMs": 1034.7324580000131
- },
- {
- "questionId": "q45",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "16",
- "actual": "16",
- "isCorrect": true,
- "inputTokens": 2378,
- "outputTokens": 647,
- "latencyMs": 7079.23558399998
- },
- {
- "questionId": "q45",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "16",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 2851,
- "outputTokens": 5,
- "latencyMs": 1123.2897499999963
- },
- {
- "questionId": "q45",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "16",
- "actual": "13",
- "isCorrect": false,
- "inputTokens": 3189,
- "outputTokens": 2,
- "latencyMs": 1318.0012920000008
- },
- {
- "questionId": "q45",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "16",
- "actual": "16",
- "isCorrect": true,
- "inputTokens": 7354,
- "outputTokens": 583,
- "latencyMs": 5795.2639590000035
- },
- {
- "questionId": "q45",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "16",
- "actual": "12",
- "isCorrect": false,
- "inputTokens": 9355,
- "outputTokens": 5,
- "latencyMs": 1125.9925829999847
- },
- {
- "questionId": "q45",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "16",
- "actual": "16",
- "isCorrect": true,
- "inputTokens": 9095,
- "outputTokens": 2,
- "latencyMs": 8305.401042000012
- },
- {
- "questionId": "q45",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "16",
- "actual": "16",
- "isCorrect": true,
- "inputTokens": 5009,
- "outputTokens": 839,
- "latencyMs": 10189.432124999992
- },
- {
- "questionId": "q45",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "16",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 5755,
- "outputTokens": 5,
- "latencyMs": 1615.4580000000133
- },
- {
- "questionId": "q45",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "16",
- "actual": "10",
- "isCorrect": false,
- "inputTokens": 5741,
- "outputTokens": 2,
- "latencyMs": 1533.5138750000042
- },
- {
- "questionId": "q46",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "16",
- "actual": "16",
- "isCorrect": true,
- "inputTokens": 6387,
- "outputTokens": 519,
- "latencyMs": 7169.378540999984
- },
- {
- "questionId": "q46",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "16",
- "actual": "10",
- "isCorrect": false,
- "inputTokens": 7865,
- "outputTokens": 5,
- "latencyMs": 1133.9953749999986
- },
- {
- "questionId": "q46",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "16",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 7906,
- "outputTokens": 2,
- "latencyMs": 1018.8396669999929
- },
- {
- "questionId": "q46",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "16",
- "actual": "16",
- "isCorrect": true,
- "inputTokens": 2524,
- "outputTokens": 647,
- "latencyMs": 6637.351416999998
- },
- {
- "questionId": "q46",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "16",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 2977,
- "outputTokens": 5,
- "latencyMs": 864.9015839999774
- },
- {
- "questionId": "q46",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "16",
- "actual": "17",
- "isCorrect": false,
- "inputTokens": 3315,
- "outputTokens": 2,
- "latencyMs": 992.5710419999959
- },
- {
- "questionId": "q46",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "16",
- "actual": "16",
- "isCorrect": true,
- "inputTokens": 2378,
- "outputTokens": 839,
- "latencyMs": 7426.826874999999
- },
- {
- "questionId": "q46",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "16",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 2851,
- "outputTokens": 5,
- "latencyMs": 893.4481660000165
- },
- {
- "questionId": "q46",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "16",
- "actual": "13",
- "isCorrect": false,
- "inputTokens": 3189,
- "outputTokens": 2,
- "latencyMs": 1200.8498329999857
- },
- {
- "questionId": "q46",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "16",
- "actual": "16",
- "isCorrect": true,
- "inputTokens": 7354,
- "outputTokens": 775,
- "latencyMs": 8865.971332999994
- },
- {
- "questionId": "q46",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "16",
- "actual": "10",
- "isCorrect": false,
- "inputTokens": 9355,
- "outputTokens": 5,
- "latencyMs": 1491.2856249999895
- },
- {
- "questionId": "q46",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "16",
- "actual": "17",
- "isCorrect": false,
- "inputTokens": 9095,
- "outputTokens": 2,
- "latencyMs": 1216.2892920000013
- },
- {
- "questionId": "q46",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "16",
- "actual": "16",
- "isCorrect": true,
- "inputTokens": 5009,
- "outputTokens": 839,
- "latencyMs": 9403.812124999997
- },
- {
- "questionId": "q46",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "16",
- "actual": "12",
- "isCorrect": false,
- "inputTokens": 5755,
- "outputTokens": 5,
- "latencyMs": 1126.5797500000044
- },
- {
- "questionId": "q46",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "16",
- "actual": "17",
- "isCorrect": false,
- "inputTokens": 5741,
- "outputTokens": 2,
- "latencyMs": 1671.0382089999912
- },
- {
- "questionId": "q47",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "91",
- "actual": "91",
- "isCorrect": true,
- "inputTokens": 6392,
- "outputTokens": 1671,
- "latencyMs": 15363.507083999983
- },
- {
- "questionId": "q47",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "91",
- "actual": "89",
- "isCorrect": false,
- "inputTokens": 7870,
- "outputTokens": 5,
- "latencyMs": 1189.3042910000077
- },
- {
- "questionId": "q47",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "91",
- "actual": "90",
- "isCorrect": false,
- "inputTokens": 7914,
- "outputTokens": 2,
- "latencyMs": 1651.3950829999812
- },
- {
- "questionId": "q47",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "91",
- "actual": "91",
- "isCorrect": true,
- "inputTokens": 2529,
- "outputTokens": 2311,
- "latencyMs": 21706.56012499999
- },
- {
- "questionId": "q47",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "91",
- "actual": "85",
- "isCorrect": false,
- "inputTokens": 2982,
- "outputTokens": 5,
- "latencyMs": 1338.67408300002
- },
- {
- "questionId": "q47",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "91",
- "actual": "91",
- "isCorrect": true,
- "inputTokens": 3323,
- "outputTokens": 2,
- "latencyMs": 12844.911791999999
- },
- {
- "questionId": "q47",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "91",
- "actual": "91",
- "isCorrect": true,
- "inputTokens": 2383,
- "outputTokens": 2823,
- "latencyMs": 16151.116582999995
- },
- {
- "questionId": "q47",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "91",
- "actual": "85",
- "isCorrect": false,
- "inputTokens": 2856,
- "outputTokens": 5,
- "latencyMs": 3041.4831669999985
- },
- {
- "questionId": "q47",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "91",
- "actual": "91",
- "isCorrect": true,
- "inputTokens": 3197,
- "outputTokens": 2,
- "latencyMs": 12006.398833000014
- },
- {
- "questionId": "q47",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "91",
- "actual": "91",
- "isCorrect": true,
- "inputTokens": 7359,
- "outputTokens": 2695,
- "latencyMs": 26044.306083000003
- },
- {
- "questionId": "q47",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "91",
- "actual": "89",
- "isCorrect": false,
- "inputTokens": 9360,
- "outputTokens": 5,
- "latencyMs": 1573.8229160000046
- },
- {
- "questionId": "q47",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "91",
- "actual": "91",
- "isCorrect": true,
- "inputTokens": 9103,
- "outputTokens": 2,
- "latencyMs": 27838.932499999995
- },
- {
- "questionId": "q47",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "91",
- "actual": "91",
- "isCorrect": true,
- "inputTokens": 5014,
- "outputTokens": 2823,
- "latencyMs": 22628.083542000008
- },
- {
- "questionId": "q47",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "91",
- "actual": "89",
- "isCorrect": false,
- "inputTokens": 5760,
- "outputTokens": 5,
- "latencyMs": 1787.638666999992
- },
- {
- "questionId": "q47",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "91",
- "actual": "90",
- "isCorrect": false,
- "inputTokens": 5749,
- "outputTokens": 2,
- "latencyMs": 1343.8462499999732
- },
- {
- "questionId": "q48",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "67",
- "actual": "67",
- "isCorrect": true,
- "inputTokens": 6392,
- "outputTokens": 1479,
- "latencyMs": 14420.83845900002
- },
- {
- "questionId": "q48",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "67",
- "actual": "57",
- "isCorrect": false,
- "inputTokens": 7870,
- "outputTokens": 5,
- "latencyMs": 1271.2462919999962
- },
- {
- "questionId": "q48",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "67",
- "actual": "70",
- "isCorrect": false,
- "inputTokens": 7914,
- "outputTokens": 2,
- "latencyMs": 1108.4178750000137
- },
- {
- "questionId": "q48",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "67",
- "actual": "67",
- "isCorrect": true,
- "inputTokens": 2529,
- "outputTokens": 2247,
- "latencyMs": 18434.695834000013
- },
- {
- "questionId": "q48",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "67",
- "actual": "47",
- "isCorrect": false,
- "inputTokens": 2982,
- "outputTokens": 5,
- "latencyMs": 1125.2875420000055
- },
- {
- "questionId": "q48",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "67",
- "actual": "60",
- "isCorrect": false,
- "inputTokens": 3323,
- "outputTokens": 2,
- "latencyMs": 13027.224332999991
- },
- {
- "questionId": "q48",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "67",
- "actual": "67",
- "isCorrect": true,
- "inputTokens": 2383,
- "outputTokens": 2503,
- "latencyMs": 23294.861958000023
- },
- {
- "questionId": "q48",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "67",
- "actual": "47",
- "isCorrect": false,
- "inputTokens": 2856,
- "outputTokens": 5,
- "latencyMs": 1208.8763340000005
- },
- {
- "questionId": "q48",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "67",
- "actual": "67",
- "isCorrect": true,
- "inputTokens": 3197,
- "outputTokens": 2,
- "latencyMs": 11604.352749999991
- },
- {
- "questionId": "q48",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "67",
- "actual": "67",
- "isCorrect": true,
- "inputTokens": 7359,
- "outputTokens": 1479,
- "latencyMs": 18504.804959
- },
- {
- "questionId": "q48",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "67",
- "actual": "57",
- "isCorrect": false,
- "inputTokens": 9360,
- "outputTokens": 5,
- "latencyMs": 1127.928917000012
- },
- {
- "questionId": "q48",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "67",
- "actual": "67",
- "isCorrect": true,
- "inputTokens": 9103,
- "outputTokens": 2,
- "latencyMs": 22629.69987500002
- },
- {
- "questionId": "q48",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "67",
- "actual": "67",
- "isCorrect": true,
- "inputTokens": 5014,
- "outputTokens": 2631,
- "latencyMs": 93677.45470900001
- },
- {
- "questionId": "q48",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "67",
- "actual": "57",
- "isCorrect": false,
- "inputTokens": 5760,
- "outputTokens": 5,
- "latencyMs": 1083.3742910000146
- },
- {
- "questionId": "q48",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "67",
- "actual": "70",
- "isCorrect": false,
- "inputTokens": 5749,
- "outputTokens": 2,
- "latencyMs": 1435.5812079999887
- },
- {
- "questionId": "q49",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "41",
- "actual": "41",
- "isCorrect": true,
- "inputTokens": 6392,
- "outputTokens": 1543,
- "latencyMs": 14267.44858299999
- },
- {
- "questionId": "q49",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "41",
- "actual": "31",
- "isCorrect": false,
- "inputTokens": 7870,
- "outputTokens": 5,
- "latencyMs": 1483.0176250000077
- },
- {
- "questionId": "q49",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "41",
- "actual": "40",
- "isCorrect": false,
- "inputTokens": 7915,
- "outputTokens": 2,
- "latencyMs": 1598.6212089999754
- },
- {
- "questionId": "q49",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "41",
- "actual": "41",
- "isCorrect": true,
- "inputTokens": 2529,
- "outputTokens": 1671,
- "latencyMs": 15241.04254200001
- },
- {
- "questionId": "q49",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "41",
- "actual": "27",
- "isCorrect": false,
- "inputTokens": 2982,
- "outputTokens": 5,
- "latencyMs": 1011.390458000009
- },
- {
- "questionId": "q49",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "41",
- "actual": "41",
- "isCorrect": true,
- "inputTokens": 3324,
- "outputTokens": 2,
- "latencyMs": 17035.035957999993
- },
- {
- "questionId": "q49",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "41",
- "actual": "41",
- "isCorrect": true,
- "inputTokens": 2383,
- "outputTokens": 1799,
- "latencyMs": 15270.303583
- },
- {
- "questionId": "q49",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "41",
- "actual": "31",
- "isCorrect": false,
- "inputTokens": 2856,
- "outputTokens": 5,
- "latencyMs": 919.8500000000058
- },
- {
- "questionId": "q49",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "41",
- "actual": "41",
- "isCorrect": true,
- "inputTokens": 3198,
- "outputTokens": 2,
- "latencyMs": 9191.171333000006
- },
- {
- "questionId": "q49",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "41",
- "actual": "42",
- "isCorrect": false,
- "inputTokens": 7359,
- "outputTokens": 1479,
- "latencyMs": 14804.62512500002
- },
- {
- "questionId": "q49",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "41",
- "actual": "31",
- "isCorrect": false,
- "inputTokens": 9360,
- "outputTokens": 5,
- "latencyMs": 1236.6115409999911
- },
- {
- "questionId": "q49",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "41",
- "actual": "41",
- "isCorrect": true,
- "inputTokens": 9104,
- "outputTokens": 2,
- "latencyMs": 19284.10699999999
- },
- {
- "questionId": "q49",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "41",
- "actual": "41",
- "isCorrect": true,
- "inputTokens": 5014,
- "outputTokens": 1863,
- "latencyMs": 17259.288042
- },
- {
- "questionId": "q49",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "41",
- "actual": "31",
- "isCorrect": false,
- "inputTokens": 5760,
- "outputTokens": 5,
- "latencyMs": 1715.9734999999928
- },
- {
- "questionId": "q49",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "41",
- "actual": "44",
- "isCorrect": false,
- "inputTokens": 5750,
- "outputTokens": 2,
- "latencyMs": 1872.7845830000006
- },
- {
- "questionId": "q50",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "26",
- "actual": "26",
- "isCorrect": true,
- "inputTokens": 6392,
- "outputTokens": 1543,
- "latencyMs": 15919.779666999995
- },
- {
- "questionId": "q50",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "26",
- "actual": "20",
- "isCorrect": false,
- "inputTokens": 7870,
- "outputTokens": 5,
- "latencyMs": 1291.8912500000151
- },
- {
- "questionId": "q50",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "26",
- "actual": "24",
- "isCorrect": false,
- "inputTokens": 7915,
- "outputTokens": 2,
- "latencyMs": 1005.6952080000192
- },
- {
- "questionId": "q50",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "26",
- "actual": "26",
- "isCorrect": true,
- "inputTokens": 2529,
- "outputTokens": 1287,
- "latencyMs": 30941.076040999993
- },
- {
- "questionId": "q50",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "26",
- "actual": "16",
- "isCorrect": false,
- "inputTokens": 2982,
- "outputTokens": 5,
- "latencyMs": 1114.022666999983
- },
- {
- "questionId": "q50",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "26",
- "actual": "26",
- "isCorrect": true,
- "inputTokens": 3324,
- "outputTokens": 2,
- "latencyMs": 17484.997459000006
- },
- {
- "questionId": "q50",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "26",
- "actual": "26",
- "isCorrect": true,
- "inputTokens": 2383,
- "outputTokens": 1735,
- "latencyMs": 16410.497957999993
- },
- {
- "questionId": "q50",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "26",
- "actual": "16",
- "isCorrect": false,
- "inputTokens": 2856,
- "outputTokens": 5,
- "latencyMs": 1096.8193330000213
- },
- {
- "questionId": "q50",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "26",
- "actual": "26",
- "isCorrect": true,
- "inputTokens": 3198,
- "outputTokens": 2,
- "latencyMs": 14324.279708000016
- },
- {
- "questionId": "q50",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "26",
- "actual": "26",
- "isCorrect": true,
- "inputTokens": 7359,
- "outputTokens": 1543,
- "latencyMs": 15139.200333999994
- },
- {
- "questionId": "q50",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "26",
- "actual": "21",
- "isCorrect": false,
- "inputTokens": 9360,
- "outputTokens": 5,
- "latencyMs": 1152.736042000004
- },
- {
- "questionId": "q50",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "26",
- "actual": "26",
- "isCorrect": true,
- "inputTokens": 9104,
- "outputTokens": 2,
- "latencyMs": 19624.726874999993
- },
- {
- "questionId": "q50",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "26",
- "actual": "26",
- "isCorrect": true,
- "inputTokens": 5014,
- "outputTokens": 1031,
- "latencyMs": 7884.299167000019
- },
- {
- "questionId": "q50",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "26",
- "actual": "20",
- "isCorrect": false,
- "inputTokens": 5760,
- "outputTokens": 5,
- "latencyMs": 984.3461250000109
- },
- {
- "questionId": "q50",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "26",
- "actual": "30",
- "isCorrect": false,
- "inputTokens": 5750,
- "outputTokens": 2,
- "latencyMs": 1294.497417000006
- },
- {
- "questionId": "q51",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "78",
- "actual": "78",
- "isCorrect": true,
- "inputTokens": 6386,
- "outputTokens": 2695,
- "latencyMs": 25757.74325
- },
- {
- "questionId": "q51",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "78",
- "actual": "81",
- "isCorrect": false,
- "inputTokens": 7864,
- "outputTokens": 5,
- "latencyMs": 1330.1275409999944
- },
- {
- "questionId": "q51",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "78",
- "actual": "78",
- "isCorrect": true,
- "inputTokens": 7905,
- "outputTokens": 2,
- "latencyMs": 11349.042874999985
- },
- {
- "questionId": "q51",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "78",
- "actual": "78",
- "isCorrect": true,
- "inputTokens": 2523,
- "outputTokens": 2119,
- "latencyMs": 31391.252624999994
- },
- {
- "questionId": "q51",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "78",
- "actual": "78",
- "isCorrect": true,
- "inputTokens": 2976,
- "outputTokens": 5,
- "latencyMs": 1051.2665419999976
- },
- {
- "questionId": "q51",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "78",
- "actual": "78",
- "isCorrect": true,
- "inputTokens": 3314,
- "outputTokens": 2,
- "latencyMs": 9630.083915999974
- },
- {
- "questionId": "q51",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "78",
- "actual": "84",
- "isCorrect": false,
- "inputTokens": 2377,
- "outputTokens": 1863,
- "latencyMs": 15133.794208000007
- },
- {
- "questionId": "q51",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "78",
- "actual": "73",
- "isCorrect": false,
- "inputTokens": 2850,
- "outputTokens": 5,
- "latencyMs": 952.5605000000214
- },
- {
- "questionId": "q51",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "78",
- "actual": "78",
- "isCorrect": true,
- "inputTokens": 3188,
- "outputTokens": 2,
- "latencyMs": 11450.481040999992
- },
- {
- "questionId": "q51",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "78",
- "actual": "78",
- "isCorrect": true,
- "inputTokens": 7353,
- "outputTokens": 903,
- "latencyMs": 32111.97775000002
- },
- {
- "questionId": "q51",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "78",
- "actual": "77",
- "isCorrect": false,
- "inputTokens": 9354,
- "outputTokens": 5,
- "latencyMs": 2015.6932080000115
- },
- {
- "questionId": "q51",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "78",
- "actual": "78",
- "isCorrect": true,
- "inputTokens": 9094,
- "outputTokens": 2,
- "latencyMs": 11316.587916999997
- },
- {
- "questionId": "q51",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "78",
- "actual": "78",
- "isCorrect": true,
- "inputTokens": 5008,
- "outputTokens": 1607,
- "latencyMs": 17228.22670900001
- },
- {
- "questionId": "q51",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "78",
- "actual": "77",
- "isCorrect": false,
- "inputTokens": 5754,
- "outputTokens": 5,
- "latencyMs": 1434.8912919999857
- },
- {
- "questionId": "q51",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "78",
- "actual": "78",
- "isCorrect": true,
- "inputTokens": 5740,
- "outputTokens": 2,
- "latencyMs": 15144.007791000011
- },
- {
- "questionId": "q52",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "22",
- "actual": "21",
- "isCorrect": false,
- "inputTokens": 6386,
- "outputTokens": 839,
- "latencyMs": 8969.827833999996
- },
- {
- "questionId": "q52",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "22",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 7864,
- "outputTokens": 5,
- "latencyMs": 1038.1520420000015
- },
- {
- "questionId": "q52",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "22",
- "actual": "22",
- "isCorrect": true,
- "inputTokens": 7905,
- "outputTokens": 2,
- "latencyMs": 8416.65183399999
- },
- {
- "questionId": "q52",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "22",
- "actual": "22",
- "isCorrect": true,
- "inputTokens": 2523,
- "outputTokens": 967,
- "latencyMs": 9633.799374999973
- },
- {
- "questionId": "q52",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "22",
- "actual": "16",
- "isCorrect": false,
- "inputTokens": 2976,
- "outputTokens": 5,
- "latencyMs": 1134.1007079999836
- },
- {
- "questionId": "q52",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "22",
- "actual": "22",
- "isCorrect": true,
- "inputTokens": 3314,
- "outputTokens": 2,
- "latencyMs": 11542.581249999988
- },
- {
- "questionId": "q52",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "22",
- "actual": "24",
- "isCorrect": false,
- "inputTokens": 2377,
- "outputTokens": 2695,
- "latencyMs": 41106.853249999986
- },
- {
- "questionId": "q52",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "22",
- "actual": "20",
- "isCorrect": false,
- "inputTokens": 2850,
- "outputTokens": 5,
- "latencyMs": 918.981958999997
- },
- {
- "questionId": "q52",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "22",
- "actual": "22",
- "isCorrect": true,
- "inputTokens": 3188,
- "outputTokens": 2,
- "latencyMs": 2052.5287920000264
- },
- {
- "questionId": "q52",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "22",
- "actual": "22",
- "isCorrect": true,
- "inputTokens": 7353,
- "outputTokens": 839,
- "latencyMs": 8334.775790999993
- },
- {
- "questionId": "q52",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "22",
- "actual": "15",
- "isCorrect": false,
- "inputTokens": 9354,
- "outputTokens": 5,
- "latencyMs": 949.7613340000098
- },
- {
- "questionId": "q52",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "22",
- "actual": "22",
- "isCorrect": true,
- "inputTokens": 9094,
- "outputTokens": 2,
- "latencyMs": 10658.192250000022
- },
- {
- "questionId": "q52",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "22",
- "actual": "22",
- "isCorrect": true,
- "inputTokens": 5008,
- "outputTokens": 1991,
- "latencyMs": 14355.515540999972
- },
- {
- "questionId": "q52",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "22",
- "actual": "16",
- "isCorrect": false,
- "inputTokens": 5754,
- "outputTokens": 5,
- "latencyMs": 1039.7822079999896
- },
- {
- "questionId": "q52",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "22",
- "actual": "22",
- "isCorrect": true,
- "inputTokens": 5740,
- "outputTokens": 2,
- "latencyMs": 12535.245041999995
- },
- {
- "questionId": "q53",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "12",
- "actual": "12",
- "isCorrect": true,
- "inputTokens": 6394,
- "outputTokens": 1223,
- "latencyMs": 11632.450709000026
- },
- {
- "questionId": "q53",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "12",
- "actual": "9",
- "isCorrect": false,
- "inputTokens": 7872,
- "outputTokens": 5,
- "latencyMs": 1179.524166999996
- },
- {
- "questionId": "q53",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "12",
- "actual": "12",
- "isCorrect": true,
- "inputTokens": 7916,
- "outputTokens": 2,
- "latencyMs": 4426.7412919999915
- },
- {
- "questionId": "q53",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "12",
- "actual": "12",
- "isCorrect": true,
- "inputTokens": 2531,
- "outputTokens": 1799,
- "latencyMs": 21729.542084000015
- },
- {
- "questionId": "q53",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "12",
- "actual": "9",
- "isCorrect": false,
- "inputTokens": 2984,
- "outputTokens": 5,
- "latencyMs": 3320.943874999997
- },
- {
- "questionId": "q53",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "12",
- "actual": "12",
- "isCorrect": true,
- "inputTokens": 3325,
- "outputTokens": 2,
- "latencyMs": 5572.28795800003
- },
- {
- "questionId": "q53",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "12",
- "actual": "12",
- "isCorrect": true,
- "inputTokens": 2385,
- "outputTokens": 1479,
- "latencyMs": 23517.660458
- },
- {
- "questionId": "q53",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "12",
- "actual": "10",
- "isCorrect": false,
- "inputTokens": 2858,
- "outputTokens": 5,
- "latencyMs": 1028.1668340000033
- },
- {
- "questionId": "q53",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "12",
- "actual": "13",
- "isCorrect": false,
- "inputTokens": 3199,
- "outputTokens": 2,
- "latencyMs": 21513.301958999975
- },
- {
- "questionId": "q53",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "12",
- "actual": "12",
- "isCorrect": true,
- "inputTokens": 7361,
- "outputTokens": 1415,
- "latencyMs": 25169.729082999984
- },
- {
- "questionId": "q53",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "12",
- "actual": "11",
- "isCorrect": false,
- "inputTokens": 9362,
- "outputTokens": 5,
- "latencyMs": 1306.0004590000026
- },
- {
- "questionId": "q53",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "12",
- "actual": "12",
- "isCorrect": true,
- "inputTokens": 9105,
- "outputTokens": 2,
- "latencyMs": 22791.16737499999
- },
- {
- "questionId": "q53",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "12",
- "actual": "12",
- "isCorrect": true,
- "inputTokens": 5016,
- "outputTokens": 1415,
- "latencyMs": 18191.111124999996
- },
- {
- "questionId": "q53",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "12",
- "actual": "10",
- "isCorrect": false,
- "inputTokens": 5762,
- "outputTokens": 5,
- "latencyMs": 927.1151660000323
- },
- {
- "questionId": "q53",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "12",
- "actual": "13",
- "isCorrect": false,
- "inputTokens": 5751,
- "outputTokens": 2,
- "latencyMs": 5849.65625
- },
- {
- "questionId": "q54",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 6394,
- "outputTokens": 1543,
- "latencyMs": 17624.57283399999
- },
- {
- "questionId": "q54",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "11",
- "actual": "7",
- "isCorrect": false,
- "inputTokens": 7872,
- "outputTokens": 5,
- "latencyMs": 1445.3690829999978
- },
- {
- "questionId": "q54",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 7916,
- "outputTokens": 2,
- "latencyMs": 4641.89829099999
- },
- {
- "questionId": "q54",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 2531,
- "outputTokens": 1095,
- "latencyMs": 16408.578749999986
- },
- {
- "questionId": "q54",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "11",
- "actual": "6",
- "isCorrect": false,
- "inputTokens": 2984,
- "outputTokens": 5,
- "latencyMs": 1336.712916999997
- },
- {
- "questionId": "q54",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 3325,
- "outputTokens": 2,
- "latencyMs": 5775.600584
- },
- {
- "questionId": "q54",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 2385,
- "outputTokens": 1479,
- "latencyMs": 15717.845583999995
- },
- {
- "questionId": "q54",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "11",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 2858,
- "outputTokens": 5,
- "latencyMs": 2198.0668749999604
- },
- {
- "questionId": "q54",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 3199,
- "outputTokens": 2,
- "latencyMs": 37479.52691700001
- },
- {
- "questionId": "q54",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 7361,
- "outputTokens": 1095,
- "latencyMs": 10663.58587499999
- },
- {
- "questionId": "q54",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "11",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 9362,
- "outputTokens": 5,
- "latencyMs": 1077.469374999986
- },
- {
- "questionId": "q54",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 9105,
- "outputTokens": 2,
- "latencyMs": 16569.429416999978
- },
- {
- "questionId": "q54",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 5016,
- "outputTokens": 1415,
- "latencyMs": 15212.04125000001
- },
- {
- "questionId": "q54",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "11",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 5762,
- "outputTokens": 5,
- "latencyMs": 935.8371249999618
- },
- {
- "questionId": "q54",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "11",
- "actual": "10",
- "isCorrect": false,
- "inputTokens": 5751,
- "outputTokens": 2,
- "latencyMs": 5121.037708000047
- },
- {
- "questionId": "q55",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 6394,
- "outputTokens": 1095,
- "latencyMs": 34446.65704199998
- },
- {
- "questionId": "q55",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "11",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 7872,
- "outputTokens": 5,
- "latencyMs": 2282.8374170000316
- },
- {
- "questionId": "q55",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 7916,
- "outputTokens": 2,
- "latencyMs": 5432.8123749999795
- },
- {
- "questionId": "q55",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 2531,
- "outputTokens": 1479,
- "latencyMs": 42719.131124999956
- },
- {
- "questionId": "q55",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "11",
- "actual": "7",
- "isCorrect": false,
- "inputTokens": 2984,
- "outputTokens": 5,
- "latencyMs": 1832.9572909999988
- },
- {
- "questionId": "q55",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 3325,
- "outputTokens": 2,
- "latencyMs": 7711.211624999996
- },
- {
- "questionId": "q55",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 2385,
- "outputTokens": 1607,
- "latencyMs": 57515.48358300002
- },
- {
- "questionId": "q55",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "11",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 2858,
- "outputTokens": 5,
- "latencyMs": 3238.0369170000195
- },
- {
- "questionId": "q55",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 3199,
- "outputTokens": 2,
- "latencyMs": 9271.402125000022
- },
- {
- "questionId": "q55",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 7361,
- "outputTokens": 967,
- "latencyMs": 12946.014833999972
- },
- {
- "questionId": "q55",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "11",
- "actual": "9",
- "isCorrect": false,
- "inputTokens": 9362,
- "outputTokens": 5,
- "latencyMs": 1523.2371250000433
- },
- {
- "questionId": "q55",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 9105,
- "outputTokens": 2,
- "latencyMs": 11301.93191600003
- },
- {
- "questionId": "q55",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 5016,
- "outputTokens": 1351,
- "latencyMs": 18129.383040999994
- },
- {
- "questionId": "q55",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "11",
- "actual": "9",
- "isCorrect": false,
- "inputTokens": 5762,
- "outputTokens": 5,
- "latencyMs": 1117.6802920000046
- },
- {
- "questionId": "q55",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 5751,
- "outputTokens": 2,
- "latencyMs": 4743.260083000001
- },
- {
- "questionId": "q56",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "12",
- "actual": "11",
- "isCorrect": false,
- "inputTokens": 6394,
- "outputTokens": 1479,
- "latencyMs": 12632.222667000024
- },
- {
- "questionId": "q56",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "12",
- "actual": "7",
- "isCorrect": false,
- "inputTokens": 7872,
- "outputTokens": 5,
- "latencyMs": 1567.1472920000087
- },
- {
- "questionId": "q56",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "12",
- "actual": "12",
- "isCorrect": true,
- "inputTokens": 7916,
- "outputTokens": 2,
- "latencyMs": 5749.258750000037
- },
- {
- "questionId": "q56",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "12",
- "actual": "12",
- "isCorrect": true,
- "inputTokens": 2531,
- "outputTokens": 1479,
- "latencyMs": 17473.24116700003
- },
- {
- "questionId": "q56",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "12",
- "actual": "6",
- "isCorrect": false,
- "inputTokens": 2984,
- "outputTokens": 5,
- "latencyMs": 922.2049170000246
- },
- {
- "questionId": "q56",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "12",
- "actual": "12",
- "isCorrect": true,
- "inputTokens": 3325,
- "outputTokens": 2,
- "latencyMs": 5561.690833000001
- },
- {
- "questionId": "q56",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "12",
- "actual": "11",
- "isCorrect": false,
- "inputTokens": 2385,
- "outputTokens": 2183,
- "latencyMs": 23539.67433399998
- },
- {
- "questionId": "q56",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "12",
- "actual": "7",
- "isCorrect": false,
- "inputTokens": 2858,
- "outputTokens": 5,
- "latencyMs": 1159.2557500000112
- },
- {
- "questionId": "q56",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "12",
- "actual": "12",
- "isCorrect": true,
- "inputTokens": 3199,
- "outputTokens": 2,
- "latencyMs": 9863.856417000003
- },
- {
- "questionId": "q56",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "12",
- "actual": "12",
- "isCorrect": true,
- "inputTokens": 7361,
- "outputTokens": 1927,
- "latencyMs": 106756.24308399996
- },
- {
- "questionId": "q56",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "12",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 9362,
- "outputTokens": 5,
- "latencyMs": 1064.2161659999983
- },
- {
- "questionId": "q56",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "12",
- "actual": "12",
- "isCorrect": true,
- "inputTokens": 9105,
- "outputTokens": 2,
- "latencyMs": 7033.105833999987
- },
- {
- "questionId": "q56",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "12",
- "actual": "12",
- "isCorrect": true,
- "inputTokens": 5016,
- "outputTokens": 1095,
- "latencyMs": 14048.506916999992
- },
- {
- "questionId": "q56",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "12",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 5762,
- "outputTokens": 5,
- "latencyMs": 1192.642125000013
- },
- {
- "questionId": "q56",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "12",
- "actual": "12",
- "isCorrect": true,
- "inputTokens": 5751,
- "outputTokens": 2,
- "latencyMs": 5957.613042000041
- },
- {
- "questionId": "q57",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "62",
- "actual": "62",
- "isCorrect": true,
- "inputTokens": 6393,
- "outputTokens": 3719,
- "latencyMs": 332341.88812499994
- },
- {
- "questionId": "q57",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "62",
- "actual": "62",
- "isCorrect": true,
- "inputTokens": 7872,
- "outputTokens": 5,
- "latencyMs": 1168.1113340000156
- },
- {
- "questionId": "q57",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "62",
- "actual": "62",
- "isCorrect": true,
- "inputTokens": 7912,
- "outputTokens": 2,
- "latencyMs": 20747.95541699999
- },
- {
- "questionId": "q57",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "62",
- "actual": "62",
- "isCorrect": true,
- "inputTokens": 2530,
- "outputTokens": 3079,
- "latencyMs": 24893.890125000034
- },
- {
- "questionId": "q57",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "62",
- "actual": "62",
- "isCorrect": true,
- "inputTokens": 2984,
- "outputTokens": 5,
- "latencyMs": 1446.5637920000008
- },
- {
- "questionId": "q57",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "62",
- "actual": "62",
- "isCorrect": true,
- "inputTokens": 3321,
- "outputTokens": 2,
- "latencyMs": 18187.491625000024
- },
- {
- "questionId": "q57",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "62",
- "actual": "64",
- "isCorrect": false,
- "inputTokens": 2384,
- "outputTokens": 4551,
- "latencyMs": 61990.75604200002
- },
- {
- "questionId": "q57",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "62",
- "actual": "62",
- "isCorrect": true,
- "inputTokens": 2858,
- "outputTokens": 5,
- "latencyMs": 2368.5950840000296
- },
- {
- "questionId": "q57",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "62",
- "actual": "62",
- "isCorrect": true,
- "inputTokens": 3195,
- "outputTokens": 2,
- "latencyMs": 19295.422582999978
- },
- {
- "questionId": "q57",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "62",
- "actual": "62",
- "isCorrect": true,
- "inputTokens": 7360,
- "outputTokens": 3015,
- "latencyMs": 27433.851124999986
- },
- {
- "questionId": "q57",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "62",
- "actual": "62",
- "isCorrect": true,
- "inputTokens": 9362,
- "outputTokens": 5,
- "latencyMs": 1239.7937919999822
- },
- {
- "questionId": "q57",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "62",
- "actual": "62",
- "isCorrect": true,
- "inputTokens": 9101,
- "outputTokens": 2,
- "latencyMs": 21703.45670800004
- },
- {
- "questionId": "q57",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "62",
- "actual": "62",
- "isCorrect": true,
- "inputTokens": 5015,
- "outputTokens": 4615,
- "latencyMs": 38416.754041999986
- },
- {
- "questionId": "q57",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "62",
- "actual": "62",
- "isCorrect": true,
- "inputTokens": 5762,
- "outputTokens": 5,
- "latencyMs": 974.5636659999727
- },
- {
- "questionId": "q57",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "62",
- "actual": "62",
- "isCorrect": true,
- "inputTokens": 5747,
- "outputTokens": 2,
- "latencyMs": 20388.102249999996
- },
- {
- "questionId": "q58",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "45",
- "actual": "45",
- "isCorrect": true,
- "inputTokens": 6393,
- "outputTokens": 2567,
- "latencyMs": 23536.014041999995
- },
- {
- "questionId": "q58",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "45",
- "actual": "42",
- "isCorrect": false,
- "inputTokens": 7872,
- "outputTokens": 5,
- "latencyMs": 1002.8562090000487
- },
- {
- "questionId": "q58",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "45",
- "actual": "45",
- "isCorrect": true,
- "inputTokens": 7913,
- "outputTokens": 2,
- "latencyMs": 35012.274959
- },
- {
- "questionId": "q58",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "45",
- "actual": "45",
- "isCorrect": true,
- "inputTokens": 2530,
- "outputTokens": 3143,
- "latencyMs": 27182.416041999997
- },
- {
- "questionId": "q58",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "45",
- "actual": "42",
- "isCorrect": false,
- "inputTokens": 2984,
- "outputTokens": 5,
- "latencyMs": 935.4336250000051
- },
- {
- "questionId": "q58",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "45",
- "actual": "45",
- "isCorrect": true,
- "inputTokens": 3322,
- "outputTokens": 2,
- "latencyMs": 19937.21420799999
- },
- {
- "questionId": "q58",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "45",
- "actual": "46",
- "isCorrect": false,
- "inputTokens": 2384,
- "outputTokens": 3271,
- "latencyMs": 26153.538457999995
- },
- {
- "questionId": "q58",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "45",
- "actual": "42",
- "isCorrect": false,
- "inputTokens": 2858,
- "outputTokens": 5,
- "latencyMs": 1029.4126660000184
- },
- {
- "questionId": "q58",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "45",
- "actual": "45",
- "isCorrect": true,
- "inputTokens": 3196,
- "outputTokens": 2,
- "latencyMs": 36182.66629199998
- },
- {
- "questionId": "q58",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "45",
- "actual": "45",
- "isCorrect": true,
- "inputTokens": 7360,
- "outputTokens": 2823,
- "latencyMs": 27939.341790999984
- },
- {
- "questionId": "q58",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "45",
- "actual": "47",
- "isCorrect": false,
- "inputTokens": 9362,
- "outputTokens": 5,
- "latencyMs": 1699.4091669999762
- },
- {
- "questionId": "q58",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "45",
- "actual": "45",
- "isCorrect": true,
- "inputTokens": 9102,
- "outputTokens": 2,
- "latencyMs": 20119.059750000015
- },
- {
- "questionId": "q58",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "45",
- "actual": "45",
- "isCorrect": true,
- "inputTokens": 5015,
- "outputTokens": 2631,
- "latencyMs": 25962.383333999955
- },
- {
- "questionId": "q58",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "45",
- "actual": "38",
- "isCorrect": false,
- "inputTokens": 5762,
- "outputTokens": 5,
- "latencyMs": 1063.877124999999
- },
- {
- "questionId": "q58",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "45",
- "actual": "45",
- "isCorrect": true,
- "inputTokens": 5748,
- "outputTokens": 2,
- "latencyMs": 37951.156874999986
- },
- {
- "questionId": "q59",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "96.17",
- "actual": "96.17",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 137,
- "latencyMs": 2635.883374999976
- },
- {
- "questionId": "q59",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "96.17",
- "actual": "96.17",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 7,
- "latencyMs": 1164.0292079999927
- },
- {
- "questionId": "q59",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "96.17",
- "actual": "96.17",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 5,
- "latencyMs": 1510.9628750000265
- },
- {
- "questionId": "q59",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "96.17",
- "actual": "96.17",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 73,
- "latencyMs": 3338.3452919999836
- },
- {
- "questionId": "q59",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "96.17",
- "actual": "96.17",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 7,
- "latencyMs": 1290.2898750000168
- },
- {
- "questionId": "q59",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "96.17",
- "actual": "96.17",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 5,
- "latencyMs": 1073.7947919999715
- },
- {
- "questionId": "q59",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "96.17",
- "actual": "96.17",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 201,
- "latencyMs": 3254.3114590000478
- },
- {
- "questionId": "q59",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "96.17",
- "actual": "96.17",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 7,
- "latencyMs": 1300.0598330000066
- },
- {
- "questionId": "q59",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "96.17",
- "actual": "96.17",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 5,
- "latencyMs": 2603.532125000027
- },
- {
- "questionId": "q59",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "96.17",
- "actual": "96.17",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 137,
- "latencyMs": 2712.822291999997
- },
- {
- "questionId": "q59",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "96.17",
- "actual": "96.17",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 7,
- "latencyMs": 1369.1374160000123
- },
- {
- "questionId": "q59",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "96.17",
- "actual": "96.17",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 5,
- "latencyMs": 1339.450165999995
- },
- {
- "questionId": "q59",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "96.17",
- "actual": "96.17",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 137,
- "latencyMs": 2561.059583000024
- },
- {
- "questionId": "q59",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "96.17",
- "actual": "96.17",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 7,
- "latencyMs": 1122.8535000000265
- },
- {
- "questionId": "q59",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "96.17",
- "actual": "96.17",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 5,
- "latencyMs": 1243.387041000009
- },
- {
- "questionId": "q60",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 9738,
- "outputTokens": 200,
- "latencyMs": 4276.413916999998
- },
- {
- "questionId": "q60",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 11906,
- "outputTokens": 4,
- "latencyMs": 1337.8417079999927
- },
- {
- "questionId": "q60",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 12112,
- "outputTokens": 2,
- "latencyMs": 1526.3712500000256
- },
- {
- "questionId": "q60",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 6012,
- "outputTokens": 136,
- "latencyMs": 2210.3001669999794
- },
- {
- "questionId": "q60",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 6992,
- "outputTokens": 4,
- "latencyMs": 1227.2460840000422
- },
- {
- "questionId": "q60",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 7200,
- "outputTokens": 2,
- "latencyMs": 1149.5532499999972
- },
- {
- "questionId": "q60",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 6780,
- "outputTokens": 200,
- "latencyMs": 2463.5065419999883
- },
- {
- "questionId": "q60",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 8413,
- "outputTokens": 4,
- "latencyMs": 1474.229833999998
- },
- {
- "questionId": "q60",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 7837,
- "outputTokens": 2,
- "latencyMs": 3119.7202080000425
- },
- {
- "questionId": "q60",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 11036,
- "outputTokens": 136,
- "latencyMs": 2996.8577500000247
- },
- {
- "questionId": "q60",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 13379,
- "outputTokens": 4,
- "latencyMs": 1374.8893749999697
- },
- {
- "questionId": "q60",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 13450,
- "outputTokens": 2,
- "latencyMs": 1361.1552500000107
- },
- {
- "questionId": "q60",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 7372,
- "outputTokens": 136,
- "latencyMs": 2356.033334000036
- },
- {
- "questionId": "q60",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 8384,
- "outputTokens": 4,
- "latencyMs": 1128.8600410000072
- },
- {
- "questionId": "q60",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 8426,
- "outputTokens": 2,
- "latencyMs": 1012.1753329999628
- },
- {
- "questionId": "q61",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "599.39",
- "actual": "599.39",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 201,
- "latencyMs": 2894.6042920000036
- },
- {
- "questionId": "q61",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "599.39",
- "actual": "599.39",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 7,
- "latencyMs": 1140.3883749999804
- },
- {
- "questionId": "q61",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "599.39",
- "actual": "599.39",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 6,
- "latencyMs": 1286.3832499999553
- },
- {
- "questionId": "q61",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "599.39",
- "actual": "599.39",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 201,
- "latencyMs": 5983.418707999983
- },
- {
- "questionId": "q61",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "599.39",
- "actual": "599.39",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 7,
- "latencyMs": 1257.5179999999818
- },
- {
- "questionId": "q61",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "599.39",
- "actual": "599.39",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 6,
- "latencyMs": 1470.9667500000214
- },
- {
- "questionId": "q61",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "599.39",
- "actual": "599.39",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 265,
- "latencyMs": 3804.386666000006
- },
- {
- "questionId": "q61",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "599.39",
- "actual": "599.39",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 7,
- "latencyMs": 1181.0549580000225
- },
- {
- "questionId": "q61",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "599.39",
- "actual": "599.39",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 6,
- "latencyMs": 2825.75008300005
- },
- {
- "questionId": "q61",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "599.39",
- "actual": "599.39",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 201,
- "latencyMs": 4155.127124999999
- },
- {
- "questionId": "q61",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "599.39",
- "actual": "599.39",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 7,
- "latencyMs": 1243.845667000045
- },
- {
- "questionId": "q61",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "599.39",
- "actual": "599.39",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 6,
- "latencyMs": 1183.5630419999943
- },
- {
- "questionId": "q61",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "599.39",
- "actual": "599.39",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 137,
- "latencyMs": 3305.4360420000157
- },
- {
- "questionId": "q61",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "599.39",
- "actual": "599.39",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 7,
- "latencyMs": 1122.905792000005
- },
- {
- "questionId": "q61",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "599.39",
- "actual": "599.39",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 6,
- "latencyMs": 1289.1040829999838
- },
- {
- "questionId": "q62",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 9738,
- "outputTokens": 199,
- "latencyMs": 4459.190540999989
- },
- {
- "questionId": "q62",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 11906,
- "outputTokens": 4,
- "latencyMs": 1385.2943749999977
- },
- {
- "questionId": "q62",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 12112,
- "outputTokens": 1,
- "latencyMs": 1281.1537499999977
- },
- {
- "questionId": "q62",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 6012,
- "outputTokens": 135,
- "latencyMs": 2211.059750000015
- },
- {
- "questionId": "q62",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 6992,
- "outputTokens": 4,
- "latencyMs": 1282.652208000014
- },
- {
- "questionId": "q62",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 7200,
- "outputTokens": 1,
- "latencyMs": 1296.6791250000242
- },
- {
- "questionId": "q62",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 6780,
- "outputTokens": 135,
- "latencyMs": 4460.896583999973
- },
- {
- "questionId": "q62",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 8413,
- "outputTokens": 4,
- "latencyMs": 1311.2437919999938
- },
- {
- "questionId": "q62",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 7837,
- "outputTokens": 1,
- "latencyMs": 2321.0788329999777
- },
- {
- "questionId": "q62",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 11036,
- "outputTokens": 135,
- "latencyMs": 2574.011124999961
- },
- {
- "questionId": "q62",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 13379,
- "outputTokens": 4,
- "latencyMs": 1331.6849169999477
- },
- {
- "questionId": "q62",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 13450,
- "outputTokens": 1,
- "latencyMs": 1876.967500000028
- },
- {
- "questionId": "q62",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 7372,
- "outputTokens": 71,
- "latencyMs": 4585.356583999994
- },
- {
- "questionId": "q62",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 8384,
- "outputTokens": 4,
- "latencyMs": 1472.130541999999
- },
- {
- "questionId": "q62",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 8426,
- "outputTokens": 1,
- "latencyMs": 3066.8415830000304
- },
- {
- "questionId": "q63",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "528.71",
- "actual": "528.71",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 265,
- "latencyMs": 4022.9598750000005
- },
- {
- "questionId": "q63",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "528.71",
- "actual": "528.71",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 7,
- "latencyMs": 1480.8643750000047
- },
- {
- "questionId": "q63",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "528.71",
- "actual": "528.71",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 6,
- "latencyMs": 1615.6131670000032
- },
- {
- "questionId": "q63",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "528.71",
- "actual": "528.71",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 265,
- "latencyMs": 3674.1392500000075
- },
- {
- "questionId": "q63",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "528.71",
- "actual": "528.71",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 7,
- "latencyMs": 1060.8583750000107
- },
- {
- "questionId": "q63",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "528.71",
- "actual": "528.71",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 6,
- "latencyMs": 1496.0798749999958
- },
- {
- "questionId": "q63",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "528.71",
- "actual": "528.71",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 329,
- "latencyMs": 3936.86050000001
- },
- {
- "questionId": "q63",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "528.71",
- "actual": "528.71",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 7,
- "latencyMs": 1451.5014170000213
- },
- {
- "questionId": "q63",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "528.71",
- "actual": "528.71",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 6,
- "latencyMs": 3275.3027920000022
- },
- {
- "questionId": "q63",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "528.71",
- "actual": "528.71",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 521,
- "latencyMs": 7834.65945799998
- },
- {
- "questionId": "q63",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "528.71",
- "actual": "528.71",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 7,
- "latencyMs": 1066.7734170000185
- },
- {
- "questionId": "q63",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "528.71",
- "actual": "528.71",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 6,
- "latencyMs": 1091.2406670000055
- },
- {
- "questionId": "q63",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "528.71",
- "actual": "528.71",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 265,
- "latencyMs": 7133.230082999973
- },
- {
- "questionId": "q63",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "528.71",
- "actual": "528.71",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 7,
- "latencyMs": 1334.3640829999931
- },
- {
- "questionId": "q63",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "528.71",
- "actual": "528.71",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 6,
- "latencyMs": 1548.7799590000068
- },
- {
- "questionId": "q64",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 9738,
- "outputTokens": 199,
- "latencyMs": 3084.847666000016
- },
- {
- "questionId": "q64",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 11906,
- "outputTokens": 4,
- "latencyMs": 1400.1154589999933
- },
- {
- "questionId": "q64",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 12112,
- "outputTokens": 1,
- "latencyMs": 2145.6674999999814
- },
- {
- "questionId": "q64",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 6012,
- "outputTokens": 199,
- "latencyMs": 2951.514334000007
- },
- {
- "questionId": "q64",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 6992,
- "outputTokens": 4,
- "latencyMs": 1178.9784170000348
- },
- {
- "questionId": "q64",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 7200,
- "outputTokens": 1,
- "latencyMs": 1061.4745419999817
- },
- {
- "questionId": "q64",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 6780,
- "outputTokens": 263,
- "latencyMs": 3550.5126670000027
- },
- {
- "questionId": "q64",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 8413,
- "outputTokens": 4,
- "latencyMs": 1128.6832500000019
- },
- {
- "questionId": "q64",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 7837,
- "outputTokens": 1,
- "latencyMs": 2419.836874999979
- },
- {
- "questionId": "q64",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 11036,
- "outputTokens": 263,
- "latencyMs": 18500.49987499998
- },
- {
- "questionId": "q64",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 13379,
- "outputTokens": 4,
- "latencyMs": 1697.067417000013
- },
- {
- "questionId": "q64",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 13450,
- "outputTokens": 1,
- "latencyMs": 1665.4901669999817
- },
- {
- "questionId": "q64",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 7372,
- "outputTokens": 135,
- "latencyMs": 3648.2167090000003
- },
- {
- "questionId": "q64",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 8384,
- "outputTokens": 4,
- "latencyMs": 1223.7409169999883
- },
- {
- "questionId": "q64",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 8426,
- "outputTokens": 1,
- "latencyMs": 2938.2844999999506
- },
- {
- "questionId": "q65",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "1687.82",
- "actual": "1687.82",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 202,
- "latencyMs": 3459.946917000052
- },
- {
- "questionId": "q65",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "1687.82",
- "actual": "1687.82",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 8,
- "latencyMs": 1173.402208000014
- },
- {
- "questionId": "q65",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "1687.82",
- "actual": "1687.82",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 7,
- "latencyMs": 3167.1566250000033
- },
- {
- "questionId": "q65",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "1687.82",
- "actual": "1687.82",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 202,
- "latencyMs": 3737.224749999994
- },
- {
- "questionId": "q65",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "1687.82",
- "actual": "1687.82",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 8,
- "latencyMs": 926.1720830000122
- },
- {
- "questionId": "q65",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "1687.82",
- "actual": "1687.82",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 7,
- "latencyMs": 1469.4704999999958
- },
- {
- "questionId": "q65",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "1687.82",
- "actual": "1687.82",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 266,
- "latencyMs": 4014.4818339999765
- },
- {
- "questionId": "q65",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "1687.82",
- "actual": "1687.82",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 8,
- "latencyMs": 1132.7197079999605
- },
- {
- "questionId": "q65",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "1687.82",
- "actual": "1687.82",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 7,
- "latencyMs": 3670.1206250000396
- },
- {
- "questionId": "q65",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "1687.82",
- "actual": "1687.82",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 202,
- "latencyMs": 4318.927583000041
- },
- {
- "questionId": "q65",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "1687.82",
- "actual": "1687.82",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 8,
- "latencyMs": 1835.1892919999664
- },
- {
- "questionId": "q65",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "1687.82",
- "actual": "1687.82",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 7,
- "latencyMs": 1211.4787500000093
- },
- {
- "questionId": "q65",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "1687.82",
- "actual": "1687.82",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 202,
- "latencyMs": 3591.6950419999775
- },
- {
- "questionId": "q65",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "1687.82",
- "actual": "1687.82",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 8,
- "latencyMs": 1278.8472920000204
- },
- {
- "questionId": "q65",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "1687.82",
- "actual": "1687.82",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 7,
- "latencyMs": 2102.123208999983
- },
- {
- "questionId": "q66",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 9738,
- "outputTokens": 136,
- "latencyMs": 2793.1591250000056
- },
- {
- "questionId": "q66",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 11906,
- "outputTokens": 4,
- "latencyMs": 1319.3459579999908
- },
- {
- "questionId": "q66",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 12112,
- "outputTokens": 1,
- "latencyMs": 1572.3595830000122
- },
- {
- "questionId": "q66",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 6012,
- "outputTokens": 264,
- "latencyMs": 4642.070207999961
- },
- {
- "questionId": "q66",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 6992,
- "outputTokens": 4,
- "latencyMs": 1161.8217919999734
- },
- {
- "questionId": "q66",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 7200,
- "outputTokens": 1,
- "latencyMs": 1045.6249589999788
- },
- {
- "questionId": "q66",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 6780,
- "outputTokens": 200,
- "latencyMs": 3501.1775419999612
- },
- {
- "questionId": "q66",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 8413,
- "outputTokens": 4,
- "latencyMs": 1463.0212910000118
- },
- {
- "questionId": "q66",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 7837,
- "outputTokens": 1,
- "latencyMs": 1782.100999999966
- },
- {
- "questionId": "q66",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 11036,
- "outputTokens": 584,
- "latencyMs": 7168.528500000015
- },
- {
- "questionId": "q66",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 13379,
- "outputTokens": 4,
- "latencyMs": 1339.9878749999916
- },
- {
- "questionId": "q66",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 13450,
- "outputTokens": 1,
- "latencyMs": 1196.7808749999967
- },
- {
- "questionId": "q66",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 7372,
- "outputTokens": 328,
- "latencyMs": 4938.96991699998
- },
- {
- "questionId": "q66",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 8384,
- "outputTokens": 4,
- "latencyMs": 1121.6232500000042
- },
- {
- "questionId": "q66",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 8426,
- "outputTokens": 1,
- "latencyMs": 1062.6134160000365
- },
- {
- "questionId": "q67",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "423.6",
- "actual": "423.6",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 137,
- "latencyMs": 2332.1545840000035
- },
- {
- "questionId": "q67",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "423.6",
- "actual": "423.6",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 7,
- "latencyMs": 1210.105333000014
- },
- {
- "questionId": "q67",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "423.6",
- "actual": "423.6",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 5,
- "latencyMs": 2248.713915999979
- },
- {
- "questionId": "q67",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "423.6",
- "actual": "423.6",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 201,
- "latencyMs": 5095.391790999973
- },
- {
- "questionId": "q67",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "423.6",
- "actual": "423.6",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 7,
- "latencyMs": 2002.2553749999497
- },
- {
- "questionId": "q67",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "423.6",
- "actual": "423.6",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 5,
- "latencyMs": 1447.1179159999592
- },
- {
- "questionId": "q67",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "423.6",
- "actual": "423.6",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 201,
- "latencyMs": 7838.877333000011
- },
- {
- "questionId": "q67",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "423.6",
- "actual": "423.6",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 7,
- "latencyMs": 1108.0410839999677
- },
- {
- "questionId": "q67",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "423.6",
- "actual": "423.6",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 5,
- "latencyMs": 2419.8735420000157
- },
- {
- "questionId": "q67",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "423.6",
- "actual": "423.6",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 201,
- "latencyMs": 4098.654000000039
- },
- {
- "questionId": "q67",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "423.6",
- "actual": "423.6",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 7,
- "latencyMs": 1200.5831250000047
- },
- {
- "questionId": "q67",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "423.6",
- "actual": "423.6",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 5,
- "latencyMs": 1685.785542000027
- },
- {
- "questionId": "q67",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "423.6",
- "actual": "423.6",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 201,
- "latencyMs": 4059.9044170000125
- },
- {
- "questionId": "q67",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "423.6",
- "actual": "423.6",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 7,
- "latencyMs": 1264.0358329999726
- },
- {
- "questionId": "q67",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "423.6",
- "actual": "423.6",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 5,
- "latencyMs": 1237.0989580000169
- },
- {
- "questionId": "q68",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 9738,
- "outputTokens": 200,
- "latencyMs": 3303.1327499999898
- },
- {
- "questionId": "q68",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 11906,
- "outputTokens": 4,
- "latencyMs": 1808.5881250000093
- },
- {
- "questionId": "q68",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 12112,
- "outputTokens": 1,
- "latencyMs": 1355.4241669999901
- },
- {
- "questionId": "q68",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 6012,
- "outputTokens": 200,
- "latencyMs": 3711.711249999993
- },
- {
- "questionId": "q68",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 6992,
- "outputTokens": 4,
- "latencyMs": 1294.2883750000037
- },
- {
- "questionId": "q68",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 7200,
- "outputTokens": 1,
- "latencyMs": 1162.5020840000361
- },
- {
- "questionId": "q68",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 6780,
- "outputTokens": 264,
- "latencyMs": 3022.083249999967
- },
- {
- "questionId": "q68",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 8413,
- "outputTokens": 4,
- "latencyMs": 944.2437079999945
- },
- {
- "questionId": "q68",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 7837,
- "outputTokens": 1,
- "latencyMs": 3629.1201669999864
- },
- {
- "questionId": "q68",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 11036,
- "outputTokens": 456,
- "latencyMs": 4701.368916000007
- },
- {
- "questionId": "q68",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 13379,
- "outputTokens": 4,
- "latencyMs": 1121.0914999999804
- },
- {
- "questionId": "q68",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 13450,
- "outputTokens": 1,
- "latencyMs": 2000.4341669999994
- },
- {
- "questionId": "q68",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 7372,
- "outputTokens": 200,
- "latencyMs": 6000.394582999987
- },
- {
- "questionId": "q68",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 8384,
- "outputTokens": 4,
- "latencyMs": 1584.1092090000166
- },
- {
- "questionId": "q68",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 8426,
- "outputTokens": 1,
- "latencyMs": 2002.2350420000148
- },
- {
- "questionId": "q69",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "784.03",
- "actual": "784.03",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 265,
- "latencyMs": 7792.974290999991
- },
- {
- "questionId": "q69",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "784.03",
- "actual": "784.03",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 7,
- "latencyMs": 2028.2800829999615
- },
- {
- "questionId": "q69",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "784.03",
- "actual": "784.03",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 6,
- "latencyMs": 1505.0516669999924
- },
- {
- "questionId": "q69",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "784.03",
- "actual": "784.03",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 201,
- "latencyMs": 7270.891041999974
- },
- {
- "questionId": "q69",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "784.03",
- "actual": "784.03",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 7,
- "latencyMs": 2478.4481660000165
- },
- {
- "questionId": "q69",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "784.03",
- "actual": "784.03",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 6,
- "latencyMs": 1305.2497500000172
- },
- {
- "questionId": "q69",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "784.03",
- "actual": "784.03",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 393,
- "latencyMs": 6261.073583999998
- },
- {
- "questionId": "q69",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "784.03",
- "actual": "784.03",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 7,
- "latencyMs": 1863.528500000015
- },
- {
- "questionId": "q69",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "784.03",
- "actual": "784.03",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 6,
- "latencyMs": 3306.4452499999898
- },
- {
- "questionId": "q69",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "784.03",
- "actual": "784.03",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 265,
- "latencyMs": 3464.767792000028
- },
- {
- "questionId": "q69",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "784.03",
- "actual": "784.03",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 7,
- "latencyMs": 1144.0890420000069
- },
- {
- "questionId": "q69",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "784.03",
- "actual": "784.03",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 6,
- "latencyMs": 1458.4538750000065
- },
- {
- "questionId": "q69",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "784.03",
- "actual": "784.03",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 201,
- "latencyMs": 3276.8598340000026
- },
- {
- "questionId": "q69",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "784.03",
- "actual": "784.03",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 7,
- "latencyMs": 1434.8686669999734
- },
- {
- "questionId": "q69",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "784.03",
- "actual": "784.03",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 6,
- "latencyMs": 1570.2152500000084
- },
- {
- "questionId": "q70",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 9738,
- "outputTokens": 200,
- "latencyMs": 3532.8103330000304
- },
- {
- "questionId": "q70",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 11906,
- "outputTokens": 4,
- "latencyMs": 1212.3070409999928
- },
- {
- "questionId": "q70",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 12112,
- "outputTokens": 2,
- "latencyMs": 1246.4002080000355
- },
- {
- "questionId": "q70",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 6012,
- "outputTokens": 136,
- "latencyMs": 6942.459582999989
- },
- {
- "questionId": "q70",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 6992,
- "outputTokens": 4,
- "latencyMs": 1144.068333000003
- },
- {
- "questionId": "q70",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 7200,
- "outputTokens": 2,
- "latencyMs": 2209.296417000005
- },
- {
- "questionId": "q70",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 6780,
- "outputTokens": 136,
- "latencyMs": 4940.5221670000465
- },
- {
- "questionId": "q70",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 8413,
- "outputTokens": 4,
- "latencyMs": 1493.192041000002
- },
- {
- "questionId": "q70",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 7837,
- "outputTokens": 2,
- "latencyMs": 1817.8049579999642
- },
- {
- "questionId": "q70",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 11036,
- "outputTokens": 136,
- "latencyMs": 3458.8650829999824
- },
- {
- "questionId": "q70",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 13379,
- "outputTokens": 4,
- "latencyMs": 1401.621165999968
- },
- {
- "questionId": "q70",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 13450,
- "outputTokens": 2,
- "latencyMs": 3644.271166999999
- },
- {
- "questionId": "q70",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 7372,
- "outputTokens": 200,
- "latencyMs": 2859.7807909999974
- },
- {
- "questionId": "q70",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 8384,
- "outputTokens": 4,
- "latencyMs": 1170.455874999985
- },
- {
- "questionId": "q70",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "shipped",
- "actual": "shipped",
- "isCorrect": true,
- "inputTokens": 8426,
- "outputTokens": 2,
- "latencyMs": 2668.4208750000107
- },
- {
- "questionId": "q71",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "645.88",
- "actual": "645.88",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 265,
- "latencyMs": 3387.9897919999785
- },
- {
- "questionId": "q71",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "645.88",
- "actual": "645.88",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 7,
- "latencyMs": 1210.6735000000335
- },
- {
- "questionId": "q71",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "645.88",
- "actual": "645.88",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 6,
- "latencyMs": 2313.2734579999815
- },
- {
- "questionId": "q71",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "645.88",
- "actual": "645.88",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 201,
- "latencyMs": 2948.030916000018
- },
- {
- "questionId": "q71",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "645.88",
- "actual": "645.88",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 7,
- "latencyMs": 1499.2446670000209
- },
- {
- "questionId": "q71",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "645.88",
- "actual": "645.88",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 6,
- "latencyMs": 1259.240832999989
- },
- {
- "questionId": "q71",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "645.88",
- "actual": "645.88",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 201,
- "latencyMs": 8963.050458999991
- },
- {
- "questionId": "q71",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "645.88",
- "actual": "645.88",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 7,
- "latencyMs": 1168.6370839999872
- },
- {
- "questionId": "q71",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "645.88",
- "actual": "645.88",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 6,
- "latencyMs": 2633.771375000011
- },
- {
- "questionId": "q71",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "645.88",
- "actual": "645.88",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 329,
- "latencyMs": 7189.561790999956
- },
- {
- "questionId": "q71",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "645.88",
- "actual": "645.88",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 7,
- "latencyMs": 1225.8507080000127
- },
- {
- "questionId": "q71",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "645.88",
- "actual": "645.88",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 6,
- "latencyMs": 1124.1396250000107
- },
- {
- "questionId": "q71",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "645.88",
- "actual": "645.88",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 201,
- "latencyMs": 3990.592707999982
- },
- {
- "questionId": "q71",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "645.88",
- "actual": "645.88",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 7,
- "latencyMs": 1128.0700419999775
- },
- {
- "questionId": "q71",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "645.88",
- "actual": "645.88",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 6,
- "latencyMs": 1804.0158330000122
- },
- {
- "questionId": "q72",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 9738,
- "outputTokens": 263,
- "latencyMs": 3661.423624999996
- },
- {
- "questionId": "q72",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 11906,
- "outputTokens": 4,
- "latencyMs": 1125.6147919999785
- },
- {
- "questionId": "q72",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 12112,
- "outputTokens": 1,
- "latencyMs": 1711.6630829999922
- },
- {
- "questionId": "q72",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 6012,
- "outputTokens": 199,
- "latencyMs": 3128.0557079999708
- },
- {
- "questionId": "q72",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 6992,
- "outputTokens": 4,
- "latencyMs": 1669.1822079999838
- },
- {
- "questionId": "q72",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 7200,
- "outputTokens": 1,
- "latencyMs": 1274.667958999984
- },
- {
- "questionId": "q72",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 6780,
- "outputTokens": 263,
- "latencyMs": 3663.237792
- },
- {
- "questionId": "q72",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 8413,
- "outputTokens": 4,
- "latencyMs": 1122.126249999972
- },
- {
- "questionId": "q72",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 7837,
- "outputTokens": 1,
- "latencyMs": 1549.8010420000064
- },
- {
- "questionId": "q72",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 11036,
- "outputTokens": 327,
- "latencyMs": 6674.916083000018
- },
- {
- "questionId": "q72",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 13379,
- "outputTokens": 4,
- "latencyMs": 1230.8339169999817
- },
- {
- "questionId": "q72",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 13450,
- "outputTokens": 1,
- "latencyMs": 992.4760409999872
- },
- {
- "questionId": "q72",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 7372,
- "outputTokens": 199,
- "latencyMs": 3755.6932919999817
- },
- {
- "questionId": "q72",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 8384,
- "outputTokens": 4,
- "latencyMs": 1540.152833
- },
- {
- "questionId": "q72",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "processing",
- "actual": "processing",
- "isCorrect": true,
- "inputTokens": 8426,
- "outputTokens": 1,
- "latencyMs": 2185.4502910000156
- },
- {
- "questionId": "q73",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "371.91",
- "actual": "371.91",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 265,
- "latencyMs": 3809.869667000021
- },
- {
- "questionId": "q73",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "371.91",
- "actual": "371.91",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 7,
- "latencyMs": 1150.84375
- },
- {
- "questionId": "q73",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "371.91",
- "actual": "371.91",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 6,
- "latencyMs": 1217.3986659999937
- },
- {
- "questionId": "q73",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "371.91",
- "actual": "371.91",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 137,
- "latencyMs": 2091.0124589999905
- },
- {
- "questionId": "q73",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "371.91",
- "actual": "371.91",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 7,
- "latencyMs": 1357.4467920000316
- },
- {
- "questionId": "q73",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "371.91",
- "actual": "371.91",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 6,
- "latencyMs": 2377.229250000033
- },
- {
- "questionId": "q73",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "371.91",
- "actual": "371.91",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 201,
- "latencyMs": 2673.4793749999953
- },
- {
- "questionId": "q73",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "371.91",
- "actual": "371.91",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 7,
- "latencyMs": 1785.7454999999609
- },
- {
- "questionId": "q73",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "371.91",
- "actual": "371.91",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 6,
- "latencyMs": 1956.5365410000086
- },
- {
- "questionId": "q73",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "371.91",
- "actual": "371.91",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 201,
- "latencyMs": 2943.3867910000263
- },
- {
- "questionId": "q73",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "371.91",
- "actual": "371.91",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 7,
- "latencyMs": 1264.3261250000214
- },
- {
- "questionId": "q73",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "371.91",
- "actual": "371.91",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 6,
- "latencyMs": 1479.502083999978
- },
- {
- "questionId": "q73",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "371.91",
- "actual": "371.91",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 137,
- "latencyMs": 2697.696667000011
- },
- {
- "questionId": "q73",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "371.91",
- "actual": "371.91",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 7,
- "latencyMs": 1319.8920829999843
- },
- {
- "questionId": "q73",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "371.91",
- "actual": "371.91",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 6,
- "latencyMs": 1655.4022090000217
- },
- {
- "questionId": "q74",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 9738,
- "outputTokens": 327,
- "latencyMs": 3728.9863749999786
- },
- {
- "questionId": "q74",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 11906,
- "outputTokens": 4,
- "latencyMs": 1403.8238750000019
- },
- {
- "questionId": "q74",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 12112,
- "outputTokens": 1,
- "latencyMs": 1610.8924579999875
- },
- {
- "questionId": "q74",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 6012,
- "outputTokens": 199,
- "latencyMs": 3121.718416000018
- },
- {
- "questionId": "q74",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 6992,
- "outputTokens": 4,
- "latencyMs": 1051.426999999967
- },
- {
- "questionId": "q74",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 7200,
- "outputTokens": 1,
- "latencyMs": 1171.1483340000268
- },
- {
- "questionId": "q74",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 6780,
- "outputTokens": 263,
- "latencyMs": 2642.1894589999574
- },
- {
- "questionId": "q74",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 8413,
- "outputTokens": 4,
- "latencyMs": 1286.3537080000388
- },
- {
- "questionId": "q74",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 7837,
- "outputTokens": 1,
- "latencyMs": 3901.2503750000033
- },
- {
- "questionId": "q74",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 11036,
- "outputTokens": 263,
- "latencyMs": 3386.3902919999673
- },
- {
- "questionId": "q74",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 13379,
- "outputTokens": 4,
- "latencyMs": 1593.6848750000354
- },
- {
- "questionId": "q74",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 13450,
- "outputTokens": 1,
- "latencyMs": 1085.9149159999797
- },
- {
- "questionId": "q74",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 7372,
- "outputTokens": 135,
- "latencyMs": 2352.2881669999915
- },
- {
- "questionId": "q74",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 8384,
- "outputTokens": 4,
- "latencyMs": 1046.4814580000238
- },
- {
- "questionId": "q74",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "pending",
- "actual": "pending",
- "isCorrect": true,
- "inputTokens": 8426,
- "outputTokens": 1,
- "latencyMs": 1687.5740409999853
- },
- {
- "questionId": "q75",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "1066",
- "actual": "1066",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 264,
- "latencyMs": 5460.0885409999755
- },
- {
- "questionId": "q75",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "1066",
- "actual": "1066",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 6,
- "latencyMs": 1246.0814159999718
- },
- {
- "questionId": "q75",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "1066",
- "actual": "1066",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 4,
- "latencyMs": 1696.832666000002
- },
- {
- "questionId": "q75",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "1066",
- "actual": "1066",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 200,
- "latencyMs": 2906.3054160000174
- },
- {
- "questionId": "q75",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "1066",
- "actual": "1066",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 6,
- "latencyMs": 1201.3947090000147
- },
- {
- "questionId": "q75",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "1066",
- "actual": "1066.00",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 7,
- "latencyMs": 1377.305457999988
- },
- {
- "questionId": "q75",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "1066",
- "actual": "1066",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 456,
- "latencyMs": 8801.27112499997
- },
- {
- "questionId": "q75",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "1066",
- "actual": "1066",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 6,
- "latencyMs": 1433.466666000022
- },
- {
- "questionId": "q75",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "1066",
- "actual": "1066",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 4,
- "latencyMs": 3448.654917000036
- },
- {
- "questionId": "q75",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "1066",
- "actual": "1066",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 264,
- "latencyMs": 4939.312791000004
- },
- {
- "questionId": "q75",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "1066",
- "actual": "1066",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 6,
- "latencyMs": 1252.419332999969
- },
- {
- "questionId": "q75",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "1066",
- "actual": "1066.00",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 7,
- "latencyMs": 1151.2592920000316
- },
- {
- "questionId": "q75",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "1066",
- "actual": "1066",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 136,
- "latencyMs": 3143.9853749999893
- },
- {
- "questionId": "q75",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "1066",
- "actual": "1066",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 6,
- "latencyMs": 1177.0768329999992
- },
- {
- "questionId": "q75",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "1066",
- "actual": "1066.0",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 6,
- "latencyMs": 1535.377165999962
- },
- {
- "questionId": "q76",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 9738,
- "outputTokens": 328,
- "latencyMs": 10990.360375000047
- },
- {
- "questionId": "q76",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 11906,
- "outputTokens": 4,
- "latencyMs": 1467.304375000007
- },
- {
- "questionId": "q76",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 12112,
- "outputTokens": 1,
- "latencyMs": 1316.8680830000085
- },
- {
- "questionId": "q76",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 6012,
- "outputTokens": 392,
- "latencyMs": 4399.92220900004
- },
- {
- "questionId": "q76",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 6992,
- "outputTokens": 4,
- "latencyMs": 1077.4348749999772
- },
- {
- "questionId": "q76",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 7200,
- "outputTokens": 1,
- "latencyMs": 1317.501791000017
- },
- {
- "questionId": "q76",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 6780,
- "outputTokens": 200,
- "latencyMs": 4153.370333999977
- },
- {
- "questionId": "q76",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 8413,
- "outputTokens": 4,
- "latencyMs": 1147.2140420000069
- },
- {
- "questionId": "q76",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 7837,
- "outputTokens": 1,
- "latencyMs": 1243.451000000001
- },
- {
- "questionId": "q76",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 11036,
- "outputTokens": 328,
- "latencyMs": 7804.228665999952
- },
- {
- "questionId": "q76",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 13379,
- "outputTokens": 4,
- "latencyMs": 1144.1722500000033
- },
- {
- "questionId": "q76",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 13450,
- "outputTokens": 1,
- "latencyMs": 857.7333750000107
- },
- {
- "questionId": "q76",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 7372,
- "outputTokens": 136,
- "latencyMs": 2287.29574999999
- },
- {
- "questionId": "q76",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 8384,
- "outputTokens": 4,
- "latencyMs": 1285.9760839999653
- },
- {
- "questionId": "q76",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "cancelled",
- "actual": "cancelled",
- "isCorrect": true,
- "inputTokens": 8426,
- "outputTokens": 1,
- "latencyMs": 1174.2349580000155
- },
- {
- "questionId": "q77",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "1697.4",
- "actual": "1697.4",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 266,
- "latencyMs": 4109.542333999998
- },
- {
- "questionId": "q77",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "1697.4",
- "actual": "1697.4",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 8,
- "latencyMs": 1433.0992499999702
- },
- {
- "questionId": "q77",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "1697.4",
- "actual": "1697.4",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 6,
- "latencyMs": 3301.268875000009
- },
- {
- "questionId": "q77",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "1697.4",
- "actual": "1697.4",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 394,
- "latencyMs": 4952.654542000033
- },
- {
- "questionId": "q77",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "1697.4",
- "actual": "1697.4",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 8,
- "latencyMs": 1165.5959999999614
- },
- {
- "questionId": "q77",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "1697.4",
- "actual": "1697.4",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 6,
- "latencyMs": 982.1686660000123
- },
- {
- "questionId": "q77",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "1697.4",
- "actual": "1697.4",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 266,
- "latencyMs": 4735.772292000009
- },
- {
- "questionId": "q77",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "1697.4",
- "actual": "1697.4",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 8,
- "latencyMs": 1361.5435829999624
- },
- {
- "questionId": "q77",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "1697.4",
- "actual": "1697.4",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 6,
- "latencyMs": 2838.4672920000157
- },
- {
- "questionId": "q77",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "1697.4",
- "actual": "1697.4",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 394,
- "latencyMs": 4771.182459000032
- },
- {
- "questionId": "q77",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "1697.4",
- "actual": "1697.4",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 8,
- "latencyMs": 1202.4828330000164
- },
- {
- "questionId": "q77",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "1697.4",
- "actual": "1697.4",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 6,
- "latencyMs": 1063.3247500000289
- },
- {
- "questionId": "q77",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "1697.4",
- "actual": "1697.4",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 202,
- "latencyMs": 7751.146624999994
- },
- {
- "questionId": "q77",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "1697.4",
- "actual": "1697.4",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 8,
- "latencyMs": 1352.936708000023
- },
- {
- "questionId": "q77",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "1697.4",
- "actual": "1697.4",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 6,
- "latencyMs": 3135.286582999979
- },
- {
- "questionId": "q78",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 9738,
- "outputTokens": 264,
- "latencyMs": 3105.402541999996
- },
- {
- "questionId": "q78",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 11906,
- "outputTokens": 4,
- "latencyMs": 1140.6077500000247
- },
- {
- "questionId": "q78",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 12112,
- "outputTokens": 1,
- "latencyMs": 1257.6969169999938
- },
- {
- "questionId": "q78",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 6012,
- "outputTokens": 72,
- "latencyMs": 2142.8472499999916
- },
- {
- "questionId": "q78",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 6992,
- "outputTokens": 4,
- "latencyMs": 1485.6063330000034
- },
- {
- "questionId": "q78",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 7200,
- "outputTokens": 1,
- "latencyMs": 1350.4362079999992
- },
- {
- "questionId": "q78",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 6780,
- "outputTokens": 264,
- "latencyMs": 3870.94754199998
- },
- {
- "questionId": "q78",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 8413,
- "outputTokens": 4,
- "latencyMs": 1153.2942499999772
- },
- {
- "questionId": "q78",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 7837,
- "outputTokens": 1,
- "latencyMs": 2935.8738330000197
- },
- {
- "questionId": "q78",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 11036,
- "outputTokens": 328,
- "latencyMs": 4063.2786669999477
- },
- {
- "questionId": "q78",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 13379,
- "outputTokens": 4,
- "latencyMs": 1202.6428329999908
- },
- {
- "questionId": "q78",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 13450,
- "outputTokens": 1,
- "latencyMs": 1221.4335410000058
- },
- {
- "questionId": "q78",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 7372,
- "outputTokens": 200,
- "latencyMs": 5382.740458999993
- },
- {
- "questionId": "q78",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 8384,
- "outputTokens": 4,
- "latencyMs": 1434.1426659999997
- },
- {
- "questionId": "q78",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "delivered",
- "actual": "delivered",
- "isCorrect": true,
- "inputTokens": 8426,
- "outputTokens": 1,
- "latencyMs": 1046.8339999999735
- },
- {
- "questionId": "q79",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Valerie Braun",
- "actual": "Valerie Braun",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 73,
- "latencyMs": 2607.845874999999
- },
- {
- "questionId": "q79",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Valerie Braun",
- "actual": "Valerie Braun",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 9,
- "latencyMs": 1676.4270830000169
- },
- {
- "questionId": "q79",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Valerie Braun",
- "actual": "Valerie Braun",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 3,
- "latencyMs": 1219.0042910000193
- },
- {
- "questionId": "q79",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Valerie Braun",
- "actual": "Valerie Braun",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 137,
- "latencyMs": 3378.1006669999915
- },
- {
- "questionId": "q79",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Valerie Braun",
- "actual": "Valerie Braun",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 9,
- "latencyMs": 1979.5205839999835
- },
- {
- "questionId": "q79",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Valerie Braun",
- "actual": "Valerie Braun",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 3,
- "latencyMs": 1439.3422910000081
- },
- {
- "questionId": "q79",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Valerie Braun",
- "actual": "Valerie Braun",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 137,
- "latencyMs": 2889.578749999986
- },
- {
- "questionId": "q79",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Valerie Braun",
- "actual": "Valerie Braun",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 9,
- "latencyMs": 1190.1848750000354
- },
- {
- "questionId": "q79",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Valerie Braun",
- "actual": "Valerie Braun",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 3,
- "latencyMs": 2444.884665999969
- },
- {
- "questionId": "q79",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Valerie Braun",
- "actual": "Valerie Braun",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 73,
- "latencyMs": 2360.869958999974
- },
- {
- "questionId": "q79",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Valerie Braun",
- "actual": "Valerie Braun",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 9,
- "latencyMs": 1299.0499999999884
- },
- {
- "questionId": "q79",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Valerie Braun",
- "actual": "Valerie Braun",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 3,
- "latencyMs": 932.0124589999905
- },
- {
- "questionId": "q79",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Valerie Braun",
- "actual": "Valerie Braun",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 73,
- "latencyMs": 3092.9805410000263
- },
- {
- "questionId": "q79",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Valerie Braun",
- "actual": "Valerie Braun",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 9,
- "latencyMs": 1872.3574159999844
- },
- {
- "questionId": "q79",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Valerie Braun",
- "actual": "Valerie Braun",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 3,
- "latencyMs": 1216.4535000000033
- },
- {
- "questionId": "q80",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Anita Kozey",
- "actual": "Anita Kozey",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 138,
- "latencyMs": 2404.87479099998
- },
- {
- "questionId": "q80",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Anita Kozey",
- "actual": "Anita Kozey",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 9,
- "latencyMs": 2182.619249999989
- },
- {
- "questionId": "q80",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Anita Kozey",
- "actual": "Anita Kozey",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 3,
- "latencyMs": 1508.2469580000034
- },
- {
- "questionId": "q80",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Anita Kozey",
- "actual": "Anita Kozey",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 138,
- "latencyMs": 3670.61050000001
- },
- {
- "questionId": "q80",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Anita Kozey",
- "actual": "Anita Kozey",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 9,
- "latencyMs": 1291.4328749999986
- },
- {
- "questionId": "q80",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Anita Kozey",
- "actual": "Anita Kozey",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 3,
- "latencyMs": 1201.7425829999847
- },
- {
- "questionId": "q80",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Anita Kozey",
- "actual": "Anita Kozey",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 202,
- "latencyMs": 4846.332458000048
- },
- {
- "questionId": "q80",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Anita Kozey",
- "actual": "Anita Kozey",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 9,
- "latencyMs": 1134.4527920000255
- },
- {
- "questionId": "q80",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Anita Kozey",
- "actual": "Anita Kozey",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 3,
- "latencyMs": 2760.9979579999927
- },
- {
- "questionId": "q80",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Anita Kozey",
- "actual": "Anita Kozey",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 138,
- "latencyMs": 4943.049208999961
- },
- {
- "questionId": "q80",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Anita Kozey",
- "actual": "Anita Kozey",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 9,
- "latencyMs": 1163.70645899995
- },
- {
- "questionId": "q80",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Anita Kozey",
- "actual": "Anita Kozey",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 3,
- "latencyMs": 2088.2969169999706
- },
- {
- "questionId": "q80",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Anita Kozey",
- "actual": "Anita Kozey",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 74,
- "latencyMs": 1973.243833000015
- },
- {
- "questionId": "q80",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Anita Kozey",
- "actual": "Anita Kozey",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 9,
- "latencyMs": 1430.9339170000167
- },
- {
- "questionId": "q80",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Anita Kozey",
- "actual": "Anita Kozey",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 3,
- "latencyMs": 1687.4137919999775
- },
- {
- "questionId": "q81",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Elmer Kub PhD",
- "actual": "Elmer Kub PhD",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 203,
- "latencyMs": 3178.392749999999
- },
- {
- "questionId": "q81",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Elmer Kub PhD",
- "actual": "Elmer Kub PhD",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 10,
- "latencyMs": 1213.1997499999707
- },
- {
- "questionId": "q81",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Elmer Kub PhD",
- "actual": "Elmer Kub PhD",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 4,
- "latencyMs": 1591.6145830000169
- },
- {
- "questionId": "q81",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Elmer Kub PhD",
- "actual": "Elmer Kub PhD",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 203,
- "latencyMs": 3938.462541999994
- },
- {
- "questionId": "q81",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Elmer Kub PhD",
- "actual": "Elmer Kub PhD",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 10,
- "latencyMs": 1552.203542000032
- },
- {
- "questionId": "q81",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Elmer Kub PhD",
- "actual": "Elmer Kub PhD",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 4,
- "latencyMs": 1499.0997919999645
- },
- {
- "questionId": "q81",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Elmer Kub PhD",
- "actual": "Elmer Kub PhD",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 203,
- "latencyMs": 5183.275583000039
- },
- {
- "questionId": "q81",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Elmer Kub PhD",
- "actual": "Elmer Kub PhD",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 10,
- "latencyMs": 1740.2195410000277
- },
- {
- "questionId": "q81",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Elmer Kub PhD",
- "actual": "Elmer Kub PhD",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 4,
- "latencyMs": 3886.555624999979
- },
- {
- "questionId": "q81",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Elmer Kub PhD",
- "actual": "Elmer Kub PhD",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 203,
- "latencyMs": 6655.238542000006
- },
- {
- "questionId": "q81",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Elmer Kub PhD",
- "actual": "Elmer Kub PhD",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 10,
- "latencyMs": 1357.9108329999726
- },
- {
- "questionId": "q81",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Elmer Kub PhD",
- "actual": "Elmer Kub PhD",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 4,
- "latencyMs": 1344.8635829999694
- },
- {
- "questionId": "q81",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Elmer Kub PhD",
- "actual": "Elmer Kub PhD",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 139,
- "latencyMs": 10553.66091700003
- },
- {
- "questionId": "q81",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Elmer Kub PhD",
- "actual": "Elmer Kub PhD",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 10,
- "latencyMs": 1807.1954169999808
- },
- {
- "questionId": "q81",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Elmer Kub PhD",
- "actual": "Elmer Kub PhD",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 4,
- "latencyMs": 2490.0647499999614
- },
- {
- "questionId": "q82",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Maxine Zemlak",
- "actual": "Maxine Zemlak",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 138,
- "latencyMs": 4916.117375000031
- },
- {
- "questionId": "q82",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Maxine Zemlak",
- "actual": "Maxine Zemlak",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 10,
- "latencyMs": 1074.780374999973
- },
- {
- "questionId": "q82",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Maxine Zemlak",
- "actual": "Maxine Zemlak",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 4,
- "latencyMs": 1412.95891700004
- },
- {
- "questionId": "q82",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Maxine Zemlak",
- "actual": "Maxine Zemlak",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 138,
- "latencyMs": 2372.7108339999686
- },
- {
- "questionId": "q82",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Maxine Zemlak",
- "actual": "Maxine Zemlak",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 10,
- "latencyMs": 1261.033374999999
- },
- {
- "questionId": "q82",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Maxine Zemlak",
- "actual": "Maxine Zemlak",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 4,
- "latencyMs": 1507.3635420000064
- },
- {
- "questionId": "q82",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Maxine Zemlak",
- "actual": "Maxine Zemlak",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 266,
- "latencyMs": 4028.793000000005
- },
- {
- "questionId": "q82",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Maxine Zemlak",
- "actual": "Maxine Zemlak",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 10,
- "latencyMs": 1685.5001250000205
- },
- {
- "questionId": "q82",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Maxine Zemlak",
- "actual": "Maxine Zemlak",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 4,
- "latencyMs": 4534.999041000032
- },
- {
- "questionId": "q82",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Maxine Zemlak",
- "actual": "Maxine Zemlak",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 202,
- "latencyMs": 3417.137708000024
- },
- {
- "questionId": "q82",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Maxine Zemlak",
- "actual": "Maxine Zemlak",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 10,
- "latencyMs": 1361.4405830000178
- },
- {
- "questionId": "q82",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Maxine Zemlak",
- "actual": "Maxine Zemlak",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 4,
- "latencyMs": 2432.530415999994
- },
- {
- "questionId": "q82",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Maxine Zemlak",
- "actual": "Maxine Zemlak",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 202,
- "latencyMs": 5838.863542000006
- },
- {
- "questionId": "q82",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Maxine Zemlak",
- "actual": "Maxine Zemlak",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 10,
- "latencyMs": 1243.5272090000217
- },
- {
- "questionId": "q82",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Maxine Zemlak",
- "actual": "Maxine Zemlak",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 4,
- "latencyMs": 3514.3164579999866
- },
- {
- "questionId": "q83",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Emanuel Littel",
- "actual": "Emanuel Littel",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 202,
- "latencyMs": 6595.4543330000015
- },
- {
- "questionId": "q83",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Emanuel Littel",
- "actual": "Emanuel Littel",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 7,
- "latencyMs": 1498.3081660000025
- },
- {
- "questionId": "q83",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Emanuel Littel",
- "actual": "Emanuel Littel",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 4,
- "latencyMs": 2013.447125000006
- },
- {
- "questionId": "q83",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Emanuel Littel",
- "actual": "Emanuel Littel",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 202,
- "latencyMs": 3336.2056250000023
- },
- {
- "questionId": "q83",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Emanuel Littel",
- "actual": "Emanuel Littel",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 7,
- "latencyMs": 1070.626500000013
- },
- {
- "questionId": "q83",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Emanuel Littel",
- "actual": "Emanuel Littel",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 4,
- "latencyMs": 1394.0314590000198
- },
- {
- "questionId": "q83",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Emanuel Littel",
- "actual": "Emanuel Littel",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 266,
- "latencyMs": 4194.179917000001
- },
- {
- "questionId": "q83",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Emanuel Littel",
- "actual": "Emanuel Littel",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 7,
- "latencyMs": 1139.8458330000285
- },
- {
- "questionId": "q83",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Emanuel Littel",
- "actual": "Emanuel Littel",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 4,
- "latencyMs": 3437.878625000012
- },
- {
- "questionId": "q83",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Emanuel Littel",
- "actual": "Emanuel Littel",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 458,
- "latencyMs": 13446.595333000005
- },
- {
- "questionId": "q83",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Emanuel Littel",
- "actual": "Emanuel Littel",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 7,
- "latencyMs": 2680.581542
- },
- {
- "questionId": "q83",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Emanuel Littel",
- "actual": "Emanuel Littel",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 4,
- "latencyMs": 1203.1962920000078
- },
- {
- "questionId": "q83",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Emanuel Littel",
- "actual": "Emanuel Littel",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 138,
- "latencyMs": 4011.303083000006
- },
- {
- "questionId": "q83",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Emanuel Littel",
- "actual": "Emanuel Littel",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 7,
- "latencyMs": 1039.7921659999993
- },
- {
- "questionId": "q83",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Emanuel Littel",
- "actual": "Emanuel Littel",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 4,
- "latencyMs": 2480.1701660000253
- },
- {
- "questionId": "q84",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Andrew Kling",
- "actual": "Andrew Kling",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 136,
- "latencyMs": 4735.566333000024
- },
- {
- "questionId": "q84",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Andrew Kling",
- "actual": "Andrew Kling",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 7,
- "latencyMs": 1280.546875
- },
- {
- "questionId": "q84",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Andrew Kling",
- "actual": "Andrew Kling",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 2,
- "latencyMs": 1865.3758329999982
- },
- {
- "questionId": "q84",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Andrew Kling",
- "actual": "Andrew Kling",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 200,
- "latencyMs": 2902.7560829999857
- },
- {
- "questionId": "q84",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Andrew Kling",
- "actual": "Andrew Kling",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 7,
- "latencyMs": 1081.401291999966
- },
- {
- "questionId": "q84",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Andrew Kling",
- "actual": "Andrew Kling",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 2,
- "latencyMs": 1030.250207999954
- },
- {
- "questionId": "q84",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Andrew Kling",
- "actual": "Andrew Kling",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 264,
- "latencyMs": 3382.8625409999513
- },
- {
- "questionId": "q84",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Andrew Kling",
- "actual": "Andrew Kling",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 7,
- "latencyMs": 1059.5115829999559
- },
- {
- "questionId": "q84",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Andrew Kling",
- "actual": "Andrew Kling",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 2,
- "latencyMs": 4047.5788749999483
- },
- {
- "questionId": "q84",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Andrew Kling",
- "actual": "Andrew Kling",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 264,
- "latencyMs": 4623.2353329999605
- },
- {
- "questionId": "q84",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Andrew Kling",
- "actual": "Andrew Kling",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 7,
- "latencyMs": 1069.810291999951
- },
- {
- "questionId": "q84",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Andrew Kling",
- "actual": "Andrew Kling",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 2,
- "latencyMs": 1081.8097089999937
- },
- {
- "questionId": "q84",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Andrew Kling",
- "actual": "Andrew Kling",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 264,
- "latencyMs": 8454.222833000007
- },
- {
- "questionId": "q84",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Andrew Kling",
- "actual": "Andrew Kling",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 7,
- "latencyMs": 1248.3214579999913
- },
- {
- "questionId": "q84",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Andrew Kling",
- "actual": "Andrew Kling",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 2,
- "latencyMs": 3052.669667000009
- },
- {
- "questionId": "q85",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Morris O'Hara",
- "actual": "Morris O'Hara",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 139,
- "latencyMs": 6477.822083999985
- },
- {
- "questionId": "q85",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Morris O'Hara",
- "actual": "Morris O'Hara",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 9,
- "latencyMs": 1177.795124999946
- },
- {
- "questionId": "q85",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Morris O'Hara",
- "actual": "Morris O'Hara",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 4,
- "latencyMs": 2578.6090829999885
- },
- {
- "questionId": "q85",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Morris O'Hara",
- "actual": "Morris O'Hara",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 139,
- "latencyMs": 11574.13941599999
- },
- {
- "questionId": "q85",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Morris O'Hara",
- "actual": "Morris O'Hara",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 9,
- "latencyMs": 1197.251500000013
- },
- {
- "questionId": "q85",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Morris O'Hara",
- "actual": "Morris O'Hara",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 4,
- "latencyMs": 902.3842500000028
- },
- {
- "questionId": "q85",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Morris O'Hara",
- "actual": "Morris O'Hara",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 267,
- "latencyMs": 5139.725291999988
- },
- {
- "questionId": "q85",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Morris O'Hara",
- "actual": "Morris O'Hara",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 9,
- "latencyMs": 1539.0101670000004
- },
- {
- "questionId": "q85",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Morris O'Hara",
- "actual": "Morris O'Hara",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 4,
- "latencyMs": 5590.813292000035
- },
- {
- "questionId": "q85",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Morris O'Hara",
- "actual": "Morris O'Hara",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 459,
- "latencyMs": 5332.691916999989
- },
- {
- "questionId": "q85",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Morris O'Hara",
- "actual": "Morris O'Hara",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 9,
- "latencyMs": 1692.4654169999994
- },
- {
- "questionId": "q85",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Morris O'Hara",
- "actual": "Morris O'Hara",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 4,
- "latencyMs": 981.0666250000359
- },
- {
- "questionId": "q85",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Morris O'Hara",
- "actual": "Morris O'Hara",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 331,
- "latencyMs": 4571.373957999982
- },
- {
- "questionId": "q85",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Morris O'Hara",
- "actual": "Morris O'Hara",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 9,
- "latencyMs": 1186.5836659999914
- },
- {
- "questionId": "q85",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Morris O'Hara",
- "actual": "Morris O'Hara",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 4,
- "latencyMs": 3083.60266699997
- },
- {
- "questionId": "q86",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Elijah Franecki",
- "actual": "Elijah Franecki",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 203,
- "latencyMs": 6090.284833999991
- },
- {
- "questionId": "q86",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Elijah Franecki",
- "actual": "Elijah Franecki",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 9,
- "latencyMs": 1271.532459000009
- },
- {
- "questionId": "q86",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Elijah Franecki",
- "actual": "Elijah Franecki",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 5,
- "latencyMs": 1557.2529580000555
- },
- {
- "questionId": "q86",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Elijah Franecki",
- "actual": "Elijah Franecki",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 203,
- "latencyMs": 3250.3466250000056
- },
- {
- "questionId": "q86",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Elijah Franecki",
- "actual": "Elijah Franecki",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 9,
- "latencyMs": 1201.9044580000336
- },
- {
- "questionId": "q86",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Elijah Franecki",
- "actual": "Elijah Franecki",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 5,
- "latencyMs": 874.0206250000047
- },
- {
- "questionId": "q86",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Elijah Franecki",
- "actual": "Elijah Franecki",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 203,
- "latencyMs": 9473.656583999982
- },
- {
- "questionId": "q86",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Elijah Franecki",
- "actual": "Elijah Franecki",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 9,
- "latencyMs": 1253.2470420000027
- },
- {
- "questionId": "q86",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Elijah Franecki",
- "actual": "Elijah Franecki",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 5,
- "latencyMs": 2383.5771250000107
- },
- {
- "questionId": "q86",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Elijah Franecki",
- "actual": "Elijah Franecki",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 267,
- "latencyMs": 6551.133333000005
- },
- {
- "questionId": "q86",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Elijah Franecki",
- "actual": "Elijah Franecki",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 9,
- "latencyMs": 1116.6841669999994
- },
- {
- "questionId": "q86",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Elijah Franecki",
- "actual": "Elijah Franecki",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 5,
- "latencyMs": 2014.7545000000391
- },
- {
- "questionId": "q86",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Elijah Franecki",
- "actual": "Elijah Franecki",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 75,
- "latencyMs": 2472.76654099999
- },
- {
- "questionId": "q86",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Elijah Franecki",
- "actual": "Elijah Franecki",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 9,
- "latencyMs": 1175.5650410000235
- },
- {
- "questionId": "q86",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Elijah Franecki",
- "actual": "Elijah Franecki",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 5,
- "latencyMs": 1389.2444590000086
- },
- {
- "questionId": "q87",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Malcolm Erdman",
- "actual": "Malcolm Erdman",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 266,
- "latencyMs": 4308.579541000014
- },
- {
- "questionId": "q87",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Malcolm Erdman",
- "actual": "Malcolm Erdman",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 7,
- "latencyMs": 1423.6036659999518
- },
- {
- "questionId": "q87",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Malcolm Erdman",
- "actual": "Malcolm Erdman",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 3,
- "latencyMs": 2240.639916999964
- },
- {
- "questionId": "q87",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Malcolm Erdman",
- "actual": "Malcolm Erdman",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 202,
- "latencyMs": 3581.8104590000003
- },
- {
- "questionId": "q87",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Malcolm Erdman",
- "actual": "Malcolm Erdman",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 7,
- "latencyMs": 1104.380625000049
- },
- {
- "questionId": "q87",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Malcolm Erdman",
- "actual": "Malcolm Erdman",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 3,
- "latencyMs": 1940.0862910000142
- },
- {
- "questionId": "q87",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Malcolm Erdman",
- "actual": "Malcolm Erdman",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 202,
- "latencyMs": 4205.585124999983
- },
- {
- "questionId": "q87",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Malcolm Erdman",
- "actual": "Malcolm Erdman",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 7,
- "latencyMs": 1249.4729159999988
- },
- {
- "questionId": "q87",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Malcolm Erdman",
- "actual": "Malcolm Erdman",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 3,
- "latencyMs": 3377.5699580000364
- },
- {
- "questionId": "q87",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Malcolm Erdman",
- "actual": "Malcolm Erdman",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 266,
- "latencyMs": 4378.770917000016
- },
- {
- "questionId": "q87",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Malcolm Erdman",
- "actual": "Malcolm Erdman",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 7,
- "latencyMs": 1283.0947499999893
- },
- {
- "questionId": "q87",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Malcolm Erdman",
- "actual": "Malcolm Erdman",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 3,
- "latencyMs": 1649.8935409999685
- },
- {
- "questionId": "q87",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Malcolm Erdman",
- "actual": "Malcolm Erdman",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 138,
- "latencyMs": 4596.174417000031
- },
- {
- "questionId": "q87",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Malcolm Erdman",
- "actual": "Malcolm Erdman",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 7,
- "latencyMs": 1117.4153749999823
- },
- {
- "questionId": "q87",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Malcolm Erdman",
- "actual": "Malcolm Erdman",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 3,
- "latencyMs": 2916.328375000041
- },
- {
- "questionId": "q88",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Fannie Skiles",
- "actual": "Fannie Skiles",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 202,
- "latencyMs": 6150.88295900001
- },
- {
- "questionId": "q88",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Fannie Skiles",
- "actual": "Fannie Skiles",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 9,
- "latencyMs": 3154.254249999998
- },
- {
- "questionId": "q88",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Fannie Skiles",
- "actual": "Fannie Skiles",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 4,
- "latencyMs": 1595.2374999999884
- },
- {
- "questionId": "q88",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Fannie Skiles",
- "actual": "Fannie Skiles",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 138,
- "latencyMs": 2656.5287499999977
- },
- {
- "questionId": "q88",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Fannie Skiles",
- "actual": "Fannie Skiles",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 9,
- "latencyMs": 1990.0005419999943
- },
- {
- "questionId": "q88",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Fannie Skiles",
- "actual": "Fannie Skiles",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 4,
- "latencyMs": 2321.1809169999906
- },
- {
- "questionId": "q88",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Fannie Skiles",
- "actual": "Fannie Skiles",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 266,
- "latencyMs": 3915.817207999993
- },
- {
- "questionId": "q88",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Fannie Skiles",
- "actual": "Fannie Skiles",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 9,
- "latencyMs": 1246.5829580000136
- },
- {
- "questionId": "q88",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Fannie Skiles",
- "actual": "Fannie Skiles",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 4,
- "latencyMs": 4516.533583000011
- },
- {
- "questionId": "q88",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Fannie Skiles",
- "actual": "Fannie Skiles",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 202,
- "latencyMs": 5059.808416999993
- },
- {
- "questionId": "q88",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Fannie Skiles",
- "actual": "Fannie Skiles",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 9,
- "latencyMs": 1927.3214579999913
- },
- {
- "questionId": "q88",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Fannie Skiles",
- "actual": "Fannie Skiles",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 4,
- "latencyMs": 1175.4753750000382
- },
- {
- "questionId": "q88",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Fannie Skiles",
- "actual": "Fannie Skiles",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 138,
- "latencyMs": 6212.469625000027
- },
- {
- "questionId": "q88",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Fannie Skiles",
- "actual": "Fannie Skiles",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 9,
- "latencyMs": 1526.3683329999913
- },
- {
- "questionId": "q88",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Fannie Skiles",
- "actual": "Fannie Skiles",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 4,
- "latencyMs": 3560.557833000028
- },
- {
- "questionId": "q89",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Sonja Emmerich",
- "actual": "Sonja Emmerich",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 331,
- "latencyMs": 4333.316457999987
- },
- {
- "questionId": "q89",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Sonja Emmerich",
- "actual": "Sonja Emmerich",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 10,
- "latencyMs": 1150.7639999999665
- },
- {
- "questionId": "q89",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Sonja Emmerich",
- "actual": "Sonja Emmerich",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 4,
- "latencyMs": 2529.932083999971
- },
- {
- "questionId": "q89",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Sonja Emmerich",
- "actual": "Sonja Emmerich",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 203,
- "latencyMs": 3581.042041000037
- },
- {
- "questionId": "q89",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Sonja Emmerich",
- "actual": "Sonja Emmerich",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 10,
- "latencyMs": 1568.8872919999994
- },
- {
- "questionId": "q89",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Sonja Emmerich",
- "actual": "Sonja Emmerich",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 4,
- "latencyMs": 1319.7952499999665
- },
- {
- "questionId": "q89",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Sonja Emmerich",
- "actual": "Sonja Emmerich",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 331,
- "latencyMs": 3538.970499999996
- },
- {
- "questionId": "q89",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Sonja Emmerich",
- "actual": "Sonja Emmerich",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 10,
- "latencyMs": 1241.5265000000363
- },
- {
- "questionId": "q89",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Sonja Emmerich",
- "actual": "Sonja Emmerich",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 4,
- "latencyMs": 3917.9875000000466
- },
- {
- "questionId": "q89",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Sonja Emmerich",
- "actual": "Sonja Emmerich",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 395,
- "latencyMs": 7058.911167000013
- },
- {
- "questionId": "q89",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Sonja Emmerich",
- "actual": "Sonja Emmerich",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 10,
- "latencyMs": 1205.0128329999861
- },
- {
- "questionId": "q89",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Sonja Emmerich",
- "actual": "Sonja Emmerich",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 4,
- "latencyMs": 1415.7616670000134
- },
- {
- "questionId": "q89",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Sonja Emmerich",
- "actual": "Sonja Emmerich",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 139,
- "latencyMs": 2635.5764160000253
- },
- {
- "questionId": "q89",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Sonja Emmerich",
- "actual": "Sonja Emmerich",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 10,
- "latencyMs": 1153.0579160000198
- },
- {
- "questionId": "q89",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Sonja Emmerich",
- "actual": "Sonja Emmerich",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 4,
- "latencyMs": 2894.0762920000125
- },
- {
- "questionId": "q90",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Frank Emmerich DVM",
- "actual": "Frank Emmerich DVM",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 140,
- "latencyMs": 6845.755584000028
- },
- {
- "questionId": "q90",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Frank Emmerich DVM",
- "actual": "Frank Emmerich DVM",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 10,
- "latencyMs": 2363.831957999966
- },
- {
- "questionId": "q90",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Frank Emmerich DVM",
- "actual": "Frank Emmerich DVM",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 5,
- "latencyMs": 2646.4628749999683
- },
- {
- "questionId": "q90",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Frank Emmerich DVM",
- "actual": "Frank Emmerich DVM",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 140,
- "latencyMs": 2236.9238749999786
- },
- {
- "questionId": "q90",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Frank Emmerich DVM",
- "actual": "Frank Emmerich DVM",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 10,
- "latencyMs": 1023.8160830000415
- },
- {
- "questionId": "q90",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Frank Emmerich DVM",
- "actual": "Frank Emmerich DVM",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 5,
- "latencyMs": 1165.2285000000265
- },
- {
- "questionId": "q90",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Frank Emmerich DVM",
- "actual": "Frank Emmerich DVM",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 268,
- "latencyMs": 4066.1428750000196
- },
- {
- "questionId": "q90",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Frank Emmerich DVM",
- "actual": "Frank Emmerich DVM",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 10,
- "latencyMs": 1570.4565409999923
- },
- {
- "questionId": "q90",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Frank Emmerich DVM",
- "actual": "Frank Emmerich DVM",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 5,
- "latencyMs": 3472.6348330000183
- },
- {
- "questionId": "q90",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Frank Emmerich DVM",
- "actual": "Frank Emmerich DVM",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 268,
- "latencyMs": 3361.3982500000275
- },
- {
- "questionId": "q90",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Frank Emmerich DVM",
- "actual": "Frank Emmerich DVM",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 10,
- "latencyMs": 1247.454334000009
- },
- {
- "questionId": "q90",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Frank Emmerich DVM",
- "actual": "Frank Emmerich DVM",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 5,
- "latencyMs": 1382.5874590000021
- },
- {
- "questionId": "q90",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Frank Emmerich DVM",
- "actual": "Frank Emmerich DVM",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 140,
- "latencyMs": 2949.110708000022
- },
- {
- "questionId": "q90",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Frank Emmerich DVM",
- "actual": "Frank Emmerich DVM",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 10,
- "latencyMs": 1160.699499999988
- },
- {
- "questionId": "q90",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Frank Emmerich DVM",
- "actual": "Frank Emmerich DVM",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 5,
- "latencyMs": 3016.852790999983
- },
- {
- "questionId": "q91",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Ronald Collins",
- "actual": "Ronald Collins",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 73,
- "latencyMs": 2769.32262500003
- },
- {
- "questionId": "q91",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Ronald Collins",
- "actual": "Ronald Collins",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 5,
- "latencyMs": 1252.1112919999869
- },
- {
- "questionId": "q91",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Ronald Collins",
- "actual": "Ronald Collins",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 2,
- "latencyMs": 1906.2817499999655
- },
- {
- "questionId": "q91",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Ronald Collins",
- "actual": "Ronald Collins",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 201,
- "latencyMs": 5391.403708000027
- },
- {
- "questionId": "q91",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Ronald Collins",
- "actual": "Ronald Collins",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 5,
- "latencyMs": 1126.4195000000182
- },
- {
- "questionId": "q91",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Ronald Collins",
- "actual": "Ronald Collins",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 2,
- "latencyMs": 1148.1653749999823
- },
- {
- "questionId": "q91",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Ronald Collins",
- "actual": "Ronald Collins",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 265,
- "latencyMs": 3649.6608329999726
- },
- {
- "questionId": "q91",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Ronald Collins",
- "actual": "Ronald Collins",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 5,
- "latencyMs": 1054.9641670000274
- },
- {
- "questionId": "q91",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Ronald Collins",
- "actual": "Ronald Collins",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 2,
- "latencyMs": 4520.085083000013
- },
- {
- "questionId": "q91",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Ronald Collins",
- "actual": "Ronald Collins",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 137,
- "latencyMs": 3783.5575830000453
- },
- {
- "questionId": "q91",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Ronald Collins",
- "actual": "Ronald Collins",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 5,
- "latencyMs": 1200.0155000000377
- },
- {
- "questionId": "q91",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Ronald Collins",
- "actual": "Ronald Collins",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 2,
- "latencyMs": 1914.0702499999898
- },
- {
- "questionId": "q91",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Ronald Collins",
- "actual": "Ronald Collins",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 265,
- "latencyMs": 8789.486250000016
- },
- {
- "questionId": "q91",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Ronald Collins",
- "actual": "Ronald Collins",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 5,
- "latencyMs": 1445.0254999999888
- },
- {
- "questionId": "q91",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Ronald Collins",
- "actual": "Ronald Collins",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 2,
- "latencyMs": 3330.7725830000127
- },
- {
- "questionId": "q92",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Jeannie Klein",
- "actual": "Jeannie Klein",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 201,
- "latencyMs": 6413.151542000007
- },
- {
- "questionId": "q92",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Jeannie Klein",
- "actual": "Jeannie Klein",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 8,
- "latencyMs": 1204.1578749999753
- },
- {
- "questionId": "q92",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Jeannie Klein",
- "actual": "Jeannie Klein",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 3,
- "latencyMs": 1412.2799170000362
- },
- {
- "questionId": "q92",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Jeannie Klein",
- "actual": "Jeannie Klein",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 137,
- "latencyMs": 2630.434041999979
- },
- {
- "questionId": "q92",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Jeannie Klein",
- "actual": "Jeannie Klein",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 8,
- "latencyMs": 1546.8669579999987
- },
- {
- "questionId": "q92",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Jeannie Klein",
- "actual": "Jeannie Klein",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 3,
- "latencyMs": 2373.892125000013
- },
- {
- "questionId": "q92",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Jeannie Klein",
- "actual": "Jeannie Klein",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 201,
- "latencyMs": 3202.2820420000353
- },
- {
- "questionId": "q92",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Jeannie Klein",
- "actual": "Jeannie Klein",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 8,
- "latencyMs": 1227.2948330000509
- },
- {
- "questionId": "q92",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Jeannie Klein",
- "actual": "Jeannie Klein",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 3,
- "latencyMs": 3743.526792000048
- },
- {
- "questionId": "q92",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Jeannie Klein",
- "actual": "Jeannie Klein",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 201,
- "latencyMs": 3238.171458000026
- },
- {
- "questionId": "q92",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Jeannie Klein",
- "actual": "Jeannie Klein",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 8,
- "latencyMs": 1180.7857080000103
- },
- {
- "questionId": "q92",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Jeannie Klein",
- "actual": "Jeannie Klein",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 3,
- "latencyMs": 1142.4927089999546
- },
- {
- "questionId": "q92",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Jeannie Klein",
- "actual": "Jeannie Klein",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 137,
- "latencyMs": 3021.9724590000114
- },
- {
- "questionId": "q92",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Jeannie Klein",
- "actual": "Jeannie Klein",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 8,
- "latencyMs": 1821.3516250000102
- },
- {
- "questionId": "q92",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Jeannie Klein",
- "actual": "Jeannie Klein",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 3,
- "latencyMs": 2796.1425000000163
- },
- {
- "questionId": "q93",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "Joshua Watsica",
- "actual": "Joshua Watsica",
- "isCorrect": true,
- "inputTokens": 9739,
- "outputTokens": 138,
- "latencyMs": 2788.065082999994
- },
- {
- "questionId": "q93",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "Joshua Watsica",
- "actual": "Joshua Watsica",
- "isCorrect": true,
- "inputTokens": 11907,
- "outputTokens": 8,
- "latencyMs": 1367.4712089999812
- },
- {
- "questionId": "q93",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "Joshua Watsica",
- "actual": "Joshua Watsica",
- "isCorrect": true,
- "inputTokens": 12113,
- "outputTokens": 4,
- "latencyMs": 1443.3402910000295
- },
- {
- "questionId": "q93",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "Joshua Watsica",
- "actual": "Joshua Watsica",
- "isCorrect": true,
- "inputTokens": 6013,
- "outputTokens": 202,
- "latencyMs": 3654.0896250000224
- },
- {
- "questionId": "q93",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "Joshua Watsica",
- "actual": "Joshua Watsica",
- "isCorrect": true,
- "inputTokens": 6993,
- "outputTokens": 8,
- "latencyMs": 1028.997875000001
- },
- {
- "questionId": "q93",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "Joshua Watsica",
- "actual": "Joshua Watsica",
- "isCorrect": true,
- "inputTokens": 7201,
- "outputTokens": 4,
- "latencyMs": 996.1445419999654
- },
- {
- "questionId": "q93",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "Joshua Watsica",
- "actual": "Joshua Watsica",
- "isCorrect": true,
- "inputTokens": 6781,
- "outputTokens": 266,
- "latencyMs": 6677.9684579999885
- },
- {
- "questionId": "q93",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "Joshua Watsica",
- "actual": "Joshua Watsica",
- "isCorrect": true,
- "inputTokens": 8414,
- "outputTokens": 8,
- "latencyMs": 1639.9640409999993
- },
- {
- "questionId": "q93",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "Joshua Watsica",
- "actual": "Joshua Watsica",
- "isCorrect": true,
- "inputTokens": 7838,
- "outputTokens": 4,
- "latencyMs": 1652.2167079999927
- },
- {
- "questionId": "q93",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "Joshua Watsica",
- "actual": "Joshua Watsica",
- "isCorrect": true,
- "inputTokens": 11037,
- "outputTokens": 202,
- "latencyMs": 3802.7754580000183
- },
- {
- "questionId": "q93",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "Joshua Watsica",
- "actual": "Joshua Watsica",
- "isCorrect": true,
- "inputTokens": 13380,
- "outputTokens": 8,
- "latencyMs": 3327.393792000017
- },
- {
- "questionId": "q93",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "Joshua Watsica",
- "actual": "Joshua Watsica",
- "isCorrect": true,
- "inputTokens": 13451,
- "outputTokens": 4,
- "latencyMs": 1257.9510420000297
- },
- {
- "questionId": "q93",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "Joshua Watsica",
- "actual": "Joshua Watsica",
- "isCorrect": true,
- "inputTokens": 7373,
- "outputTokens": 202,
- "latencyMs": 3074.6058750000084
- },
- {
- "questionId": "q93",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "Joshua Watsica",
- "actual": "Joshua Watsica",
- "isCorrect": true,
- "inputTokens": 8385,
- "outputTokens": 8,
- "latencyMs": 1146.4290829999954
- },
- {
- "questionId": "q93",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "Joshua Watsica",
- "actual": "Joshua Watsica",
- "isCorrect": true,
- "inputTokens": 8427,
- "outputTokens": 4,
- "latencyMs": 1712.0292920000502
- },
- {
- "questionId": "q94",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 9735,
- "outputTokens": 967,
- "latencyMs": 11158.31029200001
- },
- {
- "questionId": "q94",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 11902,
- "outputTokens": 5,
- "latencyMs": 1969.3274160000146
- },
- {
- "questionId": "q94",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "9",
- "isCorrect": false,
- "inputTokens": 12107,
- "outputTokens": 1,
- "latencyMs": 1012.6363329999731
- },
- {
- "questionId": "q94",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 6009,
- "outputTokens": 839,
- "latencyMs": 12387.267332999967
- },
- {
- "questionId": "q94",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 6988,
- "outputTokens": 5,
- "latencyMs": 1146.578125
- },
- {
- "questionId": "q94",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 7195,
- "outputTokens": 2,
- "latencyMs": 6065.854290999996
- },
- {
- "questionId": "q94",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 6777,
- "outputTokens": 583,
- "latencyMs": 5722.737124999985
- },
- {
- "questionId": "q94",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 8409,
- "outputTokens": 5,
- "latencyMs": 1162.2037910000072
- },
- {
- "questionId": "q94",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 7832,
- "outputTokens": 2,
- "latencyMs": 5346.4215829999885
- },
- {
- "questionId": "q94",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 11033,
- "outputTokens": 967,
- "latencyMs": 9711.181042000011
- },
- {
- "questionId": "q94",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 13375,
- "outputTokens": 5,
- "latencyMs": 1180.9850839999854
- },
- {
- "questionId": "q94",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 13445,
- "outputTokens": 2,
- "latencyMs": 6629.622541000019
- },
- {
- "questionId": "q94",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 7369,
- "outputTokens": 583,
- "latencyMs": 5019.671374999976
- },
- {
- "questionId": "q94",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 8380,
- "outputTokens": 5,
- "latencyMs": 1167.7568749999627
- },
- {
- "questionId": "q94",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "9",
- "isCorrect": false,
- "inputTokens": 8421,
- "outputTokens": 1,
- "latencyMs": 1625.168708000041
- },
- {
- "questionId": "q95",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 9735,
- "outputTokens": 775,
- "latencyMs": 7411.724082999979
- },
- {
- "questionId": "q95",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 11902,
- "outputTokens": 5,
- "latencyMs": 1554.4648750000051
- },
- {
- "questionId": "q95",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 12107,
- "outputTokens": 2,
- "latencyMs": 2038.4110000000219
- },
- {
- "questionId": "q95",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 6009,
- "outputTokens": 455,
- "latencyMs": 8813.801208000048
- },
- {
- "questionId": "q95",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "7",
- "isCorrect": false,
- "inputTokens": 6988,
- "outputTokens": 5,
- "latencyMs": 1344.8304580000113
- },
- {
- "questionId": "q95",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "9",
- "isCorrect": false,
- "inputTokens": 7195,
- "outputTokens": 1,
- "latencyMs": 795.6426249999786
- },
- {
- "questionId": "q95",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 6777,
- "outputTokens": 903,
- "latencyMs": 9739.22754199995
- },
- {
- "questionId": "q95",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 8409,
- "outputTokens": 5,
- "latencyMs": 1163.627124999999
- },
- {
- "questionId": "q95",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 7832,
- "outputTokens": 2,
- "latencyMs": 4444.457624999981
- },
- {
- "questionId": "q95",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 11033,
- "outputTokens": 1415,
- "latencyMs": 14405.558917000017
- },
- {
- "questionId": "q95",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 13375,
- "outputTokens": 5,
- "latencyMs": 1603.5181249999441
- },
- {
- "questionId": "q95",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "9",
- "isCorrect": false,
- "inputTokens": 13445,
- "outputTokens": 1,
- "latencyMs": 1466.009625000006
- },
- {
- "questionId": "q95",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 7369,
- "outputTokens": 583,
- "latencyMs": 50147.72520799999
- },
- {
- "questionId": "q95",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 8380,
- "outputTokens": 5,
- "latencyMs": 1600.4076660000137
- },
- {
- "questionId": "q95",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "9",
- "isCorrect": false,
- "inputTokens": 8421,
- "outputTokens": 1,
- "latencyMs": 1974.6425419999869
- },
- {
- "questionId": "q96",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 9736,
- "outputTokens": 839,
- "latencyMs": 6029.78350000002
- },
- {
- "questionId": "q96",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 11902,
- "outputTokens": 5,
- "latencyMs": 1108.4398330000113
- },
- {
- "questionId": "q96",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 12108,
- "outputTokens": 1,
- "latencyMs": 1581.965291999979
- },
- {
- "questionId": "q96",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 6010,
- "outputTokens": 647,
- "latencyMs": 21748.776332999987
- },
- {
- "questionId": "q96",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "7",
- "isCorrect": false,
- "inputTokens": 6988,
- "outputTokens": 5,
- "latencyMs": 2333.9817080000066
- },
- {
- "questionId": "q96",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 7196,
- "outputTokens": 1,
- "latencyMs": 1115.266958000022
- },
- {
- "questionId": "q96",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 6778,
- "outputTokens": 583,
- "latencyMs": 5761.870166999986
- },
- {
- "questionId": "q96",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 8409,
- "outputTokens": 5,
- "latencyMs": 1110.2957919999608
- },
- {
- "questionId": "q96",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 7833,
- "outputTokens": 2,
- "latencyMs": 5206.065542000055
- },
- {
- "questionId": "q96",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 11034,
- "outputTokens": 839,
- "latencyMs": 10213.124458000006
- },
- {
- "questionId": "q96",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 13375,
- "outputTokens": 5,
- "latencyMs": 1085.2472919999855
- },
- {
- "questionId": "q96",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 13446,
- "outputTokens": 2,
- "latencyMs": 6148.1957500000135
- },
- {
- "questionId": "q96",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 7370,
- "outputTokens": 647,
- "latencyMs": 10606.282000000007
- },
- {
- "questionId": "q96",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "7",
- "isCorrect": false,
- "inputTokens": 8380,
- "outputTokens": 5,
- "latencyMs": 1061.5612079999992
- },
- {
- "questionId": "q96",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 8422,
- "outputTokens": 1,
- "latencyMs": 940.8403330000001
- },
- {
- "questionId": "q97",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 9736,
- "outputTokens": 647,
- "latencyMs": 6429.81362500001
- },
- {
- "questionId": "q97",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 11902,
- "outputTokens": 5,
- "latencyMs": 1373.5127499999944
- },
- {
- "questionId": "q97",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "9",
- "isCorrect": false,
- "inputTokens": 12107,
- "outputTokens": 1,
- "latencyMs": 1618.8752080000122
- },
- {
- "questionId": "q97",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 6010,
- "outputTokens": 583,
- "latencyMs": 5288.105207999994
- },
- {
- "questionId": "q97",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 6988,
- "outputTokens": 5,
- "latencyMs": 974.4008749999921
- },
- {
- "questionId": "q97",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 7195,
- "outputTokens": 2,
- "latencyMs": 994.4026250000461
- },
- {
- "questionId": "q97",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 6778,
- "outputTokens": 1479,
- "latencyMs": 44513.282000000065
- },
- {
- "questionId": "q97",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 8409,
- "outputTokens": 5,
- "latencyMs": 1579.2647080000024
- },
- {
- "questionId": "q97",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 7832,
- "outputTokens": 2,
- "latencyMs": 6760.291374999972
- },
- {
- "questionId": "q97",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 11034,
- "outputTokens": 647,
- "latencyMs": 6886.205707999994
- },
- {
- "questionId": "q97",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 13375,
- "outputTokens": 5,
- "latencyMs": 1140.8538749999716
- },
- {
- "questionId": "q97",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 13445,
- "outputTokens": 2,
- "latencyMs": 5500.930916999932
- },
- {
- "questionId": "q97",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 7370,
- "outputTokens": 647,
- "latencyMs": 6873.12387499999
- },
- {
- "questionId": "q97",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "9",
- "isCorrect": false,
- "inputTokens": 8380,
- "outputTokens": 5,
- "latencyMs": 1385.4246660000063
- },
- {
- "questionId": "q97",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "9",
- "isCorrect": false,
- "inputTokens": 8421,
- "outputTokens": 1,
- "latencyMs": 1070.8007499999949
- },
- {
- "questionId": "q98",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 9736,
- "outputTokens": 775,
- "latencyMs": 10215.419124999957
- },
- {
- "questionId": "q98",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 11902,
- "outputTokens": 5,
- "latencyMs": 1169.6882500000065
- },
- {
- "questionId": "q98",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 12107,
- "outputTokens": 2,
- "latencyMs": 1497.445791999984
- },
- {
- "questionId": "q98",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 6010,
- "outputTokens": 583,
- "latencyMs": 17780.296249999956
- },
- {
- "questionId": "q98",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 6988,
- "outputTokens": 5,
- "latencyMs": 1507.771624999994
- },
- {
- "questionId": "q98",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 7195,
- "outputTokens": 2,
- "latencyMs": 1089.9117079999996
- },
- {
- "questionId": "q98",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 6778,
- "outputTokens": 583,
- "latencyMs": 6443.644124999992
- },
- {
- "questionId": "q98",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 8409,
- "outputTokens": 5,
- "latencyMs": 1212.1155410000356
- },
- {
- "questionId": "q98",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 7832,
- "outputTokens": 2,
- "latencyMs": 5152.548582999967
- },
- {
- "questionId": "q98",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 11034,
- "outputTokens": 647,
- "latencyMs": 12689.804665999953
- },
- {
- "questionId": "q98",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 13375,
- "outputTokens": 5,
- "latencyMs": 1122.1935420000227
- },
- {
- "questionId": "q98",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 13445,
- "outputTokens": 2,
- "latencyMs": 1011.1309159999946
- },
- {
- "questionId": "q98",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 7370,
- "outputTokens": 711,
- "latencyMs": 9792.569583000033
- },
- {
- "questionId": "q98",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "10",
- "actual": "8",
- "isCorrect": false,
- "inputTokens": 8380,
- "outputTokens": 5,
- "latencyMs": 1111.848708000034
- },
- {
- "questionId": "q98",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "10",
- "actual": "10",
- "isCorrect": true,
- "inputTokens": 8421,
- "outputTokens": 2,
- "latencyMs": 868.7284579999978
- },
- {
- "questionId": "q99",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "42342.25",
- "actual": "41304.82",
- "isCorrect": false,
- "inputTokens": 9736,
- "outputTokens": 2698,
- "latencyMs": 46504.10175000003
- },
- {
- "questionId": "q99",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "42342.25",
- "actual": "50,847.47",
- "isCorrect": false,
- "inputTokens": 11902,
- "outputTokens": 9,
- "latencyMs": 1987.3346250000177
- },
- {
- "questionId": "q99",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "42342.25",
- "actual": "40000.00",
- "isCorrect": false,
- "inputTokens": 12108,
- "outputTokens": 8,
- "latencyMs": 7707.775332999998
- },
- {
- "questionId": "q99",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "42342.25",
- "actual": "42342.25",
- "isCorrect": true,
- "inputTokens": 6010,
- "outputTokens": 5578,
- "latencyMs": 48586.554000000004
- },
- {
- "questionId": "q99",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "42342.25",
- "actual": "41,847.47",
- "isCorrect": false,
- "inputTokens": 6988,
- "outputTokens": 9,
- "latencyMs": 3438.9107920000097
- },
- {
- "questionId": "q99",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "42342.25",
- "actual": "40000.00",
- "isCorrect": false,
- "inputTokens": 7196,
- "outputTokens": 8,
- "latencyMs": 6512.329665999976
- },
- {
- "questionId": "q99",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "42342.25",
- "actual": "42342.25",
- "isCorrect": true,
- "inputTokens": 6778,
- "outputTokens": 4874,
- "latencyMs": 37911.18645799998
- },
- {
- "questionId": "q99",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "42342.25",
- "actual": "48,847.47",
- "isCorrect": false,
- "inputTokens": 8409,
- "outputTokens": 9,
- "latencyMs": 1071.3846250000643
- },
- {
- "questionId": "q99",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "42342.25",
- "actual": "40000.00",
- "isCorrect": false,
- "inputTokens": 7833,
- "outputTokens": 8,
- "latencyMs": 7891.89620800002
- },
- {
- "questionId": "q99",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "42342.25",
- "actual": "42342.25",
- "isCorrect": true,
- "inputTokens": 11034,
- "outputTokens": 3338,
- "latencyMs": 23923.247208000044
- },
- {
- "questionId": "q99",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "42342.25",
- "actual": "47,847.47",
- "isCorrect": false,
- "inputTokens": 13375,
- "outputTokens": 9,
- "latencyMs": 1182.405207999982
- },
- {
- "questionId": "q99",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "42342.25",
- "actual": "43000.00",
- "isCorrect": false,
- "inputTokens": 13446,
- "outputTokens": 8,
- "latencyMs": 9388.739500000025
- },
- {
- "questionId": "q99",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "42342.25",
- "actual": "42342.25",
- "isCorrect": true,
- "inputTokens": 7370,
- "outputTokens": 3082,
- "latencyMs": 31024.954041999998
- },
- {
- "questionId": "q99",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "42342.25",
- "actual": "47,847.89",
- "isCorrect": false,
- "inputTokens": 8380,
- "outputTokens": 9,
- "latencyMs": 1240.8969590000343
- },
- {
- "questionId": "q99",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "42342.25",
- "actual": "30900.09",
- "isCorrect": false,
- "inputTokens": 8422,
- "outputTokens": 8,
- "latencyMs": 2345.1206249999814
- },
- {
- "questionId": "q100",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "44",
- "actual": "44",
- "isCorrect": true,
- "inputTokens": 9738,
- "outputTokens": 2567,
- "latencyMs": 53935.78729200002
- },
- {
- "questionId": "q100",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "44",
- "actual": "48",
- "isCorrect": false,
- "inputTokens": 11904,
- "outputTokens": 5,
- "latencyMs": 1066.0944579999195
- },
- {
- "questionId": "q100",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "44",
- "actual": "45",
- "isCorrect": false,
- "inputTokens": 12112,
- "outputTokens": 2,
- "latencyMs": 1494.8697500000708
- },
- {
- "questionId": "q100",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "44",
- "actual": "44",
- "isCorrect": true,
- "inputTokens": 6012,
- "outputTokens": 1351,
- "latencyMs": 14949.407374999952
- },
- {
- "questionId": "q100",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "44",
- "actual": "47",
- "isCorrect": false,
- "inputTokens": 6990,
- "outputTokens": 5,
- "latencyMs": 967.8411250000354
- },
- {
- "questionId": "q100",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "44",
- "actual": "44",
- "isCorrect": true,
- "inputTokens": 7200,
- "outputTokens": 2,
- "latencyMs": 12734.97745799995
- },
- {
- "questionId": "q100",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "44",
- "actual": "44",
- "isCorrect": true,
- "inputTokens": 6780,
- "outputTokens": 1607,
- "latencyMs": 15572.392542000045
- },
- {
- "questionId": "q100",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "44",
- "actual": "48",
- "isCorrect": false,
- "inputTokens": 8411,
- "outputTokens": 5,
- "latencyMs": 2052.4572499999776
- },
- {
- "questionId": "q100",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "44",
- "actual": "44",
- "isCorrect": true,
- "inputTokens": 7837,
- "outputTokens": 2,
- "latencyMs": 13219.975833000033
- },
- {
- "questionId": "q100",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "44",
- "actual": "44",
- "isCorrect": true,
- "inputTokens": 11036,
- "outputTokens": 1735,
- "latencyMs": 69773.56662499998
- },
- {
- "questionId": "q100",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "44",
- "actual": "45",
- "isCorrect": false,
- "inputTokens": 13377,
- "outputTokens": 5,
- "latencyMs": 1719.8178329999791
- },
- {
- "questionId": "q100",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "44",
- "actual": "44",
- "isCorrect": true,
- "inputTokens": 13450,
- "outputTokens": 2,
- "latencyMs": 11322.527541999938
- },
- {
- "questionId": "q100",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "44",
- "actual": "44",
- "isCorrect": true,
- "inputTokens": 7372,
- "outputTokens": 1607,
- "latencyMs": 20736.131416000077
- },
- {
- "questionId": "q100",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "44",
- "actual": "45",
- "isCorrect": false,
- "inputTokens": 8382,
- "outputTokens": 5,
- "latencyMs": 1052.186207999941
- },
- {
- "questionId": "q100",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "44",
- "actual": "47",
- "isCorrect": false,
- "inputTokens": 8426,
- "outputTokens": 2,
- "latencyMs": 1184.4893750000047
- },
- {
- "questionId": "q101",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "39",
- "actual": "39",
- "isCorrect": true,
- "inputTokens": 9738,
- "outputTokens": 967,
- "latencyMs": 12279.209374999977
- },
- {
- "questionId": "q101",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "39",
- "actual": "38",
- "isCorrect": false,
- "inputTokens": 11904,
- "outputTokens": 5,
- "latencyMs": 1297.988250000053
- },
- {
- "questionId": "q101",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "39",
- "actual": "45",
- "isCorrect": false,
- "inputTokens": 12112,
- "outputTokens": 2,
- "latencyMs": 1760.7460000000428
- },
- {
- "questionId": "q101",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "39",
- "actual": "39",
- "isCorrect": true,
- "inputTokens": 6012,
- "outputTokens": 1351,
- "latencyMs": 10500.295707999961
- },
- {
- "questionId": "q101",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "39",
- "actual": "38",
- "isCorrect": false,
- "inputTokens": 6990,
- "outputTokens": 5,
- "latencyMs": 1138.843208999955
- },
- {
- "questionId": "q101",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "39",
- "actual": "39",
- "isCorrect": true,
- "inputTokens": 7200,
- "outputTokens": 2,
- "latencyMs": 9441.675416999962
- },
- {
- "questionId": "q101",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "39",
- "actual": "39",
- "isCorrect": true,
- "inputTokens": 6780,
- "outputTokens": 1863,
- "latencyMs": 19287.06454199995
- },
- {
- "questionId": "q101",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "39",
- "actual": "38",
- "isCorrect": false,
- "inputTokens": 8411,
- "outputTokens": 5,
- "latencyMs": 1490.810999999987
- },
- {
- "questionId": "q101",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "39",
- "actual": "39",
- "isCorrect": true,
- "inputTokens": 7837,
- "outputTokens": 2,
- "latencyMs": 12331.178375000018
- },
- {
- "questionId": "q101",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "39",
- "actual": "39",
- "isCorrect": true,
- "inputTokens": 11036,
- "outputTokens": 3335,
- "latencyMs": 26443.42041599995
- },
- {
- "questionId": "q101",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "39",
- "actual": "38",
- "isCorrect": false,
- "inputTokens": 13377,
- "outputTokens": 5,
- "latencyMs": 1419.3634590000147
- },
- {
- "questionId": "q101",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "39",
- "actual": "39",
- "isCorrect": true,
- "inputTokens": 13450,
- "outputTokens": 2,
- "latencyMs": 11403.771042000037
- },
- {
- "questionId": "q101",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "39",
- "actual": "39",
- "isCorrect": true,
- "inputTokens": 7372,
- "outputTokens": 1671,
- "latencyMs": 14214.94204200001
- },
- {
- "questionId": "q101",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "39",
- "actual": "38",
- "isCorrect": false,
- "inputTokens": 8382,
- "outputTokens": 5,
- "latencyMs": 1183.1556669999845
- },
- {
- "questionId": "q101",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "39",
- "actual": "39",
- "isCorrect": true,
- "inputTokens": 8426,
- "outputTokens": 2,
- "latencyMs": 12192.347249999992
- },
- {
- "questionId": "q102",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "32",
- "actual": "32",
- "isCorrect": true,
- "inputTokens": 9738,
- "outputTokens": 2311,
- "latencyMs": 286602.893667
- },
- {
- "questionId": "q102",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "32",
- "actual": "28",
- "isCorrect": false,
- "inputTokens": 11904,
- "outputTokens": 5,
- "latencyMs": 1132.721833000076
- },
- {
- "questionId": "q102",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "32",
- "actual": "37",
- "isCorrect": false,
- "inputTokens": 12112,
- "outputTokens": 2,
- "latencyMs": 1632.5237090000883
- },
- {
- "questionId": "q102",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "32",
- "actual": "32",
- "isCorrect": true,
- "inputTokens": 6012,
- "outputTokens": 839,
- "latencyMs": 12142.227125000092
- },
- {
- "questionId": "q102",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "32",
- "actual": "26",
- "isCorrect": false,
- "inputTokens": 6990,
- "outputTokens": 5,
- "latencyMs": 1184.7071669999277
- },
- {
- "questionId": "q102",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "32",
- "actual": "37",
- "isCorrect": false,
- "inputTokens": 7200,
- "outputTokens": 2,
- "latencyMs": 1000.1081669999985
- },
- {
- "questionId": "q102",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "32",
- "actual": "32",
- "isCorrect": true,
- "inputTokens": 6780,
- "outputTokens": 1287,
- "latencyMs": 45846.97675000003
- },
- {
- "questionId": "q102",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "32",
- "actual": "28",
- "isCorrect": false,
- "inputTokens": 8411,
- "outputTokens": 5,
- "latencyMs": 1744.5200829999521
- },
- {
- "questionId": "q102",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "32",
- "actual": "32",
- "isCorrect": true,
- "inputTokens": 7837,
- "outputTokens": 2,
- "latencyMs": 12398.869249999989
- },
- {
- "questionId": "q102",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "32",
- "actual": "32",
- "isCorrect": true,
- "inputTokens": 11036,
- "outputTokens": 1351,
- "latencyMs": 12448.268124999944
- },
- {
- "questionId": "q102",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "32",
- "actual": "28",
- "isCorrect": false,
- "inputTokens": 13377,
- "outputTokens": 5,
- "latencyMs": 1155.887459000107
- },
- {
- "questionId": "q102",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "32",
- "actual": "32",
- "isCorrect": true,
- "inputTokens": 13450,
- "outputTokens": 2,
- "latencyMs": 12662.306666000048
- },
- {
- "questionId": "q102",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "32",
- "actual": "31",
- "isCorrect": false,
- "inputTokens": 7372,
- "outputTokens": 1799,
- "latencyMs": 15611.27658299997
- },
- {
- "questionId": "q102",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "32",
- "actual": "26",
- "isCorrect": false,
- "inputTokens": 8382,
- "outputTokens": 5,
- "latencyMs": 1592.5243330000667
- },
- {
- "questionId": "q102",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "32",
- "actual": "37",
- "isCorrect": false,
- "inputTokens": 8426,
- "outputTokens": 2,
- "latencyMs": 1257.715124999988
- },
- {
- "questionId": "q103",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "6975",
- "actual": "6975",
- "isCorrect": true,
- "inputTokens": 3712,
- "outputTokens": 72,
- "latencyMs": 1883.4624169999734
- },
- {
- "questionId": "q103",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "6975",
- "actual": "6975",
- "isCorrect": true,
- "inputTokens": 4080,
- "outputTokens": 6,
- "latencyMs": 1072.3808749999152
- },
- {
- "questionId": "q103",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "6975",
- "actual": "6975",
- "isCorrect": true,
- "inputTokens": 4784,
- "outputTokens": 4,
- "latencyMs": 2622.4323750000913
- },
- {
- "questionId": "q103",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "6975",
- "actual": "6975",
- "isCorrect": true,
- "inputTokens": 1563,
- "outputTokens": 136,
- "latencyMs": 15307.557292000041
- },
- {
- "questionId": "q103",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "6975",
- "actual": "6975",
- "isCorrect": true,
- "inputTokens": 1509,
- "outputTokens": 6,
- "latencyMs": 1084.2609999999404
- },
- {
- "questionId": "q103",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "6975",
- "actual": "6975",
- "isCorrect": true,
- "inputTokens": 2271,
- "outputTokens": 4,
- "latencyMs": 2758.0986669999547
- },
- {
- "questionId": "q103",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "6975",
- "actual": "6975",
- "isCorrect": true,
- "inputTokens": 1441,
- "outputTokens": 72,
- "latencyMs": 1854.1639169999398
- },
- {
- "questionId": "q103",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "6975",
- "actual": "6975",
- "isCorrect": true,
- "inputTokens": 1445,
- "outputTokens": 6,
- "latencyMs": 948.2132079999428
- },
- {
- "questionId": "q103",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "6975",
- "actual": "6975",
- "isCorrect": true,
- "inputTokens": 2208,
- "outputTokens": 4,
- "latencyMs": 2243.337582999957
- },
- {
- "questionId": "q103",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "6975",
- "actual": "6975",
- "isCorrect": true,
- "inputTokens": 4423,
- "outputTokens": 200,
- "latencyMs": 4750.478917
- },
- {
- "questionId": "q103",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "6975",
- "actual": "6975",
- "isCorrect": true,
- "inputTokens": 4787,
- "outputTokens": 6,
- "latencyMs": 1168.2797080000164
- },
- {
- "questionId": "q103",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "6975",
- "actual": "6975",
- "isCorrect": true,
- "inputTokens": 5431,
- "outputTokens": 4,
- "latencyMs": 1235.7723750000587
- },
- {
- "questionId": "q103",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "6975",
- "actual": "6975",
- "isCorrect": true,
- "inputTokens": 2985,
- "outputTokens": 72,
- "latencyMs": 4593.343416000018
- },
- {
- "questionId": "q103",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "6975",
- "actual": "6975",
- "isCorrect": true,
- "inputTokens": 3110,
- "outputTokens": 6,
- "latencyMs": 1005.8936250000261
- },
- {
- "questionId": "q103",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "6975",
- "actual": "6975",
- "isCorrect": true,
- "inputTokens": 3814,
- "outputTokens": 4,
- "latencyMs": 1302.4004580000183
- },
- {
- "questionId": "q104",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "6686.23",
- "actual": "6686.23",
- "isCorrect": true,
- "inputTokens": 3711,
- "outputTokens": 138,
- "latencyMs": 10838.235042000073
- },
- {
- "questionId": "q104",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "6686.23",
- "actual": "6686.23",
- "isCorrect": true,
- "inputTokens": 4079,
- "outputTokens": 8,
- "latencyMs": 1148.390958999982
- },
- {
- "questionId": "q104",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "6686.23",
- "actual": "6686.23",
- "isCorrect": true,
- "inputTokens": 4783,
- "outputTokens": 7,
- "latencyMs": 2339.6254999999655
- },
- {
- "questionId": "q104",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "6686.23",
- "actual": "6686.23",
- "isCorrect": true,
- "inputTokens": 1562,
- "outputTokens": 138,
- "latencyMs": 7077.6732909999555
- },
- {
- "questionId": "q104",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "6686.23",
- "actual": "6686.23",
- "isCorrect": true,
- "inputTokens": 1508,
- "outputTokens": 8,
- "latencyMs": 1064.9028750000289
- },
- {
- "questionId": "q104",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "6686.23",
- "actual": "6686.23",
- "isCorrect": true,
- "inputTokens": 2270,
- "outputTokens": 7,
- "latencyMs": 2335.216167000006
- },
- {
- "questionId": "q104",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "6686.23",
- "actual": "6686.23",
- "isCorrect": true,
- "inputTokens": 1440,
- "outputTokens": 74,
- "latencyMs": 5253.633124999935
- },
- {
- "questionId": "q104",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "6686.23",
- "actual": "6686.23",
- "isCorrect": true,
- "inputTokens": 1444,
- "outputTokens": 8,
- "latencyMs": 1438.5572920000413
- },
- {
- "questionId": "q104",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "6686.23",
- "actual": "6686.23",
- "isCorrect": true,
- "inputTokens": 2207,
- "outputTokens": 7,
- "latencyMs": 1807.325458999956
- },
- {
- "questionId": "q104",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "6686.23",
- "actual": "6686.23",
- "isCorrect": true,
- "inputTokens": 4422,
- "outputTokens": 138,
- "latencyMs": 3436.290666999994
- },
- {
- "questionId": "q104",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "6686.23",
- "actual": "6686.23",
- "isCorrect": true,
- "inputTokens": 4786,
- "outputTokens": 8,
- "latencyMs": 1125.5812910000095
- },
- {
- "questionId": "q104",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "6686.23",
- "actual": "6686.23",
- "isCorrect": true,
- "inputTokens": 5430,
- "outputTokens": 7,
- "latencyMs": 984.154334000079
- },
- {
- "questionId": "q104",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "6686.23",
- "actual": "6686.23",
- "isCorrect": true,
- "inputTokens": 2984,
- "outputTokens": 138,
- "latencyMs": 4561.665000000037
- },
- {
- "questionId": "q104",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "6686.23",
- "actual": "6686.23",
- "isCorrect": true,
- "inputTokens": 3109,
- "outputTokens": 8,
- "latencyMs": 1273.080958000035
- },
- {
- "questionId": "q104",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "6686.23",
- "actual": "6686.23",
- "isCorrect": true,
- "inputTokens": 3813,
- "outputTokens": 7,
- "latencyMs": 1065.2617909999099
- },
- {
- "questionId": "q105",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "7500",
- "actual": "7500",
- "isCorrect": true,
- "inputTokens": 3712,
- "outputTokens": 200,
- "latencyMs": 3926.1200409999583
- },
- {
- "questionId": "q105",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "7500",
- "actual": "7500",
- "isCorrect": true,
- "inputTokens": 4080,
- "outputTokens": 6,
- "latencyMs": 1170.2935419999994
- },
- {
- "questionId": "q105",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "7500",
- "actual": "7500",
- "isCorrect": true,
- "inputTokens": 4784,
- "outputTokens": 4,
- "latencyMs": 2907.920374999987
- },
- {
- "questionId": "q105",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "7500",
- "actual": "7500",
- "isCorrect": true,
- "inputTokens": 1563,
- "outputTokens": 136,
- "latencyMs": 6013.766874999972
- },
- {
- "questionId": "q105",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "7500",
- "actual": "7500",
- "isCorrect": true,
- "inputTokens": 1509,
- "outputTokens": 6,
- "latencyMs": 1029.452791999909
- },
- {
- "questionId": "q105",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "7500",
- "actual": "7500",
- "isCorrect": true,
- "inputTokens": 2271,
- "outputTokens": 4,
- "latencyMs": 1767.9035409999778
- },
- {
- "questionId": "q105",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "7500",
- "actual": "7500",
- "isCorrect": true,
- "inputTokens": 1441,
- "outputTokens": 200,
- "latencyMs": 2931.0335839999607
- },
- {
- "questionId": "q105",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "7500",
- "actual": "7500",
- "isCorrect": true,
- "inputTokens": 1445,
- "outputTokens": 6,
- "latencyMs": 857.5665409999201
- },
- {
- "questionId": "q105",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "7500",
- "actual": "7500",
- "isCorrect": true,
- "inputTokens": 2208,
- "outputTokens": 4,
- "latencyMs": 1870.161458000075
- },
- {
- "questionId": "q105",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "7500",
- "actual": "7500",
- "isCorrect": true,
- "inputTokens": 4423,
- "outputTokens": 136,
- "latencyMs": 2792.1963339999784
- },
- {
- "questionId": "q105",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "7500",
- "actual": "7500",
- "isCorrect": true,
- "inputTokens": 4787,
- "outputTokens": 6,
- "latencyMs": 1112.5085419999668
- },
- {
- "questionId": "q105",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "7500",
- "actual": "7500",
- "isCorrect": true,
- "inputTokens": 5431,
- "outputTokens": 4,
- "latencyMs": 2572.699583999929
- },
- {
- "questionId": "q105",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "7500",
- "actual": "7500",
- "isCorrect": true,
- "inputTokens": 2985,
- "outputTokens": 136,
- "latencyMs": 3129.4847079999745
- },
- {
- "questionId": "q105",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "7500",
- "actual": "7500",
- "isCorrect": true,
- "inputTokens": 3110,
- "outputTokens": 6,
- "latencyMs": 2352.252790999948
- },
- {
- "questionId": "q105",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "7500",
- "actual": "7500",
- "isCorrect": true,
- "inputTokens": 3814,
- "outputTokens": 4,
- "latencyMs": 1623.8393749999814
- },
- {
- "questionId": "q106",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "14297.05",
- "actual": "14297.05",
- "isCorrect": true,
- "inputTokens": 3711,
- "outputTokens": 74,
- "latencyMs": 5410.545292000053
- },
- {
- "questionId": "q106",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "14297.05",
- "actual": "14297.05",
- "isCorrect": true,
- "inputTokens": 4079,
- "outputTokens": 8,
- "latencyMs": 1382.8987500000512
- },
- {
- "questionId": "q106",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "14297.05",
- "actual": "14297.05",
- "isCorrect": true,
- "inputTokens": 4783,
- "outputTokens": 8,
- "latencyMs": 2918.163458999945
- },
- {
- "questionId": "q106",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "14297.05",
- "actual": "14297.05",
- "isCorrect": true,
- "inputTokens": 1562,
- "outputTokens": 138,
- "latencyMs": 2478.2083329999587
- },
- {
- "questionId": "q106",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "14297.05",
- "actual": "14297.05",
- "isCorrect": true,
- "inputTokens": 1508,
- "outputTokens": 8,
- "latencyMs": 1265.4150420000078
- },
- {
- "questionId": "q106",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "14297.05",
- "actual": "14297.05",
- "isCorrect": true,
- "inputTokens": 2270,
- "outputTokens": 8,
- "latencyMs": 1943.8234170000069
- },
- {
- "questionId": "q106",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "14297.05",
- "actual": "14297.05",
- "isCorrect": true,
- "inputTokens": 1440,
- "outputTokens": 138,
- "latencyMs": 4516.7844160000095
- },
- {
- "questionId": "q106",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "14297.05",
- "actual": "14297.05",
- "isCorrect": true,
- "inputTokens": 1444,
- "outputTokens": 8,
- "latencyMs": 1502.5052920000162
- },
- {
- "questionId": "q106",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "14297.05",
- "actual": "14297.05",
- "isCorrect": true,
- "inputTokens": 2207,
- "outputTokens": 8,
- "latencyMs": 2691.783666000003
- },
- {
- "questionId": "q106",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "14297.05",
- "actual": "14297.05",
- "isCorrect": true,
- "inputTokens": 4422,
- "outputTokens": 138,
- "latencyMs": 4047.482250000001
- },
- {
- "questionId": "q106",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "14297.05",
- "actual": "14297.05",
- "isCorrect": true,
- "inputTokens": 4786,
- "outputTokens": 8,
- "latencyMs": 1547.010666999966
- },
- {
- "questionId": "q106",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "14297.05",
- "actual": "14297.05",
- "isCorrect": true,
- "inputTokens": 5430,
- "outputTokens": 8,
- "latencyMs": 1679.222165999934
- },
- {
- "questionId": "q106",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "14297.05",
- "actual": "14297.05",
- "isCorrect": true,
- "inputTokens": 2984,
- "outputTokens": 202,
- "latencyMs": 4740.509624999948
- },
- {
- "questionId": "q106",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "14297.05",
- "actual": "14297.05",
- "isCorrect": true,
- "inputTokens": 3109,
- "outputTokens": 8,
- "latencyMs": 1271.0033330000006
- },
- {
- "questionId": "q106",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "14297.05",
- "actual": "14297.05",
- "isCorrect": true,
- "inputTokens": 3813,
- "outputTokens": 8,
- "latencyMs": 2636.093916999991
- },
- {
- "questionId": "q107",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "6692",
- "actual": "6692",
- "isCorrect": true,
- "inputTokens": 3712,
- "outputTokens": 72,
- "latencyMs": 8298.315874999971
- },
- {
- "questionId": "q107",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "6692",
- "actual": "6692",
- "isCorrect": true,
- "inputTokens": 4080,
- "outputTokens": 6,
- "latencyMs": 1520.9959589999635
- },
- {
- "questionId": "q107",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "6692",
- "actual": "6692",
- "isCorrect": true,
- "inputTokens": 4784,
- "outputTokens": 4,
- "latencyMs": 2487.122250000015
- },
- {
- "questionId": "q107",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "6692",
- "actual": "6692",
- "isCorrect": true,
- "inputTokens": 1563,
- "outputTokens": 136,
- "latencyMs": 2142.1067079999484
- },
- {
- "questionId": "q107",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "6692",
- "actual": "6692",
- "isCorrect": true,
- "inputTokens": 1509,
- "outputTokens": 6,
- "latencyMs": 1108.5955839999951
- },
- {
- "questionId": "q107",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "6692",
- "actual": "6692",
- "isCorrect": true,
- "inputTokens": 2271,
- "outputTokens": 4,
- "latencyMs": 2469.1304579999996
- },
- {
- "questionId": "q107",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "6692",
- "actual": "6692",
- "isCorrect": true,
- "inputTokens": 1441,
- "outputTokens": 136,
- "latencyMs": 2567.9449590001022
- },
- {
- "questionId": "q107",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "6692",
- "actual": "6692",
- "isCorrect": true,
- "inputTokens": 1445,
- "outputTokens": 6,
- "latencyMs": 1078.092707999982
- },
- {
- "questionId": "q107",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "6692",
- "actual": "6692",
- "isCorrect": true,
- "inputTokens": 2208,
- "outputTokens": 4,
- "latencyMs": 1809.784708000021
- },
- {
- "questionId": "q107",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "6692",
- "actual": "6692",
- "isCorrect": true,
- "inputTokens": 4423,
- "outputTokens": 200,
- "latencyMs": 2525.847415999975
- },
- {
- "questionId": "q107",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "6692",
- "actual": "6692",
- "isCorrect": true,
- "inputTokens": 4787,
- "outputTokens": 6,
- "latencyMs": 1085.6306249999907
- },
- {
- "questionId": "q107",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "6692",
- "actual": "6692",
- "isCorrect": true,
- "inputTokens": 5431,
- "outputTokens": 4,
- "latencyMs": 2901.1133329999866
- },
- {
- "questionId": "q107",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "6692",
- "actual": "6692",
- "isCorrect": true,
- "inputTokens": 2985,
- "outputTokens": 200,
- "latencyMs": 3336.295124999946
- },
- {
- "questionId": "q107",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "6692",
- "actual": "6692",
- "isCorrect": true,
- "inputTokens": 3110,
- "outputTokens": 6,
- "latencyMs": 1092.8172920000507
- },
- {
- "questionId": "q107",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "6692",
- "actual": "6692",
- "isCorrect": true,
- "inputTokens": 3814,
- "outputTokens": 4,
- "latencyMs": 1070.4765419999603
- },
- {
- "questionId": "q108",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "9302.76",
- "actual": "9302.76",
- "isCorrect": true,
- "inputTokens": 3711,
- "outputTokens": 74,
- "latencyMs": 4454.346332999994
- },
- {
- "questionId": "q108",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "9302.76",
- "actual": "9302.76",
- "isCorrect": true,
- "inputTokens": 4079,
- "outputTokens": 8,
- "latencyMs": 1455.8378749999683
- },
- {
- "questionId": "q108",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "9302.76",
- "actual": "9302.76",
- "isCorrect": true,
- "inputTokens": 4783,
- "outputTokens": 7,
- "latencyMs": 1775.3881249999395
- },
- {
- "questionId": "q108",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "9302.76",
- "actual": "9302.76",
- "isCorrect": true,
- "inputTokens": 1562,
- "outputTokens": 74,
- "latencyMs": 3750.9490000000224
- },
- {
- "questionId": "q108",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "9302.76",
- "actual": "9302.76",
- "isCorrect": true,
- "inputTokens": 1508,
- "outputTokens": 8,
- "latencyMs": 1294.0682909999741
- },
- {
- "questionId": "q108",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "9302.76",
- "actual": "9302.76",
- "isCorrect": true,
- "inputTokens": 2270,
- "outputTokens": 7,
- "latencyMs": 2086.9909169999883
- },
- {
- "questionId": "q108",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "9302.76",
- "actual": "9302.76",
- "isCorrect": true,
- "inputTokens": 1440,
- "outputTokens": 138,
- "latencyMs": 2283.21883300005
- },
- {
- "questionId": "q108",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "9302.76",
- "actual": "9302.76",
- "isCorrect": true,
- "inputTokens": 1444,
- "outputTokens": 8,
- "latencyMs": 983.0039999999572
- },
- {
- "questionId": "q108",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "9302.76",
- "actual": "9302.76",
- "isCorrect": true,
- "inputTokens": 2207,
- "outputTokens": 7,
- "latencyMs": 2159.7753329999978
- },
- {
- "questionId": "q108",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "9302.76",
- "actual": "9302.76",
- "isCorrect": true,
- "inputTokens": 4422,
- "outputTokens": 202,
- "latencyMs": 6951.322584000067
- },
- {
- "questionId": "q108",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "9302.76",
- "actual": "9302.76",
- "isCorrect": true,
- "inputTokens": 4786,
- "outputTokens": 8,
- "latencyMs": 1090.7049170000246
- },
- {
- "questionId": "q108",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "9302.76",
- "actual": "9302.76",
- "isCorrect": true,
- "inputTokens": 5430,
- "outputTokens": 7,
- "latencyMs": 1449.565457999939
- },
- {
- "questionId": "q108",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "9302.76",
- "actual": "9302.76",
- "isCorrect": true,
- "inputTokens": 2984,
- "outputTokens": 138,
- "latencyMs": 3853.0687920000637
- },
- {
- "questionId": "q108",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "9302.76",
- "actual": "9302.76",
- "isCorrect": true,
- "inputTokens": 3109,
- "outputTokens": 8,
- "latencyMs": 1126.2435420000693
- },
- {
- "questionId": "q108",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "9302.76",
- "actual": "9302.76",
- "isCorrect": true,
- "inputTokens": 3813,
- "outputTokens": 7,
- "latencyMs": 1764.1200830000453
- },
- {
- "questionId": "q109",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "3285",
- "actual": "3285",
- "isCorrect": true,
- "inputTokens": 3712,
- "outputTokens": 136,
- "latencyMs": 3300.9657910000533
- },
- {
- "questionId": "q109",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "3285",
- "actual": "3285",
- "isCorrect": true,
- "inputTokens": 4080,
- "outputTokens": 6,
- "latencyMs": 1052.1962920000078
- },
- {
- "questionId": "q109",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "3285",
- "actual": "3285",
- "isCorrect": true,
- "inputTokens": 4784,
- "outputTokens": 4,
- "latencyMs": 3287.65862500004
- },
- {
- "questionId": "q109",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "3285",
- "actual": "3285",
- "isCorrect": true,
- "inputTokens": 1563,
- "outputTokens": 200,
- "latencyMs": 3891.706874999916
- },
- {
- "questionId": "q109",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "3285",
- "actual": "3285",
- "isCorrect": true,
- "inputTokens": 1509,
- "outputTokens": 6,
- "latencyMs": 1081.2852920000441
- },
- {
- "questionId": "q109",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "3285",
- "actual": "3285",
- "isCorrect": true,
- "inputTokens": 2271,
- "outputTokens": 4,
- "latencyMs": 2226.4307500000577
- },
- {
- "questionId": "q109",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "3285",
- "actual": "3285",
- "isCorrect": true,
- "inputTokens": 1441,
- "outputTokens": 72,
- "latencyMs": 1982.5622910000384
- },
- {
- "questionId": "q109",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "3285",
- "actual": "3285",
- "isCorrect": true,
- "inputTokens": 1445,
- "outputTokens": 6,
- "latencyMs": 929.4726250000531
- },
- {
- "questionId": "q109",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "3285",
- "actual": "3285",
- "isCorrect": true,
- "inputTokens": 2208,
- "outputTokens": 4,
- "latencyMs": 1787.2903330000117
- },
- {
- "questionId": "q109",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "3285",
- "actual": "3285",
- "isCorrect": true,
- "inputTokens": 4423,
- "outputTokens": 264,
- "latencyMs": 3257.529749999987
- },
- {
- "questionId": "q109",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "3285",
- "actual": "3285",
- "isCorrect": true,
- "inputTokens": 4787,
- "outputTokens": 6,
- "latencyMs": 1576.1779170000227
- },
- {
- "questionId": "q109",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "3285",
- "actual": "3285",
- "isCorrect": true,
- "inputTokens": 5431,
- "outputTokens": 4,
- "latencyMs": 2836.7503750000615
- },
- {
- "questionId": "q109",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "3285",
- "actual": "3285",
- "isCorrect": true,
- "inputTokens": 2985,
- "outputTokens": 136,
- "latencyMs": 4072.856582999928
- },
- {
- "questionId": "q109",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "3285",
- "actual": "3285",
- "isCorrect": true,
- "inputTokens": 3110,
- "outputTokens": 6,
- "latencyMs": 974.9362500000279
- },
- {
- "questionId": "q109",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "3285",
- "actual": "3285",
- "isCorrect": true,
- "inputTokens": 3814,
- "outputTokens": 4,
- "latencyMs": 1213.922833000077
- },
- {
- "questionId": "q110",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "3826.93",
- "actual": "3826.93",
- "isCorrect": true,
- "inputTokens": 3711,
- "outputTokens": 138,
- "latencyMs": 3493.7957090000855
- },
- {
- "questionId": "q110",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "3826.93",
- "actual": "3826.93",
- "isCorrect": true,
- "inputTokens": 4079,
- "outputTokens": 8,
- "latencyMs": 1142.0260000000708
- },
- {
- "questionId": "q110",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "3826.93",
- "actual": "3826.93",
- "isCorrect": true,
- "inputTokens": 4783,
- "outputTokens": 7,
- "latencyMs": 2381.430916000041
- },
- {
- "questionId": "q110",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "3826.93",
- "actual": "3826.93",
- "isCorrect": true,
- "inputTokens": 1562,
- "outputTokens": 138,
- "latencyMs": 2413.9573330000276
- },
- {
- "questionId": "q110",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "3826.93",
- "actual": "3826.93",
- "isCorrect": true,
- "inputTokens": 1508,
- "outputTokens": 8,
- "latencyMs": 1847.1221249999944
- },
- {
- "questionId": "q110",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "3826.93",
- "actual": "3826.93",
- "isCorrect": true,
- "inputTokens": 2270,
- "outputTokens": 7,
- "latencyMs": 2303.37033299997
- },
- {
- "questionId": "q110",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "3826.93",
- "actual": "3826.93",
- "isCorrect": true,
- "inputTokens": 1440,
- "outputTokens": 138,
- "latencyMs": 2214.3459579999326
- },
- {
- "questionId": "q110",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "3826.93",
- "actual": "3826.93",
- "isCorrect": true,
- "inputTokens": 1444,
- "outputTokens": 8,
- "latencyMs": 1087.8486249999842
- },
- {
- "questionId": "q110",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "3826.93",
- "actual": "3826.93",
- "isCorrect": true,
- "inputTokens": 2207,
- "outputTokens": 7,
- "latencyMs": 1525.997917000088
- },
- {
- "questionId": "q110",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "3826.93",
- "actual": "3826.93",
- "isCorrect": true,
- "inputTokens": 4422,
- "outputTokens": 202,
- "latencyMs": 2952.5206250000047
- },
- {
- "questionId": "q110",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "3826.93",
- "actual": "3826.93",
- "isCorrect": true,
- "inputTokens": 4786,
- "outputTokens": 8,
- "latencyMs": 1203.7597079999978
- },
- {
- "questionId": "q110",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "3826.93",
- "actual": "3826.93",
- "isCorrect": true,
- "inputTokens": 5430,
- "outputTokens": 7,
- "latencyMs": 1580.2738329999847
- },
- {
- "questionId": "q110",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "3826.93",
- "actual": "3826.93",
- "isCorrect": true,
- "inputTokens": 2984,
- "outputTokens": 138,
- "latencyMs": 2473.919208999956
- },
- {
- "questionId": "q110",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "3826.93",
- "actual": "3826.93",
- "isCorrect": true,
- "inputTokens": 3109,
- "outputTokens": 8,
- "latencyMs": 1452.058374999906
- },
- {
- "questionId": "q110",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "3826.93",
- "actual": "3826.93",
- "isCorrect": true,
- "inputTokens": 3813,
- "outputTokens": 7,
- "latencyMs": 2691.815042000031
- },
- {
- "questionId": "q111",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "6191",
- "actual": "6191",
- "isCorrect": true,
- "inputTokens": 3712,
- "outputTokens": 136,
- "latencyMs": 2043.9027500000084
- },
- {
- "questionId": "q111",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "6191",
- "actual": "6191",
- "isCorrect": true,
- "inputTokens": 4080,
- "outputTokens": 6,
- "latencyMs": 1085.5088339999784
- },
- {
- "questionId": "q111",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "6191",
- "actual": "6191",
- "isCorrect": true,
- "inputTokens": 4784,
- "outputTokens": 4,
- "latencyMs": 1648.2013329999754
- },
- {
- "questionId": "q111",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "6191",
- "actual": "6191",
- "isCorrect": true,
- "inputTokens": 1563,
- "outputTokens": 136,
- "latencyMs": 3078.3677920000628
- },
- {
- "questionId": "q111",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "6191",
- "actual": "6191",
- "isCorrect": true,
- "inputTokens": 1509,
- "outputTokens": 6,
- "latencyMs": 953.482166999951
- },
- {
- "questionId": "q111",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "6191",
- "actual": "6191",
- "isCorrect": true,
- "inputTokens": 2271,
- "outputTokens": 4,
- "latencyMs": 2107.5470000000205
- },
- {
- "questionId": "q111",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "6191",
- "actual": "6191",
- "isCorrect": true,
- "inputTokens": 1441,
- "outputTokens": 72,
- "latencyMs": 2056.58216599992
- },
- {
- "questionId": "q111",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "6191",
- "actual": "6191",
- "isCorrect": true,
- "inputTokens": 1445,
- "outputTokens": 6,
- "latencyMs": 1345.5024170000106
- },
- {
- "questionId": "q111",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "6191",
- "actual": "6191",
- "isCorrect": true,
- "inputTokens": 2208,
- "outputTokens": 4,
- "latencyMs": 1387.981958999997
- },
- {
- "questionId": "q111",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "6191",
- "actual": "6191",
- "isCorrect": true,
- "inputTokens": 4423,
- "outputTokens": 136,
- "latencyMs": 3227.920458999928
- },
- {
- "questionId": "q111",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "6191",
- "actual": "6191",
- "isCorrect": true,
- "inputTokens": 4787,
- "outputTokens": 6,
- "latencyMs": 1789.7077919999138
- },
- {
- "questionId": "q111",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "6191",
- "actual": "6191",
- "isCorrect": true,
- "inputTokens": 5431,
- "outputTokens": 4,
- "latencyMs": 3015.3227080000797
- },
- {
- "questionId": "q111",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "6191",
- "actual": "6191",
- "isCorrect": true,
- "inputTokens": 2985,
- "outputTokens": 200,
- "latencyMs": 2481.5284170000814
- },
- {
- "questionId": "q111",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "6191",
- "actual": "6191",
- "isCorrect": true,
- "inputTokens": 3110,
- "outputTokens": 6,
- "latencyMs": 2319.2710829999996
- },
- {
- "questionId": "q111",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "6191",
- "actual": "6191",
- "isCorrect": true,
- "inputTokens": 3814,
- "outputTokens": 4,
- "latencyMs": 1736.7912920000963
- },
- {
- "questionId": "q112",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "1854.66",
- "actual": "1854.66",
- "isCorrect": true,
- "inputTokens": 3711,
- "outputTokens": 138,
- "latencyMs": 2613.5518750000047
- },
- {
- "questionId": "q112",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "1854.66",
- "actual": "1854.66",
- "isCorrect": true,
- "inputTokens": 4079,
- "outputTokens": 8,
- "latencyMs": 1411.1959170000628
- },
- {
- "questionId": "q112",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "1854.66",
- "actual": "1854.66",
- "isCorrect": true,
- "inputTokens": 4783,
- "outputTokens": 7,
- "latencyMs": 2631.1534589999355
- },
- {
- "questionId": "q112",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "1854.66",
- "actual": "1854.66",
- "isCorrect": true,
- "inputTokens": 1562,
- "outputTokens": 74,
- "latencyMs": 2247.1309170000022
- },
- {
- "questionId": "q112",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "1854.66",
- "actual": "1854.66",
- "isCorrect": true,
- "inputTokens": 1508,
- "outputTokens": 8,
- "latencyMs": 935.4031660000328
- },
- {
- "questionId": "q112",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "1854.66",
- "actual": "1854.66",
- "isCorrect": true,
- "inputTokens": 2270,
- "outputTokens": 7,
- "latencyMs": 3261.111125000054
- },
- {
- "questionId": "q112",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "1854.66",
- "actual": "1854.66",
- "isCorrect": true,
- "inputTokens": 1440,
- "outputTokens": 74,
- "latencyMs": 2420.4490409999853
- },
- {
- "questionId": "q112",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "1854.66",
- "actual": "1854.66",
- "isCorrect": true,
- "inputTokens": 1444,
- "outputTokens": 8,
- "latencyMs": 1112.1383340000175
- },
- {
- "questionId": "q112",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "1854.66",
- "actual": "1854.66",
- "isCorrect": true,
- "inputTokens": 2207,
- "outputTokens": 7,
- "latencyMs": 2340.017957999953
- },
- {
- "questionId": "q112",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "1854.66",
- "actual": "1854.66",
- "isCorrect": true,
- "inputTokens": 4422,
- "outputTokens": 394,
- "latencyMs": 17092.246334000025
- },
- {
- "questionId": "q112",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "1854.66",
- "actual": "1854.66",
- "isCorrect": true,
- "inputTokens": 4786,
- "outputTokens": 8,
- "latencyMs": 1153.1710829999065
- },
- {
- "questionId": "q112",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "1854.66",
- "actual": "1854.66",
- "isCorrect": true,
- "inputTokens": 5430,
- "outputTokens": 7,
- "latencyMs": 1490.9894589999458
- },
- {
- "questionId": "q112",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "1854.66",
- "actual": "1854.66",
- "isCorrect": true,
- "inputTokens": 2984,
- "outputTokens": 202,
- "latencyMs": 3339.092583000078
- },
- {
- "questionId": "q112",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "1854.66",
- "actual": "1854.66",
- "isCorrect": true,
- "inputTokens": 3109,
- "outputTokens": 8,
- "latencyMs": 1555.5642919999082
- },
- {
- "questionId": "q112",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "1854.66",
- "actual": "1854.66",
- "isCorrect": true,
- "inputTokens": 3813,
- "outputTokens": 7,
- "latencyMs": 2120.2490830000024
- },
- {
- "questionId": "q113",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "4696",
- "actual": "4696",
- "isCorrect": true,
- "inputTokens": 3712,
- "outputTokens": 200,
- "latencyMs": 3111.5985420000507
- },
- {
- "questionId": "q113",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "4696",
- "actual": "4696",
- "isCorrect": true,
- "inputTokens": 4080,
- "outputTokens": 6,
- "latencyMs": 968.7054999999236
- },
- {
- "questionId": "q113",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "4696",
- "actual": "4696",
- "isCorrect": true,
- "inputTokens": 4784,
- "outputTokens": 4,
- "latencyMs": 3022.979249999975
- },
- {
- "questionId": "q113",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "4696",
- "actual": "4696",
- "isCorrect": true,
- "inputTokens": 1563,
- "outputTokens": 136,
- "latencyMs": 3835.2764579999493
- },
- {
- "questionId": "q113",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "4696",
- "actual": "4696",
- "isCorrect": true,
- "inputTokens": 1509,
- "outputTokens": 6,
- "latencyMs": 1366.261957999901
- },
- {
- "questionId": "q113",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "4696",
- "actual": "4696",
- "isCorrect": true,
- "inputTokens": 2271,
- "outputTokens": 4,
- "latencyMs": 1964.8687499999069
- },
- {
- "questionId": "q113",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "4696",
- "actual": "4696",
- "isCorrect": true,
- "inputTokens": 1441,
- "outputTokens": 264,
- "latencyMs": 3045.071499999962
- },
- {
- "questionId": "q113",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "4696",
- "actual": "4696",
- "isCorrect": true,
- "inputTokens": 1445,
- "outputTokens": 6,
- "latencyMs": 804.4215829999885
- },
- {
- "questionId": "q113",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "4696",
- "actual": "4696",
- "isCorrect": true,
- "inputTokens": 2208,
- "outputTokens": 4,
- "latencyMs": 1822.1931249999907
- },
- {
- "questionId": "q113",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "4696",
- "actual": "4696",
- "isCorrect": true,
- "inputTokens": 4423,
- "outputTokens": 136,
- "latencyMs": 2214.7718329998897
- },
- {
- "questionId": "q113",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "4696",
- "actual": "4696",
- "isCorrect": true,
- "inputTokens": 4787,
- "outputTokens": 6,
- "latencyMs": 1151.622665999923
- },
- {
- "questionId": "q113",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "4696",
- "actual": "4696",
- "isCorrect": true,
- "inputTokens": 5431,
- "outputTokens": 4,
- "latencyMs": 1762.1509579999838
- },
- {
- "questionId": "q113",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "4696",
- "actual": "4696",
- "isCorrect": true,
- "inputTokens": 2985,
- "outputTokens": 200,
- "latencyMs": 2739.4318329999223
- },
- {
- "questionId": "q113",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "4696",
- "actual": "4696",
- "isCorrect": true,
- "inputTokens": 3110,
- "outputTokens": 6,
- "latencyMs": 1074.2716670000227
- },
- {
- "questionId": "q113",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "4696",
- "actual": "4696",
- "isCorrect": true,
- "inputTokens": 3814,
- "outputTokens": 4,
- "latencyMs": 1362.9514999999665
- },
- {
- "questionId": "q114",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "4211.6",
- "actual": "4211.6",
- "isCorrect": true,
- "inputTokens": 3711,
- "outputTokens": 138,
- "latencyMs": 2877.9115410000086
- },
- {
- "questionId": "q114",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "4211.6",
- "actual": "4211.6",
- "isCorrect": true,
- "inputTokens": 4079,
- "outputTokens": 8,
- "latencyMs": 1239.7438750000438
- },
- {
- "questionId": "q114",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "4211.6",
- "actual": "4211.6",
- "isCorrect": true,
- "inputTokens": 4783,
- "outputTokens": 6,
- "latencyMs": 1514.1683330000378
- },
- {
- "questionId": "q114",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "4211.6",
- "actual": "4211.6",
- "isCorrect": true,
- "inputTokens": 1562,
- "outputTokens": 202,
- "latencyMs": 2804.6751670000376
- },
- {
- "questionId": "q114",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "4211.6",
- "actual": "4211.6",
- "isCorrect": true,
- "inputTokens": 1508,
- "outputTokens": 8,
- "latencyMs": 979.8223330000183
- },
- {
- "questionId": "q114",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "4211.6",
- "actual": "4211.6",
- "isCorrect": true,
- "inputTokens": 2270,
- "outputTokens": 6,
- "latencyMs": 2323.508334000013
- },
- {
- "questionId": "q114",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "4211.6",
- "actual": "4211.6",
- "isCorrect": true,
- "inputTokens": 1440,
- "outputTokens": 74,
- "latencyMs": 1690.5704579999438
- },
- {
- "questionId": "q114",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "4211.6",
- "actual": "4211.6",
- "isCorrect": true,
- "inputTokens": 1444,
- "outputTokens": 8,
- "latencyMs": 886.4768329999642
- },
- {
- "questionId": "q114",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "4211.6",
- "actual": "4211.6",
- "isCorrect": true,
- "inputTokens": 2207,
- "outputTokens": 6,
- "latencyMs": 1805.5540000000037
- },
- {
- "questionId": "q114",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "4211.6",
- "actual": "4211.6",
- "isCorrect": true,
- "inputTokens": 4422,
- "outputTokens": 266,
- "latencyMs": 4743.464458000031
- },
- {
- "questionId": "q114",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "4211.6",
- "actual": "4211.6",
- "isCorrect": true,
- "inputTokens": 4786,
- "outputTokens": 8,
- "latencyMs": 1165.764332999941
- },
- {
- "questionId": "q114",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "4211.6",
- "actual": "4211.6",
- "isCorrect": true,
- "inputTokens": 5430,
- "outputTokens": 6,
- "latencyMs": 2148.3432500000345
- },
- {
- "questionId": "q114",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "4211.6",
- "actual": "4211.6",
- "isCorrect": true,
- "inputTokens": 2984,
- "outputTokens": 138,
- "latencyMs": 2704.757041999954
- },
- {
- "questionId": "q114",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "4211.6",
- "actual": "4211.6",
- "isCorrect": true,
- "inputTokens": 3109,
- "outputTokens": 8,
- "latencyMs": 1058.6455829999177
- },
- {
- "questionId": "q114",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "4211.6",
- "actual": "4211.6",
- "isCorrect": true,
- "inputTokens": 3813,
- "outputTokens": 6,
- "latencyMs": 2256.7089169999817
- },
- {
- "questionId": "q115",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "6196",
- "actual": "6196",
- "isCorrect": true,
- "inputTokens": 3712,
- "outputTokens": 136,
- "latencyMs": 2360.8099159999983
- },
- {
- "questionId": "q115",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "6196",
- "actual": "6196",
- "isCorrect": true,
- "inputTokens": 4080,
- "outputTokens": 6,
- "latencyMs": 1535.8384579999838
- },
- {
- "questionId": "q115",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "6196",
- "actual": "6196",
- "isCorrect": true,
- "inputTokens": 4784,
- "outputTokens": 4,
- "latencyMs": 3278.595083000022
- },
- {
- "questionId": "q115",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "6196",
- "actual": "6196",
- "isCorrect": true,
- "inputTokens": 1563,
- "outputTokens": 328,
- "latencyMs": 7969.119124999968
- },
- {
- "questionId": "q115",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "6196",
- "actual": "6196",
- "isCorrect": true,
- "inputTokens": 1509,
- "outputTokens": 6,
- "latencyMs": 1099.6044580000453
- },
- {
- "questionId": "q115",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "6196",
- "actual": "6196",
- "isCorrect": true,
- "inputTokens": 2271,
- "outputTokens": 4,
- "latencyMs": 2112.666833000025
- },
- {
- "questionId": "q115",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "6196",
- "actual": "6196",
- "isCorrect": true,
- "inputTokens": 1441,
- "outputTokens": 72,
- "latencyMs": 1636.6678329999559
- },
- {
- "questionId": "q115",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "6196",
- "actual": "6196",
- "isCorrect": true,
- "inputTokens": 1445,
- "outputTokens": 6,
- "latencyMs": 902.907957999967
- },
- {
- "questionId": "q115",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "6196",
- "actual": "6196",
- "isCorrect": true,
- "inputTokens": 2208,
- "outputTokens": 4,
- "latencyMs": 1787.2734170000767
- },
- {
- "questionId": "q115",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "6196",
- "actual": "6196",
- "isCorrect": true,
- "inputTokens": 4423,
- "outputTokens": 264,
- "latencyMs": 3207.286208000034
- },
- {
- "questionId": "q115",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "6196",
- "actual": "6196",
- "isCorrect": true,
- "inputTokens": 4787,
- "outputTokens": 6,
- "latencyMs": 1176.4805000000633
- },
- {
- "questionId": "q115",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "6196",
- "actual": "6196",
- "isCorrect": true,
- "inputTokens": 5431,
- "outputTokens": 4,
- "latencyMs": 3314.0558330001077
- },
- {
- "questionId": "q115",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "6196",
- "actual": "6196",
- "isCorrect": true,
- "inputTokens": 2985,
- "outputTokens": 200,
- "latencyMs": 5537.94308300002
- },
- {
- "questionId": "q115",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "6196",
- "actual": "6196",
- "isCorrect": true,
- "inputTokens": 3110,
- "outputTokens": 6,
- "latencyMs": 914.5840419998858
- },
- {
- "questionId": "q115",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "6196",
- "actual": "6196",
- "isCorrect": true,
- "inputTokens": 3814,
- "outputTokens": 4,
- "latencyMs": 1747.4003750000848
- },
- {
- "questionId": "q116",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "6105.3",
- "actual": "6105.3",
- "isCorrect": true,
- "inputTokens": 3711,
- "outputTokens": 202,
- "latencyMs": 5452.725000000093
- },
- {
- "questionId": "q116",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "6105.3",
- "actual": "6105.30",
- "isCorrect": true,
- "inputTokens": 4079,
- "outputTokens": 8,
- "latencyMs": 1257.8495419999817
- },
- {
- "questionId": "q116",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "6105.3",
- "actual": "6105.3",
- "isCorrect": true,
- "inputTokens": 4783,
- "outputTokens": 6,
- "latencyMs": 1183.2777500000084
- },
- {
- "questionId": "q116",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "6105.3",
- "actual": "6105.3",
- "isCorrect": true,
- "inputTokens": 1562,
- "outputTokens": 330,
- "latencyMs": 7140.693124999991
- },
- {
- "questionId": "q116",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "6105.3",
- "actual": "6105.3",
- "isCorrect": true,
- "inputTokens": 1508,
- "outputTokens": 8,
- "latencyMs": 1131.5447919999715
- },
- {
- "questionId": "q116",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "6105.3",
- "actual": "6105.3",
- "isCorrect": true,
- "inputTokens": 2270,
- "outputTokens": 6,
- "latencyMs": 2556.5294579999754
- },
- {
- "questionId": "q116",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "6105.3",
- "actual": "6105.3",
- "isCorrect": true,
- "inputTokens": 1440,
- "outputTokens": 266,
- "latencyMs": 3158.0195420000236
- },
- {
- "questionId": "q116",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "6105.3",
- "actual": "6105.3",
- "isCorrect": true,
- "inputTokens": 1444,
- "outputTokens": 8,
- "latencyMs": 926.703375000041
- },
- {
- "questionId": "q116",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "6105.3",
- "actual": "6105.3",
- "isCorrect": true,
- "inputTokens": 2207,
- "outputTokens": 6,
- "latencyMs": 2144.0341659999685
- },
- {
- "questionId": "q116",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "6105.3",
- "actual": "6105.3",
- "isCorrect": true,
- "inputTokens": 4422,
- "outputTokens": 202,
- "latencyMs": 3109.7603749999544
- },
- {
- "questionId": "q116",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "6105.3",
- "actual": "6105.30",
- "isCorrect": true,
- "inputTokens": 4786,
- "outputTokens": 8,
- "latencyMs": 1212.1927079999587
- },
- {
- "questionId": "q116",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "6105.3",
- "actual": "6105.3",
- "isCorrect": true,
- "inputTokens": 5430,
- "outputTokens": 6,
- "latencyMs": 3449.487916999962
- },
- {
- "questionId": "q116",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "6105.3",
- "actual": "6105.3",
- "isCorrect": true,
- "inputTokens": 2984,
- "outputTokens": 138,
- "latencyMs": 2570.9303749999963
- },
- {
- "questionId": "q116",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "6105.3",
- "actual": "6105.3",
- "isCorrect": true,
- "inputTokens": 3109,
- "outputTokens": 8,
- "latencyMs": 1058.9517500000075
- },
- {
- "questionId": "q116",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "6105.3",
- "actual": "6105.3",
- "isCorrect": true,
- "inputTokens": 3813,
- "outputTokens": 6,
- "latencyMs": 1379.4884169999277
- },
- {
- "questionId": "q117",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "6528",
- "actual": "6528",
- "isCorrect": true,
- "inputTokens": 3712,
- "outputTokens": 200,
- "latencyMs": 2630.738624999998
- },
- {
- "questionId": "q117",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "6528",
- "actual": "6528",
- "isCorrect": true,
- "inputTokens": 4080,
- "outputTokens": 6,
- "latencyMs": 884.325959000038
- },
- {
- "questionId": "q117",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "6528",
- "actual": "6528",
- "isCorrect": true,
- "inputTokens": 4784,
- "outputTokens": 4,
- "latencyMs": 2599.299457999994
- },
- {
- "questionId": "q117",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "6528",
- "actual": "6528",
- "isCorrect": true,
- "inputTokens": 1563,
- "outputTokens": 200,
- "latencyMs": 5174.115041999961
- },
- {
- "questionId": "q117",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "6528",
- "actual": "6528",
- "isCorrect": true,
- "inputTokens": 1509,
- "outputTokens": 6,
- "latencyMs": 1230.3996659999248
- },
- {
- "questionId": "q117",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "6528",
- "actual": "6528",
- "isCorrect": true,
- "inputTokens": 2271,
- "outputTokens": 4,
- "latencyMs": 2081.4514590000035
- },
- {
- "questionId": "q117",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "6528",
- "actual": "6528",
- "isCorrect": true,
- "inputTokens": 1441,
- "outputTokens": 456,
- "latencyMs": 4708.666958000045
- },
- {
- "questionId": "q117",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "6528",
- "actual": "6528",
- "isCorrect": true,
- "inputTokens": 1445,
- "outputTokens": 6,
- "latencyMs": 1065.470417000004
- },
- {
- "questionId": "q117",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "6528",
- "actual": "6528",
- "isCorrect": true,
- "inputTokens": 2208,
- "outputTokens": 4,
- "latencyMs": 1987.3131250001024
- },
- {
- "questionId": "q117",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "6528",
- "actual": "6528",
- "isCorrect": true,
- "inputTokens": 4423,
- "outputTokens": 200,
- "latencyMs": 3420.324041999993
- },
- {
- "questionId": "q117",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "6528",
- "actual": "6528",
- "isCorrect": true,
- "inputTokens": 4787,
- "outputTokens": 6,
- "latencyMs": 897.2685829999391
- },
- {
- "questionId": "q117",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "6528",
- "actual": "6528",
- "isCorrect": true,
- "inputTokens": 5431,
- "outputTokens": 4,
- "latencyMs": 1442.7957500000484
- },
- {
- "questionId": "q117",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "6528",
- "actual": "6528",
- "isCorrect": true,
- "inputTokens": 2985,
- "outputTokens": 264,
- "latencyMs": 3038.6226250000764
- },
- {
- "questionId": "q117",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "6528",
- "actual": "6528",
- "isCorrect": true,
- "inputTokens": 3110,
- "outputTokens": 6,
- "latencyMs": 1260.5887920000823
- },
- {
- "questionId": "q117",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "6528",
- "actual": "6528",
- "isCorrect": true,
- "inputTokens": 3814,
- "outputTokens": 4,
- "latencyMs": 1877.516042000032
- },
- {
- "questionId": "q118",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "1136.09",
- "actual": "1136.09",
- "isCorrect": true,
- "inputTokens": 3711,
- "outputTokens": 266,
- "latencyMs": 40974.3431249999
- },
- {
- "questionId": "q118",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "1136.09",
- "actual": "1136.09",
- "isCorrect": true,
- "inputTokens": 4079,
- "outputTokens": 8,
- "latencyMs": 867.1927500000456
- },
- {
- "questionId": "q118",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "1136.09",
- "actual": "1136.09",
- "isCorrect": true,
- "inputTokens": 4783,
- "outputTokens": 7,
- "latencyMs": 3284.4902500000317
- },
- {
- "questionId": "q118",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "1136.09",
- "actual": "1136.09",
- "isCorrect": true,
- "inputTokens": 1562,
- "outputTokens": 586,
- "latencyMs": 5396.599999999977
- },
- {
- "questionId": "q118",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "1136.09",
- "actual": "1136.09",
- "isCorrect": true,
- "inputTokens": 1508,
- "outputTokens": 8,
- "latencyMs": 1174.796290999977
- },
- {
- "questionId": "q118",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "1136.09",
- "actual": "1136.09",
- "isCorrect": true,
- "inputTokens": 2270,
- "outputTokens": 7,
- "latencyMs": 2751.699709000066
- },
- {
- "questionId": "q118",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "1136.09",
- "actual": "1136.09",
- "isCorrect": true,
- "inputTokens": 1440,
- "outputTokens": 138,
- "latencyMs": 3463.471459000022
- },
- {
- "questionId": "q118",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "1136.09",
- "actual": "1136.09",
- "isCorrect": true,
- "inputTokens": 1444,
- "outputTokens": 8,
- "latencyMs": 925.253083000076
- },
- {
- "questionId": "q118",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "1136.09",
- "actual": "1136.09",
- "isCorrect": true,
- "inputTokens": 2207,
- "outputTokens": 7,
- "latencyMs": 3240.4625000000233
- },
- {
- "questionId": "q118",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "1136.09",
- "actual": "1136.09",
- "isCorrect": true,
- "inputTokens": 4422,
- "outputTokens": 138,
- "latencyMs": 7405.421083000023
- },
- {
- "questionId": "q118",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "1136.09",
- "actual": "1136.09",
- "isCorrect": true,
- "inputTokens": 4786,
- "outputTokens": 8,
- "latencyMs": 1061.0794160000514
- },
- {
- "questionId": "q118",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "1136.09",
- "actual": "1136.09",
- "isCorrect": true,
- "inputTokens": 5430,
- "outputTokens": 7,
- "latencyMs": 1512.5596659999574
- },
- {
- "questionId": "q118",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "1136.09",
- "actual": "1136.09",
- "isCorrect": true,
- "inputTokens": 2984,
- "outputTokens": 138,
- "latencyMs": 2445.1606250000186
- },
- {
- "questionId": "q118",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "1136.09",
- "actual": "1136.09",
- "isCorrect": true,
- "inputTokens": 3109,
- "outputTokens": 8,
- "latencyMs": 1296.5266660000198
- },
- {
- "questionId": "q118",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "1136.09",
- "actual": "1136.09",
- "isCorrect": true,
- "inputTokens": 3813,
- "outputTokens": 7,
- "latencyMs": 1523.473083000048
- },
- {
- "questionId": "q119",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "4689",
- "actual": "4689",
- "isCorrect": true,
- "inputTokens": 3712,
- "outputTokens": 392,
- "latencyMs": 4885.794165999978
- },
- {
- "questionId": "q119",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "4689",
- "actual": "4689",
- "isCorrect": true,
- "inputTokens": 4080,
- "outputTokens": 6,
- "latencyMs": 958.9109579999931
- },
- {
- "questionId": "q119",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "4689",
- "actual": "4689",
- "isCorrect": true,
- "inputTokens": 4784,
- "outputTokens": 4,
- "latencyMs": 2268.0900839999085
- },
- {
- "questionId": "q119",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "4689",
- "actual": "4689",
- "isCorrect": true,
- "inputTokens": 1563,
- "outputTokens": 648,
- "latencyMs": 12410.339000000036
- },
- {
- "questionId": "q119",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "4689",
- "actual": "4689",
- "isCorrect": true,
- "inputTokens": 1509,
- "outputTokens": 6,
- "latencyMs": 1124.1954169999808
- },
- {
- "questionId": "q119",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "4689",
- "actual": "4689",
- "isCorrect": true,
- "inputTokens": 2271,
- "outputTokens": 4,
- "latencyMs": 1842.937042000005
- },
- {
- "questionId": "q119",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "4689",
- "actual": "4689",
- "isCorrect": true,
- "inputTokens": 1441,
- "outputTokens": 200,
- "latencyMs": 14746.862250000006
- },
- {
- "questionId": "q119",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "4689",
- "actual": "4689",
- "isCorrect": true,
- "inputTokens": 1445,
- "outputTokens": 6,
- "latencyMs": 1070.885459000012
- },
- {
- "questionId": "q119",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "4689",
- "actual": "4689",
- "isCorrect": true,
- "inputTokens": 2208,
- "outputTokens": 4,
- "latencyMs": 2808.225791999954
- },
- {
- "questionId": "q119",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "4689",
- "actual": "4689",
- "isCorrect": true,
- "inputTokens": 4423,
- "outputTokens": 264,
- "latencyMs": 2815.092042000033
- },
- {
- "questionId": "q119",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "4689",
- "actual": "4689",
- "isCorrect": true,
- "inputTokens": 4787,
- "outputTokens": 6,
- "latencyMs": 1285.6015419999603
- },
- {
- "questionId": "q119",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "4689",
- "actual": "4689",
- "isCorrect": true,
- "inputTokens": 5431,
- "outputTokens": 4,
- "latencyMs": 1620.0065000000177
- },
- {
- "questionId": "q119",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "4689",
- "actual": "4689",
- "isCorrect": true,
- "inputTokens": 2985,
- "outputTokens": 136,
- "latencyMs": 3353.4782089999644
- },
- {
- "questionId": "q119",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "4689",
- "actual": "4689",
- "isCorrect": true,
- "inputTokens": 3110,
- "outputTokens": 6,
- "latencyMs": 1281.6234170000535
- },
- {
- "questionId": "q119",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "4689",
- "actual": "4689",
- "isCorrect": true,
- "inputTokens": 3814,
- "outputTokens": 4,
- "latencyMs": 1903.9000839999644
- },
- {
- "questionId": "q120",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "2637.73",
- "actual": "2637.73",
- "isCorrect": true,
- "inputTokens": 3711,
- "outputTokens": 330,
- "latencyMs": 3469.9373749999795
- },
- {
- "questionId": "q120",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "2637.73",
- "actual": "2637.73",
- "isCorrect": true,
- "inputTokens": 4079,
- "outputTokens": 8,
- "latencyMs": 1129.299417000031
- },
- {
- "questionId": "q120",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "2637.73",
- "actual": "2637.73",
- "isCorrect": true,
- "inputTokens": 4783,
- "outputTokens": 7,
- "latencyMs": 1843.423833000008
- },
- {
- "questionId": "q120",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "2637.73",
- "actual": "2637.73",
- "isCorrect": true,
- "inputTokens": 1562,
- "outputTokens": 74,
- "latencyMs": 3029.9955000000773
- },
- {
- "questionId": "q120",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "2637.73",
- "actual": "2637.73",
- "isCorrect": true,
- "inputTokens": 1508,
- "outputTokens": 8,
- "latencyMs": 976.265458000009
- },
- {
- "questionId": "q120",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "2637.73",
- "actual": "2637.73",
- "isCorrect": true,
- "inputTokens": 2270,
- "outputTokens": 7,
- "latencyMs": 1941.5176659999415
- },
- {
- "questionId": "q120",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "2637.73",
- "actual": "2637.73",
- "isCorrect": true,
- "inputTokens": 1440,
- "outputTokens": 138,
- "latencyMs": 2326.60387500003
- },
- {
- "questionId": "q120",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "2637.73",
- "actual": "2637.73",
- "isCorrect": true,
- "inputTokens": 1444,
- "outputTokens": 8,
- "latencyMs": 1340.7505420000525
- },
- {
- "questionId": "q120",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "2637.73",
- "actual": "2637.73",
- "isCorrect": true,
- "inputTokens": 2207,
- "outputTokens": 7,
- "latencyMs": 3061.3734159999294
- },
- {
- "questionId": "q120",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "2637.73",
- "actual": "2637.73",
- "isCorrect": true,
- "inputTokens": 4422,
- "outputTokens": 330,
- "latencyMs": 18444.37216700008
- },
- {
- "questionId": "q120",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "2637.73",
- "actual": "2637.73",
- "isCorrect": true,
- "inputTokens": 4786,
- "outputTokens": 8,
- "latencyMs": 1472.8980000000447
- },
- {
- "questionId": "q120",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "2637.73",
- "actual": "2637.73",
- "isCorrect": true,
- "inputTokens": 5430,
- "outputTokens": 7,
- "latencyMs": 1203.1091250000754
- },
- {
- "questionId": "q120",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "2637.73",
- "actual": "2637.73",
- "isCorrect": true,
- "inputTokens": 2984,
- "outputTokens": 266,
- "latencyMs": 6852.723041999969
- },
- {
- "questionId": "q120",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "2637.73",
- "actual": "2637.73",
- "isCorrect": true,
- "inputTokens": 3109,
- "outputTokens": 8,
- "latencyMs": 1186.3190000000177
- },
- {
- "questionId": "q120",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "2637.73",
- "actual": "2637.73",
- "isCorrect": true,
- "inputTokens": 3813,
- "outputTokens": 7,
- "latencyMs": 2720.8557080000173
- },
- {
- "questionId": "q121",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "5685",
- "actual": "5685",
- "isCorrect": true,
- "inputTokens": 3712,
- "outputTokens": 200,
- "latencyMs": 9941.250375000061
- },
- {
- "questionId": "q121",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "5685",
- "actual": "5685",
- "isCorrect": true,
- "inputTokens": 4080,
- "outputTokens": 6,
- "latencyMs": 1254.0278750000289
- },
- {
- "questionId": "q121",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "5685",
- "actual": "5685",
- "isCorrect": true,
- "inputTokens": 4784,
- "outputTokens": 4,
- "latencyMs": 3998.6611660000635
- },
- {
- "questionId": "q121",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "5685",
- "actual": "5685",
- "isCorrect": true,
- "inputTokens": 1563,
- "outputTokens": 72,
- "latencyMs": 2154.672750000027
- },
- {
- "questionId": "q121",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "5685",
- "actual": "5685",
- "isCorrect": true,
- "inputTokens": 1509,
- "outputTokens": 6,
- "latencyMs": 1019.1613750000251
- },
- {
- "questionId": "q121",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "5685",
- "actual": "5685",
- "isCorrect": true,
- "inputTokens": 2271,
- "outputTokens": 4,
- "latencyMs": 1623.1509579999838
- },
- {
- "questionId": "q121",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "5685",
- "actual": "5685",
- "isCorrect": true,
- "inputTokens": 1441,
- "outputTokens": 200,
- "latencyMs": 5643.6689169999445
- },
- {
- "questionId": "q121",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "5685",
- "actual": "5685",
- "isCorrect": true,
- "inputTokens": 1445,
- "outputTokens": 6,
- "latencyMs": 908.8649170000572
- },
- {
- "questionId": "q121",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "5685",
- "actual": "5685",
- "isCorrect": true,
- "inputTokens": 2208,
- "outputTokens": 4,
- "latencyMs": 1939.4002079999773
- },
- {
- "questionId": "q121",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "5685",
- "actual": "7409",
- "isCorrect": false,
- "inputTokens": 4423,
- "outputTokens": 392,
- "latencyMs": 18020.185499999905
- },
- {
- "questionId": "q121",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "5685",
- "actual": "5685",
- "isCorrect": true,
- "inputTokens": 4787,
- "outputTokens": 6,
- "latencyMs": 1167.9574999999022
- },
- {
- "questionId": "q121",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "5685",
- "actual": "5685",
- "isCorrect": true,
- "inputTokens": 5431,
- "outputTokens": 4,
- "latencyMs": 2516.0782500000205
- },
- {
- "questionId": "q121",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "5685",
- "actual": "5685",
- "isCorrect": true,
- "inputTokens": 2985,
- "outputTokens": 136,
- "latencyMs": 3538.66266599996
- },
- {
- "questionId": "q121",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "5685",
- "actual": "5685",
- "isCorrect": true,
- "inputTokens": 3110,
- "outputTokens": 6,
- "latencyMs": 1074.641707999981
- },
- {
- "questionId": "q121",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "5685",
- "actual": "5685",
- "isCorrect": true,
- "inputTokens": 3814,
- "outputTokens": 4,
- "latencyMs": 1611.2575829999987
- },
- {
- "questionId": "q122",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "3421.06",
- "actual": "3421.06",
- "isCorrect": true,
- "inputTokens": 3711,
- "outputTokens": 202,
- "latencyMs": 3097.4197080000304
- },
- {
- "questionId": "q122",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "3421.06",
- "actual": "3421.06",
- "isCorrect": true,
- "inputTokens": 4079,
- "outputTokens": 8,
- "latencyMs": 1068.923999999999
- },
- {
- "questionId": "q122",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "3421.06",
- "actual": "3421.06",
- "isCorrect": true,
- "inputTokens": 4783,
- "outputTokens": 7,
- "latencyMs": 1952.0416250000708
- },
- {
- "questionId": "q122",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "3421.06",
- "actual": "3421.06",
- "isCorrect": true,
- "inputTokens": 1562,
- "outputTokens": 906,
- "latencyMs": 11804.22670800006
- },
- {
- "questionId": "q122",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "3421.06",
- "actual": "3421.06",
- "isCorrect": true,
- "inputTokens": 1508,
- "outputTokens": 8,
- "latencyMs": 1140.642707999912
- },
- {
- "questionId": "q122",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "3421.06",
- "actual": "3421.06",
- "isCorrect": true,
- "inputTokens": 2270,
- "outputTokens": 7,
- "latencyMs": 3323.8447500000475
- },
- {
- "questionId": "q122",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "3421.06",
- "actual": "3421.06",
- "isCorrect": true,
- "inputTokens": 1440,
- "outputTokens": 202,
- "latencyMs": 5759.3412499999395
- },
- {
- "questionId": "q122",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "3421.06",
- "actual": "3421.06",
- "isCorrect": true,
- "inputTokens": 1444,
- "outputTokens": 8,
- "latencyMs": 1174.6347079999978
- },
- {
- "questionId": "q122",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "3421.06",
- "actual": "3421.06",
- "isCorrect": true,
- "inputTokens": 2207,
- "outputTokens": 7,
- "latencyMs": 1816.737458000076
- },
- {
- "questionId": "q122",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "3421.06",
- "actual": "3421.06",
- "isCorrect": true,
- "inputTokens": 4422,
- "outputTokens": 138,
- "latencyMs": 14154.70395799994
- },
- {
- "questionId": "q122",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "3421.06",
- "actual": "3421.06",
- "isCorrect": true,
- "inputTokens": 4786,
- "outputTokens": 8,
- "latencyMs": 1000.3886250000214
- },
- {
- "questionId": "q122",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "3421.06",
- "actual": "3421.06",
- "isCorrect": true,
- "inputTokens": 5430,
- "outputTokens": 7,
- "latencyMs": 1258.68512499996
- },
- {
- "questionId": "q122",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "3421.06",
- "actual": "3421.06",
- "isCorrect": true,
- "inputTokens": 2984,
- "outputTokens": 202,
- "latencyMs": 2957.2190829999745
- },
- {
- "questionId": "q122",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "3421.06",
- "actual": "3421.06",
- "isCorrect": true,
- "inputTokens": 3109,
- "outputTokens": 8,
- "latencyMs": 1128.0480420000385
- },
- {
- "questionId": "q122",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "3421.06",
- "actual": "3421.06",
- "isCorrect": true,
- "inputTokens": 3813,
- "outputTokens": 7,
- "latencyMs": 1714.4717499999097
- },
- {
- "questionId": "q123",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "344498",
- "actual": "344498",
- "isCorrect": true,
- "inputTokens": 3709,
- "outputTokens": 2632,
- "latencyMs": 31555.039709000033
- },
- {
- "questionId": "q123",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "344498",
- "actual": "188,945",
- "isCorrect": false,
- "inputTokens": 4077,
- "outputTokens": 7,
- "latencyMs": 1094.905458000023
- },
- {
- "questionId": "q123",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "344498",
- "actual": "340900",
- "isCorrect": false,
- "inputTokens": 4777,
- "outputTokens": 6,
- "latencyMs": 11993.166834000032
- },
- {
- "questionId": "q123",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "344498",
- "actual": "344498",
- "isCorrect": true,
- "inputTokens": 1560,
- "outputTokens": 4360,
- "latencyMs": 47190.18545800005
- },
- {
- "questionId": "q123",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "344498",
- "actual": "337,045",
- "isCorrect": false,
- "inputTokens": 1506,
- "outputTokens": 7,
- "latencyMs": 1098.8443330000155
- },
- {
- "questionId": "q123",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "344498",
- "actual": "344900",
- "isCorrect": false,
- "inputTokens": 2264,
- "outputTokens": 6,
- "latencyMs": 5982.8935409999685
- },
- {
- "questionId": "q123",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "344498",
- "actual": "344498",
- "isCorrect": true,
- "inputTokens": 1438,
- "outputTokens": 3080,
- "latencyMs": 27390.594666999998
- },
- {
- "questionId": "q123",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "344498",
- "actual": "372,915",
- "isCorrect": false,
- "inputTokens": 1442,
- "outputTokens": 7,
- "latencyMs": 1168.8217080000322
- },
- {
- "questionId": "q123",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "344498",
- "actual": "349900",
- "isCorrect": false,
- "inputTokens": 2201,
- "outputTokens": 6,
- "latencyMs": 5658.501500000013
- },
- {
- "questionId": "q123",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "344498",
- "actual": "344498",
- "isCorrect": true,
- "inputTokens": 4420,
- "outputTokens": 3592,
- "latencyMs": 25827.663583000074
- },
- {
- "questionId": "q123",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "344498",
- "actual": "372,089",
- "isCorrect": false,
- "inputTokens": 4784,
- "outputTokens": 7,
- "latencyMs": 1297.9579999999842
- },
- {
- "questionId": "q123",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "344498",
- "actual": "340900",
- "isCorrect": false,
- "inputTokens": 5424,
- "outputTokens": 6,
- "latencyMs": 7942.432666000095
- },
- {
- "questionId": "q123",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "344498",
- "actual": "344498",
- "isCorrect": true,
- "inputTokens": 2982,
- "outputTokens": 3144,
- "latencyMs": 26846.991665999987
- },
- {
- "questionId": "q123",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "344498",
- "actual": "181,854",
- "isCorrect": false,
- "inputTokens": 3107,
- "outputTokens": 7,
- "latencyMs": 1012.253665999975
- },
- {
- "questionId": "q123",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "344498",
- "actual": "300900",
- "isCorrect": false,
- "inputTokens": 3807,
- "outputTokens": 6,
- "latencyMs": 1351.5872090000194
- },
- {
- "questionId": "q124",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "312818.50",
- "actual": "312818.50",
- "isCorrect": true,
- "inputTokens": 3707,
- "outputTokens": 4746,
- "latencyMs": 38656.80637499993
- },
- {
- "questionId": "q124",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "312818.50",
- "actual": "287,745.89",
- "isCorrect": false,
- "inputTokens": 4075,
- "outputTokens": 9,
- "latencyMs": 1336.5668340000557
- },
- {
- "questionId": "q124",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "312818.50",
- "actual": "300000.00",
- "isCorrect": false,
- "inputTokens": 4775,
- "outputTokens": 9,
- "latencyMs": 45570.00233399996
- },
- {
- "questionId": "q124",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "312818.50",
- "actual": "312818.50",
- "isCorrect": true,
- "inputTokens": 1558,
- "outputTokens": 3594,
- "latencyMs": 36589.136415999965
- },
- {
- "questionId": "q124",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "312818.50",
- "actual": "487,891.45",
- "isCorrect": false,
- "inputTokens": 1504,
- "outputTokens": 9,
- "latencyMs": 1009.5284579999279
- },
- {
- "questionId": "q124",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "312818.50",
- "actual": "320000.00",
- "isCorrect": false,
- "inputTokens": 2262,
- "outputTokens": 9,
- "latencyMs": 11883.04608400003
- },
- {
- "questionId": "q124",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "312818.50",
- "actual": "312818.50",
- "isCorrect": true,
- "inputTokens": 1436,
- "outputTokens": 3402,
- "latencyMs": 209516.903208
- },
- {
- "questionId": "q124",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "312818.50",
- "actual": "487,891.89",
- "isCorrect": false,
- "inputTokens": 1440,
- "outputTokens": 9,
- "latencyMs": 1453.1753339999123
- },
- {
- "questionId": "q124",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "312818.50",
- "actual": "329999.99",
- "isCorrect": false,
- "inputTokens": 2199,
- "outputTokens": 9,
- "latencyMs": 12329.097540999996
- },
- {
- "questionId": "q124",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "312818.50",
- "actual": "312818.50",
- "isCorrect": true,
- "inputTokens": 4418,
- "outputTokens": 3274,
- "latencyMs": 32337.936125000007
- },
- {
- "questionId": "q124",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "312818.50",
- "actual": "381,847.89",
- "isCorrect": false,
- "inputTokens": 4782,
- "outputTokens": 9,
- "latencyMs": 990.2755830000388
- },
- {
- "questionId": "q124",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "312818.50",
- "actual": "300000.00",
- "isCorrect": false,
- "inputTokens": 5422,
- "outputTokens": 9,
- "latencyMs": 12093.661916999961
- },
- {
- "questionId": "q124",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "312818.50",
- "actual": "312818.50",
- "isCorrect": true,
- "inputTokens": 2980,
- "outputTokens": 6730,
- "latencyMs": 45238.25570800004
- },
- {
- "questionId": "q124",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "312818.50",
- "actual": "381,847.89",
- "isCorrect": false,
- "inputTokens": 3105,
- "outputTokens": 9,
- "latencyMs": 1242.9971659999574
- },
- {
- "questionId": "q124",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "312818.50",
- "actual": "369000.00",
- "isCorrect": false,
- "inputTokens": 3805,
- "outputTokens": 9,
- "latencyMs": 1604.1214169999585
- },
- {
- "questionId": "q125",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "1811",
- "actual": "1811",
- "isCorrect": true,
- "inputTokens": 3709,
- "outputTokens": 2184,
- "latencyMs": 22585.809791999985
- },
- {
- "questionId": "q125",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "1811",
- "actual": "1,234",
- "isCorrect": false,
- "inputTokens": 4078,
- "outputTokens": 7,
- "latencyMs": 1230.1040829999838
- },
- {
- "questionId": "q125",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "1811",
- "actual": "1811",
- "isCorrect": true,
- "inputTokens": 4777,
- "outputTokens": 4,
- "latencyMs": 9357.454415999935
- },
- {
- "questionId": "q125",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "1811",
- "actual": "1811",
- "isCorrect": true,
- "inputTokens": 1560,
- "outputTokens": 2888,
- "latencyMs": 19966.08491700003
- },
- {
- "questionId": "q125",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "1811",
- "actual": "1,945",
- "isCorrect": false,
- "inputTokens": 1507,
- "outputTokens": 7,
- "latencyMs": 961.2437919999938
- },
- {
- "questionId": "q125",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "1811",
- "actual": "1811",
- "isCorrect": true,
- "inputTokens": 2264,
- "outputTokens": 4,
- "latencyMs": 9139.956667000079
- },
- {
- "questionId": "q125",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "1811",
- "actual": "1811",
- "isCorrect": true,
- "inputTokens": 1438,
- "outputTokens": 2504,
- "latencyMs": 21066.86054100003
- },
- {
- "questionId": "q125",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "1811",
- "actual": "1,945",
- "isCorrect": false,
- "inputTokens": 1443,
- "outputTokens": 7,
- "latencyMs": 902.673208000022
- },
- {
- "questionId": "q125",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "1811",
- "actual": "1811",
- "isCorrect": true,
- "inputTokens": 2201,
- "outputTokens": 4,
- "latencyMs": 7727.039290999994
- },
- {
- "questionId": "q125",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "1811",
- "actual": "1811",
- "isCorrect": true,
- "inputTokens": 4420,
- "outputTokens": 1864,
- "latencyMs": 15644.210124999983
- },
- {
- "questionId": "q125",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "1811",
- "actual": "1,532",
- "isCorrect": false,
- "inputTokens": 4785,
- "outputTokens": 7,
- "latencyMs": 1311.9297919999808
- },
- {
- "questionId": "q125",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "1811",
- "actual": "1811",
- "isCorrect": true,
- "inputTokens": 5424,
- "outputTokens": 4,
- "latencyMs": 11031.984583999962
- },
- {
- "questionId": "q125",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "1811",
- "actual": "1811",
- "isCorrect": true,
- "inputTokens": 2982,
- "outputTokens": 1928,
- "latencyMs": 26268.215167000075
- },
- {
- "questionId": "q125",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "1811",
- "actual": "1,454",
- "isCorrect": false,
- "inputTokens": 3108,
- "outputTokens": 7,
- "latencyMs": 1283.3860000000568
- },
- {
- "questionId": "q125",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "1811",
- "actual": "1560",
- "isCorrect": false,
- "inputTokens": 3807,
- "outputTokens": 4,
- "latencyMs": 1390.9544999999925
- },
- {
- "questionId": "q126",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "42",
- "actual": "42",
- "isCorrect": true,
- "inputTokens": 3709,
- "outputTokens": 1671,
- "latencyMs": 18722.413541999995
- },
- {
- "questionId": "q126",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "42",
- "actual": "42",
- "isCorrect": true,
- "inputTokens": 4078,
- "outputTokens": 5,
- "latencyMs": 957.5536249999423
- },
- {
- "questionId": "q126",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "42",
- "actual": "47",
- "isCorrect": false,
- "inputTokens": 4779,
- "outputTokens": 2,
- "latencyMs": 1718.3615829999326
- },
- {
- "questionId": "q126",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "42",
- "actual": "42",
- "isCorrect": true,
- "inputTokens": 1560,
- "outputTokens": 2439,
- "latencyMs": 20739.166833000025
- },
- {
- "questionId": "q126",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "42",
- "actual": "42",
- "isCorrect": true,
- "inputTokens": 1507,
- "outputTokens": 5,
- "latencyMs": 1305.5439999999944
- },
- {
- "questionId": "q126",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "42",
- "actual": "42",
- "isCorrect": true,
- "inputTokens": 2266,
- "outputTokens": 2,
- "latencyMs": 13351.089582999935
- },
- {
- "questionId": "q126",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "42",
- "actual": "42",
- "isCorrect": true,
- "inputTokens": 1438,
- "outputTokens": 2567,
- "latencyMs": 23067.457167000044
- },
- {
- "questionId": "q126",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "42",
- "actual": "42",
- "isCorrect": true,
- "inputTokens": 1443,
- "outputTokens": 5,
- "latencyMs": 1073.1606669999892
- },
- {
- "questionId": "q126",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "42",
- "actual": "42",
- "isCorrect": true,
- "inputTokens": 2203,
- "outputTokens": 2,
- "latencyMs": 22770.808125000098
- },
- {
- "questionId": "q126",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "42",
- "actual": "42",
- "isCorrect": true,
- "inputTokens": 4420,
- "outputTokens": 2439,
- "latencyMs": 28125.872208000044
- },
- {
- "questionId": "q126",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "42",
- "actual": "54",
- "isCorrect": false,
- "inputTokens": 4785,
- "outputTokens": 5,
- "latencyMs": 1046.3992919999873
- },
- {
- "questionId": "q126",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "42",
- "actual": "42",
- "isCorrect": true,
- "inputTokens": 5426,
- "outputTokens": 2,
- "latencyMs": 12982.094000000041
- },
- {
- "questionId": "q126",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "42",
- "actual": "42",
- "isCorrect": true,
- "inputTokens": 2982,
- "outputTokens": 2631,
- "latencyMs": 31181.451875000028
- },
- {
- "questionId": "q126",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "42",
- "actual": "47",
- "isCorrect": false,
- "inputTokens": 3108,
- "outputTokens": 5,
- "latencyMs": 1418.826708000037
- },
- {
- "questionId": "q126",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "42",
- "actual": "49",
- "isCorrect": false,
- "inputTokens": 3809,
- "outputTokens": 2,
- "latencyMs": 2009.2083750000456
- },
- {
- "questionId": "q127",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "28",
- "actual": "28",
- "isCorrect": true,
- "inputTokens": 3709,
- "outputTokens": 2503,
- "latencyMs": 26827.34341699991
- },
- {
- "questionId": "q127",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "28",
- "actual": "24",
- "isCorrect": false,
- "inputTokens": 4078,
- "outputTokens": 5,
- "latencyMs": 1093.9559999998892
- },
- {
- "questionId": "q127",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "28",
- "actual": "28",
- "isCorrect": true,
- "inputTokens": 4779,
- "outputTokens": 2,
- "latencyMs": 18861.496042000013
- },
- {
- "questionId": "q127",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "28",
- "actual": "28",
- "isCorrect": true,
- "inputTokens": 1560,
- "outputTokens": 1799,
- "latencyMs": 18378.229374999995
- },
- {
- "questionId": "q127",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "28",
- "actual": "26",
- "isCorrect": false,
- "inputTokens": 1507,
- "outputTokens": 5,
- "latencyMs": 1111.1742920000106
- },
- {
- "questionId": "q127",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "28",
- "actual": "28",
- "isCorrect": true,
- "inputTokens": 2266,
- "outputTokens": 2,
- "latencyMs": 12380.956957999966
- },
- {
- "questionId": "q127",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "28",
- "actual": "28",
- "isCorrect": true,
- "inputTokens": 1438,
- "outputTokens": 2055,
- "latencyMs": 112325.29683300003
- },
- {
- "questionId": "q127",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "28",
- "actual": "23",
- "isCorrect": false,
- "inputTokens": 1443,
- "outputTokens": 5,
- "latencyMs": 1231.2409169999883
- },
- {
- "questionId": "q127",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "28",
- "actual": "28",
- "isCorrect": true,
- "inputTokens": 2203,
- "outputTokens": 2,
- "latencyMs": 20394.07720900001
- },
- {
- "questionId": "q127",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "28",
- "actual": "28",
- "isCorrect": true,
- "inputTokens": 4420,
- "outputTokens": 1799,
- "latencyMs": 22818.38325000007
- },
- {
- "questionId": "q127",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "28",
- "actual": "24",
- "isCorrect": false,
- "inputTokens": 4785,
- "outputTokens": 5,
- "latencyMs": 1324.3675420000218
- },
- {
- "questionId": "q127",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "28",
- "actual": "28",
- "isCorrect": true,
- "inputTokens": 5426,
- "outputTokens": 2,
- "latencyMs": 14308.32895799994
- },
- {
- "questionId": "q127",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "28",
- "actual": "28",
- "isCorrect": true,
- "inputTokens": 2982,
- "outputTokens": 2055,
- "latencyMs": 22493.268166999915
- },
- {
- "questionId": "q127",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "28",
- "actual": "23",
- "isCorrect": false,
- "inputTokens": 3108,
- "outputTokens": 5,
- "latencyMs": 1449.5348340000492
- },
- {
- "questionId": "q127",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "28",
- "actual": "31",
- "isCorrect": false,
- "inputTokens": 3809,
- "outputTokens": 2,
- "latencyMs": 1329.5626659999834
- },
- {
- "questionId": "q128",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 3709,
- "outputTokens": 2183,
- "latencyMs": 20410.59154199995
- },
- {
- "questionId": "q128",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 4078,
- "outputTokens": 5,
- "latencyMs": 1137.8916250000475
- },
- {
- "questionId": "q128",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 4779,
- "outputTokens": 2,
- "latencyMs": 15306.355875000008
- },
- {
- "questionId": "q128",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 1560,
- "outputTokens": 967,
- "latencyMs": 9355.326041999971
- },
- {
- "questionId": "q128",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "11",
- "actual": "12",
- "isCorrect": false,
- "inputTokens": 1507,
- "outputTokens": 5,
- "latencyMs": 970.5706669999054
- },
- {
- "questionId": "q128",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 2266,
- "outputTokens": 2,
- "latencyMs": 12738.58170900005
- },
- {
- "questionId": "q128",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 1438,
- "outputTokens": 1095,
- "latencyMs": 11532.495875000022
- },
- {
- "questionId": "q128",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 1443,
- "outputTokens": 5,
- "latencyMs": 1092.326875000028
- },
- {
- "questionId": "q128",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 2203,
- "outputTokens": 2,
- "latencyMs": 9477.962708000094
- },
- {
- "questionId": "q128",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 4420,
- "outputTokens": 1287,
- "latencyMs": 12363.918167000054
- },
- {
- "questionId": "q128",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 4785,
- "outputTokens": 5,
- "latencyMs": 1086.439250000054
- },
- {
- "questionId": "q128",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 5426,
- "outputTokens": 2,
- "latencyMs": 13847.167500000098
- },
- {
- "questionId": "q128",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 2982,
- "outputTokens": 1607,
- "latencyMs": 18025.304333999986
- },
- {
- "questionId": "q128",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 3108,
- "outputTokens": 5,
- "latencyMs": 1525.7963329999475
- },
- {
- "questionId": "q128",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "11",
- "actual": "11",
- "isCorrect": true,
- "inputTokens": 3809,
- "outputTokens": 2,
- "latencyMs": 11297.281415999983
- },
- {
- "questionId": "q129",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "58",
- "actual": "58",
- "isCorrect": true,
- "inputTokens": 3708,
- "outputTokens": 1607,
- "latencyMs": 16793.02033300011
- },
- {
- "questionId": "q129",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "58",
- "actual": "50",
- "isCorrect": false,
- "inputTokens": 4078,
- "outputTokens": 5,
- "latencyMs": 1524.2867090000072
- },
- {
- "questionId": "q129",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "58",
- "actual": "58",
- "isCorrect": true,
- "inputTokens": 4777,
- "outputTokens": 2,
- "latencyMs": 20291.370166999986
- },
- {
- "questionId": "q129",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "58",
- "actual": "58",
- "isCorrect": true,
- "inputTokens": 1559,
- "outputTokens": 2631,
- "latencyMs": 31767.777667000075
- },
- {
- "questionId": "q129",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "58",
- "actual": "47",
- "isCorrect": false,
- "inputTokens": 1507,
- "outputTokens": 5,
- "latencyMs": 1128.108874999918
- },
- {
- "questionId": "q129",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "58",
- "actual": "58",
- "isCorrect": true,
- "inputTokens": 2264,
- "outputTokens": 2,
- "latencyMs": 17774.151832999894
- },
- {
- "questionId": "q129",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "58",
- "actual": "58",
- "isCorrect": true,
- "inputTokens": 1437,
- "outputTokens": 2887,
- "latencyMs": 24058.048583999975
- },
- {
- "questionId": "q129",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "58",
- "actual": "54",
- "isCorrect": false,
- "inputTokens": 1443,
- "outputTokens": 5,
- "latencyMs": 833.2049999999581
- },
- {
- "questionId": "q129",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "58",
- "actual": "58",
- "isCorrect": true,
- "inputTokens": 2201,
- "outputTokens": 2,
- "latencyMs": 7901.533541000099
- },
- {
- "questionId": "q129",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "58",
- "actual": "58",
- "isCorrect": true,
- "inputTokens": 4419,
- "outputTokens": 1415,
- "latencyMs": 13345.296500000055
- },
- {
- "questionId": "q129",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "58",
- "actual": "54",
- "isCorrect": false,
- "inputTokens": 4785,
- "outputTokens": 5,
- "latencyMs": 1001.3450419999426
- },
- {
- "questionId": "q129",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "58",
- "actual": "55",
- "isCorrect": false,
- "inputTokens": 5424,
- "outputTokens": 2,
- "latencyMs": 2326.790707999957
- },
- {
- "questionId": "q129",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "58",
- "actual": "58",
- "isCorrect": true,
- "inputTokens": 2981,
- "outputTokens": 1287,
- "latencyMs": 14444.245874999906
- },
- {
- "questionId": "q129",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "58",
- "actual": "54",
- "isCorrect": false,
- "inputTokens": 3108,
- "outputTokens": 5,
- "latencyMs": 1060.1971249999478
- },
- {
- "questionId": "q129",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "58",
- "actual": "59",
- "isCorrect": false,
- "inputTokens": 3807,
- "outputTokens": 2,
- "latencyMs": 2816.4778749999823
- },
- {
- "questionId": "q130",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "41",
- "actual": "41",
- "isCorrect": true,
- "inputTokens": 3708,
- "outputTokens": 3015,
- "latencyMs": 190630.39133400004
- },
- {
- "questionId": "q130",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "41",
- "actual": "31",
- "isCorrect": false,
- "inputTokens": 4078,
- "outputTokens": 5,
- "latencyMs": 5375.239707999979
- },
- {
- "questionId": "q130",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "41",
- "actual": "41",
- "isCorrect": true,
- "inputTokens": 4777,
- "outputTokens": 2,
- "latencyMs": 19789.381042000023
- },
- {
- "questionId": "q130",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "41",
- "actual": "41",
- "isCorrect": true,
- "inputTokens": 1559,
- "outputTokens": 2055,
- "latencyMs": 16472.23841599992
- },
- {
- "questionId": "q130",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "41",
- "actual": "38",
- "isCorrect": false,
- "inputTokens": 1507,
- "outputTokens": 5,
- "latencyMs": 1042.922583000036
- },
- {
- "questionId": "q130",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "41",
- "actual": "41",
- "isCorrect": true,
- "inputTokens": 2264,
- "outputTokens": 2,
- "latencyMs": 13095.397083000047
- },
- {
- "questionId": "q130",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "41",
- "actual": "41",
- "isCorrect": true,
- "inputTokens": 1437,
- "outputTokens": 2311,
- "latencyMs": 26893.475125000114
- },
- {
- "questionId": "q130",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "41",
- "actual": "38",
- "isCorrect": false,
- "inputTokens": 1443,
- "outputTokens": 5,
- "latencyMs": 1042.875250000041
- },
- {
- "questionId": "q130",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "41",
- "actual": "41",
- "isCorrect": true,
- "inputTokens": 2201,
- "outputTokens": 2,
- "latencyMs": 28097.87474999996
- },
- {
- "questionId": "q130",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "41",
- "actual": "42",
- "isCorrect": false,
- "inputTokens": 4419,
- "outputTokens": 1735,
- "latencyMs": 14091.963709000032
- },
- {
- "questionId": "q130",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "41",
- "actual": "31",
- "isCorrect": false,
- "inputTokens": 4785,
- "outputTokens": 5,
- "latencyMs": 1151.6397919999436
- },
- {
- "questionId": "q130",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "41",
- "actual": "41",
- "isCorrect": true,
- "inputTokens": 5424,
- "outputTokens": 2,
- "latencyMs": 15769.612874999992
- },
- {
- "questionId": "q130",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "41",
- "actual": "41",
- "isCorrect": true,
- "inputTokens": 2981,
- "outputTokens": 1799,
- "latencyMs": 18804.838290999993
- },
- {
- "questionId": "q130",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "41",
- "actual": "31",
- "isCorrect": false,
- "inputTokens": 3108,
- "outputTokens": 5,
- "latencyMs": 1030.810417000088
- },
- {
- "questionId": "q130",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "41",
- "actual": "41",
- "isCorrect": true,
- "inputTokens": 3807,
- "outputTokens": 2,
- "latencyMs": 14482.474917000043
- },
- {
- "questionId": "q131",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "23",
- "actual": "23",
- "isCorrect": true,
- "inputTokens": 3708,
- "outputTokens": 1351,
- "latencyMs": 21887.844958
- },
- {
- "questionId": "q131",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "23",
- "actual": "20",
- "isCorrect": false,
- "inputTokens": 4078,
- "outputTokens": 5,
- "latencyMs": 1332.5089160000207
- },
- {
- "questionId": "q131",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "23",
- "actual": "23",
- "isCorrect": true,
- "inputTokens": 4777,
- "outputTokens": 2,
- "latencyMs": 17226.03358399996
- },
- {
- "questionId": "q131",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "23",
- "actual": "23",
- "isCorrect": true,
- "inputTokens": 1559,
- "outputTokens": 2055,
- "latencyMs": 20772.763792000012
- },
- {
- "questionId": "q131",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "23",
- "actual": "20",
- "isCorrect": false,
- "inputTokens": 1507,
- "outputTokens": 5,
- "latencyMs": 966.6354170000413
- },
- {
- "questionId": "q131",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "23",
- "actual": "23",
- "isCorrect": true,
- "inputTokens": 2264,
- "outputTokens": 2,
- "latencyMs": 10442.985291999998
- },
- {
- "questionId": "q131",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "23",
- "actual": "23",
- "isCorrect": true,
- "inputTokens": 1437,
- "outputTokens": 1095,
- "latencyMs": 10072.030124999932
- },
- {
- "questionId": "q131",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "23",
- "actual": "20",
- "isCorrect": false,
- "inputTokens": 1443,
- "outputTokens": 5,
- "latencyMs": 1233.0955420000246
- },
- {
- "questionId": "q131",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "23",
- "actual": "23",
- "isCorrect": true,
- "inputTokens": 2201,
- "outputTokens": 2,
- "latencyMs": 18590.031917000073
- },
- {
- "questionId": "q131",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "23",
- "actual": "23",
- "isCorrect": true,
- "inputTokens": 4419,
- "outputTokens": 1735,
- "latencyMs": 17035.41470799991
- },
- {
- "questionId": "q131",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "23",
- "actual": "21",
- "isCorrect": false,
- "inputTokens": 4785,
- "outputTokens": 5,
- "latencyMs": 994.0176249999786
- },
- {
- "questionId": "q131",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "23",
- "actual": "23",
- "isCorrect": true,
- "inputTokens": 5424,
- "outputTokens": 2,
- "latencyMs": 12477.123250000062
- },
- {
- "questionId": "q131",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "23",
- "actual": "23",
- "isCorrect": true,
- "inputTokens": 2981,
- "outputTokens": 1479,
- "latencyMs": 14346.053416999988
- },
- {
- "questionId": "q131",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "23",
- "actual": "21",
- "isCorrect": false,
- "inputTokens": 3108,
- "outputTokens": 5,
- "latencyMs": 1269.5552920000628
- },
- {
- "questionId": "q131",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "23",
- "actual": "23",
- "isCorrect": true,
- "inputTokens": 3807,
- "outputTokens": 2,
- "latencyMs": 13739.479209000012
- },
- {
- "questionId": "q132",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "430828",
- "actual": "430828",
- "isCorrect": true,
- "inputTokens": 15187,
- "outputTokens": 136,
- "latencyMs": 3680.113916000002
- },
- {
- "questionId": "q132",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "430828",
- "actual": "430828",
- "isCorrect": true,
- "inputTokens": 17409,
- "outputTokens": 6,
- "latencyMs": 1548.528917000047
- },
- {
- "questionId": "q132",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "430828",
- "actual": "430828",
- "isCorrect": true,
- "inputTokens": 19991,
- "outputTokens": 6,
- "latencyMs": 1637.454792000004
- },
- {
- "questionId": "q132",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "430828",
- "actual": "430828",
- "isCorrect": true,
- "inputTokens": 8788,
- "outputTokens": 776,
- "latencyMs": 8918.199665999971
- },
- {
- "questionId": "q132",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "430828",
- "actual": "430828",
- "isCorrect": true,
- "inputTokens": 9279,
- "outputTokens": 6,
- "latencyMs": 1900.8446669999976
- },
- {
- "questionId": "q132",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "430828",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 12337,
- "outputTokens": 1,
- "latencyMs": 2677.7128749999683
- },
- {
- "questionId": "q132",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "430828",
- "actual": "430828",
- "isCorrect": true,
- "inputTokens": 8556,
- "outputTokens": 712,
- "latencyMs": 10733.462500000023
- },
- {
- "questionId": "q132",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "430828",
- "actual": "430828",
- "isCorrect": true,
- "inputTokens": 9125,
- "outputTokens": 6,
- "latencyMs": 1135.363000000012
- },
- {
- "questionId": "q132",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "430828",
- "actual": "430828",
- "isCorrect": true,
- "inputTokens": 12207,
- "outputTokens": 6,
- "latencyMs": 1007.8897500000894
- },
- {
- "questionId": "q132",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "430828",
- "actual": "430828",
- "isCorrect": true,
- "inputTokens": 17138,
- "outputTokens": 328,
- "latencyMs": 7708.789500000072
- },
- {
- "questionId": "q132",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "430828",
- "actual": "430828",
- "isCorrect": true,
- "inputTokens": 19804,
- "outputTokens": 6,
- "latencyMs": 1477.8527500000782
- },
- {
- "questionId": "q132",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "430828",
- "actual": "430828",
- "isCorrect": true,
- "inputTokens": 21881,
- "outputTokens": 6,
- "latencyMs": 2380.750500000082
- },
- {
- "questionId": "q132",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "430828",
- "actual": "430828",
- "isCorrect": true,
- "inputTokens": 13171,
- "outputTokens": 328,
- "latencyMs": 9429.131750000059
- },
- {
- "questionId": "q132",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "430828",
- "actual": "430828",
- "isCorrect": true,
- "inputTokens": 14483,
- "outputTokens": 6,
- "latencyMs": 1359.2385419999482
- },
- {
- "questionId": "q132",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "430828",
- "actual": "430828",
- "isCorrect": true,
- "inputTokens": 17076,
- "outputTokens": 6,
- "latencyMs": 1939.293042000034
- },
- {
- "questionId": "q133",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "11798",
- "actual": "11798",
- "isCorrect": true,
- "inputTokens": 15189,
- "outputTokens": 392,
- "latencyMs": 6479.065457999939
- },
- {
- "questionId": "q133",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "11798",
- "actual": "11798",
- "isCorrect": true,
- "inputTokens": 17410,
- "outputTokens": 6,
- "latencyMs": 1155.017041999963
- },
- {
- "questionId": "q133",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "11798",
- "actual": "11798",
- "isCorrect": true,
- "inputTokens": 19992,
- "outputTokens": 5,
- "latencyMs": 2049.621832999983
- },
- {
- "questionId": "q133",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "11798",
- "actual": "11798",
- "isCorrect": true,
- "inputTokens": 8790,
- "outputTokens": 648,
- "latencyMs": 11672.019874999998
- },
- {
- "questionId": "q133",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "11798",
- "actual": "11798",
- "isCorrect": true,
- "inputTokens": 9280,
- "outputTokens": 6,
- "latencyMs": 1597.3725000000559
- },
- {
- "questionId": "q133",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "11798",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 12338,
- "outputTokens": 1,
- "latencyMs": 11414.63520800008
- },
- {
- "questionId": "q133",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "11798",
- "actual": "11798",
- "isCorrect": true,
- "inputTokens": 8558,
- "outputTokens": 584,
- "latencyMs": 15138.947667
- },
- {
- "questionId": "q133",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "11798",
- "actual": "11798",
- "isCorrect": true,
- "inputTokens": 9126,
- "outputTokens": 6,
- "latencyMs": 1173.9259160000365
- },
- {
- "questionId": "q133",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "11798",
- "actual": "11798",
- "isCorrect": true,
- "inputTokens": 12208,
- "outputTokens": 5,
- "latencyMs": 2788.6645000000717
- },
- {
- "questionId": "q133",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "11798",
- "actual": "11798",
- "isCorrect": true,
- "inputTokens": 17140,
- "outputTokens": 328,
- "latencyMs": 4541.789875000017
- },
- {
- "questionId": "q133",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "11798",
- "actual": "11798",
- "isCorrect": true,
- "inputTokens": 19805,
- "outputTokens": 6,
- "latencyMs": 1787.0144160001073
- },
- {
- "questionId": "q133",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "11798",
- "actual": "11798",
- "isCorrect": true,
- "inputTokens": 21882,
- "outputTokens": 5,
- "latencyMs": 3930.188833000022
- },
- {
- "questionId": "q133",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "11798",
- "actual": "11798",
- "isCorrect": true,
- "inputTokens": 13173,
- "outputTokens": 264,
- "latencyMs": 4459.655541999964
- },
- {
- "questionId": "q133",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "11798",
- "actual": "11798",
- "isCorrect": true,
- "inputTokens": 14484,
- "outputTokens": 6,
- "latencyMs": 1239.003000000026
- },
- {
- "questionId": "q133",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "11798",
- "actual": "11798",
- "isCorrect": true,
- "inputTokens": 17077,
- "outputTokens": 5,
- "latencyMs": 4828.425707999966
- },
- {
- "questionId": "q134",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "183631",
- "actual": "183631",
- "isCorrect": true,
- "inputTokens": 15192,
- "outputTokens": 200,
- "latencyMs": 4039.568958000047
- },
- {
- "questionId": "q134",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "183631",
- "actual": "183631",
- "isCorrect": true,
- "inputTokens": 17412,
- "outputTokens": 6,
- "latencyMs": 1455.9585000000661
- },
- {
- "questionId": "q134",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "183631",
- "actual": "183631",
- "isCorrect": true,
- "inputTokens": 19995,
- "outputTokens": 6,
- "latencyMs": 1600.7708750000456
- },
- {
- "questionId": "q134",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "183631",
- "actual": "183631",
- "isCorrect": true,
- "inputTokens": 8793,
- "outputTokens": 456,
- "latencyMs": 5973.896042000037
- },
- {
- "questionId": "q134",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "183631",
- "actual": "183631",
- "isCorrect": true,
- "inputTokens": 9282,
- "outputTokens": 6,
- "latencyMs": 2000.6470419999678
- },
- {
- "questionId": "q134",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "183631",
- "actual": "183631",
- "isCorrect": true,
- "inputTokens": 12341,
- "outputTokens": 6,
- "latencyMs": 2543.431542000035
- },
- {
- "questionId": "q134",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "183631",
- "actual": "183631",
- "isCorrect": true,
- "inputTokens": 8561,
- "outputTokens": 648,
- "latencyMs": 6973.037040999974
- },
- {
- "questionId": "q134",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "183631",
- "actual": "183631",
- "isCorrect": true,
- "inputTokens": 9128,
- "outputTokens": 6,
- "latencyMs": 1655.6718330000294
- },
- {
- "questionId": "q134",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "183631",
- "actual": "183631",
- "isCorrect": true,
- "inputTokens": 12211,
- "outputTokens": 6,
- "latencyMs": 2357.3444590000436
- },
- {
- "questionId": "q134",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "183631",
- "actual": "183631",
- "isCorrect": true,
- "inputTokens": 17143,
- "outputTokens": 392,
- "latencyMs": 6136.790167000028
- },
- {
- "questionId": "q134",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "183631",
- "actual": "183631",
- "isCorrect": true,
- "inputTokens": 19807,
- "outputTokens": 6,
- "latencyMs": 2510.24762499996
- },
- {
- "questionId": "q134",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "183631",
- "actual": "183631",
- "isCorrect": true,
- "inputTokens": 21885,
- "outputTokens": 6,
- "latencyMs": 1737.0276670000749
- },
- {
- "questionId": "q134",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "183631",
- "actual": "183631",
- "isCorrect": true,
- "inputTokens": 13176,
- "outputTokens": 520,
- "latencyMs": 5081.17487499991
- },
- {
- "questionId": "q134",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "183631",
- "actual": "183631",
- "isCorrect": true,
- "inputTokens": 14486,
- "outputTokens": 6,
- "latencyMs": 1191.4632079999428
- },
- {
- "questionId": "q134",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "183631",
- "actual": "183631",
- "isCorrect": true,
- "inputTokens": 17080,
- "outputTokens": 6,
- "latencyMs": 1325.217249999987
- },
- {
- "questionId": "q135",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "29246",
- "actual": "29246",
- "isCorrect": true,
- "inputTokens": 15191,
- "outputTokens": 328,
- "latencyMs": 3314.1483749999898
- },
- {
- "questionId": "q135",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "29246",
- "actual": "29246",
- "isCorrect": true,
- "inputTokens": 17412,
- "outputTokens": 6,
- "latencyMs": 1204.2171249999665
- },
- {
- "questionId": "q135",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "29246",
- "actual": "29246",
- "isCorrect": true,
- "inputTokens": 19994,
- "outputTokens": 5,
- "latencyMs": 2558.019417000003
- },
- {
- "questionId": "q135",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "29246",
- "actual": "29246",
- "isCorrect": true,
- "inputTokens": 8792,
- "outputTokens": 968,
- "latencyMs": 11319.296415999997
- },
- {
- "questionId": "q135",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "29246",
- "actual": "29246",
- "isCorrect": true,
- "inputTokens": 9282,
- "outputTokens": 6,
- "latencyMs": 1324.4548749999376
- },
- {
- "questionId": "q135",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "29246",
- "actual": "29246",
- "isCorrect": true,
- "inputTokens": 12340,
- "outputTokens": 5,
- "latencyMs": 2740.4004170000553
- },
- {
- "questionId": "q135",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "29246",
- "actual": "29246",
- "isCorrect": true,
- "inputTokens": 8560,
- "outputTokens": 392,
- "latencyMs": 7471.323291999986
- },
- {
- "questionId": "q135",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "29246",
- "actual": "29246",
- "isCorrect": true,
- "inputTokens": 9128,
- "outputTokens": 6,
- "latencyMs": 1267.6016660000896
- },
- {
- "questionId": "q135",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "29246",
- "actual": "29246",
- "isCorrect": true,
- "inputTokens": 12210,
- "outputTokens": 5,
- "latencyMs": 28672.12370799994
- },
- {
- "questionId": "q135",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "29246",
- "actual": "29246",
- "isCorrect": true,
- "inputTokens": 17142,
- "outputTokens": 392,
- "latencyMs": 12836.502833000035
- },
- {
- "questionId": "q135",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "29246",
- "actual": "29246",
- "isCorrect": true,
- "inputTokens": 19807,
- "outputTokens": 6,
- "latencyMs": 2346.9032910000533
- },
- {
- "questionId": "q135",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "29246",
- "actual": "29246",
- "isCorrect": true,
- "inputTokens": 21884,
- "outputTokens": 5,
- "latencyMs": 2969.614082999993
- },
- {
- "questionId": "q135",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "29246",
- "actual": "29246",
- "isCorrect": true,
- "inputTokens": 13175,
- "outputTokens": 392,
- "latencyMs": 5687.641541999998
- },
- {
- "questionId": "q135",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "29246",
- "actual": "29246",
- "isCorrect": true,
- "inputTokens": 14486,
- "outputTokens": 6,
- "latencyMs": 1316.798792000045
- },
- {
- "questionId": "q135",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "29246",
- "actual": "29246",
- "isCorrect": true,
- "inputTokens": 17079,
- "outputTokens": 5,
- "latencyMs": 2823.280541000073
- },
- {
- "questionId": "q136",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "135306",
- "actual": "135306",
- "isCorrect": true,
- "inputTokens": 15187,
- "outputTokens": 392,
- "latencyMs": 5053.899791999953
- },
- {
- "questionId": "q136",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "135306",
- "actual": "135306",
- "isCorrect": true,
- "inputTokens": 17407,
- "outputTokens": 6,
- "latencyMs": 2537.008167000022
- },
- {
- "questionId": "q136",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "135306",
- "actual": "135306",
- "isCorrect": true,
- "inputTokens": 19991,
- "outputTokens": 6,
- "latencyMs": 1954.4713340000017
- },
- {
- "questionId": "q136",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "135306",
- "actual": "135306",
- "isCorrect": true,
- "inputTokens": 8788,
- "outputTokens": 3208,
- "latencyMs": 26572.223459
- },
- {
- "questionId": "q136",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "135306",
- "actual": "135306",
- "isCorrect": true,
- "inputTokens": 9277,
- "outputTokens": 6,
- "latencyMs": 1112.2888329999987
- },
- {
- "questionId": "q136",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "135306",
- "actual": "135306",
- "isCorrect": true,
- "inputTokens": 12337,
- "outputTokens": 6,
- "latencyMs": 2422.114500000025
- },
- {
- "questionId": "q136",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "135306",
- "actual": "135306",
- "isCorrect": true,
- "inputTokens": 8556,
- "outputTokens": 1352,
- "latencyMs": 15821.266082999995
- },
- {
- "questionId": "q136",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "135306",
- "actual": "135306",
- "isCorrect": true,
- "inputTokens": 9123,
- "outputTokens": 6,
- "latencyMs": 1033.3786669999827
- },
- {
- "questionId": "q136",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "135306",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 12207,
- "outputTokens": 1,
- "latencyMs": 1657.3498749999562
- },
- {
- "questionId": "q136",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "135306",
- "actual": "135306",
- "isCorrect": true,
- "inputTokens": 17138,
- "outputTokens": 328,
- "latencyMs": 4357.477583000087
- },
- {
- "questionId": "q136",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "135306",
- "actual": "135306",
- "isCorrect": true,
- "inputTokens": 19802,
- "outputTokens": 6,
- "latencyMs": 1578.6591250000056
- },
- {
- "questionId": "q136",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "135306",
- "actual": "135306",
- "isCorrect": true,
- "inputTokens": 21881,
- "outputTokens": 6,
- "latencyMs": 16684.568500000052
- },
- {
- "questionId": "q136",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "135306",
- "actual": "135306",
- "isCorrect": true,
- "inputTokens": 13171,
- "outputTokens": 712,
- "latencyMs": 7845.738333999994
- },
- {
- "questionId": "q136",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "135306",
- "actual": "135306",
- "isCorrect": true,
- "inputTokens": 14481,
- "outputTokens": 6,
- "latencyMs": 1408.234832999995
- },
- {
- "questionId": "q136",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "135306",
- "actual": "135306",
- "isCorrect": true,
- "inputTokens": 17076,
- "outputTokens": 6,
- "latencyMs": 3420.9656670000404
- },
- {
- "questionId": "q137",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "24914",
- "actual": "not found",
- "isCorrect": false,
- "inputTokens": 15186,
- "outputTokens": 1608,
- "latencyMs": 16271.314957999974
- },
- {
- "questionId": "q137",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "24914",
- "actual": "24914",
- "isCorrect": true,
- "inputTokens": 17408,
- "outputTokens": 6,
- "latencyMs": 1741.4425829999382
- },
- {
- "questionId": "q137",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "24914",
- "actual": "24914",
- "isCorrect": true,
- "inputTokens": 19991,
- "outputTokens": 5,
- "latencyMs": 4409.774542000028
- },
- {
- "questionId": "q137",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "24914",
- "actual": "24914",
- "isCorrect": true,
- "inputTokens": 8787,
- "outputTokens": 1736,
- "latencyMs": 16616.36137499998
- },
- {
- "questionId": "q137",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "24914",
- "actual": "24914",
- "isCorrect": true,
- "inputTokens": 9278,
- "outputTokens": 6,
- "latencyMs": 1489.443333000061
- },
- {
- "questionId": "q137",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "24914",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 12337,
- "outputTokens": 1,
- "latencyMs": 2424.8680840000743
- },
- {
- "questionId": "q137",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "24914",
- "actual": "24914",
- "isCorrect": true,
- "inputTokens": 8555,
- "outputTokens": 2952,
- "latencyMs": 26078.49774999998
- },
- {
- "questionId": "q137",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "24914",
- "actual": "24914",
- "isCorrect": true,
- "inputTokens": 9124,
- "outputTokens": 6,
- "latencyMs": 1111.9479170000413
- },
- {
- "questionId": "q137",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "24914",
- "actual": "24914",
- "isCorrect": true,
- "inputTokens": 12207,
- "outputTokens": 5,
- "latencyMs": 2661.1345420000143
- },
- {
- "questionId": "q137",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "24914",
- "actual": "not found",
- "isCorrect": false,
- "inputTokens": 17137,
- "outputTokens": 3464,
- "latencyMs": 36029.06325000001
- },
- {
- "questionId": "q137",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "24914",
- "actual": "24914",
- "isCorrect": true,
- "inputTokens": 19803,
- "outputTokens": 6,
- "latencyMs": 1756.511334000039
- },
- {
- "questionId": "q137",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "24914",
- "actual": "24914",
- "isCorrect": true,
- "inputTokens": 21881,
- "outputTokens": 5,
- "latencyMs": 1706.1073340000585
- },
- {
- "questionId": "q137",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "24914",
- "actual": "24914",
- "isCorrect": true,
- "inputTokens": 13170,
- "outputTokens": 968,
- "latencyMs": 8245.267290999996
- },
- {
- "questionId": "q137",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "24914",
- "actual": "24914",
- "isCorrect": true,
- "inputTokens": 14482,
- "outputTokens": 6,
- "latencyMs": 1405.9593330000062
- },
- {
- "questionId": "q137",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "24914",
- "actual": "24914",
- "isCorrect": true,
- "inputTokens": 17076,
- "outputTokens": 5,
- "latencyMs": 2634.141583000077
- },
- {
- "questionId": "q138",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "111683",
- "actual": "111683",
- "isCorrect": true,
- "inputTokens": 15186,
- "outputTokens": 520,
- "latencyMs": 6238.670834000106
- },
- {
- "questionId": "q138",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "111683",
- "actual": "111683",
- "isCorrect": true,
- "inputTokens": 17407,
- "outputTokens": 6,
- "latencyMs": 1915.2061669999966
- },
- {
- "questionId": "q138",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "111683",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 19990,
- "outputTokens": 1,
- "latencyMs": 15802.735749999993
- },
- {
- "questionId": "q138",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "111683",
- "actual": "111683",
- "isCorrect": true,
- "inputTokens": 8787,
- "outputTokens": 840,
- "latencyMs": 9492.533834000002
- },
- {
- "questionId": "q138",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "111683",
- "actual": "111683",
- "isCorrect": true,
- "inputTokens": 9277,
- "outputTokens": 6,
- "latencyMs": 1264.6480839999858
- },
- {
- "questionId": "q138",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "111683",
- "actual": "111683",
- "isCorrect": true,
- "inputTokens": 12336,
- "outputTokens": 6,
- "latencyMs": 2581.858165999991
- },
- {
- "questionId": "q138",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "111683",
- "actual": "111683",
- "isCorrect": true,
- "inputTokens": 8555,
- "outputTokens": 1736,
- "latencyMs": 20963.487291999976
- },
- {
- "questionId": "q138",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "111683",
- "actual": "111683",
- "isCorrect": true,
- "inputTokens": 9123,
- "outputTokens": 6,
- "latencyMs": 2031.7733340000268
- },
- {
- "questionId": "q138",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "111683",
- "actual": "111683",
- "isCorrect": true,
- "inputTokens": 12206,
- "outputTokens": 6,
- "latencyMs": 2651.7060409999685
- },
- {
- "questionId": "q138",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "111683",
- "actual": "111683",
- "isCorrect": true,
- "inputTokens": 17137,
- "outputTokens": 520,
- "latencyMs": 5960.176208000048
- },
- {
- "questionId": "q138",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "111683",
- "actual": "111683",
- "isCorrect": true,
- "inputTokens": 19802,
- "outputTokens": 6,
- "latencyMs": 1636.6764170000097
- },
- {
- "questionId": "q138",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "111683",
- "actual": "111683",
- "isCorrect": true,
- "inputTokens": 21880,
- "outputTokens": 6,
- "latencyMs": 1322.0868340000743
- },
- {
- "questionId": "q138",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "111683",
- "actual": "111683",
- "isCorrect": true,
- "inputTokens": 13170,
- "outputTokens": 264,
- "latencyMs": 5836.014208000037
- },
- {
- "questionId": "q138",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "111683",
- "actual": "111683",
- "isCorrect": true,
- "inputTokens": 14481,
- "outputTokens": 6,
- "latencyMs": 1280.6878750000615
- },
- {
- "questionId": "q138",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "111683",
- "actual": "111683",
- "isCorrect": true,
- "inputTokens": 17075,
- "outputTokens": 6,
- "latencyMs": 3788.612332999939
- },
- {
- "questionId": "q139",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "13364",
- "actual": "13364",
- "isCorrect": true,
- "inputTokens": 15193,
- "outputTokens": 456,
- "latencyMs": 6374.532041999977
- },
- {
- "questionId": "q139",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "13364",
- "actual": "13364",
- "isCorrect": true,
- "inputTokens": 17412,
- "outputTokens": 6,
- "latencyMs": 1435.1170410000486
- },
- {
- "questionId": "q139",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "13364",
- "actual": "13364",
- "isCorrect": true,
- "inputTokens": 19995,
- "outputTokens": 5,
- "latencyMs": 2480.6709170000395
- },
- {
- "questionId": "q139",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "13364",
- "actual": "13364",
- "isCorrect": true,
- "inputTokens": 8794,
- "outputTokens": 904,
- "latencyMs": 10770.860708000022
- },
- {
- "questionId": "q139",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "13364",
- "actual": "13364",
- "isCorrect": true,
- "inputTokens": 9282,
- "outputTokens": 6,
- "latencyMs": 1362.2076670000097
- },
- {
- "questionId": "q139",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "13364",
- "actual": "13364",
- "isCorrect": true,
- "inputTokens": 12341,
- "outputTokens": 5,
- "latencyMs": 1725.4546669999836
- },
- {
- "questionId": "q139",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "13364",
- "actual": "13364",
- "isCorrect": true,
- "inputTokens": 8562,
- "outputTokens": 776,
- "latencyMs": 7485.538915999932
- },
- {
- "questionId": "q139",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "13364",
- "actual": "13364",
- "isCorrect": true,
- "inputTokens": 9128,
- "outputTokens": 6,
- "latencyMs": 1517.6439580000006
- },
- {
- "questionId": "q139",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "13364",
- "actual": "13364",
- "isCorrect": true,
- "inputTokens": 12211,
- "outputTokens": 5,
- "latencyMs": 3422.7879589999793
- },
- {
- "questionId": "q139",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "13364",
- "actual": "13364",
- "isCorrect": true,
- "inputTokens": 17144,
- "outputTokens": 456,
- "latencyMs": 9032.850083000027
- },
- {
- "questionId": "q139",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "13364",
- "actual": "13364",
- "isCorrect": true,
- "inputTokens": 19807,
- "outputTokens": 6,
- "latencyMs": 1400.4656250000698
- },
- {
- "questionId": "q139",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "13364",
- "actual": "13364",
- "isCorrect": true,
- "inputTokens": 21885,
- "outputTokens": 5,
- "latencyMs": 1666.045665999991
- },
- {
- "questionId": "q139",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "13364",
- "actual": "13364",
- "isCorrect": true,
- "inputTokens": 13177,
- "outputTokens": 264,
- "latencyMs": 3696.009834000026
- },
- {
- "questionId": "q139",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "13364",
- "actual": "13364",
- "isCorrect": true,
- "inputTokens": 14486,
- "outputTokens": 6,
- "latencyMs": 1177.9945420000004
- },
- {
- "questionId": "q139",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "13364",
- "actual": "13364",
- "isCorrect": true,
- "inputTokens": 17080,
- "outputTokens": 5,
- "latencyMs": 1399.2657909999834
- },
- {
- "questionId": "q140",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "98464",
- "actual": "98464",
- "isCorrect": true,
- "inputTokens": 15185,
- "outputTokens": 520,
- "latencyMs": 8902.311666999944
- },
- {
- "questionId": "q140",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "98464",
- "actual": "98464",
- "isCorrect": true,
- "inputTokens": 17405,
- "outputTokens": 6,
- "latencyMs": 1588.589624999906
- },
- {
- "questionId": "q140",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "98464",
- "actual": "98464",
- "isCorrect": true,
- "inputTokens": 19989,
- "outputTokens": 5,
- "latencyMs": 2070.6354159999173
- },
- {
- "questionId": "q140",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "98464",
- "actual": "98464",
- "isCorrect": true,
- "inputTokens": 8786,
- "outputTokens": 1736,
- "latencyMs": 19399.512374999933
- },
- {
- "questionId": "q140",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "98464",
- "actual": "98464",
- "isCorrect": true,
- "inputTokens": 9275,
- "outputTokens": 6,
- "latencyMs": 1322.7961249999935
- },
- {
- "questionId": "q140",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "98464",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 12335,
- "outputTokens": 1,
- "latencyMs": 2467.938582999981
- },
- {
- "questionId": "q140",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "98464",
- "actual": "Not found",
- "isCorrect": false,
- "inputTokens": 8554,
- "outputTokens": 4808,
- "latencyMs": 46970.624375000014
- },
- {
- "questionId": "q140",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "98464",
- "actual": "98464",
- "isCorrect": true,
- "inputTokens": 9121,
- "outputTokens": 6,
- "latencyMs": 1310.4520839999896
- },
- {
- "questionId": "q140",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "98464",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 12205,
- "outputTokens": 1,
- "latencyMs": 3555.658332999912
- },
- {
- "questionId": "q140",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "98464",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 17136,
- "outputTokens": 1735,
- "latencyMs": 16477.424583000015
- },
- {
- "questionId": "q140",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "98464",
- "actual": "98464",
- "isCorrect": true,
- "inputTokens": 19800,
- "outputTokens": 6,
- "latencyMs": 1970.4299579999642
- },
- {
- "questionId": "q140",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "98464",
- "actual": "98464",
- "isCorrect": true,
- "inputTokens": 21879,
- "outputTokens": 5,
- "latencyMs": 26671.477541
- },
- {
- "questionId": "q140",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "98464",
- "actual": "98464",
- "isCorrect": true,
- "inputTokens": 13169,
- "outputTokens": 1096,
- "latencyMs": 10919.952667000005
- },
- {
- "questionId": "q140",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "98464",
- "actual": "98464",
- "isCorrect": true,
- "inputTokens": 14479,
- "outputTokens": 6,
- "latencyMs": 1168.6287909999955
- },
- {
- "questionId": "q140",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "98464",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 17074,
- "outputTokens": 1,
- "latencyMs": 2765.029874999891
- },
- {
- "questionId": "q141",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "6378",
- "actual": "6378",
- "isCorrect": true,
- "inputTokens": 15187,
- "outputTokens": 200,
- "latencyMs": 6004.068291999982
- },
- {
- "questionId": "q141",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "6378",
- "actual": "6378",
- "isCorrect": true,
- "inputTokens": 17408,
- "outputTokens": 6,
- "latencyMs": 1499.0042079999112
- },
- {
- "questionId": "q141",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "6378",
- "actual": "6378",
- "isCorrect": true,
- "inputTokens": 19991,
- "outputTokens": 4,
- "latencyMs": 2506.4855830000015
- },
- {
- "questionId": "q141",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "6378",
- "actual": "6378",
- "isCorrect": true,
- "inputTokens": 8788,
- "outputTokens": 1032,
- "latencyMs": 16463.560791999917
- },
- {
- "questionId": "q141",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "6378",
- "actual": "6378",
- "isCorrect": true,
- "inputTokens": 9278,
- "outputTokens": 6,
- "latencyMs": 1441.4096249999711
- },
- {
- "questionId": "q141",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "6378",
- "actual": "6378",
- "isCorrect": true,
- "inputTokens": 12337,
- "outputTokens": 4,
- "latencyMs": 2663.2737919999054
- },
- {
- "questionId": "q141",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "6378",
- "actual": "6378",
- "isCorrect": true,
- "inputTokens": 8556,
- "outputTokens": 904,
- "latencyMs": 9668.898624999914
- },
- {
- "questionId": "q141",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "6378",
- "actual": "6378",
- "isCorrect": true,
- "inputTokens": 9124,
- "outputTokens": 6,
- "latencyMs": 1173.9928749999963
- },
- {
- "questionId": "q141",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "6378",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 12207,
- "outputTokens": 1,
- "latencyMs": 9857.754333000048
- },
- {
- "questionId": "q141",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "6378",
- "actual": "6378",
- "isCorrect": true,
- "inputTokens": 17138,
- "outputTokens": 392,
- "latencyMs": 9638.438333999948
- },
- {
- "questionId": "q141",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "6378",
- "actual": "6378",
- "isCorrect": true,
- "inputTokens": 19803,
- "outputTokens": 6,
- "latencyMs": 1636.777374999947
- },
- {
- "questionId": "q141",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "6378",
- "actual": "6378",
- "isCorrect": true,
- "inputTokens": 21881,
- "outputTokens": 4,
- "latencyMs": 1841.5572499999544
- },
- {
- "questionId": "q141",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "6378",
- "actual": "6378",
- "isCorrect": true,
- "inputTokens": 13171,
- "outputTokens": 328,
- "latencyMs": 5539.711917000008
- },
- {
- "questionId": "q141",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "6378",
- "actual": "6378",
- "isCorrect": true,
- "inputTokens": 14482,
- "outputTokens": 6,
- "latencyMs": 1485.2025829999475
- },
- {
- "questionId": "q141",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "6378",
- "actual": "6378",
- "isCorrect": true,
- "inputTokens": 17076,
- "outputTokens": 4,
- "latencyMs": 1622.3209579999093
- },
- {
- "questionId": "q142",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "254916",
- "actual": "254916",
- "isCorrect": true,
- "inputTokens": 15189,
- "outputTokens": 456,
- "latencyMs": 5173.022708000033
- },
- {
- "questionId": "q142",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "254916",
- "actual": "254916",
- "isCorrect": true,
- "inputTokens": 17409,
- "outputTokens": 6,
- "latencyMs": 1700.1781669999473
- },
- {
- "questionId": "q142",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "254916",
- "actual": "254916",
- "isCorrect": true,
- "inputTokens": 19992,
- "outputTokens": 6,
- "latencyMs": 2883.810959000024
- },
- {
- "questionId": "q142",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "254916",
- "actual": "254916",
- "isCorrect": true,
- "inputTokens": 8790,
- "outputTokens": 1352,
- "latencyMs": 14519.361791000003
- },
- {
- "questionId": "q142",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "254916",
- "actual": "254916",
- "isCorrect": true,
- "inputTokens": 9279,
- "outputTokens": 6,
- "latencyMs": 1391.6377499999944
- },
- {
- "questionId": "q142",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "254916",
- "actual": "254916",
- "isCorrect": true,
- "inputTokens": 12338,
- "outputTokens": 6,
- "latencyMs": 2150.8105409999844
- },
- {
- "questionId": "q142",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "254916",
- "actual": "254916",
- "isCorrect": true,
- "inputTokens": 8558,
- "outputTokens": 968,
- "latencyMs": 12890.400166000007
- },
- {
- "questionId": "q142",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "254916",
- "actual": "254916",
- "isCorrect": true,
- "inputTokens": 9125,
- "outputTokens": 6,
- "latencyMs": 1352.297750000027
- },
- {
- "questionId": "q142",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "254916",
- "actual": "254916",
- "isCorrect": true,
- "inputTokens": 12208,
- "outputTokens": 6,
- "latencyMs": 3035.361290999921
- },
- {
- "questionId": "q142",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "254916",
- "actual": "254916",
- "isCorrect": true,
- "inputTokens": 17140,
- "outputTokens": 648,
- "latencyMs": 26188.04208299995
- },
- {
- "questionId": "q142",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "254916",
- "actual": "254916",
- "isCorrect": true,
- "inputTokens": 19804,
- "outputTokens": 6,
- "latencyMs": 1935.45787500008
- },
- {
- "questionId": "q142",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "254916",
- "actual": "254916",
- "isCorrect": true,
- "inputTokens": 21882,
- "outputTokens": 6,
- "latencyMs": 5415.2192920000525
- },
- {
- "questionId": "q142",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "254916",
- "actual": "254916",
- "isCorrect": true,
- "inputTokens": 13173,
- "outputTokens": 648,
- "latencyMs": 6512.995166999986
- },
- {
- "questionId": "q142",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "254916",
- "actual": "254916",
- "isCorrect": true,
- "inputTokens": 14483,
- "outputTokens": 6,
- "latencyMs": 1957.1825840000529
- },
- {
- "questionId": "q142",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "254916",
- "actual": "254916",
- "isCorrect": true,
- "inputTokens": 17077,
- "outputTokens": 6,
- "latencyMs": 1273.1987079998944
- },
- {
- "questionId": "q143",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "32413",
- "actual": "32413",
- "isCorrect": true,
- "inputTokens": 15187,
- "outputTokens": 712,
- "latencyMs": 7402.821666999953
- },
- {
- "questionId": "q143",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "32413",
- "actual": "32413",
- "isCorrect": true,
- "inputTokens": 17410,
- "outputTokens": 6,
- "latencyMs": 1297.3980420000153
- },
- {
- "questionId": "q143",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "32413",
- "actual": "32413",
- "isCorrect": true,
- "inputTokens": 19993,
- "outputTokens": 5,
- "latencyMs": 1398.1769159999676
- },
- {
- "questionId": "q143",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "32413",
- "actual": "32413",
- "isCorrect": true,
- "inputTokens": 8788,
- "outputTokens": 520,
- "latencyMs": 8047.9024590000045
- },
- {
- "questionId": "q143",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "32413",
- "actual": "32413",
- "isCorrect": true,
- "inputTokens": 9280,
- "outputTokens": 6,
- "latencyMs": 1149.3695000000298
- },
- {
- "questionId": "q143",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "32413",
- "actual": "32413",
- "isCorrect": true,
- "inputTokens": 12339,
- "outputTokens": 5,
- "latencyMs": 3275.751125000068
- },
- {
- "questionId": "q143",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "32413",
- "actual": "32413",
- "isCorrect": true,
- "inputTokens": 8556,
- "outputTokens": 520,
- "latencyMs": 10626.252958000056
- },
- {
- "questionId": "q143",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "32413",
- "actual": "32413",
- "isCorrect": true,
- "inputTokens": 9126,
- "outputTokens": 6,
- "latencyMs": 1084.1253329999745
- },
- {
- "questionId": "q143",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "32413",
- "actual": "32413",
- "isCorrect": true,
- "inputTokens": 12209,
- "outputTokens": 5,
- "latencyMs": 2478.551666000043
- },
- {
- "questionId": "q143",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "32413",
- "actual": "43222",
- "isCorrect": false,
- "inputTokens": 17138,
- "outputTokens": 2248,
- "latencyMs": 24645.130125000025
- },
- {
- "questionId": "q143",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "32413",
- "actual": "32413",
- "isCorrect": true,
- "inputTokens": 19805,
- "outputTokens": 6,
- "latencyMs": 1504.6681670000544
- },
- {
- "questionId": "q143",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "32413",
- "actual": "32413",
- "isCorrect": true,
- "inputTokens": 21883,
- "outputTokens": 5,
- "latencyMs": 1577.2633330000099
- },
- {
- "questionId": "q143",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "32413",
- "actual": "32413",
- "isCorrect": true,
- "inputTokens": 13171,
- "outputTokens": 776,
- "latencyMs": 8342.271167000057
- },
- {
- "questionId": "q143",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "32413",
- "actual": "32413",
- "isCorrect": true,
- "inputTokens": 14484,
- "outputTokens": 6,
- "latencyMs": 1397.2225839999737
- },
- {
- "questionId": "q143",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "32413",
- "actual": "32413",
- "isCorrect": true,
- "inputTokens": 17078,
- "outputTokens": 5,
- "latencyMs": 2600.8139589999337
- },
- {
- "questionId": "q144",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "240059",
- "actual": "240059",
- "isCorrect": true,
- "inputTokens": 15185,
- "outputTokens": 648,
- "latencyMs": 10642.901458999957
- },
- {
- "questionId": "q144",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "240059",
- "actual": "240059",
- "isCorrect": true,
- "inputTokens": 17405,
- "outputTokens": 6,
- "latencyMs": 1309.3054169999668
- },
- {
- "questionId": "q144",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "240059",
- "actual": "240059",
- "isCorrect": true,
- "inputTokens": 19989,
- "outputTokens": 6,
- "latencyMs": 1797.455083000008
- },
- {
- "questionId": "q144",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "240059",
- "actual": "240059",
- "isCorrect": true,
- "inputTokens": 8786,
- "outputTokens": 1096,
- "latencyMs": 11485.876249999972
- },
- {
- "questionId": "q144",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "240059",
- "actual": "240059",
- "isCorrect": true,
- "inputTokens": 9275,
- "outputTokens": 6,
- "latencyMs": 1909.1485000000102
- },
- {
- "questionId": "q144",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "240059",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 12335,
- "outputTokens": 1,
- "latencyMs": 2114.457832999993
- },
- {
- "questionId": "q144",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "240059",
- "actual": "Not found",
- "isCorrect": false,
- "inputTokens": 8554,
- "outputTokens": 2760,
- "latencyMs": 36680.54220799997
- },
- {
- "questionId": "q144",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "240059",
- "actual": "240059",
- "isCorrect": true,
- "inputTokens": 9121,
- "outputTokens": 6,
- "latencyMs": 1069.4299589999719
- },
- {
- "questionId": "q144",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "240059",
- "actual": "240059",
- "isCorrect": true,
- "inputTokens": 12205,
- "outputTokens": 6,
- "latencyMs": 2047.3995000000577
- },
- {
- "questionId": "q144",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "240059",
- "actual": "240059",
- "isCorrect": true,
- "inputTokens": 17136,
- "outputTokens": 456,
- "latencyMs": 8763.321875000023
- },
- {
- "questionId": "q144",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "240059",
- "actual": "240059",
- "isCorrect": true,
- "inputTokens": 19800,
- "outputTokens": 6,
- "latencyMs": 1591.410208000103
- },
- {
- "questionId": "q144",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "240059",
- "actual": "240059",
- "isCorrect": true,
- "inputTokens": 21879,
- "outputTokens": 6,
- "latencyMs": 1814.5240000000922
- },
- {
- "questionId": "q144",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "240059",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 13169,
- "outputTokens": 2951,
- "latencyMs": 28527.662250000052
- },
- {
- "questionId": "q144",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "240059",
- "actual": "240059",
- "isCorrect": true,
- "inputTokens": 14479,
- "outputTokens": 6,
- "latencyMs": 1341.8624169999966
- },
- {
- "questionId": "q144",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "240059",
- "actual": "240059",
- "isCorrect": true,
- "inputTokens": 17074,
- "outputTokens": 6,
- "latencyMs": 2672.0011249999516
- },
- {
- "questionId": "q145",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "48986",
- "actual": "48986",
- "isCorrect": true,
- "inputTokens": 15186,
- "outputTokens": 1288,
- "latencyMs": 11650.464916000026
- },
- {
- "questionId": "q145",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "48986",
- "actual": "48986",
- "isCorrect": true,
- "inputTokens": 17406,
- "outputTokens": 6,
- "latencyMs": 1736.123957999982
- },
- {
- "questionId": "q145",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "48986",
- "actual": "48986",
- "isCorrect": true,
- "inputTokens": 19989,
- "outputTokens": 5,
- "latencyMs": 2115.1809580000117
- },
- {
- "questionId": "q145",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "48986",
- "actual": "undefined",
- "isCorrect": false,
- "inputTokens": 8787,
- "outputTokens": 2119,
- "latencyMs": 22429.965708000003
- },
- {
- "questionId": "q145",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "48986",
- "actual": "48986",
- "isCorrect": true,
- "inputTokens": 9276,
- "outputTokens": 6,
- "latencyMs": 1280.45074999996
- },
- {
- "questionId": "q145",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "48986",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 12335,
- "outputTokens": 1,
- "latencyMs": 2039.6975419999799
- },
- {
- "questionId": "q145",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "48986",
- "actual": "48986",
- "isCorrect": true,
- "inputTokens": 8555,
- "outputTokens": 1352,
- "latencyMs": 13713.023125000065
- },
- {
- "questionId": "q145",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "48986",
- "actual": "48986",
- "isCorrect": true,
- "inputTokens": 9122,
- "outputTokens": 6,
- "latencyMs": 1190.7314999999944
- },
- {
- "questionId": "q145",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "48986",
- "actual": "None",
- "isCorrect": false,
- "inputTokens": 12205,
- "outputTokens": 1,
- "latencyMs": 3054.557584000053
- },
- {
- "questionId": "q145",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "48986",
- "actual": "48986",
- "isCorrect": true,
- "inputTokens": 17137,
- "outputTokens": 456,
- "latencyMs": 8163.3440420000115
- },
- {
- "questionId": "q145",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "48986",
- "actual": "48986",
- "isCorrect": true,
- "inputTokens": 19801,
- "outputTokens": 6,
- "latencyMs": 2508.831208000076
- },
- {
- "questionId": "q145",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "48986",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 21879,
- "outputTokens": 1,
- "latencyMs": 13907.184875000035
- },
- {
- "questionId": "q145",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "48986",
- "actual": "48986",
- "isCorrect": true,
- "inputTokens": 13170,
- "outputTokens": 968,
- "latencyMs": 9999.614625000046
- },
- {
- "questionId": "q145",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "48986",
- "actual": "48986",
- "isCorrect": true,
- "inputTokens": 14480,
- "outputTokens": 6,
- "latencyMs": 1401.668834000011
- },
- {
- "questionId": "q145",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "48986",
- "actual": "48986",
- "isCorrect": true,
- "inputTokens": 17074,
- "outputTokens": 5,
- "latencyMs": 3342.504416999989
- },
- {
- "questionId": "q146",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "209624",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 15185,
- "outputTokens": 1607,
- "latencyMs": 14253.204374999972
- },
- {
- "questionId": "q146",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "209624",
- "actual": "209624",
- "isCorrect": true,
- "inputTokens": 17405,
- "outputTokens": 6,
- "latencyMs": 1633.1817499999888
- },
- {
- "questionId": "q146",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "209624",
- "actual": "209624",
- "isCorrect": true,
- "inputTokens": 19989,
- "outputTokens": 6,
- "latencyMs": 4013.2274579999503
- },
- {
- "questionId": "q146",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "209624",
- "actual": "209624",
- "isCorrect": true,
- "inputTokens": 8786,
- "outputTokens": 1864,
- "latencyMs": 18068.214749999926
- },
- {
- "questionId": "q146",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "209624",
- "actual": "209624",
- "isCorrect": true,
- "inputTokens": 9275,
- "outputTokens": 6,
- "latencyMs": 2633.8406670000404
- },
- {
- "questionId": "q146",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "209624",
- "actual": "209624",
- "isCorrect": true,
- "inputTokens": 12335,
- "outputTokens": 6,
- "latencyMs": 2308.719957999885
- },
- {
- "questionId": "q146",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "209624",
- "actual": "209624",
- "isCorrect": true,
- "inputTokens": 8554,
- "outputTokens": 3592,
- "latencyMs": 34956.612250000006
- },
- {
- "questionId": "q146",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "209624",
- "actual": "209624",
- "isCorrect": true,
- "inputTokens": 9121,
- "outputTokens": 6,
- "latencyMs": 1042.174875000026
- },
- {
- "questionId": "q146",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "209624",
- "actual": "Not found",
- "isCorrect": false,
- "inputTokens": 12205,
- "outputTokens": 2,
- "latencyMs": 3570.2167079999344
- },
- {
- "questionId": "q146",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "209624",
- "actual": "209624",
- "isCorrect": true,
- "inputTokens": 17136,
- "outputTokens": 584,
- "latencyMs": 8155.267999999924
- },
- {
- "questionId": "q146",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "209624",
- "actual": "209624",
- "isCorrect": true,
- "inputTokens": 19800,
- "outputTokens": 6,
- "latencyMs": 1908.0532499999972
- },
- {
- "questionId": "q146",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "209624",
- "actual": "209624",
- "isCorrect": true,
- "inputTokens": 21879,
- "outputTokens": 6,
- "latencyMs": 4646.213583000004
- },
- {
- "questionId": "q146",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "209624",
- "actual": "209624",
- "isCorrect": true,
- "inputTokens": 13169,
- "outputTokens": 392,
- "latencyMs": 8023.040708000073
- },
- {
- "questionId": "q146",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "209624",
- "actual": "209624",
- "isCorrect": true,
- "inputTokens": 14479,
- "outputTokens": 6,
- "latencyMs": 1252.574666999979
- },
- {
- "questionId": "q146",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "209624",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 17074,
- "outputTokens": 1,
- "latencyMs": 9256.544125000015
- },
- {
- "questionId": "q147",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "58023",
- "actual": "58023",
- "isCorrect": true,
- "inputTokens": 15185,
- "outputTokens": 328,
- "latencyMs": 6800.243999999948
- },
- {
- "questionId": "q147",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "58023",
- "actual": "58023",
- "isCorrect": true,
- "inputTokens": 17406,
- "outputTokens": 6,
- "latencyMs": 1856.026916999952
- },
- {
- "questionId": "q147",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "58023",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 19989,
- "outputTokens": 1,
- "latencyMs": 1783.4203330000164
- },
- {
- "questionId": "q147",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "58023",
- "actual": "58023",
- "isCorrect": true,
- "inputTokens": 8786,
- "outputTokens": 904,
- "latencyMs": 8408.46395799995
- },
- {
- "questionId": "q147",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "58023",
- "actual": "58023",
- "isCorrect": true,
- "inputTokens": 9276,
- "outputTokens": 6,
- "latencyMs": 1048.0284159999574
- },
- {
- "questionId": "q147",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "58023",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 12335,
- "outputTokens": 1,
- "latencyMs": 2309.89829199994
- },
- {
- "questionId": "q147",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "58023",
- "actual": "58023",
- "isCorrect": true,
- "inputTokens": 8554,
- "outputTokens": 456,
- "latencyMs": 7778.412583000027
- },
- {
- "questionId": "q147",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "58023",
- "actual": "58023",
- "isCorrect": true,
- "inputTokens": 9122,
- "outputTokens": 6,
- "latencyMs": 1095.3032080000266
- },
- {
- "questionId": "q147",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "58023",
- "actual": "58023",
- "isCorrect": true,
- "inputTokens": 12205,
- "outputTokens": 5,
- "latencyMs": 2191.419332999969
- },
- {
- "questionId": "q147",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "58023",
- "actual": "58023",
- "isCorrect": true,
- "inputTokens": 17136,
- "outputTokens": 328,
- "latencyMs": 5028.444708000054
- },
- {
- "questionId": "q147",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "58023",
- "actual": "58023",
- "isCorrect": true,
- "inputTokens": 19801,
- "outputTokens": 6,
- "latencyMs": 1697.0504170000786
- },
- {
- "questionId": "q147",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "58023",
- "actual": "58023",
- "isCorrect": true,
- "inputTokens": 21879,
- "outputTokens": 5,
- "latencyMs": 1800.0818329999456
- },
- {
- "questionId": "q147",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "58023",
- "actual": "58023",
- "isCorrect": true,
- "inputTokens": 13169,
- "outputTokens": 712,
- "latencyMs": 8022.871625000029
- },
- {
- "questionId": "q147",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "58023",
- "actual": "58023",
- "isCorrect": true,
- "inputTokens": 14480,
- "outputTokens": 6,
- "latencyMs": 1105.1744999999646
- },
- {
- "questionId": "q147",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "58023",
- "actual": "58023",
- "isCorrect": true,
- "inputTokens": 17074,
- "outputTokens": 5,
- "latencyMs": 2765.7437500000233
- },
- {
- "questionId": "q148",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "196024",
- "actual": "196024",
- "isCorrect": true,
- "inputTokens": 15188,
- "outputTokens": 328,
- "latencyMs": 4684.178457999951
- },
- {
- "questionId": "q148",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "196024",
- "actual": "196024",
- "isCorrect": true,
- "inputTokens": 17407,
- "outputTokens": 6,
- "latencyMs": 1856.438208000036
- },
- {
- "questionId": "q148",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "196024",
- "actual": "196024",
- "isCorrect": true,
- "inputTokens": 19991,
- "outputTokens": 6,
- "latencyMs": 4894.268209000002
- },
- {
- "questionId": "q148",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "196024",
- "actual": "196024",
- "isCorrect": true,
- "inputTokens": 8789,
- "outputTokens": 1608,
- "latencyMs": 19985.54383400001
- },
- {
- "questionId": "q148",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "196024",
- "actual": "196024",
- "isCorrect": true,
- "inputTokens": 9277,
- "outputTokens": 6,
- "latencyMs": 1212.5407500000438
- },
- {
- "questionId": "q148",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "196024",
- "actual": "N/A",
- "isCorrect": false,
- "inputTokens": 12337,
- "outputTokens": 3,
- "latencyMs": 12548.686624999973
- },
- {
- "questionId": "q148",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "196024",
- "actual": "196024",
- "isCorrect": true,
- "inputTokens": 8557,
- "outputTokens": 2760,
- "latencyMs": 20131.88070800004
- },
- {
- "questionId": "q148",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "196024",
- "actual": "196024",
- "isCorrect": true,
- "inputTokens": 9123,
- "outputTokens": 6,
- "latencyMs": 1217.2275000000373
- },
- {
- "questionId": "q148",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "196024",
- "actual": "196024",
- "isCorrect": true,
- "inputTokens": 12207,
- "outputTokens": 6,
- "latencyMs": 2748.620916999993
- },
- {
- "questionId": "q148",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "196024",
- "actual": "196024",
- "isCorrect": true,
- "inputTokens": 17139,
- "outputTokens": 392,
- "latencyMs": 6418.833957999945
- },
- {
- "questionId": "q148",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "196024",
- "actual": "196024",
- "isCorrect": true,
- "inputTokens": 19802,
- "outputTokens": 6,
- "latencyMs": 2019.8872089999495
- },
- {
- "questionId": "q148",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "196024",
- "actual": "196024",
- "isCorrect": true,
- "inputTokens": 21881,
- "outputTokens": 6,
- "latencyMs": 2523.128167000017
- },
- {
- "questionId": "q148",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "196024",
- "actual": "196024",
- "isCorrect": true,
- "inputTokens": 13172,
- "outputTokens": 584,
- "latencyMs": 8212.874959000037
- },
- {
- "questionId": "q148",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "196024",
- "actual": "196024",
- "isCorrect": true,
- "inputTokens": 14481,
- "outputTokens": 6,
- "latencyMs": 1151.26241700002
- },
- {
- "questionId": "q148",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "196024",
- "actual": "196024",
- "isCorrect": true,
- "inputTokens": 17076,
- "outputTokens": 6,
- "latencyMs": 3479.8169999999227
- },
- {
- "questionId": "q149",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "30919",
- "actual": "30919",
- "isCorrect": true,
- "inputTokens": 15188,
- "outputTokens": 456,
- "latencyMs": 6856.402957999962
- },
- {
- "questionId": "q149",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "30919",
- "actual": "30919",
- "isCorrect": true,
- "inputTokens": 17408,
- "outputTokens": 6,
- "latencyMs": 1727.7318750000559
- },
- {
- "questionId": "q149",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "30919",
- "actual": "30919",
- "isCorrect": true,
- "inputTokens": 19991,
- "outputTokens": 5,
- "latencyMs": 5595.708332999959
- },
- {
- "questionId": "q149",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "30919",
- "actual": "30919",
- "isCorrect": true,
- "inputTokens": 8789,
- "outputTokens": 584,
- "latencyMs": 5889.62179200002
- },
- {
- "questionId": "q149",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "30919",
- "actual": "30919",
- "isCorrect": true,
- "inputTokens": 9278,
- "outputTokens": 6,
- "latencyMs": 1206.469458000036
- },
- {
- "questionId": "q149",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "30919",
- "actual": "30919",
- "isCorrect": true,
- "inputTokens": 12337,
- "outputTokens": 5,
- "latencyMs": 2057.8787500000326
- },
- {
- "questionId": "q149",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "30919",
- "actual": "30919",
- "isCorrect": true,
- "inputTokens": 8557,
- "outputTokens": 584,
- "latencyMs": 6905.8247499999125
- },
- {
- "questionId": "q149",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "30919",
- "actual": "30919",
- "isCorrect": true,
- "inputTokens": 9124,
- "outputTokens": 6,
- "latencyMs": 1003.953542000032
- },
- {
- "questionId": "q149",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "30919",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 12207,
- "outputTokens": 1,
- "latencyMs": 2500.2377919999417
- },
- {
- "questionId": "q149",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "30919",
- "actual": "30919",
- "isCorrect": true,
- "inputTokens": 17139,
- "outputTokens": 264,
- "latencyMs": 4909.18979199999
- },
- {
- "questionId": "q149",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "30919",
- "actual": "30919",
- "isCorrect": true,
- "inputTokens": 19803,
- "outputTokens": 6,
- "latencyMs": 2457.2324580000713
- },
- {
- "questionId": "q149",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "30919",
- "actual": "30919",
- "isCorrect": true,
- "inputTokens": 21881,
- "outputTokens": 5,
- "latencyMs": 1428.471666000085
- },
- {
- "questionId": "q149",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "30919",
- "actual": "30919",
- "isCorrect": true,
- "inputTokens": 13172,
- "outputTokens": 392,
- "latencyMs": 5668.693708000006
- },
- {
- "questionId": "q149",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "30919",
- "actual": "30919",
- "isCorrect": true,
- "inputTokens": 14482,
- "outputTokens": 6,
- "latencyMs": 1222.2983330000425
- },
- {
- "questionId": "q149",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "30919",
- "actual": "30919",
- "isCorrect": true,
- "inputTokens": 17076,
- "outputTokens": 5,
- "latencyMs": 3050.278290999937
- },
- {
- "questionId": "q150",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "192220",
- "actual": "192220",
- "isCorrect": true,
- "inputTokens": 15187,
- "outputTokens": 456,
- "latencyMs": 7561.326083000051
- },
- {
- "questionId": "q150",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "192220",
- "actual": "192220",
- "isCorrect": true,
- "inputTokens": 17405,
- "outputTokens": 6,
- "latencyMs": 2041.015417000046
- },
- {
- "questionId": "q150",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "192220",
- "actual": "192220",
- "isCorrect": true,
- "inputTokens": 19989,
- "outputTokens": 6,
- "latencyMs": 1918.6380409999983
- },
- {
- "questionId": "q150",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "192220",
- "actual": "192220",
- "isCorrect": true,
- "inputTokens": 8788,
- "outputTokens": 776,
- "latencyMs": 7871.997415999998
- },
- {
- "questionId": "q150",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "192220",
- "actual": "192220",
- "isCorrect": true,
- "inputTokens": 9275,
- "outputTokens": 6,
- "latencyMs": 1578.9285829999717
- },
- {
- "questionId": "q150",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "192220",
- "actual": "192220",
- "isCorrect": true,
- "inputTokens": 12335,
- "outputTokens": 6,
- "latencyMs": 2032.75475000008
- },
- {
- "questionId": "q150",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "192220",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 8556,
- "outputTokens": 1159,
- "latencyMs": 30959.83791699994
- },
- {
- "questionId": "q150",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "192220",
- "actual": "192220",
- "isCorrect": true,
- "inputTokens": 9121,
- "outputTokens": 6,
- "latencyMs": 1389.4868339999812
- },
- {
- "questionId": "q150",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "192220",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 12205,
- "outputTokens": 1,
- "latencyMs": 3573.9437089998974
- },
- {
- "questionId": "q150",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "192220",
- "actual": "192220",
- "isCorrect": true,
- "inputTokens": 17138,
- "outputTokens": 392,
- "latencyMs": 6992.854374999995
- },
- {
- "questionId": "q150",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "192220",
- "actual": "192220",
- "isCorrect": true,
- "inputTokens": 19800,
- "outputTokens": 6,
- "latencyMs": 1679.577958000009
- },
- {
- "questionId": "q150",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "192220",
- "actual": "192220",
- "isCorrect": true,
- "inputTokens": 21879,
- "outputTokens": 6,
- "latencyMs": 1553.5702499999898
- },
- {
- "questionId": "q150",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "192220",
- "actual": "192220",
- "isCorrect": true,
- "inputTokens": 13171,
- "outputTokens": 328,
- "latencyMs": 4169.634166999953
- },
- {
- "questionId": "q150",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "192220",
- "actual": "192220",
- "isCorrect": true,
- "inputTokens": 14479,
- "outputTokens": 6,
- "latencyMs": 1384.3902089999756
- },
- {
- "questionId": "q150",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "192220",
- "actual": "192220",
- "isCorrect": true,
- "inputTokens": 17074,
- "outputTokens": 6,
- "latencyMs": 2953.2877919999883
- },
- {
- "questionId": "q151",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "11763",
- "actual": "11763",
- "isCorrect": true,
- "inputTokens": 15190,
- "outputTokens": 584,
- "latencyMs": 6612.153208000003
- },
- {
- "questionId": "q151",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "11763",
- "actual": "11763",
- "isCorrect": true,
- "inputTokens": 17414,
- "outputTokens": 6,
- "latencyMs": 2259.919874999905
- },
- {
- "questionId": "q151",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "11763",
- "actual": "11763",
- "isCorrect": true,
- "inputTokens": 19997,
- "outputTokens": 5,
- "latencyMs": 4557.873041000101
- },
- {
- "questionId": "q151",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "11763",
- "actual": "11763",
- "isCorrect": true,
- "inputTokens": 8791,
- "outputTokens": 712,
- "latencyMs": 7556.261375000002
- },
- {
- "questionId": "q151",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "11763",
- "actual": "11763",
- "isCorrect": true,
- "inputTokens": 9284,
- "outputTokens": 6,
- "latencyMs": 1012.9206669999985
- },
- {
- "questionId": "q151",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "11763",
- "actual": "11763",
- "isCorrect": true,
- "inputTokens": 12343,
- "outputTokens": 5,
- "latencyMs": 6754.191916999989
- },
- {
- "questionId": "q151",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "11763",
- "actual": "11763",
- "isCorrect": true,
- "inputTokens": 8559,
- "outputTokens": 712,
- "latencyMs": 7742.647875000024
- },
- {
- "questionId": "q151",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "11763",
- "actual": "11763",
- "isCorrect": true,
- "inputTokens": 9130,
- "outputTokens": 6,
- "latencyMs": 1578.1971669999184
- },
- {
- "questionId": "q151",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "11763",
- "actual": "11763",
- "isCorrect": true,
- "inputTokens": 12213,
- "outputTokens": 5,
- "latencyMs": 7366.954833999975
- },
- {
- "questionId": "q151",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "11763",
- "actual": "11763",
- "isCorrect": true,
- "inputTokens": 17141,
- "outputTokens": 328,
- "latencyMs": 6099.567540999968
- },
- {
- "questionId": "q151",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "11763",
- "actual": "11763",
- "isCorrect": true,
- "inputTokens": 19809,
- "outputTokens": 6,
- "latencyMs": 1278.9319580000592
- },
- {
- "questionId": "q151",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "11763",
- "actual": "11763",
- "isCorrect": true,
- "inputTokens": 21887,
- "outputTokens": 5,
- "latencyMs": 4035.024666000041
- },
- {
- "questionId": "q151",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "11763",
- "actual": "11763",
- "isCorrect": true,
- "inputTokens": 13174,
- "outputTokens": 456,
- "latencyMs": 4068.7430829999503
- },
- {
- "questionId": "q151",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "11763",
- "actual": "11763",
- "isCorrect": true,
- "inputTokens": 14488,
- "outputTokens": 6,
- "latencyMs": 1183.168624999933
- },
- {
- "questionId": "q151",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "11763",
- "actual": "11763",
- "isCorrect": true,
- "inputTokens": 17082,
- "outputTokens": 5,
- "latencyMs": 1311.251791000017
- },
- {
- "questionId": "q152",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "114",
- "isCorrect": false,
- "inputTokens": 15187,
- "outputTokens": 3271,
- "latencyMs": 26292.3486250001
- },
- {
- "questionId": "q152",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 17406,
- "outputTokens": 5,
- "latencyMs": 1269.8386670000618
- },
- {
- "questionId": "q152",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "2",
- "isCorrect": false,
- "inputTokens": 19990,
- "outputTokens": 1,
- "latencyMs": 1418.8326250000391
- },
- {
- "questionId": "q152",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 8788,
- "outputTokens": 711,
- "latencyMs": 7467.631458999938
- },
- {
- "questionId": "q152",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 9276,
- "outputTokens": 5,
- "latencyMs": 1310.1392090000445
- },
- {
- "questionId": "q152",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 12336,
- "outputTokens": 1,
- "latencyMs": 2714.426749999984
- },
- {
- "questionId": "q152",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 8556,
- "outputTokens": 903,
- "latencyMs": 10460.54125000001
- },
- {
- "questionId": "q152",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 9122,
- "outputTokens": 5,
- "latencyMs": 1165.5718329999363
- },
- {
- "questionId": "q152",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 12206,
- "outputTokens": 1,
- "latencyMs": 6584.999583999976
- },
- {
- "questionId": "q152",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 17138,
- "outputTokens": 519,
- "latencyMs": 7805.630750000011
- },
- {
- "questionId": "q152",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 19801,
- "outputTokens": 5,
- "latencyMs": 1370.0252500000643
- },
- {
- "questionId": "q152",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 21880,
- "outputTokens": 1,
- "latencyMs": 1457.9777079999913
- },
- {
- "questionId": "q152",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 13171,
- "outputTokens": 2055,
- "latencyMs": 73627.54529200005
- },
- {
- "questionId": "q152",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "0",
- "isCorrect": false,
- "inputTokens": 14480,
- "outputTokens": 5,
- "latencyMs": 1786.1586249999236
- },
- {
- "questionId": "q152",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "2",
- "isCorrect": false,
- "inputTokens": 17075,
- "outputTokens": 1,
- "latencyMs": 19150.725124999997
- },
- {
- "questionId": "q153",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "15404143",
- "actual": "13886916",
- "isCorrect": false,
- "inputTokens": 15188,
- "outputTokens": 5833,
- "latencyMs": 354484.18529200007
- },
- {
- "questionId": "q153",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "15404143",
- "actual": "13,847,892",
- "isCorrect": false,
- "inputTokens": 17407,
- "outputTokens": 9,
- "latencyMs": 1871.1713750000345
- },
- {
- "questionId": "q153",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "15404143",
- "actual": "12990000",
- "isCorrect": false,
- "inputTokens": 19991,
- "outputTokens": 8,
- "latencyMs": 155538.94058299996
- },
- {
- "questionId": "q153",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "15404143",
- "actual": "15404143",
- "isCorrect": true,
- "inputTokens": 8789,
- "outputTokens": 5577,
- "latencyMs": 46411.59825000004
- },
- {
- "questionId": "q153",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "15404143",
- "actual": "13,847,892",
- "isCorrect": false,
- "inputTokens": 9277,
- "outputTokens": 9,
- "latencyMs": 1184.7457910000812
- },
- {
- "questionId": "q153",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "15404143",
- "actual": "14371343",
- "isCorrect": false,
- "inputTokens": 12337,
- "outputTokens": 8,
- "latencyMs": 27093.977375000017
- },
- {
- "questionId": "q153",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "15404143",
- "actual": "15404143",
- "isCorrect": true,
- "inputTokens": 8557,
- "outputTokens": 5321,
- "latencyMs": 40838.23450000002
- },
- {
- "questionId": "q153",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "15404143",
- "actual": "15,847,892",
- "isCorrect": false,
- "inputTokens": 9123,
- "outputTokens": 9,
- "latencyMs": 1243.0417080000043
- },
- {
- "questionId": "q153",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "15404143",
- "actual": "10000000",
- "isCorrect": false,
- "inputTokens": 12207,
- "outputTokens": 8,
- "latencyMs": 1697.566125000012
- },
- {
- "questionId": "q153",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "15404143",
- "actual": "11887802",
- "isCorrect": false,
- "inputTokens": 17139,
- "outputTokens": 3465,
- "latencyMs": 35017.48091599997
- },
- {
- "questionId": "q153",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "15404143",
- "actual": "10,847,892",
- "isCorrect": false,
- "inputTokens": 19802,
- "outputTokens": 9,
- "latencyMs": 1783.1710419999436
- },
- {
- "questionId": "q153",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "15404143",
- "actual": "14000000",
- "isCorrect": false,
- "inputTokens": 21881,
- "outputTokens": 8,
- "latencyMs": 20208.78741599992
- },
- {
- "questionId": "q153",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "15404143",
- "actual": "14012139",
- "isCorrect": false,
- "inputTokens": 13172,
- "outputTokens": 14601,
- "latencyMs": 139937.6586659999
- },
- {
- "questionId": "q153",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "15404143",
- "actual": "13,847,892",
- "isCorrect": false,
- "inputTokens": 14481,
- "outputTokens": 9,
- "latencyMs": 1949.8563330000034
- },
- {
- "questionId": "q153",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "15404143",
- "actual": "10999999",
- "isCorrect": false,
- "inputTokens": 17076,
- "outputTokens": 8,
- "latencyMs": 1061.2076249999227
- },
- {
- "questionId": "q154",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "86",
- "isCorrect": false,
- "inputTokens": 15188,
- "outputTokens": 3591,
- "latencyMs": 186054.49916699994
- },
- {
- "questionId": "q154",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 17408,
- "outputTokens": 5,
- "latencyMs": 1541.018458000035
- },
- {
- "questionId": "q154",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "59",
- "isCorrect": false,
- "inputTokens": 19994,
- "outputTokens": 2,
- "latencyMs": 1209.527832999942
- },
- {
- "questionId": "q154",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 8789,
- "outputTokens": 2311,
- "latencyMs": 20000.66104200005
- },
- {
- "questionId": "q154",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 9278,
- "outputTokens": 5,
- "latencyMs": 1125.2787499999395
- },
- {
- "questionId": "q154",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "50",
- "isCorrect": false,
- "inputTokens": 12340,
- "outputTokens": 2,
- "latencyMs": 2061.19062499993
- },
- {
- "questionId": "q154",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 8557,
- "outputTokens": 3271,
- "latencyMs": 29091.357792000053
- },
- {
- "questionId": "q154",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 9124,
- "outputTokens": 5,
- "latencyMs": 1029.3966670000227
- },
- {
- "questionId": "q154",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "59",
- "isCorrect": false,
- "inputTokens": 12210,
- "outputTokens": 2,
- "latencyMs": 2304.6412080000155
- },
- {
- "questionId": "q154",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "88",
- "isCorrect": false,
- "inputTokens": 17139,
- "outputTokens": 2375,
- "latencyMs": 25588.054458
- },
- {
- "questionId": "q154",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 19803,
- "outputTokens": 5,
- "latencyMs": 1378.1570839999476
- },
- {
- "questionId": "q154",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 21884,
- "outputTokens": 3,
- "latencyMs": 28098.016750000068
- },
- {
- "questionId": "q154",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "88",
- "isCorrect": false,
- "inputTokens": 13172,
- "outputTokens": 4359,
- "latencyMs": 47106.68116699997
- },
- {
- "questionId": "q154",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 14482,
- "outputTokens": 5,
- "latencyMs": 2077.1985829999903
- },
- {
- "questionId": "q154",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "50",
- "isCorrect": false,
- "inputTokens": 17079,
- "outputTokens": 2,
- "latencyMs": 1049.9515410000458
- },
- {
- "questionId": "q155",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 15188,
- "outputTokens": 5639,
- "latencyMs": 52034.31104199996
- },
- {
- "questionId": "q155",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "71",
- "isCorrect": false,
- "inputTokens": 17408,
- "outputTokens": 5,
- "latencyMs": 1774.2209169999696
- },
- {
- "questionId": "q155",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "60",
- "isCorrect": false,
- "inputTokens": 19994,
- "outputTokens": 2,
- "latencyMs": 1397.8998329999158
- },
- {
- "questionId": "q155",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 8789,
- "outputTokens": 2823,
- "latencyMs": 26509.484792000032
- },
- {
- "questionId": "q155",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "42",
- "isCorrect": false,
- "inputTokens": 9278,
- "outputTokens": 5,
- "latencyMs": 1028.7182500000345
- },
- {
- "questionId": "q155",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 12340,
- "outputTokens": 3,
- "latencyMs": 21919.32149999996
- },
- {
- "questionId": "q155",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 8557,
- "outputTokens": 2631,
- "latencyMs": 32920.081041999976
- },
- {
- "questionId": "q155",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "47",
- "isCorrect": false,
- "inputTokens": 9124,
- "outputTokens": 5,
- "latencyMs": 1246.9641250000568
- },
- {
- "questionId": "q155",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 12210,
- "outputTokens": 3,
- "latencyMs": 17704.908124999958
- },
- {
- "questionId": "q155",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "79",
- "isCorrect": false,
- "inputTokens": 17139,
- "outputTokens": 4359,
- "latencyMs": 36706.952500000014
- },
- {
- "questionId": "q155",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "50",
- "isCorrect": false,
- "inputTokens": 19803,
- "outputTokens": 5,
- "latencyMs": 1653.922874999931
- },
- {
- "questionId": "q155",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 21884,
- "outputTokens": 3,
- "latencyMs": 18907.825375000015
- },
- {
- "questionId": "q155",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "88",
- "isCorrect": false,
- "inputTokens": 13172,
- "outputTokens": 2567,
- "latencyMs": 29826.266333999927
- },
- {
- "questionId": "q155",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "71",
- "isCorrect": false,
- "inputTokens": 14482,
- "outputTokens": 5,
- "latencyMs": 1877.8078329999698
- },
- {
- "questionId": "q155",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "60",
- "isCorrect": false,
- "inputTokens": 17079,
- "outputTokens": 2,
- "latencyMs": 1709.5576250000158
- },
- {
- "questionId": "q156",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "76",
- "actual": "61",
- "isCorrect": false,
- "inputTokens": 15188,
- "outputTokens": 3015,
- "latencyMs": 27373.73904200003
- },
- {
- "questionId": "q156",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "76",
- "actual": "100",
- "isCorrect": false,
- "inputTokens": 17408,
- "outputTokens": 5,
- "latencyMs": 2553.873874999932
- },
- {
- "questionId": "q156",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "76",
- "actual": "50",
- "isCorrect": false,
- "inputTokens": 19995,
- "outputTokens": 2,
- "latencyMs": 1292.7788750000764
- },
- {
- "questionId": "q156",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "76",
- "actual": "76",
- "isCorrect": true,
- "inputTokens": 8789,
- "outputTokens": 3911,
- "latencyMs": 38466.93025000009
- },
- {
- "questionId": "q156",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "76",
- "actual": "100",
- "isCorrect": false,
- "inputTokens": 9278,
- "outputTokens": 5,
- "latencyMs": 1207.3981249999488
- },
- {
- "questionId": "q156",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "76",
- "actual": "76",
- "isCorrect": true,
- "inputTokens": 12341,
- "outputTokens": 2,
- "latencyMs": 21904.33095799992
- },
- {
- "questionId": "q156",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "76",
- "actual": "75",
- "isCorrect": false,
- "inputTokens": 8557,
- "outputTokens": 2951,
- "latencyMs": 38943.062832999974
- },
- {
- "questionId": "q156",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "76",
- "actual": "100",
- "isCorrect": false,
- "inputTokens": 9124,
- "outputTokens": 5,
- "latencyMs": 1096.0891670000274
- },
- {
- "questionId": "q156",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "76",
- "actual": "76",
- "isCorrect": true,
- "inputTokens": 12211,
- "outputTokens": 2,
- "latencyMs": 16468.647499999963
- },
- {
- "questionId": "q156",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "76",
- "actual": "64",
- "isCorrect": false,
- "inputTokens": 17139,
- "outputTokens": 1863,
- "latencyMs": 18473.753917000024
- },
- {
- "questionId": "q156",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "76",
- "actual": "100",
- "isCorrect": false,
- "inputTokens": 19803,
- "outputTokens": 5,
- "latencyMs": 1316.2989590000361
- },
- {
- "questionId": "q156",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "76",
- "actual": "47",
- "isCorrect": false,
- "inputTokens": 21885,
- "outputTokens": 2,
- "latencyMs": 1786.060832999996
- },
- {
- "questionId": "q156",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "76",
- "actual": "72",
- "isCorrect": false,
- "inputTokens": 13172,
- "outputTokens": 8711,
- "latencyMs": 86456.99716699996
- },
- {
- "questionId": "q156",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "76",
- "actual": "100",
- "isCorrect": false,
- "inputTokens": 14482,
- "outputTokens": 5,
- "latencyMs": 1337.9467500000028
- },
- {
- "questionId": "q156",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "76",
- "actual": "42",
- "isCorrect": false,
- "inputTokens": 17080,
- "outputTokens": 2,
- "latencyMs": 1272.1261659999145
- },
- {
- "questionId": "q157",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "139",
- "isCorrect": false,
- "inputTokens": 15188,
- "outputTokens": 8199,
- "latencyMs": 117751.80679199996
- },
- {
- "questionId": "q157",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "89",
- "isCorrect": false,
- "inputTokens": 17409,
- "outputTokens": 5,
- "latencyMs": 6994.20404099999
- },
- {
- "questionId": "q157",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "60",
- "isCorrect": false,
- "inputTokens": 19993,
- "outputTokens": 2,
- "latencyMs": 1664.0891249999404
- },
- {
- "questionId": "q157",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 8789,
- "outputTokens": 4103,
- "latencyMs": 33535.55912499991
- },
- {
- "questionId": "q157",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "73",
- "isCorrect": false,
- "inputTokens": 9279,
- "outputTokens": 5,
- "latencyMs": 1228.1867499999935
- },
- {
- "questionId": "q157",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "60",
- "isCorrect": false,
- "inputTokens": 12339,
- "outputTokens": 2,
- "latencyMs": 1517.6247079999885
- },
- {
- "questionId": "q157",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "87",
- "isCorrect": false,
- "inputTokens": 8557,
- "outputTokens": 3079,
- "latencyMs": 27126.57024999999
- },
- {
- "questionId": "q157",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "89",
- "isCorrect": false,
- "inputTokens": 9125,
- "outputTokens": 5,
- "latencyMs": 949.5018749999581
- },
- {
- "questionId": "q157",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "60",
- "isCorrect": false,
- "inputTokens": 12209,
- "outputTokens": 2,
- "latencyMs": 2366.7855419999687
- },
- {
- "questionId": "q157",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "69",
- "isCorrect": false,
- "inputTokens": 17139,
- "outputTokens": 2183,
- "latencyMs": 35555.629874999984
- },
- {
- "questionId": "q157",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "71",
- "isCorrect": false,
- "inputTokens": 19804,
- "outputTokens": 5,
- "latencyMs": 1865.6005420000292
- },
- {
- "questionId": "q157",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 21883,
- "outputTokens": 3,
- "latencyMs": 22966.85654200008
- },
- {
- "questionId": "q157",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "100",
- "actual": "100",
- "isCorrect": true,
- "inputTokens": 13172,
- "outputTokens": 2503,
- "latencyMs": 23299.811666000052
- },
- {
- "questionId": "q157",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "100",
- "actual": "95",
- "isCorrect": false,
- "inputTokens": 14483,
- "outputTokens": 5,
- "latencyMs": 1111.9951249998994
- },
- {
- "questionId": "q157",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "100",
- "actual": "50",
- "isCorrect": false,
- "inputTokens": 17078,
- "outputTokens": 2,
- "latencyMs": 1229.8220420000143
- },
- {
- "questionId": "q158",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "95",
- "actual": "60",
- "isCorrect": false,
- "inputTokens": 15188,
- "outputTokens": 2439,
- "latencyMs": 23952.90112500009
- },
- {
- "questionId": "q158",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "95",
- "actual": "42",
- "isCorrect": false,
- "inputTokens": 17409,
- "outputTokens": 5,
- "latencyMs": 2635.0509999999776
- },
- {
- "questionId": "q158",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "95",
- "actual": "59",
- "isCorrect": false,
- "inputTokens": 19993,
- "outputTokens": 2,
- "latencyMs": 1382.6497909999453
- },
- {
- "questionId": "q158",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "95",
- "actual": "95",
- "isCorrect": true,
- "inputTokens": 8789,
- "outputTokens": 5255,
- "latencyMs": 52427.638499999885
- },
- {
- "questionId": "q158",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "95",
- "actual": "42",
- "isCorrect": false,
- "inputTokens": 9279,
- "outputTokens": 5,
- "latencyMs": 1752.1665410000132
- },
- {
- "questionId": "q158",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "95",
- "actual": "95",
- "isCorrect": true,
- "inputTokens": 12339,
- "outputTokens": 2,
- "latencyMs": 30665.240666999947
- },
- {
- "questionId": "q158",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "95",
- "actual": "96",
- "isCorrect": false,
- "inputTokens": 8557,
- "outputTokens": 4999,
- "latencyMs": 52545.94787500007
- },
- {
- "questionId": "q158",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "95",
- "actual": "42",
- "isCorrect": false,
- "inputTokens": 9125,
- "outputTokens": 5,
- "latencyMs": 1330.860624999972
- },
- {
- "questionId": "q158",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "95",
- "actual": "60",
- "isCorrect": false,
- "inputTokens": 12209,
- "outputTokens": 2,
- "latencyMs": 2559.635125000146
- },
- {
- "questionId": "q158",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "95",
- "actual": "96",
- "isCorrect": false,
- "inputTokens": 17139,
- "outputTokens": 13447,
- "latencyMs": 177292.60950000002
- },
- {
- "questionId": "q158",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "95",
- "actual": "32",
- "isCorrect": false,
- "inputTokens": 19804,
- "outputTokens": 5,
- "latencyMs": 1816.5423749999609
- },
- {
- "questionId": "q158",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "95",
- "actual": "60",
- "isCorrect": false,
- "inputTokens": 21883,
- "outputTokens": 2,
- "latencyMs": 3004.8347500001546
- },
- {
- "questionId": "q158",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "95",
- "actual": "96",
- "isCorrect": false,
- "inputTokens": 13172,
- "outputTokens": 3975,
- "latencyMs": 42573.26512499992
- },
- {
- "questionId": "q158",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "95",
- "actual": "47",
- "isCorrect": false,
- "inputTokens": 14483,
- "outputTokens": 5,
- "latencyMs": 1499.2267080000602
- },
- {
- "questionId": "q158",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "95",
- "actual": "60",
- "isCorrect": false,
- "inputTokens": 17078,
- "outputTokens": 2,
- "latencyMs": 1173.8084579999559
- },
- {
- "questionId": "q159",
- "format": "json",
- "model": "gpt-5-nano",
- "expected": "83",
- "actual": "50",
- "isCorrect": false,
- "inputTokens": 15188,
- "outputTokens": 11719,
- "latencyMs": 109516.51062500011
- },
- {
- "questionId": "q159",
- "format": "json",
- "model": "claude-haiku-4-5",
- "expected": "83",
- "actual": "71",
- "isCorrect": false,
- "inputTokens": 17409,
- "outputTokens": 5,
- "latencyMs": 1886.0561250001192
- },
- {
- "questionId": "q159",
- "format": "json",
- "model": "gemini-2.5-flash",
- "expected": "83",
- "actual": "59",
- "isCorrect": false,
- "inputTokens": 19994,
- "outputTokens": 2,
- "latencyMs": 2211.6038330001757
- },
- {
- "questionId": "q159",
- "format": "toon",
- "model": "gpt-5-nano",
- "expected": "83",
- "actual": "83",
- "isCorrect": true,
- "inputTokens": 8789,
- "outputTokens": 3463,
- "latencyMs": 36709.80866700015
- },
- {
- "questionId": "q159",
- "format": "toon",
- "model": "claude-haiku-4-5",
- "expected": "83",
- "actual": "73",
- "isCorrect": false,
- "inputTokens": 9279,
- "outputTokens": 5,
- "latencyMs": 1961.9631250000093
- },
- {
- "questionId": "q159",
- "format": "toon",
- "model": "gemini-2.5-flash",
- "expected": "83",
- "actual": "83",
- "isCorrect": true,
- "inputTokens": 12340,
- "outputTokens": 2,
- "latencyMs": 18972.830374999903
- },
- {
- "questionId": "q159",
- "format": "csv",
- "model": "gpt-5-nano",
- "expected": "83",
- "actual": "83",
- "isCorrect": true,
- "inputTokens": 8557,
- "outputTokens": 6919,
- "latencyMs": 69083.2129579999
- },
- {
- "questionId": "q159",
- "format": "csv",
- "model": "claude-haiku-4-5",
- "expected": "83",
- "actual": "73",
- "isCorrect": false,
- "inputTokens": 9125,
- "outputTokens": 5,
- "latencyMs": 1200.284708000021
- },
- {
- "questionId": "q159",
- "format": "csv",
- "model": "gemini-2.5-flash",
- "expected": "83",
- "actual": "83",
- "isCorrect": true,
- "inputTokens": 12210,
- "outputTokens": 2,
- "latencyMs": 33046.47866699984
- },
- {
- "questionId": "q159",
- "format": "xml",
- "model": "gpt-5-nano",
- "expected": "83",
- "actual": "112",
- "isCorrect": false,
- "inputTokens": 17139,
- "outputTokens": 6535,
- "latencyMs": 62622.555124999955
- },
- {
- "questionId": "q159",
- "format": "xml",
- "model": "claude-haiku-4-5",
- "expected": "83",
- "actual": "47",
- "isCorrect": false,
- "inputTokens": 19804,
- "outputTokens": 5,
- "latencyMs": 1500.2770829999354
- },
- {
- "questionId": "q159",
- "format": "xml",
- "model": "gemini-2.5-flash",
- "expected": "83",
- "actual": "49",
- "isCorrect": false,
- "inputTokens": 21884,
- "outputTokens": 2,
- "latencyMs": 2811.6203749999404
- },
- {
- "questionId": "q159",
- "format": "yaml",
- "model": "gpt-5-nano",
- "expected": "83",
- "actual": "90",
- "isCorrect": false,
- "inputTokens": 13172,
- "outputTokens": 25095,
- "latencyMs": 237521.54700000002
- },
- {
- "questionId": "q159",
- "format": "yaml",
- "model": "claude-haiku-4-5",
- "expected": "83",
- "actual": "71",
- "isCorrect": false,
- "inputTokens": 14483,
- "outputTokens": 5,
- "latencyMs": 1567.613791000098
- },
- {
- "questionId": "q159",
- "format": "yaml",
- "model": "gemini-2.5-flash",
- "expected": "83",
- "actual": "49",
- "isCorrect": false,
- "inputTokens": 17079,
- "outputTokens": 2,
- "latencyMs": 1373.2515409998596
- }
-]
diff --git a/benchmarks/results/accuracy/summary.json b/benchmarks/results/accuracy/summary.json
deleted file mode 100644
index b3aa797..0000000
--- a/benchmarks/results/accuracy/summary.json
+++ /dev/null
@@ -1,91 +0,0 @@
-{
- "formatResults": [
- {
- "format": "toon",
- "accuracy": 0.8658280922431866,
- "totalTokens": 4678,
- "averageLatency": 5321,
- "correctCount": 413,
- "totalCount": 477
- },
- {
- "format": "xml",
- "accuracy": 0.8616352201257862,
- "totalTokens": 9944,
- "averageLatency": 6035,
- "correctCount": 411,
- "totalCount": 477
- },
- {
- "format": "csv",
- "accuracy": 0.8469601677148847,
- "totalTokens": 4745,
- "averageLatency": 6551,
- "correctCount": 404,
- "totalCount": 477
- },
- {
- "format": "json",
- "accuracy": 0.8322851153039832,
- "totalTokens": 8713,
- "averageLatency": 7981,
- "correctCount": 397,
- "totalCount": 477
- },
- {
- "format": "yaml",
- "accuracy": 0.8259958071278826,
- "totalTokens": 7091,
- "averageLatency": 5561,
- "correctCount": 394,
- "totalCount": 477
- }
- ],
- "questions": 159,
- "models": [
- "gpt-5-nano",
- "claude-haiku-4-5",
- "gemini-2.5-flash"
- ],
- "datasets": [
- {
- "name": "tabular",
- "description": "Uniform employee records (TOON optimal format)"
- },
- {
- "name": "nested",
- "description": "E-commerce orders with nested structures"
- },
- {
- "name": "analytics",
- "description": "Time-series analytics data"
- },
- {
- "name": "github",
- "description": "Top 100 GitHub repositories"
- }
- ],
- "tokenCounts": {
- "json-tabular": 6347,
- "json-nested": 9694,
- "json-analytics": 3665,
- "json-github": 15145,
- "toon-tabular": 2483,
- "toon-nested": 5967,
- "toon-analytics": 1515,
- "toon-github": 8745,
- "csv-tabular": 2337,
- "csv-nested": 6735,
- "csv-analytics": 1393,
- "csv-github": 8513,
- "xml-tabular": 7314,
- "xml-nested": 10992,
- "xml-analytics": 4376,
- "xml-github": 17095,
- "yaml-tabular": 4969,
- "yaml-nested": 7328,
- "yaml-analytics": 2938,
- "yaml-github": 13129
- },
- "timestamp": "2025-10-28T07:39:09.360Z"
-}
diff --git a/benchmarks/results/accuracy/report.md b/benchmarks/results/retrieval-accuracy.md
similarity index 52%
rename from benchmarks/results/accuracy/report.md
rename to benchmarks/results/retrieval-accuracy.md
index 3ddd0f2..3650edd 100644
--- a/benchmarks/results/accuracy/report.md
+++ b/benchmarks/results/retrieval-accuracy.md
@@ -1,31 +1,31 @@
### Retrieval Accuracy
-Accuracy across **3 LLMs** on **159 data retrieval questions**:
+Accuracy across **3 LLMs** on **154 data retrieval questions**:
```
gpt-5-nano
- toon ████████████████████ 99.4% (158/159)
- yaml ███████████████████░ 95.0% (151/159)
- csv ██████████████████░░ 92.5% (147/159)
- json ██████████████████░░ 92.5% (147/159)
- xml ██████████████████░░ 91.2% (145/159)
-
-claude-haiku-4-5
- toon ███████████████░░░░░ 75.5% (120/159)
- xml ███████████████░░░░░ 75.5% (120/159)
- csv ███████████████░░░░░ 75.5% (120/159)
- json ███████████████░░░░░ 75.5% (120/159)
- yaml ███████████████░░░░░ 74.2% (118/159)
+ toon ███████████████████░ 96.1% (148/154)
+ csv ██████████████████░░ 90.3% (139/154)
+ yaml ██████████████████░░ 89.0% (137/154)
+ json ██████████████████░░ 87.7% (135/154)
+ xml █████████████████░░░ 83.8% (129/154)
gemini-2.5-flash
- xml ██████████████████░░ 91.8% (146/159)
- csv █████████████████░░░ 86.2% (137/159)
- toon █████████████████░░░ 84.9% (135/159)
- json ████████████████░░░░ 81.8% (130/159)
- yaml ████████████████░░░░ 78.6% (125/159)
+ xml ██████████████████░░ 90.3% (139/154)
+ csv ██████████████████░░ 89.0% (137/154)
+ toon █████████████████░░░ 87.0% (134/154)
+ json ████████████████░░░░ 79.2% (122/154)
+ yaml ███████████████░░░░░ 76.0% (117/154)
+
+claude-haiku-4-5-20251001
+ json ██████████░░░░░░░░░░ 48.7% (75/154)
+ toon ██████████░░░░░░░░░░ 48.1% (74/154)
+ xml █████████░░░░░░░░░░░ 47.4% (73/154)
+ yaml █████████░░░░░░░░░░░ 47.4% (73/154)
+ csv █████████░░░░░░░░░░░ 45.5% (70/154)
```
-**Advantage:** TOON achieves **86.6% accuracy** (vs JSON's 83.2%) while using **46.3% fewer tokens**.
+**Advantage:** TOON achieves **77.1% accuracy** (vs JSON's 71.9%) while using **46.3% fewer tokens**.
Performance by dataset and model
@@ -36,41 +36,41 @@ gemini-2.5-flash
| Format | Accuracy | Tokens | Correct/Total |
| ------ | -------- | ------ | ------------- |
-| `toon` | 87.4% | 2.483 | 152/174 |
-| `csv` | 82.8% | 2.337 | 144/174 |
-| `yaml` | 83.9% | 4.969 | 146/174 |
-| `json` | 83.9% | 6.347 | 146/174 |
-| `xml` | 88.5% | 7.314 | 154/174 |
+| `csv` | 74.7% | 2,337 | 112/150 |
+| `toon` | 76.7% | 2,483 | 115/150 |
+| `yaml` | 70.7% | 4,969 | 106/150 |
+| `xml` | 77.3% | 7,314 | 116/150 |
+| `json` | 69.3% | 6,347 | 104/150 |
##### E-commerce orders with nested structures
| Format | Accuracy | Tokens | Correct/Total |
| ------ | -------- | ------ | ------------- |
-| `toon` | 90.9% | 5.967 | 120/132 |
-| `csv` | 93.9% | 6.735 | 124/132 |
-| `yaml` | 87.1% | 7.328 | 115/132 |
-| `json` | 87.9% | 9.694 | 116/132 |
-| `xml` | 93.2% | 10.992 | 123/132 |
+| `toon` | 80.0% | 5,967 | 96/120 |
+| `csv` | 75.8% | 6,735 | 91/120 |
+| `yaml` | 74.2% | 7,328 | 89/120 |
+| `json` | 79.2% | 9,694 | 95/120 |
+| `xml` | 78.3% | 10,992 | 94/120 |
##### Time-series analytics data
| Format | Accuracy | Tokens | Correct/Total |
| ------ | -------- | ------ | ------------- |
-| `csv` | 89.7% | 1.393 | 78/87 |
-| `toon` | 88.5% | 1.515 | 77/87 |
-| `yaml` | 83.9% | 2.938 | 73/87 |
-| `json` | 88.5% | 3.665 | 77/87 |
-| `xml` | 85.1% | 4.376 | 74/87 |
+| `csv` | 75.5% | 1,393 | 77/102 |
+| `toon` | 76.5% | 1,515 | 78/102 |
+| `yaml` | 74.5% | 2,938 | 76/102 |
+| `json` | 76.5% | 3,665 | 78/102 |
+| `xml` | 74.5% | 4,376 | 76/102 |
##### Top 100 GitHub repositories
| Format | Accuracy | Tokens | Correct/Total |
| ------ | -------- | ------ | ------------- |
-| `toon` | 76.2% | 8.745 | 64/84 |
-| `csv` | 69.0% | 8.513 | 58/84 |
-| `yaml` | 71.4% | 13.129 | 60/84 |
-| `json` | 69.0% | 15.145 | 58/84 |
-| `xml` | 71.4% | 17.095 | 60/84 |
+| `toon` | 74.4% | 8,745 | 67/90 |
+| `csv` | 73.3% | 8,513 | 66/90 |
+| `yaml` | 62.2% | 13,129 | 56/90 |
+| `json` | 61.1% | 15,145 | 55/90 |
+| `xml` | 61.1% | 17,095 | 55/90 |
#### Performance by Model
@@ -78,31 +78,31 @@ gemini-2.5-flash
| Format | Accuracy | Correct/Total |
| ------ | -------- | ------------- |
-| `toon` | 99.4% | 158/159 |
-| `yaml` | 95.0% | 151/159 |
-| `csv` | 92.5% | 147/159 |
-| `json` | 92.5% | 147/159 |
-| `xml` | 91.2% | 145/159 |
-
-##### claude-haiku-4-5
-
-| Format | Accuracy | Correct/Total |
-| ------ | -------- | ------------- |
-| `toon` | 75.5% | 120/159 |
-| `xml` | 75.5% | 120/159 |
-| `csv` | 75.5% | 120/159 |
-| `json` | 75.5% | 120/159 |
-| `yaml` | 74.2% | 118/159 |
+| `toon` | 96.1% | 148/154 |
+| `csv` | 90.3% | 139/154 |
+| `yaml` | 89.0% | 137/154 |
+| `json` | 87.7% | 135/154 |
+| `xml` | 83.8% | 129/154 |
##### gemini-2.5-flash
| Format | Accuracy | Correct/Total |
| ------ | -------- | ------------- |
-| `xml` | 91.8% | 146/159 |
-| `csv` | 86.2% | 137/159 |
-| `toon` | 84.9% | 135/159 |
-| `json` | 81.8% | 130/159 |
-| `yaml` | 78.6% | 125/159 |
+| `xml` | 90.3% | 139/154 |
+| `csv` | 89.0% | 137/154 |
+| `toon` | 87.0% | 134/154 |
+| `json` | 79.2% | 122/154 |
+| `yaml` | 76.0% | 117/154 |
+
+##### claude-haiku-4-5-20251001
+
+| Format | Accuracy | Correct/Total |
+| ------ | -------- | ------------- |
+| `json` | 48.7% | 75/154 |
+| `toon` | 48.1% | 74/154 |
+| `xml` | 47.4% | 73/154 |
+| `yaml` | 47.4% | 73/154 |
+| `csv` | 45.5% | 70/154 |
@@ -124,31 +124,33 @@ Four datasets designed to test different structural patterns:
#### Question Types
-159 questions are generated dynamically across three categories:
+154 questions are generated dynamically across three categories:
-- **Field retrieval (50%)**: Direct value lookups
+- **Field retrieval (40%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
- Example: "What is Alice's salary?" → `75000`
+ - Example: "How many items are in order ORD-0042?" → `3`
- Example: "What is the customer name for order ORD-0042?" → `John Doe`
-- **Aggregation (25%)**: Counting and summation tasks
+- **Aggregation (32%)**: Dataset-level totals and averages plus single-condition filters (counts, sums, min/max comparisons)
- Example: "How many employees work in Engineering?" → `17`
- Example: "What is the total revenue across all orders?" → `45123.50`
+ - Example: "How many employees have salary > 80000?" → `23`
-- **Filtering (25%)**: Conditional queries
+- **Filtering (28%)**: Multi-condition queries requiring compound logic (AND constraints across fields)
- Example: "How many employees in Sales have salary > 80000?" → `5`
- - Example: "How many orders have total > 400?" → `12`
+ - Example: "How many active employees have more than 10 years of experience?" → `8`
#### Evaluation Process
-1. **Format conversion:** Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML).
+1. **Format conversion:** Each dataset is converted to all 5 formats (TOON, CSV, XML, JSON, YAML).
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
-4. **Validate with LLM-as-judge**: `gpt-5-nano` validates if the answer is semantically correct (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`).
+3. **Validate with LLM-as-judge**: `gpt-5-nano` validates if the answer is semantically correct (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`).
#### Models & Configuration
-- **Models tested**: `gpt-5-nano`, `claude-haiku-4-5`, `gemini-2.5-flash`
+- **Models tested**: `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `gpt-5-nano`
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
- **Temperature**: 0 (for non-reasoning models)
-- **Total evaluations**: 159 questions × 5 formats × 3 models = 2,385 LLM calls
+- **Total evaluations**: 154 questions × 5 formats × 3 models = 2,310 LLM calls
diff --git a/benchmarks/results/token-efficiency.md b/benchmarks/results/token-efficiency.md
index 8c382e8..f15963e 100644
--- a/benchmarks/results/token-efficiency.md
+++ b/benchmarks/results/token-efficiency.md
@@ -39,11 +39,11 @@ Total ████████████░░░░░
"repo": "freeCodeCamp/freeCodeCamp",
"description": "freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,…",
"createdAt": "2014-12-24T17:49:19Z",
- "updatedAt": "2025-10-27T07:40:58Z",
- "pushedAt": "2025-10-26T11:31:08Z",
- "stars": 430828,
- "watchers": 8582,
- "forks": 42136,
+ "updatedAt": "2025-10-28T11:58:08Z",
+ "pushedAt": "2025-10-28T10:17:16Z",
+ "stars": 430886,
+ "watchers": 8583,
+ "forks": 42146,
"defaultBranch": "main"
},
{
@@ -52,11 +52,11 @@ Total ████████████░░░░░
"repo": "codecrafters-io/build-your-own-x",
"description": "Master programming by recreating your favorite technologies from scratch.",
"createdAt": "2018-05-09T12:03:18Z",
- "updatedAt": "2025-10-27T07:43:25Z",
+ "updatedAt": "2025-10-28T12:37:11Z",
"pushedAt": "2025-10-10T18:45:01Z",
- "stars": 430102,
- "watchers": 6322,
- "forks": 40388,
+ "stars": 430877,
+ "watchers": 6332,
+ "forks": 40453,
"defaultBranch": "master"
},
{
@@ -65,11 +65,11 @@ Total ████████████░░░░░
"repo": "sindresorhus/awesome",
"description": "😎 Awesome lists about all kinds of interesting topics",
"createdAt": "2014-07-11T13:42:37Z",
- "updatedAt": "2025-10-27T07:44:27Z",
- "pushedAt": "2025-10-23T17:26:53Z",
- "stars": 409760,
- "watchers": 8016,
- "forks": 32015,
+ "updatedAt": "2025-10-28T12:40:21Z",
+ "pushedAt": "2025-10-27T17:57:31Z",
+ "stars": 410052,
+ "watchers": 8017,
+ "forks": 32029,
"defaultBranch": "main"
}
]
@@ -80,9 +80,9 @@ Total ████████████░░░░░
```
repositories[3]{id,name,repo,description,createdAt,updatedAt,pushedAt,stars,watchers,forks,defaultBranch}:
- 28457823,freeCodeCamp,freeCodeCamp/freeCodeCamp,"freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,…","2014-12-24T17:49:19Z","2025-10-27T07:40:58Z","2025-10-26T11:31:08Z",430828,8582,42136,main
- 132750724,build-your-own-x,codecrafters-io/build-your-own-x,Master programming by recreating your favorite technologies from scratch.,"2018-05-09T12:03:18Z","2025-10-27T07:43:25Z","2025-10-10T18:45:01Z",430102,6322,40388,master
- 21737465,awesome,sindresorhus/awesome,😎 Awesome lists about all kinds of interesting topics,"2014-07-11T13:42:37Z","2025-10-27T07:44:27Z","2025-10-23T17:26:53Z",409760,8016,32015,main
+ 28457823,freeCodeCamp,freeCodeCamp/freeCodeCamp,"freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,…","2014-12-24T17:49:19Z","2025-10-28T11:58:08Z","2025-10-28T10:17:16Z",430886,8583,42146,main
+ 132750724,build-your-own-x,codecrafters-io/build-your-own-x,Master programming by recreating your favorite technologies from scratch.,"2018-05-09T12:03:18Z","2025-10-28T12:37:11Z","2025-10-10T18:45:01Z",430877,6332,40453,master
+ 21737465,awesome,sindresorhus/awesome,😎 Awesome lists about all kinds of interesting topics,"2014-07-11T13:42:37Z","2025-10-28T12:40:21Z","2025-10-27T17:57:31Z",410052,8017,32029,main
```
---
diff --git a/benchmarks/scripts/accuracy-benchmark.ts b/benchmarks/scripts/accuracy-benchmark.ts
index 70172a1..e80ecc1 100644
--- a/benchmarks/scripts/accuracy-benchmark.ts
+++ b/benchmarks/scripts/accuracy-benchmark.ts
@@ -1,51 +1,53 @@
-/**
- * LLM Retrieval Accuracy Benchmark
- *
- * Main entry point that orchestrates the full benchmark:
- * 1. Generate questions from datasets
- * 2. Format data in all formats (JSON, TOON, YAML, Markdown-kv)
- * 3. Evaluate each question with each format using LLMs
- * 4. Generate reports
- */
-
-import type { EvaluationResult, Question } from '../src/types'
-import * as fsp from 'node:fs/promises'
+import type { Question } from '../src/types'
import * as path from 'node:path'
-import { consola } from 'consola'
-import pMap from 'p-map'
-import { BENCHMARKS_DIR, DEFAULT_CONCURRENCY, DRY_RUN, DRY_RUN_LIMITS, ROOT_DIR } from '../src/constants'
+import process from 'node:process'
+import * as prompts from '@clack/prompts'
+import PQueue from 'p-queue'
+import { DEFAULT_CONCURRENCY, DRY_RUN, DRY_RUN_LIMITS, MODEL_RPM_LIMITS, ROOT_DIR } from '../src/constants'
import { datasets } from '../src/datasets'
import { evaluateQuestion, models } from '../src/evaluate'
import { formatters } from '../src/formatters'
import { generateQuestions } from '../src/questions'
import { calculateFormatResults, calculateTokenCounts, saveResults } from '../src/report'
+import { getAllModelResults, hasModelResults, saveModelResults } from '../src/storage'
-consola.start('Retrieval Accuracy Benchmark for TOON')
+prompts.intro('Retrieval Accuracy Benchmark')
-// Check if results already exist
-const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy')
-const rawResultsPath = path.join(resultsDir, 'raw-results.json')
-const summaryPath = path.join(resultsDir, 'summary.json')
+// Prompt user to select which models to benchmark
+const modelChoices = models.map(({ modelId }) => ({
+ value: modelId,
+ label: modelId,
+}))
-let existingResults: EvaluationResult[] | undefined
-let existingTokenCounts: Record<string, number> | undefined
+const selectedModels = await prompts.multiselect({
+ message: 'Select models to benchmark (Space to select, Enter to confirm)',
+ options: modelChoices,
+ required: true,
+})
-try {
- const [rawData, summaryData] = await Promise.all([
- fsp.readFile(rawResultsPath, 'utf-8'),
- fsp.readFile(summaryPath, 'utf-8'),
- ])
- existingResults = JSON.parse(rawData)
- const summary = JSON.parse(summaryData)
- existingTokenCounts = summary.tokenCounts
- consola.info('Found existing results – regenerating report only')
+if (prompts.isCancel(selectedModels)) {
+ prompts.cancel('Benchmark cancelled')
+ process.exit(0)
}
-catch {
- // Results don't exist, will run full evaluation
+
+const activeModels = models.filter(m => selectedModels.includes(m.modelId))
+
+prompts.log.info(`Selected ${activeModels.length} model(s): ${activeModels.map(m => m.modelId).join(', ')}`)
+
+// Check which models already have results
+const existingModelResults: Record = {}
+for (const model of activeModels) {
+ const existingResult = await hasModelResults(model.modelId)
+ if (existingResult)
+ existingModelResults[model.modelId] = existingResult
+}
+
+if (Object.keys(existingModelResults).length > 0) {
+ prompts.log.info(`Found existing results for ${Object.values(existingModelResults).length} model(s)`)
}
if (DRY_RUN) {
- consola.info('Limiting questions and models for dry run')
+ prompts.log.info('Limiting questions and models for dry run')
}
let questions = generateQuestions()
@@ -55,79 +57,98 @@ if (DRY_RUN && DRY_RUN_LIMITS.maxQuestions) {
questions = questions.slice(0, DRY_RUN_LIMITS.maxQuestions)
}
-// Filter models for dry run
-const activeModels = DRY_RUN && DRY_RUN_LIMITS.allowedModels.length > 0
- ? Object.fromEntries(
- Object.entries(models).filter(([name]) => DRY_RUN_LIMITS.allowedModels.includes(name)),
- )
- : models
+prompts.log.info(`Evaluating ${questions.length} questions`)
+prompts.log.info(`Testing ${Object.keys(formatters).length} formats`)
-let results: EvaluationResult[]
-let tokenCounts: Record<string, number>
+// Evaluate each model separately and save results incrementally
+for (const model of activeModels) {
+ const modelId = model.modelId
-if (existingResults && existingTokenCounts) {
- // Reuse existing results
- results = existingResults
- tokenCounts = existingTokenCounts
-}
-else {
- // Run full evaluation
- consola.info(`Evaluating ${questions.length} questions`)
- consola.info(`Testing ${Object.keys(formatters).length} formats`)
- consola.info(`Using ${Object.keys(activeModels).length} models: ${Object.keys(activeModels).join(', ')}`)
+ // Skip if results already exist
+ if (existingModelResults[modelId]) {
+ prompts.log.info(`Skipping ${modelId} (results already exist)`)
+ continue
+ }
- // Calculate token counts for all format+dataset combinations
- tokenCounts = calculateTokenCounts(formatters)
-
- // Generate evaluation tasks
- const tasks: { question: Question, formatName: string, modelName: string }[] = []
+ prompts.log.step(`Running benchmark for ${modelId}`)
+ // Generate evaluation tasks for this model
+ const tasks: { question: Question, formatName: string }[] = []
for (const question of questions) {
for (const [formatName] of Object.entries(formatters)) {
- for (const [modelName] of Object.entries(activeModels)) {
- tasks.push({ question, formatName, modelName })
- }
+ tasks.push({ question, formatName })
}
}
const total = tasks.length
- consola.start(`Running ${total} evaluations with concurrency: ${DEFAULT_CONCURRENCY}`)
+ const rpmLimit = MODEL_RPM_LIMITS[modelId]
+ const queue = new PQueue({
+ concurrency: DEFAULT_CONCURRENCY,
+ intervalCap: rpmLimit,
+ interval: rpmLimit ? 60_000 : undefined,
+ })
- results = await pMap(
- tasks,
- async (task, index) => {
+ const evalSpinner = prompts.spinner()
+ evalSpinner.start(`Running ${total} evaluations (concurrency: ${DEFAULT_CONCURRENCY}, RPM limit: ${rpmLimit ?? 'unlimited'})`)
+
+ let completed = 0
+
+ // Queue all tasks
+ const modelResultPromises = tasks.map(task =>
+ queue.add(async () => {
// Format data on-demand
const dataset = datasets.find(d => d.name === task.question.dataset)!
const formatter = formatters[task.formatName]!
const formattedData = formatter(dataset.data)
- const model = activeModels[task.modelName as keyof typeof activeModels]!
const result = await evaluateQuestion({
question: task.question,
formatName: task.formatName,
formattedData,
model,
- modelName: task.modelName,
})
// Progress update after task completes
- if ((index + 1) % 10 === 0 || (index + 1) === total) {
- const percent = (((index + 1) / total) * 100).toFixed(1)
- consola.start(`Progress: ${index + 1}/${total} (${percent}%)`)
+ completed++
+ if (completed % 10 === 0 || completed === total) {
+ const percent = ((completed / total) * 100).toFixed(1)
+ evalSpinner.message(`Progress: ${completed}/${total} (${percent}%)`)
}
return result
- },
- { concurrency: DEFAULT_CONCURRENCY },
+ }),
)
- consola.success('Evaluation complete!')
+ // Wait for all tasks to complete
+ const modelResults = await Promise.all(modelResultPromises)
+
+ evalSpinner.stop(`Evaluation complete for ${modelId}`)
+
+ // Save results immediately for this model
+ await saveModelResults(modelId, modelResults)
+ prompts.log.success(`Saved results for ${modelId}`)
}
-// Generate/regenerate markdown report
-consola.start('Generating report and saving results…')
-const formatResults = calculateFormatResults(results, tokenCounts)
-await saveResults(results, formatResults, questions, tokenCounts)
+// Generate/regenerate markdown report from all available model results
+const reportSpinner = prompts.spinner()
+reportSpinner.start('Generating report from all model results')
-consola.info(`Results saved to: \`${path.relative(ROOT_DIR, resultsDir)}\``)
-consola.success(existingResults ? 'Markdown report regenerated!' : 'Evaluation complete!')
+// Load all available model results (including any that were skipped)
+const allModelResults = await getAllModelResults()
+const allResults = Object.values(allModelResults).flat()
+
+if (allResults.length === 0) {
+ prompts.log.warn('No results available to generate report')
+ process.exit(0)
+}
+
+// Calculate token counts freshly (deterministic, no need to persist)
+const tokenCounts = calculateTokenCounts(formatters)
+
+// Calculate format statistics and save report
+const formatResults = calculateFormatResults(allResults, tokenCounts)
+const resultsDir = await saveResults(allResults, formatResults, questions, tokenCounts)
+
+const reportPath = path.join(resultsDir, 'retrieval-accuracy.md')
+prompts.log.info(`Report saved to: \`${path.relative(ROOT_DIR, reportPath)}\``)
+reportSpinner.stop('Report generation complete!')
diff --git a/benchmarks/scripts/fetch-github-data.ts b/benchmarks/scripts/fetch-github-repos.ts
similarity index 72%
rename from benchmarks/scripts/fetch-github-data.ts
rename to benchmarks/scripts/fetch-github-repos.ts
index 1ded0ee..3b75fd9 100644
--- a/benchmarks/scripts/fetch-github-data.ts
+++ b/benchmarks/scripts/fetch-github-repos.ts
@@ -1,18 +1,20 @@
import * as path from 'node:path'
import process from 'node:process'
-import { consola } from 'consola'
+import * as prompts from '@clack/prompts'
import { ofetch } from 'ofetch'
import pMap from 'p-map'
import { BENCHMARKS_DIR } from '../src/constants'
import { ensureDir, saveJsonFile } from '../src/utils'
+prompts.intro('GitHub Repositories Fetcher')
+
try {
// Fetch top 100 repos from GitHub
const repoList = await searchTop100Repos()
const repos = await fetchRepoDetails(repoList)
if (repos.length === 0) {
- consola.error('❌ No repositories fetched. Exiting.')
+ prompts.log.error('No repositories fetched. Exiting.')
process.exit(1)
}
@@ -21,15 +23,16 @@ try {
await saveRepos(repos)
- consola.success('Done!')
+ prompts.log.success('Done!')
}
catch (error) {
- consola.error(error)
+ prompts.log.error(String(error))
process.exit(1)
}
async function searchTop100Repos(): Promise {
- consola.start('Fetching top 100 starred repositories from GitHub API…')
+ const s = prompts.spinner()
+ s.start('Fetching top 100 starred repositories')
const response = await ofetch<{ items: { full_name: string }[] }>(
'https://api.github.com/search/repositories',
@@ -47,23 +50,26 @@ async function searchTop100Repos(): Promise {
},
)
+ s.stop('Fetched top 100 repositories')
+
return response.items.map(item => item.full_name)
}
async function fetchRepoDetails(repoList: string[]): Promise[]> {
- consola.start(`Fetching ${repoList.length} GitHub repositories…`)
+ const s = prompts.spinner()
+ s.start(`Fetching ${repoList.length} GitHub repositories`)
const repos = await pMap(
repoList,
async (repoPath, index) => {
- consola.info(`[${index + 1}/${repoList.length}] Fetching ${repoPath}…`)
+ s.message(`[${index + 1}/${repoList.length}] Fetching ${repoPath}`)
const { repo } = await ofetch(`https://ungh.cc/repos/${repoPath}`)
return repo
},
{ concurrency: 5 },
)
- consola.success(`Successfully fetched ${repos.length}/${repoList.length} repositories`)
+ s.stop(`Successfully fetched ${repos.length}/${repoList.length} repositories`)
return repos
}
@@ -76,5 +82,5 @@ async function saveRepos(repos: Record[]): Promise {
await saveJsonFile(outputFile, repos)
const relativePath = path.relative(BENCHMARKS_DIR, outputFile)
- consola.info(`Saved to \`${relativePath}\``)
+ prompts.log.info(`Result saved to \`${relativePath}\``)
}
diff --git a/benchmarks/scripts/token-efficiency-benchmark.ts b/benchmarks/scripts/token-efficiency-benchmark.ts
index 34c8c32..afefafe 100644
--- a/benchmarks/scripts/token-efficiency-benchmark.ts
+++ b/benchmarks/scripts/token-efficiency-benchmark.ts
@@ -1,6 +1,6 @@
import * as fsp from 'node:fs/promises'
import * as path from 'node:path'
-import { consola } from 'consola'
+import * as prompts from '@clack/prompts'
import { encode } from '../../src/index'
import githubRepos from '../data/github-repos.json' with { type: 'json' }
import { BENCHMARKS_DIR, ROOT_DIR } from '../src/constants'
@@ -24,8 +24,6 @@ interface BenchmarkResult {
showDetailed: boolean
}
-const outputFilePath = path.join(BENCHMARKS_DIR, 'results', 'token-efficiency.md')
-
const BENCHMARK_EXAMPLES = [
{
name: 'GitHub Repositories',
@@ -50,6 +48,8 @@ const BENCHMARK_EXAMPLES = [
},
] as const
+prompts.intro('Token Efficiency Benchmark')
+
// Calculate total savings
let totalJsonTokens = 0
let totalToonTokens = 0
@@ -204,9 +204,12 @@ ${detailedExamples}
`.trimStart()
-console.log(`${barChartSection}\n`)
+prompts.log.message(`${barChartSection}\n`)
-await ensureDir(path.join(BENCHMARKS_DIR, 'results'))
+const resultsDir = path.join(BENCHMARKS_DIR, 'results')
+await ensureDir(resultsDir)
+
+const outputFilePath = path.join(resultsDir, 'token-efficiency.md')
await fsp.writeFile(outputFilePath, markdown, 'utf-8')
-consola.success(`Benchmark written to \`${path.relative(ROOT_DIR, outputFilePath)}\``)
+prompts.log.success(`Result saved to \`${path.relative(ROOT_DIR, outputFilePath)}\``)
diff --git a/benchmarks/src/constants.ts b/benchmarks/src/constants.ts
index e9301d9..35eb2c1 100644
--- a/benchmarks/src/constants.ts
+++ b/benchmarks/src/constants.ts
@@ -5,9 +5,22 @@ export const ROOT_DIR: string = url.fileURLToPath(new URL('../../', import.meta.
export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.meta.url))
/**
- * Default concurrency for parallel evaluations
+ * Model-specific RPM (requests per minute) limits to handle API quotas
+ *
+ * @remarks
+ * Set `undefined` for models without specific limits
*/
-export const DEFAULT_CONCURRENCY = 20
+/// keep-sorted
+export const MODEL_RPM_LIMITS: Record = {
+ 'claude-haiku-4-5-20251001': 50,
+ 'gemini-2.5-flash': 25,
+ 'gpt-5-nano': undefined,
+}
+
+/**
+ * Default concurrency for parallel evaluations to prevent bursting
+ */
+export const DEFAULT_CONCURRENCY = 10
/**
* Progress bar configuration
@@ -28,13 +41,83 @@ export const PROGRESS_BAR = {
export const DRY_RUN: boolean = process.env.DRY_RUN === 'true'
/**
- * Limits applied when DRY_RUN is enabled
+ * Limits applied during dry run mode
*/
export const DRY_RUN_LIMITS = {
/** Maximum number of questions to evaluate */
maxQuestions: 10,
- /** Maximum number of formats to test */
- maxFormats: undefined as number | undefined,
- /** Models to use in dry run */
- allowedModels: [] as string[],
}
+
+/**
+ * Threshold values for filtering and aggregation questions
+ */
+export const QUESTION_THRESHOLDS = {
+ tabular: {
+ salaryRanges: [60000, 80000, 100000, 120000],
+ experienceYears: [5, 10, 15, 20],
+ departmentSalaryThreshold: 80000,
+ departmentExperienceThreshold: 10,
+ },
+ nested: {
+ highValueOrders: [200, 400, 600],
+ statusValueThreshold: 300,
+ itemCountThreshold: 3,
+ totalThresholdsForItems: [300, 500],
+ },
+ analytics: {
+ views: [5000, 7000],
+ conversions: [10, 30],
+ viewsForFiltering: [6000, 7000],
+ conversionsForFiltering: 15,
+ revenueThresholds: [500, 1000, 1500, 2000, 2500],
+ viewsThresholdForRevenue: 6000,
+ clicksForFiltering: [250, 400],
+ conversionsForClickFiltering: 15,
+ revenueForBounceRate: [1000, 1500],
+ bounceRateThreshold: 0.5,
+ },
+ github: {
+ stars: [100000, 150000, 200000],
+ forks: [20000, 35000, 50000],
+ watchers: [5000, 8000],
+ starForkCombinations: [
+ { stars: 75000, forks: 15000 },
+ { stars: 100000, forks: 20000 },
+ { stars: 150000, forks: 30000 },
+ { stars: 200000, forks: 45000 },
+ ],
+ starWatcherCombinations: [
+ { stars: 100000, watchers: 7000 },
+ { stars: 150000, watchers: 9000 },
+ ],
+ },
+} as const
+
+/**
+ * Question generation configuration
+ */
+export const QUESTION_LIMITS = {
+ tabular: {
+ fieldRetrieval: 20,
+ aggregationDepartments: 6,
+ filteringMultiConditionDepartments: 6,
+ filteringExperience: 4,
+ filteringDepartmentExp: 3,
+ filteringDepartmentActive: 3,
+ },
+ nested: {
+ fieldRetrievalOrders: 8,
+ fieldRetrievalCustomers: 10,
+ aggregationStatuses: 5,
+ filteringStatusAndValue: 5,
+ filteringStatusAndItems: 3,
+ },
+ analytics: {
+ fieldRetrievalDates: 13,
+ },
+ github: {
+ fieldRetrievalRepos: 11,
+ aggregationBranches: 2,
+ filteringStarsAndForks: 8,
+ },
+} as const
diff --git a/benchmarks/src/datasets.ts b/benchmarks/src/datasets.ts
index 49de5fb..fc2d274 100644
--- a/benchmarks/src/datasets.ts
+++ b/benchmarks/src/datasets.ts
@@ -1,12 +1,3 @@
-/**
- * Datasets for TOON benchmarks
- *
- * These datasets are designed to test TOON's strengths and weaknesses:
- * - Tabular: Uniform records (TOON optimal)
- * - Nested: Complex structures with nested objects
- * - Analytics: Time-series data
- */
-
import type { Dataset } from './types'
import { faker } from '@faker-js/faker'
import githubRepos from '../data/github-repos.json' with { type: 'json' }
@@ -128,7 +119,7 @@ const tabularDataset: Dataset = {
description: 'Uniform employee records (TOON optimal format)',
data: {
employees: Array.from({ length: 100 }, (_, i): Employee => {
- const yearsExp = faker.number.int({ min: 1, max: 20 })
+ const yearsExp = faker.number.int({ min: 1, max: 25 })
return {
id: i + 1,
name: faker.person.fullName(),
diff --git a/benchmarks/src/evaluate.ts b/benchmarks/src/evaluate.ts
index 6bb39c4..468390c 100644
--- a/benchmarks/src/evaluate.ts
+++ b/benchmarks/src/evaluate.ts
@@ -1,28 +1,19 @@
-/**
- * LLM evaluation logic for TOON benchmarks
- *
- * Handles:
- * - Model configuration
- * - Question evaluation with LLMs
- * - Answer validation using LLM-as-judge
- */
-
import type { LanguageModelV2 } from '@ai-sdk/provider'
import type { EvaluationResult, Question } from './types'
import { anthropic } from '@ai-sdk/anthropic'
import { google } from '@ai-sdk/google'
import { openai } from '@ai-sdk/openai'
+import * as prompts from '@clack/prompts'
import { generateText } from 'ai'
-import { consola } from 'consola'
/**
* Models used for evaluation
*/
-export const models: Record = {
- 'gpt-5-nano': openai('gpt-5-nano'),
- 'claude-haiku-4-5': anthropic('claude-haiku-4-5-20251001'),
- 'gemini-2.5-flash': google('gemini-2.5-flash'),
-}
+export const models: LanguageModelV2[] = [
+ openai('gpt-5-nano'),
+ google('gemini-2.5-flash'),
+ anthropic('claude-haiku-4-5-20251001'),
+]
/**
* Evaluate a single question with a specific format and model
@@ -33,14 +24,12 @@ export async function evaluateQuestion(
formatName,
formattedData,
model,
- modelName,
}:
{
question: Question
formatName: string
formattedData: string
model: LanguageModelV2
- modelName: string
},
): Promise {
const prompt = `
@@ -59,10 +48,11 @@ Provide only the direct answer, without any additional explanation or formatting
const { text, usage } = await generateText({
model,
prompt,
- temperature: !model.modelId.startsWith('gpt-') ? 0 : undefined,
+ temperature: !model.modelId.startsWith('gpt-5') ? 0 : undefined,
})
const latencyMs = performance.now() - startTime
+
const isCorrect = await validateAnswer({
actual: text.trim(),
expected: question.groundTruth,
@@ -72,7 +62,7 @@ Provide only the direct answer, without any additional explanation or formatting
return {
questionId: question.id,
format: formatName,
- model: modelName,
+ model: model.modelId,
expected: question.groundTruth,
actual: text.trim(),
isCorrect,
@@ -115,14 +105,14 @@ Respond with only "YES" or "NO".
try {
const { text } = await generateText({
- model: models['gpt-5-nano']!,
+ model: models.find(m => m.modelId === 'gpt-5-nano')!,
prompt,
})
return text.trim().toUpperCase() === 'YES'
}
catch (error) {
- consola.error('Validation error:', error)
+ prompts.log.error(`Validation error: ${error}`)
// Fallback to simple string comparison
return actual.toLowerCase().trim() === expected.toLowerCase().trim()
}
diff --git a/benchmarks/src/formatters.ts b/benchmarks/src/formatters.ts
index 23326e5..b20b1b7 100644
--- a/benchmarks/src/formatters.ts
+++ b/benchmarks/src/formatters.ts
@@ -1,20 +1,3 @@
-/**
- * Format converters for TOON benchmarks
- *
- * Converts data to different formats for comparison:
- * - JSON
- * - TOON
- * - CSV
- * - XML
- * - YAML
- *
- * ## Semantic Equivalence
- *
- * All formatters attempt to preserve semantic equivalence with the source data,
- * meaning the converted data should represent the same information. However,
- * CSV has inherent limitations with nested structures (see `toCSV` docs).
- */
-
import { stringify as stringifyCSV } from 'csv-stringify/sync'
import { XMLBuilder } from 'fast-xml-parser'
import { stringify as stringifyYAML } from 'yaml'
@@ -23,7 +6,10 @@ import { encode as encodeToon } from '../../src/index'
/**
* Format converters registry
*
- * Each formatter takes unknown data and returns a string representation
+ * @remarks
+ * All formatters attempt to preserve semantic equivalence with the source data,
+ * meaning the converted data should represent the same information. However,
+ * CSV has inherent limitations with nested structures (see `toCSV` docs).
*/
export const formatters: Record string> = {
json: data => JSON.stringify(data, undefined, 2),
@@ -37,11 +23,13 @@ export const formatters: Record string> = {
* Convert data to CSV format
*
* @remarks
- * **Limitations**: CSV is designed for flat tabular data only. This formatter:
- * - Only handles top-level objects with arrays of flat objects
- * - Cannot properly represent deeply nested structures (nested arrays/objects within rows)
- * - Loses nested structure information during conversion
- * - May produce misleading results for datasets with complex nesting (e.g., e-commerce orders with nested items)
+ * Limitations: CSV is designed for flat tabular data only.
+ *
+ * This formatter:
+ * - Only handles top-level objects with arrays of flat objects
+ * - Cannot properly represent deeply nested structures (nested arrays/objects within rows)
+ * - Loses nested structure information during conversion
+ * - May produce misleading results for datasets with complex nesting (e.g., e-commerce orders with nested items)
*
* For datasets with nested structures, CSV comparisons may not be fair or representative
* of how CSV would typically be used in practice.
diff --git a/benchmarks/src/questions.ts b/benchmarks/src/questions.ts
index cbd5ce8..a644ec2 100644
--- a/benchmarks/src/questions.ts
+++ b/benchmarks/src/questions.ts
@@ -1,24 +1,18 @@
/**
* Question generation for TOON benchmarks
*
- * Generates ~160 questions across different types:
- * - Field retrieval (50%): "What is X's Y?"
- * - Aggregation (25%): "How many X have Y?"
- * - Filtering (25%): "List/count X where Y"
- *
- * Questions are generated dynamically based on actual data values
- *
- * TODO: Balance question distribution across datasets to ensure fair representation.
- * Current distribution:
- * - Tabular: 70 questions (43%)
- * - Nested: 50 questions (31%)
- * - Analytics: 40 questions (25%)
- * - GitHub: 40 questions (25%)
+ * Generates ~150-160 questions across different question types and datasets:
+ * - Field Retrieval: Direct field access with no computation
+ * Examples: "What is X's salary?", "What is the status of order Y?"
+ * - Aggregation: Counts, sums, averages, min/max operations (including single-condition filters)
+ * Examples: "How many X?", "What is the total/average?", "How many X > threshold?"
+ * - Filtering: Multi-condition queries requiring complex logical operations
+ * Examples: "How many X WHERE condition1 AND condition2?"
*/
import type { AnalyticsMetric, Employee, Order, Repository } from './datasets'
import type { Question } from './types'
-import { consola } from 'consola'
+import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from './constants'
import { datasets } from './datasets'
/**
@@ -34,19 +28,15 @@ export function generateQuestions(): Question[] {
const analytics = (datasets.find(d => d.name === 'analytics')?.data.metrics as AnalyticsMetric[]) ?? []
const github = (datasets.find(d => d.name === 'github')?.data.repositories as Repository[]) ?? []
- // ========================================
- // TABULAR DATASET QUESTIONS (70 questions)
- // ========================================
-
if (tabular.length > 0) {
- // Field retrieval: specific employees (40 questions)
- for (let i = 0; i < Math.min(40, tabular.length); i++) {
+ // Field retrieval: specific employees
+ for (let i = 0; i < Math.min(QUESTION_LIMITS.tabular.fieldRetrieval, tabular.length); i++) {
const emp = tabular[i * 2] || tabular[i]
if (!emp)
continue
- // Alternate between different field types
- if (i % 3 === 0) {
+ // Rotate through all field types
+ if (i % 5 === 0) {
questions.push({
id: `q${idCounter++}`,
prompt: `What is the salary of ${emp.name}?`,
@@ -55,7 +45,7 @@ export function generateQuestions(): Question[] {
dataset: 'tabular',
})
}
- else if (i % 3 === 1) {
+ else if (i % 5 === 1) {
questions.push({
id: `q${idCounter++}`,
prompt: `What department does ${emp.name} work in?`,
@@ -64,7 +54,7 @@ export function generateQuestions(): Question[] {
dataset: 'tabular',
})
}
- else {
+ else if (i % 5 === 2) {
questions.push({
id: `q${idCounter++}`,
prompt: `What is the email address of ${emp.name}?`,
@@ -73,11 +63,29 @@ export function generateQuestions(): Question[] {
dataset: 'tabular',
})
}
+ else if (i % 5 === 3) {
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `How many years of experience does ${emp.name} have?`,
+ groundTruth: String(emp.yearsExperience),
+ type: 'field-retrieval',
+ dataset: 'tabular',
+ })
+ }
+ else {
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `Is ${emp.name} an active employee?`,
+ groundTruth: emp.active ? 'yes' : 'no',
+ type: 'field-retrieval',
+ dataset: 'tabular',
+ })
+ }
}
// Aggregation: count by department
const departments = [...new Set(tabular.map(e => e.department))]
- for (const dept of departments.slice(0, 6)) {
+ for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.aggregationDepartments)) {
const count = tabular.filter(e => e.department === dept).length
questions.push({
id: `q${idCounter++}`,
@@ -88,9 +96,8 @@ export function generateQuestions(): Question[] {
})
}
- // Aggregation: salary ranges (4 questions)
- const salaryThresholds = [60000, 80000, 100000, 120000]
- for (const threshold of salaryThresholds) {
+ // Aggregation: salary ranges (single-condition filters)
+ for (const threshold of QUESTION_THRESHOLDS.tabular.salaryRanges) {
const count = tabular.filter(e => e.salary > threshold).length
questions.push({
id: `q${idCounter++}`,
@@ -101,39 +108,57 @@ export function generateQuestions(): Question[] {
})
}
- // Filtering: active status
+ // Aggregation: totals and averages
+ const totalEmployees = tabular.length
+ const avgSalary = Math.round(tabular.reduce((sum, e) => sum + e.salary, 0) / totalEmployees)
const activeCount = tabular.filter(e => e.active).length
const inactiveCount = tabular.filter(e => !e.active).length
+
questions.push(
+ {
+ id: `q${idCounter++}`,
+ prompt: 'How many employees are in the dataset?',
+ groundTruth: String(totalEmployees),
+ type: 'aggregation',
+ dataset: 'tabular',
+ },
+ {
+ id: `q${idCounter++}`,
+ prompt: 'What is the average salary across all employees?',
+ groundTruth: String(avgSalary),
+ type: 'aggregation',
+ dataset: 'tabular',
+ },
{
id: `q${idCounter++}`,
prompt: 'How many employees are active?',
groundTruth: String(activeCount),
- type: 'filtering',
+ type: 'aggregation',
dataset: 'tabular',
},
{
id: `q${idCounter++}`,
prompt: 'How many employees are inactive?',
groundTruth: String(inactiveCount),
- type: 'filtering',
+ type: 'aggregation',
dataset: 'tabular',
},
)
- // Complex filtering: multi-condition (8 questions)
- for (const dept of departments.slice(0, 4)) {
- const count = tabular.filter(e => e.department === dept && e.salary > 80000).length
+ // Filtering: count by department with salary filter (multi-condition)
+ for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringMultiConditionDepartments)) {
+ const count = tabular.filter(e => e.department === dept && e.salary > QUESTION_THRESHOLDS.tabular.departmentSalaryThreshold).length
questions.push({
id: `q${idCounter++}`,
- prompt: `How many employees in ${dept} have a salary greater than 80000?`,
+ prompt: `How many employees in ${dept} have a salary greater than ${QUESTION_THRESHOLDS.tabular.departmentSalaryThreshold}?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'tabular',
})
}
- for (const exp of [5, 10]) {
+ // Filtering: active employees by experience (multi-condition)
+ for (const exp of QUESTION_THRESHOLDS.tabular.experienceYears.slice(0, QUESTION_LIMITS.tabular.filteringExperience)) {
const count = tabular.filter(e => e.yearsExperience > exp && e.active).length
questions.push({
id: `q${idCounter++}`,
@@ -143,15 +168,35 @@ export function generateQuestions(): Question[] {
dataset: 'tabular',
})
}
+
+ // Filtering: department by experience (multi-condition)
+ for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentExp)) {
+ const count = tabular.filter(e => e.department === dept && e.yearsExperience > QUESTION_THRESHOLDS.tabular.departmentExperienceThreshold).length
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `How many employees in ${dept} have more than ${QUESTION_THRESHOLDS.tabular.departmentExperienceThreshold} years of experience?`,
+ groundTruth: String(count),
+ type: 'filtering',
+ dataset: 'tabular',
+ })
+ }
+
+ // Filtering: department by active status (multi-condition)
+ for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentActive)) {
+ const count = tabular.filter(e => e.department === dept && e.active).length
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `How many active employees work in ${dept}?`,
+ groundTruth: String(count),
+ type: 'filtering',
+ dataset: 'tabular',
+ })
+ }
}
- // ========================================
- // NESTED DATASET QUESTIONS (50 questions)
- // ========================================
-
if (nested.length > 0) {
- // Field retrieval: order totals (20 questions)
- for (let i = 0; i < Math.min(20, nested.length); i++) {
+ // Field retrieval: order totals and statuses
+ for (let i = 0; i < Math.min(QUESTION_LIMITS.nested.fieldRetrievalOrders, nested.length); i++) {
const order = nested[i * 2] || nested[i]
if (!order)
continue
@@ -159,7 +204,7 @@ export function generateQuestions(): Question[] {
if (i % 2 === 0) {
questions.push({
id: `q${idCounter++}`,
- prompt: `What is the total amount for order ${order.orderId}?`,
+ prompt: `What is the total for order ${order.orderId}?`,
groundTruth: String(order.total),
type: 'field-retrieval',
dataset: 'nested',
@@ -176,51 +221,143 @@ export function generateQuestions(): Question[] {
}
}
- // Field retrieval: customer info (15 questions)
- for (let i = 0; i < Math.min(15, nested.length); i++) {
- const order = nested[i * 3] || nested[i]
+ // Field retrieval: customer info and order dates (expanded)
+ for (let i = 0; i < Math.min(QUESTION_LIMITS.nested.fieldRetrievalCustomers, nested.length); i++) {
+ const order = nested[i * 2 + 1] || nested[i]
if (!order)
continue
- questions.push({
- id: `q${idCounter++}`,
- prompt: `What is the customer name for order ${order.orderId}?`,
- groundTruth: order.customer.name,
- type: 'field-retrieval',
- dataset: 'nested',
- })
+ if (i % 4 === 0) {
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `What is the customer name for order ${order.orderId}?`,
+ groundTruth: order.customer.name,
+ type: 'field-retrieval',
+ dataset: 'nested',
+ })
+ }
+ else if (i % 4 === 1) {
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `What is the customer email for order ${order.orderId}?`,
+ groundTruth: order.customer.email,
+ type: 'field-retrieval',
+ dataset: 'nested',
+ })
+ }
+ else if (i % 4 === 2) {
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `What is the order date for order ${order.orderId}?`,
+ groundTruth: order.orderDate || '',
+ type: 'field-retrieval',
+ dataset: 'nested',
+ })
+ }
+ else {
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `How many items are in order ${order.orderId}?`,
+ groundTruth: String(order.items.length),
+ type: 'field-retrieval',
+ dataset: 'nested',
+ })
+ }
}
- // Aggregation: count by status
+ // Aggregation: totals and averages
+ const totalRevenue = nested.reduce((sum, o) => sum + o.total, 0)
+ const avgOrderValue = totalRevenue / nested.length
+ const totalOrders = nested.length
+ const maxOrderValue = Math.max(...nested.map(o => o.total))
+
+ // Count by status
const statuses = [...new Set(nested.map(o => o.status))]
- for (const status of statuses) {
+ for (const status of statuses.slice(0, QUESTION_LIMITS.nested.aggregationStatuses)) {
const count = nested.filter(o => o.status === status).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many orders have status "${status}"?`,
groundTruth: String(count),
+ type: 'aggregation',
+ dataset: 'nested',
+ })
+ }
+
+ questions.push(
+ {
+ id: `q${idCounter++}`,
+ prompt: 'What is the total revenue across all orders?',
+ groundTruth: String(totalRevenue.toFixed(2)),
+ type: 'aggregation',
+ dataset: 'nested',
+ },
+ {
+ id: `q${idCounter++}`,
+ prompt: 'What is the average order value?',
+ groundTruth: String(avgOrderValue.toFixed(2)),
+ type: 'aggregation',
+ dataset: 'nested',
+ },
+ {
+ id: `q${idCounter++}`,
+ prompt: 'How many orders are in the dataset?',
+ groundTruth: String(totalOrders),
+ type: 'aggregation',
+ dataset: 'nested',
+ },
+ {
+ id: `q${idCounter++}`,
+ prompt: 'What is the highest order total?',
+ groundTruth: String(maxOrderValue.toFixed(2)),
+ type: 'aggregation',
+ dataset: 'nested',
+ },
+ )
+
+ // Aggregation: high-value orders (single-condition filter)
+ for (const threshold of QUESTION_THRESHOLDS.nested.highValueOrders) {
+ const count = nested.filter(o => o.total > threshold).length
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `How many orders have a total greater than ${threshold}?`,
+ groundTruth: String(count),
+ type: 'aggregation',
+ dataset: 'nested',
+ })
+ }
+
+ // Filtering: multi-condition queries (status AND value)
+ const orderStatuses = [...new Set(nested.map(o => o.status))]
+ for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndValue)) {
+ const count = nested.filter(o => o.status === status && o.total > QUESTION_THRESHOLDS.nested.statusValueThreshold).length
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `How many orders have status "${status}" and total greater than ${QUESTION_THRESHOLDS.nested.statusValueThreshold}?`,
+ groundTruth: String(count),
type: 'filtering',
dataset: 'nested',
})
}
- // Aggregation: total revenue
- const totalRevenue = nested.reduce((sum, o) => sum + o.total, 0)
- questions.push({
- id: `q${idCounter++}`,
- prompt: 'What is the total revenue across all orders?',
- groundTruth: String(totalRevenue.toFixed(2)),
- type: 'aggregation',
- dataset: 'nested',
- })
-
- // Filtering: high-value orders (3 questions)
- const highValueThresholds = [200, 400, 600]
- for (const threshold of highValueThresholds) {
- const count = nested.filter(o => o.total > threshold).length
+ // Filtering: status AND items count (multi-condition)
+ for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndItems)) {
+ const count = nested.filter(o => o.status === status && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold).length
questions.push({
id: `q${idCounter++}`,
- prompt: `How many orders have a total greater than ${threshold}?`,
+ prompt: `How many orders have status "${status}" and at least ${QUESTION_THRESHOLDS.nested.itemCountThreshold} items?`,
+ groundTruth: String(count),
+ type: 'filtering',
+ dataset: 'nested',
+ })
+ }
+
+ // Filtering: total AND items count (multi-condition)
+ for (const threshold of QUESTION_THRESHOLDS.nested.totalThresholdsForItems) {
+ const count = nested.filter(o => o.total > threshold && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold).length
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `How many orders have a total greater than ${threshold} and at least ${QUESTION_THRESHOLDS.nested.itemCountThreshold} items?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'nested',
@@ -228,18 +365,14 @@ export function generateQuestions(): Question[] {
}
}
- // ========================================
- // ANALYTICS DATASET QUESTIONS (40 questions)
- // ========================================
-
if (analytics.length > 0) {
- // Field retrieval: specific dates (20 questions)
- for (let i = 0; i < Math.min(20, analytics.length); i++) {
+ // Field retrieval: specific dates (expanded with all metrics)
+ for (let i = 0; i < Math.min(QUESTION_LIMITS.analytics.fieldRetrievalDates, analytics.length); i++) {
const metric = analytics[i * 3] || analytics[i]
if (!metric)
continue
- if (i % 2 === 0) {
+ if (i % 5 === 0) {
questions.push({
id: `q${idCounter++}`,
prompt: `How many views were recorded on ${metric.date}?`,
@@ -248,7 +381,7 @@ export function generateQuestions(): Question[] {
dataset: 'analytics',
})
}
- else {
+ else if (i % 5 === 1) {
questions.push({
id: `q${idCounter++}`,
prompt: `What was the revenue on ${metric.date}?`,
@@ -257,12 +390,42 @@ export function generateQuestions(): Question[] {
dataset: 'analytics',
})
}
+ else if (i % 5 === 2) {
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `What was the conversion count on ${metric.date}?`,
+ groundTruth: String(metric.conversions),
+ type: 'field-retrieval',
+ dataset: 'analytics',
+ })
+ }
+ else if (i % 5 === 3) {
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `How many clicks were recorded on ${metric.date}?`,
+ groundTruth: String(metric.clicks),
+ type: 'field-retrieval',
+ dataset: 'analytics',
+ })
+ }
+ else {
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `What was the bounce rate on ${metric.date}?`,
+ groundTruth: String(metric.bounceRate),
+ type: 'field-retrieval',
+ dataset: 'analytics',
+ })
+ }
}
- // Aggregation: totals (4 questions)
+ // Aggregation: totals and averages
const totalViews = analytics.reduce((sum, m) => sum + m.views, 0)
const totalRevenue = analytics.reduce((sum, m) => sum + m.revenue, 0)
const totalConversions = analytics.reduce((sum, m) => sum + m.conversions, 0)
+ const avgViews = Math.round(totalViews / analytics.length)
+ const avgRevenue = totalRevenue / analytics.length
+ const avgConversions = Math.round(totalConversions / analytics.length)
questions.push(
{
@@ -286,27 +449,97 @@ export function generateQuestions(): Question[] {
type: 'aggregation',
dataset: 'analytics',
},
+ {
+ id: `q${idCounter++}`,
+ prompt: 'What is the average number of views per day?',
+ groundTruth: String(avgViews),
+ type: 'aggregation',
+ dataset: 'analytics',
+ },
+ {
+ id: `q${idCounter++}`,
+ prompt: 'What is the average revenue per day?',
+ groundTruth: String(avgRevenue.toFixed(2)),
+ type: 'aggregation',
+ dataset: 'analytics',
+ },
+ {
+ id: `q${idCounter++}`,
+ prompt: 'What is the average number of conversions per day?',
+ groundTruth: String(avgConversions),
+ type: 'aggregation',
+ dataset: 'analytics',
+ },
+ {
+ id: `q${idCounter++}`,
+ prompt: 'How many days are included in the analytics data?',
+ groundTruth: String(analytics.length),
+ type: 'aggregation',
+ dataset: 'analytics',
+ },
+ {
+ id: `q${idCounter++}`,
+ prompt: 'What is the highest number of views recorded in a single day?',
+ groundTruth: String(Math.max(...analytics.map(m => m.views))),
+ type: 'aggregation',
+ dataset: 'analytics',
+ },
)
- // Filtering: high-performing days (10 questions)
- const viewThresholds = [5000, 6000, 7000]
- for (const threshold of viewThresholds) {
+ // Aggregation: high-performing days (single-condition filters)
+ for (const threshold of QUESTION_THRESHOLDS.analytics.views) {
const count = analytics.filter(m => m.views > threshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many days had more than ${threshold} views?`,
groundTruth: String(count),
+ type: 'aggregation',
+ dataset: 'analytics',
+ })
+ }
+
+ // Filtering: multi-condition queries (views AND conversions)
+ for (const viewThreshold of QUESTION_THRESHOLDS.analytics.viewsForFiltering) {
+ const count = analytics.filter(m => m.views > viewThreshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForFiltering).length
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `How many days had more than ${viewThreshold} views and more than ${QUESTION_THRESHOLDS.analytics.conversionsForFiltering} conversions?`,
+ groundTruth: String(count),
type: 'filtering',
dataset: 'analytics',
})
}
- const conversionThresholds = [10, 20, 30]
- for (const threshold of conversionThresholds) {
- const count = analytics.filter(m => m.conversions > threshold).length
+ // Filtering: views AND revenue (expanded)
+ for (const revenueThreshold of QUESTION_THRESHOLDS.analytics.revenueThresholds.slice(0, 5)) {
+ const count = analytics.filter(m => m.views > QUESTION_THRESHOLDS.analytics.viewsThresholdForRevenue && m.revenue > revenueThreshold).length
questions.push({
id: `q${idCounter++}`,
- prompt: `How many days had more than ${threshold} conversions?`,
+ prompt: `How many days had more than ${QUESTION_THRESHOLDS.analytics.viewsThresholdForRevenue} views and revenue greater than ${revenueThreshold}?`,
+ groundTruth: String(count),
+ type: 'filtering',
+ dataset: 'analytics',
+ })
+ }
+
+ // Filtering: clicks AND conversions (multi-condition)
+ for (const clickThreshold of QUESTION_THRESHOLDS.analytics.clicksForFiltering) {
+ const count = analytics.filter(m => m.clicks > clickThreshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForClickFiltering).length
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `How many days had more than ${clickThreshold} clicks and more than ${QUESTION_THRESHOLDS.analytics.conversionsForClickFiltering} conversions?`,
+ groundTruth: String(count),
+ type: 'filtering',
+ dataset: 'analytics',
+ })
+ }
+
+ // Filtering: revenue AND bounce rate (multi-condition)
+ for (const revenueThreshold of QUESTION_THRESHOLDS.analytics.revenueForBounceRate) {
+ const count = analytics.filter(m => m.revenue > revenueThreshold && m.bounceRate < QUESTION_THRESHOLDS.analytics.bounceRateThreshold).length
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `How many days had revenue greater than ${revenueThreshold} and bounce rate less than ${QUESTION_THRESHOLDS.analytics.bounceRateThreshold}?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'analytics',
@@ -314,79 +547,159 @@ export function generateQuestions(): Question[] {
}
}
- // ========================================
- // GITHUB DATASET QUESTIONS (40 questions)
- // ========================================
-
if (github.length > 0) {
- // Field retrieval: specific repos (20 questions)
- for (let i = 0; i < Math.min(20, github.length); i++) {
- const repo = github[i * 10] || github[i]
+ // Helper to extract owner from repo field
+ const getOwner = (repoFullName: string) => repoFullName.split('/')[0]!
+
+ // Field retrieval: specific repos (diverse fields)
+ for (let i = 0; i < Math.min(QUESTION_LIMITS.github.fieldRetrievalRepos, github.length); i++) {
+ const repo = github[i * 7]
if (!repo)
continue
- if (i % 2 === 0) {
+ if (i % 5 === 0) {
questions.push({
id: `q${idCounter++}`,
- prompt: `How many stars does ${repo.owner}/${repo.name} have?`,
+ prompt: `How many stars does ${repo.repo} have?`,
groundTruth: String(repo.stars),
type: 'field-retrieval',
dataset: 'github',
})
}
+ else if (i % 5 === 1) {
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `How many forks does ${repo.repo} have?`,
+ groundTruth: String(repo.forks),
+ type: 'field-retrieval',
+ dataset: 'github',
+ })
+ }
+ else if (i % 5 === 2) {
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `Who is the owner of ${repo.repo}?`,
+ groundTruth: getOwner(repo.repo),
+ type: 'field-retrieval',
+ dataset: 'github',
+ })
+ }
+ else if (i % 5 === 3) {
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `What is the default branch of ${repo.repo}?`,
+ groundTruth: repo.defaultBranch,
+ type: 'field-retrieval',
+ dataset: 'github',
+ })
+ }
else {
questions.push({
id: `q${idCounter++}`,
- prompt: `How many forks does ${repo.owner}/${repo.name} have?`,
- groundTruth: String(repo.forks),
+ prompt: `How many watchers does ${repo.repo} have?`,
+ groundTruth: String(repo.watchers),
type: 'field-retrieval',
dataset: 'github',
})
}
}
- // Aggregation: count by owner (5 questions)
- const owners = [...new Set(github.map(r => r.owner))]
- for (const owner of owners.slice(0, 5)) {
- const count = github.filter(r => r.owner === owner).length
+ // Aggregation: dataset-level totals (total stars, repository count, average stars)
+ const totalStars = github.reduce((sum, r) => sum + r.stars, 0)
+ const totalRepos = github.length
+ const avgStars = Math.round(totalStars / totalRepos)
+
+ questions.push(
+ {
+ id: `q${idCounter++}`,
+ prompt: 'What is the total number of stars across all repositories?',
+ groundTruth: String(totalStars),
+ type: 'aggregation',
+ dataset: 'github',
+ },
+ {
+ id: `q${idCounter++}`,
+ prompt: 'How many repositories are in the dataset?',
+ groundTruth: String(totalRepos),
+ type: 'aggregation',
+ dataset: 'github',
+ },
+ {
+ id: `q${idCounter++}`,
+ prompt: 'What is the average number of stars per repository?',
+ groundTruth: String(avgStars),
+ type: 'aggregation',
+ dataset: 'github',
+ },
+ )
+
+ // Aggregation: star thresholds (single-condition filters)
+ for (const threshold of QUESTION_THRESHOLDS.github.stars) {
+ const count = github.filter(r => r.stars > threshold).length
questions.push({
id: `q${idCounter++}`,
- prompt: `How many repositories does ${owner} have in the dataset?`,
+ prompt: `How many repositories have more than ${threshold} stars?`,
groundTruth: String(count),
type: 'aggregation',
dataset: 'github',
})
}
- // Aggregation: total stars
- const totalStars = github.reduce((sum, r) => sum + r.stars, 0)
- questions.push({
- id: `q${idCounter++}`,
- prompt: 'What is the total number of stars across all repositories?',
- groundTruth: String(totalStars),
- type: 'aggregation',
- dataset: 'github',
- })
-
- // Filtering: popular repos (8 questions)
- const starThresholds = [10000, 50000, 100000]
- for (const threshold of starThresholds) {
- const count = github.filter(r => r.stars > threshold).length
+ // Aggregation: fork thresholds (single-condition filters)
+ for (const threshold of QUESTION_THRESHOLDS.github.forks) {
+ const count = github.filter(r => r.forks > threshold).length
questions.push({
id: `q${idCounter++}`,
- prompt: `How many repositories have more than ${threshold} stars?`,
+ prompt: `How many repositories have more than ${threshold} forks?`,
+ groundTruth: String(count),
+ type: 'aggregation',
+ dataset: 'github',
+ })
+ }
+
+ // Aggregation: watcher thresholds (single-condition filters)
+ for (const threshold of QUESTION_THRESHOLDS.github.watchers) {
+ const count = github.filter(r => r.watchers > threshold).length
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `How many repositories have more than ${threshold} watchers?`,
+ groundTruth: String(count),
+ type: 'aggregation',
+ dataset: 'github',
+ })
+ }
+
+ // Aggregation: default branch counts
+ const branches = [...new Set(github.map(r => r.defaultBranch))]
+ for (const branch of branches.slice(0, QUESTION_LIMITS.github.aggregationBranches)) {
+ const count = github.filter(r => r.defaultBranch === branch).length
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `How many repositories use "${branch}" as their default branch?`,
+ groundTruth: String(count),
+ type: 'aggregation',
+ dataset: 'github',
+ })
+ }
+
+ // Filtering: multi-condition queries (stars AND forks)
+ for (const combo of QUESTION_THRESHOLDS.github.starForkCombinations.slice(0, QUESTION_LIMITS.github.filteringStarsAndForks)) {
+ const count = github.filter(r => r.stars > combo.stars && r.forks > combo.forks).length
+ questions.push({
+ id: `q${idCounter++}`,
+ prompt: `How many repositories have more than ${combo.stars} stars and more than ${combo.forks} forks?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'github',
})
}
- const forkThresholds = [1000, 5000, 10000]
- for (const threshold of forkThresholds) {
- const count = github.filter(r => r.forks > threshold).length
+ // Filtering: stars AND watchers (multi-condition)
+ for (const combo of QUESTION_THRESHOLDS.github.starWatcherCombinations) {
+ const count = github.filter(r => r.stars > combo.stars && r.watchers > combo.watchers).length
questions.push({
id: `q${idCounter++}`,
- prompt: `How many repositories have more than ${threshold} forks?`,
+ prompt: `How many repositories have more than ${combo.stars} stars and more than ${combo.watchers} watchers?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'github',
@@ -394,14 +707,5 @@ export function generateQuestions(): Question[] {
}
}
- consola.info(`Question breakdown:`)
- consola.box(`
-Tabular: ${questions.filter(q => q.dataset === 'tabular').length}
-Nested: ${questions.filter(q => q.dataset === 'nested').length}
-Analytics: ${questions.filter(q => q.dataset === 'analytics').length}
-GitHub: ${questions.filter(q => q.dataset === 'github').length}
-Total: ${questions.length}
-`.trim())
-
return questions
}
diff --git a/benchmarks/src/report.ts b/benchmarks/src/report.ts
index dbc5987..845d048 100644
--- a/benchmarks/src/report.ts
+++ b/benchmarks/src/report.ts
@@ -1,21 +1,9 @@
-/**
- * Report generation for TOON benchmarks
- *
- * Handles:
- * - Statistical analysis
- * - Markdown report generation with visual elements
- * - Per-dataset breakdowns
- * - Cost analysis
- * - Result file saving
- */
-
import type { EvaluationResult, FormatResult, Question } from './types'
import * as fsp from 'node:fs/promises'
import * as path from 'node:path'
import { BENCHMARKS_DIR } from './constants'
import { datasets } from './datasets'
-import { models } from './evaluate'
-import { createProgressBar, ensureDir, saveJsonFile, tokenize } from './utils'
+import { createProgressBar, ensureDir, tokenize } from './utils'
/**
* Calculate per-format statistics from evaluation results
@@ -63,8 +51,8 @@ export function generateMarkdownReport(
const json = formatResults.find(r => r.format === 'json')
// Build model-by-model breakdown with ASCII bars
- const modelCount = Object.keys(models).length
- const modelNames = Object.keys(models)
+ const modelNames = [...new Set(results.map(r => r.model))].reverse()
+ const modelCount = modelNames.length
const modelBreakdown = modelNames.map((modelName, i) => {
const modelResults = formatResults.map((fr) => {
@@ -136,7 +124,7 @@ export function generateMarkdownReport(
})
const tableRows = datasetResults.slice(0, 6).map(result =>
- `| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.tokens.toLocaleString()} | ${result.correctCount}/${result.totalCount} |`,
+ `| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.tokens.toLocaleString('en-US')} | ${result.correctCount}/${result.totalCount} |`,
).join('\n')
return `
@@ -180,6 +168,27 @@ ${tableRows}
// Calculate total unique questions
const totalQuestions = [...new Set(results.map(r => r.questionId))].length
+ // Calculate question type distribution
+ const fieldRetrievalCount = questions.filter(q => q.type === 'field-retrieval').length
+ const aggregationCount = questions.filter(q => q.type === 'aggregation').length
+ const filteringCount = questions.filter(q => q.type === 'filtering').length
+
+ const fieldRetrievalPercent = ((fieldRetrievalCount / totalQuestions) * 100).toFixed(0)
+ const aggregationPercent = ((aggregationCount / totalQuestions) * 100).toFixed(0)
+ const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)
+
+ // Calculate dataset sizes
+ const tabularSize = datasets.find(d => d.name === 'tabular')?.data.employees?.length || 0
+ const nestedSize = datasets.find(d => d.name === 'nested')?.data.orders?.length || 0
+ const analyticsSize = datasets.find(d => d.name === 'analytics')?.data.metrics?.length || 0
+ const githubSize = datasets.find(d => d.name === 'github')?.data.repositories?.length || 0
+
+ // Calculate number of formats and models
+ const formatCount = formatResults.length
+ const modelsUsed = [...new Set(results.map(r => r.model))]
+ const modelsListStr = modelsUsed.map(m => `\`${m}\``).join(', ')
+ const totalEvaluations = totalQuestions * formatCount * modelsUsed.length
+
return `
### Retrieval Accuracy
@@ -213,39 +222,41 @@ This benchmark tests **LLM comprehension and data retrieval accuracy** across di
Four datasets designed to test different structural patterns:
-1. **Tabular** (100 employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
-2. **Nested** (50 e-commerce orders): Complex structures with nested customer objects and item arrays.
-3. **Analytics** (60 days of metrics): Time-series data with dates and numeric values.
-4. **GitHub** (100 repositories): Real-world data from top GitHub repos by stars.
+1. **Tabular** (${tabularSize} employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
+2. **Nested** (${nestedSize} e-commerce orders): Complex structures with nested customer objects and item arrays.
+3. **Analytics** (${analyticsSize} days of metrics): Time-series data with dates and numeric values.
+4. **GitHub** (${githubSize} repositories): Real-world data from top GitHub repos by stars.
#### Question Types
${totalQuestions} questions are generated dynamically across three categories:
-- **Field retrieval (50%)**: Direct value lookups
+\- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
- Example: "What is Alice's salary?" → \`75000\`
+ - Example: "How many items are in order ORD-0042?" → \`3\`
- Example: "What is the customer name for order ORD-0042?" → \`John Doe\`
-- **Aggregation (25%)**: Counting and summation tasks
+- **Aggregation (${aggregationPercent}%)**: Dataset-level totals and averages plus single-condition filters (counts, sums, min/max comparisons)
- Example: "How many employees work in Engineering?" → \`17\`
- Example: "What is the total revenue across all orders?" → \`45123.50\`
+ - Example: "How many employees have salary > 80000?" → \`23\`
-- **Filtering (25%)**: Conditional queries
+- **Filtering (${filteringPercent}%)**: Multi-condition queries requiring compound logic (AND constraints across fields)
- Example: "How many employees in Sales have salary > 80000?" → \`5\`
- - Example: "How many orders have total > 400?" → \`12\`
+ - Example: "How many active employees have more than 10 years of experience?" → \`8\`
#### Evaluation Process
-1. **Format conversion:** Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML).
+1. **Format conversion:** Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => f.format.toUpperCase()).join(', ')}).
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
-4. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).
+3. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).
#### Models & Configuration
-- **Models tested**: \`gpt-5-nano\`, \`claude-haiku-4-5\`, \`gemini-2.5-flash\`
+- **Models tested**: ${modelsListStr}
- **Token counting**: Using \`gpt-tokenizer\` with \`o200k_base\` encoding (GPT-5 tokenizer)
- **Temperature**: 0 (for non-reasoning models)
-- **Total evaluations**: 159 questions × 5 formats × 3 models = 2,385 LLM calls
+- **Total evaluations**: ${totalQuestions} questions × ${formatCount} formats × ${modelsUsed.length} models = ${totalEvaluations.toLocaleString('en-US')} LLM calls
`.trimStart()
@@ -272,6 +283,10 @@ export function calculateTokenCounts(
/**
* Save results to disk
+ *
+ * @remarks
+ * Per-model results are managed separately via storage.ts
+ * This function only generates the aggregated markdown report
*/
export async function saveResults(
results: EvaluationResult[],
@@ -279,31 +294,12 @@ export async function saveResults(
questions: Question[],
tokenCounts: Record,
): Promise {
- const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy')
+ const resultsDir = path.join(BENCHMARKS_DIR, 'results')
await ensureDir(resultsDir)
- // Save raw results
- await saveJsonFile(path.join(resultsDir, 'raw-results.json'), results)
-
- // Save summary
- await saveJsonFile(
- path.join(resultsDir, 'summary.json'),
- {
- formatResults,
- questions: questions.length,
- models: Object.keys(models),
- datasets: datasets.map(d => ({ name: d.name, description: d.description })),
- tokenCounts,
- timestamp: new Date().toISOString(),
- },
- )
-
- // Generate markdown report
+ // Generate markdown report from all available model results
const report = generateMarkdownReport(formatResults, results, questions, tokenCounts)
- await fsp.writeFile(
- path.join(resultsDir, 'report.md'),
- report,
- )
+ await fsp.writeFile(path.join(resultsDir, 'retrieval-accuracy.md'), report)
return resultsDir
}
diff --git a/benchmarks/src/storage.ts b/benchmarks/src/storage.ts
new file mode 100644
index 0000000..aab9287
--- /dev/null
+++ b/benchmarks/src/storage.ts
@@ -0,0 +1,74 @@
+import type { Storage, StorageValue } from 'unstorage'
+import type { EvaluationResult } from './types'
+import * as path from 'node:path'
+import { createStorage } from 'unstorage'
+import fsDriver from 'unstorage/drivers/fs'
+import { BENCHMARKS_DIR } from './constants'
+
+/**
+ * Storage instance for model results
+ *
+ * @remarks
+ * Stores results in: `benchmarks/results/accuracy/models/`
+ */
+export const resultsStorage: Storage<StorageValue> = createStorage({
+  driver: fsDriver({
+    base: path.join(BENCHMARKS_DIR, 'results', 'accuracy', 'models'),
+  }),
+})
+
+/**
+ * Load the stored evaluation results for a single model
+ *
+ * @param modelId - Storage key of the model whose results should be read
+ * @returns The stored results, or `undefined` when the lookup yields a nullish value
+ */
+export async function loadModelResults(modelId: string): Promise<EvaluationResult[] | undefined> {
+  const data = await resultsStorage.getItem<EvaluationResult[]>(modelId)
+  // Map a nullish lookup result to `undefined` so callers only deal with one "missing" value
+  return data ?? undefined
+}
+
+/**
+ * Store the evaluation results of a single model under its id
+ *
+ * @param modelId - Storage key to write to
+ * @param results - Evaluation results to store
+ */
+export async function saveModelResults(modelId: string, results: EvaluationResult[]): Promise<void> {
+  await resultsStorage.setItem(modelId, results)
+}
+
+/**
+ * Load the results of every key present in storage
+ *
+ * @returns Record mapping each storage key to its results, inserted in the order reported by `getKeys()`; keys whose lookup yields no value are omitted
+ */
+export async function getAllModelResults(): Promise<Record<string, EvaluationResult[]>> {
+  const keys = await resultsStorage.getKeys()
+
+  // Read all entries in parallel; `Promise.all` keeps the values aligned with `keys`
+  const items = await Promise.all(
+    keys.map(modelId => resultsStorage.getItem<EvaluationResult[]>(modelId)),
+  )
+
+  // Insert in key order (not read-completion order) so the record is built deterministically
+  const results: Record<string, EvaluationResult[]> = {}
+  keys.forEach((modelId, index) => {
+    const data = items[index]
+    if (data)
+      results[modelId] = data
+  })
+
+  return results
+}
+
+/**
+ * Check whether storage has an item for a model
+ *
+ * @param modelId - Storage key to look up
+ * @returns The result of `resultsStorage.hasItem` for `modelId`
+ */
+export async function hasModelResults(modelId: string): Promise<boolean> {
+  return await resultsStorage.hasItem(modelId)
+}
diff --git a/benchmarks/src/utils.ts b/benchmarks/src/utils.ts
index 3b0a735..bbd44c0 100644
--- a/benchmarks/src/utils.ts
+++ b/benchmarks/src/utils.ts
@@ -1,13 +1,3 @@
-/**
- * Shared utility functions for TOON benchmarks
- *
- * Provides common functionality used across multiple benchmark scripts:
- * - Progress bar visualization
- * - Token counting
- * - File I/O operations
- * - Retry logic for API calls
- */
-
import * as fsp from 'node:fs/promises'
import { encode } from 'gpt-tokenizer'
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index a645f96..2363e29 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -56,15 +56,15 @@ importers:
'@antfu/eslint-config':
specifier: ^6.1.0
version: 6.1.0(@vue/compiler-sfc@3.5.22)(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))
+ '@clack/prompts':
+ specifier: ^0.11.0
+ version: 0.11.0
'@faker-js/faker':
specifier: ^10.1.0
version: 10.1.0
ai:
specifier: ^5.0.80
version: 5.0.80(zod@4.1.12)
- consola:
- specifier: ^3.4.2
- version: 3.4.2
csv-stringify:
specifier: ^6.6.0
version: 6.6.0
@@ -80,6 +80,12 @@ importers:
p-map:
specifier: ^7.0.3
version: 7.0.3
+ p-queue:
+ specifier: ^9.0.0
+ version: 9.0.0
+ unstorage:
+ specifier: ^1.17.1
+ version: 1.17.1
yaml:
specifier: ^2.8.1
version: 2.8.1
@@ -985,6 +991,10 @@ packages:
resolution: {integrity: sha512-HqZ5rWlFjGiV0tDm3UxxgNRqsOTniqoKZu0pIAfh7TZQMGuZK+hH0drySty0si0QXj1ieop4+SkSfPZBPPkHig==}
engines: {node: '>=14'}
+ anymatch@3.1.3:
+ resolution: {integrity: sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==}
+ engines: {node: '>= 8'}
+
are-docs-informative@0.0.2:
resolution: {integrity: sha512-ixiS0nLNNG5jNQzgZJNoUpBKdo9yTYZMGJ+QgT2jmjR7G7+QHRCc4v6LQ3NgE7EBJq+o0ams3waJwkrlBom8Ig==}
engines: {node: '>=14'}
@@ -1119,6 +1129,9 @@ packages:
resolution: {integrity: sha512-5IKcdX0nnYavi6G7TtOhwkYzyjfJlatbjMjuLSfE2kYT5pMDOilZ4OvMhi637CcDICTmz3wARPoyhqyX1Y+XvA==}
engines: {node: ^14.18.0 || >=16.10.0}
+ cookie-es@1.2.2:
+ resolution: {integrity: sha512-+W7VmiVINB+ywl1HGXJXmrqkOhpKrIiVZV6tQuV54ZyQC7MMuBt81Vc336GMLoHBq5hV/F9eXgt5Mnx0Rha5Fg==}
+
core-js-compat@3.46.0:
resolution: {integrity: sha512-p9hObIIEENxSV8xIu+V68JjSeARg6UVMG5mR+JEUguG3sI6MsiS1njz2jHmyJDvA+8jX/sytkBHup6kxhM9law==}
@@ -1126,6 +1139,9 @@ packages:
resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==}
engines: {node: '>= 8'}
+ crossws@0.3.5:
+ resolution: {integrity: sha512-ojKiDvcmByhwa8YYqbQI/hg7MEU0NC03+pSdEq4ZUnZR9xXpwk7E43SMNGkn+JxJGPFtNvQ48+vV2p+P1ml5PA==}
+
cssesc@3.0.0:
resolution: {integrity: sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg==}
engines: {node: '>=4'}
@@ -1431,6 +1447,9 @@ packages:
resolution: {integrity: sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==}
engines: {node: '>=0.10.0'}
+ eventemitter3@5.0.1:
+ resolution: {integrity: sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==}
+
eventsource-parser@3.0.6:
resolution: {integrity: sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==}
engines: {node: '>=18.0.0'}
@@ -1552,6 +1571,9 @@ packages:
graphemer@1.4.0:
resolution: {integrity: sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag==}
+ h3@1.15.4:
+ resolution: {integrity: sha512-z5cFQWDffyOe4vQ9xIqNfCZdV4p//vy6fBnr8Q1AWnVZ0teurKMG66rLj++TKwKPUP3u7iMUvrvKaEUiQw2QWQ==}
+
has-flag@4.0.0:
resolution: {integrity: sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==}
engines: {node: '>=8'}
@@ -1582,6 +1604,9 @@ packages:
resolution: {integrity: sha512-m6FAo/spmsW2Ab2fU35JTYwtOKa2yAwXSwgjSv1TJzh4Mh7mC3lzAOVLBprb72XsTrgkEIsl7YrFNAiDiRhIGg==}
engines: {node: '>=12'}
+ iron-webcrypto@1.2.1:
+ resolution: {integrity: sha512-feOM6FaSr6rEABp/eDfVseKyTMDt+KGpeB35SkVn9Tyn0CqvVsY3EwI0v5i8nMHyJnzCIQf7nsy3p41TPkJZhg==}
+
is-builtin-module@5.0.0:
resolution: {integrity: sha512-f4RqJKBUe5rQkJ2eJEJBXSticB3hGbN9j0yxxMQFqIW89Jp9WYFtzfTcRlstDKVUTRzSOTLKRfO9vIztenwtxA==}
engines: {node: '>=18.20'}
@@ -1680,6 +1705,9 @@ packages:
longest-streak@3.1.0:
resolution: {integrity: sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g==}
+ lru-cache@10.4.3:
+ resolution: {integrity: sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==}
+
magic-string@0.30.21:
resolution: {integrity: sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ==}
@@ -1854,9 +1882,16 @@ packages:
node-fetch-native@1.6.7:
resolution: {integrity: sha512-g9yhqoedzIUm0nTnTqAQvueMPVOuIY16bqgAJJC8XOOubYFNwz6IER9qs0Gq2Xd0+CecCKFjtdDTMA4u4xG06Q==}
+ node-mock-http@1.0.3:
+ resolution: {integrity: sha512-jN8dK25fsfnMrVsEhluUTPkBFY+6ybu7jSB1n+ri/vOGjJxU8J9CZhpSGkHXSkFjtUhbmoncG/YG9ta5Ludqog==}
+
node-releases@2.0.26:
resolution: {integrity: sha512-S2M9YimhSjBSvYnlr5/+umAnPHE++ODwt5e2Ij6FoX45HA/s4vHdkDx1eax2pAPeAOqu4s9b7ppahsyEFdVqQA==}
+ normalize-path@3.0.0:
+ resolution: {integrity: sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==}
+ engines: {node: '>=0.10.0'}
+
nth-check@2.1.1:
resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==}
@@ -1890,6 +1925,14 @@ packages:
resolution: {integrity: sha512-VkndIv2fIB99swvQoA65bm+fsmt6UNdGeIB0oxBs+WhAhdh08QA04JXpI7rbB9r08/nkbysKoya9rtDERYOYMA==}
engines: {node: '>=18'}
+ p-queue@9.0.0:
+ resolution: {integrity: sha512-KO1RyxstL9g1mK76530TExamZC/S2Glm080Nx8PE5sTd7nlduDQsAfEl4uXX+qZjLiwvDauvzXavufy3+rJ9zQ==}
+ engines: {node: '>=20'}
+
+ p-timeout@7.0.1:
+ resolution: {integrity: sha512-AxTM2wDGORHGEkPCt8yqxOTMgpfbEHqF51f/5fJCmwFC3C/zNcGT63SymH2ttOAaiIws2zVg4+izQCjrakcwHg==}
+ engines: {node: '>=20'}
+
package-manager-detector@1.5.0:
resolution: {integrity: sha512-uBj69dVlYe/+wxj8JOpr97XfsxH/eumMt6HqjNTmJDf/6NO9s+0uxeOneIz3AsPt2m6y9PqzDzd3ATcU17MNfw==}
@@ -1967,6 +2010,9 @@ packages:
queue-microtask@1.2.3:
resolution: {integrity: sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==}
+ radix3@1.1.2:
+ resolution: {integrity: sha512-b484I/7b8rDEdSDKckSSBA8knMpcdsXudlE/LNL639wFoHKwLbEkQFZHWEYwDC0wa0FKUcCY+GAF73Z7wxNVFA==}
+
rc9@2.1.2:
resolution: {integrity: sha512-btXCnMmRIBINM2LDZoEmOogIZU7Qe7zn4BpomSKZ/ykbLObuBdvG+mFq11DL6fjH1DRwHhrlgtYWG96bJiC7Cg==}
@@ -2194,6 +2240,9 @@ packages:
unconfig@7.3.3:
resolution: {integrity: sha512-QCkQoOnJF8L107gxfHL0uavn7WD9b3dpBcFX6HtfQYmjw2YzWxGuFQ0N0J6tE9oguCBJn9KOvfqYDCMPHIZrBA==}
+ uncrypto@0.1.3:
+ resolution: {integrity: sha512-Ql87qFHB3s/De2ClA9e0gsnS6zXG27SkTiSJwjCc9MebbfapQfuPzumMIUMi38ezPZVNFcHI9sUIepeQfw8J8Q==}
+
undici-types@7.16.0:
resolution: {integrity: sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==}
@@ -2214,6 +2263,68 @@ packages:
engines: {node: '>=20.19.0'}
hasBin: true
+ unstorage@1.17.1:
+ resolution: {integrity: sha512-KKGwRTT0iVBCErKemkJCLs7JdxNVfqTPc/85ae1XES0+bsHbc/sFBfVi5kJp156cc51BHinIH2l3k0EZ24vOBQ==}
+ peerDependencies:
+ '@azure/app-configuration': ^1.8.0
+ '@azure/cosmos': ^4.2.0
+ '@azure/data-tables': ^13.3.0
+ '@azure/identity': ^4.6.0
+ '@azure/keyvault-secrets': ^4.9.0
+ '@azure/storage-blob': ^12.26.0
+ '@capacitor/preferences': ^6.0.3 || ^7.0.0
+ '@deno/kv': '>=0.9.0'
+ '@netlify/blobs': ^6.5.0 || ^7.0.0 || ^8.1.0 || ^9.0.0 || ^10.0.0
+ '@planetscale/database': ^1.19.0
+ '@upstash/redis': ^1.34.3
+ '@vercel/blob': '>=0.27.1'
+ '@vercel/functions': ^2.2.12 || ^3.0.0
+ '@vercel/kv': ^1.0.1
+ aws4fetch: ^1.0.20
+ db0: '>=0.2.1'
+ idb-keyval: ^6.2.1
+ ioredis: ^5.4.2
+ uploadthing: ^7.4.4
+ peerDependenciesMeta:
+ '@azure/app-configuration':
+ optional: true
+ '@azure/cosmos':
+ optional: true
+ '@azure/data-tables':
+ optional: true
+ '@azure/identity':
+ optional: true
+ '@azure/keyvault-secrets':
+ optional: true
+ '@azure/storage-blob':
+ optional: true
+ '@capacitor/preferences':
+ optional: true
+ '@deno/kv':
+ optional: true
+ '@netlify/blobs':
+ optional: true
+ '@planetscale/database':
+ optional: true
+ '@upstash/redis':
+ optional: true
+ '@vercel/blob':
+ optional: true
+ '@vercel/functions':
+ optional: true
+ '@vercel/kv':
+ optional: true
+ aws4fetch:
+ optional: true
+ db0:
+ optional: true
+ idb-keyval:
+ optional: true
+ ioredis:
+ optional: true
+ uploadthing:
+ optional: true
+
untyped@2.0.0:
resolution: {integrity: sha512-nwNCjxJTjNuLCgFr42fEak5OcLuB3ecca+9ksPFNvtfYSLpjf+iJqSIaSnIile6ZPbKYxI5k2AfXqeopGudK/g==}
hasBin: true
@@ -3143,6 +3254,11 @@ snapshots:
ansis@4.2.0: {}
+ anymatch@3.1.3:
+ dependencies:
+ normalize-path: 3.0.0
+ picomatch: 2.3.1
+
are-docs-informative@0.0.2: {}
argparse@2.0.1: {}
@@ -3289,6 +3405,8 @@ snapshots:
consola@3.4.2: {}
+ cookie-es@1.2.2: {}
+
core-js-compat@3.46.0:
dependencies:
browserslist: 4.27.0
@@ -3299,6 +3417,10 @@ snapshots:
shebang-command: 2.0.0
which: 2.0.2
+ crossws@0.3.5:
+ dependencies:
+ uncrypto: 0.1.3
+
cssesc@3.0.0: {}
csv-stringify@6.6.0: {}
@@ -3674,6 +3796,8 @@ snapshots:
esutils@2.0.3: {}
+ eventemitter3@5.0.1: {}
+
eventsource-parser@3.0.6: {}
expect-type@1.2.2: {}
@@ -3776,6 +3900,18 @@ snapshots:
graphemer@1.4.0: {}
+ h3@1.15.4:
+ dependencies:
+ cookie-es: 1.2.2
+ crossws: 0.3.5
+ defu: 6.1.4
+ destr: 2.0.5
+ iron-webcrypto: 1.2.1
+ node-mock-http: 1.0.3
+ radix3: 1.1.2
+ ufo: 1.6.1
+ uncrypto: 0.1.3
+
has-flag@4.0.0: {}
hookable@5.5.3: {}
@@ -3795,6 +3931,8 @@ snapshots:
indent-string@5.0.0: {}
+ iron-webcrypto@1.2.1: {}
+
is-builtin-module@5.0.0:
dependencies:
builtin-modules: 5.0.0
@@ -3871,6 +4009,8 @@ snapshots:
longest-streak@3.1.0: {}
+ lru-cache@10.4.3: {}
+
magic-string@0.30.21:
dependencies:
'@jridgewell/sourcemap-codec': 1.5.5
@@ -4228,8 +4368,12 @@ snapshots:
node-fetch-native@1.6.7: {}
+ node-mock-http@1.0.3: {}
+
node-releases@2.0.26: {}
+ normalize-path@3.0.0: {}
+
nth-check@2.1.1:
dependencies:
boolbase: 1.0.0
@@ -4271,6 +4415,13 @@ snapshots:
p-map@7.0.3: {}
+ p-queue@9.0.0:
+ dependencies:
+ eventemitter3: 5.0.1
+ p-timeout: 7.0.1
+
+ p-timeout@7.0.1: {}
+
package-manager-detector@1.5.0: {}
parent-module@1.0.1:
@@ -4336,6 +4487,8 @@ snapshots:
queue-microtask@1.2.3: {}
+ radix3@1.1.2: {}
+
rc9@2.1.2:
dependencies:
defu: 6.1.4
@@ -4575,6 +4728,8 @@ snapshots:
jiti: 2.6.1
quansync: 0.2.11
+ uncrypto@0.1.3: {}
+
undici-types@7.16.0: {}
unist-util-is@6.0.1:
@@ -4602,6 +4757,17 @@ snapshots:
rolldown: 1.0.0-beta.44
synckit: 0.11.11
+ unstorage@1.17.1:
+ dependencies:
+ anymatch: 3.1.3
+ chokidar: 4.0.3
+ destr: 2.0.5
+ h3: 1.15.4
+ lru-cache: 10.4.3
+ node-fetch-native: 1.6.7
+ ofetch: 1.4.1
+ ufo: 1.6.1
+
untyped@2.0.0:
dependencies:
citty: 0.1.6