From 1a5e6199acdaeba79ead770ca23535db4187bba3 Mon Sep 17 00:00:00 2001 From: Johann Schopplich Date: Mon, 27 Oct 2025 13:45:48 +0100 Subject: [PATCH] test: update retrieval accuracy benchmarks --- README.md | 79 +- benchmarks/results/accuracy/raw-results.json | 10976 ++++++++-------- benchmarks/results/accuracy/report.md | 77 +- benchmarks/results/accuracy/summary.json | 49 +- benchmarks/results/token-efficiency.md | 2 +- benchmarks/scripts/accuracy-benchmark.ts | 14 +- .../scripts/token-efficiency-benchmark.ts | 2 +- benchmarks/src/evaluate.ts | 134 +- benchmarks/src/report.ts | 55 +- benchmarks/src/types.ts | 7 +- 10 files changed, 5686 insertions(+), 5709 deletions(-) diff --git a/README.md b/README.md index 8d7d411..ca05be5 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ users[2]{id,name,role}: 🛒 E-commerce Order ███████████████░░░░░░░░░░ 203 tokens (JSON: 338) 💰 39.9% saved ``` -**Total:** 15,172 tokens (TOON) vs 29,096 tokens (JSON) → 47.9% savings +**Total:** 15,172 tokens (TOON) vs 29,096 tokens (JSON) → **47.9% savings**
View detailed examples @@ -200,19 +200,22 @@ metrics[5]{date,views,clicks,conversions,revenue}: Tested across **2 LLMs** with data retrieval tasks: ``` -gpt-4o-mini ██████████████░░░░░░ 72.3% accuracy -claude-haiku-4-5 ███████████████░░░░░ 76.7% accuracy +gpt-5-nano + toon ███████████████████░ 97.5% (155/159) + markdown-kv ███████████████████░ 95.6% (152/159) + yaml ███████████████████░ 94.3% (150/159) + json ███████████████████░ 93.7% (149/159) + csv ███████████████████░ 93.7% (149/159) + +claude-haiku-4-5 + markdown-kv ███████████████░░░░░ 76.7% (122/159) + toon ███████████████░░░░░ 75.5% (120/159) + json ███████████████░░░░░ 75.5% (120/159) + csv ███████████████░░░░░ 75.5% (120/159) + yaml ███████████████░░░░░ 74.8% (119/159) ``` -**TOON achieves 73.9% accuracy (vs JSON's 73.6%) while using 46.3% fewer tokens.** - -| Format | Accuracy | Average Tokens | -| ------ | -------- | -------------- | -| `toon` | 73.9% | 4.678 | -| `json` | 73.6% | 8.713 | -| `markdown-kv` | 73.6% | 8.649 | -| `csv` | 72.3% | 4.745 | -| `yaml` | 71.7% | 7.091 | +**Tradeoff:** TOON achieves 86.5% accuracy (vs JSON's 84.6%) while using 46.3% fewer tokens.
View detailed breakdown by dataset and model @@ -223,53 +226,53 @@ claude-haiku-4-5 ███████████████░░░░ | Format | Accuracy | Tokens | Correct/Total | |--------|----------|--------|---------------| -| `toon` | 72.4% | 2.483 | 84/116 | -| `csv` | 69.0% | 2.337 | 80/116 | -| `yaml` | 68.1% | 4.969 | 79/116 | -| `markdown-kv` | 68.1% | 6.270 | 79/116 | -| `json` | 68.1% | 6.347 | 79/116 | +| `toon` | 86.2% | 2.483 | 100/116 | +| `csv` | 80.2% | 2.337 | 93/116 | +| `yaml` | 82.8% | 4.969 | 96/116 | +| `markdown-kv` | 84.5% | 6.270 | 98/116 | +| `json` | 84.5% | 6.347 | 98/116 | ##### E-commerce orders with nested structures | Format | Accuracy | Tokens | Correct/Total | |--------|----------|--------|---------------| -| `toon` | 84.1% | 5.967 | 74/88 | -| `csv` | 83.0% | 6.735 | 73/88 | -| `yaml` | 81.8% | 7.328 | 72/88 | -| `markdown-kv` | 86.4% | 9.110 | 76/88 | -| `json` | 84.1% | 9.694 | 74/88 | +| `toon` | 90.9% | 5.967 | 80/88 | +| `csv` | 90.9% | 6.735 | 80/88 | +| `yaml` | 89.8% | 7.328 | 79/88 | +| `markdown-kv` | 90.9% | 9.110 | 80/88 | +| `json` | 89.8% | 9.694 | 79/88 | ##### Time-series analytics data | Format | Accuracy | Tokens | Correct/Total | |--------|----------|--------|---------------| -| `csv` | 72.4% | 1.393 | 42/58 | -| `toon` | 70.7% | 1.515 | 41/58 | -| `yaml` | 72.4% | 2.938 | 42/58 | -| `json` | 74.1% | 3.665 | 43/58 | -| `markdown-kv` | 70.7% | 3.779 | 41/58 | +| `csv` | 87.9% | 1.393 | 51/58 | +| `toon` | 86.2% | 1.515 | 50/58 | +| `yaml` | 86.2% | 2.938 | 50/58 | +| `json` | 87.9% | 3.665 | 51/58 | +| `markdown-kv` | 86.2% | 3.779 | 50/58 | ##### Popular GitHub repositories | Format | Accuracy | Tokens | Correct/Total | |--------|----------|--------|---------------| -| `toon` | 64.3% | 8.745 | 36/56 | -| `csv` | 62.5% | 8.513 | 35/56 | -| `json` | 67.9% | 15.145 | 38/56 | -| `markdown-kv` | 67.9% | 15.436 | 38/56 | -| `yaml` | 62.5% | 13.129 | 35/56 | +| `csv` | 80.4% | 8.513 | 45/56 | +| `toon` | 80.4% | 8.745 | 45/56 | +| `yaml` | 78.6% | 13.129 | 44/56 | +| `markdown-kv` | 82.1% | 15.436 | 46/56 | +| `json` | 73.2% | 15.145 | 41/56 | #### Performance by Model -##### gpt-4o-mini +##### gpt-5-nano | Format | Accuracy | Correct/Total | |--------|----------|---------------| -| `toon` | 72.3% | 115/159 | -| `json` | 71.7% | 114/159 | -| `markdown-kv` | 70.4% | 112/159 | -| `csv` | 69.2% | 110/159 | -| `yaml` | 68.6% | 109/159 | +| `toon` | 97.5% | 155/159 | +| `markdown-kv` | 95.6% | 152/159 | +| `yaml` | 94.3% | 150/159 | +| `json` | 93.7% | 149/159 | +| `csv` | 93.7% | 149/159 | ##### claude-haiku-4-5 diff --git a/benchmarks/results/accuracy/raw-results.json b/benchmarks/results/accuracy/raw-results.json index 761a265..adbe71c 100644 --- a/benchmarks/results/accuracy/raw-results.json +++ b/benchmarks/results/accuracy/raw-results.json @@ -2,13 +2,13 @@ { "questionId": "q1", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "56176", "actual": "56176", "correct": true, - "inputTokens": 6391, - "outputTokens": 3, - "latencyMs": 1313 + "inputTokens": 6390, + "outputTokens": 72, + "latencyMs": 2221.390167 }, { "questionId": "q1", @@ -19,18 +19,18 @@ "correct": true, "inputTokens": 7870, "outputTokens": 6, - "latencyMs": 1346 + "latencyMs": 1276.715333 }, { "questionId": "q1", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "56176", "actual": "56176", "correct": true, - "inputTokens": 2528, - "outputTokens": 3, - "latencyMs": 1191 + "inputTokens": 2527, + "outputTokens": 72, + "latencyMs": 3718.250833 }, { "questionId": "q1", @@ -41,18 +41,18 @@ "correct": true, "inputTokens": 2982, "outputTokens": 6, - "latencyMs": 1399 + "latencyMs": 1215.944708 }, { "questionId": "q1", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "56176", "actual": "56176", "correct": true, - "inputTokens": 2382, - "outputTokens": 3, - "latencyMs": 5010 + "inputTokens": 2381, + "outputTokens": 72, + "latencyMs": 2417.306625 }, { "questionId": "q1", @@ -63,18 +63,18 @@ "correct": true, "inputTokens": 2856, "outputTokens": 6, - "latencyMs": 1472 + "latencyMs": 1152.5258749999998 }, { "questionId": "q1", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "56176", "actual": "56176", "correct": true, - "inputTokens": 6317, - "outputTokens": 3, - "latencyMs": 1667 + "inputTokens": 6316, + "outputTokens": 72, + "latencyMs": 4603.444417 }, { "questionId": "q1", @@ -85,18 +85,18 @@ "correct": true, "inputTokens": 6365, "outputTokens": 6, - "latencyMs": 1507 + "latencyMs": 1390.011125 }, { "questionId": "q1", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "56176", "actual": "56176", "correct": true, - "inputTokens": 5013, - "outputTokens": 3, - "latencyMs": 1325 + "inputTokens": 5012, + "outputTokens": 8, + "latencyMs": 4339.294459 }, { "questionId": "q1", @@ -107,18 +107,18 @@ "correct": true, "inputTokens": 5760, "outputTokens": 6, - "latencyMs": 2280 + "latencyMs": 1374.47325 }, { "questionId": "q2", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 6391, - "outputTokens": 2, - "latencyMs": 3167 + "inputTokens": 6390, + "outputTokens": 135, + "latencyMs": 2550.589042 }, { "questionId": "q2", @@ -129,18 +129,18 @@ "correct": true, "inputTokens": 7869, "outputTokens": 4, - "latencyMs": 1267 + "latencyMs": 1139.559917 }, { "questionId": "q2", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2528, - "outputTokens": 2, - "latencyMs": 1402 + "inputTokens": 2527, + "outputTokens": 135, + "latencyMs": 2422.8178749999997 }, { "questionId": "q2", @@ -151,18 +151,18 @@ "correct": true, "inputTokens": 2981, "outputTokens": 4, - "latencyMs": 1290 + "latencyMs": 1135.579459 }, { "questionId": "q2", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2382, - "outputTokens": 2, - "latencyMs": 5070 + "inputTokens": 2381, + "outputTokens": 71, + "latencyMs": 4198.553583999999 }, { "questionId": "q2", @@ -173,18 +173,18 @@ "correct": true, "inputTokens": 2855, "outputTokens": 4, - "latencyMs": 1320 + "latencyMs": 1147.9685829999999 }, { "questionId": "q2", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 6317, - "outputTokens": 2, - "latencyMs": 1745 + "inputTokens": 6316, + "outputTokens": 71, + "latencyMs": 2594.702667 }, { "questionId": "q2", @@ -195,18 +195,18 @@ "correct": true, "inputTokens": 6364, "outputTokens": 4, - "latencyMs": 1191 + "latencyMs": 1568.4054999999998 }, { "questionId": "q2", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 5013, - "outputTokens": 2, - "latencyMs": 2713 + "inputTokens": 5012, + "outputTokens": 71, + "latencyMs": 2516.345875 }, { "questionId": "q2", @@ -217,18 +217,18 @@ "correct": true, "inputTokens": 5759, "outputTokens": 4, - "latencyMs": 1309 + "latencyMs": 1633.5375000000001 }, { "questionId": "q3", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "lorenza.kunze@yahoo.com", "actual": "lorenza.kunze@yahoo.com", "correct": true, - "inputTokens": 6393, - "outputTokens": 7, - "latencyMs": 1160 + "inputTokens": 6392, + "outputTokens": 76, + "latencyMs": 2079.8442499999996 }, { "questionId": "q3", @@ -239,18 +239,18 @@ "correct": true, "inputTokens": 7874, "outputTokens": 12, - "latencyMs": 1338 + "latencyMs": 1201.556458 }, { "questionId": "q3", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "lorenza.kunze@yahoo.com", "actual": "lorenza.kunze@yahoo.com", "correct": true, - "inputTokens": 2530, - "outputTokens": 7, - "latencyMs": 1478 + "inputTokens": 2529, + "outputTokens": 140, + "latencyMs": 2356.408 }, { "questionId": "q3", @@ -261,18 +261,18 @@ "correct": true, "inputTokens": 2986, "outputTokens": 12, - "latencyMs": 1563 + "latencyMs": 1113.255166 }, { "questionId": "q3", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "lorenza.kunze@yahoo.com", "actual": "lorenza.kunze@yahoo.com", "correct": true, - "inputTokens": 2384, - "outputTokens": 7, - "latencyMs": 1310 + "inputTokens": 2383, + "outputTokens": 140, + "latencyMs": 2188.5425419999997 }, { "questionId": "q3", @@ -283,18 +283,18 @@ "correct": true, "inputTokens": 2860, "outputTokens": 12, - "latencyMs": 1236 + "latencyMs": 1029.9496669999999 }, { "questionId": "q3", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "lorenza.kunze@yahoo.com", "actual": "lorenza.kunze@yahoo.com", "correct": true, - "inputTokens": 6319, - "outputTokens": 7, - "latencyMs": 2236 + "inputTokens": 6318, + "outputTokens": 140, + "latencyMs": 2605.8857080000002 }, { "questionId": "q3", @@ -305,18 +305,18 @@ "correct": true, "inputTokens": 6369, "outputTokens": 12, - "latencyMs": 1253 + "latencyMs": 1273.5997920000004 }, { "questionId": "q3", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "lorenza.kunze@yahoo.com", "actual": "lorenza.kunze@yahoo.com", "correct": true, - "inputTokens": 5015, - "outputTokens": 7, - "latencyMs": 1917 + "inputTokens": 5014, + "outputTokens": 140, + "latencyMs": 2530.4294580000005 }, { "questionId": "q3", @@ -327,18 +327,18 @@ "correct": true, "inputTokens": 5764, "outputTokens": 12, - "latencyMs": 1332 + "latencyMs": 1404.4837089999996 }, { "questionId": "q4", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "117381", "actual": "117381", "correct": true, - "inputTokens": 6391, - "outputTokens": 3, - "latencyMs": 2945 + "inputTokens": 6390, + "outputTokens": 72, + "latencyMs": 2302.062125 }, { "questionId": "q4", @@ -349,18 +349,18 @@ "correct": true, "inputTokens": 7870, "outputTokens": 6, - "latencyMs": 1773 + "latencyMs": 1114.0778329999998 }, { "questionId": "q4", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "117381", "actual": "117381", "correct": true, - "inputTokens": 2528, - "outputTokens": 3, - "latencyMs": 1294 + "inputTokens": 2527, + "outputTokens": 72, + "latencyMs": 2006.7020830000001 }, { "questionId": "q4", @@ -371,18 +371,18 @@ "correct": true, "inputTokens": 2982, "outputTokens": 6, - "latencyMs": 980 + "latencyMs": 1641.5518749999997 }, { "questionId": "q4", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "117381", "actual": "117381", "correct": true, - "inputTokens": 2382, - "outputTokens": 3, - "latencyMs": 1747 + "inputTokens": 2381, + "outputTokens": 136, + "latencyMs": 2850.351709 }, { "questionId": "q4", @@ -393,18 +393,18 @@ "correct": true, "inputTokens": 2856, "outputTokens": 6, - "latencyMs": 1197 + "latencyMs": 1367.7319589999997 }, { "questionId": "q4", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "117381", "actual": "117381", "correct": true, - "inputTokens": 6317, - "outputTokens": 3, - "latencyMs": 1039 + "inputTokens": 6316, + "outputTokens": 72, + "latencyMs": 2477.8365839999997 }, { "questionId": "q4", @@ -415,18 +415,18 @@ "correct": true, "inputTokens": 6365, "outputTokens": 6, - "latencyMs": 1453 + "latencyMs": 1309.567083 }, { "questionId": "q4", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "117381", "actual": "117381", "correct": true, - "inputTokens": 5013, - "outputTokens": 3, - "latencyMs": 1056 + "inputTokens": 5012, + "outputTokens": 72, + "latencyMs": 1794.2651250000008 }, { "questionId": "q4", @@ -437,18 +437,18 @@ "correct": true, "inputTokens": 5760, "outputTokens": 6, - "latencyMs": 1564 + "latencyMs": 1177.5377079999998 }, { "questionId": "q5", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 6390, - "outputTokens": 2, - "latencyMs": 1263 + "inputTokens": 6389, + "outputTokens": 71, + "latencyMs": 1963.9477500000003 }, { "questionId": "q5", @@ -459,18 +459,18 @@ "correct": true, "inputTokens": 7868, "outputTokens": 4, - "latencyMs": 1097 + "latencyMs": 1024.5166669999999 }, { "questionId": "q5", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2527, - "outputTokens": 2, - "latencyMs": 1248 + "inputTokens": 2526, + "outputTokens": 135, + "latencyMs": 2291.4288749999996 }, { "questionId": "q5", @@ -481,18 +481,18 @@ "correct": true, "inputTokens": 2980, "outputTokens": 4, - "latencyMs": 1486 + "latencyMs": 1312.7111250000007 }, { "questionId": "q5", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2381, - "outputTokens": 2, - "latencyMs": 1311 + "inputTokens": 2380, + "outputTokens": 135, + "latencyMs": 1727.6371660000004 }, { "questionId": "q5", @@ -503,18 +503,18 @@ "correct": true, "inputTokens": 2854, "outputTokens": 4, - "latencyMs": 1019 + "latencyMs": 1097.0443749999995 }, { "questionId": "q5", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 6316, - "outputTokens": 2, - "latencyMs": 1287 + "inputTokens": 6315, + "outputTokens": 135, + "latencyMs": 2671.2276250000004 }, { "questionId": "q5", @@ -525,18 +525,18 @@ "correct": true, "inputTokens": 6363, "outputTokens": 4, - "latencyMs": 1243 + "latencyMs": 1174.8639999999996 }, { "questionId": "q5", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 5012, - "outputTokens": 2, - "latencyMs": 1339 + "inputTokens": 5011, + "outputTokens": 71, + "latencyMs": 2306.2642499999993 }, { "questionId": "q5", @@ -547,18 +547,18 @@ "correct": true, "inputTokens": 5758, "outputTokens": 4, - "latencyMs": 1621 + "latencyMs": 2822.8963750000003 }, { "questionId": "q6", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "jayda60@hotmail.com", "actual": "jayda60@hotmail.com", "correct": true, - "inputTokens": 6391, - "outputTokens": 6, - "latencyMs": 1625 + "inputTokens": 6390, + "outputTokens": 139, + "latencyMs": 2827.0400409999993 }, { "questionId": "q6", @@ -569,18 +569,18 @@ "correct": true, "inputTokens": 7871, "outputTokens": 11, - "latencyMs": 1328 + "latencyMs": 1151.7215829999996 }, { "questionId": "q6", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "jayda60@hotmail.com", "actual": "jayda60@hotmail.com", "correct": true, - "inputTokens": 2528, - "outputTokens": 6, - "latencyMs": 1463 + "inputTokens": 2527, + "outputTokens": 75, + "latencyMs": 1714.2902919999997 }, { "questionId": "q6", @@ -591,18 +591,18 @@ "correct": true, "inputTokens": 2983, "outputTokens": 11, - "latencyMs": 1149 + "latencyMs": 1810.6344170000011 }, { "questionId": "q6", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "jayda60@hotmail.com", "actual": "jayda60@hotmail.com", "correct": true, - "inputTokens": 2382, - "outputTokens": 6, - "latencyMs": 1474 + "inputTokens": 2381, + "outputTokens": 75, + "latencyMs": 2548.0390000000007 }, { "questionId": "q6", @@ -613,18 +613,18 @@ "correct": true, "inputTokens": 2857, "outputTokens": 11, - "latencyMs": 977 + "latencyMs": 1046.7650829999993 }, { "questionId": "q6", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "jayda60@hotmail.com", "actual": "jayda60@hotmail.com", "correct": true, - "inputTokens": 6317, - "outputTokens": 6, - "latencyMs": 2079 + "inputTokens": 6316, + "outputTokens": 139, + "latencyMs": 2408.879916000001 }, { "questionId": "q6", @@ -635,18 +635,18 @@ "correct": true, "inputTokens": 6366, "outputTokens": 11, - "latencyMs": 1134 + "latencyMs": 1186.5773750000008 }, { "questionId": "q6", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "jayda60@hotmail.com", "actual": "jayda60@hotmail.com", "correct": true, - "inputTokens": 5013, - "outputTokens": 6, - "latencyMs": 1124 + "inputTokens": 5012, + "outputTokens": 139, + "latencyMs": 3157.9398329999995 }, { "questionId": "q6", @@ -657,18 +657,18 @@ "correct": true, "inputTokens": 5761, "outputTokens": 11, - "latencyMs": 1053 + "latencyMs": 1129.6754170000004 }, { "questionId": "q7", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "92971", "actual": "92971", "correct": true, - "inputTokens": 6391, - "outputTokens": 3, - "latencyMs": 1427 + "inputTokens": 6390, + "outputTokens": 72, + "latencyMs": 2893.3476250000003 }, { "questionId": "q7", @@ -679,18 +679,18 @@ "correct": true, "inputTokens": 7870, "outputTokens": 6, - "latencyMs": 1246 + "latencyMs": 1288.7682919999988 }, { "questionId": "q7", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "92971", "actual": "92971", "correct": true, - "inputTokens": 2528, - "outputTokens": 3, - "latencyMs": 1171 + "inputTokens": 2527, + "outputTokens": 72, + "latencyMs": 2324.6738330000007 }, { "questionId": "q7", @@ -701,18 +701,18 @@ "correct": true, "inputTokens": 2982, "outputTokens": 6, - "latencyMs": 1547 + "latencyMs": 1095.704291 }, { "questionId": "q7", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "92971", "actual": "92971", "correct": true, - "inputTokens": 2382, - "outputTokens": 3, - "latencyMs": 1523 + "inputTokens": 2381, + "outputTokens": 136, + "latencyMs": 3980.3727500000005 }, { "questionId": "q7", @@ -723,18 +723,18 @@ "correct": true, "inputTokens": 2856, "outputTokens": 6, - "latencyMs": 1148 + "latencyMs": 1122.8730419999993 }, { "questionId": "q7", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "92971", "actual": "92971", "correct": true, - "inputTokens": 6317, - "outputTokens": 3, - "latencyMs": 1360 + "inputTokens": 6316, + "outputTokens": 72, + "latencyMs": 2030.0818330000002 }, { "questionId": "q7", @@ -745,18 +745,18 @@ "correct": true, "inputTokens": 6365, "outputTokens": 6, - "latencyMs": 1100 + "latencyMs": 1705.6364999999987 }, { "questionId": "q7", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "92971", "actual": "92971", "correct": true, - "inputTokens": 5013, - "outputTokens": 3, - "latencyMs": 1116 + "inputTokens": 5012, + "outputTokens": 72, + "latencyMs": 1611.3567500000008 }, { "questionId": "q7", @@ -767,18 +767,18 @@ "correct": true, "inputTokens": 5760, "outputTokens": 6, - "latencyMs": 1202 + "latencyMs": 1109.0094590000008 }, { "questionId": "q8", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", - "actual": "Operations", - "correct": false, - "inputTokens": 6391, - "outputTokens": 2, - "latencyMs": 974 + "actual": "Marketing", + "correct": true, + "inputTokens": 6390, + "outputTokens": 199, + "latencyMs": 3099.078125 }, { "questionId": "q8", @@ -789,18 +789,18 @@ "correct": true, "inputTokens": 7871, "outputTokens": 4, - "latencyMs": 1357 + "latencyMs": 1115.9911250000005 }, { "questionId": "q8", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2528, - "outputTokens": 2, - "latencyMs": 1107 + "inputTokens": 2527, + "outputTokens": 135, + "latencyMs": 2833.193875000001 }, { "questionId": "q8", @@ -811,18 +811,18 @@ "correct": true, "inputTokens": 2983, "outputTokens": 4, - "latencyMs": 1126 + "latencyMs": 933.1444169999995 }, { "questionId": "q8", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2382, - "outputTokens": 2, - "latencyMs": 1124 + "inputTokens": 2381, + "outputTokens": 199, + "latencyMs": 2315.536 }, { "questionId": "q8", @@ -833,18 +833,18 @@ "correct": true, "inputTokens": 2857, "outputTokens": 4, - "latencyMs": 1208 + "latencyMs": 1300.336792 }, { "questionId": "q8", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", - "actual": "Operations", - "correct": false, - "inputTokens": 6317, - "outputTokens": 2, - "latencyMs": 1463 + "actual": "Marketing", + "correct": true, + "inputTokens": 6316, + "outputTokens": 135, + "latencyMs": 7016.997917000002 }, { "questionId": "q8", @@ -855,18 +855,18 @@ "correct": true, "inputTokens": 6366, "outputTokens": 4, - "latencyMs": 1175 + "latencyMs": 1288.107333 }, { "questionId": "q8", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 5013, - "outputTokens": 2, - "latencyMs": 1952 + "inputTokens": 5012, + "outputTokens": 135, + "latencyMs": 2474.8247499999998 }, { "questionId": "q8", @@ -877,128 +877,128 @@ "correct": true, "inputTokens": 5761, "outputTokens": 4, - "latencyMs": 1271 + "latencyMs": 1027.9775420000005 }, { "questionId": "q9", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "terrance.hansen@yahoo.com", "actual": "terrance.hansen@yahoo.com", "correct": true, - "inputTokens": 6393, - "outputTokens": 7, - "latencyMs": 1301 - }, - { - "questionId": "q9", - "format": "json", - "model": "claude-haiku-4-5", - "expected": "terrance.hansen@yahoo.com", - "actual": "terrance.hansen@yahoo.com", - "correct": true, - "inputTokens": 7871, - "outputTokens": 11, - "latencyMs": 1371 - }, - { - "questionId": "q9", - "format": "toon", - "model": "gpt-4o-mini", - "expected": "terrance.hansen@yahoo.com", - "actual": "terrance.hansen@yahoo.com", - "correct": true, - "inputTokens": 2530, - "outputTokens": 7, - "latencyMs": 1197 - }, - { - "questionId": "q9", - "format": "toon", - "model": "claude-haiku-4-5", - "expected": "terrance.hansen@yahoo.com", - "actual": "terrance.hansen@yahoo.com", - "correct": true, - "inputTokens": 2983, - "outputTokens": 11, - "latencyMs": 1088 - }, - { - "questionId": "q9", - "format": "csv", - "model": "gpt-4o-mini", - "expected": "terrance.hansen@yahoo.com", - "actual": "terrance.hansen@yahoo.com", - "correct": true, - "inputTokens": 2384, - "outputTokens": 7, - "latencyMs": 1310 - }, - { - "questionId": "q9", - "format": "csv", - "model": "claude-haiku-4-5", - "expected": "terrance.hansen@yahoo.com", - "actual": "terrance.hansen@yahoo.com", - "correct": true, - "inputTokens": 2857, - "outputTokens": 11, - "latencyMs": 1300 - }, - { - "questionId": "q9", - "format": "markdown-kv", - "model": "gpt-4o-mini", - "expected": "terrance.hansen@yahoo.com", - "actual": "terrance.hansen@yahoo.com", - "correct": true, - "inputTokens": 6319, - "outputTokens": 7, - "latencyMs": 1531 - }, - { - "questionId": "q9", - "format": "markdown-kv", - "model": "claude-haiku-4-5", - "expected": "terrance.hansen@yahoo.com", - "actual": "terrance.hansen@yahoo.com", - "correct": true, - "inputTokens": 6366, - "outputTokens": 11, - "latencyMs": 1275 - }, - { - "questionId": "q9", - "format": "yaml", - "model": "gpt-4o-mini", - "expected": "terrance.hansen@yahoo.com", - "actual": "terrence.hansen@yahoo.com", - "correct": false, - "inputTokens": 5015, - "outputTokens": 7, - "latencyMs": 1245 - }, - { - "questionId": "q9", - "format": "yaml", - "model": "claude-haiku-4-5", - "expected": "terrance.hansen@yahoo.com", - "actual": "terrance.hansen@yahoo.com", - "correct": true, - "inputTokens": 5761, - "outputTokens": 11, - "latencyMs": 1215 - }, - { - "questionId": "q10", - "format": "json", - "model": "gpt-4o-mini", - "expected": "107744", - "actual": "107744", - "correct": true, "inputTokens": 6392, - "outputTokens": 3, - "latencyMs": 4959 + "outputTokens": 652, + "latencyMs": 8322.172416 + }, + { + "questionId": "q9", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrance.hansen@yahoo.com", + "correct": true, + "inputTokens": 7871, + "outputTokens": 11, + "latencyMs": 1066.3422090000004 + }, + { + "questionId": "q9", + "format": "toon", + "model": "gpt-5-nano", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrance.hansen@yahoo.com", + "correct": true, + "inputTokens": 2529, + "outputTokens": 76, + "latencyMs": 2245.5604999999996 + }, + { + "questionId": "q9", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrance.hansen@yahoo.com", + "correct": true, + "inputTokens": 2983, + "outputTokens": 11, + "latencyMs": 1179.7512079999997 + }, + { + "questionId": "q9", + "format": "csv", + "model": "gpt-5-nano", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrance.hansen@yahoo.com", + "correct": true, + "inputTokens": 2383, + "outputTokens": 204, + "latencyMs": 2584.0723340000004 + }, + { + "questionId": "q9", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrance.hansen@yahoo.com", + "correct": true, + "inputTokens": 2857, + "outputTokens": 11, + "latencyMs": 1204.6979589999992 + }, + { + "questionId": "q9", + "format": "markdown-kv", + "model": "gpt-5-nano", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrance.hansen@yahoo.com", + "correct": true, + "inputTokens": 6318, + "outputTokens": 396, + "latencyMs": 3824.918375000001 + }, + { + "questionId": "q9", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrance.hansen@yahoo.com", + "correct": true, + "inputTokens": 6366, + "outputTokens": 11, + "latencyMs": 1492.6765830000004 + }, + { + "questionId": "q9", + "format": "yaml", + "model": "gpt-5-nano", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrance.hansen@yahoo.com", + "correct": true, + "inputTokens": 5014, + "outputTokens": 76, + "latencyMs": 1834.562 + }, + { + "questionId": "q9", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "terrance.hansen@yahoo.com", + "actual": "terrance.hansen@yahoo.com", + "correct": true, + "inputTokens": 5761, + "outputTokens": 11, + "latencyMs": 1245.0000419999997 + }, + { + "questionId": "q10", + "format": "json", + "model": "gpt-5-nano", + "expected": "107744", + "actual": "107744", + "correct": true, + "inputTokens": 6391, + "outputTokens": 136, + "latencyMs": 2337.0652499999997 }, { "questionId": "q10", @@ -1009,18 +1009,18 @@ "correct": true, "inputTokens": 7870, "outputTokens": 6, - "latencyMs": 1269 + "latencyMs": 1148.1971250000006 }, { "questionId": "q10", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "107744", "actual": "107744", "correct": true, - "inputTokens": 2529, - "outputTokens": 3, - "latencyMs": 1111 + "inputTokens": 2528, + "outputTokens": 72, + "latencyMs": 2736.2375420000008 }, { "questionId": "q10", @@ -1031,18 +1031,18 @@ "correct": true, "inputTokens": 2982, "outputTokens": 6, - "latencyMs": 1254 + "latencyMs": 1164.4291250000006 }, { "questionId": "q10", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "107744", "actual": "107744", "correct": true, - "inputTokens": 2383, - "outputTokens": 3, - "latencyMs": 1616 + "inputTokens": 2382, + "outputTokens": 72, + "latencyMs": 2479.8535840000004 }, { "questionId": "q10", @@ -1053,18 +1053,18 @@ "correct": true, "inputTokens": 2856, "outputTokens": 6, - "latencyMs": 1123 + "latencyMs": 1032.3198329999996 }, { "questionId": "q10", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "107744", "actual": "107744", "correct": true, - "inputTokens": 6318, - "outputTokens": 3, - "latencyMs": 1201 + "inputTokens": 6317, + "outputTokens": 136, + "latencyMs": 2237.465583000001 }, { "questionId": "q10", @@ -1075,18 +1075,18 @@ "correct": true, "inputTokens": 6365, "outputTokens": 6, - "latencyMs": 1371 + "latencyMs": 1254.3189160000002 }, { "questionId": "q10", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "107744", "actual": "107744", "correct": true, - "inputTokens": 5014, - "outputTokens": 3, - "latencyMs": 1503 + "inputTokens": 5013, + "outputTokens": 72, + "latencyMs": 3753.917125 }, { "questionId": "q10", @@ -1097,18 +1097,18 @@ "correct": true, "inputTokens": 5760, "outputTokens": 6, - "latencyMs": 1249 + "latencyMs": 1154.7003750000003 }, { "questionId": "q11", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 6391, - "outputTokens": 2, - "latencyMs": 1383 + "inputTokens": 6390, + "outputTokens": 135, + "latencyMs": 2621.2275420000005 }, { "questionId": "q11", @@ -1119,18 +1119,18 @@ "correct": true, "inputTokens": 7869, "outputTokens": 4, - "latencyMs": 1081 + "latencyMs": 1222.843499999999 }, { "questionId": "q11", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2528, - "outputTokens": 2, - "latencyMs": 1677 + "inputTokens": 2527, + "outputTokens": 71, + "latencyMs": 1762.1339159999989 }, { "questionId": "q11", @@ -1141,18 +1141,18 @@ "correct": true, "inputTokens": 2981, "outputTokens": 4, - "latencyMs": 1072 + "latencyMs": 1630.7307079999991 }, { "questionId": "q11", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2382, - "outputTokens": 2, - "latencyMs": 1142 + "inputTokens": 2381, + "outputTokens": 71, + "latencyMs": 1848.9775829999999 }, { "questionId": "q11", @@ -1163,18 +1163,18 @@ "correct": true, "inputTokens": 2855, "outputTokens": 4, - "latencyMs": 991 + "latencyMs": 1080.8682500000014 }, { "questionId": "q11", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 6317, - "outputTokens": 2, - "latencyMs": 1339 + "inputTokens": 6316, + "outputTokens": 135, + "latencyMs": 26303.357959 }, { "questionId": "q11", @@ -1185,18 +1185,18 @@ "correct": true, "inputTokens": 6364, "outputTokens": 4, - "latencyMs": 1117 + "latencyMs": 1354.007999999998 }, { "questionId": "q11", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 5013, - "outputTokens": 2, - "latencyMs": 2483 + "inputTokens": 5012, + "outputTokens": 71, + "latencyMs": 1924.4625829999986 }, { "questionId": "q11", @@ -1207,18 +1207,18 @@ "correct": true, "inputTokens": 5759, "outputTokens": 4, - "latencyMs": 1187 + "latencyMs": 1279.5235830000001 }, { "questionId": "q12", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "allan21@gmail.com", "actual": "allan21@gmail.com", "correct": true, - "inputTokens": 6390, - "outputTokens": 5, - "latencyMs": 1827 + "inputTokens": 6389, + "outputTokens": 330, + "latencyMs": 3997.3972079999985 }, { "questionId": "q12", @@ -1229,18 +1229,18 @@ "correct": true, "inputTokens": 7867, "outputTokens": 9, - "latencyMs": 1121 + "latencyMs": 1153.9412079999984 }, { "questionId": "q12", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "allan21@gmail.com", "actual": "allan21@gmail.com", "correct": true, - "inputTokens": 2527, - "outputTokens": 5, - "latencyMs": 1373 + "inputTokens": 2526, + "outputTokens": 138, + "latencyMs": 2494.580582999999 }, { "questionId": "q12", @@ -1251,18 +1251,18 @@ "correct": true, "inputTokens": 2979, "outputTokens": 9, - "latencyMs": 1284 + "latencyMs": 1350.1353750000017 }, { "questionId": "q12", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "allan21@gmail.com", "actual": "allan21@gmail.com", "correct": true, - "inputTokens": 2381, - "outputTokens": 5, - "latencyMs": 1751 + "inputTokens": 2380, + "outputTokens": 138, + "latencyMs": 3024.4009160000023 }, { "questionId": "q12", @@ -1273,18 +1273,18 @@ "correct": true, "inputTokens": 2853, "outputTokens": 9, - "latencyMs": 1140 + "latencyMs": 1199.3955830000014 }, { "questionId": "q12", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "allan21@gmail.com", "actual": "allan21@gmail.com", "correct": true, - "inputTokens": 6316, - "outputTokens": 5, - "latencyMs": 1624 + "inputTokens": 6315, + "outputTokens": 138, + "latencyMs": 5168.116582999999 }, { "questionId": "q12", @@ -1295,18 +1295,18 @@ "correct": true, "inputTokens": 6362, "outputTokens": 9, - "latencyMs": 1071 + "latencyMs": 1198.3554160000022 }, { "questionId": "q12", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "allan21@gmail.com", "actual": "allan21@gmail.com", "correct": true, - "inputTokens": 5012, - "outputTokens": 5, - "latencyMs": 1970 + "inputTokens": 5011, + "outputTokens": 74, + "latencyMs": 2632.998958999997 }, { "questionId": "q12", @@ -1317,216 +1317,216 @@ "correct": true, "inputTokens": 5757, "outputTokens": 9, - "latencyMs": 1437 + "latencyMs": 1124.5625419999997 }, { "questionId": "q13", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "145843", "actual": "145843", "correct": true, + "inputTokens": 6388, + "outputTokens": 72, + "latencyMs": 2357.2276249999995 + }, + { + "questionId": "q13", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 7868, + "outputTokens": 6, + "latencyMs": 1267.960791999998 + }, + { + "questionId": "q13", + "format": "toon", + "model": "gpt-5-nano", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 2525, + "outputTokens": 136, + "latencyMs": 2397.798125000001 + }, + { + "questionId": "q13", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 2980, + "outputTokens": 6, + "latencyMs": 1170.6429580000004 + }, + { + "questionId": "q13", + "format": "csv", + "model": "gpt-5-nano", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 2379, + "outputTokens": 136, + "latencyMs": 3227.198124999999 + }, + { + "questionId": "q13", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 2854, + "outputTokens": 6, + "latencyMs": 1112.6066250000003 + }, + { + "questionId": "q13", + "format": "markdown-kv", + "model": "gpt-5-nano", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 6314, + "outputTokens": 72, + "latencyMs": 2036.251791999999 + }, + { + "questionId": "q13", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 6363, + "outputTokens": 6, + "latencyMs": 1290.7641250000015 + }, + { + "questionId": "q13", + "format": "yaml", + "model": "gpt-5-nano", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 5010, + "outputTokens": 72, + "latencyMs": 2262.8405840000014 + }, + { + "questionId": "q13", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "145843", + "actual": "145843", + "correct": true, + "inputTokens": 5758, + "outputTokens": 6, + "latencyMs": 1193.2695419999982 + }, + { + "questionId": "q14", + "format": "json", + "model": "gpt-5-nano", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, "inputTokens": 6389, - "outputTokens": 3, - "latencyMs": 1263 + "outputTokens": 71, + "latencyMs": 3198.2654159999984 }, { - "questionId": "q13", + "questionId": "q14", "format": "json", "model": "claude-haiku-4-5", - "expected": "145843", - "actual": "145843", + "expected": "Marketing", + "actual": "Marketing", "correct": true, "inputTokens": 7868, - "outputTokens": 6, - "latencyMs": 1277 + "outputTokens": 4, + "latencyMs": 1229.8644999999997 }, { - "questionId": "q13", + "questionId": "q14", "format": "toon", - "model": "gpt-4o-mini", - "expected": "145843", - "actual": "145843", + "model": "gpt-5-nano", + "expected": "Marketing", + "actual": "Marketing", "correct": true, "inputTokens": 2526, - "outputTokens": 3, - "latencyMs": 1151 + "outputTokens": 71, + "latencyMs": 3293.710084000002 }, { - "questionId": "q13", + "questionId": "q14", "format": "toon", "model": "claude-haiku-4-5", - "expected": "145843", - "actual": "145843", + "expected": "Marketing", + "actual": "Marketing", "correct": true, "inputTokens": 2980, - "outputTokens": 6, - "latencyMs": 1260 + "outputTokens": 4, + "latencyMs": 1121.200334000001 }, { - "questionId": "q13", + "questionId": "q14", "format": "csv", - "model": "gpt-4o-mini", - "expected": "145843", - "actual": "145843", + "model": "gpt-5-nano", + "expected": "Marketing", + "actual": "Marketing", "correct": true, "inputTokens": 2380, - "outputTokens": 3, - "latencyMs": 1071 + "outputTokens": 71, + "latencyMs": 2497.4451249999984 }, { - "questionId": "q13", + "questionId": "q14", "format": "csv", "model": "claude-haiku-4-5", - "expected": "145843", - "actual": "145843", + "expected": "Marketing", + "actual": "Marketing", "correct": true, "inputTokens": 2854, - "outputTokens": 6, - "latencyMs": 891 + "outputTokens": 4, + "latencyMs": 1152.0107500000013 }, { - "questionId": "q13", + "questionId": "q14", "format": "markdown-kv", - "model": "gpt-4o-mini", - "expected": "145843", - "actual": "145843", + "model": "gpt-5-nano", + "expected": "Marketing", + "actual": "Marketing", "correct": true, "inputTokens": 6315, - "outputTokens": 3, - "latencyMs": 1548 + "outputTokens": 71, + "latencyMs": 3547.6399999999994 }, { - "questionId": "q13", + "questionId": "q14", "format": "markdown-kv", "model": "claude-haiku-4-5", - "expected": "145843", - "actual": "145843", + "expected": "Marketing", + "actual": "Marketing", "correct": true, "inputTokens": 6363, - "outputTokens": 6, - "latencyMs": 1456 + "outputTokens": 4, + "latencyMs": 2007.6731249999975 }, { - "questionId": "q13", + "questionId": "q14", "format": "yaml", - "model": "gpt-4o-mini", - "expected": "145843", - "actual": "145843", + "model": "gpt-5-nano", + "expected": "Marketing", + "actual": "Marketing", "correct": true, "inputTokens": 5011, - "outputTokens": 3, - "latencyMs": 1268 - }, - { - "questionId": "q13", - "format": "yaml", - "model": "claude-haiku-4-5", - "expected": "145843", - "actual": "145843", - "correct": true, - "inputTokens": 5758, - "outputTokens": 6, - "latencyMs": 1205 - }, - { - "questionId": "q14", - "format": "json", - "model": "gpt-4o-mini", - "expected": "Marketing", - "actual": "Marketing", - "correct": true, - "inputTokens": 6390, - "outputTokens": 2, - "latencyMs": 1310 - }, - { - "questionId": "q14", - "format": "json", - "model": "claude-haiku-4-5", - "expected": "Marketing", - "actual": "Marketing", - "correct": true, - "inputTokens": 7868, - "outputTokens": 4, - "latencyMs": 1071 - }, - { - "questionId": "q14", - "format": "toon", - "model": "gpt-4o-mini", - "expected": "Marketing", - "actual": "Marketing", - "correct": true, - "inputTokens": 2527, - "outputTokens": 2, - "latencyMs": 895 - }, - { - "questionId": "q14", - "format": "toon", - "model": "claude-haiku-4-5", - "expected": "Marketing", - "actual": "Marketing", - "correct": true, - "inputTokens": 2980, - "outputTokens": 4, - "latencyMs": 1020 - }, - { - "questionId": "q14", - "format": "csv", - "model": "gpt-4o-mini", - "expected": "Marketing", - "actual": "Marketing", - "correct": true, - "inputTokens": 2381, - "outputTokens": 2, - "latencyMs": 1168 - }, - { - "questionId": "q14", - "format": "csv", - "model": "claude-haiku-4-5", - "expected": "Marketing", - "actual": "Marketing", - "correct": true, - "inputTokens": 2854, - "outputTokens": 4, - "latencyMs": 977 - }, - { - "questionId": "q14", - "format": "markdown-kv", - "model": "gpt-4o-mini", - "expected": "Marketing", - "actual": "Operations", - "correct": false, - "inputTokens": 6316, - "outputTokens": 2, - "latencyMs": 1370 - }, - { - "questionId": "q14", - "format": "markdown-kv", - "model": "claude-haiku-4-5", - "expected": "Marketing", - "actual": "Marketing", - "correct": true, - "inputTokens": 6363, - "outputTokens": 4, - "latencyMs": 1508 - }, - { - "questionId": "q14", - "format": "yaml", - "model": "gpt-4o-mini", - "expected": "Marketing", - "actual": "Marketing", - "correct": true, - "inputTokens": 5012, - "outputTokens": 2, - "latencyMs": 3622 + "outputTokens": 71, + "latencyMs": 7054.295208 }, { "questionId": "q14", @@ -1537,18 +1537,18 @@ "correct": true, "inputTokens": 5758, "outputTokens": 4, - "latencyMs": 1249 + "latencyMs": 1230.5032920000012 }, { "questionId": "q15", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "alexandria61@gmail.com", "actual": "alexandria61@gmail.com", "correct": true, - "inputTokens": 6391, - "outputTokens": 7, - "latencyMs": 3269 + "inputTokens": 6390, + "outputTokens": 76, + "latencyMs": 2049.933416 }, { "questionId": "q15", @@ -1559,18 +1559,18 @@ "correct": true, "inputTokens": 7869, "outputTokens": 9, - "latencyMs": 1538 + "latencyMs": 1217.1906249999993 }, { "questionId": "q15", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "alexandria61@gmail.com", "actual": "alexandria61@gmail.com", "correct": true, - "inputTokens": 2528, - "outputTokens": 7, - "latencyMs": 1413 + "inputTokens": 2527, + "outputTokens": 204, + "latencyMs": 2844.136208 }, { "questionId": "q15", @@ -1581,18 +1581,18 @@ "correct": true, "inputTokens": 2981, "outputTokens": 9, - "latencyMs": 1027 + "latencyMs": 2166.8829589999987 }, { "questionId": "q15", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "alexandria61@gmail.com", "actual": "alexandria61@gmail.com", "correct": true, - "inputTokens": 2382, - "outputTokens": 7, - "latencyMs": 1257 + "inputTokens": 2381, + "outputTokens": 204, + "latencyMs": 2726.5934579999994 }, { "questionId": "q15", @@ -1603,18 +1603,18 @@ "correct": true, "inputTokens": 2855, "outputTokens": 9, - "latencyMs": 1169 + "latencyMs": 1107.4675410000018 }, { "questionId": "q15", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "alexandria61@gmail.com", "actual": "alexandria61@gmail.com", "correct": true, - "inputTokens": 6317, - "outputTokens": 7, - "latencyMs": 1464 + "inputTokens": 6316, + "outputTokens": 76, + "latencyMs": 2260.4548749999994 }, { "questionId": "q15", @@ -1625,18 +1625,18 @@ "correct": true, "inputTokens": 6364, "outputTokens": 9, - "latencyMs": 1799 + "latencyMs": 1257.2797080000018 }, { "questionId": "q15", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "alexandria61@gmail.com", "actual": "alexandria61@gmail.com", "correct": true, - "inputTokens": 5013, - "outputTokens": 7, - "latencyMs": 1616 + "inputTokens": 5012, + "outputTokens": 140, + "latencyMs": 2565.571791999999 }, { "questionId": "q15", @@ -1647,18 +1647,18 @@ "correct": true, "inputTokens": 5759, "outputTokens": 9, - "latencyMs": 1349 + "latencyMs": 1255.2880829999995 }, { "questionId": "q16", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "89436", "actual": "89436", "correct": true, - "inputTokens": 6390, - "outputTokens": 3, - "latencyMs": 1298 + "inputTokens": 6389, + "outputTokens": 136, + "latencyMs": 2595.422042000002 }, { "questionId": "q16", @@ -1669,18 +1669,18 @@ "correct": true, "inputTokens": 7870, "outputTokens": 6, - "latencyMs": 1115 + "latencyMs": 1090.4299170000013 }, { "questionId": "q16", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "89436", "actual": "89436", "correct": true, - "inputTokens": 2527, - "outputTokens": 3, - "latencyMs": 1180 + "inputTokens": 2526, + "outputTokens": 72, + "latencyMs": 2985.3881250000013 }, { "questionId": "q16", @@ -1691,18 +1691,18 @@ "correct": true, "inputTokens": 2982, "outputTokens": 6, - "latencyMs": 1110 + "latencyMs": 1521.227415999998 }, { "questionId": "q16", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "89436", "actual": "89436", "correct": true, - "inputTokens": 2381, - "outputTokens": 3, - "latencyMs": 1235 + "inputTokens": 2380, + "outputTokens": 72, + "latencyMs": 2918.142082999999 }, { "questionId": "q16", @@ -1713,18 +1713,18 @@ "correct": true, "inputTokens": 2856, "outputTokens": 6, - "latencyMs": 1228 + "latencyMs": 1049.085916 }, { "questionId": "q16", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "89436", "actual": "89436", "correct": true, - "inputTokens": 6316, - "outputTokens": 3, - "latencyMs": 1832 + "inputTokens": 6315, + "outputTokens": 136, + "latencyMs": 2414.9711669999997 }, { "questionId": "q16", @@ -1735,18 +1735,18 @@ "correct": true, "inputTokens": 6365, "outputTokens": 6, - "latencyMs": 1401 + "latencyMs": 1178.0064170000005 }, { "questionId": "q16", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "89436", "actual": "89436", "correct": true, - "inputTokens": 5012, - "outputTokens": 3, - "latencyMs": 933 + "inputTokens": 5011, + "outputTokens": 72, + "latencyMs": 1772.788625000001 }, { "questionId": "q16", @@ -1757,18 +1757,18 @@ "correct": true, "inputTokens": 5760, "outputTokens": 6, - "latencyMs": 1570 + "latencyMs": 1134.7022499999985 }, { "questionId": "q17", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 6393, - "outputTokens": 2, - "latencyMs": 1221 + "inputTokens": 6392, + "outputTokens": 135, + "latencyMs": 2528.6098330000023 }, { "questionId": "q17", @@ -1779,18 +1779,18 @@ "correct": true, "inputTokens": 7872, "outputTokens": 4, - "latencyMs": 1293 + "latencyMs": 1353.3026250000003 }, { "questionId": "q17", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2530, - "outputTokens": 2, - "latencyMs": 1147 + "inputTokens": 2529, + "outputTokens": 71, + "latencyMs": 2286.120999999999 }, { "questionId": "q17", @@ -1801,18 +1801,18 @@ "correct": true, "inputTokens": 2984, "outputTokens": 4, - "latencyMs": 923 + "latencyMs": 961.078292000002 }, { "questionId": "q17", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2384, - "outputTokens": 2, - "latencyMs": 1180 + "inputTokens": 2383, + "outputTokens": 71, + "latencyMs": 3445.204249999999 }, { "questionId": "q17", @@ -1823,18 +1823,18 @@ "correct": true, "inputTokens": 2858, "outputTokens": 4, - "latencyMs": 1025 + "latencyMs": 1003.445125000002 }, { "questionId": "q17", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 6319, - "outputTokens": 2, - "latencyMs": 1748 + "inputTokens": 6318, + "outputTokens": 135, + "latencyMs": 2696.166874999999 }, { "questionId": "q17", @@ -1845,18 +1845,18 @@ "correct": true, "inputTokens": 6367, "outputTokens": 4, - "latencyMs": 1188 + "latencyMs": 1063.340791999999 }, { "questionId": "q17", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 5015, - "outputTokens": 2, - "latencyMs": 1452 + "inputTokens": 5014, + "outputTokens": 135, + "latencyMs": 3367.6109579999975 }, { "questionId": "q17", @@ -1867,18 +1867,18 @@ "correct": true, "inputTokens": 5762, "outputTokens": 4, - "latencyMs": 1329 + "latencyMs": 1322.4013339999983 }, { "questionId": "q18", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "kelvin54@yahoo.com", "actual": "kelvin54@yahoo.com", "correct": true, - "inputTokens": 6391, - "outputTokens": 6, - "latencyMs": 768 + "inputTokens": 6390, + "outputTokens": 139, + "latencyMs": 2745.6627499999995 }, { "questionId": "q18", @@ -1889,18 +1889,18 @@ "correct": true, "inputTokens": 7871, "outputTokens": 10, - "latencyMs": 1150 + "latencyMs": 1312.9286670000001 }, { "questionId": "q18", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "kelvin54@yahoo.com", "actual": "kelvin54@yahoo.com", "correct": true, - "inputTokens": 2528, - "outputTokens": 6, - "latencyMs": 1501 + "inputTokens": 2527, + "outputTokens": 1483, + "latencyMs": 13678.859999999997 }, { "questionId": "q18", @@ -1911,18 +1911,18 @@ "correct": true, "inputTokens": 2983, "outputTokens": 10, - "latencyMs": 1201 + "latencyMs": 1030.3843339999985 }, { "questionId": "q18", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "kelvin54@yahoo.com", "actual": "kelvin54@yahoo.com", "correct": true, - "inputTokens": 2382, - "outputTokens": 6, - "latencyMs": 1604 + "inputTokens": 2381, + "outputTokens": 139, + "latencyMs": 2223.2737909999996 }, { "questionId": "q18", @@ -1933,18 +1933,18 @@ "correct": true, "inputTokens": 2857, "outputTokens": 10, - "latencyMs": 1060 + "latencyMs": 1224.2647080000024 }, { "questionId": "q18", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "kelvin54@yahoo.com", "actual": "kelvin54@yahoo.com", "correct": true, - "inputTokens": 6317, - "outputTokens": 6, - "latencyMs": 1350 + "inputTokens": 6316, + "outputTokens": 139, + "latencyMs": 3198.8672499999993 }, { "questionId": "q18", @@ -1955,18 +1955,18 @@ "correct": true, "inputTokens": 6366, "outputTokens": 10, - "latencyMs": 1154 + "latencyMs": 1234.557084 }, { "questionId": "q18", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "kelvin54@yahoo.com", "actual": "kelvin54@yahoo.com", "correct": true, - "inputTokens": 5013, - "outputTokens": 6, - "latencyMs": 1199 + "inputTokens": 5012, + "outputTokens": 139, + "latencyMs": 2861.692708999999 }, { "questionId": "q18", @@ -1977,18 +1977,18 @@ "correct": true, "inputTokens": 5761, "outputTokens": 10, - "latencyMs": 1216 + "latencyMs": 1284.2591250000005 }, { "questionId": "q19", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "143365", "actual": "143365", "correct": true, - "inputTokens": 6391, - "outputTokens": 3, - "latencyMs": 1412 + "inputTokens": 6390, + "outputTokens": 136, + "latencyMs": 2741.803499999998 }, { "questionId": "q19", @@ -1999,18 +1999,18 @@ "correct": true, "inputTokens": 7872, "outputTokens": 6, - "latencyMs": 1908 + "latencyMs": 1096.6906249999993 }, { "questionId": "q19", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "143365", "actual": "143365", "correct": true, - "inputTokens": 2528, - "outputTokens": 3, - "latencyMs": 1366 + "inputTokens": 2527, + "outputTokens": 136, + "latencyMs": 3692.904416999998 }, { "questionId": "q19", @@ -2021,18 +2021,18 @@ "correct": true, "inputTokens": 2984, "outputTokens": 6, - "latencyMs": 1054 + "latencyMs": 1516.7794159999976 }, { "questionId": "q19", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "143365", "actual": "143365", "correct": true, - "inputTokens": 2382, - "outputTokens": 3, - "latencyMs": 1121 + "inputTokens": 2381, + "outputTokens": 392, + "latencyMs": 5068.4152909999975 }, { "questionId": "q19", @@ -2043,18 +2043,18 @@ "correct": true, "inputTokens": 2858, "outputTokens": 6, - "latencyMs": 1262 + "latencyMs": 1356.2728330000027 }, { "questionId": "q19", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "143365", "actual": "143365", "correct": true, - "inputTokens": 6317, - "outputTokens": 3, - "latencyMs": 4632 + "inputTokens": 6316, + "outputTokens": 136, + "latencyMs": 2866.8642500000024 }, { "questionId": "q19", @@ -2065,18 +2065,18 @@ "correct": true, "inputTokens": 6367, "outputTokens": 6, - "latencyMs": 1118 + "latencyMs": 1462.041624999998 }, { "questionId": "q19", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "143365", "actual": "143365", "correct": true, - "inputTokens": 5013, - "outputTokens": 3, - "latencyMs": 928 + "inputTokens": 5012, + "outputTokens": 72, + "latencyMs": 2320.320083999999 }, { "questionId": "q19", @@ -2087,18 +2087,18 @@ "correct": true, "inputTokens": 5762, "outputTokens": 6, - "latencyMs": 1191 + "latencyMs": 1082.976666999999 }, { "questionId": "q20", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 6390, - "outputTokens": 2, - "latencyMs": 1053 + "inputTokens": 6389, + "outputTokens": 7, + "latencyMs": 2427.6330409999973 }, { "questionId": "q20", @@ -2109,18 +2109,18 @@ "correct": true, "inputTokens": 7868, "outputTokens": 4, - "latencyMs": 1096 + "latencyMs": 1108.7309170000008 }, { "questionId": "q20", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2527, - "outputTokens": 2, - "latencyMs": 1784 + "inputTokens": 2526, + "outputTokens": 71, + "latencyMs": 4405.948458000003 }, { "questionId": "q20", @@ -2131,18 +2131,18 @@ "correct": true, "inputTokens": 2980, "outputTokens": 4, - "latencyMs": 1093 + "latencyMs": 1235.6647919999996 }, { "questionId": "q20", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2381, - "outputTokens": 2, - "latencyMs": 1335 + "inputTokens": 2380, + "outputTokens": 71, + "latencyMs": 2528.553082999999 }, { "questionId": "q20", @@ -2153,18 +2153,18 @@ "correct": true, "inputTokens": 2854, "outputTokens": 4, - "latencyMs": 1546 + "latencyMs": 974.1328329999997 }, { "questionId": "q20", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 6316, - "outputTokens": 2, - "latencyMs": 1293 + "inputTokens": 6315, + "outputTokens": 135, + "latencyMs": 2243.1775420000013 }, { "questionId": "q20", @@ -2175,18 +2175,18 @@ "correct": true, "inputTokens": 6363, "outputTokens": 4, - "latencyMs": 1230 + "latencyMs": 2416.867124999997 }, { "questionId": "q20", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 5012, - "outputTokens": 2, - "latencyMs": 1467 + "inputTokens": 5011, + "outputTokens": 135, + "latencyMs": 2429.5548750000016 }, { "questionId": "q20", @@ -2197,18 +2197,18 @@ "correct": true, "inputTokens": 5758, "outputTokens": 4, - "latencyMs": 1370 + "latencyMs": 1257.326083 }, { "questionId": "q21", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "dean19@gmail.com", "actual": "dean19@gmail.com", "correct": true, - "inputTokens": 6394, - "outputTokens": 6, - "latencyMs": 5026 + "inputTokens": 6393, + "outputTokens": 203, + "latencyMs": 4366.677041999996 }, { "questionId": "q21", @@ -2219,18 +2219,18 @@ "correct": true, "inputTokens": 7876, "outputTokens": 9, - "latencyMs": 1786 + "latencyMs": 1410.3295419999995 }, { "questionId": "q21", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "dean19@gmail.com", "actual": "dean19@gmail.com", "correct": true, - "inputTokens": 2531, - "outputTokens": 6, - "latencyMs": 826 + "inputTokens": 2530, + "outputTokens": 75, + "latencyMs": 2834.2883330000004 }, { "questionId": "q21", @@ -2241,18 +2241,18 @@ "correct": true, "inputTokens": 2988, "outputTokens": 9, - "latencyMs": 909 + "latencyMs": 1023.437750000001 }, { "questionId": "q21", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "dean19@gmail.com", "actual": "dean19@gmail.com", "correct": true, - "inputTokens": 2385, - "outputTokens": 6, - "latencyMs": 1120 + "inputTokens": 2384, + "outputTokens": 139, + "latencyMs": 3091.7722909999975 }, { "questionId": "q21", @@ -2263,18 +2263,18 @@ "correct": true, "inputTokens": 2862, "outputTokens": 9, - "latencyMs": 996 + "latencyMs": 1910.5562920000011 }, { "questionId": "q21", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "dean19@gmail.com", "actual": "dean19@gmail.com", "correct": true, - "inputTokens": 6320, - "outputTokens": 6, - "latencyMs": 1639 + "inputTokens": 6319, + "outputTokens": 75, + "latencyMs": 2335.239207999999 }, { "questionId": "q21", @@ -2285,18 +2285,18 @@ "correct": true, "inputTokens": 6371, "outputTokens": 9, - "latencyMs": 1299 + "latencyMs": 1145.7144169999992 }, { "questionId": "q21", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "dean19@gmail.com", "actual": "dean19@gmail.com", "correct": true, - "inputTokens": 5016, - "outputTokens": 6, - "latencyMs": 1151 + "inputTokens": 5015, + "outputTokens": 75, + "latencyMs": 2204.0944169999966 }, { "questionId": "q21", @@ -2307,18 +2307,18 @@ "correct": true, "inputTokens": 5766, "outputTokens": 9, - "latencyMs": 1246 + "latencyMs": 1102.2122499999969 }, { "questionId": "q22", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "111314", "actual": "111314", "correct": true, - "inputTokens": 6392, - "outputTokens": 3, - "latencyMs": 1838 + "inputTokens": 6391, + "outputTokens": 200, + "latencyMs": 3785.0480830000015 }, { "questionId": "q22", @@ -2329,18 +2329,18 @@ "correct": true, "inputTokens": 7871, "outputTokens": 6, - "latencyMs": 1191 + "latencyMs": 1147.6056669999962 }, { "questionId": "q22", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "111314", "actual": "111314", "correct": true, - "inputTokens": 2529, - "outputTokens": 3, - "latencyMs": 980 + "inputTokens": 2528, + "outputTokens": 72, + "latencyMs": 3996.1190410000054 }, { "questionId": "q22", @@ -2351,18 +2351,18 @@ "correct": true, "inputTokens": 2983, "outputTokens": 6, - "latencyMs": 1299 + "latencyMs": 1101.5621670000037 }, { "questionId": "q22", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "111314", "actual": "111314", "correct": true, - "inputTokens": 2383, - "outputTokens": 3, - "latencyMs": 1027 + "inputTokens": 2382, + "outputTokens": 136, + "latencyMs": 2563.2732499999984 }, { "questionId": "q22", @@ -2373,18 +2373,18 @@ "correct": true, "inputTokens": 2857, "outputTokens": 6, - "latencyMs": 1433 + "latencyMs": 1224.5424589999966 }, { "questionId": "q22", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "111314", "actual": "111314", "correct": true, - "inputTokens": 6318, - "outputTokens": 3, - "latencyMs": 2256 + "inputTokens": 6317, + "outputTokens": 136, + "latencyMs": 2436.8848329999964 }, { "questionId": "q22", @@ -2395,18 +2395,18 @@ "correct": true, "inputTokens": 6366, "outputTokens": 6, - "latencyMs": 1091 + "latencyMs": 1500.1066250000003 }, { "questionId": "q22", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "111314", "actual": "111314", "correct": true, - "inputTokens": 5014, - "outputTokens": 3, - "latencyMs": 1288 + "inputTokens": 5013, + "outputTokens": 72, + "latencyMs": 2529.925833000001 }, { "questionId": "q22", @@ -2417,18 +2417,18 @@ "correct": true, "inputTokens": 5761, "outputTokens": 6, - "latencyMs": 1306 + "latencyMs": 1701.0276660000018 }, { "questionId": "q23", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 6389, - "outputTokens": 2, - "latencyMs": 1951 + "inputTokens": 6388, + "outputTokens": 135, + "latencyMs": 3078.5496249999997 }, { "questionId": "q23", @@ -2439,18 +2439,18 @@ "correct": true, "inputTokens": 7868, "outputTokens": 4, - "latencyMs": 1440 + "latencyMs": 1224.1848329999993 }, { "questionId": "q23", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2526, - "outputTokens": 2, - "latencyMs": 978 + "inputTokens": 2525, + "outputTokens": 71, + "latencyMs": 2287.0156669999997 }, { "questionId": "q23", @@ -2461,18 +2461,18 @@ "correct": true, "inputTokens": 2980, "outputTokens": 4, - "latencyMs": 1385 + "latencyMs": 1209.1454999999987 }, { "questionId": "q23", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2380, - "outputTokens": 2, - "latencyMs": 2311 + "inputTokens": 2379, + "outputTokens": 71, + "latencyMs": 2059.012499999997 }, { "questionId": "q23", @@ -2483,18 +2483,18 @@ "correct": true, "inputTokens": 2854, "outputTokens": 4, - "latencyMs": 1066 + "latencyMs": 1393.596375000001 }, { "questionId": "q23", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 6315, - "outputTokens": 2, - "latencyMs": 1914 + "inputTokens": 6314, + "outputTokens": 71, + "latencyMs": 1858.8989159999983 }, { "questionId": "q23", @@ -2505,18 +2505,18 @@ "correct": true, "inputTokens": 6363, "outputTokens": 4, - "latencyMs": 1596 + "latencyMs": 1193.9375419999997 }, { "questionId": "q23", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 5011, - "outputTokens": 2, - "latencyMs": 1820 + "inputTokens": 5010, + "outputTokens": 135, + "latencyMs": 2755.0157499999987 }, { "questionId": "q23", @@ -2527,18 +2527,18 @@ "correct": true, "inputTokens": 5758, "outputTokens": 4, - "latencyMs": 1067 + "latencyMs": 1366.030666999999 }, { "questionId": "q24", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "laurel54@yahoo.com", "actual": "laurel54@yahoo.com", "correct": true, - "inputTokens": 6391, - "outputTokens": 6, - "latencyMs": 2594 + "inputTokens": 6390, + "outputTokens": 395, + "latencyMs": 4352.137999999999 }, { "questionId": "q24", @@ -2549,18 +2549,18 @@ "correct": true, "inputTokens": 7869, "outputTokens": 10, - "latencyMs": 1139 + "latencyMs": 1093.9707500000004 }, { "questionId": "q24", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "laurel54@yahoo.com", "actual": "laurel54@yahoo.com", "correct": true, - "inputTokens": 2528, - "outputTokens": 6, - "latencyMs": 1225 + "inputTokens": 2527, + "outputTokens": 139, + "latencyMs": 2481.934500000003 }, { "questionId": "q24", @@ -2571,18 +2571,18 @@ "correct": true, "inputTokens": 2981, "outputTokens": 10, - "latencyMs": 1082 + "latencyMs": 1262.3894579999978 }, { "questionId": "q24", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "laurel54@yahoo.com", "actual": "laurel54@yahoo.com", "correct": true, - "inputTokens": 2382, - "outputTokens": 6, - "latencyMs": 4857 + "inputTokens": 2381, + "outputTokens": 75, + "latencyMs": 2360.7159170000014 }, { "questionId": "q24", @@ -2593,18 +2593,18 @@ "correct": true, "inputTokens": 2855, "outputTokens": 10, - "latencyMs": 1082 + "latencyMs": 1462.5894999999946 }, { "questionId": "q24", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "laurel54@yahoo.com", "actual": "laurel54@yahoo.com", "correct": true, - "inputTokens": 6317, - "outputTokens": 6, - "latencyMs": 1272 + "inputTokens": 6316, + "outputTokens": 75, + "latencyMs": 3247.478041000002 }, { "questionId": "q24", @@ -2615,18 +2615,18 @@ "correct": true, "inputTokens": 6364, "outputTokens": 10, - "latencyMs": 1201 + "latencyMs": 1693.1597089999996 }, { "questionId": "q24", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "laurel54@yahoo.com", "actual": "laurel54@yahoo.com", "correct": true, - "inputTokens": 5013, - "outputTokens": 6, - "latencyMs": 1197 + "inputTokens": 5012, + "outputTokens": 75, + "latencyMs": 1726.2765839999993 }, { "questionId": "q24", @@ -2637,18 +2637,18 @@ "correct": true, "inputTokens": 5759, "outputTokens": 10, - "latencyMs": 1198 + "latencyMs": 1605.044458000004 }, { "questionId": "q25", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "89553", "actual": "89553", "correct": true, - "inputTokens": 6392, - "outputTokens": 3, - "latencyMs": 1085 + "inputTokens": 6391, + "outputTokens": 136, + "latencyMs": 2263.1207090000025 }, { "questionId": "q25", @@ -2659,18 +2659,18 @@ "correct": true, "inputTokens": 7873, "outputTokens": 6, - "latencyMs": 1102 + "latencyMs": 3789.016875000001 }, { "questionId": "q25", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "89553", "actual": "89553", "correct": true, - "inputTokens": 2529, - "outputTokens": 3, - "latencyMs": 1350 + "inputTokens": 2528, + "outputTokens": 72, + "latencyMs": 1829.9641669999983 }, { "questionId": "q25", @@ -2681,18 +2681,18 @@ "correct": true, "inputTokens": 2985, "outputTokens": 6, - "latencyMs": 1300 + "latencyMs": 989.6153750000012 }, { "questionId": "q25", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "89553", "actual": "89553", "correct": true, - "inputTokens": 2383, - "outputTokens": 3, - "latencyMs": 998 + "inputTokens": 2382, + "outputTokens": 72, + "latencyMs": 2717.4773339999956 }, { "questionId": "q25", @@ -2703,18 +2703,18 @@ "correct": true, "inputTokens": 2859, "outputTokens": 6, - "latencyMs": 972 + "latencyMs": 1717.8889999999956 }, { "questionId": "q25", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "89553", - "actual": "89553", - "correct": true, - "inputTokens": 6318, - "outputTokens": 3, - "latencyMs": 1331 + "actual": "46730", + "correct": false, + "inputTokens": 6317, + "outputTokens": 72, + "latencyMs": 5490.572667 }, { "questionId": "q25", @@ -2725,18 +2725,18 @@ "correct": true, "inputTokens": 6368, "outputTokens": 6, - "latencyMs": 1027 + "latencyMs": 1427.4055000000008 }, { "questionId": "q25", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "89553", "actual": "89553", "correct": true, - "inputTokens": 5014, - "outputTokens": 3, - "latencyMs": 1170 + "inputTokens": 5013, + "outputTokens": 264, + "latencyMs": 4052.875957999997 }, { "questionId": "q25", @@ -2747,18 +2747,18 @@ "correct": true, "inputTokens": 5763, "outputTokens": 6, - "latencyMs": 1074 + "latencyMs": 1586.255124999996 }, { "questionId": "q26", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 6389, - "outputTokens": 2, - "latencyMs": 1862 + "inputTokens": 6388, + "outputTokens": 135, + "latencyMs": 3787.343541000002 }, { "questionId": "q26", @@ -2769,18 +2769,18 @@ "correct": true, "inputTokens": 7866, "outputTokens": 4, - "latencyMs": 1435 + "latencyMs": 1196.934000000001 }, { "questionId": "q26", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2526, - "outputTokens": 2, - "latencyMs": 989 + "inputTokens": 2525, + "outputTokens": 71, + "latencyMs": 2172.2377080000006 }, { "questionId": "q26", @@ -2791,18 +2791,18 @@ "correct": true, "inputTokens": 2978, "outputTokens": 4, - "latencyMs": 1035 + "latencyMs": 1112.6987080000035 }, { "questionId": "q26", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2380, - "outputTokens": 2, - "latencyMs": 2157 + "inputTokens": 2379, + "outputTokens": 71, + "latencyMs": 2074.6067919999987 }, { "questionId": "q26", @@ -2813,18 +2813,18 @@ "correct": true, "inputTokens": 2852, "outputTokens": 4, - "latencyMs": 1094 + "latencyMs": 1202.2165000000023 }, { "questionId": "q26", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 6315, - "outputTokens": 2, - "latencyMs": 1912 + "inputTokens": 6314, + "outputTokens": 135, + "latencyMs": 3257.5967080000046 }, { "questionId": "q26", @@ -2835,18 +2835,18 @@ "correct": true, "inputTokens": 6361, "outputTokens": 4, - "latencyMs": 1364 + "latencyMs": 1316.7435000000041 }, { "questionId": "q26", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 5011, - "outputTokens": 2, - "latencyMs": 1435 + "inputTokens": 5010, + "outputTokens": 71, + "latencyMs": 2391.9063749999987 }, { "questionId": "q26", @@ -2857,128 +2857,128 @@ "correct": true, "inputTokens": 5756, "outputTokens": 4, - "latencyMs": 1082 + "latencyMs": 1208.8820829999968 }, { "questionId": "q27", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "jayme.kertzmann77@gmail.com", "actual": "jayme.kertzmann77@gmail.com", "correct": true, - "inputTokens": 6392, - "outputTokens": 9, - "latencyMs": 1274 - }, - { - "questionId": "q27", - "format": "json", - "model": "claude-haiku-4-5", - "expected": "jayme.kertzmann77@gmail.com", - "actual": "jayme.kertzmann77@gmail.com", - "correct": true, - "inputTokens": 7871, - "outputTokens": 14, - "latencyMs": 1130 - }, - { - "questionId": "q27", - "format": "toon", - "model": "gpt-4o-mini", - "expected": "jayme.kertzmann77@gmail.com", - "actual": "jayme.kertzmann77@gmail.com", - "correct": true, - "inputTokens": 2529, - "outputTokens": 9, - "latencyMs": 1795 - }, - { - "questionId": "q27", - "format": "toon", - "model": "claude-haiku-4-5", - "expected": "jayme.kertzmann77@gmail.com", - "actual": "jayme.kertzmann77@gmail.com", - "correct": true, - "inputTokens": 2983, - "outputTokens": 14, - "latencyMs": 1309 - }, - { - "questionId": "q27", - "format": "csv", - "model": "gpt-4o-mini", - "expected": "jayme.kertzmann77@gmail.com", - "actual": "jayme.kertzmann77@gmail.com", - "correct": true, - "inputTokens": 2383, - "outputTokens": 9, - "latencyMs": 1406 - }, - { - "questionId": "q27", - "format": "csv", - "model": "claude-haiku-4-5", - "expected": "jayme.kertzmann77@gmail.com", - "actual": "jayme.kertzmann77@gmail.com", - "correct": true, - "inputTokens": 2857, - "outputTokens": 14, - "latencyMs": 1398 - }, - { - "questionId": "q27", - "format": "markdown-kv", - "model": "gpt-4o-mini", - "expected": "jayme.kertzmann77@gmail.com", - "actual": "jayme.kertzmann77@gmail.com", - "correct": true, - "inputTokens": 6318, - "outputTokens": 9, - "latencyMs": 1114 - }, - { - "questionId": "q27", - "format": "markdown-kv", - "model": "claude-haiku-4-5", - "expected": "jayme.kertzmann77@gmail.com", - "actual": "jayme.kertzmann77@gmail.com", - "correct": true, - "inputTokens": 6366, - "outputTokens": 14, - "latencyMs": 1251 - }, - { - "questionId": "q27", - "format": "yaml", - "model": "gpt-4o-mini", - "expected": "jayme.kertzmann77@gmail.com", - "actual": "jayme.kertzmann77@gmail.com", - "correct": true, - "inputTokens": 5014, - "outputTokens": 9, - "latencyMs": 1941 - }, - { - "questionId": "q27", - "format": "yaml", - "model": "claude-haiku-4-5", - "expected": "jayme.kertzmann77@gmail.com", - "actual": "jayme.kertzmann77@gmail.com", - "correct": true, - "inputTokens": 5761, - "outputTokens": 14, - "latencyMs": 1218 - }, - { - "questionId": "q28", - "format": "json", - "model": "gpt-4o-mini", - "expected": "104053", - "actual": "104053", - "correct": true, "inputTokens": 6391, - "outputTokens": 3, - "latencyMs": 1395 + "outputTokens": 142, + "latencyMs": 2735.679790999995 + }, + { + "questionId": "q27", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 7871, + "outputTokens": 14, + "latencyMs": 1253.706624999999 + }, + { + "questionId": "q27", + "format": "toon", + "model": "gpt-5-nano", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 2528, + "outputTokens": 142, + "latencyMs": 2471.819457999998 + }, + { + "questionId": "q27", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 2983, + "outputTokens": 14, + "latencyMs": 1063.2195409999986 + }, + { + "questionId": "q27", + "format": "csv", + "model": "gpt-5-nano", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 2382, + "outputTokens": 142, + "latencyMs": 2061.6382500000036 + }, + { + "questionId": "q27", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 2857, + "outputTokens": 14, + "latencyMs": 1877.579082999997 + }, + { + "questionId": "q27", + "format": "markdown-kv", + "model": "gpt-5-nano", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 6317, + "outputTokens": 142, + "latencyMs": 3448.810375000001 + }, + { + "questionId": "q27", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 6366, + "outputTokens": 14, + "latencyMs": 1265.9410419999986 + }, + { + "questionId": "q27", + "format": "yaml", + "model": "gpt-5-nano", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 5013, + "outputTokens": 78, + "latencyMs": 2152.5591669999994 + }, + { + "questionId": "q27", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "jayme.kertzmann77@gmail.com", + "actual": "jayme.kertzmann77@gmail.com", + "correct": true, + "inputTokens": 5761, + "outputTokens": 14, + "latencyMs": 1432.513583 + }, + { + "questionId": "q28", + "format": "json", + "model": "gpt-5-nano", + "expected": "104053", + "actual": "104053", + "correct": true, + "inputTokens": 6390, + "outputTokens": 136, + "latencyMs": 2707.4454169999954 }, { "questionId": "q28", @@ -2989,18 +2989,18 @@ "correct": true, "inputTokens": 7871, "outputTokens": 6, - "latencyMs": 1342 + "latencyMs": 1568.5869169999933 }, { "questionId": "q28", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "104053", "actual": "104053", "correct": true, - "inputTokens": 2528, - "outputTokens": 3, - "latencyMs": 919 + "inputTokens": 2527, + "outputTokens": 136, + "latencyMs": 2373.4566669999986 }, { "questionId": "q28", @@ -3011,18 +3011,18 @@ "correct": true, "inputTokens": 2983, "outputTokens": 6, - "latencyMs": 1187 + "latencyMs": 1525.172749999998 }, { "questionId": "q28", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "104053", "actual": "104053", "correct": true, - "inputTokens": 2382, - "outputTokens": 3, - "latencyMs": 1131 + "inputTokens": 2381, + "outputTokens": 136, + "latencyMs": 9347.989583000002 }, { "questionId": "q28", @@ -3033,18 +3033,18 @@ "correct": true, "inputTokens": 2857, "outputTokens": 6, - "latencyMs": 1191 + "latencyMs": 1748.783334000007 }, { "questionId": "q28", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "104053", "actual": "104053", "correct": true, - "inputTokens": 6317, - "outputTokens": 3, - "latencyMs": 1435 + "inputTokens": 6316, + "outputTokens": 72, + "latencyMs": 1929.517458000002 }, { "questionId": "q28", @@ -3055,18 +3055,18 @@ "correct": true, "inputTokens": 6366, "outputTokens": 6, - "latencyMs": 1095 + "latencyMs": 1022.1345000000001 }, { "questionId": "q28", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "104053", "actual": "104053", "correct": true, - "inputTokens": 5013, - "outputTokens": 3, - "latencyMs": 4588 + "inputTokens": 5012, + "outputTokens": 136, + "latencyMs": 2102.925624999996 }, { "questionId": "q28", @@ -3077,18 +3077,18 @@ "correct": true, "inputTokens": 5761, "outputTokens": 6, - "latencyMs": 1291 + "latencyMs": 1471.7255839999998 }, { "questionId": "q29", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 6392, - "outputTokens": 2, - "latencyMs": 1688 + "inputTokens": 6391, + "outputTokens": 71, + "latencyMs": 1983.693041999999 }, { "questionId": "q29", @@ -3099,18 +3099,18 @@ "correct": true, "inputTokens": 7872, "outputTokens": 4, - "latencyMs": 1301 + "latencyMs": 1077.2119579999999 }, { "questionId": "q29", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2529, - "outputTokens": 2, - "latencyMs": 1914 + "inputTokens": 2528, + "outputTokens": 71, + "latencyMs": 2549.1221250000017 }, { "questionId": "q29", @@ -3121,18 +3121,18 @@ "correct": true, "inputTokens": 2984, "outputTokens": 4, - "latencyMs": 1447 + "latencyMs": 921.1110840000038 }, { "questionId": "q29", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2383, - "outputTokens": 2, - "latencyMs": 1725 + "inputTokens": 2382, + "outputTokens": 135, + "latencyMs": 4070.615666999998 }, { "questionId": "q29", @@ -3143,18 +3143,18 @@ "correct": true, "inputTokens": 2858, "outputTokens": 4, - "latencyMs": 923 + "latencyMs": 974.754832999999 }, { "questionId": "q29", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 6318, - "outputTokens": 2, - "latencyMs": 879 + "inputTokens": 6317, + "outputTokens": 135, + "latencyMs": 2665.842083000003 }, { "questionId": "q29", @@ -3165,18 +3165,18 @@ "correct": true, "inputTokens": 6367, "outputTokens": 4, - "latencyMs": 1322 + "latencyMs": 1081.2904160000035 }, { "questionId": "q29", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 5014, - "outputTokens": 2, - "latencyMs": 1394 + "inputTokens": 5013, + "outputTokens": 135, + "latencyMs": 2897.919332999998 }, { "questionId": "q29", @@ -3187,18 +3187,18 @@ "correct": true, "inputTokens": 5762, "outputTokens": 4, - "latencyMs": 1008 + "latencyMs": 1341.0955420000028 }, { "questionId": "q30", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "carley.bauch@yahoo.com", "actual": "carley.bauch@yahoo.com", "correct": true, - "inputTokens": 6391, - "outputTokens": 7, - "latencyMs": 894 + "inputTokens": 6390, + "outputTokens": 204, + "latencyMs": 3231.9646249999932 }, { "questionId": "q30", @@ -3209,18 +3209,18 @@ "correct": true, "inputTokens": 7869, "outputTokens": 12, - "latencyMs": 1220 + "latencyMs": 1288.5363330000037 }, { "questionId": "q30", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "carley.bauch@yahoo.com", "actual": "carley.bauch@yahoo.com", "correct": true, - "inputTokens": 2528, - "outputTokens": 7, - "latencyMs": 2225 + "inputTokens": 2527, + "outputTokens": 76, + "latencyMs": 2581.508915999999 }, { "questionId": "q30", @@ -3231,18 +3231,18 @@ "correct": true, "inputTokens": 2981, "outputTokens": 12, - "latencyMs": 1282 + "latencyMs": 1183.8337079999983 }, { "questionId": "q30", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "carley.bauch@yahoo.com", "actual": "carley.bauch@yahoo.com", "correct": true, - "inputTokens": 2382, - "outputTokens": 7, - "latencyMs": 1414 + "inputTokens": 2381, + "outputTokens": 140, + "latencyMs": 2073.944792000002 }, { "questionId": "q30", @@ -3253,18 +3253,18 @@ "correct": true, "inputTokens": 2855, "outputTokens": 12, - "latencyMs": 1686 + "latencyMs": 1302.5857499999984 }, { "questionId": "q30", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "carley.bauch@yahoo.com", "actual": "carley.bauch@yahoo.com", "correct": true, - "inputTokens": 6317, - "outputTokens": 7, - "latencyMs": 1113 + "inputTokens": 6316, + "outputTokens": 204, + "latencyMs": 3076.5304590000014 }, { "questionId": "q30", @@ -3275,18 +3275,18 @@ "correct": true, "inputTokens": 6364, "outputTokens": 12, - "latencyMs": 1089 + "latencyMs": 1110.9787920000017 }, { "questionId": "q30", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "carley.bauch@yahoo.com", "actual": "carley.bauch@yahoo.com", "correct": true, - "inputTokens": 5013, - "outputTokens": 7, - "latencyMs": 949 + "inputTokens": 5012, + "outputTokens": 76, + "latencyMs": 3381.732917000001 }, { "questionId": "q30", @@ -3297,18 +3297,18 @@ "correct": true, "inputTokens": 5759, "outputTokens": 12, - "latencyMs": 1273 + "latencyMs": 1198.1488329999993 }, { "questionId": "q31", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "142029", "actual": "142029", "correct": true, - "inputTokens": 6394, - "outputTokens": 3, - "latencyMs": 4741 + "inputTokens": 6393, + "outputTokens": 136, + "latencyMs": 2687.965959000001 }, { "questionId": "q31", @@ -3319,18 +3319,18 @@ "correct": true, "inputTokens": 7874, "outputTokens": 6, - "latencyMs": 1132 + "latencyMs": 2615.956250000003 }, { "questionId": "q31", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "142029", "actual": "142029", "correct": true, - "inputTokens": 2531, - "outputTokens": 3, - "latencyMs": 1184 + "inputTokens": 2530, + "outputTokens": 136, + "latencyMs": 2132.413249999998 }, { "questionId": "q31", @@ -3341,18 +3341,18 @@ "correct": true, "inputTokens": 2986, "outputTokens": 6, - "latencyMs": 1137 + "latencyMs": 1091.060666999998 }, { "questionId": "q31", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "142029", "actual": "142029", "correct": true, - "inputTokens": 2385, - "outputTokens": 3, - "latencyMs": 963 + "inputTokens": 2384, + "outputTokens": 72, + "latencyMs": 2074.8201670000053 }, { "questionId": "q31", @@ -3363,18 +3363,18 @@ "correct": true, "inputTokens": 2860, "outputTokens": 6, - "latencyMs": 1096 + "latencyMs": 1622.2757499999934 }, { "questionId": "q31", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "142029", "actual": "142029", "correct": true, - "inputTokens": 6320, - "outputTokens": 3, - "latencyMs": 1399 + "inputTokens": 6319, + "outputTokens": 200, + "latencyMs": 3122.3756670000002 }, { "questionId": "q31", @@ -3385,18 +3385,18 @@ "correct": true, "inputTokens": 6369, "outputTokens": 6, - "latencyMs": 1594 + "latencyMs": 1175.7301249999946 }, { "questionId": "q31", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "142029", "actual": "142029", "correct": true, - "inputTokens": 5016, - "outputTokens": 3, - "latencyMs": 1900 + "inputTokens": 5015, + "outputTokens": 136, + "latencyMs": 2601.074916999998 }, { "questionId": "q31", @@ -3407,18 +3407,18 @@ "correct": true, "inputTokens": 5764, "outputTokens": 6, - "latencyMs": 1274 + "latencyMs": 1089.4757079999981 }, { "questionId": "q32", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", - "actual": "Sales", - "correct": false, - "inputTokens": 6390, - "outputTokens": 2, - "latencyMs": 5224 + "actual": "Marketing", + "correct": true, + "inputTokens": 6389, + "outputTokens": 135, + "latencyMs": 6939.617750000005 }, { "questionId": "q32", @@ -3429,18 +3429,18 @@ "correct": true, "inputTokens": 7869, "outputTokens": 4, - "latencyMs": 1038 + "latencyMs": 1207.9619999999995 }, { "questionId": "q32", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2527, - "outputTokens": 2, - "latencyMs": 1902 + "inputTokens": 2526, + "outputTokens": 135, + "latencyMs": 2784.063166 }, { "questionId": "q32", @@ -3451,18 +3451,18 @@ "correct": true, "inputTokens": 2981, "outputTokens": 4, - "latencyMs": 1010 + "latencyMs": 1011.0956670000014 }, { "questionId": "q32", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2381, - "outputTokens": 2, - "latencyMs": 3263 + "inputTokens": 2380, + "outputTokens": 135, + "latencyMs": 3098.7147909999985 }, { "questionId": "q32", @@ -3473,18 +3473,18 @@ "correct": true, "inputTokens": 2855, "outputTokens": 4, - "latencyMs": 871 + "latencyMs": 983.9449170000007 }, { "questionId": "q32", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", - "actual": "Sales", - "correct": false, - "inputTokens": 6316, - "outputTokens": 2, - "latencyMs": 1278 + "actual": "Marketing", + "correct": true, + "inputTokens": 6315, + "outputTokens": 135, + "latencyMs": 3889.572291999997 }, { "questionId": "q32", @@ -3495,18 +3495,18 @@ "correct": true, "inputTokens": 6364, "outputTokens": 4, - "latencyMs": 1048 + "latencyMs": 1096.1613339999967 }, { "questionId": "q32", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", - "actual": "Sales", - "correct": false, - "inputTokens": 5012, - "outputTokens": 2, - "latencyMs": 1271 + "actual": "Marketing", + "correct": true, + "inputTokens": 5011, + "outputTokens": 71, + "latencyMs": 2484.078917000006 }, { "questionId": "q32", @@ -3517,18 +3517,18 @@ "correct": true, "inputTokens": 5759, "outputTokens": 4, - "latencyMs": 1075 + "latencyMs": 1150.418792000004 }, { "questionId": "q33", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "cheyenne_skiles@hotmail.com", "actual": "cheyenne_skiles@hotmail.com", "correct": true, - "inputTokens": 6394, - "outputTokens": 7, - "latencyMs": 1139 + "inputTokens": 6393, + "outputTokens": 140, + "latencyMs": 2221.4447079999954 }, { "questionId": "q33", @@ -3539,18 +3539,18 @@ "correct": true, "inputTokens": 7872, "outputTokens": 14, - "latencyMs": 1319 + "latencyMs": 1193.9583749999947 }, { "questionId": "q33", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "cheyenne_skiles@hotmail.com", "actual": "cheyenne_skiles@hotmail.com", "correct": true, - "inputTokens": 2531, - "outputTokens": 7, - "latencyMs": 1856 + "inputTokens": 2530, + "outputTokens": 76, + "latencyMs": 2170.8865829999995 }, { "questionId": "q33", @@ -3561,18 +3561,18 @@ "correct": true, "inputTokens": 2984, "outputTokens": 14, - "latencyMs": 1393 + "latencyMs": 1247.6116660000043 }, { "questionId": "q33", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "cheyenne_skiles@hotmail.com", "actual": "cheyenne_skiles@hotmail.com", "correct": true, - "inputTokens": 2385, - "outputTokens": 7, - "latencyMs": 1766 + "inputTokens": 2384, + "outputTokens": 76, + "latencyMs": 3827.705667000002 }, { "questionId": "q33", @@ -3583,18 +3583,18 @@ "correct": true, "inputTokens": 2858, "outputTokens": 14, - "latencyMs": 1609 + "latencyMs": 1084.8218339999949 }, { "questionId": "q33", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "cheyenne_skiles@hotmail.com", "actual": "cheyenne_skiles@hotmail.com", "correct": true, - "inputTokens": 6320, - "outputTokens": 7, - "latencyMs": 1329 + "inputTokens": 6319, + "outputTokens": 140, + "latencyMs": 3311.8220839999994 }, { "questionId": "q33", @@ -3605,18 +3605,18 @@ "correct": true, "inputTokens": 6367, "outputTokens": 14, - "latencyMs": 1178 + "latencyMs": 1269.2092920000068 }, { "questionId": "q33", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "cheyenne_skiles@hotmail.com", "actual": "cheyenne_skiles@hotmail.com", "correct": true, - "inputTokens": 5016, - "outputTokens": 7, - "latencyMs": 1890 + "inputTokens": 5015, + "outputTokens": 140, + "latencyMs": 2648.3102500000023 }, { "questionId": "q33", @@ -3627,194 +3627,194 @@ "correct": true, "inputTokens": 5762, "outputTokens": 14, - "latencyMs": 1326 + "latencyMs": 1278.0403750000041 }, { "questionId": "q34", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "84650", "actual": "84650", "correct": true, - "inputTokens": 6392, - "outputTokens": 3, - "latencyMs": 1898 - }, - { - "questionId": "q34", - "format": "json", - "model": "claude-haiku-4-5", - "expected": "84650", - "actual": "84650", - "correct": true, - "inputTokens": 7871, - "outputTokens": 6, - "latencyMs": 1074 - }, - { - "questionId": "q34", - "format": "toon", - "model": "gpt-4o-mini", - "expected": "84650", - "actual": "84650", - "correct": true, - "inputTokens": 2529, - "outputTokens": 3, - "latencyMs": 1382 - }, - { - "questionId": "q34", - "format": "toon", - "model": "claude-haiku-4-5", - "expected": "84650", - "actual": "84650", - "correct": true, - "inputTokens": 2983, - "outputTokens": 6, - "latencyMs": 1060 - }, - { - "questionId": "q34", - "format": "csv", - "model": "gpt-4o-mini", - "expected": "84650", - "actual": "84650", - "correct": true, - "inputTokens": 2383, - "outputTokens": 3, - "latencyMs": 1286 - }, - { - "questionId": "q34", - "format": "csv", - "model": "claude-haiku-4-5", - "expected": "84650", - "actual": "84650", - "correct": true, - "inputTokens": 2857, - "outputTokens": 6, - "latencyMs": 1591 - }, - { - "questionId": "q34", - "format": "markdown-kv", - "model": "gpt-4o-mini", - "expected": "84650", - "actual": "84650", - "correct": true, - "inputTokens": 6318, - "outputTokens": 3, - "latencyMs": 2158 - }, - { - "questionId": "q34", - "format": "markdown-kv", - "model": "claude-haiku-4-5", - "expected": "84650", - "actual": "84650", - "correct": true, - "inputTokens": 6366, - "outputTokens": 6, - "latencyMs": 1532 - }, - { - "questionId": "q34", - "format": "yaml", - "model": "gpt-4o-mini", - "expected": "84650", - "actual": "84650", - "correct": true, - "inputTokens": 5014, - "outputTokens": 3, - "latencyMs": 1381 - }, - { - "questionId": "q34", - "format": "yaml", - "model": "claude-haiku-4-5", - "expected": "84650", - "actual": "84650", - "correct": true, - "inputTokens": 5761, - "outputTokens": 6, - "latencyMs": 2262 - }, - { - "questionId": "q35", - "format": "json", - "model": "gpt-4o-mini", - "expected": "Marketing", - "actual": "Marketing", - "correct": true, "inputTokens": 6391, - "outputTokens": 2, - "latencyMs": 2664 + "outputTokens": 136, + "latencyMs": 3555.1511670000036 }, { - "questionId": "q35", + "questionId": "q34", "format": "json", "model": "claude-haiku-4-5", - "expected": "Marketing", - "actual": "Marketing", + "expected": "84650", + "actual": "84650", "correct": true, "inputTokens": 7871, - "outputTokens": 4, - "latencyMs": 1260 + "outputTokens": 6, + "latencyMs": 1317.5797499999971 }, { - "questionId": "q35", + "questionId": "q34", "format": "toon", - "model": "gpt-4o-mini", - "expected": "Marketing", - "actual": "Marketing", + "model": "gpt-5-nano", + "expected": "84650", + "actual": "84650", "correct": true, "inputTokens": 2528, - "outputTokens": 2, - "latencyMs": 1563 + "outputTokens": 136, + "latencyMs": 2291.943041999999 }, { - "questionId": "q35", + "questionId": "q34", "format": "toon", "model": "claude-haiku-4-5", - "expected": "Marketing", - "actual": "Marketing", + "expected": "84650", + "actual": "84650", "correct": true, "inputTokens": 2983, - "outputTokens": 4, - "latencyMs": 1415 + "outputTokens": 6, + "latencyMs": 2081.3947499999995 }, { - "questionId": "q35", + "questionId": "q34", "format": "csv", - "model": "gpt-4o-mini", - "expected": "Marketing", - "actual": "Marketing", + "model": "gpt-5-nano", + "expected": "84650", + "actual": "84650", "correct": true, "inputTokens": 2382, - "outputTokens": 2, - "latencyMs": 1038 + "outputTokens": 72, + "latencyMs": 2067.9348329999993 }, { - "questionId": "q35", + "questionId": "q34", "format": "csv", "model": "claude-haiku-4-5", - "expected": "Marketing", - "actual": "Marketing", + "expected": "84650", + "actual": "84650", "correct": true, "inputTokens": 2857, - "outputTokens": 4, - "latencyMs": 1021 + "outputTokens": 6, + "latencyMs": 1192.6603340000001 }, { - "questionId": "q35", + "questionId": "q34", "format": "markdown-kv", - "model": "gpt-4o-mini", - "expected": "Marketing", - "actual": "Marketing", + "model": "gpt-5-nano", + "expected": "84650", + "actual": "84650", "correct": true, "inputTokens": 6317, - "outputTokens": 2, - "latencyMs": 4276 + "outputTokens": 200, + "latencyMs": 3044.592457999999 + }, + { + "questionId": "q34", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "84650", + "actual": "84650", + "correct": true, + "inputTokens": 6366, + "outputTokens": 6, + "latencyMs": 1106.2235409999994 + }, + { + "questionId": "q34", + "format": "yaml", + "model": "gpt-5-nano", + "expected": "84650", + "actual": "84650", + "correct": true, + "inputTokens": 5013, + "outputTokens": 136, + "latencyMs": 2627.8240000000005 + }, + { + "questionId": "q34", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "84650", + "actual": "84650", + "correct": true, + "inputTokens": 5761, + "outputTokens": 6, + "latencyMs": 1379.9015 + }, + { + "questionId": "q35", + "format": "json", + "model": "gpt-5-nano", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6390, + "outputTokens": 263, + "latencyMs": 3705.3900829999984 + }, + { + "questionId": "q35", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 7871, + "outputTokens": 4, + "latencyMs": 1909.4442500000005 + }, + { + "questionId": "q35", + "format": "toon", + "model": "gpt-5-nano", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2527, + "outputTokens": 135, + "latencyMs": 2173.6019589999996 + }, + { + "questionId": "q35", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2983, + "outputTokens": 4, + "latencyMs": 1063.8584580000024 + }, + { + "questionId": "q35", + "format": "csv", + "model": "gpt-5-nano", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2381, + "outputTokens": 71, + "latencyMs": 1800.4930420000019 + }, + { + "questionId": "q35", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 2857, + "outputTokens": 4, + "latencyMs": 1011.3969579999975 + }, + { + "questionId": "q35", + "format": "markdown-kv", + "model": "gpt-5-nano", + "expected": "Marketing", + "actual": "Marketing", + "correct": true, + "inputTokens": 6316, + "outputTokens": 135, + "latencyMs": 2562.2492500000008 }, { "questionId": "q35", @@ -3825,18 +3825,18 @@ "correct": true, "inputTokens": 6366, "outputTokens": 4, - "latencyMs": 1301 + "latencyMs": 1349.1809170000051 }, { "questionId": "q35", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 5013, - "outputTokens": 2, - "latencyMs": 1399 + "inputTokens": 5012, + "outputTokens": 71, + "latencyMs": 1883.7523750000037 }, { "questionId": "q35", @@ -3847,18 +3847,18 @@ "correct": true, "inputTokens": 5761, "outputTokens": 4, - "latencyMs": 1197 + "latencyMs": 1135.412292000001 }, { "questionId": "q36", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "macey.gottlieb5@yahoo.com", "actual": "macey.gottlieb5@yahoo.com", "correct": true, - "inputTokens": 6390, - "outputTokens": 9, - "latencyMs": 1390 + "inputTokens": 6389, + "outputTokens": 334, + "latencyMs": 4067.161957999997 }, { "questionId": "q36", @@ -3869,18 +3869,18 @@ "correct": true, "inputTokens": 7869, "outputTokens": 14, - "latencyMs": 1482 + "latencyMs": 1333.0713749999995 }, { "questionId": "q36", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "macey.gottlieb5@yahoo.com", "actual": "macey.gottlieb5@yahoo.com", "correct": true, - "inputTokens": 2527, - "outputTokens": 9, - "latencyMs": 1754 + "inputTokens": 2526, + "outputTokens": 142, + "latencyMs": 2081.8315000000002 }, { "questionId": "q36", @@ -3891,18 +3891,18 @@ "correct": true, "inputTokens": 2981, "outputTokens": 14, - "latencyMs": 1100 + "latencyMs": 1231.0224579999995 }, { "questionId": "q36", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "macey.gottlieb5@yahoo.com", "actual": "macey.gottlieb5@yahoo.com", "correct": true, - "inputTokens": 2381, - "outputTokens": 9, - "latencyMs": 1421 + "inputTokens": 2380, + "outputTokens": 78, + "latencyMs": 2333.0360409999994 }, { "questionId": "q36", @@ -3913,18 +3913,18 @@ "correct": true, "inputTokens": 2855, "outputTokens": 14, - "latencyMs": 2173 + "latencyMs": 1175.1937500000058 }, { "questionId": "q36", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "macey.gottlieb5@yahoo.com", "actual": "macey.gottlieb5@yahoo.com", "correct": true, - "inputTokens": 6316, - "outputTokens": 9, - "latencyMs": 2911 + "inputTokens": 6315, + "outputTokens": 206, + "latencyMs": 7391.094749999997 }, { "questionId": "q36", @@ -3935,18 +3935,18 @@ "correct": true, "inputTokens": 6364, "outputTokens": 14, - "latencyMs": 1235 + "latencyMs": 1843.981458000002 }, { "questionId": "q36", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "macey.gottlieb5@yahoo.com", "actual": "macey.gottlieb5@yahoo.com", "correct": true, - "inputTokens": 5012, - "outputTokens": 9, - "latencyMs": 1303 + "inputTokens": 5011, + "outputTokens": 142, + "latencyMs": 2386.8134589999972 }, { "questionId": "q36", @@ -3957,18 +3957,18 @@ "correct": true, "inputTokens": 5759, "outputTokens": 14, - "latencyMs": 1148 + "latencyMs": 1449.751750000003 }, { "questionId": "q37", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "89773", "actual": "89773", "correct": true, - "inputTokens": 6390, - "outputTokens": 3, - "latencyMs": 1430 + "inputTokens": 6389, + "outputTokens": 136, + "latencyMs": 4075.600666999999 }, { "questionId": "q37", @@ -3979,18 +3979,18 @@ "correct": true, "inputTokens": 7868, "outputTokens": 6, - "latencyMs": 1089 + "latencyMs": 985.1729999999952 }, { "questionId": "q37", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "89773", "actual": "89773", "correct": true, - "inputTokens": 2527, - "outputTokens": 3, - "latencyMs": 1059 + "inputTokens": 2526, + "outputTokens": 136, + "latencyMs": 2891.2602079999997 }, { "questionId": "q37", @@ -4001,18 +4001,18 @@ "correct": true, "inputTokens": 2980, "outputTokens": 6, - "latencyMs": 1057 + "latencyMs": 2073.129000000001 }, { "questionId": "q37", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "89773", "actual": "89773", "correct": true, - "inputTokens": 2381, - "outputTokens": 3, - "latencyMs": 1716 + "inputTokens": 2380, + "outputTokens": 72, + "latencyMs": 1894.3316669999986 }, { "questionId": "q37", @@ -4023,18 +4023,18 @@ "correct": true, "inputTokens": 2854, "outputTokens": 6, - "latencyMs": 904 + "latencyMs": 1172.3735000000015 }, { "questionId": "q37", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "89773", "actual": "89773", "correct": true, - "inputTokens": 6316, - "outputTokens": 3, - "latencyMs": 2950 + "inputTokens": 6315, + "outputTokens": 72, + "latencyMs": 2456.6511249999967 }, { "questionId": "q37", @@ -4045,18 +4045,18 @@ "correct": true, "inputTokens": 6363, "outputTokens": 6, - "latencyMs": 1189 + "latencyMs": 1298.1367079999982 }, { "questionId": "q37", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "89773", "actual": "89773", "correct": true, - "inputTokens": 5012, - "outputTokens": 3, - "latencyMs": 1050 + "inputTokens": 5011, + "outputTokens": 136, + "latencyMs": 6018.304375 }, { "questionId": "q37", @@ -4067,18 +4067,18 @@ "correct": true, "inputTokens": 5758, "outputTokens": 6, - "latencyMs": 1329 + "latencyMs": 1103.9152499999982 }, { "questionId": "q38", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 6390, - "outputTokens": 2, - "latencyMs": 3410 + "inputTokens": 6389, + "outputTokens": 71, + "latencyMs": 3867.303832999998 }, { "questionId": "q38", @@ -4089,18 +4089,18 @@ "correct": true, "inputTokens": 7868, "outputTokens": 4, - "latencyMs": 1891 + "latencyMs": 1287.7528749999983 }, { "questionId": "q38", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2527, - "outputTokens": 2, - "latencyMs": 1010 + "inputTokens": 2526, + "outputTokens": 135, + "latencyMs": 2355.0305829999998 }, { "questionId": "q38", @@ -4111,18 +4111,18 @@ "correct": true, "inputTokens": 2980, "outputTokens": 4, - "latencyMs": 988 + "latencyMs": 1086.8424579999992 }, { "questionId": "q38", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 2381, - "outputTokens": 2, - "latencyMs": 1364 + "inputTokens": 2380, + "outputTokens": 71, + "latencyMs": 3472.6323339999944 }, { "questionId": "q38", @@ -4133,18 +4133,18 @@ "correct": true, "inputTokens": 2854, "outputTokens": 4, - "latencyMs": 1395 + "latencyMs": 948.3086249999978 }, { "questionId": "q38", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 6316, - "outputTokens": 2, - "latencyMs": 2293 + "inputTokens": 6315, + "outputTokens": 71, + "latencyMs": 3343.3446659999972 }, { "questionId": "q38", @@ -4155,18 +4155,18 @@ "correct": true, "inputTokens": 6363, "outputTokens": 4, - "latencyMs": 1137 + "latencyMs": 1048.567959 }, { "questionId": "q38", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", "correct": true, - "inputTokens": 5012, - "outputTokens": 2, - "latencyMs": 1451 + "inputTokens": 5011, + "outputTokens": 71, + "latencyMs": 3761.141875000001 }, { "questionId": "q38", @@ -4177,18 +4177,18 @@ "correct": true, "inputTokens": 5758, "outputTokens": 4, - "latencyMs": 1100 + "latencyMs": 1130.9393339999951 }, { "questionId": "q39", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "georgianna_renner@yahoo.com", "actual": "georgianna_renner@yahoo.com", "correct": true, - "inputTokens": 6390, - "outputTokens": 10, - "latencyMs": 1674 + "inputTokens": 6389, + "outputTokens": 79, + "latencyMs": 4200.215792000003 }, { "questionId": "q39", @@ -4199,18 +4199,18 @@ "correct": true, "inputTokens": 7869, "outputTokens": 13, - "latencyMs": 1403 + "latencyMs": 1351.981166999998 }, { "questionId": "q39", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "georgianna_renner@yahoo.com", "actual": "georgianna_renner@yahoo.com", "correct": true, - "inputTokens": 2527, - "outputTokens": 10, - "latencyMs": 1413 + "inputTokens": 2526, + "outputTokens": 143, + "latencyMs": 2465.4245840000003 }, { "questionId": "q39", @@ -4221,18 +4221,18 @@ "correct": true, "inputTokens": 2981, "outputTokens": 13, - "latencyMs": 1200 + "latencyMs": 885.4770840000056 }, { "questionId": "q39", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "georgianna_renner@yahoo.com", "actual": "georgianna_renner@yahoo.com", "correct": true, - "inputTokens": 2381, - "outputTokens": 10, - "latencyMs": 1730 + "inputTokens": 2380, + "outputTokens": 143, + "latencyMs": 2903.201958000005 }, { "questionId": "q39", @@ -4243,18 +4243,18 @@ "correct": true, "inputTokens": 2855, "outputTokens": 13, - "latencyMs": 1226 + "latencyMs": 1006.1219579999961 }, { "questionId": "q39", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "georgianna_renner@yahoo.com", "actual": "georgianna_renner@yahoo.com", "correct": true, - "inputTokens": 6316, - "outputTokens": 10, - "latencyMs": 1251 + "inputTokens": 6315, + "outputTokens": 207, + "latencyMs": 3253.900333999998 }, { "questionId": "q39", @@ -4265,18 +4265,18 @@ "correct": true, "inputTokens": 6364, "outputTokens": 13, - "latencyMs": 1337 + "latencyMs": 1219.713582999997 }, { "questionId": "q39", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "georgianna_renner@yahoo.com", "actual": "georgianna_renner@yahoo.com", "correct": true, - "inputTokens": 5012, - "outputTokens": 10, - "latencyMs": 2368 + "inputTokens": 5011, + "outputTokens": 143, + "latencyMs": 2335.6635000000024 }, { "questionId": "q39", @@ -4287,18 +4287,18 @@ "correct": true, "inputTokens": 5759, "outputTokens": 13, - "latencyMs": 1251 + "latencyMs": 1334.1358330000003 }, { "questionId": "q40", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "49741", "actual": "49741", "correct": true, - "inputTokens": 6391, - "outputTokens": 3, - "latencyMs": 3815 + "inputTokens": 6390, + "outputTokens": 136, + "latencyMs": 1912.2536669999972 }, { "questionId": "q40", @@ -4309,18 +4309,18 @@ "correct": true, "inputTokens": 7871, "outputTokens": 6, - "latencyMs": 1169 + "latencyMs": 1104.4684160000033 }, { "questionId": "q40", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "49741", "actual": "49741", "correct": true, - "inputTokens": 2528, - "outputTokens": 3, - "latencyMs": 1070 + "inputTokens": 2527, + "outputTokens": 72, + "latencyMs": 2648.919750000001 }, { "questionId": "q40", @@ -4331,18 +4331,18 @@ "correct": true, "inputTokens": 2983, "outputTokens": 6, - "latencyMs": 1162 + "latencyMs": 1525.6309170000022 }, { "questionId": "q40", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "49741", "actual": "49741", "correct": true, - "inputTokens": 2382, - "outputTokens": 3, - "latencyMs": 1115 + "inputTokens": 2381, + "outputTokens": 136, + "latencyMs": 2736.3283749999973 }, { "questionId": "q40", @@ -4353,18 +4353,18 @@ "correct": false, "inputTokens": 2857, "outputTokens": 6, - "latencyMs": 1365 + "latencyMs": 1077.766334 }, { "questionId": "q40", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "49741", "actual": "49741", "correct": true, - "inputTokens": 6317, - "outputTokens": 3, - "latencyMs": 2004 + "inputTokens": 6316, + "outputTokens": 72, + "latencyMs": 2116.5284170000014 }, { "questionId": "q40", @@ -4375,18 +4375,18 @@ "correct": true, "inputTokens": 6366, "outputTokens": 6, - "latencyMs": 1113 + "latencyMs": 1159.7744170000005 }, { "questionId": "q40", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "49741", "actual": "49741", "correct": true, - "inputTokens": 5013, - "outputTokens": 3, - "latencyMs": 3055 + "inputTokens": 5012, + "outputTokens": 72, + "latencyMs": 2529.7074160000047 }, { "questionId": "q40", @@ -4397,18 +4397,18 @@ "correct": true, "inputTokens": 5761, "outputTokens": 6, - "latencyMs": 1392 + "latencyMs": 1604.601791999994 }, { "questionId": "q41", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 6388, - "outputTokens": 2, - "latencyMs": 3877 + "actual": "17", + "correct": true, + "inputTokens": 6387, + "outputTokens": 967, + "latencyMs": 8300.216583000001 }, { "questionId": "q41", @@ -4419,18 +4419,18 @@ "correct": false, "inputTokens": 7865, "outputTokens": 5, - "latencyMs": 1128 + "latencyMs": 1204.089749999992 }, { "questionId": "q41", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 2525, - "outputTokens": 2, - "latencyMs": 966 + "actual": "17", + "correct": true, + "inputTokens": 2524, + "outputTokens": 455, + "latencyMs": 5231.604541000001 }, { "questionId": "q41", @@ -4441,18 +4441,18 @@ "correct": false, "inputTokens": 2977, "outputTokens": 5, - "latencyMs": 1070 + "latencyMs": 1168.508707999994 }, { "questionId": "q41", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 2379, - "outputTokens": 2, - "latencyMs": 2411 + "actual": "17", + "correct": true, + "inputTokens": 2378, + "outputTokens": 967, + "latencyMs": 8396.912500000006 }, { "questionId": "q41", @@ -4463,18 +4463,18 @@ "correct": false, "inputTokens": 2851, "outputTokens": 5, - "latencyMs": 1286 + "latencyMs": 1060.6276250000083 }, { "questionId": "q41", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 6314, - "outputTokens": 2, - "latencyMs": 2082 + "actual": "17", + "correct": true, + "inputTokens": 6313, + "outputTokens": 775, + "latencyMs": 9340.763791999998 }, { "questionId": "q41", @@ -4485,18 +4485,18 @@ "correct": false, "inputTokens": 6360, "outputTokens": 5, - "latencyMs": 1107 + "latencyMs": 1020.8827080000046 }, { "questionId": "q41", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 5010, - "outputTokens": 2, - "latencyMs": 1216 + "actual": "17", + "correct": true, + "inputTokens": 5009, + "outputTokens": 903, + "latencyMs": 8792.062000000005 }, { "questionId": "q41", @@ -4507,18 +4507,18 @@ "correct": false, "inputTokens": 5755, "outputTokens": 5, - "latencyMs": 1052 + "latencyMs": 1459.8301659999997 }, { "questionId": "q42", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 6388, - "outputTokens": 2, - "latencyMs": 1572 + "actual": "17", + "correct": true, + "inputTokens": 6387, + "outputTokens": 519, + "latencyMs": 6439.622583000004 }, { "questionId": "q42", @@ -4529,18 +4529,18 @@ "correct": false, "inputTokens": 7865, "outputTokens": 5, - "latencyMs": 1084 + "latencyMs": 1416.1659170000057 }, { "questionId": "q42", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 2525, - "outputTokens": 2, - "latencyMs": 1377 + "actual": "17", + "correct": true, + "inputTokens": 2524, + "outputTokens": 903, + "latencyMs": 8064.398499999996 }, { "questionId": "q42", @@ -4551,18 +4551,18 @@ "correct": false, "inputTokens": 2977, "outputTokens": 5, - "latencyMs": 1197 + "latencyMs": 998.3781250000029 }, { "questionId": "q42", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 2379, - "outputTokens": 2, - "latencyMs": 2705 + "actual": "17", + "correct": true, + "inputTokens": 2378, + "outputTokens": 647, + "latencyMs": 5498.786500000002 }, { "questionId": "q42", @@ -4573,18 +4573,18 @@ "correct": false, "inputTokens": 2851, "outputTokens": 5, - "latencyMs": 1020 + "latencyMs": 1343.9632910000073 }, { "questionId": "q42", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 6314, - "outputTokens": 2, - "latencyMs": 5345 + "actual": "17", + "correct": true, + "inputTokens": 6313, + "outputTokens": 647, + "latencyMs": 7565.158291 }, { "questionId": "q42", @@ -4595,18 +4595,18 @@ "correct": false, "inputTokens": 6360, "outputTokens": 5, - "latencyMs": 1207 + "latencyMs": 1320.9714169999934 }, { "questionId": "q42", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 5010, - "outputTokens": 2, - "latencyMs": 921 + "actual": "17", + "correct": true, + "inputTokens": 5009, + "outputTokens": 839, + "latencyMs": 10626.395499999999 }, { "questionId": "q42", @@ -4617,18 +4617,18 @@ "correct": false, "inputTokens": 5755, "outputTokens": 5, - "latencyMs": 1289 + "latencyMs": 3227.584917 }, { "questionId": "q43", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 6388, - "outputTokens": 2, - "latencyMs": 2423 + "actual": "17", + "correct": true, + "inputTokens": 6387, + "outputTokens": 583, + "latencyMs": 6690.373416000002 }, { "questionId": "q43", @@ -4639,18 +4639,18 @@ "correct": false, "inputTokens": 7865, "outputTokens": 5, - "latencyMs": 1273 + "latencyMs": 1187.1296250000014 }, { "questionId": "q43", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 2525, - "outputTokens": 2, - "latencyMs": 975 + "actual": "17", + "correct": true, + "inputTokens": 2524, + "outputTokens": 519, + "latencyMs": 5081.884875000003 }, { "questionId": "q43", @@ -4661,18 +4661,18 @@ "correct": false, "inputTokens": 2977, "outputTokens": 5, - "latencyMs": 1301 + "latencyMs": 1576.2339999999967 }, { "questionId": "q43", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 2379, - "outputTokens": 2, - "latencyMs": 1423 + "actual": "17", + "correct": true, + "inputTokens": 2378, + "outputTokens": 1031, + "latencyMs": 9927.5775 }, { "questionId": "q43", @@ -4683,18 +4683,18 @@ "correct": false, "inputTokens": 2851, "outputTokens": 5, - "latencyMs": 927 + "latencyMs": 1169.6451669999951 }, { "questionId": "q43", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 6314, - "outputTokens": 2, - "latencyMs": 1258 + "actual": "17", + "correct": true, + "inputTokens": 6313, + "outputTokens": 519, + "latencyMs": 6772.954291999995 }, { "questionId": "q43", @@ -4705,18 +4705,18 @@ "correct": false, "inputTokens": 6360, "outputTokens": 5, - "latencyMs": 1250 + "latencyMs": 1905.9189590000024 }, { "questionId": "q43", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 5010, - "outputTokens": 2, - "latencyMs": 872 + "actual": "17", + "correct": true, + "inputTokens": 5009, + "outputTokens": 455, + "latencyMs": 6827.424666999999 }, { "questionId": "q43", @@ -4727,18 +4727,18 @@ "correct": false, "inputTokens": 5755, "outputTokens": 5, - "latencyMs": 1385 + "latencyMs": 2121.3979160000017 }, { "questionId": "q44", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 6388, - "outputTokens": 2, - "latencyMs": 1201 + "actual": "17", + "correct": true, + "inputTokens": 6387, + "outputTokens": 519, + "latencyMs": 15235.099042000002 }, { "questionId": "q44", @@ -4749,18 +4749,18 @@ "correct": false, "inputTokens": 7865, "outputTokens": 5, - "latencyMs": 1149 + "latencyMs": 1182.0669170000037 }, { "questionId": "q44", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 2525, - "outputTokens": 2, - "latencyMs": 1498 + "actual": "17", + "correct": true, + "inputTokens": 2524, + "outputTokens": 583, + "latencyMs": 6872.47600000001 }, { "questionId": "q44", @@ -4771,18 +4771,18 @@ "correct": false, "inputTokens": 2977, "outputTokens": 5, - "latencyMs": 1149 + "latencyMs": 931.0203749999928 }, { "questionId": "q44", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 2379, - "outputTokens": 2, - "latencyMs": 1098 + "actual": "17", + "correct": true, + "inputTokens": 2378, + "outputTokens": 2311, + "latencyMs": 17952.683875000002 }, { "questionId": "q44", @@ -4793,18 +4793,18 @@ "correct": false, "inputTokens": 2851, "outputTokens": 5, - "latencyMs": 1121 + "latencyMs": 1167.8899999999994 }, { "questionId": "q44", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 6314, - "outputTokens": 2, - "latencyMs": 2522 + "actual": "17", + "correct": true, + "inputTokens": 6313, + "outputTokens": 455, + "latencyMs": 6896.831916999989 }, { "questionId": "q44", @@ -4815,18 +4815,18 @@ "correct": false, "inputTokens": 6360, "outputTokens": 5, - "latencyMs": 1532 + "latencyMs": 1401.859083000003 }, { "questionId": "q44", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "17", - "actual": "20", - "correct": false, - "inputTokens": 5010, - "outputTokens": 2, - "latencyMs": 4914 + "actual": "17", + "correct": true, + "inputTokens": 5009, + "outputTokens": 647, + "latencyMs": 5266.956917000003 }, { "questionId": "q44", @@ -4837,18 +4837,18 @@ "correct": false, "inputTokens": 5755, "outputTokens": 5, - "latencyMs": 1324 + "latencyMs": 1100.9057919999905 }, { "questionId": "q45", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "16", - "actual": "20", - "correct": false, - "inputTokens": 6388, - "outputTokens": 2, - "latencyMs": 1446 + "actual": "16", + "correct": true, + "inputTokens": 6387, + "outputTokens": 1095, + "latencyMs": 15621.264291999993 }, { "questionId": "q45", @@ -4859,18 +4859,18 @@ "correct": false, "inputTokens": 7865, "outputTokens": 5, - "latencyMs": 1105 + "latencyMs": 1063.5868750000081 }, { "questionId": "q45", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "16", - "actual": "20", - "correct": false, - "inputTokens": 2525, - "outputTokens": 2, - "latencyMs": 1297 + "actual": "16", + "correct": true, + "inputTokens": 2524, + "outputTokens": 455, + "latencyMs": 5703.061916000006 }, { "questionId": "q45", @@ -4881,18 +4881,18 @@ "correct": false, "inputTokens": 2977, "outputTokens": 5, - "latencyMs": 1251 + "latencyMs": 1113.9432499999966 }, { "questionId": "q45", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "16", - "actual": "20", - "correct": false, - "inputTokens": 2379, - "outputTokens": 2, - "latencyMs": 1561 + "actual": "16", + "correct": true, + "inputTokens": 2378, + "outputTokens": 3015, + "latencyMs": 22321.357124999995 }, { "questionId": "q45", @@ -4903,18 +4903,18 @@ "correct": false, "inputTokens": 2851, "outputTokens": 5, - "latencyMs": 1292 + "latencyMs": 968.0936249999941 }, { "questionId": "q45", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "16", - "actual": "20", - "correct": false, - "inputTokens": 6314, - "outputTokens": 2, - "latencyMs": 1127 + "actual": "16", + "correct": true, + "inputTokens": 6313, + "outputTokens": 1287, + "latencyMs": 14521.080749999994 }, { "questionId": "q45", @@ -4925,18 +4925,18 @@ "correct": false, "inputTokens": 6360, "outputTokens": 5, - "latencyMs": 1207 + "latencyMs": 1228.1847500000003 }, { "questionId": "q45", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "16", - "actual": "20", - "correct": false, - "inputTokens": 5010, - "outputTokens": 2, - "latencyMs": 1582 + "actual": "16", + "correct": true, + "inputTokens": 5009, + "outputTokens": 455, + "latencyMs": 5216.268042000011 }, { "questionId": "q45", @@ -4947,18 +4947,18 @@ "correct": false, "inputTokens": 5755, "outputTokens": 5, - "latencyMs": 1278 + "latencyMs": 1026.5127079999947 }, { "questionId": "q46", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "16", - "actual": "20", - "correct": false, - "inputTokens": 6388, - "outputTokens": 2, - "latencyMs": 1278 + "actual": "16", + "correct": true, + "inputTokens": 6387, + "outputTokens": 391, + "latencyMs": 4335.125541000001 }, { "questionId": "q46", @@ -4969,18 +4969,18 @@ "correct": false, "inputTokens": 7865, "outputTokens": 5, - "latencyMs": 3084 + "latencyMs": 1116.4177909999999 }, { "questionId": "q46", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "16", - "actual": "20", - "correct": false, - "inputTokens": 2525, - "outputTokens": 2, - "latencyMs": 1289 + "actual": "16", + "correct": true, + "inputTokens": 2524, + "outputTokens": 583, + "latencyMs": 4128.823499999999 }, { "questionId": "q46", @@ -4991,18 +4991,18 @@ "correct": false, "inputTokens": 2977, "outputTokens": 5, - "latencyMs": 1591 + "latencyMs": 1105.622457999998 }, { "questionId": "q46", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "16", - "actual": "20", - "correct": false, - "inputTokens": 2379, - "outputTokens": 2, - "latencyMs": 3038 + "actual": "16", + "correct": true, + "inputTokens": 2378, + "outputTokens": 839, + "latencyMs": 6542.58583299999 }, { "questionId": "q46", @@ -5013,18 +5013,18 @@ "correct": false, "inputTokens": 2851, "outputTokens": 5, - "latencyMs": 1447 + "latencyMs": 1084.2237909999967 }, { "questionId": "q46", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "16", - "actual": "20", - "correct": false, - "inputTokens": 6314, - "outputTokens": 2, - "latencyMs": 1224 + "actual": "16", + "correct": true, + "inputTokens": 6313, + "outputTokens": 455, + "latencyMs": 5050.133375000005 }, { "questionId": "q46", @@ -5035,18 +5035,18 @@ "correct": false, "inputTokens": 6360, "outputTokens": 5, - "latencyMs": 1250 + "latencyMs": 1075.023709000001 }, { "questionId": "q46", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "16", - "actual": "20", - "correct": false, - "inputTokens": 5010, - "outputTokens": 2, - "latencyMs": 1364 + "actual": "16", + "correct": true, + "inputTokens": 5009, + "outputTokens": 711, + "latencyMs": 9237.985791 }, { "questionId": "q46", @@ -5057,18 +5057,18 @@ "correct": false, "inputTokens": 5755, "outputTokens": 5, - "latencyMs": 1560 + "latencyMs": 1346.3510000000097 }, { "questionId": "q47", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "91", - "actual": "66", - "correct": false, - "inputTokens": 6393, - "outputTokens": 2, - "latencyMs": 989 + "actual": "91", + "correct": true, + "inputTokens": 6392, + "outputTokens": 2375, + "latencyMs": 27655.89520900001 }, { "questionId": "q47", @@ -5079,18 +5079,18 @@ "correct": false, "inputTokens": 7870, "outputTokens": 5, - "latencyMs": 1358 + "latencyMs": 1315.7111659999937 }, { "questionId": "q47", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "91", - "actual": "66", - "correct": false, - "inputTokens": 2530, - "outputTokens": 2, - "latencyMs": 1406 + "actual": "91", + "correct": true, + "inputTokens": 2529, + "outputTokens": 2695, + "latencyMs": 26482.504707999993 }, { "questionId": "q47", @@ -5101,18 +5101,18 @@ "correct": false, "inputTokens": 2982, "outputTokens": 5, - "latencyMs": 1123 + "latencyMs": 1368.221916999988 }, { "questionId": "q47", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "91", - "actual": "66", - "correct": false, - "inputTokens": 2384, - "outputTokens": 2, - "latencyMs": 4883 + "actual": "91", + "correct": true, + "inputTokens": 2383, + "outputTokens": 1671, + "latencyMs": 18249.434333000012 }, { "questionId": "q47", @@ -5123,18 +5123,18 @@ "correct": false, "inputTokens": 2856, "outputTokens": 5, - "latencyMs": 1402 + "latencyMs": 1051.9521660000028 }, { "questionId": "q47", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "91", - "actual": "66", - "correct": false, - "inputTokens": 6319, - "outputTokens": 2, - "latencyMs": 1915 + "actual": "91", + "correct": true, + "inputTokens": 6318, + "outputTokens": 1799, + "latencyMs": 15867.284083999999 }, { "questionId": "q47", @@ -5145,18 +5145,18 @@ "correct": false, "inputTokens": 6365, "outputTokens": 5, - "latencyMs": 1263 + "latencyMs": 1831.3835839999956 }, { "questionId": "q47", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "91", - "actual": "66", - "correct": false, - "inputTokens": 5015, - "outputTokens": 2, - "latencyMs": 1448 + "actual": "91", + "correct": true, + "inputTokens": 5014, + "outputTokens": 2247, + "latencyMs": 19254.821666999997 }, { "questionId": "q47", @@ -5167,18 +5167,18 @@ "correct": false, "inputTokens": 5760, "outputTokens": 5, - "latencyMs": 1243 + "latencyMs": 1762.2908329999918 }, { "questionId": "q48", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "67", - "actual": "54", - "correct": false, - "inputTokens": 6393, - "outputTokens": 2, - "latencyMs": 1456 + "actual": "67", + "correct": true, + "inputTokens": 6392, + "outputTokens": 1479, + "latencyMs": 13444.104542000001 }, { "questionId": "q48", @@ -5189,18 +5189,18 @@ "correct": false, "inputTokens": 7870, "outputTokens": 5, - "latencyMs": 1186 + "latencyMs": 1182.2523340000043 }, { "questionId": "q48", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "67", - "actual": "54", - "correct": false, - "inputTokens": 2530, - "outputTokens": 2, - "latencyMs": 1076 + "actual": "67", + "correct": true, + "inputTokens": 2529, + "outputTokens": 2183, + "latencyMs": 19257.86050000001 }, { "questionId": "q48", @@ -5211,18 +5211,18 @@ "correct": false, "inputTokens": 2982, "outputTokens": 5, - "latencyMs": 1168 + "latencyMs": 1081.3142080000107 }, { "questionId": "q48", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "67", - "actual": "56", - "correct": false, - "inputTokens": 2384, - "outputTokens": 2, - "latencyMs": 3105 + "actual": "67", + "correct": true, + "inputTokens": 2383, + "outputTokens": 3463, + "latencyMs": 21384.707542000004 }, { "questionId": "q48", @@ -5233,40 +5233,40 @@ "correct": false, "inputTokens": 2856, "outputTokens": 5, - "latencyMs": 1375 + "latencyMs": 1051.6647080000112 }, { "questionId": "q48", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", + "expected": "67", + "actual": "67", + "correct": true, + "inputTokens": 6318, + "outputTokens": 2439, + "latencyMs": 19519.416207999995 + }, + { + "questionId": "q48", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "67", + "actual": "47", + "correct": false, + "inputTokens": 6365, + "outputTokens": 5, + "latencyMs": 1060.1008749999892 + }, + { + "questionId": "q48", + "format": "yaml", + "model": "gpt-5-nano", "expected": "67", "actual": "66", "correct": false, - "inputTokens": 6319, - "outputTokens": 2, - "latencyMs": 1618 - }, - { - "questionId": "q48", - "format": "markdown-kv", - "model": "claude-haiku-4-5", - "expected": "67", - "actual": "47", - "correct": false, - "inputTokens": 6365, - "outputTokens": 5, - "latencyMs": 1454 - }, - { - "questionId": "q48", - "format": "yaml", - "model": "gpt-4o-mini", - "expected": "67", - "actual": "54", - "correct": false, - "inputTokens": 5015, - "outputTokens": 2, - "latencyMs": 1244 + "inputTokens": 5014, + "outputTokens": 1991, + "latencyMs": 15234.403459000008 }, { "questionId": "q48", @@ -5277,18 +5277,18 @@ "correct": false, "inputTokens": 5760, "outputTokens": 5, - "latencyMs": 1113 + "latencyMs": 1208.8559589999932 }, { "questionId": "q49", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "41", - "actual": "30", - "correct": false, - "inputTokens": 6393, - "outputTokens": 2, - "latencyMs": 1267 + "actual": "41", + "correct": true, + "inputTokens": 6392, + "outputTokens": 1415, + "latencyMs": 14119.885540999996 }, { "questionId": "q49", @@ -5299,18 +5299,18 @@ "correct": false, "inputTokens": 7870, "outputTokens": 5, - "latencyMs": 1227 + "latencyMs": 1428.8373750000028 }, { "questionId": "q49", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "41", - "actual": "30", - "correct": false, - "inputTokens": 2530, - "outputTokens": 2, - "latencyMs": 1246 + "actual": "41", + "correct": true, + "inputTokens": 2529, + "outputTokens": 1607, + "latencyMs": 13997.297709000006 }, { "questionId": "q49", @@ -5321,18 +5321,18 @@ "correct": false, "inputTokens": 2982, "outputTokens": 5, - "latencyMs": 1127 + "latencyMs": 1270.4412920000032 }, { "questionId": "q49", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "41", - "actual": "34", - "correct": false, - "inputTokens": 2384, - "outputTokens": 2, - "latencyMs": 1260 + "actual": "41", + "correct": true, + "inputTokens": 2383, + "outputTokens": 1415, + "latencyMs": 13861.177167000002 }, { "questionId": "q49", @@ -5343,18 +5343,18 @@ "correct": false, "inputTokens": 2856, "outputTokens": 5, - "latencyMs": 1293 + "latencyMs": 916.5238340000069 }, { "questionId": "q49", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "41", - "actual": "24", + "actual": "42", "correct": false, - "inputTokens": 6319, - "outputTokens": 2, - "latencyMs": 1246 + "inputTokens": 6318, + "outputTokens": 1799, + "latencyMs": 16007.06925 }, { "questionId": "q49", @@ -5365,18 +5365,18 @@ "correct": false, "inputTokens": 6365, "outputTokens": 5, - "latencyMs": 1598 + "latencyMs": 1426.0594579999888 }, { "questionId": "q49", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "41", - "actual": "24", - "correct": false, - "inputTokens": 5015, - "outputTokens": 2, - "latencyMs": 1471 + "actual": "41", + "correct": true, + "inputTokens": 5014, + "outputTokens": 2055, + "latencyMs": 22966.680624999994 }, { "questionId": "q49", @@ -5387,18 +5387,18 @@ "correct": false, "inputTokens": 5760, "outputTokens": 5, - "latencyMs": 1311 + "latencyMs": 1044.6609999999928 }, { "questionId": "q50", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "26", - "actual": "22", - "correct": false, - "inputTokens": 6393, - "outputTokens": 2, - "latencyMs": 3950 + "actual": "26", + "correct": true, + "inputTokens": 6392, + "outputTokens": 1159, + "latencyMs": 10799.117333000002 }, { "questionId": "q50", @@ -5409,18 +5409,18 @@ "correct": false, "inputTokens": 7870, "outputTokens": 5, - "latencyMs": 1075 + "latencyMs": 1359.5568330000096 }, { "questionId": "q50", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "26", - "actual": "22", - "correct": false, - "inputTokens": 2530, - "outputTokens": 2, - "latencyMs": 1868 + "actual": "26", + "correct": true, + "inputTokens": 2529, + "outputTokens": 1543, + "latencyMs": 13702.052542000005 }, { "questionId": "q50", @@ -5431,18 +5431,18 @@ "correct": false, "inputTokens": 2982, "outputTokens": 5, - "latencyMs": 1075 + "latencyMs": 967.0454159999936 }, { "questionId": "q50", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "26", - "actual": "24", - "correct": false, - "inputTokens": 2384, - "outputTokens": 2, - "latencyMs": 1973 + "actual": "26", + "correct": true, + "inputTokens": 2383, + "outputTokens": 1671, + "latencyMs": 13116.871958000003 }, { "questionId": "q50", @@ -5453,18 +5453,18 @@ "correct": false, "inputTokens": 2856, "outputTokens": 5, - "latencyMs": 947 + "latencyMs": 1088.8372910000035 }, { "questionId": "q50", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "26", - "actual": "22", - "correct": false, - "inputTokens": 6319, - "outputTokens": 2, - "latencyMs": 1414 + "actual": "26", + "correct": true, + "inputTokens": 6318, + "outputTokens": 1543, + "latencyMs": 14387.148624999987 }, { "questionId": "q50", @@ -5475,18 +5475,18 @@ "correct": false, "inputTokens": 6365, "outputTokens": 5, - "latencyMs": 1221 + "latencyMs": 1273.9564170000085 }, { "questionId": "q50", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "26", - "actual": "18", - "correct": false, - "inputTokens": 5015, - "outputTokens": 2, - "latencyMs": 1148 + "actual": "26", + "correct": true, + "inputTokens": 5014, + "outputTokens": 1223, + "latencyMs": 12143.083792000005 }, { "questionId": "q50", @@ -5497,18 +5497,18 @@ "correct": false, "inputTokens": 5760, "outputTokens": 5, - "latencyMs": 1286 + "latencyMs": 1032.9807079999882 }, { "questionId": "q51", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "78", - "actual": "66", - "correct": false, - "inputTokens": 6387, - "outputTokens": 2, - "latencyMs": 2525 + "actual": "78", + "correct": true, + "inputTokens": 6386, + "outputTokens": 2631, + "latencyMs": 23077.678417000003 }, { "questionId": "q51", @@ -5519,18 +5519,18 @@ "correct": false, "inputTokens": 7864, "outputTokens": 5, - "latencyMs": 1613 + "latencyMs": 1281.171417000005 }, { "questionId": "q51", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "78", - "actual": "66", - "correct": false, - "inputTokens": 2524, - "outputTokens": 2, - "latencyMs": 1132 + "actual": "78", + "correct": true, + "inputTokens": 2523, + "outputTokens": 2759, + "latencyMs": 20331.962667 }, { "questionId": "q51", @@ -5541,18 +5541,18 @@ "correct": true, "inputTokens": 2976, "outputTokens": 5, - "latencyMs": 1104 + "latencyMs": 1014.3847079999978 }, { "questionId": "q51", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "78", - "actual": "77", + "actual": "81", "correct": false, - "inputTokens": 2378, - "outputTokens": 2, - "latencyMs": 1069 + "inputTokens": 2377, + "outputTokens": 3335, + "latencyMs": 18037.630208000002 }, { "questionId": "q51", @@ -5563,18 +5563,18 @@ "correct": false, "inputTokens": 2850, "outputTokens": 5, - "latencyMs": 1113 + "latencyMs": 918.3078749999986 }, { "questionId": "q51", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "78", - "actual": "66", - "correct": false, - "inputTokens": 6313, - "outputTokens": 2, - "latencyMs": 1999 + "actual": "78", + "correct": true, + "inputTokens": 6312, + "outputTokens": 1991, + "latencyMs": 15660.232958000008 }, { "questionId": "q51", @@ -5585,18 +5585,18 @@ "correct": true, "inputTokens": 6359, "outputTokens": 5, - "latencyMs": 1214 + "latencyMs": 1033.7647080000024 }, { "questionId": "q51", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "78", - "actual": "66", - "correct": false, - "inputTokens": 5009, - "outputTokens": 2, - "latencyMs": 1613 + "actual": "78", + "correct": true, + "inputTokens": 5008, + "outputTokens": 4295, + "latencyMs": 26817.97 }, { "questionId": "q51", @@ -5607,18 +5607,18 @@ "correct": false, "inputTokens": 5754, "outputTokens": 5, - "latencyMs": 1012 + "latencyMs": 1348.084750000009 }, { "questionId": "q52", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "22", - "actual": "30", - "correct": false, - "inputTokens": 6387, - "outputTokens": 2, - "latencyMs": 1580 + "actual": "22", + "correct": true, + "inputTokens": 6386, + "outputTokens": 1223, + "latencyMs": 10273.866540999996 }, { "questionId": "q52", @@ -5629,18 +5629,18 @@ "correct": false, "inputTokens": 7864, "outputTokens": 5, - "latencyMs": 1688 + "latencyMs": 1081.604707999999 }, { "questionId": "q52", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "22", "actual": "22", "correct": true, - "inputTokens": 2524, - "outputTokens": 2, - "latencyMs": 1290 + "inputTokens": 2523, + "outputTokens": 903, + "latencyMs": 13862.020499999999 }, { "questionId": "q52", @@ -5651,18 +5651,18 @@ "correct": false, "inputTokens": 2976, "outputTokens": 5, - "latencyMs": 1121 + "latencyMs": 965.817916 }, { "questionId": "q52", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "22", - "actual": "10", + "actual": "21", "correct": false, - "inputTokens": 2378, - "outputTokens": 2, - "latencyMs": 1544 + "inputTokens": 2377, + "outputTokens": 2631, + "latencyMs": 24254.82570799999 }, { "questionId": "q52", @@ -5673,18 +5673,18 @@ "correct": false, "inputTokens": 2850, "outputTokens": 5, - "latencyMs": 822 + "latencyMs": 998.7978339999972 }, { "questionId": "q52", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "22", - "actual": "34", - "correct": false, - "inputTokens": 6313, - "outputTokens": 2, - "latencyMs": 2718 + "actual": "22", + "correct": true, + "inputTokens": 6312, + "outputTokens": 1095, + "latencyMs": 10401.351500000004 }, { "questionId": "q52", @@ -5695,18 +5695,18 @@ "correct": false, "inputTokens": 6359, "outputTokens": 5, - "latencyMs": 1211 + "latencyMs": 1479.388791999998 }, { "questionId": "q52", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "22", - "actual": "34", - "correct": false, - "inputTokens": 5009, - "outputTokens": 2, - "latencyMs": 1162 + "actual": "22", + "correct": true, + "inputTokens": 5008, + "outputTokens": 839, + "latencyMs": 8160.454833999989 }, { "questionId": "q52", @@ -5717,370 +5717,370 @@ "correct": false, "inputTokens": 5754, "outputTokens": 5, - "latencyMs": 1156 + "latencyMs": 1763.230291999993 }, { "questionId": "q53", "format": "json", - "model": "gpt-4o-mini", - "expected": "12", - "actual": "24", - "correct": false, - "inputTokens": 6395, - "outputTokens": 2, - "latencyMs": 1089 - }, - { - "questionId": "q53", - "format": "json", - "model": "claude-haiku-4-5", - "expected": "12", - "actual": "9", - "correct": false, - "inputTokens": 7872, - "outputTokens": 5, - "latencyMs": 1368 - }, - { - "questionId": "q53", - "format": "toon", - "model": "gpt-4o-mini", - "expected": "12", - "actual": "24", - "correct": false, - "inputTokens": 2532, - "outputTokens": 2, - "latencyMs": 1850 - }, - { - "questionId": "q53", - "format": "toon", - "model": "claude-haiku-4-5", - "expected": "12", - "actual": "9", - "correct": false, - "inputTokens": 2984, - "outputTokens": 5, - "latencyMs": 914 - }, - { - "questionId": "q53", - "format": "csv", - "model": "gpt-4o-mini", - "expected": "12", - "actual": "34", - "correct": false, - "inputTokens": 2386, - "outputTokens": 2, - "latencyMs": 1156 - }, - { - "questionId": "q53", - "format": "csv", - "model": "claude-haiku-4-5", - "expected": "12", - "actual": "10", - "correct": false, - "inputTokens": 2858, - "outputTokens": 5, - "latencyMs": 1118 - }, - { - "questionId": "q53", - "format": "markdown-kv", - "model": "gpt-4o-mini", - "expected": "12", - "actual": "22", - "correct": false, - "inputTokens": 6321, - "outputTokens": 2, - "latencyMs": 1020 - }, - { - "questionId": "q53", - "format": "markdown-kv", - "model": "claude-haiku-4-5", - "expected": "12", - "actual": "8", - "correct": false, - "inputTokens": 6367, - "outputTokens": 5, - "latencyMs": 1021 - }, - { - "questionId": "q53", - "format": "yaml", - "model": "gpt-4o-mini", - "expected": "12", - "actual": "18", - "correct": false, - "inputTokens": 5017, - "outputTokens": 2, - "latencyMs": 1236 - }, - { - "questionId": "q53", - "format": "yaml", - "model": "claude-haiku-4-5", - "expected": "12", - "actual": "10", - "correct": false, - "inputTokens": 5762, - "outputTokens": 5, - "latencyMs": 1574 - }, - { - "questionId": "q54", - "format": "json", - "model": "gpt-4o-mini", - "expected": "11", - "actual": "24", - "correct": false, - "inputTokens": 6395, - "outputTokens": 2, - "latencyMs": 1437 - }, - { - "questionId": "q54", - "format": "json", - "model": "claude-haiku-4-5", - "expected": "11", - "actual": "7", - "correct": false, - "inputTokens": 7872, - "outputTokens": 5, - "latencyMs": 1091 - }, - { - "questionId": "q54", - "format": "toon", - "model": "gpt-4o-mini", - "expected": "11", - "actual": "24", - "correct": false, - "inputTokens": 2532, - "outputTokens": 2, - "latencyMs": 1917 - }, - { - "questionId": "q54", - "format": "toon", - "model": "claude-haiku-4-5", - "expected": "11", - "actual": "6", - "correct": false, - "inputTokens": 2984, - "outputTokens": 5, - "latencyMs": 1095 - }, - { - "questionId": "q54", - "format": "csv", - "model": "gpt-4o-mini", - "expected": "11", - "actual": "34", - "correct": false, - "inputTokens": 2386, - "outputTokens": 2, - "latencyMs": 4230 - }, - { - "questionId": "q54", - "format": "csv", - "model": "claude-haiku-4-5", - "expected": "11", - "actual": "8", - "correct": false, - "inputTokens": 2858, - "outputTokens": 5, - "latencyMs": 1187 - }, - { - "questionId": "q54", - "format": "markdown-kv", - "model": "gpt-4o-mini", - "expected": "11", - "actual": "24", - "correct": false, - "inputTokens": 6321, - "outputTokens": 2, - "latencyMs": 1197 - }, - { - "questionId": "q54", - "format": "markdown-kv", - "model": "claude-haiku-4-5", - "expected": "11", - "actual": "6", - "correct": false, - "inputTokens": 6367, - "outputTokens": 5, - "latencyMs": 1176 - }, - { - "questionId": "q54", - "format": "yaml", - "model": "gpt-4o-mini", - "expected": "11", - "actual": "18", - "correct": false, - "inputTokens": 5017, - "outputTokens": 2, - "latencyMs": 1249 - }, - { - "questionId": "q54", - "format": "yaml", - "model": "claude-haiku-4-5", - "expected": "11", - "actual": "8", - "correct": false, - "inputTokens": 5762, - "outputTokens": 5, - "latencyMs": 1383 - }, - { - "questionId": "q55", - "format": "json", - "model": "gpt-4o-mini", - "expected": "11", - "actual": "30", - "correct": false, - "inputTokens": 6395, - "outputTokens": 2, - "latencyMs": 1149 - }, - { - "questionId": "q55", - "format": "json", - "model": "claude-haiku-4-5", - "expected": "11", - "actual": "8", - "correct": false, - "inputTokens": 7872, - "outputTokens": 5, - "latencyMs": 1072 - }, - { - "questionId": "q55", - "format": "toon", - "model": "gpt-4o-mini", - "expected": "11", - "actual": "18", - "correct": false, - "inputTokens": 2532, - "outputTokens": 2, - "latencyMs": 1213 - }, - { - "questionId": "q55", - "format": "toon", - "model": "claude-haiku-4-5", - "expected": "11", - "actual": "7", - "correct": false, - "inputTokens": 2984, - "outputTokens": 5, - "latencyMs": 1507 - }, - { - "questionId": "q55", - "format": "csv", - "model": "gpt-4o-mini", - "expected": "11", - "actual": "34", - "correct": false, - "inputTokens": 2386, - "outputTokens": 2, - "latencyMs": 1826 - }, - { - "questionId": "q55", - "format": "csv", - "model": "claude-haiku-4-5", - "expected": "11", - "actual": "8", - "correct": false, - "inputTokens": 2858, - "outputTokens": 5, - "latencyMs": 1162 - }, - { - "questionId": "q55", - "format": "markdown-kv", - "model": "gpt-4o-mini", - "expected": "11", - "actual": "24", - "correct": false, - "inputTokens": 6321, - "outputTokens": 2, - "latencyMs": 1008 - }, - { - "questionId": "q55", - "format": "markdown-kv", - "model": "claude-haiku-4-5", - "expected": "11", - "actual": "7", - "correct": false, - "inputTokens": 6367, - "outputTokens": 5, - "latencyMs": 1285 - }, - { - "questionId": "q55", - "format": "yaml", - "model": "gpt-4o-mini", - "expected": "11", - "actual": "22", - "correct": false, - "inputTokens": 5017, - "outputTokens": 2, - "latencyMs": 1124 - }, - { - "questionId": "q55", - "format": "yaml", - "model": "claude-haiku-4-5", - "expected": "11", - "actual": "9", - "correct": false, - "inputTokens": 5762, - "outputTokens": 5, - "latencyMs": 1212 - }, - { - "questionId": "q56", - "format": "json", - "model": "gpt-4o-mini", - "expected": "12", - "actual": "22", - "correct": false, - "inputTokens": 6395, - "outputTokens": 2, - "latencyMs": 1232 - }, - { - "questionId": "q56", - "format": "json", - "model": "claude-haiku-4-5", - "expected": "12", - "actual": "7", - "correct": false, - "inputTokens": 7872, - "outputTokens": 5, - "latencyMs": 1792 - }, - { - "questionId": "q56", - "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "12", "actual": "12", "correct": true, - "inputTokens": 2532, - "outputTokens": 2, - "latencyMs": 1357 + "inputTokens": 6394, + "outputTokens": 1671, + "latencyMs": 14807.253333 + }, + { + "questionId": "q53", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "12", + "actual": "9", + "correct": false, + "inputTokens": 7872, + "outputTokens": 5, + "latencyMs": 1185.018333 + }, + { + "questionId": "q53", + "format": "toon", + "model": "gpt-5-nano", + "expected": "12", + "actual": "12", + "correct": true, + "inputTokens": 2531, + "outputTokens": 1607, + "latencyMs": 13592.477832999997 + }, + { + "questionId": "q53", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "12", + "actual": "9", + "correct": false, + "inputTokens": 2984, + "outputTokens": 5, + "latencyMs": 947.2789590000029 + }, + { + "questionId": "q53", + "format": "csv", + "model": "gpt-5-nano", + "expected": "12", + "actual": "12", + "correct": true, + "inputTokens": 2385, + "outputTokens": 2759, + "latencyMs": 22718.536041999992 + }, + { + "questionId": "q53", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "12", + "actual": "10", + "correct": false, + "inputTokens": 2858, + "outputTokens": 5, + "latencyMs": 973.4814580000093 + }, + { + "questionId": "q53", + "format": "markdown-kv", + "model": "gpt-5-nano", + "expected": "12", + "actual": "12", + "correct": true, + "inputTokens": 6320, + "outputTokens": 1031, + "latencyMs": 10025.186000000002 + }, + { + "questionId": "q53", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "12", + "actual": "8", + "correct": false, + "inputTokens": 6367, + "outputTokens": 5, + "latencyMs": 1038.4732499999955 + }, + { + "questionId": "q53", + "format": "yaml", + "model": "gpt-5-nano", + "expected": "12", + "actual": "12", + "correct": true, + "inputTokens": 5016, + "outputTokens": 903, + "latencyMs": 12459.619915999996 + }, + { + "questionId": "q53", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "12", + "actual": "10", + "correct": false, + "inputTokens": 5762, + "outputTokens": 5, + "latencyMs": 1448.7940839999937 + }, + { + "questionId": "q54", + "format": "json", + "model": "gpt-5-nano", + "expected": "11", + "actual": "11", + "correct": true, + "inputTokens": 6394, + "outputTokens": 1415, + "latencyMs": 13094.547666999992 + }, + { + "questionId": "q54", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "7", + "correct": false, + "inputTokens": 7872, + "outputTokens": 5, + "latencyMs": 1241.7239169999957 + }, + { + "questionId": "q54", + "format": "toon", + "model": "gpt-5-nano", + "expected": "11", + "actual": "11", + "correct": true, + "inputTokens": 2531, + "outputTokens": 1031, + "latencyMs": 10610.864084 + }, + { + "questionId": "q54", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "6", + "correct": false, + "inputTokens": 2984, + "outputTokens": 5, + "latencyMs": 1100.7670829999988 + }, + { + "questionId": "q54", + "format": "csv", + "model": "gpt-5-nano", + "expected": "11", + "actual": "11", + "correct": true, + "inputTokens": 2385, + "outputTokens": 1095, + "latencyMs": 11523.293417000008 + }, + { + "questionId": "q54", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "8", + "correct": false, + "inputTokens": 2858, + "outputTokens": 5, + "latencyMs": 980.1522499999992 + }, + { + "questionId": "q54", + "format": "markdown-kv", + "model": "gpt-5-nano", + "expected": "11", + "actual": "11", + "correct": true, + "inputTokens": 6320, + "outputTokens": 1095, + "latencyMs": 8184.143375 + }, + { + "questionId": "q54", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "6", + "correct": false, + "inputTokens": 6367, + "outputTokens": 5, + "latencyMs": 1175.0723330000037 + }, + { + "questionId": "q54", + "format": "yaml", + "model": "gpt-5-nano", + "expected": "11", + "actual": "11", + "correct": true, + "inputTokens": 5016, + "outputTokens": 1159, + "latencyMs": 13082.53912500001 + }, + { + "questionId": "q54", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "8", + "correct": false, + "inputTokens": 5762, + "outputTokens": 5, + "latencyMs": 1020.4026659999945 + }, + { + "questionId": "q55", + "format": "json", + "model": "gpt-5-nano", + "expected": "11", + "actual": "11", + "correct": true, + "inputTokens": 6394, + "outputTokens": 1223, + "latencyMs": 13166.679334 + }, + { + "questionId": "q55", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "8", + "correct": false, + "inputTokens": 7872, + "outputTokens": 5, + "latencyMs": 1090.0060839999933 + }, + { + "questionId": "q55", + "format": "toon", + "model": "gpt-5-nano", + "expected": "11", + "actual": "11", + "correct": true, + "inputTokens": 2531, + "outputTokens": 1287, + "latencyMs": 11181.234958000001 + }, + { + "questionId": "q55", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "7", + "correct": false, + "inputTokens": 2984, + "outputTokens": 5, + "latencyMs": 1365.1262080000015 + }, + { + "questionId": "q55", + "format": "csv", + "model": "gpt-5-nano", + "expected": "11", + "actual": "11", + "correct": true, + "inputTokens": 2385, + "outputTokens": 967, + "latencyMs": 9549.427916999994 + }, + { + "questionId": "q55", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "8", + "correct": false, + "inputTokens": 2858, + "outputTokens": 5, + "latencyMs": 981.8662500000064 + }, + { + "questionId": "q55", + "format": "markdown-kv", + "model": "gpt-5-nano", + "expected": "11", + "actual": "11", + "correct": true, + "inputTokens": 6320, + "outputTokens": 1223, + "latencyMs": 11591.030333000002 + }, + { + "questionId": "q55", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "7", + "correct": false, + "inputTokens": 6367, + "outputTokens": 5, + "latencyMs": 1430.038750000007 + }, + { + "questionId": "q55", + "format": "yaml", + "model": "gpt-5-nano", + "expected": "11", + "actual": "10", + "correct": false, + "inputTokens": 5016, + "outputTokens": 1735, + "latencyMs": 11458.303500000009 + }, + { + "questionId": "q55", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "11", + "actual": "9", + "correct": false, + "inputTokens": 5762, + "outputTokens": 5, + "latencyMs": 1103.2402909999946 + }, + { + "questionId": "q56", + "format": "json", + "model": "gpt-5-nano", + "expected": "12", + "actual": "11", + "correct": false, + "inputTokens": 6394, + "outputTokens": 2631, + "latencyMs": 16900.63120799999 + }, + { + "questionId": "q56", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "12", + "actual": "7", + "correct": false, + "inputTokens": 7872, + "outputTokens": 5, + "latencyMs": 1043.442332999999 + }, + { + "questionId": "q56", + "format": "toon", + "model": "gpt-5-nano", + "expected": "12", + "actual": "12", + "correct": true, + "inputTokens": 2531, + "outputTokens": 839, + "latencyMs": 7278.612083 }, { "questionId": "q56", @@ -6091,18 +6091,18 @@ "correct": false, "inputTokens": 2984, "outputTokens": 5, - "latencyMs": 1247 + "latencyMs": 1705.2114999999903 }, { "questionId": "q56", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "12", - "actual": "22", + "actual": "11", "correct": false, - "inputTokens": 2386, - "outputTokens": 2, - "latencyMs": 1043 + "inputTokens": 2385, + "outputTokens": 1415, + "latencyMs": 10625.603375000006 }, { "questionId": "q56", @@ -6113,18 +6113,18 @@ "correct": false, "inputTokens": 2858, "outputTokens": 5, - "latencyMs": 1065 + "latencyMs": 1081.0501670000085 }, { "questionId": "q56", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "12", - "actual": "10", - "correct": false, - "inputTokens": 6321, - "outputTokens": 2, - "latencyMs": 1298 + "actual": "12", + "correct": true, + "inputTokens": 6320, + "outputTokens": 2055, + "latencyMs": 17548.71483299999 }, { "questionId": "q56", @@ -6135,18 +6135,18 @@ "correct": false, "inputTokens": 6367, "outputTokens": 5, - "latencyMs": 1767 + "latencyMs": 2302.2003750000003 }, { "questionId": "q56", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "12", - "actual": "10", + "actual": "11", "correct": false, - "inputTokens": 5017, - "outputTokens": 2, - "latencyMs": 3525 + "inputTokens": 5016, + "outputTokens": 1287, + "latencyMs": 13187.201000000015 }, { "questionId": "q56", @@ -6157,18 +6157,18 @@ "correct": false, "inputTokens": 5762, "outputTokens": 5, - "latencyMs": 1355 + "latencyMs": 2621.4970829999947 }, { "questionId": "q57", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "62", - "actual": "54", - "correct": false, - "inputTokens": 6394, - "outputTokens": 2, - "latencyMs": 1359 + "actual": "62", + "correct": true, + "inputTokens": 6393, + "outputTokens": 3783, + "latencyMs": 29393.69395799999 }, { "questionId": "q57", @@ -6179,18 +6179,18 @@ "correct": true, "inputTokens": 7872, "outputTokens": 5, - "latencyMs": 1447 + "latencyMs": 1402.049291999996 }, { "questionId": "q57", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "62", - "actual": "54", - "correct": false, - "inputTokens": 2531, - "outputTokens": 2, - "latencyMs": 3832 + "actual": "62", + "correct": true, + "inputTokens": 2530, + "outputTokens": 2823, + "latencyMs": 23696.75 }, { "questionId": "q57", @@ -6201,18 +6201,18 @@ "correct": true, "inputTokens": 2984, "outputTokens": 5, - "latencyMs": 1143 + "latencyMs": 1064.7778749999998 }, { "questionId": "q57", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "62", - "actual": "66", + "actual": "64", "correct": false, - "inputTokens": 2385, - "outputTokens": 2, - "latencyMs": 1370 + "inputTokens": 2384, + "outputTokens": 3143, + "latencyMs": 28384.533249999993 }, { "questionId": "q57", @@ -6223,18 +6223,18 @@ "correct": true, "inputTokens": 2858, "outputTokens": 5, - "latencyMs": 1042 + "latencyMs": 889.2725839999912 }, { "questionId": "q57", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "62", - "actual": "54", - "correct": false, - "inputTokens": 6320, - "outputTokens": 2, - "latencyMs": 1015 + "actual": "62", + "correct": true, + "inputTokens": 6319, + "outputTokens": 6663, + "latencyMs": 50113.09675 }, { "questionId": "q57", @@ -6245,18 +6245,18 @@ "correct": true, "inputTokens": 6367, "outputTokens": 5, - "latencyMs": 1395 + "latencyMs": 1074.8158330000006 }, { "questionId": "q57", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "62", - "actual": "54", - "correct": false, - "inputTokens": 5016, - "outputTokens": 2, - "latencyMs": 1008 + "actual": "62", + "correct": true, + "inputTokens": 5015, + "outputTokens": 2631, + "latencyMs": 23841.036083999992 }, { "questionId": "q57", @@ -6267,18 +6267,18 @@ "correct": true, "inputTokens": 5762, "outputTokens": 5, - "latencyMs": 1191 + "latencyMs": 1010.4629169999971 }, { "questionId": "q58", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "45", - "actual": "38", - "correct": false, - "inputTokens": 6394, - "outputTokens": 2, - "latencyMs": 1304 + "actual": "45", + "correct": true, + "inputTokens": 6393, + "outputTokens": 2247, + "latencyMs": 18818.030874999997 }, { "questionId": "q58", @@ -6289,18 +6289,18 @@ "correct": false, "inputTokens": 7872, "outputTokens": 5, - "latencyMs": 1386 + "latencyMs": 1203.152833 }, { "questionId": "q58", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "45", - "actual": "38", - "correct": false, - "inputTokens": 2531, - "outputTokens": 2, - "latencyMs": 1433 + "actual": "45", + "correct": true, + "inputTokens": 2530, + "outputTokens": 2631, + "latencyMs": 21987.539915999994 }, { "questionId": "q58", @@ -6311,18 +6311,18 @@ "correct": false, "inputTokens": 2984, "outputTokens": 5, - "latencyMs": 967 + "latencyMs": 1000.0181669999874 }, { "questionId": "q58", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "45", - "actual": "42", + "actual": "46", "correct": false, - "inputTokens": 2385, - "outputTokens": 2, - "latencyMs": 2469 + "inputTokens": 2384, + "outputTokens": 3079, + "latencyMs": 24534.847250000006 }, { "questionId": "q58", @@ -6333,18 +6333,18 @@ "correct": false, "inputTokens": 2858, "outputTokens": 5, - "latencyMs": 1382 + "latencyMs": 1125.7029999999795 }, { "questionId": "q58", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "45", - "actual": "38", - "correct": false, - "inputTokens": 6320, - "outputTokens": 2, - "latencyMs": 1658 + "actual": "45", + "correct": true, + "inputTokens": 6319, + "outputTokens": 2823, + "latencyMs": 27053.90824999998 }, { "questionId": "q58", @@ -6355,18 +6355,18 @@ "correct": false, "inputTokens": 6367, "outputTokens": 5, - "latencyMs": 1450 + "latencyMs": 1474.1193330000096 }, { "questionId": "q58", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "45", - "actual": "38", - "correct": false, - "inputTokens": 5016, - "outputTokens": 2, - "latencyMs": 1428 + "actual": "45", + "correct": true, + "inputTokens": 5015, + "outputTokens": 2567, + "latencyMs": 21642.824207999976 }, { "questionId": "q58", @@ -6377,18 +6377,18 @@ "correct": false, "inputTokens": 5762, "outputTokens": 5, - "latencyMs": 1144 + "latencyMs": 1170.1535830000066 }, { "questionId": "q59", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "96.17", "actual": "96.17", "correct": true, - "inputTokens": 9740, - "outputTokens": 4, - "latencyMs": 1577 + "inputTokens": 9739, + "outputTokens": 73, + "latencyMs": 2340.6126670000085 }, { "questionId": "q59", @@ -6399,18 +6399,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 7, - "latencyMs": 1181 + "latencyMs": 1337.4746670000022 }, { "questionId": "q59", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "96.17", "actual": "96.17", "correct": true, - "inputTokens": 6014, - "outputTokens": 4, - "latencyMs": 1231 + "inputTokens": 6013, + "outputTokens": 137, + "latencyMs": 2275.1715830000176 }, { "questionId": "q59", @@ -6421,18 +6421,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 7, - "latencyMs": 1407 + "latencyMs": 1086.9557499999937 }, { "questionId": "q59", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "96.17", "actual": "96.17", "correct": true, - "inputTokens": 6782, - "outputTokens": 4, - "latencyMs": 1393 + "inputTokens": 6781, + "outputTokens": 137, + "latencyMs": 2881.4037499999977 }, { "questionId": "q59", @@ -6443,18 +6443,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 7, - "latencyMs": 1534 + "latencyMs": 1172.774000000005 }, { "questionId": "q59", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "96.17", "actual": "96.17", "correct": true, - "inputTokens": 9159, - "outputTokens": 4, - "latencyMs": 1456 + "inputTokens": 9158, + "outputTokens": 201, + "latencyMs": 7706.478582999989 }, { "questionId": "q59", @@ -6465,18 +6465,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 7, - "latencyMs": 1933 + "latencyMs": 1106.0717920000025 }, { "questionId": "q59", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "96.17", "actual": "96.17", "correct": true, - "inputTokens": 7374, - "outputTokens": 4, - "latencyMs": 1472 + "inputTokens": 7373, + "outputTokens": 137, + "latencyMs": 6185.161250000005 }, { "questionId": "q59", @@ -6487,18 +6487,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 7, - "latencyMs": 1224 + "latencyMs": 1388.4410000000207 }, { "questionId": "q60", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", "correct": true, - "inputTokens": 9739, - "outputTokens": 3, - "latencyMs": 2069 + "inputTokens": 9738, + "outputTokens": 136, + "latencyMs": 6699.9394589999865 }, { "questionId": "q60", @@ -6509,18 +6509,18 @@ "correct": true, "inputTokens": 11906, "outputTokens": 4, - "latencyMs": 1172 + "latencyMs": 1152.8117919999931 }, { "questionId": "q60", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", "correct": true, - "inputTokens": 6013, - "outputTokens": 3, - "latencyMs": 1236 + "inputTokens": 6012, + "outputTokens": 136, + "latencyMs": 2446.019666999986 }, { "questionId": "q60", @@ -6531,18 +6531,18 @@ "correct": true, "inputTokens": 6992, "outputTokens": 4, - "latencyMs": 1157 + "latencyMs": 1046.3494580000115 }, { "questionId": "q60", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", "correct": true, - "inputTokens": 6781, - "outputTokens": 3, - "latencyMs": 1364 + "inputTokens": 6780, + "outputTokens": 200, + "latencyMs": 6084.429165999987 }, { "questionId": "q60", @@ -6553,18 +6553,18 @@ "correct": true, "inputTokens": 8413, "outputTokens": 4, - "latencyMs": 1041 + "latencyMs": 1787.2428749999963 }, { "questionId": "q60", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", "correct": true, - "inputTokens": 9158, - "outputTokens": 3, - "latencyMs": 1478 + "inputTokens": 9157, + "outputTokens": 264, + "latencyMs": 5364.3007919999945 }, { "questionId": "q60", @@ -6575,18 +6575,18 @@ "correct": true, "inputTokens": 9288, "outputTokens": 4, - "latencyMs": 1266 + "latencyMs": 1269.2162499999977 }, { "questionId": "q60", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", "correct": true, - "inputTokens": 7373, - "outputTokens": 3, - "latencyMs": 3477 + "inputTokens": 7372, + "outputTokens": 72, + "latencyMs": 2381.514374999999 }, { "questionId": "q60", @@ -6597,18 +6597,18 @@ "correct": true, "inputTokens": 8384, "outputTokens": 4, - "latencyMs": 2630 + "latencyMs": 1222.1361669999897 }, { "questionId": "q61", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "599.39", "actual": "599.39", "correct": true, - "inputTokens": 9740, - "outputTokens": 4, - "latencyMs": 1479 + "inputTokens": 9739, + "outputTokens": 201, + "latencyMs": 3641.536167000013 }, { "questionId": "q61", @@ -6619,18 +6619,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 7, - "latencyMs": 1270 + "latencyMs": 2457.5752079999947 }, { "questionId": "q61", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "599.39", "actual": "599.39", "correct": true, - "inputTokens": 6014, - "outputTokens": 4, - "latencyMs": 1270 + "inputTokens": 6013, + "outputTokens": 201, + "latencyMs": 3384.6115839999984 }, { "questionId": "q61", @@ -6641,18 +6641,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 7, - "latencyMs": 1342 + "latencyMs": 1372.8756669999857 }, { "questionId": "q61", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "599.39", "actual": "599.39", "correct": true, - "inputTokens": 6782, - "outputTokens": 4, - "latencyMs": 1350 + "inputTokens": 6781, + "outputTokens": 265, + "latencyMs": 5826.962750000006 }, { "questionId": "q61", @@ -6663,18 +6663,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 7, - "latencyMs": 1205 + "latencyMs": 1303.1691670000146 }, { "questionId": "q61", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "599.39", "actual": "599.39", "correct": true, - "inputTokens": 9159, - "outputTokens": 4, - "latencyMs": 1502 + "inputTokens": 9158, + "outputTokens": 265, + "latencyMs": 3602.1091250000172 }, { "questionId": "q61", @@ -6685,18 +6685,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 7, - "latencyMs": 1571 + "latencyMs": 1451.1585410000116 }, { "questionId": "q61", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "599.39", "actual": "599.39", "correct": true, - "inputTokens": 7374, - "outputTokens": 4, - "latencyMs": 2013 + "inputTokens": 7373, + "outputTokens": 137, + "latencyMs": 2453.183083000011 }, { "questionId": "q61", @@ -6707,18 +6707,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 7, - "latencyMs": 1428 + "latencyMs": 1152.136541999993 }, { "questionId": "q62", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "processing", "actual": "processing", "correct": true, - "inputTokens": 9739, - "outputTokens": 2, - "latencyMs": 1666 + "inputTokens": 9738, + "outputTokens": 199, + "latencyMs": 5025.56916699998 }, { "questionId": "q62", @@ -6729,18 +6729,18 @@ "correct": true, "inputTokens": 11906, "outputTokens": 4, - "latencyMs": 1549 + "latencyMs": 1111.5014169999922 }, { "questionId": "q62", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "processing", "actual": "processing", "correct": true, - "inputTokens": 6013, - "outputTokens": 2, - "latencyMs": 1033 + "inputTokens": 6012, + "outputTokens": 199, + "latencyMs": 3548.9061660000007 }, { "questionId": "q62", @@ -6751,18 +6751,18 @@ "correct": true, "inputTokens": 6992, "outputTokens": 4, - "latencyMs": 1061 + "latencyMs": 1404.0692500000005 }, { "questionId": "q62", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "processing", "actual": "processing", "correct": true, - "inputTokens": 6781, - "outputTokens": 2, - "latencyMs": 2008 + "inputTokens": 6780, + "outputTokens": 135, + "latencyMs": 2879.9619169999787 }, { "questionId": "q62", @@ -6773,18 +6773,18 @@ "correct": true, "inputTokens": 8413, "outputTokens": 4, - "latencyMs": 1214 + "latencyMs": 1258.860249999998 }, { "questionId": "q62", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "processing", "actual": "processing", "correct": true, - "inputTokens": 9158, - "outputTokens": 2, - "latencyMs": 1321 + "inputTokens": 9157, + "outputTokens": 263, + "latencyMs": 7819.738958000002 }, { "questionId": "q62", @@ -6795,18 +6795,18 @@ "correct": true, "inputTokens": 9288, "outputTokens": 4, - "latencyMs": 1311 + "latencyMs": 1495.973915999988 }, { "questionId": "q62", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "processing", "actual": "processing", "correct": true, - "inputTokens": 7373, - "outputTokens": 2, - "latencyMs": 1769 + "inputTokens": 7372, + "outputTokens": 135, + "latencyMs": 3092.4329169999983 }, { "questionId": "q62", @@ -6817,18 +6817,18 @@ "correct": true, "inputTokens": 8384, "outputTokens": 4, - "latencyMs": 1157 + "latencyMs": 1268.1641250000102 }, { "questionId": "q63", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "528.71", "actual": "528.71", "correct": true, - "inputTokens": 9740, - "outputTokens": 4, - "latencyMs": 1213 + "inputTokens": 9739, + "outputTokens": 265, + "latencyMs": 4409.96212500002 }, { "questionId": "q63", @@ -6839,18 +6839,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 7, - "latencyMs": 1332 + "latencyMs": 1422.6079999999783 }, { "questionId": "q63", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "528.71", "actual": "528.71", "correct": true, - "inputTokens": 6014, - "outputTokens": 4, - "latencyMs": 3749 + "inputTokens": 6013, + "outputTokens": 329, + "latencyMs": 3593.100334000017 }, { "questionId": "q63", @@ -6861,18 +6861,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 7, - "latencyMs": 1326 + "latencyMs": 1474.3911249999946 }, { "questionId": "q63", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "528.71", "actual": "528.71", "correct": true, - "inputTokens": 6782, - "outputTokens": 4, - "latencyMs": 947 + "inputTokens": 6781, + "outputTokens": 265, + "latencyMs": 5419.795374999987 }, { "questionId": "q63", @@ -6883,18 +6883,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 7, - "latencyMs": 1251 + "latencyMs": 1059.3489999999874 }, { "questionId": "q63", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "528.71", "actual": "528.71", "correct": true, - "inputTokens": 9159, - "outputTokens": 4, - "latencyMs": 1428 + "inputTokens": 9158, + "outputTokens": 265, + "latencyMs": 4783.504167000006 }, { "questionId": "q63", @@ -6905,18 +6905,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 7, - "latencyMs": 1659 + "latencyMs": 1340.6675410000025 }, { "questionId": "q63", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "528.71", "actual": "528.71", "correct": true, - "inputTokens": 7374, - "outputTokens": 4, - "latencyMs": 5584 + "inputTokens": 7373, + "outputTokens": 329, + "latencyMs": 4222.140958000004 }, { "questionId": "q63", @@ -6927,18 +6927,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 7, - "latencyMs": 1251 + "latencyMs": 1169.892125000013 }, { "questionId": "q64", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "pending", "actual": "pending", "correct": true, - "inputTokens": 9739, - "outputTokens": 2, - "latencyMs": 2425 + "inputTokens": 9738, + "outputTokens": 135, + "latencyMs": 2854.8382500000007 }, { "questionId": "q64", @@ -6949,18 +6949,18 @@ "correct": true, "inputTokens": 11906, "outputTokens": 4, - "latencyMs": 1481 + "latencyMs": 1077.335374999995 }, { "questionId": "q64", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "pending", "actual": "pending", "correct": true, - "inputTokens": 6013, - "outputTokens": 2, - "latencyMs": 1109 + "inputTokens": 6012, + "outputTokens": 135, + "latencyMs": 2525.2092499999853 }, { "questionId": "q64", @@ -6971,18 +6971,18 @@ "correct": true, "inputTokens": 6992, "outputTokens": 4, - "latencyMs": 1048 + "latencyMs": 2100.2050000000163 }, { "questionId": "q64", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "pending", "actual": "pending", "correct": true, - "inputTokens": 6781, - "outputTokens": 2, - "latencyMs": 1256 + "inputTokens": 6780, + "outputTokens": 263, + "latencyMs": 5882.592499999999 }, { "questionId": "q64", @@ -6993,18 +6993,18 @@ "correct": true, "inputTokens": 8413, "outputTokens": 4, - "latencyMs": 1117 + "latencyMs": 1168.5295410000253 }, { "questionId": "q64", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "pending", "actual": "pending", "correct": true, - "inputTokens": 9158, - "outputTokens": 2, - "latencyMs": 1168 + "inputTokens": 9157, + "outputTokens": 263, + "latencyMs": 3944.433083000011 }, { "questionId": "q64", @@ -7015,18 +7015,18 @@ "correct": true, "inputTokens": 9288, "outputTokens": 4, - "latencyMs": 1504 + "latencyMs": 1882.1263749999925 }, { "questionId": "q64", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "pending", "actual": "pending", "correct": true, - "inputTokens": 7373, - "outputTokens": 2, - "latencyMs": 1134 + "inputTokens": 7372, + "outputTokens": 135, + "latencyMs": 1657.7255829999922 }, { "questionId": "q64", @@ -7037,18 +7037,18 @@ "correct": true, "inputTokens": 8384, "outputTokens": 4, - "latencyMs": 1059 + "latencyMs": 1056.5719169999938 }, { "questionId": "q65", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1687.82", "actual": "1687.82", "correct": true, - "inputTokens": 9740, - "outputTokens": 5, - "latencyMs": 2361 + "inputTokens": 9739, + "outputTokens": 266, + "latencyMs": 5764.2531250000175 }, { "questionId": "q65", @@ -7059,18 +7059,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 8, - "latencyMs": 1158 + "latencyMs": 1241.8239590000012 }, { "questionId": "q65", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1687.82", "actual": "1687.82", "correct": true, - "inputTokens": 6014, - "outputTokens": 5, - "latencyMs": 1493 + "inputTokens": 6013, + "outputTokens": 266, + "latencyMs": 3203.148416000011 }, { "questionId": "q65", @@ -7081,18 +7081,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 8, - "latencyMs": 1068 + "latencyMs": 1395.2265419999894 }, { "questionId": "q65", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1687.82", "actual": "1687.82", "correct": true, - "inputTokens": 6782, - "outputTokens": 5, - "latencyMs": 1490 + "inputTokens": 6781, + "outputTokens": 330, + "latencyMs": 3854.1738750000077 }, { "questionId": "q65", @@ -7103,18 +7103,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 8, - "latencyMs": 1386 + "latencyMs": 1868.680457999988 }, { "questionId": "q65", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1687.82", "actual": "1687.82", "correct": true, - "inputTokens": 9159, - "outputTokens": 5, - "latencyMs": 1470 + "inputTokens": 9158, + "outputTokens": 330, + "latencyMs": 4486.571708000003 }, { "questionId": "q65", @@ -7125,18 +7125,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 8, - "latencyMs": 1189 + "latencyMs": 1336.9320829999924 }, { "questionId": "q65", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1687.82", "actual": "1687.82", "correct": true, - "inputTokens": 7374, - "outputTokens": 5, - "latencyMs": 2824 + "inputTokens": 7373, + "outputTokens": 266, + "latencyMs": 3571.6664579999924 }, { "questionId": "q65", @@ -7147,18 +7147,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 8, - "latencyMs": 1565 + "latencyMs": 1179.5032920000085 }, { "questionId": "q66", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", "correct": true, - "inputTokens": 9739, - "outputTokens": 3, - "latencyMs": 1480 + "inputTokens": 9738, + "outputTokens": 200, + "latencyMs": 3395.709499999997 }, { "questionId": "q66", @@ -7169,18 +7169,18 @@ "correct": true, "inputTokens": 11906, "outputTokens": 4, - "latencyMs": 1354 + "latencyMs": 1374.4573329999985 }, { "questionId": "q66", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", "correct": true, - "inputTokens": 6013, - "outputTokens": 3, - "latencyMs": 5334 + "inputTokens": 6012, + "outputTokens": 200, + "latencyMs": 3162.779542000004 }, { "questionId": "q66", @@ -7191,18 +7191,18 @@ "correct": true, "inputTokens": 6992, "outputTokens": 4, - "latencyMs": 1158 + "latencyMs": 1010.6076670000039 }, { "questionId": "q66", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", "correct": true, - "inputTokens": 6781, - "outputTokens": 3, - "latencyMs": 2043 + "inputTokens": 6780, + "outputTokens": 328, + "latencyMs": 3606.7964999999967 }, { "questionId": "q66", @@ -7213,18 +7213,18 @@ "correct": true, "inputTokens": 8413, "outputTokens": 4, - "latencyMs": 1302 + "latencyMs": 1432.5227920000034 }, { "questionId": "q66", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", "correct": true, - "inputTokens": 9158, - "outputTokens": 3, - "latencyMs": 1006 + "inputTokens": 9157, + "outputTokens": 328, + "latencyMs": 2916.351958000014 }, { "questionId": "q66", @@ -7235,18 +7235,18 @@ "correct": true, "inputTokens": 9288, "outputTokens": 4, - "latencyMs": 1106 + "latencyMs": 1207.7237920000043 }, { "questionId": "q66", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", "correct": true, - "inputTokens": 7373, - "outputTokens": 3, - "latencyMs": 1801 + "inputTokens": 7372, + "outputTokens": 136, + "latencyMs": 2741.256458000018 }, { "questionId": "q66", @@ -7257,18 +7257,18 @@ "correct": true, "inputTokens": 8384, "outputTokens": 4, - "latencyMs": 1626 + "latencyMs": 1385.7817920000234 }, { "questionId": "q67", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "423.6", "actual": "423.6", "correct": true, - "inputTokens": 9740, - "outputTokens": 4, - "latencyMs": 2107 + "inputTokens": 9739, + "outputTokens": 201, + "latencyMs": 4731.81024999998 }, { "questionId": "q67", @@ -7279,18 +7279,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 7, - "latencyMs": 1183 + "latencyMs": 1572.4971659999865 }, { "questionId": "q67", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "423.6", "actual": "423.6", "correct": true, - "inputTokens": 6014, - "outputTokens": 4, - "latencyMs": 7091 + "inputTokens": 6013, + "outputTokens": 137, + "latencyMs": 2684.556333000015 }, { "questionId": "q67", @@ -7301,18 +7301,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 7, - "latencyMs": 1730 + "latencyMs": 1314.9989999999816 }, { "questionId": "q67", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "423.6", "actual": "423.6", "correct": true, - "inputTokens": 6782, - "outputTokens": 4, - "latencyMs": 1222 + "inputTokens": 6781, + "outputTokens": 137, + "latencyMs": 2746.457541999989 }, { "questionId": "q67", @@ -7323,18 +7323,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 7, - "latencyMs": 1447 + "latencyMs": 1254.8903329999885 }, { "questionId": "q67", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "423.6", "actual": "423.6", "correct": true, - "inputTokens": 9159, - "outputTokens": 4, - "latencyMs": 10295 + "inputTokens": 9158, + "outputTokens": 137, + "latencyMs": 4298.293416 }, { "questionId": "q67", @@ -7345,18 +7345,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 7, - "latencyMs": 1228 + "latencyMs": 1346.4980839999916 }, { "questionId": "q67", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "423.6", "actual": "423.6", "correct": true, - "inputTokens": 7374, - "outputTokens": 4, - "latencyMs": 1748 + "inputTokens": 7373, + "outputTokens": 265, + "latencyMs": 3634.2565419999883 }, { "questionId": "q67", @@ -7367,18 +7367,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 7, - "latencyMs": 1373 + "latencyMs": 1363.8280410000007 }, { "questionId": "q68", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", "correct": true, - "inputTokens": 9739, - "outputTokens": 3, - "latencyMs": 3836 + "inputTokens": 9738, + "outputTokens": 392, + "latencyMs": 3933.217000000004 }, { "questionId": "q68", @@ -7389,18 +7389,18 @@ "correct": true, "inputTokens": 11906, "outputTokens": 4, - "latencyMs": 1297 + "latencyMs": 1229.9339579999796 }, { "questionId": "q68", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", "correct": true, - "inputTokens": 6013, - "outputTokens": 3, - "latencyMs": 1927 + "inputTokens": 6012, + "outputTokens": 136, + "latencyMs": 2728.4598340000084 }, { "questionId": "q68", @@ -7411,18 +7411,18 @@ "correct": true, "inputTokens": 6992, "outputTokens": 4, - "latencyMs": 1171 + "latencyMs": 1427.2494170000136 }, { "questionId": "q68", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", "correct": true, - "inputTokens": 6781, - "outputTokens": 3, - "latencyMs": 1551 + "inputTokens": 6780, + "outputTokens": 200, + "latencyMs": 3187.385666999995 }, { "questionId": "q68", @@ -7433,18 +7433,18 @@ "correct": true, "inputTokens": 8413, "outputTokens": 4, - "latencyMs": 1273 + "latencyMs": 1482.2487079999992 }, { "questionId": "q68", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", "correct": true, - "inputTokens": 9158, - "outputTokens": 3, - "latencyMs": 1387 + "inputTokens": 9157, + "outputTokens": 264, + "latencyMs": 3429.744458000001 }, { "questionId": "q68", @@ -7455,18 +7455,18 @@ "correct": true, "inputTokens": 9288, "outputTokens": 4, - "latencyMs": 1237 + "latencyMs": 1100.8814589999965 }, { "questionId": "q68", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", "correct": true, - "inputTokens": 7373, - "outputTokens": 3, - "latencyMs": 1934 + "inputTokens": 7372, + "outputTokens": 72, + "latencyMs": 1993.443707999977 }, { "questionId": "q68", @@ -7477,18 +7477,18 @@ "correct": true, "inputTokens": 8384, "outputTokens": 4, - "latencyMs": 1132 + "latencyMs": 1105.5260419999831 }, { "questionId": "q69", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "784.03", "actual": "784.03", "correct": true, - "inputTokens": 9740, - "outputTokens": 4, - "latencyMs": 2267 + "inputTokens": 9739, + "outputTokens": 137, + "latencyMs": 3255.3775840000017 }, { "questionId": "q69", @@ -7499,18 +7499,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 7, - "latencyMs": 1772 + "latencyMs": 1274.000417000003 }, { "questionId": "q69", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "784.03", "actual": "784.03", "correct": true, - "inputTokens": 6014, - "outputTokens": 4, - "latencyMs": 1315 + "inputTokens": 6013, + "outputTokens": 265, + "latencyMs": 3098.326624999987 }, { "questionId": "q69", @@ -7521,18 +7521,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 7, - "latencyMs": 1165 + "latencyMs": 1057.8637079999899 }, { "questionId": "q69", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "784.03", "actual": "784.03", "correct": true, - "inputTokens": 6782, - "outputTokens": 4, - "latencyMs": 1097 + "inputTokens": 6781, + "outputTokens": 201, + "latencyMs": 3651.3826249999984 }, { "questionId": "q69", @@ -7543,18 +7543,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 7, - "latencyMs": 1299 + "latencyMs": 1404.9795829999784 }, { "questionId": "q69", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "784.03", "actual": "784.03", "correct": true, - "inputTokens": 9159, - "outputTokens": 4, - "latencyMs": 1779 + "inputTokens": 9158, + "outputTokens": 201, + "latencyMs": 4157.148833000014 }, { "questionId": "q69", @@ -7565,18 +7565,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 7, - "latencyMs": 3153 + "latencyMs": 1607.9431249999907 }, { "questionId": "q69", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "784.03", "actual": "784.03", "correct": true, - "inputTokens": 7374, - "outputTokens": 4, - "latencyMs": 1813 + "inputTokens": 7373, + "outputTokens": 329, + "latencyMs": 4582.246665999992 }, { "questionId": "q69", @@ -7587,18 +7587,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 7, - "latencyMs": 1867 + "latencyMs": 1458.8513329999987 }, { "questionId": "q70", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", "correct": true, - "inputTokens": 9739, - "outputTokens": 3, - "latencyMs": 1611 + "inputTokens": 9738, + "outputTokens": 200, + "latencyMs": 3341.994207999989 }, { "questionId": "q70", @@ -7609,18 +7609,18 @@ "correct": true, "inputTokens": 11906, "outputTokens": 4, - "latencyMs": 1173 + "latencyMs": 1144.3136670000094 }, { "questionId": "q70", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", "correct": true, - "inputTokens": 6013, - "outputTokens": 3, - "latencyMs": 1977 + "inputTokens": 6012, + "outputTokens": 392, + "latencyMs": 6067.672458999994 }, { "questionId": "q70", @@ -7631,18 +7631,18 @@ "correct": true, "inputTokens": 6992, "outputTokens": 4, - "latencyMs": 1108 + "latencyMs": 1325.0467500000086 }, { "questionId": "q70", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", "correct": true, - "inputTokens": 6781, - "outputTokens": 3, - "latencyMs": 1324 + "inputTokens": 6780, + "outputTokens": 200, + "latencyMs": 2847.485000000015 }, { "questionId": "q70", @@ -7653,18 +7653,18 @@ "correct": true, "inputTokens": 8413, "outputTokens": 4, - "latencyMs": 1225 + "latencyMs": 1212.1944169999915 }, { "questionId": "q70", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", "correct": true, - "inputTokens": 9158, - "outputTokens": 3, - "latencyMs": 1416 + "inputTokens": 9157, + "outputTokens": 456, + "latencyMs": 5099.853499999997 }, { "questionId": "q70", @@ -7675,18 +7675,18 @@ "correct": true, "inputTokens": 9288, "outputTokens": 4, - "latencyMs": 1200 + "latencyMs": 1284.708416999987 }, { "questionId": "q70", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", "correct": true, - "inputTokens": 7373, - "outputTokens": 3, - "latencyMs": 1259 + "inputTokens": 7372, + "outputTokens": 200, + "latencyMs": 2745.7869170000195 }, { "questionId": "q70", @@ -7697,18 +7697,18 @@ "correct": true, "inputTokens": 8384, "outputTokens": 4, - "latencyMs": 1433 + "latencyMs": 1114.6338329999999 }, { "questionId": "q71", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "645.88", "actual": "645.88", "correct": true, - "inputTokens": 9740, - "outputTokens": 4, - "latencyMs": 1729 + "inputTokens": 9739, + "outputTokens": 265, + "latencyMs": 3482.8154170000053 }, { "questionId": "q71", @@ -7719,18 +7719,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 7, - "latencyMs": 1143 + "latencyMs": 1156.5491669999901 }, { "questionId": "q71", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "645.88", "actual": "645.88", "correct": true, - "inputTokens": 6014, - "outputTokens": 4, - "latencyMs": 1837 + "inputTokens": 6013, + "outputTokens": 201, + "latencyMs": 2970.104541000008 }, { "questionId": "q71", @@ -7741,18 +7741,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 7, - "latencyMs": 1147 + "latencyMs": 1297.768374999985 }, { "questionId": "q71", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "645.88", "actual": "645.88", "correct": true, - "inputTokens": 6782, - "outputTokens": 4, - "latencyMs": 1777 + "inputTokens": 6781, + "outputTokens": 201, + "latencyMs": 3475.6895419999782 }, { "questionId": "q71", @@ -7763,18 +7763,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 7, - "latencyMs": 1295 + "latencyMs": 1469.7436250000028 }, { "questionId": "q71", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "645.88", "actual": "645.88", "correct": true, - "inputTokens": 9159, - "outputTokens": 4, - "latencyMs": 1081 + "inputTokens": 9158, + "outputTokens": 265, + "latencyMs": 4107.424582999985 }, { "questionId": "q71", @@ -7785,18 +7785,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 7, - "latencyMs": 1692 + "latencyMs": 1070.4507500000182 }, { "questionId": "q71", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "645.88", "actual": "645.88", "correct": true, - "inputTokens": 7374, - "outputTokens": 4, - "latencyMs": 1661 + "inputTokens": 7373, + "outputTokens": 265, + "latencyMs": 3768.3023749999993 }, { "questionId": "q71", @@ -7807,18 +7807,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 7, - "latencyMs": 1475 + "latencyMs": 1111.744915999996 }, { "questionId": "q72", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "processing", "actual": "processing", "correct": true, - "inputTokens": 9739, - "outputTokens": 2, - "latencyMs": 2979 + "inputTokens": 9738, + "outputTokens": 263, + "latencyMs": 3199.3634999999776 }, { "questionId": "q72", @@ -7829,18 +7829,18 @@ "correct": true, "inputTokens": 11906, "outputTokens": 4, - "latencyMs": 1187 + "latencyMs": 1232.4811659999832 }, { "questionId": "q72", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "processing", "actual": "processing", "correct": true, - "inputTokens": 6013, - "outputTokens": 2, - "latencyMs": 1620 + "inputTokens": 6012, + "outputTokens": 263, + "latencyMs": 5616.989999999991 }, { "questionId": "q72", @@ -7851,18 +7851,18 @@ "correct": true, "inputTokens": 6992, "outputTokens": 4, - "latencyMs": 1532 + "latencyMs": 1697.3162920000032 }, { "questionId": "q72", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "processing", "actual": "processing", "correct": true, - "inputTokens": 6781, - "outputTokens": 2, - "latencyMs": 1616 + "inputTokens": 6780, + "outputTokens": 199, + "latencyMs": 2781.3399999999965 }, { "questionId": "q72", @@ -7873,18 +7873,18 @@ "correct": true, "inputTokens": 8413, "outputTokens": 4, - "latencyMs": 1435 + "latencyMs": 1162.0402089999989 }, { "questionId": "q72", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "processing", "actual": "processing", "correct": true, - "inputTokens": 9158, - "outputTokens": 2, - "latencyMs": 1190 + "inputTokens": 9157, + "outputTokens": 199, + "latencyMs": 3651.1349579999805 }, { "questionId": "q72", @@ -7895,18 +7895,18 @@ "correct": true, "inputTokens": 9288, "outputTokens": 4, - "latencyMs": 1414 + "latencyMs": 1132.3132920000062 }, { "questionId": "q72", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "processing", "actual": "processing", "correct": true, - "inputTokens": 7373, - "outputTokens": 2, - "latencyMs": 2335 + "inputTokens": 7372, + "outputTokens": 135, + "latencyMs": 3017.5073749999865 }, { "questionId": "q72", @@ -7917,18 +7917,18 @@ "correct": true, "inputTokens": 8384, "outputTokens": 4, - "latencyMs": 1308 + "latencyMs": 1294.688374999998 }, { "questionId": "q73", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "371.91", "actual": "371.91", "correct": true, - "inputTokens": 9740, - "outputTokens": 4, - "latencyMs": 3359 + "inputTokens": 9739, + "outputTokens": 201, + "latencyMs": 3591.221499999985 }, { "questionId": "q73", @@ -7939,18 +7939,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 7, - "latencyMs": 1227 + "latencyMs": 1329.419332999998 }, { "questionId": "q73", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "371.91", "actual": "371.91", "correct": true, - "inputTokens": 6014, - "outputTokens": 4, - "latencyMs": 1439 + "inputTokens": 6013, + "outputTokens": 137, + "latencyMs": 2655.557792000007 }, { "questionId": "q73", @@ -7961,18 +7961,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 7, - "latencyMs": 1179 + "latencyMs": 1446.9020000000019 }, { "questionId": "q73", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "371.91", "actual": "371.91", "correct": true, - "inputTokens": 6782, - "outputTokens": 4, - "latencyMs": 1064 + "inputTokens": 6781, + "outputTokens": 201, + "latencyMs": 3450.5822500000068 }, { "questionId": "q73", @@ -7983,18 +7983,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 7, - "latencyMs": 1144 + "latencyMs": 1291.2180410000146 }, { "questionId": "q73", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "371.91", "actual": "371.91", "correct": true, - "inputTokens": 9159, - "outputTokens": 4, - "latencyMs": 1873 + "inputTokens": 9158, + "outputTokens": 201, + "latencyMs": 2803.9767500000016 }, { "questionId": "q73", @@ -8005,18 +8005,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 7, - "latencyMs": 1302 + "latencyMs": 1098.5968749999884 }, { "questionId": "q73", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "371.91", "actual": "371.91", "correct": true, - "inputTokens": 7374, - "outputTokens": 4, - "latencyMs": 1956 + "inputTokens": 7373, + "outputTokens": 201, + "latencyMs": 3047.8699999999953 }, { "questionId": "q73", @@ -8027,18 +8027,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 7, - "latencyMs": 1281 + "latencyMs": 1800.6882080000069 }, { "questionId": "q74", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "pending", "actual": "pending", "correct": true, - "inputTokens": 9739, - "outputTokens": 2, - "latencyMs": 1591 + "inputTokens": 9738, + "outputTokens": 199, + "latencyMs": 2957.2203330000048 }, { "questionId": "q74", @@ -8049,18 +8049,18 @@ "correct": true, "inputTokens": 11906, "outputTokens": 4, - "latencyMs": 1279 + "latencyMs": 1165.7748750000028 }, { "questionId": "q74", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "pending", "actual": "pending", "correct": true, - "inputTokens": 6013, - "outputTokens": 2, - "latencyMs": 3152 + "inputTokens": 6012, + "outputTokens": 135, + "latencyMs": 2362.283208000008 }, { "questionId": "q74", @@ -8071,18 +8071,18 @@ "correct": true, "inputTokens": 6992, "outputTokens": 4, - "latencyMs": 1061 + "latencyMs": 1871.7275829999999 }, { "questionId": "q74", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "pending", "actual": "pending", "correct": true, - "inputTokens": 6781, - "outputTokens": 2, - "latencyMs": 1557 + "inputTokens": 6780, + "outputTokens": 263, + "latencyMs": 4747.243208 }, { "questionId": "q74", @@ -8093,18 +8093,18 @@ "correct": true, "inputTokens": 8413, "outputTokens": 4, - "latencyMs": 1313 + "latencyMs": 1275.342082999996 }, { "questionId": "q74", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "pending", "actual": "pending", "correct": true, - "inputTokens": 9158, - "outputTokens": 2, - "latencyMs": 1433 + "inputTokens": 9157, + "outputTokens": 199, + "latencyMs": 3180.0179160000116 }, { "questionId": "q74", @@ -8115,18 +8115,18 @@ "correct": true, "inputTokens": 9288, "outputTokens": 4, - "latencyMs": 1812 + "latencyMs": 2343.5514580000017 }, { "questionId": "q74", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "pending", "actual": "pending", "correct": true, - "inputTokens": 7373, - "outputTokens": 2, - "latencyMs": 1024 + "inputTokens": 7372, + "outputTokens": 135, + "latencyMs": 2362.525915999984 }, { "questionId": "q74", @@ -8137,18 +8137,18 @@ "correct": true, "inputTokens": 8384, "outputTokens": 4, - "latencyMs": 1243 + "latencyMs": 1231.4291669999948 }, { "questionId": "q75", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1066", "actual": "1066", "correct": true, - "inputTokens": 9740, - "outputTokens": 3, - "latencyMs": 1500 + "inputTokens": 9739, + "outputTokens": 200, + "latencyMs": 3091.9045840000035 }, { "questionId": "q75", @@ -8159,18 +8159,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 6, - "latencyMs": 1275 + "latencyMs": 1111.9695000000065 }, { "questionId": "q75", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1066", "actual": "1066", "correct": true, - "inputTokens": 6014, - "outputTokens": 3, - "latencyMs": 1841 + "inputTokens": 6013, + "outputTokens": 264, + "latencyMs": 3977.5146669999813 }, { "questionId": "q75", @@ -8181,18 +8181,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 6, - "latencyMs": 1080 + "latencyMs": 1195.262208 }, { "questionId": "q75", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1066", "actual": "1066", "correct": true, - "inputTokens": 6782, - "outputTokens": 3, - "latencyMs": 1209 + "inputTokens": 6781, + "outputTokens": 328, + "latencyMs": 3839.0627499999828 }, { "questionId": "q75", @@ -8203,18 +8203,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 6, - "latencyMs": 1308 + "latencyMs": 2186.8021250000165 }, { "questionId": "q75", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1066", "actual": "1066", "correct": true, - "inputTokens": 9159, - "outputTokens": 3, - "latencyMs": 1556 + "inputTokens": 9158, + "outputTokens": 328, + "latencyMs": 6945.004667000001 }, { "questionId": "q75", @@ -8225,18 +8225,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 6, - "latencyMs": 1240 + "latencyMs": 1103.6762919999892 }, { "questionId": "q75", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1066", "actual": "1066", "correct": true, - "inputTokens": 7374, - "outputTokens": 3, - "latencyMs": 1254 + "inputTokens": 7373, + "outputTokens": 264, + "latencyMs": 3924.5181250000023 }, { "questionId": "q75", @@ -8247,18 +8247,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 6, - "latencyMs": 1305 + "latencyMs": 1023.334583000018 }, { "questionId": "q76", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", "correct": true, - "inputTokens": 9739, - "outputTokens": 3, - "latencyMs": 2606 + "inputTokens": 9738, + "outputTokens": 264, + "latencyMs": 4017.931666999997 }, { "questionId": "q76", @@ -8269,18 +8269,18 @@ "correct": true, "inputTokens": 11906, "outputTokens": 4, - "latencyMs": 1422 + "latencyMs": 1278.6839580000087 }, { "questionId": "q76", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", "correct": true, - "inputTokens": 6013, - "outputTokens": 3, - "latencyMs": 2688 + "inputTokens": 6012, + "outputTokens": 200, + "latencyMs": 2566.9374580000003 }, { "questionId": "q76", @@ -8291,18 +8291,18 @@ "correct": true, "inputTokens": 6992, "outputTokens": 4, - "latencyMs": 1041 + "latencyMs": 958.4104159999988 }, { "questionId": "q76", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", "correct": true, - "inputTokens": 6781, - "outputTokens": 3, - "latencyMs": 3070 + "inputTokens": 6780, + "outputTokens": 264, + "latencyMs": 3640.0960409999825 }, { "questionId": "q76", @@ -8313,18 +8313,18 @@ "correct": true, "inputTokens": 8413, "outputTokens": 4, - "latencyMs": 1167 + "latencyMs": 1534.7306249999965 }, { "questionId": "q76", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", "correct": true, - "inputTokens": 9158, - "outputTokens": 3, - "latencyMs": 1702 + "inputTokens": 9157, + "outputTokens": 328, + "latencyMs": 3905.6711249999935 }, { "questionId": "q76", @@ -8335,18 +8335,18 @@ "correct": true, "inputTokens": 9288, "outputTokens": 4, - "latencyMs": 1182 + "latencyMs": 2067.435375000001 }, { "questionId": "q76", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", "correct": true, - "inputTokens": 7373, - "outputTokens": 3, - "latencyMs": 1740 + "inputTokens": 7372, + "outputTokens": 264, + "latencyMs": 3613.7146249999932 }, { "questionId": "q76", @@ -8357,18 +8357,18 @@ "correct": true, "inputTokens": 8384, "outputTokens": 4, - "latencyMs": 1404 + "latencyMs": 1154.955958000006 }, { "questionId": "q77", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1697.4", "actual": "1697.4", "correct": true, - "inputTokens": 9740, - "outputTokens": 5, - "latencyMs": 1596 + "inputTokens": 9739, + "outputTokens": 330, + "latencyMs": 3904.2146250000224 }, { "questionId": "q77", @@ -8379,18 +8379,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 8, - "latencyMs": 2314 + "latencyMs": 1618.7487079999992 }, { "questionId": "q77", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1697.4", "actual": "1697.4", "correct": true, - "inputTokens": 6014, - "outputTokens": 5, - "latencyMs": 1114 + "inputTokens": 6013, + "outputTokens": 202, + "latencyMs": 2906.194541999983 }, { "questionId": "q77", @@ -8401,18 +8401,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 8, - "latencyMs": 1289 + "latencyMs": 1481.559333000012 }, { "questionId": "q77", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1697.4", "actual": "1697.4", "correct": true, - "inputTokens": 6782, - "outputTokens": 5, - "latencyMs": 2428 + "inputTokens": 6781, + "outputTokens": 266, + "latencyMs": 3879.7539999999863 }, { "questionId": "q77", @@ -8423,18 +8423,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 8, - "latencyMs": 1325 + "latencyMs": 1809.5822499999776 }, { "questionId": "q77", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1697.4", "actual": "1697.4", "correct": true, - "inputTokens": 9159, - "outputTokens": 5, - "latencyMs": 1343 + "inputTokens": 9158, + "outputTokens": 202, + "latencyMs": 3147.330500000011 }, { "questionId": "q77", @@ -8445,18 +8445,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 8, - "latencyMs": 1783 + "latencyMs": 1297.2377080000006 }, { "questionId": "q77", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1697.4", "actual": "1697.4", "correct": true, - "inputTokens": 7374, - "outputTokens": 5, - "latencyMs": 918 + "inputTokens": 7373, + "outputTokens": 394, + "latencyMs": 3710.157500000001 }, { "questionId": "q77", @@ -8467,18 +8467,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 8, - "latencyMs": 1308 + "latencyMs": 1238.5442500000063 }, { "questionId": "q78", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", "correct": true, - "inputTokens": 9739, - "outputTokens": 3, - "latencyMs": 1396 + "inputTokens": 9738, + "outputTokens": 392, + "latencyMs": 4101.743083999987 }, { "questionId": "q78", @@ -8489,18 +8489,18 @@ "correct": true, "inputTokens": 11906, "outputTokens": 4, - "latencyMs": 1225 + "latencyMs": 1170.750417000003 }, { "questionId": "q78", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", "correct": true, - "inputTokens": 6013, - "outputTokens": 3, - "latencyMs": 2294 + "inputTokens": 6012, + "outputTokens": 264, + "latencyMs": 8324.009665999998 }, { "questionId": "q78", @@ -8511,18 +8511,18 @@ "correct": true, "inputTokens": 6992, "outputTokens": 4, - "latencyMs": 1418 + "latencyMs": 1173.343790999992 }, { "questionId": "q78", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", "correct": true, - "inputTokens": 6781, - "outputTokens": 3, - "latencyMs": 1613 + "inputTokens": 6780, + "outputTokens": 264, + "latencyMs": 3005.4394999999786 }, { "questionId": "q78", @@ -8533,18 +8533,18 @@ "correct": true, "inputTokens": 8413, "outputTokens": 4, - "latencyMs": 1374 + "latencyMs": 1376.5506659999955 }, { "questionId": "q78", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", "correct": true, - "inputTokens": 9158, - "outputTokens": 3, - "latencyMs": 1341 + "inputTokens": 9157, + "outputTokens": 136, + "latencyMs": 3209.5317499999946 }, { "questionId": "q78", @@ -8555,18 +8555,18 @@ "correct": true, "inputTokens": 9288, "outputTokens": 4, - "latencyMs": 1223 + "latencyMs": 1299.4064170000202 }, { "questionId": "q78", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", "correct": true, - "inputTokens": 7373, - "outputTokens": 3, - "latencyMs": 2230 + "inputTokens": 7372, + "outputTokens": 264, + "latencyMs": 3753.726042000024 }, { "questionId": "q78", @@ -8577,18 +8577,18 @@ "correct": true, "inputTokens": 8384, "outputTokens": 4, - "latencyMs": 1425 + "latencyMs": 1134.558416999993 }, { "questionId": "q79", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Valerie Braun", "actual": "Valerie Braun", "correct": true, - "inputTokens": 9740, - "outputTokens": 4, - "latencyMs": 1377 + "inputTokens": 9739, + "outputTokens": 73, + "latencyMs": 2494.451874999999 }, { "questionId": "q79", @@ -8599,18 +8599,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 9, - "latencyMs": 1550 + "latencyMs": 1270.5290410000016 }, { "questionId": "q79", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Valerie Braun", "actual": "Valerie Braun", "correct": true, - "inputTokens": 6014, - "outputTokens": 4, - "latencyMs": 1394 + "inputTokens": 6013, + "outputTokens": 137, + "latencyMs": 2403.4134579999954 }, { "questionId": "q79", @@ -8621,18 +8621,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 9, - "latencyMs": 1202 + "latencyMs": 1673.0169579999929 }, { "questionId": "q79", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Valerie Braun", "actual": "Valerie Braun", "correct": true, - "inputTokens": 6782, - "outputTokens": 4, - "latencyMs": 1435 + "inputTokens": 6781, + "outputTokens": 73, + "latencyMs": 1704.8420409999962 }, { "questionId": "q79", @@ -8643,18 +8643,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 9, - "latencyMs": 1277 + "latencyMs": 1447.5210840000072 }, { "questionId": "q79", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Valerie Braun", "actual": "Valerie Braun", "correct": true, - "inputTokens": 9159, - "outputTokens": 4, - "latencyMs": 1564 + "inputTokens": 9158, + "outputTokens": 73, + "latencyMs": 1638.756207999977 }, { "questionId": "q79", @@ -8665,18 +8665,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 9, - "latencyMs": 1200 + "latencyMs": 1504.7892920000013 }, { "questionId": "q79", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Valerie Braun", "actual": "Valerie Braun", "correct": true, - "inputTokens": 7374, - "outputTokens": 4, - "latencyMs": 1596 + "inputTokens": 7373, + "outputTokens": 137, + "latencyMs": 2409.509625000006 }, { "questionId": "q79", @@ -8687,18 +8687,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 9, - "latencyMs": 1151 + "latencyMs": 1318.699833999999 }, { "questionId": "q80", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Anita Kozey", "actual": "Anita Kozey", "correct": true, - "inputTokens": 9740, - "outputTokens": 5, - "latencyMs": 1458 + "inputTokens": 9739, + "outputTokens": 138, + "latencyMs": 2616.233749999985 }, { "questionId": "q80", @@ -8709,18 +8709,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 9, - "latencyMs": 1283 + "latencyMs": 1314.3836249999877 }, { "questionId": "q80", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Anita Kozey", "actual": "Anita Kozey", "correct": true, - "inputTokens": 6014, - "outputTokens": 5, - "latencyMs": 4702 + "inputTokens": 6013, + "outputTokens": 138, + "latencyMs": 2722.7087499999907 }, { "questionId": "q80", @@ -8731,18 +8731,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 9, - "latencyMs": 1360 + "latencyMs": 1190.632500000007 }, { "questionId": "q80", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Anita Kozey", "actual": "Anita Kozey", "correct": true, - "inputTokens": 6782, - "outputTokens": 5, - "latencyMs": 6167 + "inputTokens": 6781, + "outputTokens": 330, + "latencyMs": 4346.388291999989 }, { "questionId": "q80", @@ -8753,18 +8753,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 9, - "latencyMs": 1449 + "latencyMs": 1327.8158750000002 }, { "questionId": "q80", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Anita Kozey", "actual": "Anita Kozey", "correct": true, - "inputTokens": 9159, - "outputTokens": 5, - "latencyMs": 6096 + "inputTokens": 9158, + "outputTokens": 74, + "latencyMs": 2443.0598340000142 }, { "questionId": "q80", @@ -8775,18 +8775,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 9, - "latencyMs": 1194 + "latencyMs": 1396.4260829999985 }, { "questionId": "q80", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Anita Kozey", "actual": "Anita Kozey", "correct": true, - "inputTokens": 7374, - "outputTokens": 5, - "latencyMs": 7357 + "inputTokens": 7373, + "outputTokens": 266, + "latencyMs": 4886.8007919999945 }, { "questionId": "q80", @@ -8797,18 +8797,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 9, - "latencyMs": 1213 + "latencyMs": 1469.287249999994 }, { "questionId": "q81", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Elmer Kub PhD", "actual": "Elmer Kub PhD", "correct": true, - "inputTokens": 9740, - "outputTokens": 6, - "latencyMs": 2539 + "inputTokens": 9739, + "outputTokens": 139, + "latencyMs": 2891.1199170000036 }, { "questionId": "q81", @@ -8819,18 +8819,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 10, - "latencyMs": 1532 + "latencyMs": 1342.1902079999854 }, { "questionId": "q81", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Elmer Kub PhD", "actual": "Elmer Kub PhD", "correct": true, - "inputTokens": 6014, - "outputTokens": 6, - "latencyMs": 2960 + "inputTokens": 6013, + "outputTokens": 139, + "latencyMs": 2846.046624999988 }, { "questionId": "q81", @@ -8841,18 +8841,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 10, - "latencyMs": 1547 + "latencyMs": 1327.919499999989 }, { "questionId": "q81", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Elmer Kub PhD", "actual": "Elmer Kub PhD", "correct": true, - "inputTokens": 6782, - "outputTokens": 6, - "latencyMs": 1358 + "inputTokens": 6781, + "outputTokens": 139, + "latencyMs": 4302.444041999988 }, { "questionId": "q81", @@ -8863,18 +8863,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 10, - "latencyMs": 1424 + "latencyMs": 1207.6207500000019 }, { "questionId": "q81", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Elmer Kub PhD", "actual": "Elmer Kub PhD", "correct": true, - "inputTokens": 9159, - "outputTokens": 6, - "latencyMs": 958 + "inputTokens": 9158, + "outputTokens": 267, + "latencyMs": 3389.5046659999934 }, { "questionId": "q81", @@ -8885,18 +8885,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 10, - "latencyMs": 1381 + "latencyMs": 1236.2248340000224 }, { "questionId": "q81", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Elmer Kub PhD", "actual": "Elmer Kub PhD", "correct": true, - "inputTokens": 7374, - "outputTokens": 6, - "latencyMs": 1372 + "inputTokens": 7373, + "outputTokens": 139, + "latencyMs": 2138.4831669999985 }, { "questionId": "q81", @@ -8907,18 +8907,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 10, - "latencyMs": 1715 + "latencyMs": 1233.3828330000106 }, { "questionId": "q82", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Maxine Zemlak", "actual": "Maxine Zemlak", "correct": true, - "inputTokens": 9740, - "outputTokens": 5, - "latencyMs": 1972 + "inputTokens": 9739, + "outputTokens": 138, + "latencyMs": 3346.8621669999848 }, { "questionId": "q82", @@ -8929,18 +8929,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 10, - "latencyMs": 1315 + "latencyMs": 1321.650082999986 }, { "questionId": "q82", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Maxine Zemlak", "actual": "Maxine Zemlak", "correct": true, - "inputTokens": 6014, - "outputTokens": 5, - "latencyMs": 1634 + "inputTokens": 6013, + "outputTokens": 138, + "latencyMs": 2395.766499999998 }, { "questionId": "q82", @@ -8951,18 +8951,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 10, - "latencyMs": 1264 + "latencyMs": 1749.51670800001 }, { "questionId": "q82", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Maxine Zemlak", "actual": "Maxine Zemlak", "correct": true, - "inputTokens": 6782, - "outputTokens": 5, - "latencyMs": 1153 + "inputTokens": 6781, + "outputTokens": 330, + "latencyMs": 4207.4487500000105 }, { "questionId": "q82", @@ -8973,18 +8973,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 10, - "latencyMs": 1252 + "latencyMs": 1495.846125000011 }, { "questionId": "q82", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Maxine Zemlak", "actual": "Maxine Zemlak", "correct": true, - "inputTokens": 9159, - "outputTokens": 5, - "latencyMs": 1697 + "inputTokens": 9158, + "outputTokens": 266, + "latencyMs": 4258.881374999997 }, { "questionId": "q82", @@ -8995,18 +8995,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 10, - "latencyMs": 1198 + "latencyMs": 1113.9782499999856 }, { "questionId": "q82", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Maxine Zemlak", "actual": "Maxine Zemlak", "correct": true, - "inputTokens": 7374, - "outputTokens": 5, - "latencyMs": 1854 + "inputTokens": 7373, + "outputTokens": 74, + "latencyMs": 1841.1115829999908 }, { "questionId": "q82", @@ -9017,18 +9017,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 10, - "latencyMs": 1752 + "latencyMs": 1350.6631249999919 }, { "questionId": "q83", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Emanuel Littel", "actual": "Emanuel Littel", "correct": true, - "inputTokens": 9740, - "outputTokens": 5, - "latencyMs": 2076 + "inputTokens": 9739, + "outputTokens": 138, + "latencyMs": 2322.9531669999997 }, { "questionId": "q83", @@ -9039,18 +9039,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 7, - "latencyMs": 1398 + "latencyMs": 1556.4763749999984 }, { "questionId": "q83", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Emanuel Littel", "actual": "Emanuel Littel", "correct": true, - "inputTokens": 6014, - "outputTokens": 5, - "latencyMs": 2263 + "inputTokens": 6013, + "outputTokens": 74, + "latencyMs": 2354.004667000001 }, { "questionId": "q83", @@ -9061,18 +9061,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 7, - "latencyMs": 3101 + "latencyMs": 1314.1952909999818 }, { "questionId": "q83", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Emanuel Littel", "actual": "Emanuel Littel", "correct": true, - "inputTokens": 6782, - "outputTokens": 5, - "latencyMs": 1453 + "inputTokens": 6781, + "outputTokens": 138, + "latencyMs": 3437.8392080000194 }, { "questionId": "q83", @@ -9083,18 +9083,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 7, - "latencyMs": 1265 + "latencyMs": 1131.0356249999895 }, { "questionId": "q83", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Emanuel Littel", "actual": "Emanuel Littel", "correct": true, - "inputTokens": 9159, - "outputTokens": 5, - "latencyMs": 8807 + "inputTokens": 9158, + "outputTokens": 138, + "latencyMs": 3209.646000000008 }, { "questionId": "q83", @@ -9105,18 +9105,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 7, - "latencyMs": 1097 + "latencyMs": 1175.6475829999836 }, { "questionId": "q83", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Emanuel Littel", "actual": "Emanuel Littel", "correct": true, - "inputTokens": 7374, - "outputTokens": 5, - "latencyMs": 1667 + "inputTokens": 7373, + "outputTokens": 266, + "latencyMs": 3785.0792920000094 }, { "questionId": "q83", @@ -9127,18 +9127,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 7, - "latencyMs": 1198 + "latencyMs": 1314.7905420000025 }, { "questionId": "q84", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Andrew Kling", "actual": "Andrew Kling", "correct": true, - "inputTokens": 9740, - "outputTokens": 3, - "latencyMs": 2292 + "inputTokens": 9739, + "outputTokens": 72, + "latencyMs": 2562.896166999999 }, { "questionId": "q84", @@ -9149,18 +9149,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 7, - "latencyMs": 1202 + "latencyMs": 3205.178583000001 }, { "questionId": "q84", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Andrew Kling", "actual": "Andrew Kling", "correct": true, - "inputTokens": 6014, - "outputTokens": 3, - "latencyMs": 1801 + "inputTokens": 6013, + "outputTokens": 136, + "latencyMs": 3746.9874170000257 }, { "questionId": "q84", @@ -9171,18 +9171,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 7, - "latencyMs": 1287 + "latencyMs": 1159.280584000022 }, { "questionId": "q84", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Andrew Kling", - "actual": "Andrew Kling", - "correct": true, - "inputTokens": 6782, - "outputTokens": 3, - "latencyMs": 1340 + "actual": "Marvin Thiel", + "correct": false, + "inputTokens": 6781, + "outputTokens": 202, + "latencyMs": 2584.499542000005 }, { "questionId": "q84", @@ -9193,18 +9193,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 7, - "latencyMs": 1163 + "latencyMs": 1249.9375 }, { "questionId": "q84", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Andrew Kling", "actual": "Andrew Kling", "correct": true, - "inputTokens": 9159, - "outputTokens": 3, - "latencyMs": 2685 + "inputTokens": 9158, + "outputTokens": 136, + "latencyMs": 2068.6956669999927 }, { "questionId": "q84", @@ -9215,18 +9215,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 7, - "latencyMs": 1397 + "latencyMs": 1733.235834000021 }, { "questionId": "q84", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Andrew Kling", "actual": "Andrew Kling", "correct": true, - "inputTokens": 7374, - "outputTokens": 3, - "latencyMs": 1289 + "inputTokens": 7373, + "outputTokens": 200, + "latencyMs": 3831.721124999982 }, { "questionId": "q84", @@ -9237,18 +9237,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 7, - "latencyMs": 1155 + "latencyMs": 1311.1745419999934 }, { "questionId": "q85", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Morris O'Hara", "actual": "Morris O'Hara", "correct": true, - "inputTokens": 9740, - "outputTokens": 6, - "latencyMs": 1601 + "inputTokens": 9739, + "outputTokens": 139, + "latencyMs": 5464.460791999998 }, { "questionId": "q85", @@ -9259,18 +9259,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 9, - "latencyMs": 1340 + "latencyMs": 1266.8881249999977 }, { "questionId": "q85", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Morris O'Hara", "actual": "Morris O'Hara", "correct": true, - "inputTokens": 6014, - "outputTokens": 6, - "latencyMs": 3525 + "inputTokens": 6013, + "outputTokens": 203, + "latencyMs": 2957.0821250000154 }, { "questionId": "q85", @@ -9281,18 +9281,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 9, - "latencyMs": 1710 + "latencyMs": 1264.50791700001 }, { "questionId": "q85", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Morris O'Hara", "actual": "Morris O'Hara", "correct": true, - "inputTokens": 6782, - "outputTokens": 6, - "latencyMs": 2333 + "inputTokens": 6781, + "outputTokens": 331, + "latencyMs": 3740.643666000018 }, { "questionId": "q85", @@ -9303,18 +9303,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 9, - "latencyMs": 1168 + "latencyMs": 1310.5358749999723 }, { "questionId": "q85", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Morris O'Hara", "actual": "Morris O'Hara", "correct": true, - "inputTokens": 9159, - "outputTokens": 6, - "latencyMs": 1781 + "inputTokens": 9158, + "outputTokens": 139, + "latencyMs": 2979.4539579999982 }, { "questionId": "q85", @@ -9325,18 +9325,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 9, - "latencyMs": 1552 + "latencyMs": 2026.8683329999913 }, { "questionId": "q85", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Morris O'Hara", "actual": "Morris O'Hara", "correct": true, - "inputTokens": 7374, - "outputTokens": 6, - "latencyMs": 1584 + "inputTokens": 7373, + "outputTokens": 139, + "latencyMs": 2932.0294159999758 }, { "questionId": "q85", @@ -9347,18 +9347,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 9, - "latencyMs": 1548 + "latencyMs": 1130.2447079999838 }, { "questionId": "q86", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Elijah Franecki", "actual": "Elijah Franecki", "correct": true, - "inputTokens": 9740, - "outputTokens": 6, - "latencyMs": 7230 + "inputTokens": 9739, + "outputTokens": 203, + "latencyMs": 2576.945458000002 }, { "questionId": "q86", @@ -9369,18 +9369,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 9, - "latencyMs": 1933 + "latencyMs": 1214.6620409999741 }, { "questionId": "q86", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Elijah Franecki", "actual": "Elijah Franecki", "correct": true, - "inputTokens": 6014, - "outputTokens": 6, - "latencyMs": 1067 + "inputTokens": 6013, + "outputTokens": 203, + "latencyMs": 3718.371167000005 }, { "questionId": "q86", @@ -9391,18 +9391,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 9, - "latencyMs": 1288 + "latencyMs": 1374.984832999995 }, { "questionId": "q86", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Elijah Franecki", "actual": "Elijah Franecki", "correct": true, - "inputTokens": 6782, - "outputTokens": 6, - "latencyMs": 3954 + "inputTokens": 6781, + "outputTokens": 139, + "latencyMs": 2313.5867499999877 }, { "questionId": "q86", @@ -9413,18 +9413,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 9, - "latencyMs": 1314 + "latencyMs": 1325.0793330000015 }, { "questionId": "q86", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Elijah Franecki", "actual": "Elijah Franecki", "correct": true, - "inputTokens": 9159, - "outputTokens": 6, - "latencyMs": 1334 + "inputTokens": 9158, + "outputTokens": 139, + "latencyMs": 2777.8669999999984 }, { "questionId": "q86", @@ -9435,18 +9435,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 9, - "latencyMs": 2441 + "latencyMs": 1246.2134589999914 }, { "questionId": "q86", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Elijah Franecki", "actual": "Elijah Franecki", "correct": true, - "inputTokens": 7374, - "outputTokens": 6, - "latencyMs": 1650 + "inputTokens": 7373, + "outputTokens": 75, + "latencyMs": 2246.8254580000066 }, { "questionId": "q86", @@ -9457,18 +9457,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 9, - "latencyMs": 1495 + "latencyMs": 1573.5733749999781 }, { "questionId": "q87", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Malcolm Erdman", "actual": "Malcolm Erdman", "correct": true, - "inputTokens": 9740, - "outputTokens": 5, - "latencyMs": 1262 + "inputTokens": 9739, + "outputTokens": 74, + "latencyMs": 2494.7630000000063 }, { "questionId": "q87", @@ -9479,18 +9479,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 7, - "latencyMs": 1367 + "latencyMs": 1135.412083000003 }, { "questionId": "q87", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Malcolm Erdman", "actual": "Malcolm Erdman", "correct": true, - "inputTokens": 6014, - "outputTokens": 5, - "latencyMs": 1385 + "inputTokens": 6013, + "outputTokens": 138, + "latencyMs": 2332.6303330000082 }, { "questionId": "q87", @@ -9501,18 +9501,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 7, - "latencyMs": 1313 + "latencyMs": 1175.6766249999928 }, { "questionId": "q87", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Malcolm Erdman", "actual": "Malcolm Erdman", "correct": true, - "inputTokens": 6782, - "outputTokens": 5, - "latencyMs": 1141 + "inputTokens": 6781, + "outputTokens": 458, + "latencyMs": 4252.623416000017 }, { "questionId": "q87", @@ -9523,18 +9523,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 7, - "latencyMs": 1300 + "latencyMs": 1297.546416999976 }, { "questionId": "q87", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Malcolm Erdman", "actual": "Malcolm Erdman", "correct": true, - "inputTokens": 9159, - "outputTokens": 5, - "latencyMs": 3347 + "inputTokens": 9158, + "outputTokens": 74, + "latencyMs": 2264.2770829999936 }, { "questionId": "q87", @@ -9545,18 +9545,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 7, - "latencyMs": 1457 + "latencyMs": 1055.0764170000039 }, { "questionId": "q87", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Malcolm Erdman", "actual": "Malcolm Erdman", "correct": true, - "inputTokens": 7374, - "outputTokens": 5, - "latencyMs": 1276 + "inputTokens": 7373, + "outputTokens": 138, + "latencyMs": 3193.2753749999974 }, { "questionId": "q87", @@ -9567,18 +9567,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 7, - "latencyMs": 1211 + "latencyMs": 1912.7229999999981 }, { "questionId": "q88", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Fannie Skiles", "actual": "Fannie Skiles", "correct": true, - "inputTokens": 9740, - "outputTokens": 5, - "latencyMs": 1635 + "inputTokens": 9739, + "outputTokens": 138, + "latencyMs": 2147.5894160000025 }, { "questionId": "q88", @@ -9589,18 +9589,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 9, - "latencyMs": 1582 + "latencyMs": 1377.5190409999923 }, { "questionId": "q88", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Fannie Skiles", "actual": "Fannie Skiles", "correct": true, - "inputTokens": 6014, - "outputTokens": 5, - "latencyMs": 1695 + "inputTokens": 6013, + "outputTokens": 202, + "latencyMs": 4472.317459000013 }, { "questionId": "q88", @@ -9611,18 +9611,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 9, - "latencyMs": 1318 + "latencyMs": 1376.0682919999817 }, { "questionId": "q88", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Fannie Skiles", "actual": "Fannie Skiles", "correct": true, - "inputTokens": 6782, - "outputTokens": 5, - "latencyMs": 936 + "inputTokens": 6781, + "outputTokens": 202, + "latencyMs": 6952.122459000006 }, { "questionId": "q88", @@ -9633,18 +9633,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 9, - "latencyMs": 1204 + "latencyMs": 1178.8732909999962 }, { "questionId": "q88", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Fannie Skiles", "actual": "Fannie Skiles", "correct": true, - "inputTokens": 9159, - "outputTokens": 5, - "latencyMs": 996 + "inputTokens": 9158, + "outputTokens": 266, + "latencyMs": 3619.214917000005 }, { "questionId": "q88", @@ -9655,18 +9655,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 9, - "latencyMs": 1261 + "latencyMs": 1212.3732920000039 }, { "questionId": "q88", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Fannie Skiles", "actual": "Fannie Skiles", "correct": true, - "inputTokens": 7374, - "outputTokens": 5, - "latencyMs": 2276 + "inputTokens": 7373, + "outputTokens": 202, + "latencyMs": 5169.327332999994 }, { "questionId": "q88", @@ -9677,18 +9677,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 9, - "latencyMs": 1380 + "latencyMs": 1452.6941670000087 }, { "questionId": "q89", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Sonja Emmerich", "actual": "Sonja Emmerich", "correct": true, - "inputTokens": 9740, - "outputTokens": 6, - "latencyMs": 1451 + "inputTokens": 9739, + "outputTokens": 395, + "latencyMs": 3384.798125000001 }, { "questionId": "q89", @@ -9699,18 +9699,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 10, - "latencyMs": 1977 + "latencyMs": 1241.960665999999 }, { "questionId": "q89", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Sonja Emmerich", "actual": "Sonja Emmerich", "correct": true, - "inputTokens": 6014, - "outputTokens": 6, - "latencyMs": 1376 + "inputTokens": 6013, + "outputTokens": 331, + "latencyMs": 4747.914124999981 }, { "questionId": "q89", @@ -9721,18 +9721,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 10, - "latencyMs": 1250 + "latencyMs": 1302.8907080000208 }, { "questionId": "q89", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Sonja Emmerich", "actual": "Sonja Emmerich", "correct": true, - "inputTokens": 6782, - "outputTokens": 6, - "latencyMs": 1273 + "inputTokens": 6781, + "outputTokens": 331, + "latencyMs": 3532.4660830000066 }, { "questionId": "q89", @@ -9743,18 +9743,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 10, - "latencyMs": 1359 + "latencyMs": 1203.086540999997 }, { "questionId": "q89", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Sonja Emmerich", "actual": "Sonja Emmerich", "correct": true, - "inputTokens": 9159, - "outputTokens": 6, - "latencyMs": 1791 + "inputTokens": 9158, + "outputTokens": 331, + "latencyMs": 4074.5077089999977 }, { "questionId": "q89", @@ -9765,18 +9765,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 10, - "latencyMs": 1273 + "latencyMs": 1345.891499999998 }, { "questionId": "q89", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Sonja Emmerich", "actual": "Sonja Emmerich", "correct": true, - "inputTokens": 7374, - "outputTokens": 6, - "latencyMs": 2832 + "inputTokens": 7373, + "outputTokens": 75, + "latencyMs": 1885.0838330000115 }, { "questionId": "q89", @@ -9787,18 +9787,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 10, - "latencyMs": 1172 + "latencyMs": 1182.5891669999983 }, { "questionId": "q90", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Frank Emmerich DVM", "actual": "Frank Emmerich DVM", "correct": true, - "inputTokens": 9740, - "outputTokens": 7, - "latencyMs": 1491 + "inputTokens": 9739, + "outputTokens": 140, + "latencyMs": 2772.3258339999884 }, { "questionId": "q90", @@ -9809,18 +9809,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 10, - "latencyMs": 1414 + "latencyMs": 1424.9674579999992 }, { "questionId": "q90", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Frank Emmerich DVM", "actual": "Frank Emmerich DVM", "correct": true, - "inputTokens": 6014, - "outputTokens": 7, - "latencyMs": 1396 + "inputTokens": 6013, + "outputTokens": 204, + "latencyMs": 2900.4731660000107 }, { "questionId": "q90", @@ -9831,18 +9831,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 10, - "latencyMs": 1514 + "latencyMs": 2815.817249999993 }, { "questionId": "q90", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Frank Emmerich DVM", "actual": "Frank Emmerich DVM", "correct": true, - "inputTokens": 6782, - "outputTokens": 7, - "latencyMs": 1573 + "inputTokens": 6781, + "outputTokens": 268, + "latencyMs": 3637.2442089999968 }, { "questionId": "q90", @@ -9853,18 +9853,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 10, - "latencyMs": 1284 + "latencyMs": 1104.2333339999896 }, { "questionId": "q90", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Frank Emmerich DVM", "actual": "Frank Emmerich DVM", "correct": true, - "inputTokens": 9159, - "outputTokens": 7, - "latencyMs": 5400 + "inputTokens": 9158, + "outputTokens": 396, + "latencyMs": 8213.703791999986 }, { "questionId": "q90", @@ -9875,18 +9875,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 10, - "latencyMs": 1486 + "latencyMs": 2875.9923749999725 }, { "questionId": "q90", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Frank Emmerich DVM", "actual": "Frank Emmerich DVM", "correct": true, - "inputTokens": 7374, - "outputTokens": 7, - "latencyMs": 1420 + "inputTokens": 7373, + "outputTokens": 140, + "latencyMs": 2809.8342080000148 }, { "questionId": "q90", @@ -9897,18 +9897,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 10, - "latencyMs": 1410 + "latencyMs": 1306.0824999999895 }, { "questionId": "q91", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Ronald Collins", "actual": "Ronald Collins", "correct": true, - "inputTokens": 9740, - "outputTokens": 4, - "latencyMs": 1248 + "inputTokens": 9739, + "outputTokens": 265, + "latencyMs": 3632.680000000022 }, { "questionId": "q91", @@ -9919,18 +9919,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 5, - "latencyMs": 1177 + "latencyMs": 1446.0535420000087 }, { "questionId": "q91", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Ronald Collins", "actual": "Ronald Collins", "correct": true, - "inputTokens": 6014, - "outputTokens": 4, - "latencyMs": 1601 + "inputTokens": 6013, + "outputTokens": 201, + "latencyMs": 2629.6447500000068 }, { "questionId": "q91", @@ -9941,18 +9941,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 5, - "latencyMs": 1822 + "latencyMs": 1387.298958999978 }, { "questionId": "q91", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Ronald Collins", "actual": "Ronald Collins", "correct": true, - "inputTokens": 6782, - "outputTokens": 4, - "latencyMs": 1103 + "inputTokens": 6781, + "outputTokens": 457, + "latencyMs": 8303.644042 }, { "questionId": "q91", @@ -9963,18 +9963,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 5, - "latencyMs": 1247 + "latencyMs": 1178.2771250000224 }, { "questionId": "q91", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Ronald Collins", "actual": "Ronald Collins", "correct": true, - "inputTokens": 9159, - "outputTokens": 4, - "latencyMs": 1184 + "inputTokens": 9158, + "outputTokens": 329, + "latencyMs": 3967.7135410000046 }, { "questionId": "q91", @@ -9985,18 +9985,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 5, - "latencyMs": 1137 + "latencyMs": 1278.0479160000104 }, { "questionId": "q91", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Ronald Collins", "actual": "Ronald Collins", "correct": true, - "inputTokens": 7374, - "outputTokens": 4, - "latencyMs": 949 + "inputTokens": 7373, + "outputTokens": 73, + "latencyMs": 1974.7658750000119 }, { "questionId": "q91", @@ -10007,18 +10007,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 5, - "latencyMs": 1143 + "latencyMs": 1496.9746670000022 }, { "questionId": "q92", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Jeannie Klein", "actual": "Jeannie Klein", "correct": true, - "inputTokens": 9740, - "outputTokens": 4, - "latencyMs": 1021 + "inputTokens": 9739, + "outputTokens": 201, + "latencyMs": 4246.4962499999965 }, { "questionId": "q92", @@ -10029,18 +10029,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 8, - "latencyMs": 1301 + "latencyMs": 1322.2766660000198 }, { "questionId": "q92", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Jeannie Klein", "actual": "Jeannie Klein", "correct": true, - "inputTokens": 6014, - "outputTokens": 4, - "latencyMs": 1254 + "inputTokens": 6013, + "outputTokens": 137, + "latencyMs": 2135.097083999979 }, { "questionId": "q92", @@ -10051,18 +10051,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 8, - "latencyMs": 1375 + "latencyMs": 1213.9765000000189 }, { "questionId": "q92", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Jeannie Klein", "actual": "Jeannie Klein", "correct": true, - "inputTokens": 6782, - "outputTokens": 4, - "latencyMs": 1316 + "inputTokens": 6781, + "outputTokens": 265, + "latencyMs": 3583.0762920000125 }, { "questionId": "q92", @@ -10073,18 +10073,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 8, - "latencyMs": 2681 + "latencyMs": 1353.168249999988 }, { "questionId": "q92", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Jeannie Klein", "actual": "Jeannie Klein", "correct": true, - "inputTokens": 9159, - "outputTokens": 4, - "latencyMs": 2427 + "inputTokens": 9158, + "outputTokens": 201, + "latencyMs": 3724.366249999992 }, { "questionId": "q92", @@ -10095,18 +10095,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 8, - "latencyMs": 1526 + "latencyMs": 1239.5215000000026 }, { "questionId": "q92", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Jeannie Klein", "actual": "Jeannie Klein", "correct": true, - "inputTokens": 7374, - "outputTokens": 4, - "latencyMs": 1252 + "inputTokens": 7373, + "outputTokens": 137, + "latencyMs": 2863.772667000012 }, { "questionId": "q92", @@ -10117,18 +10117,18 @@ "correct": true, "inputTokens": 8385, "outputTokens": 8, - "latencyMs": 1324 + "latencyMs": 1297.5507919999945 }, { "questionId": "q93", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Joshua Watsica", "actual": "Joshua Watsica", "correct": true, - "inputTokens": 9740, - "outputTokens": 5, - "latencyMs": 1606 + "inputTokens": 9739, + "outputTokens": 202, + "latencyMs": 2533.5459160000028 }, { "questionId": "q93", @@ -10139,18 +10139,18 @@ "correct": true, "inputTokens": 11907, "outputTokens": 8, - "latencyMs": 1223 + "latencyMs": 1313.4649999999965 }, { "questionId": "q93", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Joshua Watsica", "actual": "Joshua Watsica", "correct": true, - "inputTokens": 6014, - "outputTokens": 5, - "latencyMs": 1965 + "inputTokens": 6013, + "outputTokens": 74, + "latencyMs": 1609.448166999995 }, { "questionId": "q93", @@ -10161,18 +10161,18 @@ "correct": true, "inputTokens": 6993, "outputTokens": 8, - "latencyMs": 1300 + "latencyMs": 1257.2229999999981 }, { "questionId": "q93", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Joshua Watsica", "actual": "Joshua Watsica", "correct": true, - "inputTokens": 6782, - "outputTokens": 5, - "latencyMs": 1110 + "inputTokens": 6781, + "outputTokens": 458, + "latencyMs": 5294.154332999984 }, { "questionId": "q93", @@ -10183,18 +10183,18 @@ "correct": true, "inputTokens": 8414, "outputTokens": 8, - "latencyMs": 1819 + "latencyMs": 1363.172208999982 }, { "questionId": "q93", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Joshua Watsica", "actual": "Joshua Watsica", "correct": true, - "inputTokens": 9159, - "outputTokens": 5, - "latencyMs": 1010 + "inputTokens": 9158, + "outputTokens": 74, + "latencyMs": 2154.742499999993 }, { "questionId": "q93", @@ -10205,18 +10205,18 @@ "correct": true, "inputTokens": 9289, "outputTokens": 8, - "latencyMs": 1224 + "latencyMs": 1509.8229580000043 }, { "questionId": "q93", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "Joshua Watsica", "actual": "Joshua Watsica", "correct": true, - "inputTokens": 7374, - "outputTokens": 5, - "latencyMs": 1430 + "inputTokens": 7373, + "outputTokens": 74, + "latencyMs": 2010.5185419999762 }, { "questionId": "q93", @@ -10227,21 +10227,241 @@ "correct": true, "inputTokens": 8385, "outputTokens": 8, - "latencyMs": 1158 + "latencyMs": 1193.5151659999974 }, { "questionId": "q94", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 9735, + "outputTokens": 1031, + "latencyMs": 9550.510582999996 + }, + { + "questionId": "q94", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 11902, + "outputTokens": 5, + "latencyMs": 1146.0822499999776 + }, + { + "questionId": "q94", + "format": "toon", + "model": "gpt-5-nano", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 6009, + "outputTokens": 775, + "latencyMs": 6479.700542000006 + }, + { + "questionId": "q94", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 6988, + "outputTokens": 5, + "latencyMs": 1329.610708000022 + }, + { + "questionId": "q94", + "format": "csv", + "model": "gpt-5-nano", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 6777, + "outputTokens": 967, + "latencyMs": 15240.216207999998 + }, + { + "questionId": "q94", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 8409, + "outputTokens": 5, + "latencyMs": 1203.151125000004 + }, + { + "questionId": "q94", + "format": "markdown-kv", + "model": "gpt-5-nano", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 9154, + "outputTokens": 583, + "latencyMs": 6073.186583000002 + }, + { + "questionId": "q94", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 9284, + "outputTokens": 5, + "latencyMs": 1452.6655419999734 + }, + { + "questionId": "q94", + "format": "yaml", + "model": "gpt-5-nano", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 7369, + "outputTokens": 647, + "latencyMs": 7084.941665999999 + }, + { + "questionId": "q94", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 8380, + "outputTokens": 5, + "latencyMs": 1120.7099159999925 + }, + { + "questionId": "q95", + "format": "json", + "model": "gpt-5-nano", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 9735, + "outputTokens": 903, + "latencyMs": 8906.334791000001 + }, + { + "questionId": "q95", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 11902, + "outputTokens": 5, + "latencyMs": 1109.434333000012 + }, + { + "questionId": "q95", + "format": "toon", + "model": "gpt-5-nano", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 6009, + "outputTokens": 391, + "latencyMs": 4955.000415999995 + }, + { + "questionId": "q95", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "7", + "correct": false, + "inputTokens": 6988, + "outputTokens": 5, + "latencyMs": 1040.817624999996 + }, + { + "questionId": "q95", + "format": "csv", + "model": "gpt-5-nano", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 6777, + "outputTokens": 775, + "latencyMs": 8308.952791000018 + }, + { + "questionId": "q95", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 8409, + "outputTokens": 5, + "latencyMs": 1128.542833000014 + }, + { + "questionId": "q95", + "format": "markdown-kv", + "model": "gpt-5-nano", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 9154, + "outputTokens": 775, + "latencyMs": 7118.855291000014 + }, + { + "questionId": "q95", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 9284, + "outputTokens": 5, + "latencyMs": 1232.1081249999988 + }, + { + "questionId": "q95", + "format": "yaml", + "model": "gpt-5-nano", + "expected": "10", + "actual": "10", + "correct": true, + "inputTokens": 7369, + "outputTokens": 647, + "latencyMs": 6776.706208000018 + }, + { + "questionId": "q95", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "10", + "actual": "8", + "correct": false, + "inputTokens": 8380, + "outputTokens": 5, + "latencyMs": 1677.1033330000064 + }, + { + "questionId": "q96", + "format": "json", + "model": "gpt-5-nano", "expected": "10", "actual": "10", "correct": true, "inputTokens": 9736, - "outputTokens": 2, - "latencyMs": 1352 + "outputTokens": 583, + "latencyMs": 5866.636624999985 }, { - "questionId": "q94", + "questionId": "q96", "format": "json", "model": "claude-haiku-4-5", "expected": "10", @@ -10249,43 +10469,43 @@ "correct": false, "inputTokens": 11902, "outputTokens": 5, - "latencyMs": 1498 + "latencyMs": 1574.224125000008 }, { - "questionId": "q94", + "questionId": "q96", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "10", - "actual": "12", - "correct": false, + "actual": "10", + "correct": true, "inputTokens": 6010, - "outputTokens": 2, - "latencyMs": 1249 + "outputTokens": 711, + "latencyMs": 7998.43637499999 }, { - "questionId": "q94", + "questionId": "q96", "format": "toon", "model": "claude-haiku-4-5", "expected": "10", - "actual": "8", + "actual": "7", "correct": false, "inputTokens": 6988, "outputTokens": 5, - "latencyMs": 1080 + "latencyMs": 1175.3050419999927 }, { - "questionId": "q94", + "questionId": "q96", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "10", - "actual": "12", - "correct": false, + "actual": "10", + "correct": true, "inputTokens": 6778, - "outputTokens": 2, - "latencyMs": 1760 + "outputTokens": 647, + "latencyMs": 6424.974583000003 }, { - "questionId": "q94", + "questionId": "q96", "format": "csv", "model": "claude-haiku-4-5", "expected": "10", @@ -10293,21 +10513,21 @@ "correct": false, "inputTokens": 8409, "outputTokens": 5, - "latencyMs": 1156 + "latencyMs": 1352.1832500000019 }, { - "questionId": "q94", + "questionId": "q96", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "10", "actual": "10", "correct": true, "inputTokens": 9155, - "outputTokens": 2, - "latencyMs": 9923 + "outputTokens": 647, + "latencyMs": 6132.921792000008 }, { - "questionId": "q94", + "questionId": "q96", "format": "markdown-kv", "model": "claude-haiku-4-5", "expected": "10", @@ -10315,348 +10535,128 @@ "correct": false, "inputTokens": 9284, "outputTokens": 5, - "latencyMs": 1138 + "latencyMs": 1241.7496250000258 }, { - "questionId": "q94", + "questionId": "q96", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "10", - "actual": "12", - "correct": false, + "actual": "10", + "correct": true, "inputTokens": 7370, - "outputTokens": 2, - "latencyMs": 1070 + "outputTokens": 455, + "latencyMs": 8074.935457999993 }, { - "questionId": "q94", + "questionId": "q96", "format": "yaml", "model": "claude-haiku-4-5", "expected": "10", - "actual": "8", + "actual": "7", "correct": false, "inputTokens": 8380, "outputTokens": 5, - "latencyMs": 1114 + "latencyMs": 1294.4225830000069 }, { - "questionId": "q95", + "questionId": "q97", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "10", "actual": "10", "correct": true, "inputTokens": 9736, - "outputTokens": 2, - "latencyMs": 830 + "outputTokens": 775, + "latencyMs": 7724.665375000011 }, { - "questionId": "q95", + "questionId": "q97", "format": "json", "model": "claude-haiku-4-5", "expected": "10", - "actual": "8", - "correct": false, + "actual": "10", + "correct": true, "inputTokens": 11902, "outputTokens": 5, - "latencyMs": 1085 + "latencyMs": 1450.864333000005 }, { - "questionId": "q95", + "questionId": "q97", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "10", "actual": "10", "correct": true, "inputTokens": 6010, - "outputTokens": 2, - "latencyMs": 2362 + "outputTokens": 711, + "latencyMs": 5055.026333999995 }, { - "questionId": "q95", + "questionId": "q97", "format": "toon", "model": "claude-haiku-4-5", "expected": "10", - "actual": "7", - "correct": false, + "actual": "10", + "correct": true, "inputTokens": 6988, "outputTokens": 5, - "latencyMs": 1198 + "latencyMs": 1177.2059999999765 }, { - "questionId": "q95", + "questionId": "q97", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "10", "actual": "10", "correct": true, "inputTokens": 6778, - "outputTokens": 2, - "latencyMs": 1630 + "outputTokens": 839, + "latencyMs": 7951.241416999983 }, { - "questionId": "q95", + "questionId": "q97", "format": "csv", "model": "claude-haiku-4-5", "expected": "10", - "actual": "8", - "correct": false, + "actual": "10", + "correct": true, "inputTokens": 8409, "outputTokens": 5, - "latencyMs": 1219 + "latencyMs": 1537.2077500000014 }, { - "questionId": "q95", + "questionId": "q97", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "10", "actual": "10", "correct": true, "inputTokens": 9155, - "outputTokens": 2, - "latencyMs": 2666 + "outputTokens": 519, + "latencyMs": 9752.917709000001 }, { - "questionId": "q95", + "questionId": "q97", "format": "markdown-kv", "model": "claude-haiku-4-5", "expected": "10", - "actual": "8", - "correct": false, + "actual": "10", + "correct": true, "inputTokens": 9284, "outputTokens": 5, - "latencyMs": 1044 + "latencyMs": 1101.1202090000152 }, { - "questionId": "q95", + "questionId": "q97", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "10", - "actual": "12", - "correct": false, + "actual": "10", + "correct": true, "inputTokens": 7370, - "outputTokens": 2, - "latencyMs": 2187 - }, - { - "questionId": "q95", - "format": "yaml", - "model": "claude-haiku-4-5", - "expected": "10", - "actual": "8", - "correct": false, - "inputTokens": 8380, - "outputTokens": 5, - "latencyMs": 1313 - }, - { - "questionId": "q96", - "format": "json", - "model": "gpt-4o-mini", - "expected": "10", - "actual": "20", - "correct": false, - "inputTokens": 9737, - "outputTokens": 2, - "latencyMs": 1087 - }, - { - "questionId": "q96", - "format": "json", - "model": "claude-haiku-4-5", - "expected": "10", - "actual": "8", - "correct": false, - "inputTokens": 11902, - "outputTokens": 5, - "latencyMs": 1292 - }, - { - "questionId": "q96", - "format": "toon", - "model": "gpt-4o-mini", - "expected": "10", - "actual": "15", - "correct": false, - "inputTokens": 6011, - "outputTokens": 2, - "latencyMs": 1979 - }, - { - "questionId": "q96", - "format": "toon", - "model": "claude-haiku-4-5", - "expected": "10", - "actual": "7", - "correct": false, - "inputTokens": 6988, - "outputTokens": 5, - "latencyMs": 1095 - }, - { - "questionId": "q96", - "format": "csv", - "model": "gpt-4o-mini", - "expected": "10", - "actual": "15", - "correct": false, - "inputTokens": 6779, - "outputTokens": 2, - "latencyMs": 1385 - }, - { - "questionId": "q96", - "format": "csv", - "model": "claude-haiku-4-5", - "expected": "10", - "actual": "8", - "correct": false, - "inputTokens": 8409, - "outputTokens": 5, - "latencyMs": 1507 - }, - { - "questionId": "q96", - "format": "markdown-kv", - "model": "gpt-4o-mini", - "expected": "10", - "actual": "10", - "correct": true, - "inputTokens": 9156, - "outputTokens": 2, - "latencyMs": 1579 - }, - { - "questionId": "q96", - "format": "markdown-kv", - "model": "claude-haiku-4-5", - "expected": "10", - "actual": "8", - "correct": false, - "inputTokens": 9284, - "outputTokens": 5, - "latencyMs": 1365 - }, - { - "questionId": "q96", - "format": "yaml", - "model": "gpt-4o-mini", - "expected": "10", - "actual": "20", - "correct": false, - "inputTokens": 7371, - "outputTokens": 2, - "latencyMs": 1661 - }, - { - "questionId": "q96", - "format": "yaml", - "model": "claude-haiku-4-5", - "expected": "10", - "actual": "7", - "correct": false, - "inputTokens": 8380, - "outputTokens": 5, - "latencyMs": 1423 - }, - { - "questionId": "q97", - "format": "json", - "model": "gpt-4o-mini", - "expected": "10", - "actual": "15", - "correct": false, - "inputTokens": 9737, - "outputTokens": 2, - "latencyMs": 1815 - }, - { - "questionId": "q97", - "format": "json", - "model": "claude-haiku-4-5", - "expected": "10", - "actual": "10", - "correct": true, - "inputTokens": 11902, - "outputTokens": 5, - "latencyMs": 1345 - }, - { - "questionId": "q97", - "format": "toon", - "model": "gpt-4o-mini", - "expected": "10", - "actual": "10", - "correct": true, - "inputTokens": 6011, - "outputTokens": 2, - "latencyMs": 2193 - }, - { - "questionId": "q97", - "format": "toon", - "model": "claude-haiku-4-5", - "expected": "10", - "actual": "10", - "correct": true, - "inputTokens": 6988, - "outputTokens": 5, - "latencyMs": 1417 - }, - { - "questionId": "q97", - "format": "csv", - "model": "gpt-4o-mini", - "expected": "10", - "actual": "15", - "correct": false, - "inputTokens": 6779, - "outputTokens": 2, - "latencyMs": 1721 - }, - { - "questionId": "q97", - "format": "csv", - "model": "claude-haiku-4-5", - "expected": "10", - "actual": "10", - "correct": true, - "inputTokens": 8409, - "outputTokens": 5, - "latencyMs": 1114 - }, - { - "questionId": "q97", - "format": "markdown-kv", - "model": "gpt-4o-mini", - "expected": "10", - "actual": "15", - "correct": false, - "inputTokens": 9156, - "outputTokens": 2, - "latencyMs": 2208 - }, - { - "questionId": "q97", - "format": "markdown-kv", - "model": "claude-haiku-4-5", - "expected": "10", - "actual": "10", - "correct": true, - "inputTokens": 9284, - "outputTokens": 5, - "latencyMs": 1895 - }, - { - "questionId": "q97", - "format": "yaml", - "model": "gpt-4o-mini", - "expected": "10", - "actual": "15", - "correct": false, - "inputTokens": 7371, - "outputTokens": 2, - "latencyMs": 1287 + "outputTokens": 647, + "latencyMs": 5711.038375000004 }, { "questionId": "q97", @@ -10667,18 +10667,18 @@ "correct": true, "inputTokens": 8380, "outputTokens": 5, - "latencyMs": 1281 + "latencyMs": 1208.3837910000002 }, { "questionId": "q98", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "10", "actual": "10", "correct": true, - "inputTokens": 9737, - "outputTokens": 2, - "latencyMs": 1387 + "inputTokens": 9736, + "outputTokens": 775, + "latencyMs": 6578.005040999997 }, { "questionId": "q98", @@ -10689,18 +10689,18 @@ "correct": false, "inputTokens": 11902, "outputTokens": 5, - "latencyMs": 1243 + "latencyMs": 1351.4712499999732 }, { "questionId": "q98", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "10", "actual": "10", "correct": true, - "inputTokens": 6011, - "outputTokens": 2, - "latencyMs": 1284 + "inputTokens": 6010, + "outputTokens": 583, + "latencyMs": 6437.821874999994 }, { "questionId": "q98", @@ -10711,18 +10711,18 @@ "correct": false, "inputTokens": 6988, "outputTokens": 5, - "latencyMs": 1161 + "latencyMs": 1155.7898750000168 }, { "questionId": "q98", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "10", - "actual": "15", - "correct": false, - "inputTokens": 6779, - "outputTokens": 2, - "latencyMs": 10406 + "actual": "10", + "correct": true, + "inputTokens": 6778, + "outputTokens": 647, + "latencyMs": 6673.183250000002 }, { "questionId": "q98", @@ -10733,18 +10733,18 @@ "correct": true, "inputTokens": 8409, "outputTokens": 5, - "latencyMs": 1335 + "latencyMs": 1359.994417000009 }, { "questionId": "q98", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "10", "actual": "10", "correct": true, - "inputTokens": 9156, - "outputTokens": 2, - "latencyMs": 1517 + "inputTokens": 9155, + "outputTokens": 647, + "latencyMs": 5806.33679099998 }, { "questionId": "q98", @@ -10755,18 +10755,18 @@ "correct": true, "inputTokens": 9284, "outputTokens": 5, - "latencyMs": 1702 + "latencyMs": 1339.4869999999937 }, { "questionId": "q98", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "10", "actual": "10", "correct": true, - "inputTokens": 7371, - "outputTokens": 2, - "latencyMs": 1676 + "inputTokens": 7370, + "outputTokens": 519, + "latencyMs": 6011.0411669999885 }, { "questionId": "q98", @@ -10777,40 +10777,40 @@ "correct": false, "inputTokens": 8380, "outputTokens": 5, - "latencyMs": 1218 + "latencyMs": 1305.6029999999737 }, { "questionId": "q99", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "42342.25", - "actual": "$50,000.00", + "actual": "41001.14", "correct": false, - "inputTokens": 9737, - "outputTokens": 7, - "latencyMs": 1407 + "inputTokens": 9736, + "outputTokens": 1226, + "latencyMs": 11276.714458000002 }, { "questionId": "q99", "format": "json", "model": "claude-haiku-4-5", "expected": "42342.25", - "actual": "50,847.47", + "actual": "48,847.66", "correct": false, "inputTokens": 11902, "outputTokens": 9, - "latencyMs": 1443 + "latencyMs": 1400.5162910000072 }, { "questionId": "q99", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "42342.25", - "actual": "Total revenue across all orders is 42,195.36.", - "correct": false, - "inputTokens": 6011, - "outputTokens": 14, - "latencyMs": 1150 + "actual": "42342.25", + "correct": true, + "inputTokens": 6010, + "outputTokens": 5962, + "latencyMs": 50971.727667 }, { "questionId": "q99", @@ -10821,18 +10821,18 @@ "correct": false, "inputTokens": 6988, "outputTokens": 9, - "latencyMs": 1774 + "latencyMs": 1118.9986250000075 }, { "questionId": "q99", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "42342.25", - "actual": "$32,186.73", - "correct": false, - "inputTokens": 6779, - "outputTokens": 7, - "latencyMs": 2654 + "actual": "42342.25", + "correct": true, + "inputTokens": 6778, + "outputTokens": 3082, + "latencyMs": 22816.508165999985 }, { "questionId": "q99", @@ -10843,62 +10843,62 @@ "correct": false, "inputTokens": 8409, "outputTokens": 9, - "latencyMs": 1386 + "latencyMs": 1104.31912499998 }, { "questionId": "q99", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "42342.25", - "actual": "$34,186.73", + "actual": "42425.97", "correct": false, - "inputTokens": 9156, - "outputTokens": 7, - "latencyMs": 1506 + "inputTokens": 9155, + "outputTokens": 2762, + "latencyMs": 17412.623583000008 }, { "questionId": "q99", "format": "markdown-kv", "model": "claude-haiku-4-5", "expected": "42342.25", - "actual": "48,847.47", - "correct": false, - "inputTokens": 9284, - "outputTokens": 9, - "latencyMs": 1509 - }, - { - "questionId": "q99", - "format": "yaml", - "model": "gpt-4o-mini", - "expected": "42342.25", - "actual": "Total revenue across all orders is 48780.73.", - "correct": false, - "inputTokens": 7371, - "outputTokens": 13, - "latencyMs": 1700 - }, - { - "questionId": "q99", - "format": "yaml", - "model": "claude-haiku-4-5", - "expected": "42342.25", "actual": "47,847.47", "correct": false, + "inputTokens": 9284, + "outputTokens": 9, + "latencyMs": 1435.553082999977 + }, + { + "questionId": "q99", + "format": "yaml", + "model": "gpt-5-nano", + "expected": "42342.25", + "actual": "42342.25", + "correct": true, + "inputTokens": 7370, + "outputTokens": 3402, + "latencyMs": 26299.00112500001 + }, + { + "questionId": "q99", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "42342.25", + "actual": "41,847.47", + "correct": false, "inputTokens": 8380, "outputTokens": 9, - "latencyMs": 1230 + "latencyMs": 1272.4541250000184 }, { "questionId": "q100", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "44", - "actual": "36", - "correct": false, - "inputTokens": 9739, - "outputTokens": 2, - "latencyMs": 1725 + "actual": "44", + "correct": true, + "inputTokens": 9738, + "outputTokens": 1351, + "latencyMs": 13461.932250000013 }, { "questionId": "q100", @@ -10909,18 +10909,18 @@ "correct": false, "inputTokens": 11904, "outputTokens": 5, - "latencyMs": 1377 + "latencyMs": 1772.9891250000219 }, { "questionId": "q100", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "44", - "actual": "34", - "correct": false, - "inputTokens": 6013, - "outputTokens": 2, - "latencyMs": 1399 + "actual": "44", + "correct": true, + "inputTokens": 6012, + "outputTokens": 1735, + "latencyMs": 14196.807250000013 }, { "questionId": "q100", @@ -10931,18 +10931,18 @@ "correct": false, "inputTokens": 6990, "outputTokens": 5, - "latencyMs": 1094 + "latencyMs": 1749.7322920000006 }, { "questionId": "q100", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "44", - "actual": "34", - "correct": false, - "inputTokens": 6781, - "outputTokens": 2, - "latencyMs": 1617 + "actual": "44", + "correct": true, + "inputTokens": 6780, + "outputTokens": 1863, + "latencyMs": 14291.044916999992 }, { "questionId": "q100", @@ -10953,18 +10953,18 @@ "correct": false, "inputTokens": 8411, "outputTokens": 5, - "latencyMs": 1344 + "latencyMs": 1453.1822079999838 }, { "questionId": "q100", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "44", - "actual": "36", - "correct": false, - "inputTokens": 9158, - "outputTokens": 2, - "latencyMs": 2396 + "actual": "44", + "correct": true, + "inputTokens": 9157, + "outputTokens": 1799, + "latencyMs": 16012.806332999986 }, { "questionId": "q100", @@ -10975,18 +10975,18 @@ "correct": false, "inputTokens": 9286, "outputTokens": 5, - "latencyMs": 1145 + "latencyMs": 1761.131041000015 }, { "questionId": "q100", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "44", - "actual": "36", - "correct": false, - "inputTokens": 7373, - "outputTokens": 2, - "latencyMs": 951 + "actual": "44", + "correct": true, + "inputTokens": 7372, + "outputTokens": 1415, + "latencyMs": 12218.14491599999 }, { "questionId": "q100", @@ -10997,18 +10997,18 @@ "correct": false, "inputTokens": 8382, "outputTokens": 5, - "latencyMs": 1311 + "latencyMs": 1255.681917000009 }, { "questionId": "q101", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "39", - "actual": "34", - "correct": false, - "inputTokens": 9739, - "outputTokens": 2, - "latencyMs": 866 + "actual": "39", + "correct": true, + "inputTokens": 9738, + "outputTokens": 2311, + "latencyMs": 22316.87704199998 }, { "questionId": "q101", @@ -11019,18 +11019,18 @@ "correct": false, "inputTokens": 11904, "outputTokens": 5, - "latencyMs": 1964 + "latencyMs": 1090.176792000013 }, { "questionId": "q101", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "39", - "actual": "30", - "correct": false, - "inputTokens": 6013, - "outputTokens": 2, - "latencyMs": 1994 + "actual": "39", + "correct": true, + "inputTokens": 6012, + "outputTokens": 1095, + "latencyMs": 7211.767082999984 }, { "questionId": "q101", @@ -11041,18 +11041,18 @@ "correct": false, "inputTokens": 6990, "outputTokens": 5, - "latencyMs": 1277 + "latencyMs": 1129.9290000000037 }, { "questionId": "q101", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "39", - "actual": "32", - "correct": false, - "inputTokens": 6781, - "outputTokens": 2, - "latencyMs": 1884 + "actual": "39", + "correct": true, + "inputTokens": 6780, + "outputTokens": 1415, + "latencyMs": 15701.471499999985 }, { "questionId": "q101", @@ -11063,40 +11063,40 @@ "correct": false, "inputTokens": 8411, "outputTokens": 5, - "latencyMs": 1282 + "latencyMs": 1251.5472500000033 }, { "questionId": "q101", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "39", - "actual": "32", - "correct": false, - "inputTokens": 9158, - "outputTokens": 2, - "latencyMs": 1761 + "actual": "39", + "correct": true, + "inputTokens": 9157, + "outputTokens": 1799, + "latencyMs": 16689.30345800001 }, { "questionId": "q101", "format": "markdown-kv", "model": "claude-haiku-4-5", "expected": "39", - "actual": "38", + "actual": "41", "correct": false, "inputTokens": 9286, "outputTokens": 5, - "latencyMs": 1250 + "latencyMs": 1168.8190419999883 }, { "questionId": "q101", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "39", - "actual": "32", - "correct": false, - "inputTokens": 7373, - "outputTokens": 2, - "latencyMs": 1316 + "actual": "39", + "correct": true, + "inputTokens": 7372, + "outputTokens": 1863, + "latencyMs": 14505.393958999979 }, { "questionId": "q101", @@ -11107,18 +11107,18 @@ "correct": false, "inputTokens": 8382, "outputTokens": 5, - "latencyMs": 1373 + "latencyMs": 1149.8783330000006 }, { "questionId": "q102", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "32", - "actual": "27", - "correct": false, - "inputTokens": 9739, - "outputTokens": 2, - "latencyMs": 1389 + "actual": "32", + "correct": true, + "inputTokens": 9738, + "outputTokens": 1607, + "latencyMs": 13945.93979200002 }, { "questionId": "q102", @@ -11129,18 +11129,18 @@ "correct": false, "inputTokens": 11904, "outputTokens": 5, - "latencyMs": 1215 + "latencyMs": 1175.8143749999872 }, { "questionId": "q102", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "32", - "actual": "24", - "correct": false, - "inputTokens": 6013, - "outputTokens": 2, - "latencyMs": 1034 + "actual": "32", + "correct": true, + "inputTokens": 6012, + "outputTokens": 1351, + "latencyMs": 11991.764750000002 }, { "questionId": "q102", @@ -11151,18 +11151,18 @@ "correct": false, "inputTokens": 6990, "outputTokens": 5, - "latencyMs": 1063 + "latencyMs": 1643.4279169999936 }, { "questionId": "q102", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "32", - "actual": "25", - "correct": false, - "inputTokens": 6781, - "outputTokens": 2, - "latencyMs": 7312 + "actual": "32", + "correct": true, + "inputTokens": 6780, + "outputTokens": 1799, + "latencyMs": 17324.695000000007 }, { "questionId": "q102", @@ -11173,18 +11173,18 @@ "correct": false, "inputTokens": 8411, "outputTokens": 5, - "latencyMs": 1387 + "latencyMs": 1197.7254160000011 }, { "questionId": "q102", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "32", - "actual": "27", - "correct": false, - "inputTokens": 9158, - "outputTokens": 2, - "latencyMs": 1488 + "actual": "32", + "correct": true, + "inputTokens": 9157, + "outputTokens": 1607, + "latencyMs": 22426.01029199999 }, { "questionId": "q102", @@ -11195,18 +11195,18 @@ "correct": false, "inputTokens": 9286, "outputTokens": 5, - "latencyMs": 1268 + "latencyMs": 1065.6509170000209 }, { "questionId": "q102", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "32", - "actual": "27", + "actual": "31", "correct": false, - "inputTokens": 7373, - "outputTokens": 2, - "latencyMs": 1274 + "inputTokens": 7372, + "outputTokens": 1543, + "latencyMs": 12786.843416999996 }, { "questionId": "q102", @@ -11217,18 +11217,18 @@ "correct": false, "inputTokens": 8382, "outputTokens": 5, - "latencyMs": 1354 + "latencyMs": 2054.993749999994 }, { "questionId": "q103", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6975", "actual": "6975", "correct": true, - "inputTokens": 3713, - "outputTokens": 3, - "latencyMs": 1330 + "inputTokens": 3712, + "outputTokens": 72, + "latencyMs": 2244.986208999995 }, { "questionId": "q103", @@ -11239,18 +11239,18 @@ "correct": true, "inputTokens": 4080, "outputTokens": 6, - "latencyMs": 1437 + "latencyMs": 1162.9390420000127 }, { "questionId": "q103", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6975", "actual": "6975", "correct": true, - "inputTokens": 1564, - "outputTokens": 3, - "latencyMs": 1341 + "inputTokens": 1563, + "outputTokens": 136, + "latencyMs": 2179.3558330000087 }, { "questionId": "q103", @@ -11261,18 +11261,18 @@ "correct": true, "inputTokens": 1509, "outputTokens": 6, - "latencyMs": 1231 + "latencyMs": 1013.4975409999897 }, { "questionId": "q103", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6975", "actual": "6975", "correct": true, - "inputTokens": 1442, - "outputTokens": 3, - "latencyMs": 2515 + "inputTokens": 1441, + "outputTokens": 72, + "latencyMs": 4859.720833999978 }, { "questionId": "q103", @@ -11283,18 +11283,18 @@ "correct": true, "inputTokens": 1445, "outputTokens": 6, - "latencyMs": 1162 + "latencyMs": 1437.758375000005 }, { "questionId": "q103", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6975", "actual": "6975", "correct": true, - "inputTokens": 3830, - "outputTokens": 3, - "latencyMs": 868 + "inputTokens": 3829, + "outputTokens": 72, + "latencyMs": 3120.702874999988 }, { "questionId": "q103", @@ -11305,18 +11305,18 @@ "correct": true, "inputTokens": 3415, "outputTokens": 6, - "latencyMs": 1149 + "latencyMs": 1051.775708000001 }, { "questionId": "q103", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6975", "actual": "6975", "correct": true, - "inputTokens": 2986, - "outputTokens": 3, - "latencyMs": 1183 + "inputTokens": 2985, + "outputTokens": 72, + "latencyMs": 2182.880084000004 }, { "questionId": "q103", @@ -11327,18 +11327,18 @@ "correct": true, "inputTokens": 3110, "outputTokens": 6, - "latencyMs": 1119 + "latencyMs": 1045.2009580000013 }, { "questionId": "q104", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6686.23", "actual": "6686.23", "correct": true, - "inputTokens": 3712, - "outputTokens": 5, - "latencyMs": 1273 + "inputTokens": 3711, + "outputTokens": 138, + "latencyMs": 5291.923750000016 }, { "questionId": "q104", @@ -11349,18 +11349,18 @@ "correct": true, "inputTokens": 4079, "outputTokens": 8, - "latencyMs": 1371 + "latencyMs": 1009.6958750000049 }, { "questionId": "q104", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6686.23", "actual": "6686.23", "correct": true, - "inputTokens": 1563, - "outputTokens": 5, - "latencyMs": 2052 + "inputTokens": 1562, + "outputTokens": 74, + "latencyMs": 2582.2320419999887 }, { "questionId": "q104", @@ -11371,18 +11371,18 @@ "correct": true, "inputTokens": 1508, "outputTokens": 8, - "latencyMs": 997 + "latencyMs": 1203.816542000015 }, { "questionId": "q104", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6686.23", "actual": "6686.23", "correct": true, - "inputTokens": 1441, - "outputTokens": 5, - "latencyMs": 1152 + "inputTokens": 1440, + "outputTokens": 138, + "latencyMs": 2774.835167000012 }, { "questionId": "q104", @@ -11393,18 +11393,18 @@ "correct": true, "inputTokens": 1444, "outputTokens": 8, - "latencyMs": 1188 + "latencyMs": 979.9191669999855 }, { "questionId": "q104", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6686.23", "actual": "6686.23", "correct": true, - "inputTokens": 3829, - "outputTokens": 5, - "latencyMs": 1259 + "inputTokens": 3828, + "outputTokens": 138, + "latencyMs": 2616.684333000012 }, { "questionId": "q104", @@ -11415,18 +11415,18 @@ "correct": true, "inputTokens": 3414, "outputTokens": 8, - "latencyMs": 1239 + "latencyMs": 1253.4844169999997 }, { "questionId": "q104", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6686.23", "actual": "6686.23", "correct": true, - "inputTokens": 2985, - "outputTokens": 5, - "latencyMs": 1096 + "inputTokens": 2984, + "outputTokens": 74, + "latencyMs": 2267.1155000000144 }, { "questionId": "q104", @@ -11437,18 +11437,18 @@ "correct": true, "inputTokens": 3109, "outputTokens": 8, - "latencyMs": 1247 + "latencyMs": 1185.4212080000143 }, { "questionId": "q105", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "7500", "actual": "7500", "correct": true, - "inputTokens": 3713, - "outputTokens": 3, - "latencyMs": 1354 + "inputTokens": 3712, + "outputTokens": 136, + "latencyMs": 2905.6011250000156 }, { "questionId": "q105", @@ -11459,18 +11459,18 @@ "correct": true, "inputTokens": 4080, "outputTokens": 6, - "latencyMs": 1083 + "latencyMs": 1571.1469999999972 }, { "questionId": "q105", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "7500", "actual": "7500", "correct": true, - "inputTokens": 1564, - "outputTokens": 3, - "latencyMs": 869 + "inputTokens": 1563, + "outputTokens": 328, + "latencyMs": 3884.65858399999 }, { "questionId": "q105", @@ -11481,18 +11481,18 @@ "correct": true, "inputTokens": 1509, "outputTokens": 6, - "latencyMs": 1051 + "latencyMs": 1207.1518330000108 }, { "questionId": "q105", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "7500", "actual": "7500", "correct": true, - "inputTokens": 1442, - "outputTokens": 3, - "latencyMs": 1528 + "inputTokens": 1441, + "outputTokens": 72, + "latencyMs": 1995.0557919999992 }, { "questionId": "q105", @@ -11503,18 +11503,18 @@ "correct": true, "inputTokens": 1445, "outputTokens": 6, - "latencyMs": 1126 + "latencyMs": 1238.8113749999902 }, { "questionId": "q105", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "7500", "actual": "7500", "correct": true, - "inputTokens": 3830, - "outputTokens": 3, - "latencyMs": 1136 + "inputTokens": 3829, + "outputTokens": 136, + "latencyMs": 5824.06574999998 }, { "questionId": "q105", @@ -11525,18 +11525,18 @@ "correct": true, "inputTokens": 3415, "outputTokens": 6, - "latencyMs": 1121 + "latencyMs": 1337.474749999994 }, { "questionId": "q105", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "7500", "actual": "7500", "correct": true, - "inputTokens": 2986, - "outputTokens": 3, - "latencyMs": 1217 + "inputTokens": 2985, + "outputTokens": 136, + "latencyMs": 2286.1839580000087 }, { "questionId": "q105", @@ -11547,18 +11547,18 @@ "correct": true, "inputTokens": 3110, "outputTokens": 6, - "latencyMs": 1099 + "latencyMs": 1326.3640000000014 }, { "questionId": "q106", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "14297.05", "actual": "14297.05", "correct": true, - "inputTokens": 3712, - "outputTokens": 5, - "latencyMs": 1416 + "inputTokens": 3711, + "outputTokens": 138, + "latencyMs": 3801.309249999991 }, { "questionId": "q106", @@ -11569,18 +11569,18 @@ "correct": true, "inputTokens": 4079, "outputTokens": 8, - "latencyMs": 1526 + "latencyMs": 1054.8991249999963 }, { "questionId": "q106", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "14297.05", "actual": "14297.05", "correct": true, - "inputTokens": 1563, - "outputTokens": 5, - "latencyMs": 1350 + "inputTokens": 1562, + "outputTokens": 74, + "latencyMs": 3338.1347499999974 }, { "questionId": "q106", @@ -11591,18 +11591,18 @@ "correct": true, "inputTokens": 1508, "outputTokens": 8, - "latencyMs": 1330 + "latencyMs": 1393.589082999999 }, { "questionId": "q106", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "14297.05", "actual": "14297.05", "correct": true, - "inputTokens": 1441, - "outputTokens": 5, - "latencyMs": 2337 + "inputTokens": 1440, + "outputTokens": 202, + "latencyMs": 3719.6092089999875 }, { "questionId": "q106", @@ -11613,18 +11613,18 @@ "correct": true, "inputTokens": 1444, "outputTokens": 8, - "latencyMs": 1171 + "latencyMs": 1030.9656669999822 }, { "questionId": "q106", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "14297.05", "actual": "14297.05", "correct": true, - "inputTokens": 3829, - "outputTokens": 5, - "latencyMs": 3128 + "inputTokens": 3828, + "outputTokens": 74, + "latencyMs": 2226.628250000009 }, { "questionId": "q106", @@ -11635,18 +11635,18 @@ "correct": true, "inputTokens": 3414, "outputTokens": 8, - "latencyMs": 1151 + "latencyMs": 1154.132540999999 }, { "questionId": "q106", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "14297.05", "actual": "14297.05", "correct": true, - "inputTokens": 2985, - "outputTokens": 5, - "latencyMs": 1988 + "inputTokens": 2984, + "outputTokens": 138, + "latencyMs": 2922.2590830000117 }, { "questionId": "q106", @@ -11657,18 +11657,18 @@ "correct": true, "inputTokens": 3109, "outputTokens": 8, - "latencyMs": 1166 + "latencyMs": 2048.011916999996 }, { "questionId": "q107", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6692", "actual": "6692", "correct": true, - "inputTokens": 3713, - "outputTokens": 3, - "latencyMs": 2217 + "inputTokens": 3712, + "outputTokens": 200, + "latencyMs": 2520.5313329999917 }, { "questionId": "q107", @@ -11679,18 +11679,18 @@ "correct": true, "inputTokens": 4080, "outputTokens": 6, - "latencyMs": 1114 + "latencyMs": 943.3422089999949 }, { "questionId": "q107", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6692", "actual": "6692", "correct": true, - "inputTokens": 1564, - "outputTokens": 3, - "latencyMs": 1360 + "inputTokens": 1563, + "outputTokens": 136, + "latencyMs": 2300.8406249999825 }, { "questionId": "q107", @@ -11701,18 +11701,18 @@ "correct": true, "inputTokens": 1509, "outputTokens": 6, - "latencyMs": 1079 + "latencyMs": 1128.4146670000046 }, { "questionId": "q107", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6692", "actual": "6692", "correct": true, - "inputTokens": 1442, - "outputTokens": 3, - "latencyMs": 1951 + "inputTokens": 1441, + "outputTokens": 200, + "latencyMs": 2929.585208000004 }, { "questionId": "q107", @@ -11723,18 +11723,18 @@ "correct": true, "inputTokens": 1445, "outputTokens": 6, - "latencyMs": 1173 + "latencyMs": 1230.4635420000122 }, { "questionId": "q107", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6692", "actual": "6692", "correct": true, - "inputTokens": 3830, - "outputTokens": 3, - "latencyMs": 1076 + "inputTokens": 3829, + "outputTokens": 136, + "latencyMs": 3650.3654169999936 }, { "questionId": "q107", @@ -11745,18 +11745,18 @@ "correct": true, "inputTokens": 3415, "outputTokens": 6, - "latencyMs": 1098 + "latencyMs": 985.8184590000019 }, { "questionId": "q107", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6692", "actual": "6692", "correct": true, - "inputTokens": 2986, - "outputTokens": 3, - "latencyMs": 1101 + "inputTokens": 2985, + "outputTokens": 328, + "latencyMs": 3772.2553330000082 }, { "questionId": "q107", @@ -11767,18 +11767,18 @@ "correct": true, "inputTokens": 3110, "outputTokens": 6, - "latencyMs": 1254 + "latencyMs": 1311.8630419999827 }, { "questionId": "q108", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "9302.76", "actual": "9302.76", "correct": true, - "inputTokens": 3712, - "outputTokens": 5, - "latencyMs": 2041 + "inputTokens": 3711, + "outputTokens": 138, + "latencyMs": 2935.785124999995 }, { "questionId": "q108", @@ -11789,18 +11789,18 @@ "correct": true, "inputTokens": 4079, "outputTokens": 8, - "latencyMs": 1405 + "latencyMs": 1391.9168749999953 }, { "questionId": "q108", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "9302.76", "actual": "9302.76", "correct": true, - "inputTokens": 1563, - "outputTokens": 5, - "latencyMs": 1170 + "inputTokens": 1562, + "outputTokens": 138, + "latencyMs": 5759.15529200001 }, { "questionId": "q108", @@ -11811,18 +11811,18 @@ "correct": true, "inputTokens": 1508, "outputTokens": 8, - "latencyMs": 1161 + "latencyMs": 1064.3980420000153 }, { "questionId": "q108", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "9302.76", "actual": "9302.76", "correct": true, - "inputTokens": 1441, - "outputTokens": 5, - "latencyMs": 1326 + "inputTokens": 1440, + "outputTokens": 74, + "latencyMs": 3640.193708000006 }, { "questionId": "q108", @@ -11833,18 +11833,18 @@ "correct": true, "inputTokens": 1444, "outputTokens": 8, - "latencyMs": 1259 + "latencyMs": 983.806166000024 }, { "questionId": "q108", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "9302.76", "actual": "9302.76", "correct": true, - "inputTokens": 3829, - "outputTokens": 5, - "latencyMs": 3006 + "inputTokens": 3828, + "outputTokens": 266, + "latencyMs": 2604.2135000000126 }, { "questionId": "q108", @@ -11855,18 +11855,18 @@ "correct": true, "inputTokens": 3414, "outputTokens": 8, - "latencyMs": 1461 + "latencyMs": 1128.6182499999995 }, { "questionId": "q108", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "9302.76", "actual": "9302.76", "correct": true, - "inputTokens": 2985, - "outputTokens": 5, - "latencyMs": 3824 + "inputTokens": 2984, + "outputTokens": 138, + "latencyMs": 2548.5608749999956 }, { "questionId": "q108", @@ -11877,18 +11877,18 @@ "correct": true, "inputTokens": 3109, "outputTokens": 8, - "latencyMs": 1391 + "latencyMs": 1029.5365000000165 }, { "questionId": "q109", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "3285", "actual": "3285", "correct": true, - "inputTokens": 3713, - "outputTokens": 3, - "latencyMs": 1091 + "inputTokens": 3712, + "outputTokens": 136, + "latencyMs": 3983.6009170000034 }, { "questionId": "q109", @@ -11899,18 +11899,18 @@ "correct": true, "inputTokens": 4080, "outputTokens": 6, - "latencyMs": 1188 + "latencyMs": 1095.2366250000196 }, { "questionId": "q109", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "3285", "actual": "3285", "correct": true, - "inputTokens": 1564, - "outputTokens": 3, - "latencyMs": 1450 + "inputTokens": 1563, + "outputTokens": 72, + "latencyMs": 2207.884417000023 }, { "questionId": "q109", @@ -11921,18 +11921,18 @@ "correct": true, "inputTokens": 1509, "outputTokens": 6, - "latencyMs": 1614 + "latencyMs": 2292.4111660000053 }, { "questionId": "q109", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "3285", "actual": "3285", "correct": true, - "inputTokens": 1442, - "outputTokens": 3, - "latencyMs": 1642 + "inputTokens": 1441, + "outputTokens": 136, + "latencyMs": 2749.430541000009 }, { "questionId": "q109", @@ -11943,18 +11943,18 @@ "correct": true, "inputTokens": 1445, "outputTokens": 6, - "latencyMs": 1311 + "latencyMs": 1215.8329999999842 }, { "questionId": "q109", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "3285", "actual": "3285", "correct": true, - "inputTokens": 3830, - "outputTokens": 3, - "latencyMs": 1201 + "inputTokens": 3829, + "outputTokens": 136, + "latencyMs": 2086.6161659999925 }, { "questionId": "q109", @@ -11965,18 +11965,18 @@ "correct": true, "inputTokens": 3415, "outputTokens": 6, - "latencyMs": 1261 + "latencyMs": 1299.715790999995 }, { "questionId": "q109", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "3285", "actual": "3285", "correct": true, - "inputTokens": 2986, - "outputTokens": 3, - "latencyMs": 856 + "inputTokens": 2985, + "outputTokens": 136, + "latencyMs": 7107.394916999998 }, { "questionId": "q109", @@ -11987,18 +11987,18 @@ "correct": true, "inputTokens": 3110, "outputTokens": 6, - "latencyMs": 980 + "latencyMs": 899.2319579999894 }, { "questionId": "q110", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "3826.93", "actual": "3826.93", "correct": true, - "inputTokens": 3712, - "outputTokens": 5, - "latencyMs": 3090 + "inputTokens": 3711, + "outputTokens": 138, + "latencyMs": 2810.5213330000115 }, { "questionId": "q110", @@ -12009,18 +12009,18 @@ "correct": true, "inputTokens": 4079, "outputTokens": 8, - "latencyMs": 1123 + "latencyMs": 989.2326659999962 }, { "questionId": "q110", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "3826.93", "actual": "3826.93", "correct": true, - "inputTokens": 1563, - "outputTokens": 5, - "latencyMs": 2911 + "inputTokens": 1562, + "outputTokens": 138, + "latencyMs": 2622.7841670000053 }, { "questionId": "q110", @@ -12031,18 +12031,18 @@ "correct": true, "inputTokens": 1508, "outputTokens": 8, - "latencyMs": 979 + "latencyMs": 850.1227920000092 }, { "questionId": "q110", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "3826.93", "actual": "3826.93", "correct": true, - "inputTokens": 1441, - "outputTokens": 5, - "latencyMs": 1118 + "inputTokens": 1440, + "outputTokens": 138, + "latencyMs": 3057.1578750000044 }, { "questionId": "q110", @@ -12053,18 +12053,18 @@ "correct": true, "inputTokens": 1444, "outputTokens": 8, - "latencyMs": 943 + "latencyMs": 1261.3340000000026 }, { "questionId": "q110", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "3826.93", "actual": "3826.93", "correct": true, - "inputTokens": 3829, - "outputTokens": 5, - "latencyMs": 2639 + "inputTokens": 3828, + "outputTokens": 202, + "latencyMs": 3061.791499999992 }, { "questionId": "q110", @@ -12075,18 +12075,18 @@ "correct": true, "inputTokens": 3414, "outputTokens": 8, - "latencyMs": 1187 + "latencyMs": 1196.6509999999835 }, { "questionId": "q110", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "3826.93", "actual": "3826.93", "correct": true, - "inputTokens": 2985, - "outputTokens": 5, - "latencyMs": 2402 + "inputTokens": 2984, + "outputTokens": 138, + "latencyMs": 3567.4540839999972 }, { "questionId": "q110", @@ -12097,18 +12097,18 @@ "correct": true, "inputTokens": 3109, "outputTokens": 8, - "latencyMs": 1723 + "latencyMs": 1033.8556249999965 }, { "questionId": "q111", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6191", "actual": "6191", "correct": true, - "inputTokens": 3713, - "outputTokens": 3, - "latencyMs": 2401 + "inputTokens": 3712, + "outputTokens": 136, + "latencyMs": 2842.961707999988 }, { "questionId": "q111", @@ -12119,18 +12119,18 @@ "correct": true, "inputTokens": 4080, "outputTokens": 6, - "latencyMs": 1117 + "latencyMs": 1258.130582999991 }, { "questionId": "q111", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6191", "actual": "6191", "correct": true, - "inputTokens": 1564, - "outputTokens": 3, - "latencyMs": 1568 + "inputTokens": 1563, + "outputTokens": 456, + "latencyMs": 5828.652415999997 }, { "questionId": "q111", @@ -12141,18 +12141,18 @@ "correct": true, "inputTokens": 1509, "outputTokens": 6, - "latencyMs": 1132 + "latencyMs": 1004.821958000015 }, { "questionId": "q111", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6191", "actual": "6191", "correct": true, - "inputTokens": 1442, - "outputTokens": 3, - "latencyMs": 1478 + "inputTokens": 1441, + "outputTokens": 72, + "latencyMs": 3102.38612499999 }, { "questionId": "q111", @@ -12163,18 +12163,18 @@ "correct": true, "inputTokens": 1445, "outputTokens": 6, - "latencyMs": 1831 + "latencyMs": 1454.8658750000177 }, { "questionId": "q111", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6191", "actual": "6191", "correct": true, - "inputTokens": 3830, - "outputTokens": 3, - "latencyMs": 1631 + "inputTokens": 3829, + "outputTokens": 136, + "latencyMs": 2018.8434999999881 }, { "questionId": "q111", @@ -12185,18 +12185,18 @@ "correct": true, "inputTokens": 3415, "outputTokens": 6, - "latencyMs": 1371 + "latencyMs": 1237.4057080000057 }, { "questionId": "q111", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6191", "actual": "6191", "correct": true, - "inputTokens": 2986, - "outputTokens": 3, - "latencyMs": 1209 + "inputTokens": 2985, + "outputTokens": 136, + "latencyMs": 3670.7451670000155 }, { "questionId": "q111", @@ -12207,18 +12207,18 @@ "correct": true, "inputTokens": 3110, "outputTokens": 6, - "latencyMs": 1411 + "latencyMs": 1070.646584000002 }, { "questionId": "q112", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1854.66", "actual": "1854.66", "correct": true, - "inputTokens": 3712, - "outputTokens": 5, - "latencyMs": 1773 + "inputTokens": 3711, + "outputTokens": 202, + "latencyMs": 3731.3879579999775 }, { "questionId": "q112", @@ -12229,18 +12229,18 @@ "correct": true, "inputTokens": 4079, "outputTokens": 8, - "latencyMs": 1090 + "latencyMs": 1387.9798329999903 }, { "questionId": "q112", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1854.66", "actual": "1854.66", "correct": true, - "inputTokens": 1563, - "outputTokens": 5, - "latencyMs": 1354 + "inputTokens": 1562, + "outputTokens": 394, + "latencyMs": 5560.397957999987 }, { "questionId": "q112", @@ -12251,18 +12251,18 @@ "correct": true, "inputTokens": 1508, "outputTokens": 8, - "latencyMs": 1095 + "latencyMs": 1552.963958999986 }, { "questionId": "q112", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1854.66", "actual": "1854.66", "correct": true, - "inputTokens": 1441, - "outputTokens": 5, - "latencyMs": 1135 + "inputTokens": 1440, + "outputTokens": 138, + "latencyMs": 21759.84366700001 }, { "questionId": "q112", @@ -12273,18 +12273,18 @@ "correct": true, "inputTokens": 1444, "outputTokens": 8, - "latencyMs": 976 + "latencyMs": 1132.519083000021 }, { "questionId": "q112", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1854.66", "actual": "1854.66", "correct": true, - "inputTokens": 3829, - "outputTokens": 5, - "latencyMs": 1311 + "inputTokens": 3828, + "outputTokens": 138, + "latencyMs": 2277.2652499999967 }, { "questionId": "q112", @@ -12295,18 +12295,18 @@ "correct": true, "inputTokens": 3414, "outputTokens": 8, - "latencyMs": 1287 + "latencyMs": 1098.0825420000183 }, { "questionId": "q112", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1854.66", "actual": "1854.66", "correct": true, - "inputTokens": 2985, - "outputTokens": 5, - "latencyMs": 1288 + "inputTokens": 2984, + "outputTokens": 202, + "latencyMs": 2813.10504200001 }, { "questionId": "q112", @@ -12317,18 +12317,18 @@ "correct": true, "inputTokens": 3109, "outputTokens": 8, - "latencyMs": 1157 + "latencyMs": 1131.9674159999995 }, { "questionId": "q113", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "4696", "actual": "4696", "correct": true, - "inputTokens": 3713, - "outputTokens": 3, - "latencyMs": 1328 + "inputTokens": 3712, + "outputTokens": 136, + "latencyMs": 6657.446207999979 }, { "questionId": "q113", @@ -12339,18 +12339,18 @@ "correct": true, "inputTokens": 4080, "outputTokens": 6, - "latencyMs": 1068 + "latencyMs": 1265.4548749999958 }, { "questionId": "q113", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "4696", "actual": "4696", "correct": true, - "inputTokens": 1564, - "outputTokens": 3, - "latencyMs": 1020 + "inputTokens": 1563, + "outputTokens": 136, + "latencyMs": 3299.298792000016 }, { "questionId": "q113", @@ -12361,18 +12361,18 @@ "correct": true, "inputTokens": 1509, "outputTokens": 6, - "latencyMs": 1069 + "latencyMs": 1618.5091249999823 }, { "questionId": "q113", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "4696", "actual": "4696", "correct": true, - "inputTokens": 1442, - "outputTokens": 3, - "latencyMs": 968 + "inputTokens": 1441, + "outputTokens": 136, + "latencyMs": 5353.29241699999 }, { "questionId": "q113", @@ -12383,18 +12383,18 @@ "correct": true, "inputTokens": 1445, "outputTokens": 6, - "latencyMs": 1436 + "latencyMs": 870.5113749999728 }, { "questionId": "q113", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "4696", "actual": "4696", "correct": true, - "inputTokens": 3830, - "outputTokens": 3, - "latencyMs": 1171 + "inputTokens": 3829, + "outputTokens": 200, + "latencyMs": 2780.5659159999923 }, { "questionId": "q113", @@ -12405,18 +12405,18 @@ "correct": true, "inputTokens": 3415, "outputTokens": 6, - "latencyMs": 1273 + "latencyMs": 1069.2415409999958 }, { "questionId": "q113", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "4696", "actual": "4696", "correct": true, - "inputTokens": 2986, - "outputTokens": 3, - "latencyMs": 1788 + "inputTokens": 2985, + "outputTokens": 200, + "latencyMs": 3036.145666999975 }, { "questionId": "q113", @@ -12427,18 +12427,18 @@ "correct": true, "inputTokens": 3110, "outputTokens": 6, - "latencyMs": 1050 + "latencyMs": 1252.9633329999924 }, { "questionId": "q114", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "4211.6", "actual": "4211.6", "correct": true, - "inputTokens": 3712, - "outputTokens": 5, - "latencyMs": 1414 + "inputTokens": 3711, + "outputTokens": 138, + "latencyMs": 2617.047249999974 }, { "questionId": "q114", @@ -12449,18 +12449,18 @@ "correct": true, "inputTokens": 4079, "outputTokens": 8, - "latencyMs": 1192 + "latencyMs": 1261.9117079999996 }, { "questionId": "q114", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "4211.6", "actual": "4211.6", "correct": true, - "inputTokens": 1563, - "outputTokens": 5, - "latencyMs": 893 + "inputTokens": 1562, + "outputTokens": 202, + "latencyMs": 6192.06358300001 }, { "questionId": "q114", @@ -12471,18 +12471,18 @@ "correct": true, "inputTokens": 1508, "outputTokens": 8, - "latencyMs": 1065 + "latencyMs": 1158.3806249999907 }, { "questionId": "q114", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "4211.6", "actual": "4211.6", "correct": true, - "inputTokens": 1441, - "outputTokens": 5, - "latencyMs": 1155 + "inputTokens": 1440, + "outputTokens": 138, + "latencyMs": 2867.840083999996 }, { "questionId": "q114", @@ -12493,18 +12493,18 @@ "correct": true, "inputTokens": 1444, "outputTokens": 8, - "latencyMs": 1842 + "latencyMs": 856.2939580000238 }, { "questionId": "q114", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "4211.6", "actual": "4211.6", "correct": true, - "inputTokens": 3829, - "outputTokens": 5, - "latencyMs": 2740 + "inputTokens": 3828, + "outputTokens": 138, + "latencyMs": 2329.6339579999913 }, { "questionId": "q114", @@ -12515,18 +12515,18 @@ "correct": true, "inputTokens": 3414, "outputTokens": 8, - "latencyMs": 1295 + "latencyMs": 1106.5591669999994 }, { "questionId": "q114", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "4211.6", "actual": "4211.6", "correct": true, - "inputTokens": 2985, - "outputTokens": 5, - "latencyMs": 1053 + "inputTokens": 2984, + "outputTokens": 138, + "latencyMs": 2590.7533330000006 }, { "questionId": "q114", @@ -12537,18 +12537,18 @@ "correct": true, "inputTokens": 3109, "outputTokens": 8, - "latencyMs": 1118 + "latencyMs": 1007.0892920000188 }, { "questionId": "q115", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6196", "actual": "6196", "correct": true, - "inputTokens": 3713, - "outputTokens": 3, - "latencyMs": 1452 + "inputTokens": 3712, + "outputTokens": 200, + "latencyMs": 3839.2745000000286 }, { "questionId": "q115", @@ -12559,18 +12559,18 @@ "correct": true, "inputTokens": 4080, "outputTokens": 6, - "latencyMs": 1272 + "latencyMs": 1388.2399160000205 }, { "questionId": "q115", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6196", "actual": "6196", "correct": true, - "inputTokens": 1564, - "outputTokens": 3, - "latencyMs": 1039 + "inputTokens": 1563, + "outputTokens": 200, + "latencyMs": 3955.22095800002 }, { "questionId": "q115", @@ -12581,18 +12581,18 @@ "correct": true, "inputTokens": 1509, "outputTokens": 6, - "latencyMs": 1155 + "latencyMs": 1036.567458000005 }, { "questionId": "q115", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6196", "actual": "6196", "correct": true, - "inputTokens": 1442, - "outputTokens": 3, - "latencyMs": 796 + "inputTokens": 1441, + "outputTokens": 200, + "latencyMs": 5566.705209000007 }, { "questionId": "q115", @@ -12603,18 +12603,18 @@ "correct": true, "inputTokens": 1445, "outputTokens": 6, - "latencyMs": 1048 + "latencyMs": 1078.5011670000094 }, { "questionId": "q115", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6196", "actual": "6196", "correct": true, - "inputTokens": 3830, - "outputTokens": 3, - "latencyMs": 2282 + "inputTokens": 3829, + "outputTokens": 200, + "latencyMs": 2956.9618330000376 }, { "questionId": "q115", @@ -12625,18 +12625,18 @@ "correct": true, "inputTokens": 3415, "outputTokens": 6, - "latencyMs": 1592 + "latencyMs": 1797.4496250000084 }, { "questionId": "q115", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6196", "actual": "6196", "correct": true, - "inputTokens": 2986, - "outputTokens": 3, - "latencyMs": 2691 + "inputTokens": 2985, + "outputTokens": 136, + "latencyMs": 2647.741832999978 }, { "questionId": "q115", @@ -12647,18 +12647,18 @@ "correct": true, "inputTokens": 3110, "outputTokens": 6, - "latencyMs": 1126 + "latencyMs": 1221.9055410000146 }, { "questionId": "q116", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6105.3", "actual": "6105.3", "correct": true, - "inputTokens": 3712, - "outputTokens": 5, - "latencyMs": 1288 + "inputTokens": 3711, + "outputTokens": 138, + "latencyMs": 3783.334333000006 }, { "questionId": "q116", @@ -12669,18 +12669,18 @@ "correct": true, "inputTokens": 4079, "outputTokens": 8, - "latencyMs": 991 + "latencyMs": 1135.7771670000511 }, { "questionId": "q116", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6105.3", "actual": "6105.3", "correct": true, - "inputTokens": 1563, - "outputTokens": 5, - "latencyMs": 1257 + "inputTokens": 1562, + "outputTokens": 266, + "latencyMs": 3364.4232920000213 }, { "questionId": "q116", @@ -12691,18 +12691,18 @@ "correct": true, "inputTokens": 1508, "outputTokens": 8, - "latencyMs": 1004 + "latencyMs": 1161.263666999992 }, { "questionId": "q116", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6105.3", "actual": "6105.3", "correct": true, - "inputTokens": 1441, - "outputTokens": 5, - "latencyMs": 1620 + "inputTokens": 1440, + "outputTokens": 74, + "latencyMs": 3646.0659589999705 }, { "questionId": "q116", @@ -12713,18 +12713,18 @@ "correct": true, "inputTokens": 1444, "outputTokens": 8, - "latencyMs": 991 + "latencyMs": 955.7597500000265 }, { "questionId": "q116", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6105.3", "actual": "6105.3", "correct": true, - "inputTokens": 3829, - "outputTokens": 5, - "latencyMs": 1048 + "inputTokens": 3828, + "outputTokens": 74, + "latencyMs": 2345.2203750000335 }, { "questionId": "q116", @@ -12735,18 +12735,18 @@ "correct": true, "inputTokens": 3414, "outputTokens": 8, - "latencyMs": 1189 + "latencyMs": 1541.918249999988 }, { "questionId": "q116", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6105.3", "actual": "6105.3", "correct": true, - "inputTokens": 2985, - "outputTokens": 5, - "latencyMs": 3282 + "inputTokens": 2984, + "outputTokens": 138, + "latencyMs": 6126.976708000002 }, { "questionId": "q116", @@ -12757,18 +12757,18 @@ "correct": true, "inputTokens": 3109, "outputTokens": 8, - "latencyMs": 985 + "latencyMs": 1097.440709000046 }, { "questionId": "q117", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6528", "actual": "6528", "correct": true, - "inputTokens": 3713, - "outputTokens": 3, - "latencyMs": 871 + "inputTokens": 3712, + "outputTokens": 264, + "latencyMs": 3404.643708999967 }, { "questionId": "q117", @@ -12779,18 +12779,18 @@ "correct": true, "inputTokens": 4080, "outputTokens": 6, - "latencyMs": 1042 + "latencyMs": 1227.7047499999753 }, { "questionId": "q117", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6528", "actual": "6528", "correct": true, - "inputTokens": 1564, - "outputTokens": 3, - "latencyMs": 999 + "inputTokens": 1563, + "outputTokens": 136, + "latencyMs": 2495.85037499998 }, { "questionId": "q117", @@ -12801,18 +12801,18 @@ "correct": true, "inputTokens": 1509, "outputTokens": 6, - "latencyMs": 1111 + "latencyMs": 1048.344832999981 }, { "questionId": "q117", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6528", "actual": "6528", "correct": true, - "inputTokens": 1442, - "outputTokens": 3, - "latencyMs": 1132 + "inputTokens": 1441, + "outputTokens": 136, + "latencyMs": 3007.2462499999674 }, { "questionId": "q117", @@ -12823,18 +12823,18 @@ "correct": true, "inputTokens": 1445, "outputTokens": 6, - "latencyMs": 1004 + "latencyMs": 840.0351669999654 }, { "questionId": "q117", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6528", "actual": "6528", "correct": true, - "inputTokens": 3830, - "outputTokens": 3, - "latencyMs": 1162 + "inputTokens": 3829, + "outputTokens": 328, + "latencyMs": 3149.872374999977 }, { "questionId": "q117", @@ -12845,18 +12845,18 @@ "correct": true, "inputTokens": 3415, "outputTokens": 6, - "latencyMs": 1271 + "latencyMs": 973.716167000006 }, { "questionId": "q117", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6528", "actual": "6528", "correct": true, - "inputTokens": 2986, - "outputTokens": 3, - "latencyMs": 961 + "inputTokens": 2985, + "outputTokens": 456, + "latencyMs": 5305.827791999967 }, { "questionId": "q117", @@ -12867,18 +12867,18 @@ "correct": true, "inputTokens": 3110, "outputTokens": 6, - "latencyMs": 1289 + "latencyMs": 953.3122500000172 }, { "questionId": "q118", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1136.09", "actual": "1136.09", "correct": true, - "inputTokens": 3712, - "outputTokens": 5, - "latencyMs": 1634 + "inputTokens": 3711, + "outputTokens": 138, + "latencyMs": 3435.850167000026 }, { "questionId": "q118", @@ -12889,18 +12889,18 @@ "correct": true, "inputTokens": 4079, "outputTokens": 8, - "latencyMs": 1198 + "latencyMs": 1110.8856249999953 }, { "questionId": "q118", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1136.09", "actual": "1136.09", "correct": true, - "inputTokens": 1563, - "outputTokens": 5, - "latencyMs": 2678 + "inputTokens": 1562, + "outputTokens": 266, + "latencyMs": 3303.3427500000107 }, { "questionId": "q118", @@ -12911,18 +12911,18 @@ "correct": true, "inputTokens": 1508, "outputTokens": 8, - "latencyMs": 1155 + "latencyMs": 954.5857910000486 }, { "questionId": "q118", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1136.09", "actual": "1136.09", "correct": true, - "inputTokens": 1441, - "outputTokens": 5, - "latencyMs": 1104 + "inputTokens": 1440, + "outputTokens": 138, + "latencyMs": 5035.666582999984 }, { "questionId": "q118", @@ -12933,18 +12933,18 @@ "correct": true, "inputTokens": 1444, "outputTokens": 8, - "latencyMs": 1109 + "latencyMs": 867.9529159999802 }, { "questionId": "q118", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1136.09", "actual": "1136.09", "correct": true, - "inputTokens": 3829, - "outputTokens": 5, - "latencyMs": 3756 + "inputTokens": 3828, + "outputTokens": 202, + "latencyMs": 2817.1118750000023 }, { "questionId": "q118", @@ -12955,18 +12955,18 @@ "correct": true, "inputTokens": 3414, "outputTokens": 8, - "latencyMs": 1082 + "latencyMs": 1029.4406660000095 }, { "questionId": "q118", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1136.09", "actual": "1136.09", "correct": true, - "inputTokens": 2985, - "outputTokens": 5, - "latencyMs": 1451 + "inputTokens": 2984, + "outputTokens": 138, + "latencyMs": 2521.28145900002 }, { "questionId": "q118", @@ -12977,18 +12977,18 @@ "correct": true, "inputTokens": 3109, "outputTokens": 8, - "latencyMs": 1730 + "latencyMs": 1266.9695000000065 }, { "questionId": "q119", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "4689", "actual": "4689", "correct": true, - "inputTokens": 3713, - "outputTokens": 3, - "latencyMs": 1327 + "inputTokens": 3712, + "outputTokens": 72, + "latencyMs": 2383.6225830000476 }, { "questionId": "q119", @@ -12999,18 +12999,18 @@ "correct": true, "inputTokens": 4080, "outputTokens": 6, - "latencyMs": 1282 + "latencyMs": 1100.3007499999949 }, { "questionId": "q119", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "4689", "actual": "4689", "correct": true, - "inputTokens": 1564, - "outputTokens": 3, - "latencyMs": 1368 + "inputTokens": 1563, + "outputTokens": 200, + "latencyMs": 2816.252374999982 }, { "questionId": "q119", @@ -13021,18 +13021,18 @@ "correct": true, "inputTokens": 1509, "outputTokens": 6, - "latencyMs": 1487 + "latencyMs": 1030.0248330000322 }, { "questionId": "q119", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "4689", "actual": "4689", "correct": true, - "inputTokens": 1442, - "outputTokens": 3, - "latencyMs": 2752 + "inputTokens": 1441, + "outputTokens": 72, + "latencyMs": 1819.5161669999943 }, { "questionId": "q119", @@ -13043,18 +13043,18 @@ "correct": true, "inputTokens": 1445, "outputTokens": 6, - "latencyMs": 909 + "latencyMs": 1012.0581670000101 }, { "questionId": "q119", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "4689", "actual": "4689", "correct": true, - "inputTokens": 3830, - "outputTokens": 3, - "latencyMs": 3502 + "inputTokens": 3829, + "outputTokens": 136, + "latencyMs": 2960.8910000000033 }, { "questionId": "q119", @@ -13065,18 +13065,18 @@ "correct": true, "inputTokens": 3415, "outputTokens": 6, - "latencyMs": 1212 + "latencyMs": 1346.7110000000102 }, { "questionId": "q119", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "4689", "actual": "4689", "correct": true, - "inputTokens": 2986, - "outputTokens": 3, - "latencyMs": 1218 + "inputTokens": 2985, + "outputTokens": 136, + "latencyMs": 3081.40625 }, { "questionId": "q119", @@ -13087,18 +13087,18 @@ "correct": true, "inputTokens": 3110, "outputTokens": 6, - "latencyMs": 1064 + "latencyMs": 1485.0133330000099 }, { "questionId": "q120", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "2637.73", "actual": "2637.73", "correct": true, - "inputTokens": 3712, - "outputTokens": 5, - "latencyMs": 2777 + "inputTokens": 3711, + "outputTokens": 138, + "latencyMs": 3632.860875000013 }, { "questionId": "q120", @@ -13109,18 +13109,18 @@ "correct": true, "inputTokens": 4079, "outputTokens": 8, - "latencyMs": 1246 + "latencyMs": 1224.803750000021 }, { "questionId": "q120", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "2637.73", "actual": "2637.73", "correct": true, - "inputTokens": 1563, - "outputTokens": 5, - "latencyMs": 1424 + "inputTokens": 1562, + "outputTokens": 138, + "latencyMs": 2323.675958000007 }, { "questionId": "q120", @@ -13131,18 +13131,18 @@ "correct": true, "inputTokens": 1508, "outputTokens": 8, - "latencyMs": 1074 + "latencyMs": 1114.0831669999752 }, { "questionId": "q120", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "2637.73", "actual": "2637.73", "correct": true, - "inputTokens": 1441, - "outputTokens": 5, - "latencyMs": 2803 + "inputTokens": 1440, + "outputTokens": 202, + "latencyMs": 3465.111333000008 }, { "questionId": "q120", @@ -13153,18 +13153,18 @@ "correct": true, "inputTokens": 1444, "outputTokens": 8, - "latencyMs": 1107 + "latencyMs": 1082.4990419999813 }, { "questionId": "q120", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "2637.73", "actual": "2637.73", "correct": true, - "inputTokens": 3829, - "outputTokens": 5, - "latencyMs": 1066 + "inputTokens": 3828, + "outputTokens": 138, + "latencyMs": 5648.285415999999 }, { "questionId": "q120", @@ -13175,18 +13175,18 @@ "correct": true, "inputTokens": 3414, "outputTokens": 8, - "latencyMs": 1325 + "latencyMs": 1087.8757500000065 }, { "questionId": "q120", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "2637.73", "actual": "2637.73", "correct": true, - "inputTokens": 2985, - "outputTokens": 5, - "latencyMs": 1330 + "inputTokens": 2984, + "outputTokens": 138, + "latencyMs": 4587.399166000017 }, { "questionId": "q120", @@ -13197,18 +13197,18 @@ "correct": true, "inputTokens": 3109, "outputTokens": 8, - "latencyMs": 1192 + "latencyMs": 1007.4333340000012 }, { "questionId": "q121", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "5685", "actual": "5685", "correct": true, - "inputTokens": 3713, - "outputTokens": 3, - "latencyMs": 1139 + "inputTokens": 3712, + "outputTokens": 72, + "latencyMs": 2307.9398339999607 }, { "questionId": "q121", @@ -13219,18 +13219,18 @@ "correct": true, "inputTokens": 4080, "outputTokens": 6, - "latencyMs": 994 + "latencyMs": 2368.3719580000034 }, { "questionId": "q121", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "5685", "actual": "5685", "correct": true, - "inputTokens": 1564, - "outputTokens": 3, - "latencyMs": 1309 + "inputTokens": 1563, + "outputTokens": 200, + "latencyMs": 3587.720166999963 }, { "questionId": "q121", @@ -13241,18 +13241,18 @@ "correct": true, "inputTokens": 1509, "outputTokens": 6, - "latencyMs": 1184 + "latencyMs": 1053.9867080000113 }, { "questionId": "q121", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "5685", "actual": "5685", "correct": true, - "inputTokens": 1442, - "outputTokens": 3, - "latencyMs": 1182 + "inputTokens": 1441, + "outputTokens": 136, + "latencyMs": 1593.4699169999803 }, { "questionId": "q121", @@ -13263,18 +13263,18 @@ "correct": true, "inputTokens": 1445, "outputTokens": 6, - "latencyMs": 1381 + "latencyMs": 2256.4729170000064 }, { "questionId": "q121", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "5685", "actual": "5685", "correct": true, - "inputTokens": 3830, - "outputTokens": 3, - "latencyMs": 1103 + "inputTokens": 3829, + "outputTokens": 200, + "latencyMs": 4466.158916999993 }, { "questionId": "q121", @@ -13285,18 +13285,18 @@ "correct": true, "inputTokens": 3415, "outputTokens": 6, - "latencyMs": 1220 + "latencyMs": 1305.1236670000362 }, { "questionId": "q121", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "5685", "actual": "5685", "correct": true, - "inputTokens": 2986, - "outputTokens": 3, - "latencyMs": 1169 + "inputTokens": 2985, + "outputTokens": 136, + "latencyMs": 3014.9748339999933 }, { "questionId": "q121", @@ -13307,18 +13307,18 @@ "correct": true, "inputTokens": 3110, "outputTokens": 6, - "latencyMs": 1208 + "latencyMs": 1421.9597920000087 }, { "questionId": "q122", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "3421.06", "actual": "3421.06", "correct": true, - "inputTokens": 3712, - "outputTokens": 5, - "latencyMs": 1037 + "inputTokens": 3711, + "outputTokens": 202, + "latencyMs": 19503.25695900002 }, { "questionId": "q122", @@ -13329,18 +13329,18 @@ "correct": true, "inputTokens": 4079, "outputTokens": 8, - "latencyMs": 1278 + "latencyMs": 1164.002959000005 }, { "questionId": "q122", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "3421.06", "actual": "3421.06", "correct": true, - "inputTokens": 1563, - "outputTokens": 5, - "latencyMs": 1441 + "inputTokens": 1562, + "outputTokens": 330, + "latencyMs": 4662.637042000017 }, { "questionId": "q122", @@ -13351,18 +13351,18 @@ "correct": true, "inputTokens": 1508, "outputTokens": 8, - "latencyMs": 1204 + "latencyMs": 1086.9569170000032 }, { "questionId": "q122", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "3421.06", "actual": "3421.06", "correct": true, - "inputTokens": 1441, - "outputTokens": 5, - "latencyMs": 1782 + "inputTokens": 1440, + "outputTokens": 202, + "latencyMs": 2683.73904200003 }, { "questionId": "q122", @@ -13373,18 +13373,18 @@ "correct": true, "inputTokens": 1444, "outputTokens": 8, - "latencyMs": 1088 + "latencyMs": 2289.0300419999985 }, { "questionId": "q122", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "3421.06", "actual": "3421.06", "correct": true, - "inputTokens": 3829, - "outputTokens": 5, - "latencyMs": 1447 + "inputTokens": 3828, + "outputTokens": 74, + "latencyMs": 1877.1760409999988 }, { "questionId": "q122", @@ -13395,18 +13395,18 @@ "correct": true, "inputTokens": 3414, "outputTokens": 8, - "latencyMs": 1356 + "latencyMs": 1460.1729160000104 }, { "questionId": "q122", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "3421.06", "actual": "3421.06", "correct": true, - "inputTokens": 2985, - "outputTokens": 5, - "latencyMs": 1309 + "inputTokens": 2984, + "outputTokens": 138, + "latencyMs": 2582.983708999993 }, { "questionId": "q122", @@ -13417,18 +13417,18 @@ "correct": true, "inputTokens": 3109, "outputTokens": 8, - "latencyMs": 995 + "latencyMs": 1014.1320839999826 }, { "questionId": "q123", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "344498", - "actual": "188,000", - "correct": false, - "inputTokens": 3710, - "outputTokens": 4, - "latencyMs": 1405 + "actual": "344498", + "correct": true, + "inputTokens": 3709, + "outputTokens": 2376, + "latencyMs": 26290.846458000015 }, { "questionId": "q123", @@ -13439,18 +13439,18 @@ "correct": false, "inputTokens": 4077, "outputTokens": 7, - "latencyMs": 1110 + "latencyMs": 1288.6627500000177 }, { "questionId": "q123", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "344498", - "actual": "186,000", - "correct": false, - "inputTokens": 1561, - "outputTokens": 4, - "latencyMs": 1306 + "actual": "344498", + "correct": true, + "inputTokens": 1560, + "outputTokens": 1736, + "latencyMs": 13565.930124999955 }, { "questionId": "q123", @@ -13461,18 +13461,18 @@ "correct": false, "inputTokens": 1506, "outputTokens": 7, - "latencyMs": 1292 + "latencyMs": 1190.8501249999972 }, { "questionId": "q123", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "344498", - "actual": "188,000", - "correct": false, - "inputTokens": 1439, - "outputTokens": 4, - "latencyMs": 2659 + "actual": "344498", + "correct": true, + "inputTokens": 1438, + "outputTokens": 2888, + "latencyMs": 21377.612083000015 }, { "questionId": "q123", @@ -13483,18 +13483,18 @@ "correct": false, "inputTokens": 1442, "outputTokens": 7, - "latencyMs": 966 + "latencyMs": 931.349749999994 }, { "questionId": "q123", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "344498", - "actual": "174,000", - "correct": false, - "inputTokens": 3827, - "outputTokens": 4, - "latencyMs": 1177 + "actual": "344498", + "correct": true, + "inputTokens": 3826, + "outputTokens": 3208, + "latencyMs": 18997.804958999972 }, { "questionId": "q123", @@ -13505,18 +13505,18 @@ "correct": false, "inputTokens": 3412, "outputTokens": 7, - "latencyMs": 1018 + "latencyMs": 1185.3518330000225 }, { "questionId": "q123", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "344498", - "actual": "188,000", - "correct": false, - "inputTokens": 2983, - "outputTokens": 4, - "latencyMs": 1659 + "actual": "344498", + "correct": true, + "inputTokens": 2982, + "outputTokens": 2184, + "latencyMs": 23924.366792000015 }, { "questionId": "q123", @@ -13527,18 +13527,18 @@ "correct": false, "inputTokens": 3107, "outputTokens": 7, - "latencyMs": 1894 + "latencyMs": 2958.913666999957 }, { "questionId": "q124", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "312818.50", - "actual": "188,174.36", - "correct": false, - "inputTokens": 3708, - "outputTokens": 6, - "latencyMs": 2900 + "actual": "312818.50", + "correct": true, + "inputTokens": 3707, + "outputTokens": 4170, + "latencyMs": 29361.525874999992 }, { "questionId": "q124", @@ -13549,18 +13549,18 @@ "correct": false, "inputTokens": 4075, "outputTokens": 9, - "latencyMs": 1196 + "latencyMs": 1325.5311249999795 }, { "questionId": "q124", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "312818.50", - "actual": "Total revenue across all dates is 139,155.36.", - "correct": false, - "inputTokens": 1559, - "outputTokens": 14, - "latencyMs": 1401 + "actual": "312818.50", + "correct": true, + "inputTokens": 1558, + "outputTokens": 4106, + "latencyMs": 37997.09958400001 }, { "questionId": "q124", @@ -13571,18 +13571,18 @@ "correct": false, "inputTokens": 1504, "outputTokens": 9, - "latencyMs": 1118 + "latencyMs": 1184.0957090000156 }, { "questionId": "q124", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "312818.50", - "actual": "Total revenue across all dates is 155,000.00.", - "correct": false, - "inputTokens": 1437, - "outputTokens": 14, - "latencyMs": 1308 + "actual": "312818.50", + "correct": true, + "inputTokens": 1436, + "outputTokens": 3658, + "latencyMs": 26945.63508400001 }, { "questionId": "q124", @@ -13593,18 +13593,18 @@ "correct": false, "inputTokens": 1440, "outputTokens": 9, - "latencyMs": 1120 + "latencyMs": 1162.16949999996 }, { "questionId": "q124", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "312818.50", - "actual": "Total revenue across all dates is 155,155.36.", - "correct": false, - "inputTokens": 3825, - "outputTokens": 14, - "latencyMs": 1143 + "actual": "312818.50", + "correct": true, + "inputTokens": 3824, + "outputTokens": 3722, + "latencyMs": 27321.698167000024 }, { "questionId": "q124", @@ -13615,18 +13615,18 @@ "correct": false, "inputTokens": 3410, "outputTokens": 9, - "latencyMs": 1172 + "latencyMs": 2065.7583339999546 }, { "questionId": "q124", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "312818.50", - "actual": "Total revenue across all dates is 155,155.36.", - "correct": false, - "inputTokens": 2981, - "outputTokens": 14, - "latencyMs": 1179 + "actual": "312818.50", + "correct": true, + "inputTokens": 2980, + "outputTokens": 3658, + "latencyMs": 28778.99891600001 }, { "questionId": "q124", @@ -13637,18 +13637,18 @@ "correct": false, "inputTokens": 3105, "outputTokens": 9, - "latencyMs": 1073 + "latencyMs": 1233.4267090000212 }, { "questionId": "q125", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1811", - "actual": "1030", - "correct": false, - "inputTokens": 3710, - "outputTokens": 3, - "latencyMs": 3823 + "actual": "1811", + "correct": true, + "inputTokens": 3709, + "outputTokens": 2568, + "latencyMs": 28626.692666999996 }, { "questionId": "q125", @@ -13659,18 +13659,18 @@ "correct": false, "inputTokens": 4078, "outputTokens": 7, - "latencyMs": 1153 + "latencyMs": 1133.735584000009 }, { "questionId": "q125", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1811", - "actual": "1040", - "correct": false, - "inputTokens": 1561, - "outputTokens": 3, - "latencyMs": 1472 + "actual": "1811", + "correct": true, + "inputTokens": 1560, + "outputTokens": 1672, + "latencyMs": 14898.688125000044 }, { "questionId": "q125", @@ -13681,18 +13681,18 @@ "correct": false, "inputTokens": 1507, "outputTokens": 7, - "latencyMs": 940 + "latencyMs": 1178.2744999999995 }, { "questionId": "q125", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1811", - "actual": "1030", - "correct": false, - "inputTokens": 1439, - "outputTokens": 3, - "latencyMs": 1067 + "actual": "1811", + "correct": true, + "inputTokens": 1438, + "outputTokens": 1864, + "latencyMs": 15225.964540999965 }, { "questionId": "q125", @@ -13703,18 +13703,18 @@ "correct": false, "inputTokens": 1443, "outputTokens": 7, - "latencyMs": 1183 + "latencyMs": 1077.2695419999654 }, { "questionId": "q125", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1811", - "actual": "Total conversions: 1030", - "correct": false, - "inputTokens": 3827, - "outputTokens": 7, - "latencyMs": 1103 + "actual": "1811", + "correct": true, + "inputTokens": 3826, + "outputTokens": 1928, + "latencyMs": 14057.434583000024 }, { "questionId": "q125", @@ -13725,18 +13725,18 @@ "correct": false, "inputTokens": 3413, "outputTokens": 7, - "latencyMs": 1067 + "latencyMs": 1177.537500000035 }, { "questionId": "q125", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "1811", - "actual": "1040", - "correct": false, - "inputTokens": 2983, - "outputTokens": 3, - "latencyMs": 932 + "actual": "1811", + "correct": true, + "inputTokens": 2982, + "outputTokens": 2312, + "latencyMs": 19125.74099999998 }, { "questionId": "q125", @@ -13747,18 +13747,18 @@ "correct": false, "inputTokens": 3108, "outputTokens": 7, - "latencyMs": 1530 + "latencyMs": 1047.243833000015 }, { "questionId": "q126", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "42", "actual": "42", "correct": true, - "inputTokens": 3710, - "outputTokens": 2, - "latencyMs": 1016 + "inputTokens": 3709, + "outputTokens": 1735, + "latencyMs": 14875.021707999986 }, { "questionId": "q126", @@ -13769,18 +13769,18 @@ "correct": true, "inputTokens": 4078, "outputTokens": 5, - "latencyMs": 1440 + "latencyMs": 1076.5694999999832 }, { "questionId": "q126", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "42", - "actual": "24", - "correct": false, - "inputTokens": 1561, - "outputTokens": 2, - "latencyMs": 1206 + "actual": "42", + "correct": true, + "inputTokens": 1560, + "outputTokens": 2823, + "latencyMs": 22604.422416999994 }, { "questionId": "q126", @@ -13791,18 +13791,18 @@ "correct": true, "inputTokens": 1507, "outputTokens": 5, - "latencyMs": 1452 + "latencyMs": 1451.705666999973 }, { "questionId": "q126", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "42", - "actual": "22", - "correct": false, - "inputTokens": 1439, - "outputTokens": 2, - "latencyMs": 1249 + "actual": "42", + "correct": true, + "inputTokens": 1438, + "outputTokens": 2183, + "latencyMs": 16916.007042000012 }, { "questionId": "q126", @@ -13813,18 +13813,18 @@ "correct": true, "inputTokens": 1443, "outputTokens": 5, - "latencyMs": 1248 + "latencyMs": 1103.1098750000237 }, { "questionId": "q126", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "42", - "actual": "20", - "correct": false, - "inputTokens": 3827, - "outputTokens": 2, - "latencyMs": 1420 + "actual": "42", + "correct": true, + "inputTokens": 3826, + "outputTokens": 2055, + "latencyMs": 17162.629124999978 }, { "questionId": "q126", @@ -13835,18 +13835,18 @@ "correct": false, "inputTokens": 3413, "outputTokens": 5, - "latencyMs": 900 + "latencyMs": 1150.0435000000289 }, { "questionId": "q126", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "42", "actual": "42", "correct": true, - "inputTokens": 2983, - "outputTokens": 2, - "latencyMs": 1309 + "inputTokens": 2982, + "outputTokens": 1607, + "latencyMs": 14835.323333000008 }, { "questionId": "q126", @@ -13857,18 +13857,18 @@ "correct": false, "inputTokens": 3108, "outputTokens": 5, - "latencyMs": 1216 + "latencyMs": 1206.8219590000226 }, { "questionId": "q127", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "28", - "actual": "38", - "correct": false, - "inputTokens": 3710, - "outputTokens": 2, - "latencyMs": 3911 + "actual": "28", + "correct": true, + "inputTokens": 3709, + "outputTokens": 1479, + "latencyMs": 11560.967958000023 }, { "questionId": "q127", @@ -13879,18 +13879,18 @@ "correct": false, "inputTokens": 4078, "outputTokens": 5, - "latencyMs": 1056 + "latencyMs": 1151.9984169999952 }, { "questionId": "q127", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "28", - "actual": "20", - "correct": false, - "inputTokens": 1561, - "outputTokens": 2, - "latencyMs": 839 + "actual": "28", + "correct": true, + "inputTokens": 1560, + "outputTokens": 1927, + "latencyMs": 15431.08262499998 }, { "questionId": "q127", @@ -13901,18 +13901,18 @@ "correct": false, "inputTokens": 1507, "outputTokens": 5, - "latencyMs": 965 + "latencyMs": 1032.7485419999575 }, { "questionId": "q127", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "28", - "actual": "20", - "correct": false, - "inputTokens": 1439, - "outputTokens": 2, - "latencyMs": 2163 + "actual": "28", + "correct": true, + "inputTokens": 1438, + "outputTokens": 1607, + "latencyMs": 9425.883957999991 }, { "questionId": "q127", @@ -13923,18 +13923,18 @@ "correct": false, "inputTokens": 1443, "outputTokens": 5, - "latencyMs": 1006 + "latencyMs": 943.5942919999943 }, { "questionId": "q127", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "28", - "actual": "18", - "correct": false, - "inputTokens": 3827, - "outputTokens": 2, - "latencyMs": 2619 + "actual": "28", + "correct": true, + "inputTokens": 3826, + "outputTokens": 1927, + "latencyMs": 16529.66529199999 }, { "questionId": "q127", @@ -13945,18 +13945,18 @@ "correct": false, "inputTokens": 3413, "outputTokens": 5, - "latencyMs": 989 + "latencyMs": 1107.5635419999599 }, { "questionId": "q127", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "28", - "actual": "22", - "correct": false, - "inputTokens": 2983, - "outputTokens": 2, - "latencyMs": 1830 + "actual": "28", + "correct": true, + "inputTokens": 2982, + "outputTokens": 1863, + "latencyMs": 21071.067082999973 }, { "questionId": "q127", @@ -13967,18 +13967,18 @@ "correct": false, "inputTokens": 3108, "outputTokens": 5, - "latencyMs": 1001 + "latencyMs": 1018.46212500002 }, { "questionId": "q128", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "11", - "actual": "15", - "correct": false, - "inputTokens": 3710, - "outputTokens": 2, - "latencyMs": 1217 + "actual": "11", + "correct": true, + "inputTokens": 3709, + "outputTokens": 1223, + "latencyMs": 8242.37608300004 }, { "questionId": "q128", @@ -13989,18 +13989,18 @@ "correct": true, "inputTokens": 4078, "outputTokens": 5, - "latencyMs": 3180 + "latencyMs": 1052.7201249999925 }, { "questionId": "q128", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "11", - "actual": "15", - "correct": false, - "inputTokens": 1561, - "outputTokens": 2, - "latencyMs": 1076 + "actual": "11", + "correct": true, + "inputTokens": 1560, + "outputTokens": 903, + "latencyMs": 5430.806291999994 }, { "questionId": "q128", @@ -14011,18 +14011,18 @@ "correct": false, "inputTokens": 1507, "outputTokens": 5, - "latencyMs": 912 + "latencyMs": 2354.328999999969 }, { "questionId": "q128", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "11", - "actual": "15", - "correct": false, - "inputTokens": 1439, - "outputTokens": 2, - "latencyMs": 2900 + "actual": "11", + "correct": true, + "inputTokens": 1438, + "outputTokens": 1607, + "latencyMs": 21944.211458000005 }, { "questionId": "q128", @@ -14033,18 +14033,18 @@ "correct": true, "inputTokens": 1443, "outputTokens": 5, - "latencyMs": 1389 + "latencyMs": 1249.9959590000217 }, { "questionId": "q128", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "11", - "actual": "12", - "correct": false, - "inputTokens": 3827, - "outputTokens": 2, - "latencyMs": 1107 + "actual": "11", + "correct": true, + "inputTokens": 3826, + "outputTokens": 1415, + "latencyMs": 15465.409875000012 }, { "questionId": "q128", @@ -14055,18 +14055,18 @@ "correct": true, "inputTokens": 3413, "outputTokens": 5, - "latencyMs": 1150 + "latencyMs": 1131.9575830000103 }, { "questionId": "q128", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "11", - "actual": "18", - "correct": false, - "inputTokens": 2983, - "outputTokens": 2, - "latencyMs": 1047 + "actual": "11", + "correct": true, + "inputTokens": 2982, + "outputTokens": 2503, + "latencyMs": 24744.971958999988 }, { "questionId": "q128", @@ -14077,18 +14077,18 @@ "correct": true, "inputTokens": 3108, "outputTokens": 5, - "latencyMs": 1169 + "latencyMs": 1274.6952499999898 }, { "questionId": "q129", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "58", - "actual": "36", - "correct": false, - "inputTokens": 3709, - "outputTokens": 2, - "latencyMs": 1007 + "actual": "58", + "correct": true, + "inputTokens": 3708, + "outputTokens": 1351, + "latencyMs": 12546.867542000022 }, { "questionId": "q129", @@ -14099,18 +14099,18 @@ "correct": false, "inputTokens": 4078, "outputTokens": 5, - "latencyMs": 1342 + "latencyMs": 1231.453749999986 }, { "questionId": "q129", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "58", - "actual": "24", - "correct": false, - "inputTokens": 1560, - "outputTokens": 2, - "latencyMs": 828 + "actual": "58", + "correct": true, + "inputTokens": 1559, + "outputTokens": 1543, + "latencyMs": 16593.402166999993 }, { "questionId": "q129", @@ -14121,18 +14121,18 @@ "correct": false, "inputTokens": 1507, "outputTokens": 5, - "latencyMs": 1305 + "latencyMs": 1079.0991659999709 }, { "questionId": "q129", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "58", - "actual": "15", - "correct": false, - "inputTokens": 1438, - "outputTokens": 2, - "latencyMs": 1305 + "actual": "58", + "correct": true, + "inputTokens": 1437, + "outputTokens": 1543, + "latencyMs": 10956.456084000005 }, { "questionId": "q129", @@ -14143,18 +14143,18 @@ "correct": false, "inputTokens": 1443, "outputTokens": 5, - "latencyMs": 1406 + "latencyMs": 2018.3774170000106 }, { "questionId": "q129", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "58", - "actual": "18", - "correct": false, - "inputTokens": 3826, - "outputTokens": 2, - "latencyMs": 1513 + "actual": "58", + "correct": true, + "inputTokens": 3825, + "outputTokens": 1351, + "latencyMs": 10537.598500000022 }, { "questionId": "q129", @@ -14165,18 +14165,18 @@ "correct": false, "inputTokens": 3413, "outputTokens": 5, - "latencyMs": 1026 + "latencyMs": 1039.2452080000076 }, { "questionId": "q129", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "58", - "actual": "42", - "correct": false, - "inputTokens": 2982, - "outputTokens": 2, - "latencyMs": 1373 + "actual": "58", + "correct": true, + "inputTokens": 2981, + "outputTokens": 839, + "latencyMs": 8039.237708000001 }, { "questionId": "q129", @@ -14187,18 +14187,18 @@ "correct": false, "inputTokens": 3108, "outputTokens": 5, - "latencyMs": 1112 + "latencyMs": 1264.6740829999908 }, { "questionId": "q130", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "41", - "actual": "34", - "correct": false, - "inputTokens": 3709, - "outputTokens": 2, - "latencyMs": 1248 + "actual": "41", + "correct": true, + "inputTokens": 3708, + "outputTokens": 1863, + "latencyMs": 14310.697374999989 }, { "questionId": "q130", @@ -14209,18 +14209,18 @@ "correct": false, "inputTokens": 4078, "outputTokens": 5, - "latencyMs": 1083 + "latencyMs": 1138.4443339999998 }, { "questionId": "q130", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "41", - "actual": "24", - "correct": false, - "inputTokens": 1560, - "outputTokens": 2, - "latencyMs": 895 + "actual": "41", + "correct": true, + "inputTokens": 1559, + "outputTokens": 1927, + "latencyMs": 16487.508375000034 }, { "questionId": "q130", @@ -14231,18 +14231,18 @@ "correct": false, "inputTokens": 1507, "outputTokens": 5, - "latencyMs": 1087 + "latencyMs": 1104.2365410000202 }, { "questionId": "q130", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "41", - "actual": "18", - "correct": false, - "inputTokens": 1438, - "outputTokens": 2, - "latencyMs": 1157 + "actual": "41", + "correct": true, + "inputTokens": 1437, + "outputTokens": 3015, + "latencyMs": 23688.737208999984 }, { "questionId": "q130", @@ -14253,18 +14253,18 @@ "correct": false, "inputTokens": 1443, "outputTokens": 5, - "latencyMs": 1155 + "latencyMs": 1026.8166249999776 }, { "questionId": "q130", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "41", - "actual": "18", - "correct": false, - "inputTokens": 3826, - "outputTokens": 2, - "latencyMs": 1959 + "actual": "41", + "correct": true, + "inputTokens": 3825, + "outputTokens": 1671, + "latencyMs": 12415.87070899998 }, { "questionId": "q130", @@ -14275,18 +14275,18 @@ "correct": false, "inputTokens": 3413, "outputTokens": 5, - "latencyMs": 1110 + "latencyMs": 1062.2278749999823 }, { "questionId": "q130", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "41", - "actual": "34", - "correct": false, - "inputTokens": 2982, - "outputTokens": 2, - "latencyMs": 4540 + "actual": "41", + "correct": true, + "inputTokens": 2981, + "outputTokens": 1799, + "latencyMs": 15901.829415999993 }, { "questionId": "q130", @@ -14297,18 +14297,18 @@ "correct": false, "inputTokens": 3108, "outputTokens": 5, - "latencyMs": 1286 + "latencyMs": 1051.6962910000002 }, { "questionId": "q131", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "23", - "actual": "18", - "correct": false, - "inputTokens": 3709, - "outputTokens": 2, - "latencyMs": 1059 + "actual": "23", + "correct": true, + "inputTokens": 3708, + "outputTokens": 1863, + "latencyMs": 15216.926500000001 }, { "questionId": "q131", @@ -14319,18 +14319,18 @@ "correct": false, "inputTokens": 4078, "outputTokens": 5, - "latencyMs": 1302 + "latencyMs": 1460.9212079999852 }, { "questionId": "q131", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "23", - "actual": "18", - "correct": false, - "inputTokens": 1560, - "outputTokens": 2, - "latencyMs": 1019 + "actual": "23", + "correct": true, + "inputTokens": 1559, + "outputTokens": 2567, + "latencyMs": 27103.083999999973 }, { "questionId": "q131", @@ -14341,18 +14341,18 @@ "correct": false, "inputTokens": 1507, "outputTokens": 5, - "latencyMs": 975 + "latencyMs": 1101.5416669999831 }, { "questionId": "q131", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "23", - "actual": "18", - "correct": false, - "inputTokens": 1438, - "outputTokens": 2, - "latencyMs": 1056 + "actual": "23", + "correct": true, + "inputTokens": 1437, + "outputTokens": 1543, + "latencyMs": 14598.558207999973 }, { "questionId": "q131", @@ -14363,18 +14363,18 @@ "correct": false, "inputTokens": 1443, "outputTokens": 5, - "latencyMs": 984 + "latencyMs": 1270.7722910000011 }, { "questionId": "q131", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "23", - "actual": "15", - "correct": false, - "inputTokens": 3826, - "outputTokens": 2, - "latencyMs": 1420 + "actual": "23", + "correct": true, + "inputTokens": 3825, + "outputTokens": 1415, + "latencyMs": 14102.604708999977 }, { "questionId": "q131", @@ -14385,18 +14385,18 @@ "correct": false, "inputTokens": 3413, "outputTokens": 5, - "latencyMs": 1139 + "latencyMs": 1251.4159170000348 }, { "questionId": "q131", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "23", - "actual": "18", - "correct": false, - "inputTokens": 2982, - "outputTokens": 2, - "latencyMs": 1097 + "actual": "23", + "correct": true, + "inputTokens": 2981, + "outputTokens": 1799, + "latencyMs": 18696.684999999998 }, { "questionId": "q131", @@ -14407,18 +14407,18 @@ "correct": false, "inputTokens": 3108, "outputTokens": 5, - "latencyMs": 1203 + "latencyMs": 1170.9401669999934 }, { "questionId": "q132", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "430828", "actual": "430828", "correct": true, - "inputTokens": 15188, - "outputTokens": 3, - "latencyMs": 2257 + "inputTokens": 15187, + "outputTokens": 136, + "latencyMs": 2872.1482499999693 }, { "questionId": "q132", @@ -14429,18 +14429,18 @@ "correct": true, "inputTokens": 17409, "outputTokens": 6, - "latencyMs": 1292 + "latencyMs": 1382.586333000043 }, { "questionId": "q132", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "430828", "actual": "430828", "correct": true, - "inputTokens": 8789, - "outputTokens": 3, - "latencyMs": 1877 + "inputTokens": 8788, + "outputTokens": 904, + "latencyMs": 9130.657125000027 }, { "questionId": "q132", @@ -14451,18 +14451,18 @@ "correct": true, "inputTokens": 9279, "outputTokens": 6, - "latencyMs": 1118 + "latencyMs": 1164.3372080000117 }, { "questionId": "q132", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "430828", "actual": "430828", "correct": true, - "inputTokens": 8557, - "outputTokens": 3, - "latencyMs": 4023 + "inputTokens": 8556, + "outputTokens": 648, + "latencyMs": 7763.659999999974 }, { "questionId": "q132", @@ -14473,18 +14473,18 @@ "correct": true, "inputTokens": 9125, "outputTokens": 6, - "latencyMs": 1134 + "latencyMs": 1331.3139999999548 }, { "questionId": "q132", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "430828", "actual": "430828", "correct": true, - "inputTokens": 15482, - "outputTokens": 3, - "latencyMs": 5304 + "inputTokens": 15481, + "outputTokens": 584, + "latencyMs": 9411.661499999987 }, { "questionId": "q132", @@ -14495,18 +14495,18 @@ "correct": true, "inputTokens": 15367, "outputTokens": 6, - "latencyMs": 1442 + "latencyMs": 1272.1991249999846 }, { "questionId": "q132", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "430828", "actual": "430828", "correct": true, - "inputTokens": 13172, - "outputTokens": 3, - "latencyMs": 2157 + "inputTokens": 13171, + "outputTokens": 200, + "latencyMs": 3587.8712090000045 }, { "questionId": "q132", @@ -14517,18 +14517,18 @@ "correct": true, "inputTokens": 14483, "outputTokens": 6, - "latencyMs": 1483 + "latencyMs": 1710.5899999999674 }, { "questionId": "q133", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "11798", "actual": "11798", "correct": true, - "inputTokens": 15190, - "outputTokens": 3, - "latencyMs": 2084 + "inputTokens": 15189, + "outputTokens": 328, + "latencyMs": 3625.780167000019 }, { "questionId": "q133", @@ -14539,18 +14539,18 @@ "correct": true, "inputTokens": 17410, "outputTokens": 6, - "latencyMs": 2592 + "latencyMs": 1785.2782080000034 }, { "questionId": "q133", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "11798", "actual": "11798", "correct": true, - "inputTokens": 8791, - "outputTokens": 3, - "latencyMs": 1208 + "inputTokens": 8790, + "outputTokens": 712, + "latencyMs": 6381.770374999964 }, { "questionId": "q133", @@ -14561,18 +14561,18 @@ "correct": true, "inputTokens": 9280, "outputTokens": 6, - "latencyMs": 1261 + "latencyMs": 1352.5436660000123 }, { "questionId": "q133", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "11798", "actual": "11798", "correct": true, - "inputTokens": 8559, - "outputTokens": 3, - "latencyMs": 1697 + "inputTokens": 8558, + "outputTokens": 520, + "latencyMs": 27916.417874999985 }, { "questionId": "q133", @@ -14583,18 +14583,18 @@ "correct": true, "inputTokens": 9126, "outputTokens": 6, - "latencyMs": 1171 + "latencyMs": 2073.8068330000388 }, { "questionId": "q133", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "11798", "actual": "11798", "correct": true, - "inputTokens": 15484, - "outputTokens": 3, - "latencyMs": 1704 + "inputTokens": 15483, + "outputTokens": 328, + "latencyMs": 5943.872542000026 }, { "questionId": "q133", @@ -14605,18 +14605,18 @@ "correct": true, "inputTokens": 15368, "outputTokens": 6, - "latencyMs": 1637 + "latencyMs": 1767.4393339999951 }, { "questionId": "q133", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "11798", "actual": "11798", "correct": true, - "inputTokens": 13174, - "outputTokens": 3, - "latencyMs": 1599 + "inputTokens": 13173, + "outputTokens": 264, + "latencyMs": 3115.895124999981 }, { "questionId": "q133", @@ -14627,128 +14627,128 @@ "correct": true, "inputTokens": 14484, "outputTokens": 6, - "latencyMs": 1505 + "latencyMs": 1183.2249999999767 }, { "questionId": "q134", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "183631", "actual": "183631", "correct": true, - "inputTokens": 15193, - "outputTokens": 3, - "latencyMs": 2340 - }, - { - "questionId": "q134", - "format": "json", - "model": "claude-haiku-4-5", - "expected": "183631", - "actual": "183631", - "correct": true, - "inputTokens": 17412, - "outputTokens": 6, - "latencyMs": 1380 - }, - { - "questionId": "q134", - "format": "toon", - "model": "gpt-4o-mini", - "expected": "183631", - "actual": "183631", - "correct": true, - "inputTokens": 8794, - "outputTokens": 3, - "latencyMs": 1631 - }, - { - "questionId": "q134", - "format": "toon", - "model": "claude-haiku-4-5", - "expected": "183631", - "actual": "183631", - "correct": true, - "inputTokens": 9282, - "outputTokens": 6, - "latencyMs": 1271 - }, - { - "questionId": "q134", - "format": "csv", - "model": "gpt-4o-mini", - "expected": "183631", - "actual": "183631", - "correct": true, - "inputTokens": 8562, - "outputTokens": 3, - "latencyMs": 1620 - }, - { - "questionId": "q134", - "format": "csv", - "model": "claude-haiku-4-5", - "expected": "183631", - "actual": "183631", - "correct": true, - "inputTokens": 9128, - "outputTokens": 6, - "latencyMs": 1279 - }, - { - "questionId": "q134", - "format": "markdown-kv", - "model": "gpt-4o-mini", - "expected": "183631", - "actual": "183631", - "correct": true, - "inputTokens": 15487, - "outputTokens": 3, - "latencyMs": 14565 - }, - { - "questionId": "q134", - "format": "markdown-kv", - "model": "claude-haiku-4-5", - "expected": "183631", - "actual": "183631", - "correct": true, - "inputTokens": 15370, - "outputTokens": 6, - "latencyMs": 1559 - }, - { - "questionId": "q134", - "format": "yaml", - "model": "gpt-4o-mini", - "expected": "183631", - "actual": "183631", - "correct": true, - "inputTokens": 13177, - "outputTokens": 3, - "latencyMs": 1600 - }, - { - "questionId": "q134", - "format": "yaml", - "model": "claude-haiku-4-5", - "expected": "183631", - "actual": "183631", - "correct": true, - "inputTokens": 14486, - "outputTokens": 6, - "latencyMs": 1179 - }, - { - "questionId": "q135", - "format": "json", - "model": "gpt-4o-mini", - "expected": "29246", - "actual": "29246", - "correct": true, "inputTokens": 15192, - "outputTokens": 3, - "latencyMs": 2508 + "outputTokens": 392, + "latencyMs": 4991.646125000028 + }, + { + "questionId": "q134", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "183631", + "actual": "183631", + "correct": true, + "inputTokens": 17412, + "outputTokens": 6, + "latencyMs": 1835.4077919999836 + }, + { + "questionId": "q134", + "format": "toon", + "model": "gpt-5-nano", + "expected": "183631", + "actual": "183631", + "correct": true, + "inputTokens": 8793, + "outputTokens": 712, + "latencyMs": 7788.013291999989 + }, + { + "questionId": "q134", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "183631", + "actual": "183631", + "correct": true, + "inputTokens": 9282, + "outputTokens": 6, + "latencyMs": 1082.4066669999738 + }, + { + "questionId": "q134", + "format": "csv", + "model": "gpt-5-nano", + "expected": "183631", + "actual": "183631", + "correct": true, + "inputTokens": 8561, + "outputTokens": 520, + "latencyMs": 5664.896500000032 + }, + { + "questionId": "q134", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "183631", + "actual": "183631", + "correct": true, + "inputTokens": 9128, + "outputTokens": 6, + "latencyMs": 1215.8875830000034 + }, + { + "questionId": "q134", + "format": "markdown-kv", + "model": "gpt-5-nano", + "expected": "183631", + "actual": "183631", + "correct": true, + "inputTokens": 15486, + "outputTokens": 456, + "latencyMs": 5141.449292000034 + }, + { + "questionId": "q134", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "183631", + "actual": "183631", + "correct": true, + "inputTokens": 15370, + "outputTokens": 6, + "latencyMs": 1483.2090420000022 + }, + { + "questionId": "q134", + "format": "yaml", + "model": "gpt-5-nano", + "expected": "183631", + "actual": "183631", + "correct": true, + "inputTokens": 13176, + "outputTokens": 328, + "latencyMs": 7532.760624999995 + }, + { + "questionId": "q134", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "183631", + "actual": "183631", + "correct": true, + "inputTokens": 14486, + "outputTokens": 6, + "latencyMs": 1458.0657500000088 + }, + { + "questionId": "q135", + "format": "json", + "model": "gpt-5-nano", + "expected": "29246", + "actual": "29246", + "correct": true, + "inputTokens": 15191, + "outputTokens": 392, + "latencyMs": 7922.4705829999875 }, { "questionId": "q135", @@ -14759,18 +14759,18 @@ "correct": true, "inputTokens": 17412, "outputTokens": 6, - "latencyMs": 1359 + "latencyMs": 1510.0054579999996 }, { "questionId": "q135", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "29246", "actual": "29246", "correct": true, - "inputTokens": 8793, - "outputTokens": 3, - "latencyMs": 1188 + "inputTokens": 8792, + "outputTokens": 776, + "latencyMs": 8475.77466699999 }, { "questionId": "q135", @@ -14781,18 +14781,18 @@ "correct": true, "inputTokens": 9282, "outputTokens": 6, - "latencyMs": 1204 + "latencyMs": 1203.3620419999934 }, { "questionId": "q135", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "29246", "actual": "29246", "correct": true, - "inputTokens": 8561, - "outputTokens": 3, - "latencyMs": 2448 + "inputTokens": 8560, + "outputTokens": 776, + "latencyMs": 7283.84258300002 }, { "questionId": "q135", @@ -14803,18 +14803,18 @@ "correct": true, "inputTokens": 9128, "outputTokens": 6, - "latencyMs": 1311 + "latencyMs": 1365.2434169999906 }, { "questionId": "q135", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "29246", "actual": "29246", "correct": true, - "inputTokens": 15486, - "outputTokens": 3, - "latencyMs": 2442 + "inputTokens": 15485, + "outputTokens": 520, + "latencyMs": 5846.538916999998 }, { "questionId": "q135", @@ -14825,18 +14825,18 @@ "correct": true, "inputTokens": 15370, "outputTokens": 6, - "latencyMs": 1414 + "latencyMs": 1203.6220829999656 }, { "questionId": "q135", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "29246", "actual": "29246", "correct": true, - "inputTokens": 13176, - "outputTokens": 3, - "latencyMs": 2254 + "inputTokens": 13175, + "outputTokens": 456, + "latencyMs": 5973.848832999996 }, { "questionId": "q135", @@ -14847,18 +14847,18 @@ "correct": true, "inputTokens": 14486, "outputTokens": 6, - "latencyMs": 1512 + "latencyMs": 1189.811875000014 }, { "questionId": "q136", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "135306", "actual": "135306", "correct": true, - "inputTokens": 15188, - "outputTokens": 3, - "latencyMs": 1565 + "inputTokens": 15187, + "outputTokens": 328, + "latencyMs": 8872.252957999997 }, { "questionId": "q136", @@ -14869,18 +14869,18 @@ "correct": true, "inputTokens": 17407, "outputTokens": 6, - "latencyMs": 1871 + "latencyMs": 1775.476083000016 }, { "questionId": "q136", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "135306", "actual": "135306", "correct": true, - "inputTokens": 8789, - "outputTokens": 3, - "latencyMs": 1963 + "inputTokens": 8788, + "outputTokens": 648, + "latencyMs": 7149.649291000038 }, { "questionId": "q136", @@ -14891,18 +14891,18 @@ "correct": true, "inputTokens": 9277, "outputTokens": 6, - "latencyMs": 1533 + "latencyMs": 1577.2079999999842 }, { "questionId": "q136", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "135306", "actual": "135306", "correct": true, - "inputTokens": 8557, - "outputTokens": 3, - "latencyMs": 1561 + "inputTokens": 8556, + "outputTokens": 1288, + "latencyMs": 11344.462834000005 }, { "questionId": "q136", @@ -14913,18 +14913,18 @@ "correct": true, "inputTokens": 9123, "outputTokens": 6, - "latencyMs": 1200 + "latencyMs": 1340.27887499996 }, { "questionId": "q136", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "135306", "actual": "135306", "correct": true, - "inputTokens": 15482, - "outputTokens": 3, - "latencyMs": 1657 + "inputTokens": 15481, + "outputTokens": 392, + "latencyMs": 6256.696250000037 }, { "questionId": "q136", @@ -14935,18 +14935,18 @@ "correct": true, "inputTokens": 15365, "outputTokens": 6, - "latencyMs": 1582 + "latencyMs": 1604.6909999999916 }, { "questionId": "q136", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "135306", "actual": "135306", "correct": true, - "inputTokens": 13172, - "outputTokens": 3, - "latencyMs": 3402 + "inputTokens": 13171, + "outputTokens": 456, + "latencyMs": 5982.022666999954 }, { "questionId": "q136", @@ -14957,18 +14957,18 @@ "correct": true, "inputTokens": 14481, "outputTokens": 6, - "latencyMs": 1251 + "latencyMs": 1259.2409589999588 }, { "questionId": "q137", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "24914", "actual": "24914", "correct": true, - "inputTokens": 15187, - "outputTokens": 3, - "latencyMs": 2019 + "inputTokens": 15186, + "outputTokens": 200, + "latencyMs": 2858.1693749999977 }, { "questionId": "q137", @@ -14979,18 +14979,18 @@ "correct": true, "inputTokens": 17408, "outputTokens": 6, - "latencyMs": 1517 + "latencyMs": 1786.5725000000093 }, { "questionId": "q137", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "24914", - "actual": "The repository undefined/react-native does not exist in the provided data.", - "correct": false, - "inputTokens": 8788, - "outputTokens": 14, - "latencyMs": 1737 + "actual": "24914", + "correct": true, + "inputTokens": 8787, + "outputTokens": 2696, + "latencyMs": 23868.72975 }, { "questionId": "q137", @@ -15001,18 +15001,18 @@ "correct": true, "inputTokens": 9278, "outputTokens": 6, - "latencyMs": 1467 + "latencyMs": 1116.0275000000256 }, { "questionId": "q137", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "24914", - "actual": "24914", - "correct": true, - "inputTokens": 8556, - "outputTokens": 3, - "latencyMs": 3442 + "actual": "0", + "correct": false, + "inputTokens": 8555, + "outputTokens": 1543, + "latencyMs": 17006.341916999954 }, { "questionId": "q137", @@ -15023,18 +15023,18 @@ "correct": true, "inputTokens": 9124, "outputTokens": 6, - "latencyMs": 1300 + "latencyMs": 1425.7799160000286 }, { "questionId": "q137", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "24914", "actual": "24914", "correct": true, - "inputTokens": 15481, - "outputTokens": 3, - "latencyMs": 1825 + "inputTokens": 15480, + "outputTokens": 648, + "latencyMs": 8414.583791000012 }, { "questionId": "q137", @@ -15045,18 +15045,18 @@ "correct": true, "inputTokens": 15366, "outputTokens": 6, - "latencyMs": 1443 + "latencyMs": 1374.9217920000083 }, { "questionId": "q137", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "24914", - "actual": "124320", - "correct": false, - "inputTokens": 13171, - "outputTokens": 3, - "latencyMs": 1783 + "actual": "24914", + "correct": true, + "inputTokens": 13170, + "outputTokens": 456, + "latencyMs": 6113.31808300002 }, { "questionId": "q137", @@ -15067,18 +15067,18 @@ "correct": true, "inputTokens": 14482, "outputTokens": 6, - "latencyMs": 1362 + "latencyMs": 1374.9246660000063 }, { "questionId": "q138", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "111683", "actual": "111683", "correct": true, - "inputTokens": 15187, - "outputTokens": 3, - "latencyMs": 1824 + "inputTokens": 15186, + "outputTokens": 392, + "latencyMs": 5410.596499999985 }, { "questionId": "q138", @@ -15089,18 +15089,18 @@ "correct": true, "inputTokens": 17407, "outputTokens": 6, - "latencyMs": 1479 + "latencyMs": 1607.6261659999727 }, { "questionId": "q138", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "111683", - "actual": "108017", - "correct": false, - "inputTokens": 8788, - "outputTokens": 3, - "latencyMs": 3315 + "actual": "111683", + "correct": true, + "inputTokens": 8787, + "outputTokens": 520, + "latencyMs": 6469.81479199999 }, { "questionId": "q138", @@ -15111,18 +15111,18 @@ "correct": true, "inputTokens": 9277, "outputTokens": 6, - "latencyMs": 1270 + "latencyMs": 1103.9521250000107 }, { "questionId": "q138", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "111683", "actual": "111683", "correct": true, - "inputTokens": 8556, - "outputTokens": 3, - "latencyMs": 1384 + "inputTokens": 8555, + "outputTokens": 904, + "latencyMs": 8993.236791000003 }, { "questionId": "q138", @@ -15133,18 +15133,18 @@ "correct": true, "inputTokens": 9123, "outputTokens": 6, - "latencyMs": 1252 + "latencyMs": 1118.0249590000021 }, { "questionId": "q138", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "111683", "actual": "111683", "correct": true, - "inputTokens": 15481, - "outputTokens": 3, - "latencyMs": 3048 + "inputTokens": 15480, + "outputTokens": 392, + "latencyMs": 4705.902084000001 }, { "questionId": "q138", @@ -15155,18 +15155,18 @@ "correct": true, "inputTokens": 15365, "outputTokens": 6, - "latencyMs": 1381 + "latencyMs": 1454.1250839999993 }, { "questionId": "q138", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "111683", "actual": "111683", "correct": true, - "inputTokens": 13171, - "outputTokens": 3, - "latencyMs": 3804 + "inputTokens": 13170, + "outputTokens": 456, + "latencyMs": 5041.734750000003 }, { "questionId": "q138", @@ -15177,18 +15177,18 @@ "correct": true, "inputTokens": 14481, "outputTokens": 6, - "latencyMs": 1498 + "latencyMs": 1199.9473330000183 }, { "questionId": "q139", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "13364", "actual": "13364", "correct": true, - "inputTokens": 15194, - "outputTokens": 3, - "latencyMs": 1726 + "inputTokens": 15193, + "outputTokens": 328, + "latencyMs": 4364.900083000015 }, { "questionId": "q139", @@ -15199,18 +15199,18 @@ "correct": true, "inputTokens": 17412, "outputTokens": 6, - "latencyMs": 1526 + "latencyMs": 1320.7056250000023 }, { "questionId": "q139", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "13364", "actual": "13364", "correct": true, - "inputTokens": 8795, - "outputTokens": 3, - "latencyMs": 1685 + "inputTokens": 8794, + "outputTokens": 904, + "latencyMs": 8590.36599999998 }, { "questionId": "q139", @@ -15221,18 +15221,18 @@ "correct": true, "inputTokens": 9282, "outputTokens": 6, - "latencyMs": 1140 + "latencyMs": 1166.0237089999719 }, { "questionId": "q139", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "13364", - "actual": "0", - "correct": false, - "inputTokens": 8563, - "outputTokens": 2, - "latencyMs": 1933 + "actual": "13364", + "correct": true, + "inputTokens": 8562, + "outputTokens": 648, + "latencyMs": 6442.057417000004 }, { "questionId": "q139", @@ -15243,18 +15243,18 @@ "correct": true, "inputTokens": 9128, "outputTokens": 6, - "latencyMs": 1157 + "latencyMs": 1342.8652910000528 }, { "questionId": "q139", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "13364", "actual": "13364", "correct": true, - "inputTokens": 15488, - "outputTokens": 3, - "latencyMs": 1249 + "inputTokens": 15487, + "outputTokens": 264, + "latencyMs": 4450.340833000024 }, { "questionId": "q139", @@ -15265,18 +15265,18 @@ "correct": true, "inputTokens": 15370, "outputTokens": 6, - "latencyMs": 1347 + "latencyMs": 1551.4001249999856 }, { "questionId": "q139", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "13364", "actual": "13364", "correct": true, - "inputTokens": 13178, - "outputTokens": 3, - "latencyMs": 2174 + "inputTokens": 13177, + "outputTokens": 520, + "latencyMs": 5858.679374999949 }, { "questionId": "q139", @@ -15287,18 +15287,18 @@ "correct": true, "inputTokens": 14486, "outputTokens": 6, - "latencyMs": 1197 + "latencyMs": 1173.6422499999753 }, { "questionId": "q140", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "98464", - "actual": "0", - "correct": false, - "inputTokens": 15186, - "outputTokens": 2, - "latencyMs": 3252 + "actual": "98464", + "correct": true, + "inputTokens": 15185, + "outputTokens": 456, + "latencyMs": 6377.878708000004 }, { "questionId": "q140", @@ -15309,18 +15309,18 @@ "correct": true, "inputTokens": 17405, "outputTokens": 6, - "latencyMs": 1667 + "latencyMs": 1312.9188750000321 }, { "questionId": "q140", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "98464", - "actual": "0", - "correct": false, - "inputTokens": 8787, - "outputTokens": 2, - "latencyMs": 1192 + "actual": "98464", + "correct": true, + "inputTokens": 8786, + "outputTokens": 4680, + "latencyMs": 36395.80937499995 }, { "questionId": "q140", @@ -15331,18 +15331,18 @@ "correct": true, "inputTokens": 9275, "outputTokens": 6, - "latencyMs": 1113 + "latencyMs": 2024.6539580000099 }, { "questionId": "q140", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "98464", - "actual": "0", - "correct": false, - "inputTokens": 8555, - "outputTokens": 2, - "latencyMs": 2198 + "actual": "98464", + "correct": true, + "inputTokens": 8554, + "outputTokens": 3784, + "latencyMs": 30336.309707999986 }, { "questionId": "q140", @@ -15353,18 +15353,18 @@ "correct": true, "inputTokens": 9121, "outputTokens": 6, - "latencyMs": 1187 + "latencyMs": 1237.6976249999716 }, { "questionId": "q140", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "98464", - "actual": "0", - "correct": false, - "inputTokens": 15480, - "outputTokens": 2, - "latencyMs": 8573 + "actual": "98464", + "correct": true, + "inputTokens": 15479, + "outputTokens": 264, + "latencyMs": 5297.444375000021 }, { "questionId": "q140", @@ -15375,18 +15375,18 @@ "correct": true, "inputTokens": 15363, "outputTokens": 6, - "latencyMs": 1311 + "latencyMs": 1775.3334170000162 }, { "questionId": "q140", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "98464", - "actual": "0", - "correct": false, - "inputTokens": 13170, - "outputTokens": 2, - "latencyMs": 3471 + "actual": "98464", + "correct": true, + "inputTokens": 13169, + "outputTokens": 392, + "latencyMs": 8030.958958000003 }, { "questionId": "q140", @@ -15397,18 +15397,18 @@ "correct": true, "inputTokens": 14479, "outputTokens": 6, - "latencyMs": 1457 + "latencyMs": 1401.1453330000513 }, { "questionId": "q141", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6378", "actual": "6378", "correct": true, - "inputTokens": 15188, - "outputTokens": 3, - "latencyMs": 1363 + "inputTokens": 15187, + "outputTokens": 264, + "latencyMs": 6193.845583000046 }, { "questionId": "q141", @@ -15419,18 +15419,18 @@ "correct": true, "inputTokens": 17408, "outputTokens": 6, - "latencyMs": 1803 + "latencyMs": 2449.4082920000073 }, { "questionId": "q141", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6378", "actual": "6378", "correct": true, - "inputTokens": 8789, - "outputTokens": 3, - "latencyMs": 3696 + "inputTokens": 8788, + "outputTokens": 2568, + "latencyMs": 25386.850749999983 }, { "questionId": "q141", @@ -15441,18 +15441,18 @@ "correct": true, "inputTokens": 9278, "outputTokens": 6, - "latencyMs": 1391 + "latencyMs": 1351.401165999996 }, { "questionId": "q141", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6378", - "actual": "93731", - "correct": false, - "inputTokens": 8557, - "outputTokens": 3, - "latencyMs": 7861 + "actual": "6378", + "correct": true, + "inputTokens": 8556, + "outputTokens": 456, + "latencyMs": 5087.453167000029 }, { "questionId": "q141", @@ -15463,18 +15463,18 @@ "correct": true, "inputTokens": 9124, "outputTokens": 6, - "latencyMs": 1420 + "latencyMs": 1229.4187500000116 }, { "questionId": "q141", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6378", "actual": "6378", "correct": true, - "inputTokens": 15482, - "outputTokens": 3, - "latencyMs": 1769 + "inputTokens": 15481, + "outputTokens": 520, + "latencyMs": 6781.348249999981 }, { "questionId": "q141", @@ -15485,18 +15485,18 @@ "correct": true, "inputTokens": 15366, "outputTokens": 6, - "latencyMs": 1233 + "latencyMs": 1411.0081670000218 }, { "questionId": "q141", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "6378", - "actual": "93731", - "correct": false, - "inputTokens": 13172, - "outputTokens": 3, - "latencyMs": 1831 + "actual": "6378", + "correct": true, + "inputTokens": 13171, + "outputTokens": 328, + "latencyMs": 9405.325083000003 }, { "questionId": "q141", @@ -15507,18 +15507,18 @@ "correct": true, "inputTokens": 14482, "outputTokens": 6, - "latencyMs": 1507 + "latencyMs": 1575.9942499999888 }, { "questionId": "q142", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "254916", "actual": "254916", "correct": true, - "inputTokens": 15190, - "outputTokens": 3, - "latencyMs": 10752 + "inputTokens": 15189, + "outputTokens": 456, + "latencyMs": 7723.79820900003 }, { "questionId": "q142", @@ -15529,18 +15529,18 @@ "correct": true, "inputTokens": 17409, "outputTokens": 6, - "latencyMs": 1672 + "latencyMs": 1496.878625000012 }, { "questionId": "q142", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "254916", "actual": "254916", "correct": true, - "inputTokens": 8791, - "outputTokens": 3, - "latencyMs": 1788 + "inputTokens": 8790, + "outputTokens": 328, + "latencyMs": 5231.312959000003 }, { "questionId": "q142", @@ -15551,18 +15551,18 @@ "correct": true, "inputTokens": 9279, "outputTokens": 6, - "latencyMs": 1633 + "latencyMs": 1145.5107919999864 }, { "questionId": "q142", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "254916", "actual": "254916", "correct": true, - "inputTokens": 8559, - "outputTokens": 3, - "latencyMs": 1365 + "inputTokens": 8558, + "outputTokens": 392, + "latencyMs": 4585.943417000002 }, { "questionId": "q142", @@ -15573,18 +15573,18 @@ "correct": true, "inputTokens": 9125, "outputTokens": 6, - "latencyMs": 1242 + "latencyMs": 1386.1237079999992 }, { "questionId": "q142", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "254916", "actual": "254916", "correct": true, - "inputTokens": 15484, - "outputTokens": 3, - "latencyMs": 2237 + "inputTokens": 15483, + "outputTokens": 328, + "latencyMs": 9374.248917000019 }, { "questionId": "q142", @@ -15595,18 +15595,18 @@ "correct": true, "inputTokens": 15367, "outputTokens": 6, - "latencyMs": 1275 + "latencyMs": 1332.4388340000296 }, { "questionId": "q142", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "254916", "actual": "254916", "correct": true, - "inputTokens": 13174, - "outputTokens": 3, - "latencyMs": 3028 + "inputTokens": 13173, + "outputTokens": 200, + "latencyMs": 3953.8284580000327 }, { "questionId": "q142", @@ -15617,18 +15617,18 @@ "correct": true, "inputTokens": 14483, "outputTokens": 6, - "latencyMs": 1615 + "latencyMs": 1294.3535840000259 }, { "questionId": "q143", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "32413", "actual": "32413", "correct": true, - "inputTokens": 15188, - "outputTokens": 3, - "latencyMs": 1972 + "inputTokens": 15187, + "outputTokens": 584, + "latencyMs": 8515.676582999993 }, { "questionId": "q143", @@ -15639,18 +15639,18 @@ "correct": true, "inputTokens": 17410, "outputTokens": 6, - "latencyMs": 2308 + "latencyMs": 2508.0940420000115 }, { "questionId": "q143", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "32413", "actual": "32413", "correct": true, - "inputTokens": 8789, - "outputTokens": 3, - "latencyMs": 1361 + "inputTokens": 8788, + "outputTokens": 584, + "latencyMs": 6331.0320000000065 }, { "questionId": "q143", @@ -15661,18 +15661,18 @@ "correct": true, "inputTokens": 9280, "outputTokens": 6, - "latencyMs": 1162 + "latencyMs": 1249.4856250000303 }, { "questionId": "q143", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "32413", "actual": "32413", "correct": true, - "inputTokens": 8557, - "outputTokens": 3, - "latencyMs": 2196 + "inputTokens": 8556, + "outputTokens": 648, + "latencyMs": 8463.519499999995 }, { "questionId": "q143", @@ -15683,18 +15683,18 @@ "correct": true, "inputTokens": 9126, "outputTokens": 6, - "latencyMs": 1199 + "latencyMs": 1035.4223750000237 }, { "questionId": "q143", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "32413", "actual": "32413", "correct": true, - "inputTokens": 15482, - "outputTokens": 3, - "latencyMs": 1758 + "inputTokens": 15481, + "outputTokens": 520, + "latencyMs": 9625.975833999983 }, { "questionId": "q143", @@ -15705,18 +15705,18 @@ "correct": true, "inputTokens": 15368, "outputTokens": 6, - "latencyMs": 1340 + "latencyMs": 1460.7396250000456 }, { "questionId": "q143", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "32413", "actual": "32413", "correct": true, - "inputTokens": 13172, - "outputTokens": 3, - "latencyMs": 2122 + "inputTokens": 13171, + "outputTokens": 712, + "latencyMs": 7525.112709000008 }, { "questionId": "q143", @@ -15727,18 +15727,18 @@ "correct": true, "inputTokens": 14484, "outputTokens": 6, - "latencyMs": 1156 + "latencyMs": 1488.0029170000344 }, { "questionId": "q144", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "240059", - "actual": "0", + "actual": "not found", "correct": false, - "inputTokens": 15186, - "outputTokens": 2, - "latencyMs": 1208 + "inputTokens": 15185, + "outputTokens": 1352, + "latencyMs": 8303.157542 }, { "questionId": "q144", @@ -15749,18 +15749,18 @@ "correct": true, "inputTokens": 17405, "outputTokens": 6, - "latencyMs": 1826 + "latencyMs": 1515.7900000000373 }, { "questionId": "q144", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "240059", - "actual": "undefined", + "actual": "0", "correct": false, - "inputTokens": 8787, - "outputTokens": 2, - "latencyMs": 2224 + "inputTokens": 8786, + "outputTokens": 2503, + "latencyMs": 20915.808583000035 }, { "questionId": "q144", @@ -15771,18 +15771,18 @@ "correct": true, "inputTokens": 9275, "outputTokens": 6, - "latencyMs": 1220 + "latencyMs": 1193.4237079999875 }, { "questionId": "q144", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "240059", - "actual": "undefined", - "correct": false, - "inputTokens": 8555, - "outputTokens": 2, - "latencyMs": 1199 + "actual": "240059", + "correct": true, + "inputTokens": 8554, + "outputTokens": 4360, + "latencyMs": 34760.80329100002 }, { "questionId": "q144", @@ -15793,18 +15793,18 @@ "correct": true, "inputTokens": 9121, "outputTokens": 6, - "latencyMs": 1264 + "latencyMs": 3022.242749999976 }, { "questionId": "q144", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "240059", - "actual": "undefined/react does not exist in the provided data.", + "actual": "0", "correct": false, - "inputTokens": 15480, - "outputTokens": 11, - "latencyMs": 3072 + "inputTokens": 15479, + "outputTokens": 2567, + "latencyMs": 15901.546999999962 }, { "questionId": "q144", @@ -15815,18 +15815,18 @@ "correct": true, "inputTokens": 15363, "outputTokens": 6, - "latencyMs": 1609 + "latencyMs": 1358.283374999999 }, { "questionId": "q144", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "240059", - "actual": "undefined/react does not exist in the provided data.", - "correct": false, - "inputTokens": 13170, - "outputTokens": 11, - "latencyMs": 2608 + "actual": "240059", + "correct": true, + "inputTokens": 13169, + "outputTokens": 584, + "latencyMs": 10520.349042000016 }, { "questionId": "q144", @@ -15837,18 +15837,18 @@ "correct": true, "inputTokens": 14479, "outputTokens": 6, - "latencyMs": 1237 + "latencyMs": 1426.0678330000374 }, { "questionId": "q145", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "48986", - "actual": "0", - "correct": false, - "inputTokens": 15187, - "outputTokens": 2, - "latencyMs": 1906 + "actual": "48986", + "correct": true, + "inputTokens": 15186, + "outputTokens": 712, + "latencyMs": 7069.827042000019 }, { "questionId": "q145", @@ -15859,18 +15859,18 @@ "correct": true, "inputTokens": 17406, "outputTokens": 6, - "latencyMs": 1399 + "latencyMs": 1507.9525419999845 }, { "questionId": "q145", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "48986", - "actual": "0", + "actual": "undefined", "correct": false, - "inputTokens": 8788, - "outputTokens": 2, - "latencyMs": 2026 + "inputTokens": 8787, + "outputTokens": 2311, + "latencyMs": 18257.385332999984 }, { "questionId": "q145", @@ -15881,18 +15881,18 @@ "correct": true, "inputTokens": 9276, "outputTokens": 6, - "latencyMs": 1318 + "latencyMs": 1397.3040420000325 }, { "questionId": "q145", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "48986", - "actual": "0", - "correct": false, - "inputTokens": 8556, - "outputTokens": 2, - "latencyMs": 1605 + "actual": "48986", + "correct": true, + "inputTokens": 8555, + "outputTokens": 3976, + "latencyMs": 29865.140291999967 }, { "questionId": "q145", @@ -15903,18 +15903,18 @@ "correct": true, "inputTokens": 9122, "outputTokens": 6, - "latencyMs": 1270 + "latencyMs": 1218.4357079999754 }, { "questionId": "q145", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "48986", - "actual": "0", - "correct": false, - "inputTokens": 15481, - "outputTokens": 2, - "latencyMs": 5367 + "actual": "48986", + "correct": true, + "inputTokens": 15480, + "outputTokens": 904, + "latencyMs": 8906.708750000049 }, { "questionId": "q145", @@ -15925,18 +15925,18 @@ "correct": true, "inputTokens": 15364, "outputTokens": 6, - "latencyMs": 1204 + "latencyMs": 1917.3721249999944 }, { "questionId": "q145", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "48986", - "actual": "The repository \"undefined/Python\" does not exist in the provided data.", - "correct": false, - "inputTokens": 13171, - "outputTokens": 16, - "latencyMs": 6329 + "actual": "48986", + "correct": true, + "inputTokens": 13170, + "outputTokens": 1160, + "latencyMs": 9665.802708000003 }, { "questionId": "q145", @@ -15947,18 +15947,18 @@ "correct": true, "inputTokens": 14480, "outputTokens": 6, - "latencyMs": 1369 + "latencyMs": 1342.7929170000134 }, { "questionId": "q146", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "209624", "actual": "209624", "correct": true, - "inputTokens": 15186, - "outputTokens": 3, - "latencyMs": 2063 + "inputTokens": 15185, + "outputTokens": 648, + "latencyMs": 6259.387500000012 }, { "questionId": "q146", @@ -15969,18 +15969,18 @@ "correct": true, "inputTokens": 17405, "outputTokens": 6, - "latencyMs": 1470 + "latencyMs": 1860.1597499999916 }, { "questionId": "q146", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "209624", "actual": "209624", "correct": true, - "inputTokens": 8787, - "outputTokens": 3, - "latencyMs": 1386 + "inputTokens": 8786, + "outputTokens": 3336, + "latencyMs": 23288.63820799999 }, { "questionId": "q146", @@ -15991,18 +15991,18 @@ "correct": true, "inputTokens": 9275, "outputTokens": 6, - "latencyMs": 1104 + "latencyMs": 1180.5804169999901 }, { "questionId": "q146", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "209624", "actual": "209624", "correct": true, - "inputTokens": 8555, - "outputTokens": 3, - "latencyMs": 1747 + "inputTokens": 8554, + "outputTokens": 840, + "latencyMs": 6988.782166000048 }, { "questionId": "q146", @@ -16013,18 +16013,18 @@ "correct": true, "inputTokens": 9121, "outputTokens": 6, - "latencyMs": 1300 + "latencyMs": 1391.326041000022 }, { "questionId": "q146", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "209624", "actual": "209624", "correct": true, - "inputTokens": 15480, - "outputTokens": 3, - "latencyMs": 1443 + "inputTokens": 15479, + "outputTokens": 648, + "latencyMs": 6708.915624999965 }, { "questionId": "q146", @@ -16035,18 +16035,18 @@ "correct": true, "inputTokens": 15363, "outputTokens": 6, - "latencyMs": 1282 + "latencyMs": 1364.766833999951 }, { "questionId": "q146", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "209624", "actual": "209624", "correct": true, - "inputTokens": 13170, - "outputTokens": 3, - "latencyMs": 2185 + "inputTokens": 13169, + "outputTokens": 328, + "latencyMs": 3396.199416999996 }, { "questionId": "q146", @@ -16057,18 +16057,18 @@ "correct": true, "inputTokens": 14479, "outputTokens": 6, - "latencyMs": 1407 + "latencyMs": 1378.3461249999818 }, { "questionId": "q147", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "58023", "actual": "58023", "correct": true, - "inputTokens": 15186, - "outputTokens": 3, - "latencyMs": 1743 + "inputTokens": 15185, + "outputTokens": 200, + "latencyMs": 2947.7053750000196 }, { "questionId": "q147", @@ -16079,18 +16079,18 @@ "correct": true, "inputTokens": 17406, "outputTokens": 6, - "latencyMs": 1564 + "latencyMs": 1512.1218329999829 }, { "questionId": "q147", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "58023", "actual": "58023", "correct": true, - "inputTokens": 8787, - "outputTokens": 3, - "latencyMs": 1317 + "inputTokens": 8786, + "outputTokens": 840, + "latencyMs": 7657.443458000023 }, { "questionId": "q147", @@ -16101,18 +16101,18 @@ "correct": true, "inputTokens": 9276, "outputTokens": 6, - "latencyMs": 1258 + "latencyMs": 1119.6807499999995 }, { "questionId": "q147", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "58023", "actual": "58023", "correct": true, - "inputTokens": 8555, - "outputTokens": 3, - "latencyMs": 2419 + "inputTokens": 8554, + "outputTokens": 392, + "latencyMs": 4410.906208000029 }, { "questionId": "q147", @@ -16123,18 +16123,18 @@ "correct": true, "inputTokens": 9122, "outputTokens": 6, - "latencyMs": 1171 + "latencyMs": 1227.467249999987 }, { "questionId": "q147", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "58023", - "actual": "undefined/linux does not exist in the provided data.", - "correct": false, - "inputTokens": 15480, - "outputTokens": 11, - "latencyMs": 1680 + "actual": "58023", + "correct": true, + "inputTokens": 15479, + "outputTokens": 328, + "latencyMs": 4168.014292000036 }, { "questionId": "q147", @@ -16145,18 +16145,18 @@ "correct": true, "inputTokens": 15364, "outputTokens": 6, - "latencyMs": 1396 + "latencyMs": 1878.2624590000487 }, { "questionId": "q147", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "58023", - "actual": "The repository \"undefined/linux\" does not exist in the provided data.", - "correct": false, - "inputTokens": 13170, - "outputTokens": 15, - "latencyMs": 1418 + "actual": "58023", + "correct": true, + "inputTokens": 13169, + "outputTokens": 456, + "latencyMs": 4726.903416000016 }, { "questionId": "q147", @@ -16167,18 +16167,18 @@ "correct": true, "inputTokens": 14480, "outputTokens": 6, - "latencyMs": 1399 + "latencyMs": 1665.950124999974 }, { "questionId": "q148", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "196024", "actual": "196024", "correct": true, - "inputTokens": 15189, - "outputTokens": 3, - "latencyMs": 1673 + "inputTokens": 15188, + "outputTokens": 456, + "latencyMs": 5633.756834 }, { "questionId": "q148", @@ -16189,18 +16189,18 @@ "correct": true, "inputTokens": 17407, "outputTokens": 6, - "latencyMs": 1736 + "latencyMs": 1482.6277910000063 }, { "questionId": "q148", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "196024", "actual": "196024", "correct": true, - "inputTokens": 8790, - "outputTokens": 3, - "latencyMs": 1754 + "inputTokens": 8789, + "outputTokens": 1416, + "latencyMs": 11371.267457999988 }, { "questionId": "q148", @@ -16211,18 +16211,18 @@ "correct": true, "inputTokens": 9277, "outputTokens": 6, - "latencyMs": 1317 + "latencyMs": 1690.2400420000195 }, { "questionId": "q148", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "196024", - "actual": "0", + "actual": "Repo not found", "correct": false, - "inputTokens": 8558, - "outputTokens": 2, - "latencyMs": 3219 + "inputTokens": 8557, + "outputTokens": 3273, + "latencyMs": 28731.530667000043 }, { "questionId": "q148", @@ -16233,18 +16233,18 @@ "correct": true, "inputTokens": 9123, "outputTokens": 6, - "latencyMs": 1311 + "latencyMs": 1070.5141670000157 }, { "questionId": "q148", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "196024", "actual": "196024", "correct": true, - "inputTokens": 15483, - "outputTokens": 3, - "latencyMs": 1346 + "inputTokens": 15482, + "outputTokens": 520, + "latencyMs": 7021.771125000028 }, { "questionId": "q148", @@ -16255,18 +16255,18 @@ "correct": true, "inputTokens": 15365, "outputTokens": 6, - "latencyMs": 1560 + "latencyMs": 1243.7466250000289 }, { "questionId": "q148", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "196024", "actual": "196024", "correct": true, - "inputTokens": 13173, - "outputTokens": 3, - "latencyMs": 1009 + "inputTokens": 13172, + "outputTokens": 456, + "latencyMs": 5286.169750000001 }, { "questionId": "q148", @@ -16277,18 +16277,18 @@ "correct": true, "inputTokens": 14481, "outputTokens": 6, - "latencyMs": 1446 + "latencyMs": 1450.456957999966 }, { "questionId": "q149", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "30919", "actual": "30919", "correct": true, - "inputTokens": 15189, - "outputTokens": 3, - "latencyMs": 3361 + "inputTokens": 15188, + "outputTokens": 456, + "latencyMs": 5440.864250000042 }, { "questionId": "q149", @@ -16299,18 +16299,18 @@ "correct": true, "inputTokens": 17408, "outputTokens": 6, - "latencyMs": 1788 + "latencyMs": 1369.6618330000201 }, { "questionId": "q149", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "30919", "actual": "30919", "correct": true, - "inputTokens": 8790, - "outputTokens": 3, - "latencyMs": 1123 + "inputTokens": 8789, + "outputTokens": 712, + "latencyMs": 6130.9379999999655 }, { "questionId": "q149", @@ -16321,18 +16321,18 @@ "correct": true, "inputTokens": 9278, "outputTokens": 6, - "latencyMs": 1235 + "latencyMs": 1635.81579100003 }, { "questionId": "q149", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "30919", - "actual": "30919", - "correct": true, - "inputTokens": 8558, - "outputTokens": 3, - "latencyMs": 1100 + "actual": "N/A", + "correct": false, + "inputTokens": 8557, + "outputTokens": 1288, + "latencyMs": 20319.653374999994 }, { "questionId": "q149", @@ -16343,18 +16343,18 @@ "correct": true, "inputTokens": 9124, "outputTokens": 6, - "latencyMs": 1188 + "latencyMs": 1381.8252079999656 }, { "questionId": "q149", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "30919", "actual": "30919", "correct": true, - "inputTokens": 15483, - "outputTokens": 3, - "latencyMs": 1557 + "inputTokens": 15482, + "outputTokens": 328, + "latencyMs": 5951.751374999993 }, { "questionId": "q149", @@ -16365,18 +16365,18 @@ "correct": true, "inputTokens": 15366, "outputTokens": 6, - "latencyMs": 1352 + "latencyMs": 1367.1241670000018 }, { "questionId": "q149", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "30919", "actual": "30919", "correct": true, - "inputTokens": 13173, - "outputTokens": 3, - "latencyMs": 1280 + "inputTokens": 13172, + "outputTokens": 328, + "latencyMs": 3499.136334000039 }, { "questionId": "q149", @@ -16387,18 +16387,18 @@ "correct": true, "inputTokens": 14482, "outputTokens": 6, - "latencyMs": 1247 + "latencyMs": 1573.7027499999967 }, { "questionId": "q150", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "192220", "actual": "192220", "correct": true, - "inputTokens": 15188, - "outputTokens": 3, - "latencyMs": 1394 + "inputTokens": 15187, + "outputTokens": 392, + "latencyMs": 7833.668625000049 }, { "questionId": "q150", @@ -16409,18 +16409,18 @@ "correct": true, "inputTokens": 17405, "outputTokens": 6, - "latencyMs": 1801 + "latencyMs": 1477.048582999967 }, { "questionId": "q150", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "192220", "actual": "192220", "correct": true, - "inputTokens": 8789, - "outputTokens": 3, - "latencyMs": 2052 + "inputTokens": 8788, + "outputTokens": 520, + "latencyMs": 4880.817959000007 }, { "questionId": "q150", @@ -16431,18 +16431,18 @@ "correct": true, "inputTokens": 9275, "outputTokens": 6, - "latencyMs": 1176 + "latencyMs": 1081.6979169999831 }, { "questionId": "q150", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "192220", "actual": "192220", "correct": true, - "inputTokens": 8557, - "outputTokens": 3, - "latencyMs": 2084 + "inputTokens": 8556, + "outputTokens": 1992, + "latencyMs": 14180.11841699999 }, { "questionId": "q150", @@ -16453,18 +16453,18 @@ "correct": true, "inputTokens": 9121, "outputTokens": 6, - "latencyMs": 1191 + "latencyMs": 1393.665417000011 }, { "questionId": "q150", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "192220", "actual": "192220", "correct": true, - "inputTokens": 15482, - "outputTokens": 3, - "latencyMs": 1261 + "inputTokens": 15481, + "outputTokens": 392, + "latencyMs": 4068.912416999985 }, { "questionId": "q150", @@ -16475,18 +16475,18 @@ "correct": true, "inputTokens": 15363, "outputTokens": 6, - "latencyMs": 1355 + "latencyMs": 1687.0724170000176 }, { "questionId": "q150", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "192220", "actual": "192220", "correct": true, - "inputTokens": 13172, - "outputTokens": 3, - "latencyMs": 3388 + "inputTokens": 13171, + "outputTokens": 392, + "latencyMs": 4048.8707089999807 }, { "questionId": "q150", @@ -16497,18 +16497,18 @@ "correct": true, "inputTokens": 14479, "outputTokens": 6, - "latencyMs": 1591 + "latencyMs": 1441.8594579999917 }, { "questionId": "q151", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "11763", "actual": "11763", "correct": true, - "inputTokens": 15191, - "outputTokens": 3, - "latencyMs": 1942 + "inputTokens": 15190, + "outputTokens": 392, + "latencyMs": 4563.366041000001 }, { "questionId": "q151", @@ -16519,18 +16519,18 @@ "correct": true, "inputTokens": 17414, "outputTokens": 6, - "latencyMs": 1340 + "latencyMs": 1361.9952920000069 }, { "questionId": "q151", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "11763", "actual": "11763", "correct": true, - "inputTokens": 8792, - "outputTokens": 3, - "latencyMs": 1443 + "inputTokens": 8791, + "outputTokens": 904, + "latencyMs": 9523.924416000023 }, { "questionId": "q151", @@ -16541,18 +16541,18 @@ "correct": true, "inputTokens": 9284, "outputTokens": 6, - "latencyMs": 1732 + "latencyMs": 1235.863416999986 }, { "questionId": "q151", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "11763", "actual": "11763", "correct": true, - "inputTokens": 8560, - "outputTokens": 3, - "latencyMs": 1994 + "inputTokens": 8559, + "outputTokens": 584, + "latencyMs": 5264.637583000003 }, { "questionId": "q151", @@ -16563,18 +16563,18 @@ "correct": true, "inputTokens": 9130, "outputTokens": 6, - "latencyMs": 1198 + "latencyMs": 1307.1584169999696 }, { "questionId": "q151", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "11763", "actual": "11763", "correct": true, - "inputTokens": 15485, - "outputTokens": 3, - "latencyMs": 5013 + "inputTokens": 15484, + "outputTokens": 328, + "latencyMs": 8621.355207999994 }, { "questionId": "q151", @@ -16585,18 +16585,18 @@ "correct": true, "inputTokens": 15372, "outputTokens": 6, - "latencyMs": 1463 + "latencyMs": 1464.8200829999987 }, { "questionId": "q151", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "11763", "actual": "11763", "correct": true, - "inputTokens": 13175, - "outputTokens": 3, - "latencyMs": 1296 + "inputTokens": 13174, + "outputTokens": 264, + "latencyMs": 3034.7359999999753 }, { "questionId": "q151", @@ -16607,18 +16607,18 @@ "correct": true, "inputTokens": 14488, "outputTokens": 6, - "latencyMs": 2877 + "latencyMs": 1959.3285000000033 }, { "questionId": "q152", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "100", - "actual": "0", - "correct": false, - "inputTokens": 15188, - "outputTokens": 2, - "latencyMs": 2160 + "actual": "100", + "correct": true, + "inputTokens": 15187, + "outputTokens": 2055, + "latencyMs": 16430.930082999985 }, { "questionId": "q152", @@ -16629,18 +16629,18 @@ "correct": false, "inputTokens": 17406, "outputTokens": 5, - "latencyMs": 1947 + "latencyMs": 1730.124458999955 }, { "questionId": "q152", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "100", - "actual": "0", - "correct": false, - "inputTokens": 8789, - "outputTokens": 2, - "latencyMs": 1222 + "actual": "100", + "correct": true, + "inputTokens": 8788, + "outputTokens": 839, + "latencyMs": 7275.640458000009 }, { "questionId": "q152", @@ -16651,18 +16651,18 @@ "correct": false, "inputTokens": 9276, "outputTokens": 5, - "latencyMs": 1487 + "latencyMs": 1286.8315839999705 }, { "questionId": "q152", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "100", "actual": "0", "correct": false, - "inputTokens": 8557, - "outputTokens": 2, - "latencyMs": 1450 + "inputTokens": 8556, + "outputTokens": 2695, + "latencyMs": 24177.570000000007 }, { "questionId": "q152", @@ -16673,18 +16673,18 @@ "correct": false, "inputTokens": 9122, "outputTokens": 5, - "latencyMs": 1358 + "latencyMs": 1102.5337500000023 }, { "questionId": "q152", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "100", - "actual": "0", - "correct": false, - "inputTokens": 15482, - "outputTokens": 2, - "latencyMs": 873 + "actual": "100", + "correct": true, + "inputTokens": 15481, + "outputTokens": 1671, + "latencyMs": 14929.856415999995 }, { "questionId": "q152", @@ -16695,18 +16695,18 @@ "correct": true, "inputTokens": 15364, "outputTokens": 5, - "latencyMs": 1500 + "latencyMs": 1227.103541999997 }, { "questionId": "q152", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "100", "actual": "0", "correct": false, - "inputTokens": 13172, - "outputTokens": 2, - "latencyMs": 7031 + "inputTokens": 13171, + "outputTokens": 583, + "latencyMs": 5785.248666999978 }, { "questionId": "q152", @@ -16717,18 +16717,18 @@ "correct": false, "inputTokens": 14480, "outputTokens": 5, - "latencyMs": 1916 + "latencyMs": 1959.456125000026 }, { "questionId": "q153", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "15404143", - "actual": "43115556", + "actual": "19196630", "correct": false, - "inputTokens": 15189, - "outputTokens": 4, - "latencyMs": 3324 + "inputTokens": 15188, + "outputTokens": 13385, + "latencyMs": 239619.323125 }, { "questionId": "q153", @@ -16739,18 +16739,18 @@ "correct": false, "inputTokens": 17407, "outputTokens": 9, - "latencyMs": 1607 + "latencyMs": 1838.8340420000022 }, { "questionId": "q153", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "15404143", - "actual": "10419582", - "correct": false, - "inputTokens": 8790, - "outputTokens": 4, - "latencyMs": 900 + "actual": "15404143", + "correct": true, + "inputTokens": 8789, + "outputTokens": 12169, + "latencyMs": 109453.991416 }, { "questionId": "q153", @@ -16761,18 +16761,18 @@ "correct": false, "inputTokens": 9277, "outputTokens": 9, - "latencyMs": 1385 + "latencyMs": 1443.470417000004 }, { "questionId": "q153", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "15404143", - "actual": "10419582", - "correct": false, - "inputTokens": 8558, - "outputTokens": 4, - "latencyMs": 1922 + "actual": "15404143", + "correct": true, + "inputTokens": 8557, + "outputTokens": 6281, + "latencyMs": 45474.442209 }, { "questionId": "q153", @@ -16783,18 +16783,18 @@ "correct": false, "inputTokens": 9123, "outputTokens": 9, - "latencyMs": 1230 + "latencyMs": 1361.6022089999751 }, { "questionId": "q153", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "15404143", - "actual": "10419580", - "correct": false, - "inputTokens": 15483, - "outputTokens": 4, - "latencyMs": 1716 + "actual": "15404143", + "correct": true, + "inputTokens": 15482, + "outputTokens": 4489, + "latencyMs": 29654.25554099999 }, { "questionId": "q153", @@ -16805,18 +16805,18 @@ "correct": false, "inputTokens": 15365, "outputTokens": 9, - "latencyMs": 1384 + "latencyMs": 1796.0902500000084 }, { "questionId": "q153", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "15404143", - "actual": "Total number of stars across all repositories is 4,978,155.", - "correct": false, - "inputTokens": 13173, - "outputTokens": 16, - "latencyMs": 3411 + "actual": "15404143", + "correct": true, + "inputTokens": 13172, + "outputTokens": 6409, + "latencyMs": 70234.84133299999 }, { "questionId": "q153", @@ -16827,18 +16827,18 @@ "correct": false, "inputTokens": 14481, "outputTokens": 9, - "latencyMs": 1539 + "latencyMs": 1965.7452919999487 }, { "questionId": "q154", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "100", - "actual": "77", + "actual": "60", "correct": false, - "inputTokens": 15189, - "outputTokens": 2, - "latencyMs": 2523 + "inputTokens": 15188, + "outputTokens": 7495, + "latencyMs": 72992.43658400001 }, { "questionId": "q154", @@ -16849,18 +16849,18 @@ "correct": true, "inputTokens": 17408, "outputTokens": 5, - "latencyMs": 1885 + "latencyMs": 1772.3059999999823 }, { "questionId": "q154", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "100", - "actual": "42", - "correct": false, - "inputTokens": 8790, - "outputTokens": 2, - "latencyMs": 1148 + "actual": "100", + "correct": true, + "inputTokens": 8789, + "outputTokens": 2759, + "latencyMs": 19214.133417000005 }, { "questionId": "q154", @@ -16871,18 +16871,18 @@ "correct": true, "inputTokens": 9278, "outputTokens": 5, - "latencyMs": 1378 + "latencyMs": 1115.5979170000064 }, { "questionId": "q154", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "100", - "actual": "42", - "correct": false, - "inputTokens": 8558, - "outputTokens": 2, - "latencyMs": 1364 + "actual": "100", + "correct": true, + "inputTokens": 8557, + "outputTokens": 2439, + "latencyMs": 27365.987334000005 }, { "questionId": "q154", @@ -16893,18 +16893,18 @@ "correct": true, "inputTokens": 9124, "outputTokens": 5, - "latencyMs": 1125 + "latencyMs": 1322.4322910000337 }, { "questionId": "q154", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "100", - "actual": "104", - "correct": false, - "inputTokens": 15483, - "outputTokens": 2, - "latencyMs": 1276 + "actual": "100", + "correct": true, + "inputTokens": 15482, + "outputTokens": 5767, + "latencyMs": 60524.90554200002 }, { "questionId": "q154", @@ -16915,18 +16915,18 @@ "correct": true, "inputTokens": 15366, "outputTokens": 5, - "latencyMs": 1331 + "latencyMs": 1597.7364170000073 }, { "questionId": "q154", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "100", - "actual": "77", - "correct": false, - "inputTokens": 13173, - "outputTokens": 2, - "latencyMs": 1534 + "actual": "100", + "correct": true, + "inputTokens": 13172, + "outputTokens": 4039, + "latencyMs": 28819.869999999995 }, { "questionId": "q154", @@ -16937,18 +16937,18 @@ "correct": true, "inputTokens": 14482, "outputTokens": 5, - "latencyMs": 1282 + "latencyMs": 1798.9455409999937 }, { "questionId": "q155", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "100", - "actual": "19", + "actual": "86", "correct": false, - "inputTokens": 15189, - "outputTokens": 2, - "latencyMs": 2206 + "inputTokens": 15188, + "outputTokens": 2375, + "latencyMs": 23963.549916999997 }, { "questionId": "q155", @@ -16959,18 +16959,18 @@ "correct": false, "inputTokens": 17408, "outputTokens": 5, - "latencyMs": 1568 + "latencyMs": 1836.1375000000116 }, { "questionId": "q155", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "100", - "actual": "15", - "correct": false, - "inputTokens": 8790, - "outputTokens": 2, - "latencyMs": 1478 + "actual": "100", + "correct": true, + "inputTokens": 8789, + "outputTokens": 3079, + "latencyMs": 26957.04420799995 }, { "questionId": "q155", @@ -16981,18 +16981,18 @@ "correct": false, "inputTokens": 9278, "outputTokens": 5, - "latencyMs": 1314 + "latencyMs": 1209.7997920000344 }, { "questionId": "q155", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "100", - "actual": "12", - "correct": false, - "inputTokens": 8558, - "outputTokens": 2, - "latencyMs": 2149 + "actual": "100", + "correct": true, + "inputTokens": 8557, + "outputTokens": 2887, + "latencyMs": 27174.970375000034 }, { "questionId": "q155", @@ -17003,18 +17003,18 @@ "correct": false, "inputTokens": 9124, "outputTokens": 5, - "latencyMs": 1485 + "latencyMs": 1293.6252920000115 }, { "questionId": "q155", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "100", - "actual": "34", + "actual": "98", "correct": false, - "inputTokens": 15483, - "outputTokens": 2, - "latencyMs": 1043 + "inputTokens": 15482, + "outputTokens": 2567, + "latencyMs": 29565.065250000043 }, { "questionId": "q155", @@ -17025,18 +17025,18 @@ "correct": false, "inputTokens": 15366, "outputTokens": 5, - "latencyMs": 1371 + "latencyMs": 1230.7459160000435 }, { "questionId": "q155", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "100", - "actual": "34", - "correct": false, - "inputTokens": 13173, - "outputTokens": 2, - "latencyMs": 1693 + "actual": "100", + "correct": true, + "inputTokens": 13172, + "outputTokens": 2695, + "latencyMs": 20706.84841700003 }, { "questionId": "q155", @@ -17047,260 +17047,260 @@ "correct": false, "inputTokens": 14482, "outputTokens": 5, - "latencyMs": 1237 + "latencyMs": 1743.1536249999772 }, { "questionId": "q156", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "76", + "actual": "41", + "correct": false, + "inputTokens": 15188, + "outputTokens": 8263, + "latencyMs": 60899.858959000034 + }, + { + "questionId": "q156", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "76", + "actual": "100", + "correct": false, + "inputTokens": 17408, + "outputTokens": 5, + "latencyMs": 1350.1540420000092 + }, + { + "questionId": "q156", + "format": "toon", + "model": "gpt-5-nano", + "expected": "76", + "actual": "76", + "correct": true, + "inputTokens": 8789, + "outputTokens": 3847, + "latencyMs": 30491.779582999996 + }, + { + "questionId": "q156", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "76", + "actual": "100", + "correct": false, + "inputTokens": 9278, + "outputTokens": 5, + "latencyMs": 1513.2665410000482 + }, + { + "questionId": "q156", + "format": "csv", + "model": "gpt-5-nano", + "expected": "76", + "actual": "76", + "correct": true, + "inputTokens": 8557, + "outputTokens": 3847, + "latencyMs": 25522.397125000018 + }, + { + "questionId": "q156", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "76", + "actual": "100", + "correct": false, + "inputTokens": 9124, + "outputTokens": 5, + "latencyMs": 1150.7281660000444 + }, + { + "questionId": "q156", + "format": "markdown-kv", + "model": "gpt-5-nano", + "expected": "76", + "actual": "76", + "correct": true, + "inputTokens": 15482, + "outputTokens": 2631, + "latencyMs": 22525.465083000017 + }, + { + "questionId": "q156", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "76", + "actual": "100", + "correct": false, + "inputTokens": 15366, + "outputTokens": 5, + "latencyMs": 1438.5829169999924 + }, + { + "questionId": "q156", + "format": "yaml", + "model": "gpt-5-nano", + "expected": "76", + "actual": "62", + "correct": false, + "inputTokens": 13172, + "outputTokens": 1351, + "latencyMs": 11162.623291999975 + }, + { + "questionId": "q156", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "76", + "actual": "100", + "correct": false, + "inputTokens": 14482, + "outputTokens": 5, + "latencyMs": 1305.162249999994 + }, + { + "questionId": "q157", + "format": "json", + "model": "gpt-5-nano", + "expected": "100", + "actual": "129", + "correct": false, + "inputTokens": 15188, + "outputTokens": 6599, + "latencyMs": 49590.68900000001 + }, + { + "questionId": "q157", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "89", + "correct": false, + "inputTokens": 17409, + "outputTokens": 5, + "latencyMs": 1750.9506249999977 + }, + { + "questionId": "q157", + "format": "toon", + "model": "gpt-5-nano", + "expected": "100", + "actual": "100", + "correct": true, + "inputTokens": 8789, + "outputTokens": 8903, + "latencyMs": 68556.36550000001 + }, + { + "questionId": "q157", + "format": "toon", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "73", + "correct": false, + "inputTokens": 9279, + "outputTokens": 5, + "latencyMs": 1148.3701669999864 + }, + { + "questionId": "q157", + "format": "csv", + "model": "gpt-5-nano", + "expected": "100", + "actual": "100", + "correct": true, + "inputTokens": 8557, + "outputTokens": 3271, + "latencyMs": 36128.254709 + }, + { + "questionId": "q157", + "format": "csv", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "89", + "correct": false, + "inputTokens": 9125, + "outputTokens": 5, + "latencyMs": 1137.2578750000102 + }, + { + "questionId": "q157", + "format": "markdown-kv", + "model": "gpt-5-nano", + "expected": "100", + "actual": "79", + "correct": false, + "inputTokens": 15482, + "outputTokens": 3527, + "latencyMs": 35526.23958300002 + }, + { + "questionId": "q157", + "format": "markdown-kv", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "95", + "correct": false, + "inputTokens": 15367, + "outputTokens": 5, + "latencyMs": 1501.6561670000083 + }, + { + "questionId": "q157", + "format": "yaml", + "model": "gpt-5-nano", + "expected": "100", + "actual": "99", + "correct": false, + "inputTokens": 13172, + "outputTokens": 3143, + "latencyMs": 26700.229333000025 + }, + { + "questionId": "q157", + "format": "yaml", + "model": "claude-haiku-4-5", + "expected": "100", + "actual": "95", + "correct": false, + "inputTokens": 14483, + "outputTokens": 5, + "latencyMs": 1159.0904580000206 + }, + { + "questionId": "q158", + "format": "json", + "model": "gpt-5-nano", + "expected": "95", + "actual": "94", + "correct": false, + "inputTokens": 15188, + "outputTokens": 4999, + "latencyMs": 32710.407750000013 + }, + { + "questionId": "q158", + "format": "json", + "model": "claude-haiku-4-5", + "expected": "95", + "actual": "42", + "correct": false, + "inputTokens": 17409, + "outputTokens": 5, + "latencyMs": 1451.6710420000018 + }, + { + "questionId": "q158", + "format": "toon", + "model": "gpt-5-nano", + "expected": "95", "actual": "82", "correct": false, - "inputTokens": 15189, - "outputTokens": 2, - "latencyMs": 927 - }, - { - "questionId": "q156", - "format": "json", - "model": "claude-haiku-4-5", - "expected": "76", - "actual": "100", - "correct": false, - "inputTokens": 17408, - "outputTokens": 5, - "latencyMs": 1274 - }, - { - "questionId": "q156", - "format": "toon", - "model": "gpt-4o-mini", - "expected": "76", - "actual": "34", - "correct": false, - "inputTokens": 8790, - "outputTokens": 2, - "latencyMs": 2541 - }, - { - "questionId": "q156", - "format": "toon", - "model": "claude-haiku-4-5", - "expected": "76", - "actual": "100", - "correct": false, - "inputTokens": 9278, - "outputTokens": 5, - "latencyMs": 1116 - }, - { - "questionId": "q156", - "format": "csv", - "model": "gpt-4o-mini", - "expected": "76", - "actual": "34", - "correct": false, - "inputTokens": 8558, - "outputTokens": 2, - "latencyMs": 997 - }, - { - "questionId": "q156", - "format": "csv", - "model": "claude-haiku-4-5", - "expected": "76", - "actual": "100", - "correct": false, - "inputTokens": 9124, - "outputTokens": 5, - "latencyMs": 1513 - }, - { - "questionId": "q156", - "format": "markdown-kv", - "model": "gpt-4o-mini", - "expected": "76", - "actual": "104", - "correct": false, - "inputTokens": 15483, - "outputTokens": 2, - "latencyMs": 3168 - }, - { - "questionId": "q156", - "format": "markdown-kv", - "model": "claude-haiku-4-5", - "expected": "76", - "actual": "100", - "correct": false, - "inputTokens": 15366, - "outputTokens": 5, - "latencyMs": 1498 - }, - { - "questionId": "q156", - "format": "yaml", - "model": "gpt-4o-mini", - "expected": "76", - "actual": "66", - "correct": false, - "inputTokens": 13173, - "outputTokens": 2, - "latencyMs": 1600 - }, - { - "questionId": "q156", - "format": "yaml", - "model": "claude-haiku-4-5", - "expected": "76", - "actual": "100", - "correct": false, - "inputTokens": 14482, - "outputTokens": 5, - "latencyMs": 1519 - }, - { - "questionId": "q157", - "format": "json", - "model": "gpt-4o-mini", - "expected": "100", - "actual": "77", - "correct": false, - "inputTokens": 15189, - "outputTokens": 2, - "latencyMs": 1809 - }, - { - "questionId": "q157", - "format": "json", - "model": "claude-haiku-4-5", - "expected": "100", - "actual": "89", - "correct": false, - "inputTokens": 17409, - "outputTokens": 5, - "latencyMs": 1409 - }, - { - "questionId": "q157", - "format": "toon", - "model": "gpt-4o-mini", - "expected": "100", - "actual": "66", - "correct": false, - "inputTokens": 8790, - "outputTokens": 2, - "latencyMs": 1367 - }, - { - "questionId": "q157", - "format": "toon", - "model": "claude-haiku-4-5", - "expected": "100", - "actual": "73", - "correct": false, - "inputTokens": 9279, - "outputTokens": 5, - "latencyMs": 1296 - }, - { - "questionId": "q157", - "format": "csv", - "model": "gpt-4o-mini", - "expected": "100", - "actual": "66", - "correct": false, - "inputTokens": 8558, - "outputTokens": 2, - "latencyMs": 1162 - }, - { - "questionId": "q157", - "format": "csv", - "model": "claude-haiku-4-5", - "expected": "100", - "actual": "89", - "correct": false, - "inputTokens": 9125, - "outputTokens": 5, - "latencyMs": 1435 - }, - { - "questionId": "q157", - "format": "markdown-kv", - "model": "gpt-4o-mini", - "expected": "100", - "actual": "77", - "correct": false, - "inputTokens": 15483, - "outputTokens": 2, - "latencyMs": 1774 - }, - { - "questionId": "q157", - "format": "markdown-kv", - "model": "claude-haiku-4-5", - "expected": "100", - "actual": "95", - "correct": false, - "inputTokens": 15367, - "outputTokens": 5, - "latencyMs": 1479 - }, - { - "questionId": "q157", - "format": "yaml", - "model": "gpt-4o-mini", - "expected": "100", - "actual": "66", - "correct": false, - "inputTokens": 13173, - "outputTokens": 2, - "latencyMs": 2710 - }, - { - "questionId": "q157", - "format": "yaml", - "model": "claude-haiku-4-5", - "expected": "100", - "actual": "95", - "correct": false, - "inputTokens": 14483, - "outputTokens": 5, - "latencyMs": 1272 - }, - { - "questionId": "q158", - "format": "json", - "model": "gpt-4o-mini", - "expected": "95", - "actual": "42", - "correct": false, - "inputTokens": 15189, - "outputTokens": 2, - "latencyMs": 3038 - }, - { - "questionId": "q158", - "format": "json", - "model": "claude-haiku-4-5", - "expected": "95", - "actual": "42", - "correct": false, - "inputTokens": 17409, - "outputTokens": 5, - "latencyMs": 1562 - }, - { - "questionId": "q158", - "format": "toon", - "model": "gpt-4o-mini", - "expected": "95", - "actual": "38", - "correct": false, - "inputTokens": 8790, - "outputTokens": 2, - "latencyMs": 1536 + "inputTokens": 8789, + "outputTokens": 3143, + "latencyMs": 18360.73424999998 }, { "questionId": "q158", @@ -17311,18 +17311,18 @@ "correct": false, "inputTokens": 9279, "outputTokens": 5, - "latencyMs": 1216 + "latencyMs": 1035.2159160000156 }, { "questionId": "q158", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "95", - "actual": "34", - "correct": false, - "inputTokens": 8558, - "outputTokens": 2, - "latencyMs": 1760 + "actual": "95", + "correct": true, + "inputTokens": 8557, + "outputTokens": 4487, + "latencyMs": 28020.044915999984 }, { "questionId": "q158", @@ -17333,18 +17333,18 @@ "correct": false, "inputTokens": 9125, "outputTokens": 5, - "latencyMs": 1255 + "latencyMs": 1175.8671249999898 }, { "questionId": "q158", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "95", - "actual": "66", + "actual": "77", "correct": false, - "inputTokens": 15483, - "outputTokens": 2, - "latencyMs": 1683 + "inputTokens": 15482, + "outputTokens": 2887, + "latencyMs": 24031.185459 }, { "questionId": "q158", @@ -17355,18 +17355,18 @@ "correct": false, "inputTokens": 15367, "outputTokens": 5, - "latencyMs": 2256 + "latencyMs": 1724.9393750000163 }, { "questionId": "q158", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "95", - "actual": "38", + "actual": "81", "correct": false, - "inputTokens": 13173, - "outputTokens": 2, - "latencyMs": 2831 + "inputTokens": 13172, + "outputTokens": 4359, + "latencyMs": 35723.19641699997 }, { "questionId": "q158", @@ -17377,18 +17377,18 @@ "correct": false, "inputTokens": 14483, "outputTokens": 5, - "latencyMs": 1980 + "latencyMs": 1663.259167000011 }, { "questionId": "q159", "format": "json", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "83", - "actual": "66", + "actual": "71", "correct": false, - "inputTokens": 15189, - "outputTokens": 2, - "latencyMs": 1327 + "inputTokens": 15188, + "outputTokens": 2439, + "latencyMs": 18168.518166999973 }, { "questionId": "q159", @@ -17399,18 +17399,18 @@ "correct": false, "inputTokens": 17409, "outputTokens": 5, - "latencyMs": 1894 + "latencyMs": 1390.1757499999949 }, { "questionId": "q159", "format": "toon", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "83", - "actual": "34", + "actual": "57", "correct": false, - "inputTokens": 8790, - "outputTokens": 2, - "latencyMs": 784 + "inputTokens": 8789, + "outputTokens": 4423, + "latencyMs": 41240.42016700003 }, { "questionId": "q159", @@ -17421,18 +17421,18 @@ "correct": false, "inputTokens": 9279, "outputTokens": 5, - "latencyMs": 1422 + "latencyMs": 1066.675458999991 }, { "questionId": "q159", "format": "csv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "83", - "actual": "34", - "correct": false, - "inputTokens": 8558, - "outputTokens": 2, - "latencyMs": 2644 + "actual": "83", + "correct": true, + "inputTokens": 8557, + "outputTokens": 5831, + "latencyMs": 40638.93858400005 }, { "questionId": "q159", @@ -17443,18 +17443,18 @@ "correct": false, "inputTokens": 9125, "outputTokens": 5, - "latencyMs": 1109 + "latencyMs": 1394.1952499999898 }, { "questionId": "q159", "format": "markdown-kv", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "83", - "actual": "66", - "correct": false, - "inputTokens": 15483, - "outputTokens": 2, - "latencyMs": 1826 + "actual": "83", + "correct": true, + "inputTokens": 15482, + "outputTokens": 3591, + "latencyMs": 25356.36183400004 }, { "questionId": "q159", @@ -17465,18 +17465,18 @@ "correct": false, "inputTokens": 15367, "outputTokens": 5, - "latencyMs": 1342 + "latencyMs": 1238.0827089999802 }, { "questionId": "q159", "format": "yaml", - "model": "gpt-4o-mini", + "model": "gpt-5-nano", "expected": "83", - "actual": "38", + "actual": "72", "correct": false, - "inputTokens": 13173, - "outputTokens": 2, - "latencyMs": 2055 + "inputTokens": 13172, + "outputTokens": 2567, + "latencyMs": 25124.520583999984 }, { "questionId": "q159", @@ -17487,6 +17487,6 @@ "correct": false, "inputTokens": 14483, "outputTokens": 5, - "latencyMs": 1537 + "latencyMs": 2058.834957999992 } ] diff --git a/benchmarks/results/accuracy/report.md b/benchmarks/results/accuracy/report.md index b1050bb..a6f9a5c 100644 --- a/benchmarks/results/accuracy/report.md +++ b/benchmarks/results/accuracy/report.md @@ -3,19 +3,22 @@ Tested across **2 LLMs** with data retrieval tasks: ``` -gpt-4o-mini ██████████████░░░░░░ 72.3% accuracy -claude-haiku-4-5 ███████████████░░░░░ 76.7% accuracy +gpt-5-nano + toon ███████████████████░ 97.5% (155/159) + markdown-kv ███████████████████░ 95.6% (152/159) + yaml ███████████████████░ 94.3% (150/159) + json ███████████████████░ 93.7% (149/159) + csv ███████████████████░ 93.7% (149/159) + +claude-haiku-4-5 + markdown-kv ███████████████░░░░░ 76.7% (122/159) + toon ███████████████░░░░░ 75.5% (120/159) + json ███████████████░░░░░ 75.5% (120/159) + csv ███████████████░░░░░ 75.5% (120/159) + yaml ███████████████░░░░░ 74.8% (119/159) ``` -**TOON achieves 73.9% accuracy (vs JSON's 73.6%) while using 46.3% fewer tokens.** - -| Format | Accuracy | Average Tokens | -| ------ | -------- | -------------- | -| `toon` | 73.9% | 4.678 | -| `json` | 73.6% | 8.713 | -| `markdown-kv` | 73.6% | 8.649 | -| `csv` | 72.3% | 4.745 | -| `yaml` | 71.7% | 7.091 | +**Tradeoff:** TOON achieves 86.5% accuracy (vs JSON's 84.6%) while using 46.3% fewer tokens.
View detailed breakdown by dataset and model @@ -26,53 +29,53 @@ claude-haiku-4-5 ███████████████░░░░ | Format | Accuracy | Tokens | Correct/Total | |--------|----------|--------|---------------| -| `toon` | 72.4% | 2.483 | 84/116 | -| `csv` | 69.0% | 2.337 | 80/116 | -| `yaml` | 68.1% | 4.969 | 79/116 | -| `markdown-kv` | 68.1% | 6.270 | 79/116 | -| `json` | 68.1% | 6.347 | 79/116 | +| `toon` | 86.2% | 2.483 | 100/116 | +| `csv` | 80.2% | 2.337 | 93/116 | +| `yaml` | 82.8% | 4.969 | 96/116 | +| `markdown-kv` | 84.5% | 6.270 | 98/116 | +| `json` | 84.5% | 6.347 | 98/116 | ##### E-commerce orders with nested structures | Format | Accuracy | Tokens | Correct/Total | |--------|----------|--------|---------------| -| `toon` | 84.1% | 5.967 | 74/88 | -| `csv` | 83.0% | 6.735 | 73/88 | -| `yaml` | 81.8% | 7.328 | 72/88 | -| `markdown-kv` | 86.4% | 9.110 | 76/88 | -| `json` | 84.1% | 9.694 | 74/88 | +| `toon` | 90.9% | 5.967 | 80/88 | +| `csv` | 90.9% | 6.735 | 80/88 | +| `yaml` | 89.8% | 7.328 | 79/88 | +| `markdown-kv` | 90.9% | 9.110 | 80/88 | +| `json` | 89.8% | 9.694 | 79/88 | ##### Time-series analytics data | Format | Accuracy | Tokens | Correct/Total | |--------|----------|--------|---------------| -| `csv` | 72.4% | 1.393 | 42/58 | -| `toon` | 70.7% | 1.515 | 41/58 | -| `yaml` | 72.4% | 2.938 | 42/58 | -| `json` | 74.1% | 3.665 | 43/58 | -| `markdown-kv` | 70.7% | 3.779 | 41/58 | +| `csv` | 87.9% | 1.393 | 51/58 | +| `toon` | 86.2% | 1.515 | 50/58 | +| `yaml` | 86.2% | 2.938 | 50/58 | +| `json` | 87.9% | 3.665 | 51/58 | +| `markdown-kv` | 86.2% | 3.779 | 50/58 | ##### Popular GitHub repositories | Format | Accuracy | Tokens | Correct/Total | |--------|----------|--------|---------------| -| `toon` | 64.3% | 8.745 | 36/56 | -| `csv` | 62.5% | 8.513 | 35/56 | -| `json` | 67.9% | 15.145 | 38/56 | -| `markdown-kv` | 67.9% | 15.436 | 38/56 | -| `yaml` | 62.5% | 13.129 | 35/56 | +| `csv` | 80.4% | 8.513 | 45/56 | +| `toon` | 80.4% | 8.745 | 45/56 | +| `yaml` | 78.6% | 13.129 | 44/56 | +| `markdown-kv` | 82.1% | 15.436 | 46/56 | +| `json` | 73.2% | 15.145 | 41/56 | #### Performance by Model -##### gpt-4o-mini +##### gpt-5-nano | Format | Accuracy | Correct/Total | |--------|----------|---------------| -| `toon` | 72.3% | 115/159 | -| `json` | 71.7% | 114/159 | -| `markdown-kv` | 70.4% | 112/159 | -| `csv` | 69.2% | 110/159 | -| `yaml` | 68.6% | 109/159 | +| `toon` | 97.5% | 155/159 | +| `markdown-kv` | 95.6% | 152/159 | +| `yaml` | 94.3% | 150/159 | +| `json` | 93.7% | 149/159 | +| `csv` | 93.7% | 149/159 | ##### claude-haiku-4-5 diff --git a/benchmarks/results/accuracy/summary.json b/benchmarks/results/accuracy/summary.json index 737107e..a49a81a 100644 --- a/benchmarks/results/accuracy/summary.json +++ b/benchmarks/results/accuracy/summary.json @@ -2,53 +2,48 @@ "formatResults": [ { "format": "toon", - "accuracy": 0.7389937106918238, + "accuracy": 0.8647798742138365, "totalTokens": 4678, - "avgInputTokens": 4675, - "avgLatency": 1424, - "correctCount": 235, - "totalCount": 318 - }, - { - "format": "json", - "accuracy": 0.7358490566037735, - "totalTokens": 8713, - "avgInputTokens": 9177, - "avgLatency": 1678, - "correctCount": 234, + "averageLatency": 5016, + "correctCount": 275, "totalCount": 318 }, { "format": "markdown-kv", - "accuracy": 0.7358490566037735, + "accuracy": 0.8616352201257862, "totalTokens": 8649, - "avgInputTokens": 8242, - "avgLatency": 1724, - "correctCount": 234, + "averageLatency": 4628, + "correctCount": 274, + "totalCount": 318 + }, + { + "format": "json", + "accuracy": 0.8459119496855346, + "totalTokens": 8713, + "averageLatency": 5369, + "correctCount": 269, "totalCount": 318 }, { "format": "csv", - "accuracy": 0.7232704402515723, + "accuracy": 0.8459119496855346, "totalTokens": 4745, - "avgInputTokens": 4878, - "avgLatency": 1573, - "correctCount": 230, + "averageLatency": 5168, + "correctCount": 269, "totalCount": 318 }, { "format": "yaml", - "accuracy": 0.7169811320754716, + "accuracy": 0.8459119496855346, "totalTokens": 7091, - "avgInputTokens": 7136, - "avgLatency": 1602, - "correctCount": 228, + "averageLatency": 4299, + "correctCount": 269, "totalCount": 318 } ], "questions": 159, "models": [ - "gpt-4o-mini", + "gpt-5-nano", "claude-haiku-4-5" ], "datasets": [ @@ -91,5 +86,5 @@ "yaml-analytics": 2938, "yaml-github": 13129 }, - "timestamp": "2025-10-27T10:46:35.127Z" + "timestamp": "2025-10-27T12:43:38.288Z" } diff --git a/benchmarks/results/token-efficiency.md b/benchmarks/results/token-efficiency.md index 090397a..619904c 100644 --- a/benchmarks/results/token-efficiency.md +++ b/benchmarks/results/token-efficiency.md @@ -7,7 +7,7 @@ 🛒 E-commerce Order ███████████████░░░░░░░░░░ 203 tokens (JSON: 338) 💰 39.9% saved ``` -**Total:** 15,172 tokens (TOON) vs 29,096 tokens (JSON) → 47.9% savings +**Total:** 15,172 tokens (TOON) vs 29,096 tokens (JSON) → **47.9% savings**
View detailed examples diff --git a/benchmarks/scripts/accuracy-benchmark.ts b/benchmarks/scripts/accuracy-benchmark.ts index 9867e5c..e2a4c1c 100644 --- a/benchmarks/scripts/accuracy-benchmark.ts +++ b/benchmarks/scripts/accuracy-benchmark.ts @@ -1,5 +1,5 @@ /** - * TOON LLM Accuracy Benchmark + * LLM Retrieval Accuracy Benchmark * * Main entry point that orchestrates the full benchmark: * 1. Generate questions from datasets @@ -20,7 +20,7 @@ import { formatters } from '../src/formatters' import { generateQuestions } from '../src/questions' import { calculateFormatResults, calculateTokenCounts, saveResults } from '../src/report' -consola.start('LLM Accuracy Benchmark for TOON') +consola.start('Retrieval Accuracy Benchmark for TOON') // Check if results already exist const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy') @@ -82,10 +82,10 @@ else { // Format datasets once (reuse for all questions) const formattedDatasets: Record> = {} for (const [formatName, formatter] of Object.entries(formatters)) { - formattedDatasets[formatName] = {} + formattedDatasets[formatName] ??= {} + for (const dataset of datasets) { - const formatted = formatter(dataset.data) - formattedDatasets[formatName]![dataset.name] = formatted + formattedDatasets[formatName]![dataset.name] = formatter(dataset.data) } } @@ -108,7 +108,7 @@ else { tasks, async (task, index) => { const formattedData = formattedDatasets[task.formatName]![task.question.dataset]! - const model = activeModels[task.modelName as keyof typeof activeModels] + const model = activeModels[task.modelName as keyof typeof activeModels]! const result = await evaluateQuestion( task.question, @@ -121,7 +121,7 @@ else { // Progress update if ((index + 1) % 10 === 0) { const percent = (((index + 1) / total) * 100).toFixed(1) - console.log(`⏳ Progress: ${index + 1}/${total} (${percent}%)`) + consola.start(`Progress: ${index + 1}/${total} (${percent}%)`) } return result diff --git a/benchmarks/scripts/token-efficiency-benchmark.ts b/benchmarks/scripts/token-efficiency-benchmark.ts index 5957115..1b9f7d6 100644 --- a/benchmarks/scripts/token-efficiency-benchmark.ts +++ b/benchmarks/scripts/token-efficiency-benchmark.ts @@ -59,7 +59,7 @@ let totalToonTokens = 0 const results: BenchmarkResult[] = [] for (const example of BENCHMARK_EXAMPLES) { - const data = await example.getData() + const data = example.getData() const jsonString = JSON.stringify(data, undefined, 2) const toonString = encode(data) diff --git a/benchmarks/src/evaluate.ts b/benchmarks/src/evaluate.ts index ec1c3ec..31642de 100644 --- a/benchmarks/src/evaluate.ts +++ b/benchmarks/src/evaluate.ts @@ -20,18 +20,69 @@ import { RATE_LIMIT_DELAY_MS } from './constants' * Models used for evaluation */ export const models: Record = { - 'gpt-4o-mini': openai('gpt-4o-mini'), + 'gpt-5-nano': openai('gpt-5-nano'), 'claude-haiku-4-5': anthropic('claude-haiku-4-5-20251001'), } /** - * Validate an answer using LLM-as-judge approach - * More robust than string matching for LLM outputs + * Evaluate a single question with a specific format and model */ -export async function validateAnswer( - actual: string, - expected: string, - question: string, +export async function evaluateQuestion( + question: Question, + formatName: string, + formattedData: string, + model: LanguageModelV2, + modelName: string, +): Promise { + const prompt = `Given the following data in ${formatName} format: + +\`\`\` +${formattedData} +\`\`\` + +Question: ${question.prompt} + +Provide only the direct answer, without any additional explanation or formatting.` + + const startTime = performance.now() + const { text, usage } = await generateText({ + model, + prompt, + temperature: model.modelId.startsWith('gpt-') ? undefined : 0, + }) + + await setTimeout(RATE_LIMIT_DELAY_MS) + + const latencyMs = performance.now() - startTime + const correct = await validateAnswer({ + actual: text.trim(), + expected: question.groundTruth, + question: question.prompt, + }) + + return { + questionId: question.id, + format: formatName, + model: modelName, + expected: question.groundTruth, + actual: text.trim(), + correct, + inputTokens: usage.inputTokens, + outputTokens: usage.outputTokens, + latencyMs, + } +} + +/** + * Validate an answer using LLM-as-judge approach + */ +async function validateAnswer( + { + actual, + expected, + question, + }: + { actual: string, expected: string, question: string }, ): Promise { const prompt = `You are validating answers to questions about structured data. @@ -49,10 +100,9 @@ Respond with only "YES" or "NO".` try { const { text } = await generateText({ - model: models['gpt-4o-mini']!, + model: models['claude-haiku-4-5']!, prompt, temperature: 0, - maxOutputTokens: 16, }) await setTimeout(RATE_LIMIT_DELAY_MS) @@ -65,69 +115,3 @@ Respond with only "YES" or "NO".` return actual.toLowerCase().trim() === expected.toLowerCase().trim() } } - -/** - * Evaluate a single question with a specific format and model - */ -export async function evaluateQuestion( - question: Question, - formatName: string, - formattedData: string, - model: any, - modelName: string, -): Promise { - const prompt = `Given the following data in ${formatName} format: - -\`\`\` -${formattedData} -\`\`\` - -Question: ${question.prompt} - -Provide only the direct answer, without any additional explanation or formatting.` - - const startTime = Date.now() - - try { - const { text, usage } = await generateText({ - model, - prompt, - temperature: 0, - maxOutputTokens: 50, - }) - - await setTimeout(RATE_LIMIT_DELAY_MS) - - const latencyMs = Date.now() - startTime - const correct = await validateAnswer(text.trim(), question.groundTruth, question.prompt) - - return { - questionId: question.id, - format: formatName, - model: modelName, - expected: question.groundTruth, - actual: text.trim(), - correct, - inputTokens: usage.inputTokens ?? 0, - outputTokens: usage.outputTokens ?? 0, - latencyMs, - } - } - catch (error) { - consola.error(`Error evaluating ${question.id} with ${formatName}/${modelName}:`, error) - - await setTimeout(RATE_LIMIT_DELAY_MS) - - return { - questionId: question.id, - format: formatName, - model: modelName, - expected: question.groundTruth, - actual: '', - correct: false, - inputTokens: 0, - outputTokens: 0, - latencyMs: Date.now() - startTime, - } - } -} diff --git a/benchmarks/src/report.ts b/benchmarks/src/report.ts index 2638622..43d1c23 100644 --- a/benchmarks/src/report.ts +++ b/benchmarks/src/report.ts @@ -37,15 +37,13 @@ export function calculateFormatResults( .filter(([key]) => key.startsWith(`${formatName}-`)) .reduce((sum, [, tokens]) => sum + tokens, 0) / datasets.length - const avgInputTokens = formatResults.reduce((sum, r) => sum + r.inputTokens, 0) / totalCount - const avgLatency = formatResults.reduce((sum, r) => sum + r.latencyMs, 0) / totalCount + const averageLatency = formatResults.reduce((sum, r) => sum + r.latencyMs, 0) / totalCount return { format: formatName, accuracy, totalTokens: Math.round(avgTokens), - avgInputTokens: Math.round(avgInputTokens), - avgLatency: Math.round(avgLatency), + averageLatency: Math.round(averageLatency), correctCount, totalCount, } @@ -69,11 +67,13 @@ export function generateMarkdownReport( const toon = formatResults.find(r => r.format === 'toon') const json = formatResults.find(r => r.format === 'json') - // Model-by-model breakdown (most interesting result) + // Model-by-model breakdown with ASCII bars const modelCount = Object.keys(models).length lines.push(`Tested across **${modelCount} ${modelCount === 1 ? 'LLM' : 'LLMs'}** with data retrieval tasks:`, '', '```') - for (const modelName of Object.keys(models)) { + const modelNames = Object.keys(models) + for (let i = 0; i < modelNames.length; i++) { + const modelName = modelNames[i]! const modelResults = formatResults.map((fr) => { const modelFormatResults = results.filter(r => r.model === modelName && r.format === fr.format) const correctCount = modelFormatResults.filter(r => r.correct).length @@ -88,10 +88,16 @@ export function generateMarkdownReport( } }).sort((a, b) => b.accuracy - a.accuracy) - const bestResult = modelResults[0]! - const bar = createTokenBar(bestResult.accuracy, 1, 20) - - lines.push(`${modelName.padEnd(20)} ${bar} ${(bestResult.accuracy * 100).toFixed(1)}% accuracy`) + // Add blank line before model name, except for first model + if (i > 0) + lines.push('') + lines.push(modelName) + for (const result of modelResults) { + const bar = createProgressBar(result.accuracy, 1, 20) + const accuracyStr = `${(result.accuracy * 100).toFixed(1)}%`.padStart(6) + const countStr = `(${result.correctCount}/${result.totalCount})` + lines.push(` ${result.format.padEnd(12)} ${bar} ${accuracyStr} ${countStr}`) + } } lines.push('```', '') @@ -100,24 +106,12 @@ export function generateMarkdownReport( if (toon && json) { const tokenSavings = ((1 - toon.totalTokens / json.totalTokens) * 100).toFixed(1) lines.push( - `**TOON achieves ${(toon.accuracy * 100).toFixed(1)}% accuracy (vs JSON's ${(json.accuracy * 100).toFixed(1)}%) while using ${tokenSavings}% fewer tokens.**`, + `**Tradeoff:** TOON achieves ${(toon.accuracy * 100).toFixed(1)}% accuracy (vs JSON's ${(json.accuracy * 100).toFixed(1)}%) while using ${tokenSavings}% fewer tokens.`, '', ) } - // Simple format comparison table - lines.push( - '| Format | Accuracy | Average Tokens |', - '| ------ | -------- | -------------- |', - ) - - for (const result of formatResults) { - lines.push( - `| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.totalTokens.toLocaleString()} |`, - ) - } - - lines.push('', '
', 'View detailed breakdown by dataset and model', '', '#### Performance by Dataset', '') + lines.push('
', 'View detailed breakdown by dataset and model', '', '#### Performance by Dataset', '') for (const dataset of datasets) { lines.push(`##### ${dataset.description}`, '') @@ -173,7 +167,7 @@ export function generateMarkdownReport( } // Model breakdown - lines.push('', '#### Performance by Model', '') + lines.push('#### Performance by Model', '') for (const modelName of Object.keys(models)) { lines.push(`##### ${modelName}`, '') @@ -203,7 +197,6 @@ export function generateMarkdownReport( // Methodology lines.push( - '', '#### Methodology', '', '- **Semantic validation**: LLM-as-judge validates responses semantically (not exact string matching).', @@ -252,20 +245,20 @@ export async function saveResults( // Save raw results await fsp.writeFile( path.join(resultsDir, 'raw-results.json'), - JSON.stringify(results, undefined, 2), + `${JSON.stringify(results, undefined, 2)}\n`, ) // Save summary await fsp.writeFile( path.join(resultsDir, 'summary.json'), - JSON.stringify({ + `${JSON.stringify({ formatResults, questions: questions.length, models: Object.keys(models), datasets: datasets.map(d => ({ name: d.name, description: d.description })), tokenCounts, timestamp: new Date().toISOString(), - }, undefined, 2), + }, undefined, 2)}\n`, ) // Generate markdown report @@ -279,9 +272,9 @@ export async function saveResults( } /** - * Generate visual bar chart for token counts + * Generate visual progress bar using ASCII characters (█ for filled, ░ for empty) */ -function createTokenBar(tokens: number, maxTokens: number, width = 30): string { +function createProgressBar(tokens: number, maxTokens: number, width = 30): string { const filled = Math.round((tokens / maxTokens) * width) const empty = width - filled return '█'.repeat(filled) + '░'.repeat(empty) diff --git a/benchmarks/src/types.ts b/benchmarks/src/types.ts index bca48fa..11f8bcf 100644 --- a/benchmarks/src/types.ts +++ b/benchmarks/src/types.ts @@ -19,8 +19,8 @@ export interface EvaluationResult { expected: string actual: string correct: boolean - inputTokens: number - outputTokens: number + inputTokens?: number + outputTokens?: number latencyMs: number } @@ -28,8 +28,7 @@ export interface FormatResult { format: string accuracy: number totalTokens: number - avgInputTokens: number - avgLatency: number + averageLatency: number correctCount: number totalCount: number }