diff --git a/README.md b/README.md
index ca05be5..00393ef 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,13 @@ users[2]{id,name,role}:
2,Bob,user
```
+
+Another reason
+
+[xkcd 927: Standards](https://xkcd.com/927/)
+
+
+
> [!NOTE]
> I built TOON to save tokens when sending large datasets to LLMs at work, where I tend to have uniform arrays of objects that benefit from the tabular format.
@@ -225,7 +232,7 @@ claude-haiku-4-5
##### Uniform employee records (TOON optimal format)
| Format | Accuracy | Tokens | Correct/Total |
-|--------|----------|--------|---------------|
+| ------ | -------- | ------ | ------------- |
| `toon` | 86.2% | 2.483 | 100/116 |
| `csv` | 80.2% | 2.337 | 93/116 |
| `yaml` | 82.8% | 4.969 | 96/116 |
@@ -235,7 +242,7 @@ claude-haiku-4-5
##### E-commerce orders with nested structures
| Format | Accuracy | Tokens | Correct/Total |
-|--------|----------|--------|---------------|
+| ------ | -------- | ------ | ------------- |
| `toon` | 90.9% | 5.967 | 80/88 |
| `csv` | 90.9% | 6.735 | 80/88 |
| `yaml` | 89.8% | 7.328 | 79/88 |
@@ -245,17 +252,17 @@ claude-haiku-4-5
##### Time-series analytics data
| Format | Accuracy | Tokens | Correct/Total |
-|--------|----------|--------|---------------|
+| ------ | -------- | ------ | ------------- |
| `csv` | 87.9% | 1.393 | 51/58 |
| `toon` | 86.2% | 1.515 | 50/58 |
| `yaml` | 86.2% | 2.938 | 50/58 |
| `json` | 87.9% | 3.665 | 51/58 |
| `markdown-kv` | 86.2% | 3.779 | 50/58 |
-##### Popular GitHub repositories
+##### Top 100 GitHub repositories
| Format | Accuracy | Tokens | Correct/Total |
-|--------|----------|--------|---------------|
+| ------ | -------- | ------ | ------------- |
| `csv` | 80.4% | 8.513 | 45/56 |
| `toon` | 80.4% | 8.745 | 45/56 |
| `yaml` | 78.6% | 13.129 | 44/56 |
@@ -267,7 +274,7 @@ claude-haiku-4-5
##### gpt-5-nano
| Format | Accuracy | Correct/Total |
-|--------|----------|---------------|
+| ------ | -------- | ------------- |
| `toon` | 97.5% | 155/159 |
| `markdown-kv` | 95.6% | 152/159 |
| `yaml` | 94.3% | 150/159 |
@@ -277,7 +284,7 @@ claude-haiku-4-5
##### claude-haiku-4-5
| Format | Accuracy | Correct/Total |
-|--------|----------|---------------|
+| ------ | -------- | ------------- |
| `markdown-kv` | 76.7% | 122/159 |
| `toon` | 75.5% | 120/159 |
| `json` | 75.5% | 120/159 |
diff --git a/benchmarks/results/accuracy/raw-results.json b/benchmarks/results/accuracy/raw-results.json
index adbe71c..f52e84f 100644
--- a/benchmarks/results/accuracy/raw-results.json
+++ b/benchmarks/results/accuracy/raw-results.json
@@ -5,7 +5,7 @@
"model": "gpt-5-nano",
"expected": "56176",
"actual": "56176",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6390,
"outputTokens": 72,
"latencyMs": 2221.390167
@@ -16,7 +16,7 @@
"model": "claude-haiku-4-5",
"expected": "56176",
"actual": "56176",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7870,
"outputTokens": 6,
"latencyMs": 1276.715333
@@ -27,7 +27,7 @@
"model": "gpt-5-nano",
"expected": "56176",
"actual": "56176",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2527,
"outputTokens": 72,
"latencyMs": 3718.250833
@@ -38,7 +38,7 @@
"model": "claude-haiku-4-5",
"expected": "56176",
"actual": "56176",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2982,
"outputTokens": 6,
"latencyMs": 1215.944708
@@ -49,7 +49,7 @@
"model": "gpt-5-nano",
"expected": "56176",
"actual": "56176",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2381,
"outputTokens": 72,
"latencyMs": 2417.306625
@@ -60,7 +60,7 @@
"model": "claude-haiku-4-5",
"expected": "56176",
"actual": "56176",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2856,
"outputTokens": 6,
"latencyMs": 1152.5258749999998
@@ -71,7 +71,7 @@
"model": "gpt-5-nano",
"expected": "56176",
"actual": "56176",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6316,
"outputTokens": 72,
"latencyMs": 4603.444417
@@ -82,7 +82,7 @@
"model": "claude-haiku-4-5",
"expected": "56176",
"actual": "56176",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6365,
"outputTokens": 6,
"latencyMs": 1390.011125
@@ -93,7 +93,7 @@
"model": "gpt-5-nano",
"expected": "56176",
"actual": "56176",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5012,
"outputTokens": 8,
"latencyMs": 4339.294459
@@ -104,7 +104,7 @@
"model": "claude-haiku-4-5",
"expected": "56176",
"actual": "56176",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5760,
"outputTokens": 6,
"latencyMs": 1374.47325
@@ -115,7 +115,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6390,
"outputTokens": 135,
"latencyMs": 2550.589042
@@ -126,7 +126,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7869,
"outputTokens": 4,
"latencyMs": 1139.559917
@@ -137,7 +137,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2527,
"outputTokens": 135,
"latencyMs": 2422.8178749999997
@@ -148,7 +148,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2981,
"outputTokens": 4,
"latencyMs": 1135.579459
@@ -159,7 +159,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2381,
"outputTokens": 71,
"latencyMs": 4198.553583999999
@@ -170,7 +170,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2855,
"outputTokens": 4,
"latencyMs": 1147.9685829999999
@@ -181,7 +181,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6316,
"outputTokens": 71,
"latencyMs": 2594.702667
@@ -192,7 +192,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6364,
"outputTokens": 4,
"latencyMs": 1568.4054999999998
@@ -203,7 +203,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5012,
"outputTokens": 71,
"latencyMs": 2516.345875
@@ -214,7 +214,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5759,
"outputTokens": 4,
"latencyMs": 1633.5375000000001
@@ -225,7 +225,7 @@
"model": "gpt-5-nano",
"expected": "lorenza.kunze@yahoo.com",
"actual": "lorenza.kunze@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6392,
"outputTokens": 76,
"latencyMs": 2079.8442499999996
@@ -236,7 +236,7 @@
"model": "claude-haiku-4-5",
"expected": "lorenza.kunze@yahoo.com",
"actual": "lorenza.kunze@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7874,
"outputTokens": 12,
"latencyMs": 1201.556458
@@ -247,7 +247,7 @@
"model": "gpt-5-nano",
"expected": "lorenza.kunze@yahoo.com",
"actual": "lorenza.kunze@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2529,
"outputTokens": 140,
"latencyMs": 2356.408
@@ -258,7 +258,7 @@
"model": "claude-haiku-4-5",
"expected": "lorenza.kunze@yahoo.com",
"actual": "lorenza.kunze@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2986,
"outputTokens": 12,
"latencyMs": 1113.255166
@@ -269,7 +269,7 @@
"model": "gpt-5-nano",
"expected": "lorenza.kunze@yahoo.com",
"actual": "lorenza.kunze@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2383,
"outputTokens": 140,
"latencyMs": 2188.5425419999997
@@ -280,7 +280,7 @@
"model": "claude-haiku-4-5",
"expected": "lorenza.kunze@yahoo.com",
"actual": "lorenza.kunze@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2860,
"outputTokens": 12,
"latencyMs": 1029.9496669999999
@@ -291,7 +291,7 @@
"model": "gpt-5-nano",
"expected": "lorenza.kunze@yahoo.com",
"actual": "lorenza.kunze@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6318,
"outputTokens": 140,
"latencyMs": 2605.8857080000002
@@ -302,7 +302,7 @@
"model": "claude-haiku-4-5",
"expected": "lorenza.kunze@yahoo.com",
"actual": "lorenza.kunze@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6369,
"outputTokens": 12,
"latencyMs": 1273.5997920000004
@@ -313,7 +313,7 @@
"model": "gpt-5-nano",
"expected": "lorenza.kunze@yahoo.com",
"actual": "lorenza.kunze@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5014,
"outputTokens": 140,
"latencyMs": 2530.4294580000005
@@ -324,7 +324,7 @@
"model": "claude-haiku-4-5",
"expected": "lorenza.kunze@yahoo.com",
"actual": "lorenza.kunze@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5764,
"outputTokens": 12,
"latencyMs": 1404.4837089999996
@@ -335,7 +335,7 @@
"model": "gpt-5-nano",
"expected": "117381",
"actual": "117381",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6390,
"outputTokens": 72,
"latencyMs": 2302.062125
@@ -346,7 +346,7 @@
"model": "claude-haiku-4-5",
"expected": "117381",
"actual": "117381",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7870,
"outputTokens": 6,
"latencyMs": 1114.0778329999998
@@ -357,7 +357,7 @@
"model": "gpt-5-nano",
"expected": "117381",
"actual": "117381",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2527,
"outputTokens": 72,
"latencyMs": 2006.7020830000001
@@ -368,7 +368,7 @@
"model": "claude-haiku-4-5",
"expected": "117381",
"actual": "117381",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2982,
"outputTokens": 6,
"latencyMs": 1641.5518749999997
@@ -379,7 +379,7 @@
"model": "gpt-5-nano",
"expected": "117381",
"actual": "117381",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2381,
"outputTokens": 136,
"latencyMs": 2850.351709
@@ -390,7 +390,7 @@
"model": "claude-haiku-4-5",
"expected": "117381",
"actual": "117381",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2856,
"outputTokens": 6,
"latencyMs": 1367.7319589999997
@@ -401,7 +401,7 @@
"model": "gpt-5-nano",
"expected": "117381",
"actual": "117381",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6316,
"outputTokens": 72,
"latencyMs": 2477.8365839999997
@@ -412,7 +412,7 @@
"model": "claude-haiku-4-5",
"expected": "117381",
"actual": "117381",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6365,
"outputTokens": 6,
"latencyMs": 1309.567083
@@ -423,7 +423,7 @@
"model": "gpt-5-nano",
"expected": "117381",
"actual": "117381",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5012,
"outputTokens": 72,
"latencyMs": 1794.2651250000008
@@ -434,7 +434,7 @@
"model": "claude-haiku-4-5",
"expected": "117381",
"actual": "117381",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5760,
"outputTokens": 6,
"latencyMs": 1177.5377079999998
@@ -445,7 +445,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6389,
"outputTokens": 71,
"latencyMs": 1963.9477500000003
@@ -456,7 +456,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7868,
"outputTokens": 4,
"latencyMs": 1024.5166669999999
@@ -467,7 +467,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2526,
"outputTokens": 135,
"latencyMs": 2291.4288749999996
@@ -478,7 +478,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2980,
"outputTokens": 4,
"latencyMs": 1312.7111250000007
@@ -489,7 +489,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2380,
"outputTokens": 135,
"latencyMs": 1727.6371660000004
@@ -500,7 +500,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2854,
"outputTokens": 4,
"latencyMs": 1097.0443749999995
@@ -511,7 +511,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6315,
"outputTokens": 135,
"latencyMs": 2671.2276250000004
@@ -522,7 +522,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6363,
"outputTokens": 4,
"latencyMs": 1174.8639999999996
@@ -533,7 +533,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5011,
"outputTokens": 71,
"latencyMs": 2306.2642499999993
@@ -544,7 +544,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5758,
"outputTokens": 4,
"latencyMs": 2822.8963750000003
@@ -555,7 +555,7 @@
"model": "gpt-5-nano",
"expected": "jayda60@hotmail.com",
"actual": "jayda60@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6390,
"outputTokens": 139,
"latencyMs": 2827.0400409999993
@@ -566,7 +566,7 @@
"model": "claude-haiku-4-5",
"expected": "jayda60@hotmail.com",
"actual": "jayda60@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7871,
"outputTokens": 11,
"latencyMs": 1151.7215829999996
@@ -577,7 +577,7 @@
"model": "gpt-5-nano",
"expected": "jayda60@hotmail.com",
"actual": "jayda60@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2527,
"outputTokens": 75,
"latencyMs": 1714.2902919999997
@@ -588,7 +588,7 @@
"model": "claude-haiku-4-5",
"expected": "jayda60@hotmail.com",
"actual": "jayda60@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2983,
"outputTokens": 11,
"latencyMs": 1810.6344170000011
@@ -599,7 +599,7 @@
"model": "gpt-5-nano",
"expected": "jayda60@hotmail.com",
"actual": "jayda60@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2381,
"outputTokens": 75,
"latencyMs": 2548.0390000000007
@@ -610,7 +610,7 @@
"model": "claude-haiku-4-5",
"expected": "jayda60@hotmail.com",
"actual": "jayda60@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2857,
"outputTokens": 11,
"latencyMs": 1046.7650829999993
@@ -621,7 +621,7 @@
"model": "gpt-5-nano",
"expected": "jayda60@hotmail.com",
"actual": "jayda60@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6316,
"outputTokens": 139,
"latencyMs": 2408.879916000001
@@ -632,7 +632,7 @@
"model": "claude-haiku-4-5",
"expected": "jayda60@hotmail.com",
"actual": "jayda60@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6366,
"outputTokens": 11,
"latencyMs": 1186.5773750000008
@@ -643,7 +643,7 @@
"model": "gpt-5-nano",
"expected": "jayda60@hotmail.com",
"actual": "jayda60@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5012,
"outputTokens": 139,
"latencyMs": 3157.9398329999995
@@ -654,7 +654,7 @@
"model": "claude-haiku-4-5",
"expected": "jayda60@hotmail.com",
"actual": "jayda60@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5761,
"outputTokens": 11,
"latencyMs": 1129.6754170000004
@@ -665,7 +665,7 @@
"model": "gpt-5-nano",
"expected": "92971",
"actual": "92971",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6390,
"outputTokens": 72,
"latencyMs": 2893.3476250000003
@@ -676,7 +676,7 @@
"model": "claude-haiku-4-5",
"expected": "92971",
"actual": "92971",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7870,
"outputTokens": 6,
"latencyMs": 1288.7682919999988
@@ -687,7 +687,7 @@
"model": "gpt-5-nano",
"expected": "92971",
"actual": "92971",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2527,
"outputTokens": 72,
"latencyMs": 2324.6738330000007
@@ -698,7 +698,7 @@
"model": "claude-haiku-4-5",
"expected": "92971",
"actual": "92971",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2982,
"outputTokens": 6,
"latencyMs": 1095.704291
@@ -709,7 +709,7 @@
"model": "gpt-5-nano",
"expected": "92971",
"actual": "92971",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2381,
"outputTokens": 136,
"latencyMs": 3980.3727500000005
@@ -720,7 +720,7 @@
"model": "claude-haiku-4-5",
"expected": "92971",
"actual": "92971",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2856,
"outputTokens": 6,
"latencyMs": 1122.8730419999993
@@ -731,7 +731,7 @@
"model": "gpt-5-nano",
"expected": "92971",
"actual": "92971",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6316,
"outputTokens": 72,
"latencyMs": 2030.0818330000002
@@ -742,7 +742,7 @@
"model": "claude-haiku-4-5",
"expected": "92971",
"actual": "92971",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6365,
"outputTokens": 6,
"latencyMs": 1705.6364999999987
@@ -753,7 +753,7 @@
"model": "gpt-5-nano",
"expected": "92971",
"actual": "92971",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5012,
"outputTokens": 72,
"latencyMs": 1611.3567500000008
@@ -764,7 +764,7 @@
"model": "claude-haiku-4-5",
"expected": "92971",
"actual": "92971",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5760,
"outputTokens": 6,
"latencyMs": 1109.0094590000008
@@ -775,7 +775,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6390,
"outputTokens": 199,
"latencyMs": 3099.078125
@@ -786,7 +786,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7871,
"outputTokens": 4,
"latencyMs": 1115.9911250000005
@@ -797,7 +797,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2527,
"outputTokens": 135,
"latencyMs": 2833.193875000001
@@ -808,7 +808,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2983,
"outputTokens": 4,
"latencyMs": 933.1444169999995
@@ -819,7 +819,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2381,
"outputTokens": 199,
"latencyMs": 2315.536
@@ -830,7 +830,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2857,
"outputTokens": 4,
"latencyMs": 1300.336792
@@ -841,7 +841,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6316,
"outputTokens": 135,
"latencyMs": 7016.997917000002
@@ -852,7 +852,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6366,
"outputTokens": 4,
"latencyMs": 1288.107333
@@ -863,7 +863,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5012,
"outputTokens": 135,
"latencyMs": 2474.8247499999998
@@ -874,7 +874,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5761,
"outputTokens": 4,
"latencyMs": 1027.9775420000005
@@ -885,7 +885,7 @@
"model": "gpt-5-nano",
"expected": "terrance.hansen@yahoo.com",
"actual": "terrance.hansen@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6392,
"outputTokens": 652,
"latencyMs": 8322.172416
@@ -896,7 +896,7 @@
"model": "claude-haiku-4-5",
"expected": "terrance.hansen@yahoo.com",
"actual": "terrance.hansen@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7871,
"outputTokens": 11,
"latencyMs": 1066.3422090000004
@@ -907,7 +907,7 @@
"model": "gpt-5-nano",
"expected": "terrance.hansen@yahoo.com",
"actual": "terrance.hansen@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2529,
"outputTokens": 76,
"latencyMs": 2245.5604999999996
@@ -918,7 +918,7 @@
"model": "claude-haiku-4-5",
"expected": "terrance.hansen@yahoo.com",
"actual": "terrance.hansen@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2983,
"outputTokens": 11,
"latencyMs": 1179.7512079999997
@@ -929,7 +929,7 @@
"model": "gpt-5-nano",
"expected": "terrance.hansen@yahoo.com",
"actual": "terrance.hansen@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2383,
"outputTokens": 204,
"latencyMs": 2584.0723340000004
@@ -940,7 +940,7 @@
"model": "claude-haiku-4-5",
"expected": "terrance.hansen@yahoo.com",
"actual": "terrance.hansen@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2857,
"outputTokens": 11,
"latencyMs": 1204.6979589999992
@@ -951,7 +951,7 @@
"model": "gpt-5-nano",
"expected": "terrance.hansen@yahoo.com",
"actual": "terrance.hansen@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6318,
"outputTokens": 396,
"latencyMs": 3824.918375000001
@@ -962,7 +962,7 @@
"model": "claude-haiku-4-5",
"expected": "terrance.hansen@yahoo.com",
"actual": "terrance.hansen@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6366,
"outputTokens": 11,
"latencyMs": 1492.6765830000004
@@ -973,7 +973,7 @@
"model": "gpt-5-nano",
"expected": "terrance.hansen@yahoo.com",
"actual": "terrance.hansen@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5014,
"outputTokens": 76,
"latencyMs": 1834.562
@@ -984,7 +984,7 @@
"model": "claude-haiku-4-5",
"expected": "terrance.hansen@yahoo.com",
"actual": "terrance.hansen@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5761,
"outputTokens": 11,
"latencyMs": 1245.0000419999997
@@ -995,7 +995,7 @@
"model": "gpt-5-nano",
"expected": "107744",
"actual": "107744",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6391,
"outputTokens": 136,
"latencyMs": 2337.0652499999997
@@ -1006,7 +1006,7 @@
"model": "claude-haiku-4-5",
"expected": "107744",
"actual": "107744",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7870,
"outputTokens": 6,
"latencyMs": 1148.1971250000006
@@ -1017,7 +1017,7 @@
"model": "gpt-5-nano",
"expected": "107744",
"actual": "107744",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2528,
"outputTokens": 72,
"latencyMs": 2736.2375420000008
@@ -1028,7 +1028,7 @@
"model": "claude-haiku-4-5",
"expected": "107744",
"actual": "107744",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2982,
"outputTokens": 6,
"latencyMs": 1164.4291250000006
@@ -1039,7 +1039,7 @@
"model": "gpt-5-nano",
"expected": "107744",
"actual": "107744",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2382,
"outputTokens": 72,
"latencyMs": 2479.8535840000004
@@ -1050,7 +1050,7 @@
"model": "claude-haiku-4-5",
"expected": "107744",
"actual": "107744",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2856,
"outputTokens": 6,
"latencyMs": 1032.3198329999996
@@ -1061,7 +1061,7 @@
"model": "gpt-5-nano",
"expected": "107744",
"actual": "107744",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6317,
"outputTokens": 136,
"latencyMs": 2237.465583000001
@@ -1072,7 +1072,7 @@
"model": "claude-haiku-4-5",
"expected": "107744",
"actual": "107744",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6365,
"outputTokens": 6,
"latencyMs": 1254.3189160000002
@@ -1083,7 +1083,7 @@
"model": "gpt-5-nano",
"expected": "107744",
"actual": "107744",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5013,
"outputTokens": 72,
"latencyMs": 3753.917125
@@ -1094,7 +1094,7 @@
"model": "claude-haiku-4-5",
"expected": "107744",
"actual": "107744",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5760,
"outputTokens": 6,
"latencyMs": 1154.7003750000003
@@ -1105,7 +1105,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6390,
"outputTokens": 135,
"latencyMs": 2621.2275420000005
@@ -1116,7 +1116,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7869,
"outputTokens": 4,
"latencyMs": 1222.843499999999
@@ -1127,7 +1127,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2527,
"outputTokens": 71,
"latencyMs": 1762.1339159999989
@@ -1138,7 +1138,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2981,
"outputTokens": 4,
"latencyMs": 1630.7307079999991
@@ -1149,7 +1149,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2381,
"outputTokens": 71,
"latencyMs": 1848.9775829999999
@@ -1160,7 +1160,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2855,
"outputTokens": 4,
"latencyMs": 1080.8682500000014
@@ -1171,7 +1171,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6316,
"outputTokens": 135,
"latencyMs": 26303.357959
@@ -1182,7 +1182,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6364,
"outputTokens": 4,
"latencyMs": 1354.007999999998
@@ -1193,7 +1193,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5012,
"outputTokens": 71,
"latencyMs": 1924.4625829999986
@@ -1204,7 +1204,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5759,
"outputTokens": 4,
"latencyMs": 1279.5235830000001
@@ -1215,7 +1215,7 @@
"model": "gpt-5-nano",
"expected": "allan21@gmail.com",
"actual": "allan21@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6389,
"outputTokens": 330,
"latencyMs": 3997.3972079999985
@@ -1226,7 +1226,7 @@
"model": "claude-haiku-4-5",
"expected": "allan21@gmail.com",
"actual": "allan21@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7867,
"outputTokens": 9,
"latencyMs": 1153.9412079999984
@@ -1237,7 +1237,7 @@
"model": "gpt-5-nano",
"expected": "allan21@gmail.com",
"actual": "allan21@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2526,
"outputTokens": 138,
"latencyMs": 2494.580582999999
@@ -1248,7 +1248,7 @@
"model": "claude-haiku-4-5",
"expected": "allan21@gmail.com",
"actual": "allan21@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2979,
"outputTokens": 9,
"latencyMs": 1350.1353750000017
@@ -1259,7 +1259,7 @@
"model": "gpt-5-nano",
"expected": "allan21@gmail.com",
"actual": "allan21@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2380,
"outputTokens": 138,
"latencyMs": 3024.4009160000023
@@ -1270,7 +1270,7 @@
"model": "claude-haiku-4-5",
"expected": "allan21@gmail.com",
"actual": "allan21@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2853,
"outputTokens": 9,
"latencyMs": 1199.3955830000014
@@ -1281,7 +1281,7 @@
"model": "gpt-5-nano",
"expected": "allan21@gmail.com",
"actual": "allan21@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6315,
"outputTokens": 138,
"latencyMs": 5168.116582999999
@@ -1292,7 +1292,7 @@
"model": "claude-haiku-4-5",
"expected": "allan21@gmail.com",
"actual": "allan21@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6362,
"outputTokens": 9,
"latencyMs": 1198.3554160000022
@@ -1303,7 +1303,7 @@
"model": "gpt-5-nano",
"expected": "allan21@gmail.com",
"actual": "allan21@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5011,
"outputTokens": 74,
"latencyMs": 2632.998958999997
@@ -1314,7 +1314,7 @@
"model": "claude-haiku-4-5",
"expected": "allan21@gmail.com",
"actual": "allan21@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5757,
"outputTokens": 9,
"latencyMs": 1124.5625419999997
@@ -1325,7 +1325,7 @@
"model": "gpt-5-nano",
"expected": "145843",
"actual": "145843",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6388,
"outputTokens": 72,
"latencyMs": 2357.2276249999995
@@ -1336,7 +1336,7 @@
"model": "claude-haiku-4-5",
"expected": "145843",
"actual": "145843",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7868,
"outputTokens": 6,
"latencyMs": 1267.960791999998
@@ -1347,7 +1347,7 @@
"model": "gpt-5-nano",
"expected": "145843",
"actual": "145843",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2525,
"outputTokens": 136,
"latencyMs": 2397.798125000001
@@ -1358,7 +1358,7 @@
"model": "claude-haiku-4-5",
"expected": "145843",
"actual": "145843",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2980,
"outputTokens": 6,
"latencyMs": 1170.6429580000004
@@ -1369,7 +1369,7 @@
"model": "gpt-5-nano",
"expected": "145843",
"actual": "145843",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2379,
"outputTokens": 136,
"latencyMs": 3227.198124999999
@@ -1380,7 +1380,7 @@
"model": "claude-haiku-4-5",
"expected": "145843",
"actual": "145843",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2854,
"outputTokens": 6,
"latencyMs": 1112.6066250000003
@@ -1391,7 +1391,7 @@
"model": "gpt-5-nano",
"expected": "145843",
"actual": "145843",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6314,
"outputTokens": 72,
"latencyMs": 2036.251791999999
@@ -1402,7 +1402,7 @@
"model": "claude-haiku-4-5",
"expected": "145843",
"actual": "145843",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6363,
"outputTokens": 6,
"latencyMs": 1290.7641250000015
@@ -1413,7 +1413,7 @@
"model": "gpt-5-nano",
"expected": "145843",
"actual": "145843",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5010,
"outputTokens": 72,
"latencyMs": 2262.8405840000014
@@ -1424,7 +1424,7 @@
"model": "claude-haiku-4-5",
"expected": "145843",
"actual": "145843",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5758,
"outputTokens": 6,
"latencyMs": 1193.2695419999982
@@ -1435,7 +1435,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6389,
"outputTokens": 71,
"latencyMs": 3198.2654159999984
@@ -1446,7 +1446,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7868,
"outputTokens": 4,
"latencyMs": 1229.8644999999997
@@ -1457,7 +1457,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2526,
"outputTokens": 71,
"latencyMs": 3293.710084000002
@@ -1468,7 +1468,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2980,
"outputTokens": 4,
"latencyMs": 1121.200334000001
@@ -1479,7 +1479,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2380,
"outputTokens": 71,
"latencyMs": 2497.4451249999984
@@ -1490,7 +1490,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2854,
"outputTokens": 4,
"latencyMs": 1152.0107500000013
@@ -1501,7 +1501,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6315,
"outputTokens": 71,
"latencyMs": 3547.6399999999994
@@ -1512,7 +1512,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6363,
"outputTokens": 4,
"latencyMs": 2007.6731249999975
@@ -1523,7 +1523,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5011,
"outputTokens": 71,
"latencyMs": 7054.295208
@@ -1534,7 +1534,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5758,
"outputTokens": 4,
"latencyMs": 1230.5032920000012
@@ -1545,7 +1545,7 @@
"model": "gpt-5-nano",
"expected": "alexandria61@gmail.com",
"actual": "alexandria61@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6390,
"outputTokens": 76,
"latencyMs": 2049.933416
@@ -1556,7 +1556,7 @@
"model": "claude-haiku-4-5",
"expected": "alexandria61@gmail.com",
"actual": "alexandria61@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7869,
"outputTokens": 9,
"latencyMs": 1217.1906249999993
@@ -1567,7 +1567,7 @@
"model": "gpt-5-nano",
"expected": "alexandria61@gmail.com",
"actual": "alexandria61@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2527,
"outputTokens": 204,
"latencyMs": 2844.136208
@@ -1578,7 +1578,7 @@
"model": "claude-haiku-4-5",
"expected": "alexandria61@gmail.com",
"actual": "alexandria61@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2981,
"outputTokens": 9,
"latencyMs": 2166.8829589999987
@@ -1589,7 +1589,7 @@
"model": "gpt-5-nano",
"expected": "alexandria61@gmail.com",
"actual": "alexandria61@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2381,
"outputTokens": 204,
"latencyMs": 2726.5934579999994
@@ -1600,7 +1600,7 @@
"model": "claude-haiku-4-5",
"expected": "alexandria61@gmail.com",
"actual": "alexandria61@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2855,
"outputTokens": 9,
"latencyMs": 1107.4675410000018
@@ -1611,7 +1611,7 @@
"model": "gpt-5-nano",
"expected": "alexandria61@gmail.com",
"actual": "alexandria61@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6316,
"outputTokens": 76,
"latencyMs": 2260.4548749999994
@@ -1622,7 +1622,7 @@
"model": "claude-haiku-4-5",
"expected": "alexandria61@gmail.com",
"actual": "alexandria61@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6364,
"outputTokens": 9,
"latencyMs": 1257.2797080000018
@@ -1633,7 +1633,7 @@
"model": "gpt-5-nano",
"expected": "alexandria61@gmail.com",
"actual": "alexandria61@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5012,
"outputTokens": 140,
"latencyMs": 2565.571791999999
@@ -1644,7 +1644,7 @@
"model": "claude-haiku-4-5",
"expected": "alexandria61@gmail.com",
"actual": "alexandria61@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5759,
"outputTokens": 9,
"latencyMs": 1255.2880829999995
@@ -1655,7 +1655,7 @@
"model": "gpt-5-nano",
"expected": "89436",
"actual": "89436",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6389,
"outputTokens": 136,
"latencyMs": 2595.422042000002
@@ -1666,7 +1666,7 @@
"model": "claude-haiku-4-5",
"expected": "89436",
"actual": "89436",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7870,
"outputTokens": 6,
"latencyMs": 1090.4299170000013
@@ -1677,7 +1677,7 @@
"model": "gpt-5-nano",
"expected": "89436",
"actual": "89436",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2526,
"outputTokens": 72,
"latencyMs": 2985.3881250000013
@@ -1688,7 +1688,7 @@
"model": "claude-haiku-4-5",
"expected": "89436",
"actual": "89436",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2982,
"outputTokens": 6,
"latencyMs": 1521.227415999998
@@ -1699,7 +1699,7 @@
"model": "gpt-5-nano",
"expected": "89436",
"actual": "89436",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2380,
"outputTokens": 72,
"latencyMs": 2918.142082999999
@@ -1710,7 +1710,7 @@
"model": "claude-haiku-4-5",
"expected": "89436",
"actual": "89436",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2856,
"outputTokens": 6,
"latencyMs": 1049.085916
@@ -1721,7 +1721,7 @@
"model": "gpt-5-nano",
"expected": "89436",
"actual": "89436",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6315,
"outputTokens": 136,
"latencyMs": 2414.9711669999997
@@ -1732,7 +1732,7 @@
"model": "claude-haiku-4-5",
"expected": "89436",
"actual": "89436",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6365,
"outputTokens": 6,
"latencyMs": 1178.0064170000005
@@ -1743,7 +1743,7 @@
"model": "gpt-5-nano",
"expected": "89436",
"actual": "89436",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5011,
"outputTokens": 72,
"latencyMs": 1772.788625000001
@@ -1754,7 +1754,7 @@
"model": "claude-haiku-4-5",
"expected": "89436",
"actual": "89436",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5760,
"outputTokens": 6,
"latencyMs": 1134.7022499999985
@@ -1765,7 +1765,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6392,
"outputTokens": 135,
"latencyMs": 2528.6098330000023
@@ -1776,7 +1776,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7872,
"outputTokens": 4,
"latencyMs": 1353.3026250000003
@@ -1787,7 +1787,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2529,
"outputTokens": 71,
"latencyMs": 2286.120999999999
@@ -1798,7 +1798,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2984,
"outputTokens": 4,
"latencyMs": 961.078292000002
@@ -1809,7 +1809,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2383,
"outputTokens": 71,
"latencyMs": 3445.204249999999
@@ -1820,7 +1820,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2858,
"outputTokens": 4,
"latencyMs": 1003.445125000002
@@ -1831,7 +1831,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6318,
"outputTokens": 135,
"latencyMs": 2696.166874999999
@@ -1842,7 +1842,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6367,
"outputTokens": 4,
"latencyMs": 1063.340791999999
@@ -1853,7 +1853,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5014,
"outputTokens": 135,
"latencyMs": 3367.6109579999975
@@ -1864,7 +1864,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5762,
"outputTokens": 4,
"latencyMs": 1322.4013339999983
@@ -1875,7 +1875,7 @@
"model": "gpt-5-nano",
"expected": "kelvin54@yahoo.com",
"actual": "kelvin54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6390,
"outputTokens": 139,
"latencyMs": 2745.6627499999995
@@ -1886,7 +1886,7 @@
"model": "claude-haiku-4-5",
"expected": "kelvin54@yahoo.com",
"actual": "kelvin54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7871,
"outputTokens": 10,
"latencyMs": 1312.9286670000001
@@ -1897,7 +1897,7 @@
"model": "gpt-5-nano",
"expected": "kelvin54@yahoo.com",
"actual": "kelvin54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2527,
"outputTokens": 1483,
"latencyMs": 13678.859999999997
@@ -1908,7 +1908,7 @@
"model": "claude-haiku-4-5",
"expected": "kelvin54@yahoo.com",
"actual": "kelvin54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2983,
"outputTokens": 10,
"latencyMs": 1030.3843339999985
@@ -1919,7 +1919,7 @@
"model": "gpt-5-nano",
"expected": "kelvin54@yahoo.com",
"actual": "kelvin54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2381,
"outputTokens": 139,
"latencyMs": 2223.2737909999996
@@ -1930,7 +1930,7 @@
"model": "claude-haiku-4-5",
"expected": "kelvin54@yahoo.com",
"actual": "kelvin54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2857,
"outputTokens": 10,
"latencyMs": 1224.2647080000024
@@ -1941,7 +1941,7 @@
"model": "gpt-5-nano",
"expected": "kelvin54@yahoo.com",
"actual": "kelvin54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6316,
"outputTokens": 139,
"latencyMs": 3198.8672499999993
@@ -1952,7 +1952,7 @@
"model": "claude-haiku-4-5",
"expected": "kelvin54@yahoo.com",
"actual": "kelvin54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6366,
"outputTokens": 10,
"latencyMs": 1234.557084
@@ -1963,7 +1963,7 @@
"model": "gpt-5-nano",
"expected": "kelvin54@yahoo.com",
"actual": "kelvin54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5012,
"outputTokens": 139,
"latencyMs": 2861.692708999999
@@ -1974,7 +1974,7 @@
"model": "claude-haiku-4-5",
"expected": "kelvin54@yahoo.com",
"actual": "kelvin54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5761,
"outputTokens": 10,
"latencyMs": 1284.2591250000005
@@ -1985,7 +1985,7 @@
"model": "gpt-5-nano",
"expected": "143365",
"actual": "143365",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6390,
"outputTokens": 136,
"latencyMs": 2741.803499999998
@@ -1996,7 +1996,7 @@
"model": "claude-haiku-4-5",
"expected": "143365",
"actual": "143365",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7872,
"outputTokens": 6,
"latencyMs": 1096.6906249999993
@@ -2007,7 +2007,7 @@
"model": "gpt-5-nano",
"expected": "143365",
"actual": "143365",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2527,
"outputTokens": 136,
"latencyMs": 3692.904416999998
@@ -2018,7 +2018,7 @@
"model": "claude-haiku-4-5",
"expected": "143365",
"actual": "143365",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2984,
"outputTokens": 6,
"latencyMs": 1516.7794159999976
@@ -2029,7 +2029,7 @@
"model": "gpt-5-nano",
"expected": "143365",
"actual": "143365",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2381,
"outputTokens": 392,
"latencyMs": 5068.4152909999975
@@ -2040,7 +2040,7 @@
"model": "claude-haiku-4-5",
"expected": "143365",
"actual": "143365",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2858,
"outputTokens": 6,
"latencyMs": 1356.2728330000027
@@ -2051,7 +2051,7 @@
"model": "gpt-5-nano",
"expected": "143365",
"actual": "143365",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6316,
"outputTokens": 136,
"latencyMs": 2866.8642500000024
@@ -2062,7 +2062,7 @@
"model": "claude-haiku-4-5",
"expected": "143365",
"actual": "143365",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6367,
"outputTokens": 6,
"latencyMs": 1462.041624999998
@@ -2073,7 +2073,7 @@
"model": "gpt-5-nano",
"expected": "143365",
"actual": "143365",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5012,
"outputTokens": 72,
"latencyMs": 2320.320083999999
@@ -2084,7 +2084,7 @@
"model": "claude-haiku-4-5",
"expected": "143365",
"actual": "143365",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5762,
"outputTokens": 6,
"latencyMs": 1082.976666999999
@@ -2095,7 +2095,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6389,
"outputTokens": 7,
"latencyMs": 2427.6330409999973
@@ -2106,7 +2106,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7868,
"outputTokens": 4,
"latencyMs": 1108.7309170000008
@@ -2117,7 +2117,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2526,
"outputTokens": 71,
"latencyMs": 4405.948458000003
@@ -2128,7 +2128,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2980,
"outputTokens": 4,
"latencyMs": 1235.6647919999996
@@ -2139,7 +2139,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2380,
"outputTokens": 71,
"latencyMs": 2528.553082999999
@@ -2150,7 +2150,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2854,
"outputTokens": 4,
"latencyMs": 974.1328329999997
@@ -2161,7 +2161,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6315,
"outputTokens": 135,
"latencyMs": 2243.1775420000013
@@ -2172,7 +2172,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6363,
"outputTokens": 4,
"latencyMs": 2416.867124999997
@@ -2183,7 +2183,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5011,
"outputTokens": 135,
"latencyMs": 2429.5548750000016
@@ -2194,7 +2194,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5758,
"outputTokens": 4,
"latencyMs": 1257.326083
@@ -2205,7 +2205,7 @@
"model": "gpt-5-nano",
"expected": "dean19@gmail.com",
"actual": "dean19@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6393,
"outputTokens": 203,
"latencyMs": 4366.677041999996
@@ -2216,7 +2216,7 @@
"model": "claude-haiku-4-5",
"expected": "dean19@gmail.com",
"actual": "dean19@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7876,
"outputTokens": 9,
"latencyMs": 1410.3295419999995
@@ -2227,7 +2227,7 @@
"model": "gpt-5-nano",
"expected": "dean19@gmail.com",
"actual": "dean19@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2530,
"outputTokens": 75,
"latencyMs": 2834.2883330000004
@@ -2238,7 +2238,7 @@
"model": "claude-haiku-4-5",
"expected": "dean19@gmail.com",
"actual": "dean19@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2988,
"outputTokens": 9,
"latencyMs": 1023.437750000001
@@ -2249,7 +2249,7 @@
"model": "gpt-5-nano",
"expected": "dean19@gmail.com",
"actual": "dean19@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2384,
"outputTokens": 139,
"latencyMs": 3091.7722909999975
@@ -2260,7 +2260,7 @@
"model": "claude-haiku-4-5",
"expected": "dean19@gmail.com",
"actual": "dean19@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2862,
"outputTokens": 9,
"latencyMs": 1910.5562920000011
@@ -2271,7 +2271,7 @@
"model": "gpt-5-nano",
"expected": "dean19@gmail.com",
"actual": "dean19@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6319,
"outputTokens": 75,
"latencyMs": 2335.239207999999
@@ -2282,7 +2282,7 @@
"model": "claude-haiku-4-5",
"expected": "dean19@gmail.com",
"actual": "dean19@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6371,
"outputTokens": 9,
"latencyMs": 1145.7144169999992
@@ -2293,7 +2293,7 @@
"model": "gpt-5-nano",
"expected": "dean19@gmail.com",
"actual": "dean19@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5015,
"outputTokens": 75,
"latencyMs": 2204.0944169999966
@@ -2304,7 +2304,7 @@
"model": "claude-haiku-4-5",
"expected": "dean19@gmail.com",
"actual": "dean19@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5766,
"outputTokens": 9,
"latencyMs": 1102.2122499999969
@@ -2315,7 +2315,7 @@
"model": "gpt-5-nano",
"expected": "111314",
"actual": "111314",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6391,
"outputTokens": 200,
"latencyMs": 3785.0480830000015
@@ -2326,7 +2326,7 @@
"model": "claude-haiku-4-5",
"expected": "111314",
"actual": "111314",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7871,
"outputTokens": 6,
"latencyMs": 1147.6056669999962
@@ -2337,7 +2337,7 @@
"model": "gpt-5-nano",
"expected": "111314",
"actual": "111314",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2528,
"outputTokens": 72,
"latencyMs": 3996.1190410000054
@@ -2348,7 +2348,7 @@
"model": "claude-haiku-4-5",
"expected": "111314",
"actual": "111314",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2983,
"outputTokens": 6,
"latencyMs": 1101.5621670000037
@@ -2359,7 +2359,7 @@
"model": "gpt-5-nano",
"expected": "111314",
"actual": "111314",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2382,
"outputTokens": 136,
"latencyMs": 2563.2732499999984
@@ -2370,7 +2370,7 @@
"model": "claude-haiku-4-5",
"expected": "111314",
"actual": "111314",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2857,
"outputTokens": 6,
"latencyMs": 1224.5424589999966
@@ -2381,7 +2381,7 @@
"model": "gpt-5-nano",
"expected": "111314",
"actual": "111314",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6317,
"outputTokens": 136,
"latencyMs": 2436.8848329999964
@@ -2392,7 +2392,7 @@
"model": "claude-haiku-4-5",
"expected": "111314",
"actual": "111314",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6366,
"outputTokens": 6,
"latencyMs": 1500.1066250000003
@@ -2403,7 +2403,7 @@
"model": "gpt-5-nano",
"expected": "111314",
"actual": "111314",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5013,
"outputTokens": 72,
"latencyMs": 2529.925833000001
@@ -2414,7 +2414,7 @@
"model": "claude-haiku-4-5",
"expected": "111314",
"actual": "111314",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5761,
"outputTokens": 6,
"latencyMs": 1701.0276660000018
@@ -2425,7 +2425,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6388,
"outputTokens": 135,
"latencyMs": 3078.5496249999997
@@ -2436,7 +2436,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7868,
"outputTokens": 4,
"latencyMs": 1224.1848329999993
@@ -2447,7 +2447,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2525,
"outputTokens": 71,
"latencyMs": 2287.0156669999997
@@ -2458,7 +2458,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2980,
"outputTokens": 4,
"latencyMs": 1209.1454999999987
@@ -2469,7 +2469,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2379,
"outputTokens": 71,
"latencyMs": 2059.012499999997
@@ -2480,7 +2480,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2854,
"outputTokens": 4,
"latencyMs": 1393.596375000001
@@ -2491,7 +2491,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6314,
"outputTokens": 71,
"latencyMs": 1858.8989159999983
@@ -2502,7 +2502,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6363,
"outputTokens": 4,
"latencyMs": 1193.9375419999997
@@ -2513,7 +2513,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5010,
"outputTokens": 135,
"latencyMs": 2755.0157499999987
@@ -2524,7 +2524,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5758,
"outputTokens": 4,
"latencyMs": 1366.030666999999
@@ -2535,7 +2535,7 @@
"model": "gpt-5-nano",
"expected": "laurel54@yahoo.com",
"actual": "laurel54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6390,
"outputTokens": 395,
"latencyMs": 4352.137999999999
@@ -2546,7 +2546,7 @@
"model": "claude-haiku-4-5",
"expected": "laurel54@yahoo.com",
"actual": "laurel54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7869,
"outputTokens": 10,
"latencyMs": 1093.9707500000004
@@ -2557,7 +2557,7 @@
"model": "gpt-5-nano",
"expected": "laurel54@yahoo.com",
"actual": "laurel54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2527,
"outputTokens": 139,
"latencyMs": 2481.934500000003
@@ -2568,7 +2568,7 @@
"model": "claude-haiku-4-5",
"expected": "laurel54@yahoo.com",
"actual": "laurel54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2981,
"outputTokens": 10,
"latencyMs": 1262.3894579999978
@@ -2579,7 +2579,7 @@
"model": "gpt-5-nano",
"expected": "laurel54@yahoo.com",
"actual": "laurel54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2381,
"outputTokens": 75,
"latencyMs": 2360.7159170000014
@@ -2590,7 +2590,7 @@
"model": "claude-haiku-4-5",
"expected": "laurel54@yahoo.com",
"actual": "laurel54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2855,
"outputTokens": 10,
"latencyMs": 1462.5894999999946
@@ -2601,7 +2601,7 @@
"model": "gpt-5-nano",
"expected": "laurel54@yahoo.com",
"actual": "laurel54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6316,
"outputTokens": 75,
"latencyMs": 3247.478041000002
@@ -2612,7 +2612,7 @@
"model": "claude-haiku-4-5",
"expected": "laurel54@yahoo.com",
"actual": "laurel54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6364,
"outputTokens": 10,
"latencyMs": 1693.1597089999996
@@ -2623,7 +2623,7 @@
"model": "gpt-5-nano",
"expected": "laurel54@yahoo.com",
"actual": "laurel54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5012,
"outputTokens": 75,
"latencyMs": 1726.2765839999993
@@ -2634,7 +2634,7 @@
"model": "claude-haiku-4-5",
"expected": "laurel54@yahoo.com",
"actual": "laurel54@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5759,
"outputTokens": 10,
"latencyMs": 1605.044458000004
@@ -2645,7 +2645,7 @@
"model": "gpt-5-nano",
"expected": "89553",
"actual": "89553",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6391,
"outputTokens": 136,
"latencyMs": 2263.1207090000025
@@ -2656,7 +2656,7 @@
"model": "claude-haiku-4-5",
"expected": "89553",
"actual": "89553",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7873,
"outputTokens": 6,
"latencyMs": 3789.016875000001
@@ -2667,7 +2667,7 @@
"model": "gpt-5-nano",
"expected": "89553",
"actual": "89553",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2528,
"outputTokens": 72,
"latencyMs": 1829.9641669999983
@@ -2678,7 +2678,7 @@
"model": "claude-haiku-4-5",
"expected": "89553",
"actual": "89553",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2985,
"outputTokens": 6,
"latencyMs": 989.6153750000012
@@ -2689,7 +2689,7 @@
"model": "gpt-5-nano",
"expected": "89553",
"actual": "89553",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2382,
"outputTokens": 72,
"latencyMs": 2717.4773339999956
@@ -2700,7 +2700,7 @@
"model": "claude-haiku-4-5",
"expected": "89553",
"actual": "89553",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2859,
"outputTokens": 6,
"latencyMs": 1717.8889999999956
@@ -2711,7 +2711,7 @@
"model": "gpt-5-nano",
"expected": "89553",
"actual": "46730",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6317,
"outputTokens": 72,
"latencyMs": 5490.572667
@@ -2722,7 +2722,7 @@
"model": "claude-haiku-4-5",
"expected": "89553",
"actual": "89553",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6368,
"outputTokens": 6,
"latencyMs": 1427.4055000000008
@@ -2733,7 +2733,7 @@
"model": "gpt-5-nano",
"expected": "89553",
"actual": "89553",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5013,
"outputTokens": 264,
"latencyMs": 4052.875957999997
@@ -2744,7 +2744,7 @@
"model": "claude-haiku-4-5",
"expected": "89553",
"actual": "89553",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5763,
"outputTokens": 6,
"latencyMs": 1586.255124999996
@@ -2755,7 +2755,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6388,
"outputTokens": 135,
"latencyMs": 3787.343541000002
@@ -2766,7 +2766,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7866,
"outputTokens": 4,
"latencyMs": 1196.934000000001
@@ -2777,7 +2777,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2525,
"outputTokens": 71,
"latencyMs": 2172.2377080000006
@@ -2788,7 +2788,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2978,
"outputTokens": 4,
"latencyMs": 1112.6987080000035
@@ -2799,7 +2799,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2379,
"outputTokens": 71,
"latencyMs": 2074.6067919999987
@@ -2810,7 +2810,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2852,
"outputTokens": 4,
"latencyMs": 1202.2165000000023
@@ -2821,7 +2821,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6314,
"outputTokens": 135,
"latencyMs": 3257.5967080000046
@@ -2832,7 +2832,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6361,
"outputTokens": 4,
"latencyMs": 1316.7435000000041
@@ -2843,7 +2843,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5010,
"outputTokens": 71,
"latencyMs": 2391.9063749999987
@@ -2854,7 +2854,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5756,
"outputTokens": 4,
"latencyMs": 1208.8820829999968
@@ -2865,7 +2865,7 @@
"model": "gpt-5-nano",
"expected": "jayme.kertzmann77@gmail.com",
"actual": "jayme.kertzmann77@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6391,
"outputTokens": 142,
"latencyMs": 2735.679790999995
@@ -2876,7 +2876,7 @@
"model": "claude-haiku-4-5",
"expected": "jayme.kertzmann77@gmail.com",
"actual": "jayme.kertzmann77@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7871,
"outputTokens": 14,
"latencyMs": 1253.706624999999
@@ -2887,7 +2887,7 @@
"model": "gpt-5-nano",
"expected": "jayme.kertzmann77@gmail.com",
"actual": "jayme.kertzmann77@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2528,
"outputTokens": 142,
"latencyMs": 2471.819457999998
@@ -2898,7 +2898,7 @@
"model": "claude-haiku-4-5",
"expected": "jayme.kertzmann77@gmail.com",
"actual": "jayme.kertzmann77@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2983,
"outputTokens": 14,
"latencyMs": 1063.2195409999986
@@ -2909,7 +2909,7 @@
"model": "gpt-5-nano",
"expected": "jayme.kertzmann77@gmail.com",
"actual": "jayme.kertzmann77@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2382,
"outputTokens": 142,
"latencyMs": 2061.6382500000036
@@ -2920,7 +2920,7 @@
"model": "claude-haiku-4-5",
"expected": "jayme.kertzmann77@gmail.com",
"actual": "jayme.kertzmann77@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2857,
"outputTokens": 14,
"latencyMs": 1877.579082999997
@@ -2931,7 +2931,7 @@
"model": "gpt-5-nano",
"expected": "jayme.kertzmann77@gmail.com",
"actual": "jayme.kertzmann77@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6317,
"outputTokens": 142,
"latencyMs": 3448.810375000001
@@ -2942,7 +2942,7 @@
"model": "claude-haiku-4-5",
"expected": "jayme.kertzmann77@gmail.com",
"actual": "jayme.kertzmann77@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6366,
"outputTokens": 14,
"latencyMs": 1265.9410419999986
@@ -2953,7 +2953,7 @@
"model": "gpt-5-nano",
"expected": "jayme.kertzmann77@gmail.com",
"actual": "jayme.kertzmann77@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5013,
"outputTokens": 78,
"latencyMs": 2152.5591669999994
@@ -2964,7 +2964,7 @@
"model": "claude-haiku-4-5",
"expected": "jayme.kertzmann77@gmail.com",
"actual": "jayme.kertzmann77@gmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5761,
"outputTokens": 14,
"latencyMs": 1432.513583
@@ -2975,7 +2975,7 @@
"model": "gpt-5-nano",
"expected": "104053",
"actual": "104053",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6390,
"outputTokens": 136,
"latencyMs": 2707.4454169999954
@@ -2986,7 +2986,7 @@
"model": "claude-haiku-4-5",
"expected": "104053",
"actual": "104053",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7871,
"outputTokens": 6,
"latencyMs": 1568.5869169999933
@@ -2997,7 +2997,7 @@
"model": "gpt-5-nano",
"expected": "104053",
"actual": "104053",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2527,
"outputTokens": 136,
"latencyMs": 2373.4566669999986
@@ -3008,7 +3008,7 @@
"model": "claude-haiku-4-5",
"expected": "104053",
"actual": "104053",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2983,
"outputTokens": 6,
"latencyMs": 1525.172749999998
@@ -3019,7 +3019,7 @@
"model": "gpt-5-nano",
"expected": "104053",
"actual": "104053",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2381,
"outputTokens": 136,
"latencyMs": 9347.989583000002
@@ -3030,7 +3030,7 @@
"model": "claude-haiku-4-5",
"expected": "104053",
"actual": "104053",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2857,
"outputTokens": 6,
"latencyMs": 1748.783334000007
@@ -3041,7 +3041,7 @@
"model": "gpt-5-nano",
"expected": "104053",
"actual": "104053",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6316,
"outputTokens": 72,
"latencyMs": 1929.517458000002
@@ -3052,7 +3052,7 @@
"model": "claude-haiku-4-5",
"expected": "104053",
"actual": "104053",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6366,
"outputTokens": 6,
"latencyMs": 1022.1345000000001
@@ -3063,7 +3063,7 @@
"model": "gpt-5-nano",
"expected": "104053",
"actual": "104053",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5012,
"outputTokens": 136,
"latencyMs": 2102.925624999996
@@ -3074,7 +3074,7 @@
"model": "claude-haiku-4-5",
"expected": "104053",
"actual": "104053",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5761,
"outputTokens": 6,
"latencyMs": 1471.7255839999998
@@ -3085,7 +3085,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6391,
"outputTokens": 71,
"latencyMs": 1983.693041999999
@@ -3096,7 +3096,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7872,
"outputTokens": 4,
"latencyMs": 1077.2119579999999
@@ -3107,7 +3107,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2528,
"outputTokens": 71,
"latencyMs": 2549.1221250000017
@@ -3118,7 +3118,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2984,
"outputTokens": 4,
"latencyMs": 921.1110840000038
@@ -3129,7 +3129,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2382,
"outputTokens": 135,
"latencyMs": 4070.615666999998
@@ -3140,7 +3140,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2858,
"outputTokens": 4,
"latencyMs": 974.754832999999
@@ -3151,7 +3151,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6317,
"outputTokens": 135,
"latencyMs": 2665.842083000003
@@ -3162,7 +3162,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6367,
"outputTokens": 4,
"latencyMs": 1081.2904160000035
@@ -3173,7 +3173,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5013,
"outputTokens": 135,
"latencyMs": 2897.919332999998
@@ -3184,7 +3184,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5762,
"outputTokens": 4,
"latencyMs": 1341.0955420000028
@@ -3195,7 +3195,7 @@
"model": "gpt-5-nano",
"expected": "carley.bauch@yahoo.com",
"actual": "carley.bauch@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6390,
"outputTokens": 204,
"latencyMs": 3231.9646249999932
@@ -3206,7 +3206,7 @@
"model": "claude-haiku-4-5",
"expected": "carley.bauch@yahoo.com",
"actual": "carley.bauch@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7869,
"outputTokens": 12,
"latencyMs": 1288.5363330000037
@@ -3217,7 +3217,7 @@
"model": "gpt-5-nano",
"expected": "carley.bauch@yahoo.com",
"actual": "carley.bauch@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2527,
"outputTokens": 76,
"latencyMs": 2581.508915999999
@@ -3228,7 +3228,7 @@
"model": "claude-haiku-4-5",
"expected": "carley.bauch@yahoo.com",
"actual": "carley.bauch@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2981,
"outputTokens": 12,
"latencyMs": 1183.8337079999983
@@ -3239,7 +3239,7 @@
"model": "gpt-5-nano",
"expected": "carley.bauch@yahoo.com",
"actual": "carley.bauch@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2381,
"outputTokens": 140,
"latencyMs": 2073.944792000002
@@ -3250,7 +3250,7 @@
"model": "claude-haiku-4-5",
"expected": "carley.bauch@yahoo.com",
"actual": "carley.bauch@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2855,
"outputTokens": 12,
"latencyMs": 1302.5857499999984
@@ -3261,7 +3261,7 @@
"model": "gpt-5-nano",
"expected": "carley.bauch@yahoo.com",
"actual": "carley.bauch@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6316,
"outputTokens": 204,
"latencyMs": 3076.5304590000014
@@ -3272,7 +3272,7 @@
"model": "claude-haiku-4-5",
"expected": "carley.bauch@yahoo.com",
"actual": "carley.bauch@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6364,
"outputTokens": 12,
"latencyMs": 1110.9787920000017
@@ -3283,7 +3283,7 @@
"model": "gpt-5-nano",
"expected": "carley.bauch@yahoo.com",
"actual": "carley.bauch@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5012,
"outputTokens": 76,
"latencyMs": 3381.732917000001
@@ -3294,7 +3294,7 @@
"model": "claude-haiku-4-5",
"expected": "carley.bauch@yahoo.com",
"actual": "carley.bauch@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5759,
"outputTokens": 12,
"latencyMs": 1198.1488329999993
@@ -3305,7 +3305,7 @@
"model": "gpt-5-nano",
"expected": "142029",
"actual": "142029",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6393,
"outputTokens": 136,
"latencyMs": 2687.965959000001
@@ -3316,7 +3316,7 @@
"model": "claude-haiku-4-5",
"expected": "142029",
"actual": "142029",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7874,
"outputTokens": 6,
"latencyMs": 2615.956250000003
@@ -3327,7 +3327,7 @@
"model": "gpt-5-nano",
"expected": "142029",
"actual": "142029",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2530,
"outputTokens": 136,
"latencyMs": 2132.413249999998
@@ -3338,7 +3338,7 @@
"model": "claude-haiku-4-5",
"expected": "142029",
"actual": "142029",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2986,
"outputTokens": 6,
"latencyMs": 1091.060666999998
@@ -3349,7 +3349,7 @@
"model": "gpt-5-nano",
"expected": "142029",
"actual": "142029",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2384,
"outputTokens": 72,
"latencyMs": 2074.8201670000053
@@ -3360,7 +3360,7 @@
"model": "claude-haiku-4-5",
"expected": "142029",
"actual": "142029",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2860,
"outputTokens": 6,
"latencyMs": 1622.2757499999934
@@ -3371,7 +3371,7 @@
"model": "gpt-5-nano",
"expected": "142029",
"actual": "142029",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6319,
"outputTokens": 200,
"latencyMs": 3122.3756670000002
@@ -3382,7 +3382,7 @@
"model": "claude-haiku-4-5",
"expected": "142029",
"actual": "142029",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6369,
"outputTokens": 6,
"latencyMs": 1175.7301249999946
@@ -3393,7 +3393,7 @@
"model": "gpt-5-nano",
"expected": "142029",
"actual": "142029",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5015,
"outputTokens": 136,
"latencyMs": 2601.074916999998
@@ -3404,7 +3404,7 @@
"model": "claude-haiku-4-5",
"expected": "142029",
"actual": "142029",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5764,
"outputTokens": 6,
"latencyMs": 1089.4757079999981
@@ -3415,7 +3415,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6389,
"outputTokens": 135,
"latencyMs": 6939.617750000005
@@ -3426,7 +3426,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7869,
"outputTokens": 4,
"latencyMs": 1207.9619999999995
@@ -3437,7 +3437,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2526,
"outputTokens": 135,
"latencyMs": 2784.063166
@@ -3448,7 +3448,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2981,
"outputTokens": 4,
"latencyMs": 1011.0956670000014
@@ -3459,7 +3459,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2380,
"outputTokens": 135,
"latencyMs": 3098.7147909999985
@@ -3470,7 +3470,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2855,
"outputTokens": 4,
"latencyMs": 983.9449170000007
@@ -3481,7 +3481,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6315,
"outputTokens": 135,
"latencyMs": 3889.572291999997
@@ -3492,7 +3492,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6364,
"outputTokens": 4,
"latencyMs": 1096.1613339999967
@@ -3503,7 +3503,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5011,
"outputTokens": 71,
"latencyMs": 2484.078917000006
@@ -3514,7 +3514,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5759,
"outputTokens": 4,
"latencyMs": 1150.418792000004
@@ -3525,7 +3525,7 @@
"model": "gpt-5-nano",
"expected": "cheyenne_skiles@hotmail.com",
"actual": "cheyenne_skiles@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6393,
"outputTokens": 140,
"latencyMs": 2221.4447079999954
@@ -3536,7 +3536,7 @@
"model": "claude-haiku-4-5",
"expected": "cheyenne_skiles@hotmail.com",
"actual": "cheyenne_skiles@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7872,
"outputTokens": 14,
"latencyMs": 1193.9583749999947
@@ -3547,7 +3547,7 @@
"model": "gpt-5-nano",
"expected": "cheyenne_skiles@hotmail.com",
"actual": "cheyenne_skiles@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2530,
"outputTokens": 76,
"latencyMs": 2170.8865829999995
@@ -3558,7 +3558,7 @@
"model": "claude-haiku-4-5",
"expected": "cheyenne_skiles@hotmail.com",
"actual": "cheyenne_skiles@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2984,
"outputTokens": 14,
"latencyMs": 1247.6116660000043
@@ -3569,7 +3569,7 @@
"model": "gpt-5-nano",
"expected": "cheyenne_skiles@hotmail.com",
"actual": "cheyenne_skiles@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2384,
"outputTokens": 76,
"latencyMs": 3827.705667000002
@@ -3580,7 +3580,7 @@
"model": "claude-haiku-4-5",
"expected": "cheyenne_skiles@hotmail.com",
"actual": "cheyenne_skiles@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2858,
"outputTokens": 14,
"latencyMs": 1084.8218339999949
@@ -3591,7 +3591,7 @@
"model": "gpt-5-nano",
"expected": "cheyenne_skiles@hotmail.com",
"actual": "cheyenne_skiles@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6319,
"outputTokens": 140,
"latencyMs": 3311.8220839999994
@@ -3602,7 +3602,7 @@
"model": "claude-haiku-4-5",
"expected": "cheyenne_skiles@hotmail.com",
"actual": "cheyenne_skiles@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6367,
"outputTokens": 14,
"latencyMs": 1269.2092920000068
@@ -3613,7 +3613,7 @@
"model": "gpt-5-nano",
"expected": "cheyenne_skiles@hotmail.com",
"actual": "cheyenne_skiles@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5015,
"outputTokens": 140,
"latencyMs": 2648.3102500000023
@@ -3624,7 +3624,7 @@
"model": "claude-haiku-4-5",
"expected": "cheyenne_skiles@hotmail.com",
"actual": "cheyenne_skiles@hotmail.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5762,
"outputTokens": 14,
"latencyMs": 1278.0403750000041
@@ -3635,7 +3635,7 @@
"model": "gpt-5-nano",
"expected": "84650",
"actual": "84650",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6391,
"outputTokens": 136,
"latencyMs": 3555.1511670000036
@@ -3646,7 +3646,7 @@
"model": "claude-haiku-4-5",
"expected": "84650",
"actual": "84650",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7871,
"outputTokens": 6,
"latencyMs": 1317.5797499999971
@@ -3657,7 +3657,7 @@
"model": "gpt-5-nano",
"expected": "84650",
"actual": "84650",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2528,
"outputTokens": 136,
"latencyMs": 2291.943041999999
@@ -3668,7 +3668,7 @@
"model": "claude-haiku-4-5",
"expected": "84650",
"actual": "84650",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2983,
"outputTokens": 6,
"latencyMs": 2081.3947499999995
@@ -3679,7 +3679,7 @@
"model": "gpt-5-nano",
"expected": "84650",
"actual": "84650",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2382,
"outputTokens": 72,
"latencyMs": 2067.9348329999993
@@ -3690,7 +3690,7 @@
"model": "claude-haiku-4-5",
"expected": "84650",
"actual": "84650",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2857,
"outputTokens": 6,
"latencyMs": 1192.6603340000001
@@ -3701,7 +3701,7 @@
"model": "gpt-5-nano",
"expected": "84650",
"actual": "84650",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6317,
"outputTokens": 200,
"latencyMs": 3044.592457999999
@@ -3712,7 +3712,7 @@
"model": "claude-haiku-4-5",
"expected": "84650",
"actual": "84650",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6366,
"outputTokens": 6,
"latencyMs": 1106.2235409999994
@@ -3723,7 +3723,7 @@
"model": "gpt-5-nano",
"expected": "84650",
"actual": "84650",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5013,
"outputTokens": 136,
"latencyMs": 2627.8240000000005
@@ -3734,7 +3734,7 @@
"model": "claude-haiku-4-5",
"expected": "84650",
"actual": "84650",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5761,
"outputTokens": 6,
"latencyMs": 1379.9015
@@ -3745,7 +3745,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6390,
"outputTokens": 263,
"latencyMs": 3705.3900829999984
@@ -3756,7 +3756,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7871,
"outputTokens": 4,
"latencyMs": 1909.4442500000005
@@ -3767,7 +3767,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2527,
"outputTokens": 135,
"latencyMs": 2173.6019589999996
@@ -3778,7 +3778,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2983,
"outputTokens": 4,
"latencyMs": 1063.8584580000024
@@ -3789,7 +3789,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2381,
"outputTokens": 71,
"latencyMs": 1800.4930420000019
@@ -3800,7 +3800,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2857,
"outputTokens": 4,
"latencyMs": 1011.3969579999975
@@ -3811,7 +3811,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6316,
"outputTokens": 135,
"latencyMs": 2562.2492500000008
@@ -3822,7 +3822,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6366,
"outputTokens": 4,
"latencyMs": 1349.1809170000051
@@ -3833,7 +3833,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5012,
"outputTokens": 71,
"latencyMs": 1883.7523750000037
@@ -3844,7 +3844,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5761,
"outputTokens": 4,
"latencyMs": 1135.412292000001
@@ -3855,7 +3855,7 @@
"model": "gpt-5-nano",
"expected": "macey.gottlieb5@yahoo.com",
"actual": "macey.gottlieb5@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6389,
"outputTokens": 334,
"latencyMs": 4067.161957999997
@@ -3866,7 +3866,7 @@
"model": "claude-haiku-4-5",
"expected": "macey.gottlieb5@yahoo.com",
"actual": "macey.gottlieb5@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7869,
"outputTokens": 14,
"latencyMs": 1333.0713749999995
@@ -3877,7 +3877,7 @@
"model": "gpt-5-nano",
"expected": "macey.gottlieb5@yahoo.com",
"actual": "macey.gottlieb5@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2526,
"outputTokens": 142,
"latencyMs": 2081.8315000000002
@@ -3888,7 +3888,7 @@
"model": "claude-haiku-4-5",
"expected": "macey.gottlieb5@yahoo.com",
"actual": "macey.gottlieb5@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2981,
"outputTokens": 14,
"latencyMs": 1231.0224579999995
@@ -3899,7 +3899,7 @@
"model": "gpt-5-nano",
"expected": "macey.gottlieb5@yahoo.com",
"actual": "macey.gottlieb5@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2380,
"outputTokens": 78,
"latencyMs": 2333.0360409999994
@@ -3910,7 +3910,7 @@
"model": "claude-haiku-4-5",
"expected": "macey.gottlieb5@yahoo.com",
"actual": "macey.gottlieb5@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2855,
"outputTokens": 14,
"latencyMs": 1175.1937500000058
@@ -3921,7 +3921,7 @@
"model": "gpt-5-nano",
"expected": "macey.gottlieb5@yahoo.com",
"actual": "macey.gottlieb5@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6315,
"outputTokens": 206,
"latencyMs": 7391.094749999997
@@ -3932,7 +3932,7 @@
"model": "claude-haiku-4-5",
"expected": "macey.gottlieb5@yahoo.com",
"actual": "macey.gottlieb5@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6364,
"outputTokens": 14,
"latencyMs": 1843.981458000002
@@ -3943,7 +3943,7 @@
"model": "gpt-5-nano",
"expected": "macey.gottlieb5@yahoo.com",
"actual": "macey.gottlieb5@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5011,
"outputTokens": 142,
"latencyMs": 2386.8134589999972
@@ -3954,7 +3954,7 @@
"model": "claude-haiku-4-5",
"expected": "macey.gottlieb5@yahoo.com",
"actual": "macey.gottlieb5@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5759,
"outputTokens": 14,
"latencyMs": 1449.751750000003
@@ -3965,7 +3965,7 @@
"model": "gpt-5-nano",
"expected": "89773",
"actual": "89773",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6389,
"outputTokens": 136,
"latencyMs": 4075.600666999999
@@ -3976,7 +3976,7 @@
"model": "claude-haiku-4-5",
"expected": "89773",
"actual": "89773",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7868,
"outputTokens": 6,
"latencyMs": 985.1729999999952
@@ -3987,7 +3987,7 @@
"model": "gpt-5-nano",
"expected": "89773",
"actual": "89773",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2526,
"outputTokens": 136,
"latencyMs": 2891.2602079999997
@@ -3998,7 +3998,7 @@
"model": "claude-haiku-4-5",
"expected": "89773",
"actual": "89773",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2980,
"outputTokens": 6,
"latencyMs": 2073.129000000001
@@ -4009,7 +4009,7 @@
"model": "gpt-5-nano",
"expected": "89773",
"actual": "89773",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2380,
"outputTokens": 72,
"latencyMs": 1894.3316669999986
@@ -4020,7 +4020,7 @@
"model": "claude-haiku-4-5",
"expected": "89773",
"actual": "89773",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2854,
"outputTokens": 6,
"latencyMs": 1172.3735000000015
@@ -4031,7 +4031,7 @@
"model": "gpt-5-nano",
"expected": "89773",
"actual": "89773",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6315,
"outputTokens": 72,
"latencyMs": 2456.6511249999967
@@ -4042,7 +4042,7 @@
"model": "claude-haiku-4-5",
"expected": "89773",
"actual": "89773",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6363,
"outputTokens": 6,
"latencyMs": 1298.1367079999982
@@ -4053,7 +4053,7 @@
"model": "gpt-5-nano",
"expected": "89773",
"actual": "89773",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5011,
"outputTokens": 136,
"latencyMs": 6018.304375
@@ -4064,7 +4064,7 @@
"model": "claude-haiku-4-5",
"expected": "89773",
"actual": "89773",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5758,
"outputTokens": 6,
"latencyMs": 1103.9152499999982
@@ -4075,7 +4075,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6389,
"outputTokens": 71,
"latencyMs": 3867.303832999998
@@ -4086,7 +4086,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7868,
"outputTokens": 4,
"latencyMs": 1287.7528749999983
@@ -4097,7 +4097,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2526,
"outputTokens": 135,
"latencyMs": 2355.0305829999998
@@ -4108,7 +4108,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2980,
"outputTokens": 4,
"latencyMs": 1086.8424579999992
@@ -4119,7 +4119,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2380,
"outputTokens": 71,
"latencyMs": 3472.6323339999944
@@ -4130,7 +4130,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2854,
"outputTokens": 4,
"latencyMs": 948.3086249999978
@@ -4141,7 +4141,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6315,
"outputTokens": 71,
"latencyMs": 3343.3446659999972
@@ -4152,7 +4152,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6363,
"outputTokens": 4,
"latencyMs": 1048.567959
@@ -4163,7 +4163,7 @@
"model": "gpt-5-nano",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5011,
"outputTokens": 71,
"latencyMs": 3761.141875000001
@@ -4174,7 +4174,7 @@
"model": "claude-haiku-4-5",
"expected": "Marketing",
"actual": "Marketing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5758,
"outputTokens": 4,
"latencyMs": 1130.9393339999951
@@ -4185,7 +4185,7 @@
"model": "gpt-5-nano",
"expected": "georgianna_renner@yahoo.com",
"actual": "georgianna_renner@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6389,
"outputTokens": 79,
"latencyMs": 4200.215792000003
@@ -4196,7 +4196,7 @@
"model": "claude-haiku-4-5",
"expected": "georgianna_renner@yahoo.com",
"actual": "georgianna_renner@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7869,
"outputTokens": 13,
"latencyMs": 1351.981166999998
@@ -4207,7 +4207,7 @@
"model": "gpt-5-nano",
"expected": "georgianna_renner@yahoo.com",
"actual": "georgianna_renner@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2526,
"outputTokens": 143,
"latencyMs": 2465.4245840000003
@@ -4218,7 +4218,7 @@
"model": "claude-haiku-4-5",
"expected": "georgianna_renner@yahoo.com",
"actual": "georgianna_renner@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2981,
"outputTokens": 13,
"latencyMs": 885.4770840000056
@@ -4229,7 +4229,7 @@
"model": "gpt-5-nano",
"expected": "georgianna_renner@yahoo.com",
"actual": "georgianna_renner@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2380,
"outputTokens": 143,
"latencyMs": 2903.201958000005
@@ -4240,7 +4240,7 @@
"model": "claude-haiku-4-5",
"expected": "georgianna_renner@yahoo.com",
"actual": "georgianna_renner@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2855,
"outputTokens": 13,
"latencyMs": 1006.1219579999961
@@ -4251,7 +4251,7 @@
"model": "gpt-5-nano",
"expected": "georgianna_renner@yahoo.com",
"actual": "georgianna_renner@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6315,
"outputTokens": 207,
"latencyMs": 3253.900333999998
@@ -4262,7 +4262,7 @@
"model": "claude-haiku-4-5",
"expected": "georgianna_renner@yahoo.com",
"actual": "georgianna_renner@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6364,
"outputTokens": 13,
"latencyMs": 1219.713582999997
@@ -4273,7 +4273,7 @@
"model": "gpt-5-nano",
"expected": "georgianna_renner@yahoo.com",
"actual": "georgianna_renner@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5011,
"outputTokens": 143,
"latencyMs": 2335.6635000000024
@@ -4284,7 +4284,7 @@
"model": "claude-haiku-4-5",
"expected": "georgianna_renner@yahoo.com",
"actual": "georgianna_renner@yahoo.com",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5759,
"outputTokens": 13,
"latencyMs": 1334.1358330000003
@@ -4295,7 +4295,7 @@
"model": "gpt-5-nano",
"expected": "49741",
"actual": "49741",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6390,
"outputTokens": 136,
"latencyMs": 1912.2536669999972
@@ -4306,7 +4306,7 @@
"model": "claude-haiku-4-5",
"expected": "49741",
"actual": "49741",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7871,
"outputTokens": 6,
"latencyMs": 1104.4684160000033
@@ -4317,7 +4317,7 @@
"model": "gpt-5-nano",
"expected": "49741",
"actual": "49741",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2527,
"outputTokens": 72,
"latencyMs": 2648.919750000001
@@ -4328,7 +4328,7 @@
"model": "claude-haiku-4-5",
"expected": "49741",
"actual": "49741",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2983,
"outputTokens": 6,
"latencyMs": 1525.6309170000022
@@ -4339,7 +4339,7 @@
"model": "gpt-5-nano",
"expected": "49741",
"actual": "49741",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2381,
"outputTokens": 136,
"latencyMs": 2736.3283749999973
@@ -4350,7 +4350,7 @@
"model": "claude-haiku-4-5",
"expected": "49741",
"actual": "144426",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2857,
"outputTokens": 6,
"latencyMs": 1077.766334
@@ -4361,7 +4361,7 @@
"model": "gpt-5-nano",
"expected": "49741",
"actual": "49741",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6316,
"outputTokens": 72,
"latencyMs": 2116.5284170000014
@@ -4372,7 +4372,7 @@
"model": "claude-haiku-4-5",
"expected": "49741",
"actual": "49741",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6366,
"outputTokens": 6,
"latencyMs": 1159.7744170000005
@@ -4383,7 +4383,7 @@
"model": "gpt-5-nano",
"expected": "49741",
"actual": "49741",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5012,
"outputTokens": 72,
"latencyMs": 2529.7074160000047
@@ -4394,7 +4394,7 @@
"model": "claude-haiku-4-5",
"expected": "49741",
"actual": "49741",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5761,
"outputTokens": 6,
"latencyMs": 1604.601791999994
@@ -4405,7 +4405,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6387,
"outputTokens": 967,
"latencyMs": 8300.216583000001
@@ -4416,7 +4416,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 7865,
"outputTokens": 5,
"latencyMs": 1204.089749999992
@@ -4427,7 +4427,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2524,
"outputTokens": 455,
"latencyMs": 5231.604541000001
@@ -4438,7 +4438,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2977,
"outputTokens": 5,
"latencyMs": 1168.508707999994
@@ -4449,7 +4449,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2378,
"outputTokens": 967,
"latencyMs": 8396.912500000006
@@ -4460,7 +4460,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2851,
"outputTokens": 5,
"latencyMs": 1060.6276250000083
@@ -4471,7 +4471,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6313,
"outputTokens": 775,
"latencyMs": 9340.763791999998
@@ -4482,7 +4482,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6360,
"outputTokens": 5,
"latencyMs": 1020.8827080000046
@@ -4493,7 +4493,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5009,
"outputTokens": 903,
"latencyMs": 8792.062000000005
@@ -4504,7 +4504,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5755,
"outputTokens": 5,
"latencyMs": 1459.8301659999997
@@ -4515,7 +4515,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6387,
"outputTokens": 519,
"latencyMs": 6439.622583000004
@@ -4526,7 +4526,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 7865,
"outputTokens": 5,
"latencyMs": 1416.1659170000057
@@ -4537,7 +4537,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2524,
"outputTokens": 903,
"latencyMs": 8064.398499999996
@@ -4548,7 +4548,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "14",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2977,
"outputTokens": 5,
"latencyMs": 998.3781250000029
@@ -4559,7 +4559,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2378,
"outputTokens": 647,
"latencyMs": 5498.786500000002
@@ -4570,7 +4570,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2851,
"outputTokens": 5,
"latencyMs": 1343.9632910000073
@@ -4581,7 +4581,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6313,
"outputTokens": 647,
"latencyMs": 7565.158291
@@ -4592,7 +4592,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "14",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6360,
"outputTokens": 5,
"latencyMs": 1320.9714169999934
@@ -4603,7 +4603,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5009,
"outputTokens": 839,
"latencyMs": 10626.395499999999
@@ -4614,7 +4614,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5755,
"outputTokens": 5,
"latencyMs": 3227.584917
@@ -4625,7 +4625,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6387,
"outputTokens": 583,
"latencyMs": 6690.373416000002
@@ -4636,7 +4636,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 7865,
"outputTokens": 5,
"latencyMs": 1187.1296250000014
@@ -4647,7 +4647,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2524,
"outputTokens": 519,
"latencyMs": 5081.884875000003
@@ -4658,7 +4658,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2977,
"outputTokens": 5,
"latencyMs": 1576.2339999999967
@@ -4669,7 +4669,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2378,
"outputTokens": 1031,
"latencyMs": 9927.5775
@@ -4680,7 +4680,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2851,
"outputTokens": 5,
"latencyMs": 1169.6451669999951
@@ -4691,7 +4691,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6313,
"outputTokens": 519,
"latencyMs": 6772.954291999995
@@ -4702,7 +4702,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6360,
"outputTokens": 5,
"latencyMs": 1905.9189590000024
@@ -4713,7 +4713,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5009,
"outputTokens": 455,
"latencyMs": 6827.424666999999
@@ -4724,7 +4724,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5755,
"outputTokens": 5,
"latencyMs": 2121.3979160000017
@@ -4735,7 +4735,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6387,
"outputTokens": 519,
"latencyMs": 15235.099042000002
@@ -4746,7 +4746,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 7865,
"outputTokens": 5,
"latencyMs": 1182.0669170000037
@@ -4757,7 +4757,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2524,
"outputTokens": 583,
"latencyMs": 6872.47600000001
@@ -4768,7 +4768,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2977,
"outputTokens": 5,
"latencyMs": 931.0203749999928
@@ -4779,7 +4779,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2378,
"outputTokens": 2311,
"latencyMs": 17952.683875000002
@@ -4790,7 +4790,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2851,
"outputTokens": 5,
"latencyMs": 1167.8899999999994
@@ -4801,7 +4801,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6313,
"outputTokens": 455,
"latencyMs": 6896.831916999989
@@ -4812,7 +4812,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "10",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6360,
"outputTokens": 5,
"latencyMs": 1401.859083000003
@@ -4823,7 +4823,7 @@
"model": "gpt-5-nano",
"expected": "17",
"actual": "17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5009,
"outputTokens": 647,
"latencyMs": 5266.956917000003
@@ -4834,7 +4834,7 @@
"model": "claude-haiku-4-5",
"expected": "17",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5755,
"outputTokens": 5,
"latencyMs": 1100.9057919999905
@@ -4845,7 +4845,7 @@
"model": "gpt-5-nano",
"expected": "16",
"actual": "16",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6387,
"outputTokens": 1095,
"latencyMs": 15621.264291999993
@@ -4856,7 +4856,7 @@
"model": "claude-haiku-4-5",
"expected": "16",
"actual": "12",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 7865,
"outputTokens": 5,
"latencyMs": 1063.5868750000081
@@ -4867,7 +4867,7 @@
"model": "gpt-5-nano",
"expected": "16",
"actual": "16",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2524,
"outputTokens": 455,
"latencyMs": 5703.061916000006
@@ -4878,7 +4878,7 @@
"model": "claude-haiku-4-5",
"expected": "16",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2977,
"outputTokens": 5,
"latencyMs": 1113.9432499999966
@@ -4889,7 +4889,7 @@
"model": "gpt-5-nano",
"expected": "16",
"actual": "16",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2378,
"outputTokens": 3015,
"latencyMs": 22321.357124999995
@@ -4900,7 +4900,7 @@
"model": "claude-haiku-4-5",
"expected": "16",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2851,
"outputTokens": 5,
"latencyMs": 968.0936249999941
@@ -4911,7 +4911,7 @@
"model": "gpt-5-nano",
"expected": "16",
"actual": "16",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6313,
"outputTokens": 1287,
"latencyMs": 14521.080749999994
@@ -4922,7 +4922,7 @@
"model": "claude-haiku-4-5",
"expected": "16",
"actual": "12",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6360,
"outputTokens": 5,
"latencyMs": 1228.1847500000003
@@ -4933,7 +4933,7 @@
"model": "gpt-5-nano",
"expected": "16",
"actual": "16",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5009,
"outputTokens": 455,
"latencyMs": 5216.268042000011
@@ -4944,7 +4944,7 @@
"model": "claude-haiku-4-5",
"expected": "16",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5755,
"outputTokens": 5,
"latencyMs": 1026.5127079999947
@@ -4955,7 +4955,7 @@
"model": "gpt-5-nano",
"expected": "16",
"actual": "16",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6387,
"outputTokens": 391,
"latencyMs": 4335.125541000001
@@ -4966,7 +4966,7 @@
"model": "claude-haiku-4-5",
"expected": "16",
"actual": "10",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 7865,
"outputTokens": 5,
"latencyMs": 1116.4177909999999
@@ -4977,7 +4977,7 @@
"model": "gpt-5-nano",
"expected": "16",
"actual": "16",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2524,
"outputTokens": 583,
"latencyMs": 4128.823499999999
@@ -4988,7 +4988,7 @@
"model": "claude-haiku-4-5",
"expected": "16",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2977,
"outputTokens": 5,
"latencyMs": 1105.622457999998
@@ -4999,7 +4999,7 @@
"model": "gpt-5-nano",
"expected": "16",
"actual": "16",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2378,
"outputTokens": 839,
"latencyMs": 6542.58583299999
@@ -5010,7 +5010,7 @@
"model": "claude-haiku-4-5",
"expected": "16",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2851,
"outputTokens": 5,
"latencyMs": 1084.2237909999967
@@ -5021,7 +5021,7 @@
"model": "gpt-5-nano",
"expected": "16",
"actual": "16",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6313,
"outputTokens": 455,
"latencyMs": 5050.133375000005
@@ -5032,7 +5032,7 @@
"model": "claude-haiku-4-5",
"expected": "16",
"actual": "10",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6360,
"outputTokens": 5,
"latencyMs": 1075.023709000001
@@ -5043,7 +5043,7 @@
"model": "gpt-5-nano",
"expected": "16",
"actual": "16",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5009,
"outputTokens": 711,
"latencyMs": 9237.985791
@@ -5054,7 +5054,7 @@
"model": "claude-haiku-4-5",
"expected": "16",
"actual": "12",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5755,
"outputTokens": 5,
"latencyMs": 1346.3510000000097
@@ -5065,7 +5065,7 @@
"model": "gpt-5-nano",
"expected": "91",
"actual": "91",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6392,
"outputTokens": 2375,
"latencyMs": 27655.89520900001
@@ -5076,7 +5076,7 @@
"model": "claude-haiku-4-5",
"expected": "91",
"actual": "89",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 7870,
"outputTokens": 5,
"latencyMs": 1315.7111659999937
@@ -5087,7 +5087,7 @@
"model": "gpt-5-nano",
"expected": "91",
"actual": "91",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2529,
"outputTokens": 2695,
"latencyMs": 26482.504707999993
@@ -5098,7 +5098,7 @@
"model": "claude-haiku-4-5",
"expected": "91",
"actual": "85",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2982,
"outputTokens": 5,
"latencyMs": 1368.221916999988
@@ -5109,7 +5109,7 @@
"model": "gpt-5-nano",
"expected": "91",
"actual": "91",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2383,
"outputTokens": 1671,
"latencyMs": 18249.434333000012
@@ -5120,7 +5120,7 @@
"model": "claude-haiku-4-5",
"expected": "91",
"actual": "85",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2856,
"outputTokens": 5,
"latencyMs": 1051.9521660000028
@@ -5131,7 +5131,7 @@
"model": "gpt-5-nano",
"expected": "91",
"actual": "91",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6318,
"outputTokens": 1799,
"latencyMs": 15867.284083999999
@@ -5142,7 +5142,7 @@
"model": "claude-haiku-4-5",
"expected": "91",
"actual": "89",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6365,
"outputTokens": 5,
"latencyMs": 1831.3835839999956
@@ -5153,7 +5153,7 @@
"model": "gpt-5-nano",
"expected": "91",
"actual": "91",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5014,
"outputTokens": 2247,
"latencyMs": 19254.821666999997
@@ -5164,7 +5164,7 @@
"model": "claude-haiku-4-5",
"expected": "91",
"actual": "89",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5760,
"outputTokens": 5,
"latencyMs": 1762.2908329999918
@@ -5175,7 +5175,7 @@
"model": "gpt-5-nano",
"expected": "67",
"actual": "67",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6392,
"outputTokens": 1479,
"latencyMs": 13444.104542000001
@@ -5186,7 +5186,7 @@
"model": "claude-haiku-4-5",
"expected": "67",
"actual": "57",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 7870,
"outputTokens": 5,
"latencyMs": 1182.2523340000043
@@ -5197,7 +5197,7 @@
"model": "gpt-5-nano",
"expected": "67",
"actual": "67",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2529,
"outputTokens": 2183,
"latencyMs": 19257.86050000001
@@ -5208,7 +5208,7 @@
"model": "claude-haiku-4-5",
"expected": "67",
"actual": "47",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2982,
"outputTokens": 5,
"latencyMs": 1081.3142080000107
@@ -5219,7 +5219,7 @@
"model": "gpt-5-nano",
"expected": "67",
"actual": "67",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2383,
"outputTokens": 3463,
"latencyMs": 21384.707542000004
@@ -5230,7 +5230,7 @@
"model": "claude-haiku-4-5",
"expected": "67",
"actual": "47",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2856,
"outputTokens": 5,
"latencyMs": 1051.6647080000112
@@ -5241,7 +5241,7 @@
"model": "gpt-5-nano",
"expected": "67",
"actual": "67",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6318,
"outputTokens": 2439,
"latencyMs": 19519.416207999995
@@ -5252,7 +5252,7 @@
"model": "claude-haiku-4-5",
"expected": "67",
"actual": "47",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6365,
"outputTokens": 5,
"latencyMs": 1060.1008749999892
@@ -5263,7 +5263,7 @@
"model": "gpt-5-nano",
"expected": "67",
"actual": "66",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5014,
"outputTokens": 1991,
"latencyMs": 15234.403459000008
@@ -5274,7 +5274,7 @@
"model": "claude-haiku-4-5",
"expected": "67",
"actual": "57",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5760,
"outputTokens": 5,
"latencyMs": 1208.8559589999932
@@ -5285,7 +5285,7 @@
"model": "gpt-5-nano",
"expected": "41",
"actual": "41",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6392,
"outputTokens": 1415,
"latencyMs": 14119.885540999996
@@ -5296,7 +5296,7 @@
"model": "claude-haiku-4-5",
"expected": "41",
"actual": "31",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 7870,
"outputTokens": 5,
"latencyMs": 1428.8373750000028
@@ -5307,7 +5307,7 @@
"model": "gpt-5-nano",
"expected": "41",
"actual": "41",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2529,
"outputTokens": 1607,
"latencyMs": 13997.297709000006
@@ -5318,7 +5318,7 @@
"model": "claude-haiku-4-5",
"expected": "41",
"actual": "27",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2982,
"outputTokens": 5,
"latencyMs": 1270.4412920000032
@@ -5329,7 +5329,7 @@
"model": "gpt-5-nano",
"expected": "41",
"actual": "41",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2383,
"outputTokens": 1415,
"latencyMs": 13861.177167000002
@@ -5340,7 +5340,7 @@
"model": "claude-haiku-4-5",
"expected": "41",
"actual": "31",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2856,
"outputTokens": 5,
"latencyMs": 916.5238340000069
@@ -5351,7 +5351,7 @@
"model": "gpt-5-nano",
"expected": "41",
"actual": "42",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6318,
"outputTokens": 1799,
"latencyMs": 16007.06925
@@ -5362,7 +5362,7 @@
"model": "claude-haiku-4-5",
"expected": "41",
"actual": "27",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6365,
"outputTokens": 5,
"latencyMs": 1426.0594579999888
@@ -5373,7 +5373,7 @@
"model": "gpt-5-nano",
"expected": "41",
"actual": "41",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5014,
"outputTokens": 2055,
"latencyMs": 22966.680624999994
@@ -5384,7 +5384,7 @@
"model": "claude-haiku-4-5",
"expected": "41",
"actual": "31",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5760,
"outputTokens": 5,
"latencyMs": 1044.6609999999928
@@ -5395,7 +5395,7 @@
"model": "gpt-5-nano",
"expected": "26",
"actual": "26",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6392,
"outputTokens": 1159,
"latencyMs": 10799.117333000002
@@ -5406,7 +5406,7 @@
"model": "claude-haiku-4-5",
"expected": "26",
"actual": "20",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 7870,
"outputTokens": 5,
"latencyMs": 1359.5568330000096
@@ -5417,7 +5417,7 @@
"model": "gpt-5-nano",
"expected": "26",
"actual": "26",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2529,
"outputTokens": 1543,
"latencyMs": 13702.052542000005
@@ -5428,7 +5428,7 @@
"model": "claude-haiku-4-5",
"expected": "26",
"actual": "16",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2982,
"outputTokens": 5,
"latencyMs": 967.0454159999936
@@ -5439,7 +5439,7 @@
"model": "gpt-5-nano",
"expected": "26",
"actual": "26",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2383,
"outputTokens": 1671,
"latencyMs": 13116.871958000003
@@ -5450,7 +5450,7 @@
"model": "claude-haiku-4-5",
"expected": "26",
"actual": "16",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2856,
"outputTokens": 5,
"latencyMs": 1088.8372910000035
@@ -5461,7 +5461,7 @@
"model": "gpt-5-nano",
"expected": "26",
"actual": "26",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6318,
"outputTokens": 1543,
"latencyMs": 14387.148624999987
@@ -5472,7 +5472,7 @@
"model": "claude-haiku-4-5",
"expected": "26",
"actual": "16",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6365,
"outputTokens": 5,
"latencyMs": 1273.9564170000085
@@ -5483,7 +5483,7 @@
"model": "gpt-5-nano",
"expected": "26",
"actual": "26",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5014,
"outputTokens": 1223,
"latencyMs": 12143.083792000005
@@ -5494,7 +5494,7 @@
"model": "claude-haiku-4-5",
"expected": "26",
"actual": "20",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5760,
"outputTokens": 5,
"latencyMs": 1032.9807079999882
@@ -5505,7 +5505,7 @@
"model": "gpt-5-nano",
"expected": "78",
"actual": "78",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6386,
"outputTokens": 2631,
"latencyMs": 23077.678417000003
@@ -5516,7 +5516,7 @@
"model": "claude-haiku-4-5",
"expected": "78",
"actual": "81",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 7864,
"outputTokens": 5,
"latencyMs": 1281.171417000005
@@ -5527,7 +5527,7 @@
"model": "gpt-5-nano",
"expected": "78",
"actual": "78",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2523,
"outputTokens": 2759,
"latencyMs": 20331.962667
@@ -5538,7 +5538,7 @@
"model": "claude-haiku-4-5",
"expected": "78",
"actual": "78",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2976,
"outputTokens": 5,
"latencyMs": 1014.3847079999978
@@ -5549,7 +5549,7 @@
"model": "gpt-5-nano",
"expected": "78",
"actual": "81",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2377,
"outputTokens": 3335,
"latencyMs": 18037.630208000002
@@ -5560,7 +5560,7 @@
"model": "claude-haiku-4-5",
"expected": "78",
"actual": "73",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2850,
"outputTokens": 5,
"latencyMs": 918.3078749999986
@@ -5571,7 +5571,7 @@
"model": "gpt-5-nano",
"expected": "78",
"actual": "78",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6312,
"outputTokens": 1991,
"latencyMs": 15660.232958000008
@@ -5582,7 +5582,7 @@
"model": "claude-haiku-4-5",
"expected": "78",
"actual": "78",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6359,
"outputTokens": 5,
"latencyMs": 1033.7647080000024
@@ -5593,7 +5593,7 @@
"model": "gpt-5-nano",
"expected": "78",
"actual": "78",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5008,
"outputTokens": 4295,
"latencyMs": 26817.97
@@ -5604,7 +5604,7 @@
"model": "claude-haiku-4-5",
"expected": "78",
"actual": "77",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5754,
"outputTokens": 5,
"latencyMs": 1348.084750000009
@@ -5615,7 +5615,7 @@
"model": "gpt-5-nano",
"expected": "22",
"actual": "22",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6386,
"outputTokens": 1223,
"latencyMs": 10273.866540999996
@@ -5626,7 +5626,7 @@
"model": "claude-haiku-4-5",
"expected": "22",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 7864,
"outputTokens": 5,
"latencyMs": 1081.604707999999
@@ -5637,7 +5637,7 @@
"model": "gpt-5-nano",
"expected": "22",
"actual": "22",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2523,
"outputTokens": 903,
"latencyMs": 13862.020499999999
@@ -5648,7 +5648,7 @@
"model": "claude-haiku-4-5",
"expected": "22",
"actual": "16",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2976,
"outputTokens": 5,
"latencyMs": 965.817916
@@ -5659,7 +5659,7 @@
"model": "gpt-5-nano",
"expected": "22",
"actual": "21",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2377,
"outputTokens": 2631,
"latencyMs": 24254.82570799999
@@ -5670,7 +5670,7 @@
"model": "claude-haiku-4-5",
"expected": "22",
"actual": "20",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2850,
"outputTokens": 5,
"latencyMs": 998.7978339999972
@@ -5681,7 +5681,7 @@
"model": "gpt-5-nano",
"expected": "22",
"actual": "22",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6312,
"outputTokens": 1095,
"latencyMs": 10401.351500000004
@@ -5692,7 +5692,7 @@
"model": "claude-haiku-4-5",
"expected": "22",
"actual": "15",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6359,
"outputTokens": 5,
"latencyMs": 1479.388791999998
@@ -5703,7 +5703,7 @@
"model": "gpt-5-nano",
"expected": "22",
"actual": "22",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5008,
"outputTokens": 839,
"latencyMs": 8160.454833999989
@@ -5714,7 +5714,7 @@
"model": "claude-haiku-4-5",
"expected": "22",
"actual": "16",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5754,
"outputTokens": 5,
"latencyMs": 1763.230291999993
@@ -5725,7 +5725,7 @@
"model": "gpt-5-nano",
"expected": "12",
"actual": "12",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6394,
"outputTokens": 1671,
"latencyMs": 14807.253333
@@ -5736,7 +5736,7 @@
"model": "claude-haiku-4-5",
"expected": "12",
"actual": "9",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 7872,
"outputTokens": 5,
"latencyMs": 1185.018333
@@ -5747,7 +5747,7 @@
"model": "gpt-5-nano",
"expected": "12",
"actual": "12",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2531,
"outputTokens": 1607,
"latencyMs": 13592.477832999997
@@ -5758,7 +5758,7 @@
"model": "claude-haiku-4-5",
"expected": "12",
"actual": "9",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2984,
"outputTokens": 5,
"latencyMs": 947.2789590000029
@@ -5769,7 +5769,7 @@
"model": "gpt-5-nano",
"expected": "12",
"actual": "12",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2385,
"outputTokens": 2759,
"latencyMs": 22718.536041999992
@@ -5780,7 +5780,7 @@
"model": "claude-haiku-4-5",
"expected": "12",
"actual": "10",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2858,
"outputTokens": 5,
"latencyMs": 973.4814580000093
@@ -5791,7 +5791,7 @@
"model": "gpt-5-nano",
"expected": "12",
"actual": "12",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6320,
"outputTokens": 1031,
"latencyMs": 10025.186000000002
@@ -5802,7 +5802,7 @@
"model": "claude-haiku-4-5",
"expected": "12",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6367,
"outputTokens": 5,
"latencyMs": 1038.4732499999955
@@ -5813,7 +5813,7 @@
"model": "gpt-5-nano",
"expected": "12",
"actual": "12",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5016,
"outputTokens": 903,
"latencyMs": 12459.619915999996
@@ -5824,7 +5824,7 @@
"model": "claude-haiku-4-5",
"expected": "12",
"actual": "10",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5762,
"outputTokens": 5,
"latencyMs": 1448.7940839999937
@@ -5835,7 +5835,7 @@
"model": "gpt-5-nano",
"expected": "11",
"actual": "11",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6394,
"outputTokens": 1415,
"latencyMs": 13094.547666999992
@@ -5846,7 +5846,7 @@
"model": "claude-haiku-4-5",
"expected": "11",
"actual": "7",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 7872,
"outputTokens": 5,
"latencyMs": 1241.7239169999957
@@ -5857,7 +5857,7 @@
"model": "gpt-5-nano",
"expected": "11",
"actual": "11",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2531,
"outputTokens": 1031,
"latencyMs": 10610.864084
@@ -5868,7 +5868,7 @@
"model": "claude-haiku-4-5",
"expected": "11",
"actual": "6",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2984,
"outputTokens": 5,
"latencyMs": 1100.7670829999988
@@ -5879,7 +5879,7 @@
"model": "gpt-5-nano",
"expected": "11",
"actual": "11",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2385,
"outputTokens": 1095,
"latencyMs": 11523.293417000008
@@ -5890,7 +5890,7 @@
"model": "claude-haiku-4-5",
"expected": "11",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2858,
"outputTokens": 5,
"latencyMs": 980.1522499999992
@@ -5901,7 +5901,7 @@
"model": "gpt-5-nano",
"expected": "11",
"actual": "11",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6320,
"outputTokens": 1095,
"latencyMs": 8184.143375
@@ -5912,7 +5912,7 @@
"model": "claude-haiku-4-5",
"expected": "11",
"actual": "6",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6367,
"outputTokens": 5,
"latencyMs": 1175.0723330000037
@@ -5923,7 +5923,7 @@
"model": "gpt-5-nano",
"expected": "11",
"actual": "11",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5016,
"outputTokens": 1159,
"latencyMs": 13082.53912500001
@@ -5934,7 +5934,7 @@
"model": "claude-haiku-4-5",
"expected": "11",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5762,
"outputTokens": 5,
"latencyMs": 1020.4026659999945
@@ -5945,7 +5945,7 @@
"model": "gpt-5-nano",
"expected": "11",
"actual": "11",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6394,
"outputTokens": 1223,
"latencyMs": 13166.679334
@@ -5956,7 +5956,7 @@
"model": "claude-haiku-4-5",
"expected": "11",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 7872,
"outputTokens": 5,
"latencyMs": 1090.0060839999933
@@ -5967,7 +5967,7 @@
"model": "gpt-5-nano",
"expected": "11",
"actual": "11",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2531,
"outputTokens": 1287,
"latencyMs": 11181.234958000001
@@ -5978,7 +5978,7 @@
"model": "claude-haiku-4-5",
"expected": "11",
"actual": "7",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2984,
"outputTokens": 5,
"latencyMs": 1365.1262080000015
@@ -5989,7 +5989,7 @@
"model": "gpt-5-nano",
"expected": "11",
"actual": "11",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2385,
"outputTokens": 967,
"latencyMs": 9549.427916999994
@@ -6000,7 +6000,7 @@
"model": "claude-haiku-4-5",
"expected": "11",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2858,
"outputTokens": 5,
"latencyMs": 981.8662500000064
@@ -6011,7 +6011,7 @@
"model": "gpt-5-nano",
"expected": "11",
"actual": "11",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6320,
"outputTokens": 1223,
"latencyMs": 11591.030333000002
@@ -6022,7 +6022,7 @@
"model": "claude-haiku-4-5",
"expected": "11",
"actual": "7",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6367,
"outputTokens": 5,
"latencyMs": 1430.038750000007
@@ -6033,7 +6033,7 @@
"model": "gpt-5-nano",
"expected": "11",
"actual": "10",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5016,
"outputTokens": 1735,
"latencyMs": 11458.303500000009
@@ -6044,7 +6044,7 @@
"model": "claude-haiku-4-5",
"expected": "11",
"actual": "9",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5762,
"outputTokens": 5,
"latencyMs": 1103.2402909999946
@@ -6055,7 +6055,7 @@
"model": "gpt-5-nano",
"expected": "12",
"actual": "11",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6394,
"outputTokens": 2631,
"latencyMs": 16900.63120799999
@@ -6066,7 +6066,7 @@
"model": "claude-haiku-4-5",
"expected": "12",
"actual": "7",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 7872,
"outputTokens": 5,
"latencyMs": 1043.442332999999
@@ -6077,7 +6077,7 @@
"model": "gpt-5-nano",
"expected": "12",
"actual": "12",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2531,
"outputTokens": 839,
"latencyMs": 7278.612083
@@ -6088,7 +6088,7 @@
"model": "claude-haiku-4-5",
"expected": "12",
"actual": "6",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2984,
"outputTokens": 5,
"latencyMs": 1705.2114999999903
@@ -6099,7 +6099,7 @@
"model": "gpt-5-nano",
"expected": "12",
"actual": "11",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2385,
"outputTokens": 1415,
"latencyMs": 10625.603375000006
@@ -6110,7 +6110,7 @@
"model": "claude-haiku-4-5",
"expected": "12",
"actual": "7",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2858,
"outputTokens": 5,
"latencyMs": 1081.0501670000085
@@ -6121,7 +6121,7 @@
"model": "gpt-5-nano",
"expected": "12",
"actual": "12",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6320,
"outputTokens": 2055,
"latencyMs": 17548.71483299999
@@ -6132,7 +6132,7 @@
"model": "claude-haiku-4-5",
"expected": "12",
"actual": "7",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6367,
"outputTokens": 5,
"latencyMs": 2302.2003750000003
@@ -6143,7 +6143,7 @@
"model": "gpt-5-nano",
"expected": "12",
"actual": "11",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5016,
"outputTokens": 1287,
"latencyMs": 13187.201000000015
@@ -6154,7 +6154,7 @@
"model": "claude-haiku-4-5",
"expected": "12",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5762,
"outputTokens": 5,
"latencyMs": 2621.4970829999947
@@ -6165,7 +6165,7 @@
"model": "gpt-5-nano",
"expected": "62",
"actual": "62",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6393,
"outputTokens": 3783,
"latencyMs": 29393.69395799999
@@ -6176,7 +6176,7 @@
"model": "claude-haiku-4-5",
"expected": "62",
"actual": "62",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7872,
"outputTokens": 5,
"latencyMs": 1402.049291999996
@@ -6187,7 +6187,7 @@
"model": "gpt-5-nano",
"expected": "62",
"actual": "62",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2530,
"outputTokens": 2823,
"latencyMs": 23696.75
@@ -6198,7 +6198,7 @@
"model": "claude-haiku-4-5",
"expected": "62",
"actual": "62",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2984,
"outputTokens": 5,
"latencyMs": 1064.7778749999998
@@ -6209,7 +6209,7 @@
"model": "gpt-5-nano",
"expected": "62",
"actual": "64",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2384,
"outputTokens": 3143,
"latencyMs": 28384.533249999993
@@ -6220,7 +6220,7 @@
"model": "claude-haiku-4-5",
"expected": "62",
"actual": "62",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2858,
"outputTokens": 5,
"latencyMs": 889.2725839999912
@@ -6231,7 +6231,7 @@
"model": "gpt-5-nano",
"expected": "62",
"actual": "62",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6319,
"outputTokens": 6663,
"latencyMs": 50113.09675
@@ -6242,7 +6242,7 @@
"model": "claude-haiku-4-5",
"expected": "62",
"actual": "62",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6367,
"outputTokens": 5,
"latencyMs": 1074.8158330000006
@@ -6253,7 +6253,7 @@
"model": "gpt-5-nano",
"expected": "62",
"actual": "62",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5015,
"outputTokens": 2631,
"latencyMs": 23841.036083999992
@@ -6264,7 +6264,7 @@
"model": "claude-haiku-4-5",
"expected": "62",
"actual": "62",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5762,
"outputTokens": 5,
"latencyMs": 1010.4629169999971
@@ -6275,7 +6275,7 @@
"model": "gpt-5-nano",
"expected": "45",
"actual": "45",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6393,
"outputTokens": 2247,
"latencyMs": 18818.030874999997
@@ -6286,7 +6286,7 @@
"model": "claude-haiku-4-5",
"expected": "45",
"actual": "42",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 7872,
"outputTokens": 5,
"latencyMs": 1203.152833
@@ -6297,7 +6297,7 @@
"model": "gpt-5-nano",
"expected": "45",
"actual": "45",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2530,
"outputTokens": 2631,
"latencyMs": 21987.539915999994
@@ -6308,7 +6308,7 @@
"model": "claude-haiku-4-5",
"expected": "45",
"actual": "42",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2984,
"outputTokens": 5,
"latencyMs": 1000.0181669999874
@@ -6319,7 +6319,7 @@
"model": "gpt-5-nano",
"expected": "45",
"actual": "46",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2384,
"outputTokens": 3079,
"latencyMs": 24534.847250000006
@@ -6330,7 +6330,7 @@
"model": "claude-haiku-4-5",
"expected": "45",
"actual": "42",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 2858,
"outputTokens": 5,
"latencyMs": 1125.7029999999795
@@ -6341,7 +6341,7 @@
"model": "gpt-5-nano",
"expected": "45",
"actual": "45",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6319,
"outputTokens": 2823,
"latencyMs": 27053.90824999998
@@ -6352,7 +6352,7 @@
"model": "claude-haiku-4-5",
"expected": "45",
"actual": "42",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6367,
"outputTokens": 5,
"latencyMs": 1474.1193330000096
@@ -6363,7 +6363,7 @@
"model": "gpt-5-nano",
"expected": "45",
"actual": "45",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 5015,
"outputTokens": 2567,
"latencyMs": 21642.824207999976
@@ -6374,7 +6374,7 @@
"model": "claude-haiku-4-5",
"expected": "45",
"actual": "38",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 5762,
"outputTokens": 5,
"latencyMs": 1170.1535830000066
@@ -6385,7 +6385,7 @@
"model": "gpt-5-nano",
"expected": "96.17",
"actual": "96.17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 73,
"latencyMs": 2340.6126670000085
@@ -6396,7 +6396,7 @@
"model": "claude-haiku-4-5",
"expected": "96.17",
"actual": "96.17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 7,
"latencyMs": 1337.4746670000022
@@ -6407,7 +6407,7 @@
"model": "gpt-5-nano",
"expected": "96.17",
"actual": "96.17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 137,
"latencyMs": 2275.1715830000176
@@ -6418,7 +6418,7 @@
"model": "claude-haiku-4-5",
"expected": "96.17",
"actual": "96.17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 7,
"latencyMs": 1086.9557499999937
@@ -6429,7 +6429,7 @@
"model": "gpt-5-nano",
"expected": "96.17",
"actual": "96.17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 137,
"latencyMs": 2881.4037499999977
@@ -6440,7 +6440,7 @@
"model": "claude-haiku-4-5",
"expected": "96.17",
"actual": "96.17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 7,
"latencyMs": 1172.774000000005
@@ -6451,7 +6451,7 @@
"model": "gpt-5-nano",
"expected": "96.17",
"actual": "96.17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 201,
"latencyMs": 7706.478582999989
@@ -6462,7 +6462,7 @@
"model": "claude-haiku-4-5",
"expected": "96.17",
"actual": "96.17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 7,
"latencyMs": 1106.0717920000025
@@ -6473,7 +6473,7 @@
"model": "gpt-5-nano",
"expected": "96.17",
"actual": "96.17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 137,
"latencyMs": 6185.161250000005
@@ -6484,7 +6484,7 @@
"model": "claude-haiku-4-5",
"expected": "96.17",
"actual": "96.17",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 7,
"latencyMs": 1388.4410000000207
@@ -6495,7 +6495,7 @@
"model": "gpt-5-nano",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9738,
"outputTokens": 136,
"latencyMs": 6699.9394589999865
@@ -6506,7 +6506,7 @@
"model": "claude-haiku-4-5",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11906,
"outputTokens": 4,
"latencyMs": 1152.8117919999931
@@ -6517,7 +6517,7 @@
"model": "gpt-5-nano",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6012,
"outputTokens": 136,
"latencyMs": 2446.019666999986
@@ -6528,7 +6528,7 @@
"model": "claude-haiku-4-5",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6992,
"outputTokens": 4,
"latencyMs": 1046.3494580000115
@@ -6539,7 +6539,7 @@
"model": "gpt-5-nano",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6780,
"outputTokens": 200,
"latencyMs": 6084.429165999987
@@ -6550,7 +6550,7 @@
"model": "claude-haiku-4-5",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8413,
"outputTokens": 4,
"latencyMs": 1787.2428749999963
@@ -6561,7 +6561,7 @@
"model": "gpt-5-nano",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9157,
"outputTokens": 264,
"latencyMs": 5364.3007919999945
@@ -6572,7 +6572,7 @@
"model": "claude-haiku-4-5",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9288,
"outputTokens": 4,
"latencyMs": 1269.2162499999977
@@ -6583,7 +6583,7 @@
"model": "gpt-5-nano",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7372,
"outputTokens": 72,
"latencyMs": 2381.514374999999
@@ -6594,7 +6594,7 @@
"model": "claude-haiku-4-5",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8384,
"outputTokens": 4,
"latencyMs": 1222.1361669999897
@@ -6605,7 +6605,7 @@
"model": "gpt-5-nano",
"expected": "599.39",
"actual": "599.39",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 201,
"latencyMs": 3641.536167000013
@@ -6616,7 +6616,7 @@
"model": "claude-haiku-4-5",
"expected": "599.39",
"actual": "599.39",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 7,
"latencyMs": 2457.5752079999947
@@ -6627,7 +6627,7 @@
"model": "gpt-5-nano",
"expected": "599.39",
"actual": "599.39",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 201,
"latencyMs": 3384.6115839999984
@@ -6638,7 +6638,7 @@
"model": "claude-haiku-4-5",
"expected": "599.39",
"actual": "599.39",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 7,
"latencyMs": 1372.8756669999857
@@ -6649,7 +6649,7 @@
"model": "gpt-5-nano",
"expected": "599.39",
"actual": "599.39",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 265,
"latencyMs": 5826.962750000006
@@ -6660,7 +6660,7 @@
"model": "claude-haiku-4-5",
"expected": "599.39",
"actual": "599.39",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 7,
"latencyMs": 1303.1691670000146
@@ -6671,7 +6671,7 @@
"model": "gpt-5-nano",
"expected": "599.39",
"actual": "599.39",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 265,
"latencyMs": 3602.1091250000172
@@ -6682,7 +6682,7 @@
"model": "claude-haiku-4-5",
"expected": "599.39",
"actual": "599.39",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 7,
"latencyMs": 1451.1585410000116
@@ -6693,7 +6693,7 @@
"model": "gpt-5-nano",
"expected": "599.39",
"actual": "599.39",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 137,
"latencyMs": 2453.183083000011
@@ -6704,7 +6704,7 @@
"model": "claude-haiku-4-5",
"expected": "599.39",
"actual": "599.39",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 7,
"latencyMs": 1152.136541999993
@@ -6715,7 +6715,7 @@
"model": "gpt-5-nano",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9738,
"outputTokens": 199,
"latencyMs": 5025.56916699998
@@ -6726,7 +6726,7 @@
"model": "claude-haiku-4-5",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11906,
"outputTokens": 4,
"latencyMs": 1111.5014169999922
@@ -6737,7 +6737,7 @@
"model": "gpt-5-nano",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6012,
"outputTokens": 199,
"latencyMs": 3548.9061660000007
@@ -6748,7 +6748,7 @@
"model": "claude-haiku-4-5",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6992,
"outputTokens": 4,
"latencyMs": 1404.0692500000005
@@ -6759,7 +6759,7 @@
"model": "gpt-5-nano",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6780,
"outputTokens": 135,
"latencyMs": 2879.9619169999787
@@ -6770,7 +6770,7 @@
"model": "claude-haiku-4-5",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8413,
"outputTokens": 4,
"latencyMs": 1258.860249999998
@@ -6781,7 +6781,7 @@
"model": "gpt-5-nano",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9157,
"outputTokens": 263,
"latencyMs": 7819.738958000002
@@ -6792,7 +6792,7 @@
"model": "claude-haiku-4-5",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9288,
"outputTokens": 4,
"latencyMs": 1495.973915999988
@@ -6803,7 +6803,7 @@
"model": "gpt-5-nano",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7372,
"outputTokens": 135,
"latencyMs": 3092.4329169999983
@@ -6814,7 +6814,7 @@
"model": "claude-haiku-4-5",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8384,
"outputTokens": 4,
"latencyMs": 1268.1641250000102
@@ -6825,7 +6825,7 @@
"model": "gpt-5-nano",
"expected": "528.71",
"actual": "528.71",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 265,
"latencyMs": 4409.96212500002
@@ -6836,7 +6836,7 @@
"model": "claude-haiku-4-5",
"expected": "528.71",
"actual": "528.71",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 7,
"latencyMs": 1422.6079999999783
@@ -6847,7 +6847,7 @@
"model": "gpt-5-nano",
"expected": "528.71",
"actual": "528.71",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 329,
"latencyMs": 3593.100334000017
@@ -6858,7 +6858,7 @@
"model": "claude-haiku-4-5",
"expected": "528.71",
"actual": "528.71",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 7,
"latencyMs": 1474.3911249999946
@@ -6869,7 +6869,7 @@
"model": "gpt-5-nano",
"expected": "528.71",
"actual": "528.71",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 265,
"latencyMs": 5419.795374999987
@@ -6880,7 +6880,7 @@
"model": "claude-haiku-4-5",
"expected": "528.71",
"actual": "528.71",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 7,
"latencyMs": 1059.3489999999874
@@ -6891,7 +6891,7 @@
"model": "gpt-5-nano",
"expected": "528.71",
"actual": "528.71",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 265,
"latencyMs": 4783.504167000006
@@ -6902,7 +6902,7 @@
"model": "claude-haiku-4-5",
"expected": "528.71",
"actual": "528.71",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 7,
"latencyMs": 1340.6675410000025
@@ -6913,7 +6913,7 @@
"model": "gpt-5-nano",
"expected": "528.71",
"actual": "528.71",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 329,
"latencyMs": 4222.140958000004
@@ -6924,7 +6924,7 @@
"model": "claude-haiku-4-5",
"expected": "528.71",
"actual": "528.71",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 7,
"latencyMs": 1169.892125000013
@@ -6935,7 +6935,7 @@
"model": "gpt-5-nano",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9738,
"outputTokens": 135,
"latencyMs": 2854.8382500000007
@@ -6946,7 +6946,7 @@
"model": "claude-haiku-4-5",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11906,
"outputTokens": 4,
"latencyMs": 1077.335374999995
@@ -6957,7 +6957,7 @@
"model": "gpt-5-nano",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6012,
"outputTokens": 135,
"latencyMs": 2525.2092499999853
@@ -6968,7 +6968,7 @@
"model": "claude-haiku-4-5",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6992,
"outputTokens": 4,
"latencyMs": 2100.2050000000163
@@ -6979,7 +6979,7 @@
"model": "gpt-5-nano",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6780,
"outputTokens": 263,
"latencyMs": 5882.592499999999
@@ -6990,7 +6990,7 @@
"model": "claude-haiku-4-5",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8413,
"outputTokens": 4,
"latencyMs": 1168.5295410000253
@@ -7001,7 +7001,7 @@
"model": "gpt-5-nano",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9157,
"outputTokens": 263,
"latencyMs": 3944.433083000011
@@ -7012,7 +7012,7 @@
"model": "claude-haiku-4-5",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9288,
"outputTokens": 4,
"latencyMs": 1882.1263749999925
@@ -7023,7 +7023,7 @@
"model": "gpt-5-nano",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7372,
"outputTokens": 135,
"latencyMs": 1657.7255829999922
@@ -7034,7 +7034,7 @@
"model": "claude-haiku-4-5",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8384,
"outputTokens": 4,
"latencyMs": 1056.5719169999938
@@ -7045,7 +7045,7 @@
"model": "gpt-5-nano",
"expected": "1687.82",
"actual": "1687.82",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 266,
"latencyMs": 5764.2531250000175
@@ -7056,7 +7056,7 @@
"model": "claude-haiku-4-5",
"expected": "1687.82",
"actual": "1687.82",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 8,
"latencyMs": 1241.8239590000012
@@ -7067,7 +7067,7 @@
"model": "gpt-5-nano",
"expected": "1687.82",
"actual": "1687.82",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 266,
"latencyMs": 3203.148416000011
@@ -7078,7 +7078,7 @@
"model": "claude-haiku-4-5",
"expected": "1687.82",
"actual": "1687.82",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 8,
"latencyMs": 1395.2265419999894
@@ -7089,7 +7089,7 @@
"model": "gpt-5-nano",
"expected": "1687.82",
"actual": "1687.82",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 330,
"latencyMs": 3854.1738750000077
@@ -7100,7 +7100,7 @@
"model": "claude-haiku-4-5",
"expected": "1687.82",
"actual": "1687.82",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 8,
"latencyMs": 1868.680457999988
@@ -7111,7 +7111,7 @@
"model": "gpt-5-nano",
"expected": "1687.82",
"actual": "1687.82",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 330,
"latencyMs": 4486.571708000003
@@ -7122,7 +7122,7 @@
"model": "claude-haiku-4-5",
"expected": "1687.82",
"actual": "1687.82",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 8,
"latencyMs": 1336.9320829999924
@@ -7133,7 +7133,7 @@
"model": "gpt-5-nano",
"expected": "1687.82",
"actual": "1687.82",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 266,
"latencyMs": 3571.6664579999924
@@ -7144,7 +7144,7 @@
"model": "claude-haiku-4-5",
"expected": "1687.82",
"actual": "1687.82",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 8,
"latencyMs": 1179.5032920000085
@@ -7155,7 +7155,7 @@
"model": "gpt-5-nano",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9738,
"outputTokens": 200,
"latencyMs": 3395.709499999997
@@ -7166,7 +7166,7 @@
"model": "claude-haiku-4-5",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11906,
"outputTokens": 4,
"latencyMs": 1374.4573329999985
@@ -7177,7 +7177,7 @@
"model": "gpt-5-nano",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6012,
"outputTokens": 200,
"latencyMs": 3162.779542000004
@@ -7188,7 +7188,7 @@
"model": "claude-haiku-4-5",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6992,
"outputTokens": 4,
"latencyMs": 1010.6076670000039
@@ -7199,7 +7199,7 @@
"model": "gpt-5-nano",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6780,
"outputTokens": 328,
"latencyMs": 3606.7964999999967
@@ -7210,7 +7210,7 @@
"model": "claude-haiku-4-5",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8413,
"outputTokens": 4,
"latencyMs": 1432.5227920000034
@@ -7221,7 +7221,7 @@
"model": "gpt-5-nano",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9157,
"outputTokens": 328,
"latencyMs": 2916.351958000014
@@ -7232,7 +7232,7 @@
"model": "claude-haiku-4-5",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9288,
"outputTokens": 4,
"latencyMs": 1207.7237920000043
@@ -7243,7 +7243,7 @@
"model": "gpt-5-nano",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7372,
"outputTokens": 136,
"latencyMs": 2741.256458000018
@@ -7254,7 +7254,7 @@
"model": "claude-haiku-4-5",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8384,
"outputTokens": 4,
"latencyMs": 1385.7817920000234
@@ -7265,7 +7265,7 @@
"model": "gpt-5-nano",
"expected": "423.6",
"actual": "423.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 201,
"latencyMs": 4731.81024999998
@@ -7276,7 +7276,7 @@
"model": "claude-haiku-4-5",
"expected": "423.6",
"actual": "423.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 7,
"latencyMs": 1572.4971659999865
@@ -7287,7 +7287,7 @@
"model": "gpt-5-nano",
"expected": "423.6",
"actual": "423.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 137,
"latencyMs": 2684.556333000015
@@ -7298,7 +7298,7 @@
"model": "claude-haiku-4-5",
"expected": "423.6",
"actual": "423.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 7,
"latencyMs": 1314.9989999999816
@@ -7309,7 +7309,7 @@
"model": "gpt-5-nano",
"expected": "423.6",
"actual": "423.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 137,
"latencyMs": 2746.457541999989
@@ -7320,7 +7320,7 @@
"model": "claude-haiku-4-5",
"expected": "423.6",
"actual": "423.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 7,
"latencyMs": 1254.8903329999885
@@ -7331,7 +7331,7 @@
"model": "gpt-5-nano",
"expected": "423.6",
"actual": "423.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 137,
"latencyMs": 4298.293416
@@ -7342,7 +7342,7 @@
"model": "claude-haiku-4-5",
"expected": "423.6",
"actual": "423.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 7,
"latencyMs": 1346.4980839999916
@@ -7353,7 +7353,7 @@
"model": "gpt-5-nano",
"expected": "423.6",
"actual": "423.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 265,
"latencyMs": 3634.2565419999883
@@ -7364,7 +7364,7 @@
"model": "claude-haiku-4-5",
"expected": "423.6",
"actual": "423.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 7,
"latencyMs": 1363.8280410000007
@@ -7375,7 +7375,7 @@
"model": "gpt-5-nano",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9738,
"outputTokens": 392,
"latencyMs": 3933.217000000004
@@ -7386,7 +7386,7 @@
"model": "claude-haiku-4-5",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11906,
"outputTokens": 4,
"latencyMs": 1229.9339579999796
@@ -7397,7 +7397,7 @@
"model": "gpt-5-nano",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6012,
"outputTokens": 136,
"latencyMs": 2728.4598340000084
@@ -7408,7 +7408,7 @@
"model": "claude-haiku-4-5",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6992,
"outputTokens": 4,
"latencyMs": 1427.2494170000136
@@ -7419,7 +7419,7 @@
"model": "gpt-5-nano",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6780,
"outputTokens": 200,
"latencyMs": 3187.385666999995
@@ -7430,7 +7430,7 @@
"model": "claude-haiku-4-5",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8413,
"outputTokens": 4,
"latencyMs": 1482.2487079999992
@@ -7441,7 +7441,7 @@
"model": "gpt-5-nano",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9157,
"outputTokens": 264,
"latencyMs": 3429.744458000001
@@ -7452,7 +7452,7 @@
"model": "claude-haiku-4-5",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9288,
"outputTokens": 4,
"latencyMs": 1100.8814589999965
@@ -7463,7 +7463,7 @@
"model": "gpt-5-nano",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7372,
"outputTokens": 72,
"latencyMs": 1993.443707999977
@@ -7474,7 +7474,7 @@
"model": "claude-haiku-4-5",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8384,
"outputTokens": 4,
"latencyMs": 1105.5260419999831
@@ -7485,7 +7485,7 @@
"model": "gpt-5-nano",
"expected": "784.03",
"actual": "784.03",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 137,
"latencyMs": 3255.3775840000017
@@ -7496,7 +7496,7 @@
"model": "claude-haiku-4-5",
"expected": "784.03",
"actual": "784.03",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 7,
"latencyMs": 1274.000417000003
@@ -7507,7 +7507,7 @@
"model": "gpt-5-nano",
"expected": "784.03",
"actual": "784.03",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 265,
"latencyMs": 3098.326624999987
@@ -7518,7 +7518,7 @@
"model": "claude-haiku-4-5",
"expected": "784.03",
"actual": "784.03",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 7,
"latencyMs": 1057.8637079999899
@@ -7529,7 +7529,7 @@
"model": "gpt-5-nano",
"expected": "784.03",
"actual": "784.03",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 201,
"latencyMs": 3651.3826249999984
@@ -7540,7 +7540,7 @@
"model": "claude-haiku-4-5",
"expected": "784.03",
"actual": "784.03",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 7,
"latencyMs": 1404.9795829999784
@@ -7551,7 +7551,7 @@
"model": "gpt-5-nano",
"expected": "784.03",
"actual": "784.03",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 201,
"latencyMs": 4157.148833000014
@@ -7562,7 +7562,7 @@
"model": "claude-haiku-4-5",
"expected": "784.03",
"actual": "784.03",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 7,
"latencyMs": 1607.9431249999907
@@ -7573,7 +7573,7 @@
"model": "gpt-5-nano",
"expected": "784.03",
"actual": "784.03",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 329,
"latencyMs": 4582.246665999992
@@ -7584,7 +7584,7 @@
"model": "claude-haiku-4-5",
"expected": "784.03",
"actual": "784.03",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 7,
"latencyMs": 1458.8513329999987
@@ -7595,7 +7595,7 @@
"model": "gpt-5-nano",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9738,
"outputTokens": 200,
"latencyMs": 3341.994207999989
@@ -7606,7 +7606,7 @@
"model": "claude-haiku-4-5",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11906,
"outputTokens": 4,
"latencyMs": 1144.3136670000094
@@ -7617,7 +7617,7 @@
"model": "gpt-5-nano",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6012,
"outputTokens": 392,
"latencyMs": 6067.672458999994
@@ -7628,7 +7628,7 @@
"model": "claude-haiku-4-5",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6992,
"outputTokens": 4,
"latencyMs": 1325.0467500000086
@@ -7639,7 +7639,7 @@
"model": "gpt-5-nano",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6780,
"outputTokens": 200,
"latencyMs": 2847.485000000015
@@ -7650,7 +7650,7 @@
"model": "claude-haiku-4-5",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8413,
"outputTokens": 4,
"latencyMs": 1212.1944169999915
@@ -7661,7 +7661,7 @@
"model": "gpt-5-nano",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9157,
"outputTokens": 456,
"latencyMs": 5099.853499999997
@@ -7672,7 +7672,7 @@
"model": "claude-haiku-4-5",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9288,
"outputTokens": 4,
"latencyMs": 1284.708416999987
@@ -7683,7 +7683,7 @@
"model": "gpt-5-nano",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7372,
"outputTokens": 200,
"latencyMs": 2745.7869170000195
@@ -7694,7 +7694,7 @@
"model": "claude-haiku-4-5",
"expected": "shipped",
"actual": "shipped",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8384,
"outputTokens": 4,
"latencyMs": 1114.6338329999999
@@ -7705,7 +7705,7 @@
"model": "gpt-5-nano",
"expected": "645.88",
"actual": "645.88",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 265,
"latencyMs": 3482.8154170000053
@@ -7716,7 +7716,7 @@
"model": "claude-haiku-4-5",
"expected": "645.88",
"actual": "645.88",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 7,
"latencyMs": 1156.5491669999901
@@ -7727,7 +7727,7 @@
"model": "gpt-5-nano",
"expected": "645.88",
"actual": "645.88",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 201,
"latencyMs": 2970.104541000008
@@ -7738,7 +7738,7 @@
"model": "claude-haiku-4-5",
"expected": "645.88",
"actual": "645.88",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 7,
"latencyMs": 1297.768374999985
@@ -7749,7 +7749,7 @@
"model": "gpt-5-nano",
"expected": "645.88",
"actual": "645.88",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 201,
"latencyMs": 3475.6895419999782
@@ -7760,7 +7760,7 @@
"model": "claude-haiku-4-5",
"expected": "645.88",
"actual": "645.88",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 7,
"latencyMs": 1469.7436250000028
@@ -7771,7 +7771,7 @@
"model": "gpt-5-nano",
"expected": "645.88",
"actual": "645.88",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 265,
"latencyMs": 4107.424582999985
@@ -7782,7 +7782,7 @@
"model": "claude-haiku-4-5",
"expected": "645.88",
"actual": "645.88",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 7,
"latencyMs": 1070.4507500000182
@@ -7793,7 +7793,7 @@
"model": "gpt-5-nano",
"expected": "645.88",
"actual": "645.88",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 265,
"latencyMs": 3768.3023749999993
@@ -7804,7 +7804,7 @@
"model": "claude-haiku-4-5",
"expected": "645.88",
"actual": "645.88",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 7,
"latencyMs": 1111.744915999996
@@ -7815,7 +7815,7 @@
"model": "gpt-5-nano",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9738,
"outputTokens": 263,
"latencyMs": 3199.3634999999776
@@ -7826,7 +7826,7 @@
"model": "claude-haiku-4-5",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11906,
"outputTokens": 4,
"latencyMs": 1232.4811659999832
@@ -7837,7 +7837,7 @@
"model": "gpt-5-nano",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6012,
"outputTokens": 263,
"latencyMs": 5616.989999999991
@@ -7848,7 +7848,7 @@
"model": "claude-haiku-4-5",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6992,
"outputTokens": 4,
"latencyMs": 1697.3162920000032
@@ -7859,7 +7859,7 @@
"model": "gpt-5-nano",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6780,
"outputTokens": 199,
"latencyMs": 2781.3399999999965
@@ -7870,7 +7870,7 @@
"model": "claude-haiku-4-5",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8413,
"outputTokens": 4,
"latencyMs": 1162.0402089999989
@@ -7881,7 +7881,7 @@
"model": "gpt-5-nano",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9157,
"outputTokens": 199,
"latencyMs": 3651.1349579999805
@@ -7892,7 +7892,7 @@
"model": "claude-haiku-4-5",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9288,
"outputTokens": 4,
"latencyMs": 1132.3132920000062
@@ -7903,7 +7903,7 @@
"model": "gpt-5-nano",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7372,
"outputTokens": 135,
"latencyMs": 3017.5073749999865
@@ -7914,7 +7914,7 @@
"model": "claude-haiku-4-5",
"expected": "processing",
"actual": "processing",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8384,
"outputTokens": 4,
"latencyMs": 1294.688374999998
@@ -7925,7 +7925,7 @@
"model": "gpt-5-nano",
"expected": "371.91",
"actual": "371.91",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 201,
"latencyMs": 3591.221499999985
@@ -7936,7 +7936,7 @@
"model": "claude-haiku-4-5",
"expected": "371.91",
"actual": "371.91",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 7,
"latencyMs": 1329.419332999998
@@ -7947,7 +7947,7 @@
"model": "gpt-5-nano",
"expected": "371.91",
"actual": "371.91",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 137,
"latencyMs": 2655.557792000007
@@ -7958,7 +7958,7 @@
"model": "claude-haiku-4-5",
"expected": "371.91",
"actual": "371.91",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 7,
"latencyMs": 1446.9020000000019
@@ -7969,7 +7969,7 @@
"model": "gpt-5-nano",
"expected": "371.91",
"actual": "371.91",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 201,
"latencyMs": 3450.5822500000068
@@ -7980,7 +7980,7 @@
"model": "claude-haiku-4-5",
"expected": "371.91",
"actual": "371.91",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 7,
"latencyMs": 1291.2180410000146
@@ -7991,7 +7991,7 @@
"model": "gpt-5-nano",
"expected": "371.91",
"actual": "371.91",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 201,
"latencyMs": 2803.9767500000016
@@ -8002,7 +8002,7 @@
"model": "claude-haiku-4-5",
"expected": "371.91",
"actual": "371.91",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 7,
"latencyMs": 1098.5968749999884
@@ -8013,7 +8013,7 @@
"model": "gpt-5-nano",
"expected": "371.91",
"actual": "371.91",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 201,
"latencyMs": 3047.8699999999953
@@ -8024,7 +8024,7 @@
"model": "claude-haiku-4-5",
"expected": "371.91",
"actual": "371.91",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 7,
"latencyMs": 1800.6882080000069
@@ -8035,7 +8035,7 @@
"model": "gpt-5-nano",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9738,
"outputTokens": 199,
"latencyMs": 2957.2203330000048
@@ -8046,7 +8046,7 @@
"model": "claude-haiku-4-5",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11906,
"outputTokens": 4,
"latencyMs": 1165.7748750000028
@@ -8057,7 +8057,7 @@
"model": "gpt-5-nano",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6012,
"outputTokens": 135,
"latencyMs": 2362.283208000008
@@ -8068,7 +8068,7 @@
"model": "claude-haiku-4-5",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6992,
"outputTokens": 4,
"latencyMs": 1871.7275829999999
@@ -8079,7 +8079,7 @@
"model": "gpt-5-nano",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6780,
"outputTokens": 263,
"latencyMs": 4747.243208
@@ -8090,7 +8090,7 @@
"model": "claude-haiku-4-5",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8413,
"outputTokens": 4,
"latencyMs": 1275.342082999996
@@ -8101,7 +8101,7 @@
"model": "gpt-5-nano",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9157,
"outputTokens": 199,
"latencyMs": 3180.0179160000116
@@ -8112,7 +8112,7 @@
"model": "claude-haiku-4-5",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9288,
"outputTokens": 4,
"latencyMs": 2343.5514580000017
@@ -8123,7 +8123,7 @@
"model": "gpt-5-nano",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7372,
"outputTokens": 135,
"latencyMs": 2362.525915999984
@@ -8134,7 +8134,7 @@
"model": "claude-haiku-4-5",
"expected": "pending",
"actual": "pending",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8384,
"outputTokens": 4,
"latencyMs": 1231.4291669999948
@@ -8145,7 +8145,7 @@
"model": "gpt-5-nano",
"expected": "1066",
"actual": "1066",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 200,
"latencyMs": 3091.9045840000035
@@ -8156,7 +8156,7 @@
"model": "claude-haiku-4-5",
"expected": "1066",
"actual": "1066",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 6,
"latencyMs": 1111.9695000000065
@@ -8167,7 +8167,7 @@
"model": "gpt-5-nano",
"expected": "1066",
"actual": "1066",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 264,
"latencyMs": 3977.5146669999813
@@ -8178,7 +8178,7 @@
"model": "claude-haiku-4-5",
"expected": "1066",
"actual": "1066",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 6,
"latencyMs": 1195.262208
@@ -8189,7 +8189,7 @@
"model": "gpt-5-nano",
"expected": "1066",
"actual": "1066",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 328,
"latencyMs": 3839.0627499999828
@@ -8200,7 +8200,7 @@
"model": "claude-haiku-4-5",
"expected": "1066",
"actual": "1066",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 6,
"latencyMs": 2186.8021250000165
@@ -8211,7 +8211,7 @@
"model": "gpt-5-nano",
"expected": "1066",
"actual": "1066",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 328,
"latencyMs": 6945.004667000001
@@ -8222,7 +8222,7 @@
"model": "claude-haiku-4-5",
"expected": "1066",
"actual": "1066",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 6,
"latencyMs": 1103.6762919999892
@@ -8233,7 +8233,7 @@
"model": "gpt-5-nano",
"expected": "1066",
"actual": "1066",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 264,
"latencyMs": 3924.5181250000023
@@ -8244,7 +8244,7 @@
"model": "claude-haiku-4-5",
"expected": "1066",
"actual": "1066",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 6,
"latencyMs": 1023.334583000018
@@ -8255,7 +8255,7 @@
"model": "gpt-5-nano",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9738,
"outputTokens": 264,
"latencyMs": 4017.931666999997
@@ -8266,7 +8266,7 @@
"model": "claude-haiku-4-5",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11906,
"outputTokens": 4,
"latencyMs": 1278.6839580000087
@@ -8277,7 +8277,7 @@
"model": "gpt-5-nano",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6012,
"outputTokens": 200,
"latencyMs": 2566.9374580000003
@@ -8288,7 +8288,7 @@
"model": "claude-haiku-4-5",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6992,
"outputTokens": 4,
"latencyMs": 958.4104159999988
@@ -8299,7 +8299,7 @@
"model": "gpt-5-nano",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6780,
"outputTokens": 264,
"latencyMs": 3640.0960409999825
@@ -8310,7 +8310,7 @@
"model": "claude-haiku-4-5",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8413,
"outputTokens": 4,
"latencyMs": 1534.7306249999965
@@ -8321,7 +8321,7 @@
"model": "gpt-5-nano",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9157,
"outputTokens": 328,
"latencyMs": 3905.6711249999935
@@ -8332,7 +8332,7 @@
"model": "claude-haiku-4-5",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9288,
"outputTokens": 4,
"latencyMs": 2067.435375000001
@@ -8343,7 +8343,7 @@
"model": "gpt-5-nano",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7372,
"outputTokens": 264,
"latencyMs": 3613.7146249999932
@@ -8354,7 +8354,7 @@
"model": "claude-haiku-4-5",
"expected": "cancelled",
"actual": "cancelled",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8384,
"outputTokens": 4,
"latencyMs": 1154.955958000006
@@ -8365,7 +8365,7 @@
"model": "gpt-5-nano",
"expected": "1697.4",
"actual": "1697.4",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 330,
"latencyMs": 3904.2146250000224
@@ -8376,7 +8376,7 @@
"model": "claude-haiku-4-5",
"expected": "1697.4",
"actual": "1697.4",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 8,
"latencyMs": 1618.7487079999992
@@ -8387,7 +8387,7 @@
"model": "gpt-5-nano",
"expected": "1697.4",
"actual": "1697.4",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 202,
"latencyMs": 2906.194541999983
@@ -8398,7 +8398,7 @@
"model": "claude-haiku-4-5",
"expected": "1697.4",
"actual": "1697.4",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 8,
"latencyMs": 1481.559333000012
@@ -8409,7 +8409,7 @@
"model": "gpt-5-nano",
"expected": "1697.4",
"actual": "1697.4",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 266,
"latencyMs": 3879.7539999999863
@@ -8420,7 +8420,7 @@
"model": "claude-haiku-4-5",
"expected": "1697.4",
"actual": "1697.4",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 8,
"latencyMs": 1809.5822499999776
@@ -8431,7 +8431,7 @@
"model": "gpt-5-nano",
"expected": "1697.4",
"actual": "1697.4",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 202,
"latencyMs": 3147.330500000011
@@ -8442,7 +8442,7 @@
"model": "claude-haiku-4-5",
"expected": "1697.4",
"actual": "1697.4",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 8,
"latencyMs": 1297.2377080000006
@@ -8453,7 +8453,7 @@
"model": "gpt-5-nano",
"expected": "1697.4",
"actual": "1697.4",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 394,
"latencyMs": 3710.157500000001
@@ -8464,7 +8464,7 @@
"model": "claude-haiku-4-5",
"expected": "1697.4",
"actual": "1697.4",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 8,
"latencyMs": 1238.5442500000063
@@ -8475,7 +8475,7 @@
"model": "gpt-5-nano",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9738,
"outputTokens": 392,
"latencyMs": 4101.743083999987
@@ -8486,7 +8486,7 @@
"model": "claude-haiku-4-5",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11906,
"outputTokens": 4,
"latencyMs": 1170.750417000003
@@ -8497,7 +8497,7 @@
"model": "gpt-5-nano",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6012,
"outputTokens": 264,
"latencyMs": 8324.009665999998
@@ -8508,7 +8508,7 @@
"model": "claude-haiku-4-5",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6992,
"outputTokens": 4,
"latencyMs": 1173.343790999992
@@ -8519,7 +8519,7 @@
"model": "gpt-5-nano",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6780,
"outputTokens": 264,
"latencyMs": 3005.4394999999786
@@ -8530,7 +8530,7 @@
"model": "claude-haiku-4-5",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8413,
"outputTokens": 4,
"latencyMs": 1376.5506659999955
@@ -8541,7 +8541,7 @@
"model": "gpt-5-nano",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9157,
"outputTokens": 136,
"latencyMs": 3209.5317499999946
@@ -8552,7 +8552,7 @@
"model": "claude-haiku-4-5",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9288,
"outputTokens": 4,
"latencyMs": 1299.4064170000202
@@ -8563,7 +8563,7 @@
"model": "gpt-5-nano",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7372,
"outputTokens": 264,
"latencyMs": 3753.726042000024
@@ -8574,7 +8574,7 @@
"model": "claude-haiku-4-5",
"expected": "delivered",
"actual": "delivered",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8384,
"outputTokens": 4,
"latencyMs": 1134.558416999993
@@ -8585,7 +8585,7 @@
"model": "gpt-5-nano",
"expected": "Valerie Braun",
"actual": "Valerie Braun",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 73,
"latencyMs": 2494.451874999999
@@ -8596,7 +8596,7 @@
"model": "claude-haiku-4-5",
"expected": "Valerie Braun",
"actual": "Valerie Braun",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 9,
"latencyMs": 1270.5290410000016
@@ -8607,7 +8607,7 @@
"model": "gpt-5-nano",
"expected": "Valerie Braun",
"actual": "Valerie Braun",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 137,
"latencyMs": 2403.4134579999954
@@ -8618,7 +8618,7 @@
"model": "claude-haiku-4-5",
"expected": "Valerie Braun",
"actual": "Valerie Braun",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 9,
"latencyMs": 1673.0169579999929
@@ -8629,7 +8629,7 @@
"model": "gpt-5-nano",
"expected": "Valerie Braun",
"actual": "Valerie Braun",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 73,
"latencyMs": 1704.8420409999962
@@ -8640,7 +8640,7 @@
"model": "claude-haiku-4-5",
"expected": "Valerie Braun",
"actual": "Valerie Braun",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 9,
"latencyMs": 1447.5210840000072
@@ -8651,7 +8651,7 @@
"model": "gpt-5-nano",
"expected": "Valerie Braun",
"actual": "Valerie Braun",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 73,
"latencyMs": 1638.756207999977
@@ -8662,7 +8662,7 @@
"model": "claude-haiku-4-5",
"expected": "Valerie Braun",
"actual": "Valerie Braun",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 9,
"latencyMs": 1504.7892920000013
@@ -8673,7 +8673,7 @@
"model": "gpt-5-nano",
"expected": "Valerie Braun",
"actual": "Valerie Braun",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 137,
"latencyMs": 2409.509625000006
@@ -8684,7 +8684,7 @@
"model": "claude-haiku-4-5",
"expected": "Valerie Braun",
"actual": "Valerie Braun",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 9,
"latencyMs": 1318.699833999999
@@ -8695,7 +8695,7 @@
"model": "gpt-5-nano",
"expected": "Anita Kozey",
"actual": "Anita Kozey",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 138,
"latencyMs": 2616.233749999985
@@ -8706,7 +8706,7 @@
"model": "claude-haiku-4-5",
"expected": "Anita Kozey",
"actual": "Anita Kozey",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 9,
"latencyMs": 1314.3836249999877
@@ -8717,7 +8717,7 @@
"model": "gpt-5-nano",
"expected": "Anita Kozey",
"actual": "Anita Kozey",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 138,
"latencyMs": 2722.7087499999907
@@ -8728,7 +8728,7 @@
"model": "claude-haiku-4-5",
"expected": "Anita Kozey",
"actual": "Anita Kozey",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 9,
"latencyMs": 1190.632500000007
@@ -8739,7 +8739,7 @@
"model": "gpt-5-nano",
"expected": "Anita Kozey",
"actual": "Anita Kozey",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 330,
"latencyMs": 4346.388291999989
@@ -8750,7 +8750,7 @@
"model": "claude-haiku-4-5",
"expected": "Anita Kozey",
"actual": "Anita Kozey",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 9,
"latencyMs": 1327.8158750000002
@@ -8761,7 +8761,7 @@
"model": "gpt-5-nano",
"expected": "Anita Kozey",
"actual": "Anita Kozey",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 74,
"latencyMs": 2443.0598340000142
@@ -8772,7 +8772,7 @@
"model": "claude-haiku-4-5",
"expected": "Anita Kozey",
"actual": "Anita Kozey",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 9,
"latencyMs": 1396.4260829999985
@@ -8783,7 +8783,7 @@
"model": "gpt-5-nano",
"expected": "Anita Kozey",
"actual": "Anita Kozey",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 266,
"latencyMs": 4886.8007919999945
@@ -8794,7 +8794,7 @@
"model": "claude-haiku-4-5",
"expected": "Anita Kozey",
"actual": "Anita Kozey",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 9,
"latencyMs": 1469.287249999994
@@ -8805,7 +8805,7 @@
"model": "gpt-5-nano",
"expected": "Elmer Kub PhD",
"actual": "Elmer Kub PhD",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 139,
"latencyMs": 2891.1199170000036
@@ -8816,7 +8816,7 @@
"model": "claude-haiku-4-5",
"expected": "Elmer Kub PhD",
"actual": "Elmer Kub PhD",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 10,
"latencyMs": 1342.1902079999854
@@ -8827,7 +8827,7 @@
"model": "gpt-5-nano",
"expected": "Elmer Kub PhD",
"actual": "Elmer Kub PhD",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 139,
"latencyMs": 2846.046624999988
@@ -8838,7 +8838,7 @@
"model": "claude-haiku-4-5",
"expected": "Elmer Kub PhD",
"actual": "Elmer Kub PhD",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 10,
"latencyMs": 1327.919499999989
@@ -8849,7 +8849,7 @@
"model": "gpt-5-nano",
"expected": "Elmer Kub PhD",
"actual": "Elmer Kub PhD",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 139,
"latencyMs": 4302.444041999988
@@ -8860,7 +8860,7 @@
"model": "claude-haiku-4-5",
"expected": "Elmer Kub PhD",
"actual": "Elmer Kub PhD",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 10,
"latencyMs": 1207.6207500000019
@@ -8871,7 +8871,7 @@
"model": "gpt-5-nano",
"expected": "Elmer Kub PhD",
"actual": "Elmer Kub PhD",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 267,
"latencyMs": 3389.5046659999934
@@ -8882,7 +8882,7 @@
"model": "claude-haiku-4-5",
"expected": "Elmer Kub PhD",
"actual": "Elmer Kub PhD",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 10,
"latencyMs": 1236.2248340000224
@@ -8893,7 +8893,7 @@
"model": "gpt-5-nano",
"expected": "Elmer Kub PhD",
"actual": "Elmer Kub PhD",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 139,
"latencyMs": 2138.4831669999985
@@ -8904,7 +8904,7 @@
"model": "claude-haiku-4-5",
"expected": "Elmer Kub PhD",
"actual": "Elmer Kub PhD",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 10,
"latencyMs": 1233.3828330000106
@@ -8915,7 +8915,7 @@
"model": "gpt-5-nano",
"expected": "Maxine Zemlak",
"actual": "Maxine Zemlak",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 138,
"latencyMs": 3346.8621669999848
@@ -8926,7 +8926,7 @@
"model": "claude-haiku-4-5",
"expected": "Maxine Zemlak",
"actual": "Maxine Zemlak",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 10,
"latencyMs": 1321.650082999986
@@ -8937,7 +8937,7 @@
"model": "gpt-5-nano",
"expected": "Maxine Zemlak",
"actual": "Maxine Zemlak",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 138,
"latencyMs": 2395.766499999998
@@ -8948,7 +8948,7 @@
"model": "claude-haiku-4-5",
"expected": "Maxine Zemlak",
"actual": "Maxine Zemlak",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 10,
"latencyMs": 1749.51670800001
@@ -8959,7 +8959,7 @@
"model": "gpt-5-nano",
"expected": "Maxine Zemlak",
"actual": "Maxine Zemlak",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 330,
"latencyMs": 4207.4487500000105
@@ -8970,7 +8970,7 @@
"model": "claude-haiku-4-5",
"expected": "Maxine Zemlak",
"actual": "Maxine Zemlak",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 10,
"latencyMs": 1495.846125000011
@@ -8981,7 +8981,7 @@
"model": "gpt-5-nano",
"expected": "Maxine Zemlak",
"actual": "Maxine Zemlak",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 266,
"latencyMs": 4258.881374999997
@@ -8992,7 +8992,7 @@
"model": "claude-haiku-4-5",
"expected": "Maxine Zemlak",
"actual": "Maxine Zemlak",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 10,
"latencyMs": 1113.9782499999856
@@ -9003,7 +9003,7 @@
"model": "gpt-5-nano",
"expected": "Maxine Zemlak",
"actual": "Maxine Zemlak",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 74,
"latencyMs": 1841.1115829999908
@@ -9014,7 +9014,7 @@
"model": "claude-haiku-4-5",
"expected": "Maxine Zemlak",
"actual": "Maxine Zemlak",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 10,
"latencyMs": 1350.6631249999919
@@ -9025,7 +9025,7 @@
"model": "gpt-5-nano",
"expected": "Emanuel Littel",
"actual": "Emanuel Littel",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 138,
"latencyMs": 2322.9531669999997
@@ -9036,7 +9036,7 @@
"model": "claude-haiku-4-5",
"expected": "Emanuel Littel",
"actual": "Emanuel Littel",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 7,
"latencyMs": 1556.4763749999984
@@ -9047,7 +9047,7 @@
"model": "gpt-5-nano",
"expected": "Emanuel Littel",
"actual": "Emanuel Littel",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 74,
"latencyMs": 2354.004667000001
@@ -9058,7 +9058,7 @@
"model": "claude-haiku-4-5",
"expected": "Emanuel Littel",
"actual": "Emanuel Littel",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 7,
"latencyMs": 1314.1952909999818
@@ -9069,7 +9069,7 @@
"model": "gpt-5-nano",
"expected": "Emanuel Littel",
"actual": "Emanuel Littel",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 138,
"latencyMs": 3437.8392080000194
@@ -9080,7 +9080,7 @@
"model": "claude-haiku-4-5",
"expected": "Emanuel Littel",
"actual": "Emanuel Littel",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 7,
"latencyMs": 1131.0356249999895
@@ -9091,7 +9091,7 @@
"model": "gpt-5-nano",
"expected": "Emanuel Littel",
"actual": "Emanuel Littel",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 138,
"latencyMs": 3209.646000000008
@@ -9102,7 +9102,7 @@
"model": "claude-haiku-4-5",
"expected": "Emanuel Littel",
"actual": "Emanuel Littel",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 7,
"latencyMs": 1175.6475829999836
@@ -9113,7 +9113,7 @@
"model": "gpt-5-nano",
"expected": "Emanuel Littel",
"actual": "Emanuel Littel",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 266,
"latencyMs": 3785.0792920000094
@@ -9124,7 +9124,7 @@
"model": "claude-haiku-4-5",
"expected": "Emanuel Littel",
"actual": "Emanuel Littel",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 7,
"latencyMs": 1314.7905420000025
@@ -9135,7 +9135,7 @@
"model": "gpt-5-nano",
"expected": "Andrew Kling",
"actual": "Andrew Kling",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 72,
"latencyMs": 2562.896166999999
@@ -9146,7 +9146,7 @@
"model": "claude-haiku-4-5",
"expected": "Andrew Kling",
"actual": "Andrew Kling",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 7,
"latencyMs": 3205.178583000001
@@ -9157,7 +9157,7 @@
"model": "gpt-5-nano",
"expected": "Andrew Kling",
"actual": "Andrew Kling",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 136,
"latencyMs": 3746.9874170000257
@@ -9168,7 +9168,7 @@
"model": "claude-haiku-4-5",
"expected": "Andrew Kling",
"actual": "Andrew Kling",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 7,
"latencyMs": 1159.280584000022
@@ -9179,7 +9179,7 @@
"model": "gpt-5-nano",
"expected": "Andrew Kling",
"actual": "Marvin Thiel",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6781,
"outputTokens": 202,
"latencyMs": 2584.499542000005
@@ -9190,7 +9190,7 @@
"model": "claude-haiku-4-5",
"expected": "Andrew Kling",
"actual": "Andrew Kling",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 7,
"latencyMs": 1249.9375
@@ -9201,7 +9201,7 @@
"model": "gpt-5-nano",
"expected": "Andrew Kling",
"actual": "Andrew Kling",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 136,
"latencyMs": 2068.6956669999927
@@ -9212,7 +9212,7 @@
"model": "claude-haiku-4-5",
"expected": "Andrew Kling",
"actual": "Andrew Kling",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 7,
"latencyMs": 1733.235834000021
@@ -9223,7 +9223,7 @@
"model": "gpt-5-nano",
"expected": "Andrew Kling",
"actual": "Andrew Kling",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 200,
"latencyMs": 3831.721124999982
@@ -9234,7 +9234,7 @@
"model": "claude-haiku-4-5",
"expected": "Andrew Kling",
"actual": "Andrew Kling",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 7,
"latencyMs": 1311.1745419999934
@@ -9245,7 +9245,7 @@
"model": "gpt-5-nano",
"expected": "Morris O'Hara",
"actual": "Morris O'Hara",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 139,
"latencyMs": 5464.460791999998
@@ -9256,7 +9256,7 @@
"model": "claude-haiku-4-5",
"expected": "Morris O'Hara",
"actual": "Morris O'Hara",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 9,
"latencyMs": 1266.8881249999977
@@ -9267,7 +9267,7 @@
"model": "gpt-5-nano",
"expected": "Morris O'Hara",
"actual": "Morris O'Hara",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 203,
"latencyMs": 2957.0821250000154
@@ -9278,7 +9278,7 @@
"model": "claude-haiku-4-5",
"expected": "Morris O'Hara",
"actual": "Morris O'Hara",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 9,
"latencyMs": 1264.50791700001
@@ -9289,7 +9289,7 @@
"model": "gpt-5-nano",
"expected": "Morris O'Hara",
"actual": "Morris O'Hara",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 331,
"latencyMs": 3740.643666000018
@@ -9300,7 +9300,7 @@
"model": "claude-haiku-4-5",
"expected": "Morris O'Hara",
"actual": "Morris O'Hara",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 9,
"latencyMs": 1310.5358749999723
@@ -9311,7 +9311,7 @@
"model": "gpt-5-nano",
"expected": "Morris O'Hara",
"actual": "Morris O'Hara",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 139,
"latencyMs": 2979.4539579999982
@@ -9322,7 +9322,7 @@
"model": "claude-haiku-4-5",
"expected": "Morris O'Hara",
"actual": "Morris O'Hara",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 9,
"latencyMs": 2026.8683329999913
@@ -9333,7 +9333,7 @@
"model": "gpt-5-nano",
"expected": "Morris O'Hara",
"actual": "Morris O'Hara",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 139,
"latencyMs": 2932.0294159999758
@@ -9344,7 +9344,7 @@
"model": "claude-haiku-4-5",
"expected": "Morris O'Hara",
"actual": "Morris O'Hara",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 9,
"latencyMs": 1130.2447079999838
@@ -9355,7 +9355,7 @@
"model": "gpt-5-nano",
"expected": "Elijah Franecki",
"actual": "Elijah Franecki",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 203,
"latencyMs": 2576.945458000002
@@ -9366,7 +9366,7 @@
"model": "claude-haiku-4-5",
"expected": "Elijah Franecki",
"actual": "Elijah Franecki",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 9,
"latencyMs": 1214.6620409999741
@@ -9377,7 +9377,7 @@
"model": "gpt-5-nano",
"expected": "Elijah Franecki",
"actual": "Elijah Franecki",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 203,
"latencyMs": 3718.371167000005
@@ -9388,7 +9388,7 @@
"model": "claude-haiku-4-5",
"expected": "Elijah Franecki",
"actual": "Elijah Franecki",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 9,
"latencyMs": 1374.984832999995
@@ -9399,7 +9399,7 @@
"model": "gpt-5-nano",
"expected": "Elijah Franecki",
"actual": "Elijah Franecki",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 139,
"latencyMs": 2313.5867499999877
@@ -9410,7 +9410,7 @@
"model": "claude-haiku-4-5",
"expected": "Elijah Franecki",
"actual": "Elijah Franecki",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 9,
"latencyMs": 1325.0793330000015
@@ -9421,7 +9421,7 @@
"model": "gpt-5-nano",
"expected": "Elijah Franecki",
"actual": "Elijah Franecki",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 139,
"latencyMs": 2777.8669999999984
@@ -9432,7 +9432,7 @@
"model": "claude-haiku-4-5",
"expected": "Elijah Franecki",
"actual": "Elijah Franecki",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 9,
"latencyMs": 1246.2134589999914
@@ -9443,7 +9443,7 @@
"model": "gpt-5-nano",
"expected": "Elijah Franecki",
"actual": "Elijah Franecki",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 75,
"latencyMs": 2246.8254580000066
@@ -9454,7 +9454,7 @@
"model": "claude-haiku-4-5",
"expected": "Elijah Franecki",
"actual": "Elijah Franecki",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 9,
"latencyMs": 1573.5733749999781
@@ -9465,7 +9465,7 @@
"model": "gpt-5-nano",
"expected": "Malcolm Erdman",
"actual": "Malcolm Erdman",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 74,
"latencyMs": 2494.7630000000063
@@ -9476,7 +9476,7 @@
"model": "claude-haiku-4-5",
"expected": "Malcolm Erdman",
"actual": "Malcolm Erdman",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 7,
"latencyMs": 1135.412083000003
@@ -9487,7 +9487,7 @@
"model": "gpt-5-nano",
"expected": "Malcolm Erdman",
"actual": "Malcolm Erdman",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 138,
"latencyMs": 2332.6303330000082
@@ -9498,7 +9498,7 @@
"model": "claude-haiku-4-5",
"expected": "Malcolm Erdman",
"actual": "Malcolm Erdman",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 7,
"latencyMs": 1175.6766249999928
@@ -9509,7 +9509,7 @@
"model": "gpt-5-nano",
"expected": "Malcolm Erdman",
"actual": "Malcolm Erdman",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 458,
"latencyMs": 4252.623416000017
@@ -9520,7 +9520,7 @@
"model": "claude-haiku-4-5",
"expected": "Malcolm Erdman",
"actual": "Malcolm Erdman",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 7,
"latencyMs": 1297.546416999976
@@ -9531,7 +9531,7 @@
"model": "gpt-5-nano",
"expected": "Malcolm Erdman",
"actual": "Malcolm Erdman",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 74,
"latencyMs": 2264.2770829999936
@@ -9542,7 +9542,7 @@
"model": "claude-haiku-4-5",
"expected": "Malcolm Erdman",
"actual": "Malcolm Erdman",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 7,
"latencyMs": 1055.0764170000039
@@ -9553,7 +9553,7 @@
"model": "gpt-5-nano",
"expected": "Malcolm Erdman",
"actual": "Malcolm Erdman",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 138,
"latencyMs": 3193.2753749999974
@@ -9564,7 +9564,7 @@
"model": "claude-haiku-4-5",
"expected": "Malcolm Erdman",
"actual": "Malcolm Erdman",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 7,
"latencyMs": 1912.7229999999981
@@ -9575,7 +9575,7 @@
"model": "gpt-5-nano",
"expected": "Fannie Skiles",
"actual": "Fannie Skiles",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 138,
"latencyMs": 2147.5894160000025
@@ -9586,7 +9586,7 @@
"model": "claude-haiku-4-5",
"expected": "Fannie Skiles",
"actual": "Fannie Skiles",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 9,
"latencyMs": 1377.5190409999923
@@ -9597,7 +9597,7 @@
"model": "gpt-5-nano",
"expected": "Fannie Skiles",
"actual": "Fannie Skiles",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 202,
"latencyMs": 4472.317459000013
@@ -9608,7 +9608,7 @@
"model": "claude-haiku-4-5",
"expected": "Fannie Skiles",
"actual": "Fannie Skiles",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 9,
"latencyMs": 1376.0682919999817
@@ -9619,7 +9619,7 @@
"model": "gpt-5-nano",
"expected": "Fannie Skiles",
"actual": "Fannie Skiles",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 202,
"latencyMs": 6952.122459000006
@@ -9630,7 +9630,7 @@
"model": "claude-haiku-4-5",
"expected": "Fannie Skiles",
"actual": "Fannie Skiles",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 9,
"latencyMs": 1178.8732909999962
@@ -9641,7 +9641,7 @@
"model": "gpt-5-nano",
"expected": "Fannie Skiles",
"actual": "Fannie Skiles",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 266,
"latencyMs": 3619.214917000005
@@ -9652,7 +9652,7 @@
"model": "claude-haiku-4-5",
"expected": "Fannie Skiles",
"actual": "Fannie Skiles",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 9,
"latencyMs": 1212.3732920000039
@@ -9663,7 +9663,7 @@
"model": "gpt-5-nano",
"expected": "Fannie Skiles",
"actual": "Fannie Skiles",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 202,
"latencyMs": 5169.327332999994
@@ -9674,7 +9674,7 @@
"model": "claude-haiku-4-5",
"expected": "Fannie Skiles",
"actual": "Fannie Skiles",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 9,
"latencyMs": 1452.6941670000087
@@ -9685,7 +9685,7 @@
"model": "gpt-5-nano",
"expected": "Sonja Emmerich",
"actual": "Sonja Emmerich",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 395,
"latencyMs": 3384.798125000001
@@ -9696,7 +9696,7 @@
"model": "claude-haiku-4-5",
"expected": "Sonja Emmerich",
"actual": "Sonja Emmerich",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 10,
"latencyMs": 1241.960665999999
@@ -9707,7 +9707,7 @@
"model": "gpt-5-nano",
"expected": "Sonja Emmerich",
"actual": "Sonja Emmerich",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 331,
"latencyMs": 4747.914124999981
@@ -9718,7 +9718,7 @@
"model": "claude-haiku-4-5",
"expected": "Sonja Emmerich",
"actual": "Sonja Emmerich",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 10,
"latencyMs": 1302.8907080000208
@@ -9729,7 +9729,7 @@
"model": "gpt-5-nano",
"expected": "Sonja Emmerich",
"actual": "Sonja Emmerich",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 331,
"latencyMs": 3532.4660830000066
@@ -9740,7 +9740,7 @@
"model": "claude-haiku-4-5",
"expected": "Sonja Emmerich",
"actual": "Sonja Emmerich",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 10,
"latencyMs": 1203.086540999997
@@ -9751,7 +9751,7 @@
"model": "gpt-5-nano",
"expected": "Sonja Emmerich",
"actual": "Sonja Emmerich",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 331,
"latencyMs": 4074.5077089999977
@@ -9762,7 +9762,7 @@
"model": "claude-haiku-4-5",
"expected": "Sonja Emmerich",
"actual": "Sonja Emmerich",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 10,
"latencyMs": 1345.891499999998
@@ -9773,7 +9773,7 @@
"model": "gpt-5-nano",
"expected": "Sonja Emmerich",
"actual": "Sonja Emmerich",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 75,
"latencyMs": 1885.0838330000115
@@ -9784,7 +9784,7 @@
"model": "claude-haiku-4-5",
"expected": "Sonja Emmerich",
"actual": "Sonja Emmerich",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 10,
"latencyMs": 1182.5891669999983
@@ -9795,7 +9795,7 @@
"model": "gpt-5-nano",
"expected": "Frank Emmerich DVM",
"actual": "Frank Emmerich DVM",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 140,
"latencyMs": 2772.3258339999884
@@ -9806,7 +9806,7 @@
"model": "claude-haiku-4-5",
"expected": "Frank Emmerich DVM",
"actual": "Frank Emmerich DVM",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 10,
"latencyMs": 1424.9674579999992
@@ -9817,7 +9817,7 @@
"model": "gpt-5-nano",
"expected": "Frank Emmerich DVM",
"actual": "Frank Emmerich DVM",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 204,
"latencyMs": 2900.4731660000107
@@ -9828,7 +9828,7 @@
"model": "claude-haiku-4-5",
"expected": "Frank Emmerich DVM",
"actual": "Frank Emmerich DVM",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 10,
"latencyMs": 2815.817249999993
@@ -9839,7 +9839,7 @@
"model": "gpt-5-nano",
"expected": "Frank Emmerich DVM",
"actual": "Frank Emmerich DVM",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 268,
"latencyMs": 3637.2442089999968
@@ -9850,7 +9850,7 @@
"model": "claude-haiku-4-5",
"expected": "Frank Emmerich DVM",
"actual": "Frank Emmerich DVM",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 10,
"latencyMs": 1104.2333339999896
@@ -9861,7 +9861,7 @@
"model": "gpt-5-nano",
"expected": "Frank Emmerich DVM",
"actual": "Frank Emmerich DVM",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 396,
"latencyMs": 8213.703791999986
@@ -9872,7 +9872,7 @@
"model": "claude-haiku-4-5",
"expected": "Frank Emmerich DVM",
"actual": "Frank Emmerich DVM",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 10,
"latencyMs": 2875.9923749999725
@@ -9883,7 +9883,7 @@
"model": "gpt-5-nano",
"expected": "Frank Emmerich DVM",
"actual": "Frank Emmerich DVM",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 140,
"latencyMs": 2809.8342080000148
@@ -9894,7 +9894,7 @@
"model": "claude-haiku-4-5",
"expected": "Frank Emmerich DVM",
"actual": "Frank Emmerich DVM",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 10,
"latencyMs": 1306.0824999999895
@@ -9905,7 +9905,7 @@
"model": "gpt-5-nano",
"expected": "Ronald Collins",
"actual": "Ronald Collins",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 265,
"latencyMs": 3632.680000000022
@@ -9916,7 +9916,7 @@
"model": "claude-haiku-4-5",
"expected": "Ronald Collins",
"actual": "Ronald Collins",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 5,
"latencyMs": 1446.0535420000087
@@ -9927,7 +9927,7 @@
"model": "gpt-5-nano",
"expected": "Ronald Collins",
"actual": "Ronald Collins",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 201,
"latencyMs": 2629.6447500000068
@@ -9938,7 +9938,7 @@
"model": "claude-haiku-4-5",
"expected": "Ronald Collins",
"actual": "Ronald Collins",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 5,
"latencyMs": 1387.298958999978
@@ -9949,7 +9949,7 @@
"model": "gpt-5-nano",
"expected": "Ronald Collins",
"actual": "Ronald Collins",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 457,
"latencyMs": 8303.644042
@@ -9960,7 +9960,7 @@
"model": "claude-haiku-4-5",
"expected": "Ronald Collins",
"actual": "Ronald Collins",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 5,
"latencyMs": 1178.2771250000224
@@ -9971,7 +9971,7 @@
"model": "gpt-5-nano",
"expected": "Ronald Collins",
"actual": "Ronald Collins",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 329,
"latencyMs": 3967.7135410000046
@@ -9982,7 +9982,7 @@
"model": "claude-haiku-4-5",
"expected": "Ronald Collins",
"actual": "Ronald Collins",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 5,
"latencyMs": 1278.0479160000104
@@ -9993,7 +9993,7 @@
"model": "gpt-5-nano",
"expected": "Ronald Collins",
"actual": "Ronald Collins",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 73,
"latencyMs": 1974.7658750000119
@@ -10004,7 +10004,7 @@
"model": "claude-haiku-4-5",
"expected": "Ronald Collins",
"actual": "Ronald Collins",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 5,
"latencyMs": 1496.9746670000022
@@ -10015,7 +10015,7 @@
"model": "gpt-5-nano",
"expected": "Jeannie Klein",
"actual": "Jeannie Klein",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 201,
"latencyMs": 4246.4962499999965
@@ -10026,7 +10026,7 @@
"model": "claude-haiku-4-5",
"expected": "Jeannie Klein",
"actual": "Jeannie Klein",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 8,
"latencyMs": 1322.2766660000198
@@ -10037,7 +10037,7 @@
"model": "gpt-5-nano",
"expected": "Jeannie Klein",
"actual": "Jeannie Klein",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 137,
"latencyMs": 2135.097083999979
@@ -10048,7 +10048,7 @@
"model": "claude-haiku-4-5",
"expected": "Jeannie Klein",
"actual": "Jeannie Klein",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 8,
"latencyMs": 1213.9765000000189
@@ -10059,7 +10059,7 @@
"model": "gpt-5-nano",
"expected": "Jeannie Klein",
"actual": "Jeannie Klein",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 265,
"latencyMs": 3583.0762920000125
@@ -10070,7 +10070,7 @@
"model": "claude-haiku-4-5",
"expected": "Jeannie Klein",
"actual": "Jeannie Klein",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 8,
"latencyMs": 1353.168249999988
@@ -10081,7 +10081,7 @@
"model": "gpt-5-nano",
"expected": "Jeannie Klein",
"actual": "Jeannie Klein",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 201,
"latencyMs": 3724.366249999992
@@ -10092,7 +10092,7 @@
"model": "claude-haiku-4-5",
"expected": "Jeannie Klein",
"actual": "Jeannie Klein",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 8,
"latencyMs": 1239.5215000000026
@@ -10103,7 +10103,7 @@
"model": "gpt-5-nano",
"expected": "Jeannie Klein",
"actual": "Jeannie Klein",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 137,
"latencyMs": 2863.772667000012
@@ -10114,7 +10114,7 @@
"model": "claude-haiku-4-5",
"expected": "Jeannie Klein",
"actual": "Jeannie Klein",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 8,
"latencyMs": 1297.5507919999945
@@ -10125,7 +10125,7 @@
"model": "gpt-5-nano",
"expected": "Joshua Watsica",
"actual": "Joshua Watsica",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9739,
"outputTokens": 202,
"latencyMs": 2533.5459160000028
@@ -10136,7 +10136,7 @@
"model": "claude-haiku-4-5",
"expected": "Joshua Watsica",
"actual": "Joshua Watsica",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11907,
"outputTokens": 8,
"latencyMs": 1313.4649999999965
@@ -10147,7 +10147,7 @@
"model": "gpt-5-nano",
"expected": "Joshua Watsica",
"actual": "Joshua Watsica",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6013,
"outputTokens": 74,
"latencyMs": 1609.448166999995
@@ -10158,7 +10158,7 @@
"model": "claude-haiku-4-5",
"expected": "Joshua Watsica",
"actual": "Joshua Watsica",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6993,
"outputTokens": 8,
"latencyMs": 1257.2229999999981
@@ -10169,7 +10169,7 @@
"model": "gpt-5-nano",
"expected": "Joshua Watsica",
"actual": "Joshua Watsica",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6781,
"outputTokens": 458,
"latencyMs": 5294.154332999984
@@ -10180,7 +10180,7 @@
"model": "claude-haiku-4-5",
"expected": "Joshua Watsica",
"actual": "Joshua Watsica",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8414,
"outputTokens": 8,
"latencyMs": 1363.172208999982
@@ -10191,7 +10191,7 @@
"model": "gpt-5-nano",
"expected": "Joshua Watsica",
"actual": "Joshua Watsica",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9158,
"outputTokens": 74,
"latencyMs": 2154.742499999993
@@ -10202,7 +10202,7 @@
"model": "claude-haiku-4-5",
"expected": "Joshua Watsica",
"actual": "Joshua Watsica",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9289,
"outputTokens": 8,
"latencyMs": 1509.8229580000043
@@ -10213,7 +10213,7 @@
"model": "gpt-5-nano",
"expected": "Joshua Watsica",
"actual": "Joshua Watsica",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7373,
"outputTokens": 74,
"latencyMs": 2010.5185419999762
@@ -10224,7 +10224,7 @@
"model": "claude-haiku-4-5",
"expected": "Joshua Watsica",
"actual": "Joshua Watsica",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8385,
"outputTokens": 8,
"latencyMs": 1193.5151659999974
@@ -10235,7 +10235,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9735,
"outputTokens": 1031,
"latencyMs": 9550.510582999996
@@ -10246,7 +10246,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 11902,
"outputTokens": 5,
"latencyMs": 1146.0822499999776
@@ -10257,7 +10257,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6009,
"outputTokens": 775,
"latencyMs": 6479.700542000006
@@ -10268,7 +10268,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6988,
"outputTokens": 5,
"latencyMs": 1329.610708000022
@@ -10279,7 +10279,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6777,
"outputTokens": 967,
"latencyMs": 15240.216207999998
@@ -10290,7 +10290,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8409,
"outputTokens": 5,
"latencyMs": 1203.151125000004
@@ -10301,7 +10301,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9154,
"outputTokens": 583,
"latencyMs": 6073.186583000002
@@ -10312,7 +10312,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9284,
"outputTokens": 5,
"latencyMs": 1452.6655419999734
@@ -10323,7 +10323,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7369,
"outputTokens": 647,
"latencyMs": 7084.941665999999
@@ -10334,7 +10334,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8380,
"outputTokens": 5,
"latencyMs": 1120.7099159999925
@@ -10345,7 +10345,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9735,
"outputTokens": 903,
"latencyMs": 8906.334791000001
@@ -10356,7 +10356,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 11902,
"outputTokens": 5,
"latencyMs": 1109.434333000012
@@ -10367,7 +10367,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6009,
"outputTokens": 391,
"latencyMs": 4955.000415999995
@@ -10378,7 +10378,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "7",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6988,
"outputTokens": 5,
"latencyMs": 1040.817624999996
@@ -10389,7 +10389,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6777,
"outputTokens": 775,
"latencyMs": 8308.952791000018
@@ -10400,7 +10400,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8409,
"outputTokens": 5,
"latencyMs": 1128.542833000014
@@ -10411,7 +10411,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9154,
"outputTokens": 775,
"latencyMs": 7118.855291000014
@@ -10422,7 +10422,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9284,
"outputTokens": 5,
"latencyMs": 1232.1081249999988
@@ -10433,7 +10433,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7369,
"outputTokens": 647,
"latencyMs": 6776.706208000018
@@ -10444,7 +10444,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8380,
"outputTokens": 5,
"latencyMs": 1677.1033330000064
@@ -10455,7 +10455,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9736,
"outputTokens": 583,
"latencyMs": 5866.636624999985
@@ -10466,7 +10466,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 11902,
"outputTokens": 5,
"latencyMs": 1574.224125000008
@@ -10477,7 +10477,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6010,
"outputTokens": 711,
"latencyMs": 7998.43637499999
@@ -10488,7 +10488,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "7",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6988,
"outputTokens": 5,
"latencyMs": 1175.3050419999927
@@ -10499,7 +10499,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6778,
"outputTokens": 647,
"latencyMs": 6424.974583000003
@@ -10510,7 +10510,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8409,
"outputTokens": 5,
"latencyMs": 1352.1832500000019
@@ -10521,7 +10521,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9155,
"outputTokens": 647,
"latencyMs": 6132.921792000008
@@ -10532,7 +10532,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9284,
"outputTokens": 5,
"latencyMs": 1241.7496250000258
@@ -10543,7 +10543,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7370,
"outputTokens": 455,
"latencyMs": 8074.935457999993
@@ -10554,7 +10554,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "7",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8380,
"outputTokens": 5,
"latencyMs": 1294.4225830000069
@@ -10565,7 +10565,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9736,
"outputTokens": 775,
"latencyMs": 7724.665375000011
@@ -10576,7 +10576,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 11902,
"outputTokens": 5,
"latencyMs": 1450.864333000005
@@ -10587,7 +10587,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6010,
"outputTokens": 711,
"latencyMs": 5055.026333999995
@@ -10598,7 +10598,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6988,
"outputTokens": 5,
"latencyMs": 1177.2059999999765
@@ -10609,7 +10609,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6778,
"outputTokens": 839,
"latencyMs": 7951.241416999983
@@ -10620,7 +10620,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8409,
"outputTokens": 5,
"latencyMs": 1537.2077500000014
@@ -10631,7 +10631,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9155,
"outputTokens": 519,
"latencyMs": 9752.917709000001
@@ -10642,7 +10642,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9284,
"outputTokens": 5,
"latencyMs": 1101.1202090000152
@@ -10653,7 +10653,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7370,
"outputTokens": 647,
"latencyMs": 5711.038375000004
@@ -10664,7 +10664,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8380,
"outputTokens": 5,
"latencyMs": 1208.3837910000002
@@ -10675,7 +10675,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9736,
"outputTokens": 775,
"latencyMs": 6578.005040999997
@@ -10686,7 +10686,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 11902,
"outputTokens": 5,
"latencyMs": 1351.4712499999732
@@ -10697,7 +10697,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6010,
"outputTokens": 583,
"latencyMs": 6437.821874999994
@@ -10708,7 +10708,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6988,
"outputTokens": 5,
"latencyMs": 1155.7898750000168
@@ -10719,7 +10719,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6778,
"outputTokens": 647,
"latencyMs": 6673.183250000002
@@ -10730,7 +10730,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8409,
"outputTokens": 5,
"latencyMs": 1359.994417000009
@@ -10741,7 +10741,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9155,
"outputTokens": 647,
"latencyMs": 5806.33679099998
@@ -10752,7 +10752,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9284,
"outputTokens": 5,
"latencyMs": 1339.4869999999937
@@ -10763,7 +10763,7 @@
"model": "gpt-5-nano",
"expected": "10",
"actual": "10",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7370,
"outputTokens": 519,
"latencyMs": 6011.0411669999885
@@ -10774,7 +10774,7 @@
"model": "claude-haiku-4-5",
"expected": "10",
"actual": "8",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8380,
"outputTokens": 5,
"latencyMs": 1305.6029999999737
@@ -10785,7 +10785,7 @@
"model": "gpt-5-nano",
"expected": "42342.25",
"actual": "41001.14",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9736,
"outputTokens": 1226,
"latencyMs": 11276.714458000002
@@ -10796,7 +10796,7 @@
"model": "claude-haiku-4-5",
"expected": "42342.25",
"actual": "48,847.66",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 11902,
"outputTokens": 9,
"latencyMs": 1400.5162910000072
@@ -10807,7 +10807,7 @@
"model": "gpt-5-nano",
"expected": "42342.25",
"actual": "42342.25",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6010,
"outputTokens": 5962,
"latencyMs": 50971.727667
@@ -10818,7 +10818,7 @@
"model": "claude-haiku-4-5",
"expected": "42342.25",
"actual": "41,847.47",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6988,
"outputTokens": 9,
"latencyMs": 1118.9986250000075
@@ -10829,7 +10829,7 @@
"model": "gpt-5-nano",
"expected": "42342.25",
"actual": "42342.25",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6778,
"outputTokens": 3082,
"latencyMs": 22816.508165999985
@@ -10840,7 +10840,7 @@
"model": "claude-haiku-4-5",
"expected": "42342.25",
"actual": "48,847.47",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8409,
"outputTokens": 9,
"latencyMs": 1104.31912499998
@@ -10851,7 +10851,7 @@
"model": "gpt-5-nano",
"expected": "42342.25",
"actual": "42425.97",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9155,
"outputTokens": 2762,
"latencyMs": 17412.623583000008
@@ -10862,7 +10862,7 @@
"model": "claude-haiku-4-5",
"expected": "42342.25",
"actual": "47,847.47",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9284,
"outputTokens": 9,
"latencyMs": 1435.553082999977
@@ -10873,7 +10873,7 @@
"model": "gpt-5-nano",
"expected": "42342.25",
"actual": "42342.25",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7370,
"outputTokens": 3402,
"latencyMs": 26299.00112500001
@@ -10884,7 +10884,7 @@
"model": "claude-haiku-4-5",
"expected": "42342.25",
"actual": "41,847.47",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8380,
"outputTokens": 9,
"latencyMs": 1272.4541250000184
@@ -10895,7 +10895,7 @@
"model": "gpt-5-nano",
"expected": "44",
"actual": "44",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9738,
"outputTokens": 1351,
"latencyMs": 13461.932250000013
@@ -10906,7 +10906,7 @@
"model": "claude-haiku-4-5",
"expected": "44",
"actual": "48",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 11904,
"outputTokens": 5,
"latencyMs": 1772.9891250000219
@@ -10917,7 +10917,7 @@
"model": "gpt-5-nano",
"expected": "44",
"actual": "44",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6012,
"outputTokens": 1735,
"latencyMs": 14196.807250000013
@@ -10928,7 +10928,7 @@
"model": "claude-haiku-4-5",
"expected": "44",
"actual": "47",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6990,
"outputTokens": 5,
"latencyMs": 1749.7322920000006
@@ -10939,7 +10939,7 @@
"model": "gpt-5-nano",
"expected": "44",
"actual": "44",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6780,
"outputTokens": 1863,
"latencyMs": 14291.044916999992
@@ -10950,7 +10950,7 @@
"model": "claude-haiku-4-5",
"expected": "44",
"actual": "47",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8411,
"outputTokens": 5,
"latencyMs": 1453.1822079999838
@@ -10961,7 +10961,7 @@
"model": "gpt-5-nano",
"expected": "44",
"actual": "44",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9157,
"outputTokens": 1799,
"latencyMs": 16012.806332999986
@@ -10972,7 +10972,7 @@
"model": "claude-haiku-4-5",
"expected": "44",
"actual": "48",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9286,
"outputTokens": 5,
"latencyMs": 1761.131041000015
@@ -10983,7 +10983,7 @@
"model": "gpt-5-nano",
"expected": "44",
"actual": "44",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7372,
"outputTokens": 1415,
"latencyMs": 12218.14491599999
@@ -10994,7 +10994,7 @@
"model": "claude-haiku-4-5",
"expected": "44",
"actual": "45",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8382,
"outputTokens": 5,
"latencyMs": 1255.681917000009
@@ -11005,7 +11005,7 @@
"model": "gpt-5-nano",
"expected": "39",
"actual": "39",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9738,
"outputTokens": 2311,
"latencyMs": 22316.87704199998
@@ -11016,7 +11016,7 @@
"model": "claude-haiku-4-5",
"expected": "39",
"actual": "38",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 11904,
"outputTokens": 5,
"latencyMs": 1090.176792000013
@@ -11027,7 +11027,7 @@
"model": "gpt-5-nano",
"expected": "39",
"actual": "39",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6012,
"outputTokens": 1095,
"latencyMs": 7211.767082999984
@@ -11038,7 +11038,7 @@
"model": "claude-haiku-4-5",
"expected": "39",
"actual": "38",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6990,
"outputTokens": 5,
"latencyMs": 1129.9290000000037
@@ -11049,7 +11049,7 @@
"model": "gpt-5-nano",
"expected": "39",
"actual": "39",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6780,
"outputTokens": 1415,
"latencyMs": 15701.471499999985
@@ -11060,7 +11060,7 @@
"model": "claude-haiku-4-5",
"expected": "39",
"actual": "38",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8411,
"outputTokens": 5,
"latencyMs": 1251.5472500000033
@@ -11071,7 +11071,7 @@
"model": "gpt-5-nano",
"expected": "39",
"actual": "39",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9157,
"outputTokens": 1799,
"latencyMs": 16689.30345800001
@@ -11082,7 +11082,7 @@
"model": "claude-haiku-4-5",
"expected": "39",
"actual": "41",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9286,
"outputTokens": 5,
"latencyMs": 1168.8190419999883
@@ -11093,7 +11093,7 @@
"model": "gpt-5-nano",
"expected": "39",
"actual": "39",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 7372,
"outputTokens": 1863,
"latencyMs": 14505.393958999979
@@ -11104,7 +11104,7 @@
"model": "claude-haiku-4-5",
"expected": "39",
"actual": "38",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8382,
"outputTokens": 5,
"latencyMs": 1149.8783330000006
@@ -11115,7 +11115,7 @@
"model": "gpt-5-nano",
"expected": "32",
"actual": "32",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9738,
"outputTokens": 1607,
"latencyMs": 13945.93979200002
@@ -11126,7 +11126,7 @@
"model": "claude-haiku-4-5",
"expected": "32",
"actual": "28",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 11904,
"outputTokens": 5,
"latencyMs": 1175.8143749999872
@@ -11137,7 +11137,7 @@
"model": "gpt-5-nano",
"expected": "32",
"actual": "32",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6012,
"outputTokens": 1351,
"latencyMs": 11991.764750000002
@@ -11148,7 +11148,7 @@
"model": "claude-haiku-4-5",
"expected": "32",
"actual": "26",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 6990,
"outputTokens": 5,
"latencyMs": 1643.4279169999936
@@ -11159,7 +11159,7 @@
"model": "gpt-5-nano",
"expected": "32",
"actual": "32",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 6780,
"outputTokens": 1799,
"latencyMs": 17324.695000000007
@@ -11170,7 +11170,7 @@
"model": "claude-haiku-4-5",
"expected": "32",
"actual": "28",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8411,
"outputTokens": 5,
"latencyMs": 1197.7254160000011
@@ -11181,7 +11181,7 @@
"model": "gpt-5-nano",
"expected": "32",
"actual": "32",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9157,
"outputTokens": 1607,
"latencyMs": 22426.01029199999
@@ -11192,7 +11192,7 @@
"model": "claude-haiku-4-5",
"expected": "32",
"actual": "28",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9286,
"outputTokens": 5,
"latencyMs": 1065.6509170000209
@@ -11203,7 +11203,7 @@
"model": "gpt-5-nano",
"expected": "32",
"actual": "31",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 7372,
"outputTokens": 1543,
"latencyMs": 12786.843416999996
@@ -11214,7 +11214,7 @@
"model": "claude-haiku-4-5",
"expected": "32",
"actual": "26",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8382,
"outputTokens": 5,
"latencyMs": 2054.993749999994
@@ -11225,7 +11225,7 @@
"model": "gpt-5-nano",
"expected": "6975",
"actual": "6975",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3712,
"outputTokens": 72,
"latencyMs": 2244.986208999995
@@ -11236,7 +11236,7 @@
"model": "claude-haiku-4-5",
"expected": "6975",
"actual": "6975",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4080,
"outputTokens": 6,
"latencyMs": 1162.9390420000127
@@ -11247,7 +11247,7 @@
"model": "gpt-5-nano",
"expected": "6975",
"actual": "6975",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1563,
"outputTokens": 136,
"latencyMs": 2179.3558330000087
@@ -11258,7 +11258,7 @@
"model": "claude-haiku-4-5",
"expected": "6975",
"actual": "6975",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1509,
"outputTokens": 6,
"latencyMs": 1013.4975409999897
@@ -11269,7 +11269,7 @@
"model": "gpt-5-nano",
"expected": "6975",
"actual": "6975",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1441,
"outputTokens": 72,
"latencyMs": 4859.720833999978
@@ -11280,7 +11280,7 @@
"model": "claude-haiku-4-5",
"expected": "6975",
"actual": "6975",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1445,
"outputTokens": 6,
"latencyMs": 1437.758375000005
@@ -11291,7 +11291,7 @@
"model": "gpt-5-nano",
"expected": "6975",
"actual": "6975",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3829,
"outputTokens": 72,
"latencyMs": 3120.702874999988
@@ -11302,7 +11302,7 @@
"model": "claude-haiku-4-5",
"expected": "6975",
"actual": "6975",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3415,
"outputTokens": 6,
"latencyMs": 1051.775708000001
@@ -11313,7 +11313,7 @@
"model": "gpt-5-nano",
"expected": "6975",
"actual": "6975",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2985,
"outputTokens": 72,
"latencyMs": 2182.880084000004
@@ -11324,7 +11324,7 @@
"model": "claude-haiku-4-5",
"expected": "6975",
"actual": "6975",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3110,
"outputTokens": 6,
"latencyMs": 1045.2009580000013
@@ -11335,7 +11335,7 @@
"model": "gpt-5-nano",
"expected": "6686.23",
"actual": "6686.23",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3711,
"outputTokens": 138,
"latencyMs": 5291.923750000016
@@ -11346,7 +11346,7 @@
"model": "claude-haiku-4-5",
"expected": "6686.23",
"actual": "6686.23",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4079,
"outputTokens": 8,
"latencyMs": 1009.6958750000049
@@ -11357,7 +11357,7 @@
"model": "gpt-5-nano",
"expected": "6686.23",
"actual": "6686.23",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1562,
"outputTokens": 74,
"latencyMs": 2582.2320419999887
@@ -11368,7 +11368,7 @@
"model": "claude-haiku-4-5",
"expected": "6686.23",
"actual": "6686.23",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1508,
"outputTokens": 8,
"latencyMs": 1203.816542000015
@@ -11379,7 +11379,7 @@
"model": "gpt-5-nano",
"expected": "6686.23",
"actual": "6686.23",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1440,
"outputTokens": 138,
"latencyMs": 2774.835167000012
@@ -11390,7 +11390,7 @@
"model": "claude-haiku-4-5",
"expected": "6686.23",
"actual": "6686.23",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1444,
"outputTokens": 8,
"latencyMs": 979.9191669999855
@@ -11401,7 +11401,7 @@
"model": "gpt-5-nano",
"expected": "6686.23",
"actual": "6686.23",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3828,
"outputTokens": 138,
"latencyMs": 2616.684333000012
@@ -11412,7 +11412,7 @@
"model": "claude-haiku-4-5",
"expected": "6686.23",
"actual": "6686.23",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3414,
"outputTokens": 8,
"latencyMs": 1253.4844169999997
@@ -11423,7 +11423,7 @@
"model": "gpt-5-nano",
"expected": "6686.23",
"actual": "6686.23",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2984,
"outputTokens": 74,
"latencyMs": 2267.1155000000144
@@ -11434,7 +11434,7 @@
"model": "claude-haiku-4-5",
"expected": "6686.23",
"actual": "6686.23",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3109,
"outputTokens": 8,
"latencyMs": 1185.4212080000143
@@ -11445,7 +11445,7 @@
"model": "gpt-5-nano",
"expected": "7500",
"actual": "7500",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3712,
"outputTokens": 136,
"latencyMs": 2905.6011250000156
@@ -11456,7 +11456,7 @@
"model": "claude-haiku-4-5",
"expected": "7500",
"actual": "7500",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4080,
"outputTokens": 6,
"latencyMs": 1571.1469999999972
@@ -11467,7 +11467,7 @@
"model": "gpt-5-nano",
"expected": "7500",
"actual": "7500",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1563,
"outputTokens": 328,
"latencyMs": 3884.65858399999
@@ -11478,7 +11478,7 @@
"model": "claude-haiku-4-5",
"expected": "7500",
"actual": "7500",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1509,
"outputTokens": 6,
"latencyMs": 1207.1518330000108
@@ -11489,7 +11489,7 @@
"model": "gpt-5-nano",
"expected": "7500",
"actual": "7500",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1441,
"outputTokens": 72,
"latencyMs": 1995.0557919999992
@@ -11500,7 +11500,7 @@
"model": "claude-haiku-4-5",
"expected": "7500",
"actual": "7500",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1445,
"outputTokens": 6,
"latencyMs": 1238.8113749999902
@@ -11511,7 +11511,7 @@
"model": "gpt-5-nano",
"expected": "7500",
"actual": "7500",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3829,
"outputTokens": 136,
"latencyMs": 5824.06574999998
@@ -11522,7 +11522,7 @@
"model": "claude-haiku-4-5",
"expected": "7500",
"actual": "7500",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3415,
"outputTokens": 6,
"latencyMs": 1337.474749999994
@@ -11533,7 +11533,7 @@
"model": "gpt-5-nano",
"expected": "7500",
"actual": "7500",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2985,
"outputTokens": 136,
"latencyMs": 2286.1839580000087
@@ -11544,7 +11544,7 @@
"model": "claude-haiku-4-5",
"expected": "7500",
"actual": "7500",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3110,
"outputTokens": 6,
"latencyMs": 1326.3640000000014
@@ -11555,7 +11555,7 @@
"model": "gpt-5-nano",
"expected": "14297.05",
"actual": "14297.05",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3711,
"outputTokens": 138,
"latencyMs": 3801.309249999991
@@ -11566,7 +11566,7 @@
"model": "claude-haiku-4-5",
"expected": "14297.05",
"actual": "14297.05",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4079,
"outputTokens": 8,
"latencyMs": 1054.8991249999963
@@ -11577,7 +11577,7 @@
"model": "gpt-5-nano",
"expected": "14297.05",
"actual": "14297.05",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1562,
"outputTokens": 74,
"latencyMs": 3338.1347499999974
@@ -11588,7 +11588,7 @@
"model": "claude-haiku-4-5",
"expected": "14297.05",
"actual": "14297.05",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1508,
"outputTokens": 8,
"latencyMs": 1393.589082999999
@@ -11599,7 +11599,7 @@
"model": "gpt-5-nano",
"expected": "14297.05",
"actual": "14297.05",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1440,
"outputTokens": 202,
"latencyMs": 3719.6092089999875
@@ -11610,7 +11610,7 @@
"model": "claude-haiku-4-5",
"expected": "14297.05",
"actual": "14297.05",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1444,
"outputTokens": 8,
"latencyMs": 1030.9656669999822
@@ -11621,7 +11621,7 @@
"model": "gpt-5-nano",
"expected": "14297.05",
"actual": "14297.05",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3828,
"outputTokens": 74,
"latencyMs": 2226.628250000009
@@ -11632,7 +11632,7 @@
"model": "claude-haiku-4-5",
"expected": "14297.05",
"actual": "14297.05",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3414,
"outputTokens": 8,
"latencyMs": 1154.132540999999
@@ -11643,7 +11643,7 @@
"model": "gpt-5-nano",
"expected": "14297.05",
"actual": "14297.05",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2984,
"outputTokens": 138,
"latencyMs": 2922.2590830000117
@@ -11654,7 +11654,7 @@
"model": "claude-haiku-4-5",
"expected": "14297.05",
"actual": "14297.05",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3109,
"outputTokens": 8,
"latencyMs": 2048.011916999996
@@ -11665,7 +11665,7 @@
"model": "gpt-5-nano",
"expected": "6692",
"actual": "6692",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3712,
"outputTokens": 200,
"latencyMs": 2520.5313329999917
@@ -11676,7 +11676,7 @@
"model": "claude-haiku-4-5",
"expected": "6692",
"actual": "6692",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4080,
"outputTokens": 6,
"latencyMs": 943.3422089999949
@@ -11687,7 +11687,7 @@
"model": "gpt-5-nano",
"expected": "6692",
"actual": "6692",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1563,
"outputTokens": 136,
"latencyMs": 2300.8406249999825
@@ -11698,7 +11698,7 @@
"model": "claude-haiku-4-5",
"expected": "6692",
"actual": "6692",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1509,
"outputTokens": 6,
"latencyMs": 1128.4146670000046
@@ -11709,7 +11709,7 @@
"model": "gpt-5-nano",
"expected": "6692",
"actual": "6692",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1441,
"outputTokens": 200,
"latencyMs": 2929.585208000004
@@ -11720,7 +11720,7 @@
"model": "claude-haiku-4-5",
"expected": "6692",
"actual": "6692",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1445,
"outputTokens": 6,
"latencyMs": 1230.4635420000122
@@ -11731,7 +11731,7 @@
"model": "gpt-5-nano",
"expected": "6692",
"actual": "6692",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3829,
"outputTokens": 136,
"latencyMs": 3650.3654169999936
@@ -11742,7 +11742,7 @@
"model": "claude-haiku-4-5",
"expected": "6692",
"actual": "6692",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3415,
"outputTokens": 6,
"latencyMs": 985.8184590000019
@@ -11753,7 +11753,7 @@
"model": "gpt-5-nano",
"expected": "6692",
"actual": "6692",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2985,
"outputTokens": 328,
"latencyMs": 3772.2553330000082
@@ -11764,7 +11764,7 @@
"model": "claude-haiku-4-5",
"expected": "6692",
"actual": "6692",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3110,
"outputTokens": 6,
"latencyMs": 1311.8630419999827
@@ -11775,7 +11775,7 @@
"model": "gpt-5-nano",
"expected": "9302.76",
"actual": "9302.76",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3711,
"outputTokens": 138,
"latencyMs": 2935.785124999995
@@ -11786,7 +11786,7 @@
"model": "claude-haiku-4-5",
"expected": "9302.76",
"actual": "9302.76",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4079,
"outputTokens": 8,
"latencyMs": 1391.9168749999953
@@ -11797,7 +11797,7 @@
"model": "gpt-5-nano",
"expected": "9302.76",
"actual": "9302.76",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1562,
"outputTokens": 138,
"latencyMs": 5759.15529200001
@@ -11808,7 +11808,7 @@
"model": "claude-haiku-4-5",
"expected": "9302.76",
"actual": "9302.76",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1508,
"outputTokens": 8,
"latencyMs": 1064.3980420000153
@@ -11819,7 +11819,7 @@
"model": "gpt-5-nano",
"expected": "9302.76",
"actual": "9302.76",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1440,
"outputTokens": 74,
"latencyMs": 3640.193708000006
@@ -11830,7 +11830,7 @@
"model": "claude-haiku-4-5",
"expected": "9302.76",
"actual": "9302.76",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1444,
"outputTokens": 8,
"latencyMs": 983.806166000024
@@ -11841,7 +11841,7 @@
"model": "gpt-5-nano",
"expected": "9302.76",
"actual": "9302.76",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3828,
"outputTokens": 266,
"latencyMs": 2604.2135000000126
@@ -11852,7 +11852,7 @@
"model": "claude-haiku-4-5",
"expected": "9302.76",
"actual": "9302.76",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3414,
"outputTokens": 8,
"latencyMs": 1128.6182499999995
@@ -11863,7 +11863,7 @@
"model": "gpt-5-nano",
"expected": "9302.76",
"actual": "9302.76",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2984,
"outputTokens": 138,
"latencyMs": 2548.5608749999956
@@ -11874,7 +11874,7 @@
"model": "claude-haiku-4-5",
"expected": "9302.76",
"actual": "9302.76",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3109,
"outputTokens": 8,
"latencyMs": 1029.5365000000165
@@ -11885,7 +11885,7 @@
"model": "gpt-5-nano",
"expected": "3285",
"actual": "3285",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3712,
"outputTokens": 136,
"latencyMs": 3983.6009170000034
@@ -11896,7 +11896,7 @@
"model": "claude-haiku-4-5",
"expected": "3285",
"actual": "3285",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4080,
"outputTokens": 6,
"latencyMs": 1095.2366250000196
@@ -11907,7 +11907,7 @@
"model": "gpt-5-nano",
"expected": "3285",
"actual": "3285",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1563,
"outputTokens": 72,
"latencyMs": 2207.884417000023
@@ -11918,7 +11918,7 @@
"model": "claude-haiku-4-5",
"expected": "3285",
"actual": "3285",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1509,
"outputTokens": 6,
"latencyMs": 2292.4111660000053
@@ -11929,7 +11929,7 @@
"model": "gpt-5-nano",
"expected": "3285",
"actual": "3285",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1441,
"outputTokens": 136,
"latencyMs": 2749.430541000009
@@ -11940,7 +11940,7 @@
"model": "claude-haiku-4-5",
"expected": "3285",
"actual": "3285",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1445,
"outputTokens": 6,
"latencyMs": 1215.8329999999842
@@ -11951,7 +11951,7 @@
"model": "gpt-5-nano",
"expected": "3285",
"actual": "3285",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3829,
"outputTokens": 136,
"latencyMs": 2086.6161659999925
@@ -11962,7 +11962,7 @@
"model": "claude-haiku-4-5",
"expected": "3285",
"actual": "3285",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3415,
"outputTokens": 6,
"latencyMs": 1299.715790999995
@@ -11973,7 +11973,7 @@
"model": "gpt-5-nano",
"expected": "3285",
"actual": "3285",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2985,
"outputTokens": 136,
"latencyMs": 7107.394916999998
@@ -11984,7 +11984,7 @@
"model": "claude-haiku-4-5",
"expected": "3285",
"actual": "3285",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3110,
"outputTokens": 6,
"latencyMs": 899.2319579999894
@@ -11995,7 +11995,7 @@
"model": "gpt-5-nano",
"expected": "3826.93",
"actual": "3826.93",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3711,
"outputTokens": 138,
"latencyMs": 2810.5213330000115
@@ -12006,7 +12006,7 @@
"model": "claude-haiku-4-5",
"expected": "3826.93",
"actual": "3826.93",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4079,
"outputTokens": 8,
"latencyMs": 989.2326659999962
@@ -12017,7 +12017,7 @@
"model": "gpt-5-nano",
"expected": "3826.93",
"actual": "3826.93",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1562,
"outputTokens": 138,
"latencyMs": 2622.7841670000053
@@ -12028,7 +12028,7 @@
"model": "claude-haiku-4-5",
"expected": "3826.93",
"actual": "3826.93",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1508,
"outputTokens": 8,
"latencyMs": 850.1227920000092
@@ -12039,7 +12039,7 @@
"model": "gpt-5-nano",
"expected": "3826.93",
"actual": "3826.93",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1440,
"outputTokens": 138,
"latencyMs": 3057.1578750000044
@@ -12050,7 +12050,7 @@
"model": "claude-haiku-4-5",
"expected": "3826.93",
"actual": "3826.93",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1444,
"outputTokens": 8,
"latencyMs": 1261.3340000000026
@@ -12061,7 +12061,7 @@
"model": "gpt-5-nano",
"expected": "3826.93",
"actual": "3826.93",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3828,
"outputTokens": 202,
"latencyMs": 3061.791499999992
@@ -12072,7 +12072,7 @@
"model": "claude-haiku-4-5",
"expected": "3826.93",
"actual": "3826.93",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3414,
"outputTokens": 8,
"latencyMs": 1196.6509999999835
@@ -12083,7 +12083,7 @@
"model": "gpt-5-nano",
"expected": "3826.93",
"actual": "3826.93",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2984,
"outputTokens": 138,
"latencyMs": 3567.4540839999972
@@ -12094,7 +12094,7 @@
"model": "claude-haiku-4-5",
"expected": "3826.93",
"actual": "3826.93",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3109,
"outputTokens": 8,
"latencyMs": 1033.8556249999965
@@ -12105,7 +12105,7 @@
"model": "gpt-5-nano",
"expected": "6191",
"actual": "6191",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3712,
"outputTokens": 136,
"latencyMs": 2842.961707999988
@@ -12116,7 +12116,7 @@
"model": "claude-haiku-4-5",
"expected": "6191",
"actual": "6191",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4080,
"outputTokens": 6,
"latencyMs": 1258.130582999991
@@ -12127,7 +12127,7 @@
"model": "gpt-5-nano",
"expected": "6191",
"actual": "6191",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1563,
"outputTokens": 456,
"latencyMs": 5828.652415999997
@@ -12138,7 +12138,7 @@
"model": "claude-haiku-4-5",
"expected": "6191",
"actual": "6191",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1509,
"outputTokens": 6,
"latencyMs": 1004.821958000015
@@ -12149,7 +12149,7 @@
"model": "gpt-5-nano",
"expected": "6191",
"actual": "6191",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1441,
"outputTokens": 72,
"latencyMs": 3102.38612499999
@@ -12160,7 +12160,7 @@
"model": "claude-haiku-4-5",
"expected": "6191",
"actual": "6191",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1445,
"outputTokens": 6,
"latencyMs": 1454.8658750000177
@@ -12171,7 +12171,7 @@
"model": "gpt-5-nano",
"expected": "6191",
"actual": "6191",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3829,
"outputTokens": 136,
"latencyMs": 2018.8434999999881
@@ -12182,7 +12182,7 @@
"model": "claude-haiku-4-5",
"expected": "6191",
"actual": "6191",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3415,
"outputTokens": 6,
"latencyMs": 1237.4057080000057
@@ -12193,7 +12193,7 @@
"model": "gpt-5-nano",
"expected": "6191",
"actual": "6191",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2985,
"outputTokens": 136,
"latencyMs": 3670.7451670000155
@@ -12204,7 +12204,7 @@
"model": "claude-haiku-4-5",
"expected": "6191",
"actual": "6191",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3110,
"outputTokens": 6,
"latencyMs": 1070.646584000002
@@ -12215,7 +12215,7 @@
"model": "gpt-5-nano",
"expected": "1854.66",
"actual": "1854.66",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3711,
"outputTokens": 202,
"latencyMs": 3731.3879579999775
@@ -12226,7 +12226,7 @@
"model": "claude-haiku-4-5",
"expected": "1854.66",
"actual": "1854.66",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4079,
"outputTokens": 8,
"latencyMs": 1387.9798329999903
@@ -12237,7 +12237,7 @@
"model": "gpt-5-nano",
"expected": "1854.66",
"actual": "1854.66",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1562,
"outputTokens": 394,
"latencyMs": 5560.397957999987
@@ -12248,7 +12248,7 @@
"model": "claude-haiku-4-5",
"expected": "1854.66",
"actual": "1854.66",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1508,
"outputTokens": 8,
"latencyMs": 1552.963958999986
@@ -12259,7 +12259,7 @@
"model": "gpt-5-nano",
"expected": "1854.66",
"actual": "1854.66",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1440,
"outputTokens": 138,
"latencyMs": 21759.84366700001
@@ -12270,7 +12270,7 @@
"model": "claude-haiku-4-5",
"expected": "1854.66",
"actual": "1854.66",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1444,
"outputTokens": 8,
"latencyMs": 1132.519083000021
@@ -12281,7 +12281,7 @@
"model": "gpt-5-nano",
"expected": "1854.66",
"actual": "1854.66",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3828,
"outputTokens": 138,
"latencyMs": 2277.2652499999967
@@ -12292,7 +12292,7 @@
"model": "claude-haiku-4-5",
"expected": "1854.66",
"actual": "1854.66",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3414,
"outputTokens": 8,
"latencyMs": 1098.0825420000183
@@ -12303,7 +12303,7 @@
"model": "gpt-5-nano",
"expected": "1854.66",
"actual": "1854.66",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2984,
"outputTokens": 202,
"latencyMs": 2813.10504200001
@@ -12314,7 +12314,7 @@
"model": "claude-haiku-4-5",
"expected": "1854.66",
"actual": "1854.66",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3109,
"outputTokens": 8,
"latencyMs": 1131.9674159999995
@@ -12325,7 +12325,7 @@
"model": "gpt-5-nano",
"expected": "4696",
"actual": "4696",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3712,
"outputTokens": 136,
"latencyMs": 6657.446207999979
@@ -12336,7 +12336,7 @@
"model": "claude-haiku-4-5",
"expected": "4696",
"actual": "4696",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4080,
"outputTokens": 6,
"latencyMs": 1265.4548749999958
@@ -12347,7 +12347,7 @@
"model": "gpt-5-nano",
"expected": "4696",
"actual": "4696",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1563,
"outputTokens": 136,
"latencyMs": 3299.298792000016
@@ -12358,7 +12358,7 @@
"model": "claude-haiku-4-5",
"expected": "4696",
"actual": "4696",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1509,
"outputTokens": 6,
"latencyMs": 1618.5091249999823
@@ -12369,7 +12369,7 @@
"model": "gpt-5-nano",
"expected": "4696",
"actual": "4696",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1441,
"outputTokens": 136,
"latencyMs": 5353.29241699999
@@ -12380,7 +12380,7 @@
"model": "claude-haiku-4-5",
"expected": "4696",
"actual": "4696",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1445,
"outputTokens": 6,
"latencyMs": 870.5113749999728
@@ -12391,7 +12391,7 @@
"model": "gpt-5-nano",
"expected": "4696",
"actual": "4696",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3829,
"outputTokens": 200,
"latencyMs": 2780.5659159999923
@@ -12402,7 +12402,7 @@
"model": "claude-haiku-4-5",
"expected": "4696",
"actual": "4696",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3415,
"outputTokens": 6,
"latencyMs": 1069.2415409999958
@@ -12413,7 +12413,7 @@
"model": "gpt-5-nano",
"expected": "4696",
"actual": "4696",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2985,
"outputTokens": 200,
"latencyMs": 3036.145666999975
@@ -12424,7 +12424,7 @@
"model": "claude-haiku-4-5",
"expected": "4696",
"actual": "4696",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3110,
"outputTokens": 6,
"latencyMs": 1252.9633329999924
@@ -12435,7 +12435,7 @@
"model": "gpt-5-nano",
"expected": "4211.6",
"actual": "4211.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3711,
"outputTokens": 138,
"latencyMs": 2617.047249999974
@@ -12446,7 +12446,7 @@
"model": "claude-haiku-4-5",
"expected": "4211.6",
"actual": "4211.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4079,
"outputTokens": 8,
"latencyMs": 1261.9117079999996
@@ -12457,7 +12457,7 @@
"model": "gpt-5-nano",
"expected": "4211.6",
"actual": "4211.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1562,
"outputTokens": 202,
"latencyMs": 6192.06358300001
@@ -12468,7 +12468,7 @@
"model": "claude-haiku-4-5",
"expected": "4211.6",
"actual": "4211.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1508,
"outputTokens": 8,
"latencyMs": 1158.3806249999907
@@ -12479,7 +12479,7 @@
"model": "gpt-5-nano",
"expected": "4211.6",
"actual": "4211.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1440,
"outputTokens": 138,
"latencyMs": 2867.840083999996
@@ -12490,7 +12490,7 @@
"model": "claude-haiku-4-5",
"expected": "4211.6",
"actual": "4211.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1444,
"outputTokens": 8,
"latencyMs": 856.2939580000238
@@ -12501,7 +12501,7 @@
"model": "gpt-5-nano",
"expected": "4211.6",
"actual": "4211.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3828,
"outputTokens": 138,
"latencyMs": 2329.6339579999913
@@ -12512,7 +12512,7 @@
"model": "claude-haiku-4-5",
"expected": "4211.6",
"actual": "4211.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3414,
"outputTokens": 8,
"latencyMs": 1106.5591669999994
@@ -12523,7 +12523,7 @@
"model": "gpt-5-nano",
"expected": "4211.6",
"actual": "4211.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2984,
"outputTokens": 138,
"latencyMs": 2590.7533330000006
@@ -12534,7 +12534,7 @@
"model": "claude-haiku-4-5",
"expected": "4211.6",
"actual": "4211.6",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3109,
"outputTokens": 8,
"latencyMs": 1007.0892920000188
@@ -12545,7 +12545,7 @@
"model": "gpt-5-nano",
"expected": "6196",
"actual": "6196",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3712,
"outputTokens": 200,
"latencyMs": 3839.2745000000286
@@ -12556,7 +12556,7 @@
"model": "claude-haiku-4-5",
"expected": "6196",
"actual": "6196",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4080,
"outputTokens": 6,
"latencyMs": 1388.2399160000205
@@ -12567,7 +12567,7 @@
"model": "gpt-5-nano",
"expected": "6196",
"actual": "6196",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1563,
"outputTokens": 200,
"latencyMs": 3955.22095800002
@@ -12578,7 +12578,7 @@
"model": "claude-haiku-4-5",
"expected": "6196",
"actual": "6196",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1509,
"outputTokens": 6,
"latencyMs": 1036.567458000005
@@ -12589,7 +12589,7 @@
"model": "gpt-5-nano",
"expected": "6196",
"actual": "6196",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1441,
"outputTokens": 200,
"latencyMs": 5566.705209000007
@@ -12600,7 +12600,7 @@
"model": "claude-haiku-4-5",
"expected": "6196",
"actual": "6196",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1445,
"outputTokens": 6,
"latencyMs": 1078.5011670000094
@@ -12611,7 +12611,7 @@
"model": "gpt-5-nano",
"expected": "6196",
"actual": "6196",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3829,
"outputTokens": 200,
"latencyMs": 2956.9618330000376
@@ -12622,7 +12622,7 @@
"model": "claude-haiku-4-5",
"expected": "6196",
"actual": "6196",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3415,
"outputTokens": 6,
"latencyMs": 1797.4496250000084
@@ -12633,7 +12633,7 @@
"model": "gpt-5-nano",
"expected": "6196",
"actual": "6196",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2985,
"outputTokens": 136,
"latencyMs": 2647.741832999978
@@ -12644,7 +12644,7 @@
"model": "claude-haiku-4-5",
"expected": "6196",
"actual": "6196",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3110,
"outputTokens": 6,
"latencyMs": 1221.9055410000146
@@ -12655,7 +12655,7 @@
"model": "gpt-5-nano",
"expected": "6105.3",
"actual": "6105.3",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3711,
"outputTokens": 138,
"latencyMs": 3783.334333000006
@@ -12666,7 +12666,7 @@
"model": "claude-haiku-4-5",
"expected": "6105.3",
"actual": "6105.30",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4079,
"outputTokens": 8,
"latencyMs": 1135.7771670000511
@@ -12677,7 +12677,7 @@
"model": "gpt-5-nano",
"expected": "6105.3",
"actual": "6105.3",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1562,
"outputTokens": 266,
"latencyMs": 3364.4232920000213
@@ -12688,7 +12688,7 @@
"model": "claude-haiku-4-5",
"expected": "6105.3",
"actual": "6105.3",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1508,
"outputTokens": 8,
"latencyMs": 1161.263666999992
@@ -12699,7 +12699,7 @@
"model": "gpt-5-nano",
"expected": "6105.3",
"actual": "6105.3",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1440,
"outputTokens": 74,
"latencyMs": 3646.0659589999705
@@ -12710,7 +12710,7 @@
"model": "claude-haiku-4-5",
"expected": "6105.3",
"actual": "6105.3",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1444,
"outputTokens": 8,
"latencyMs": 955.7597500000265
@@ -12721,7 +12721,7 @@
"model": "gpt-5-nano",
"expected": "6105.3",
"actual": "6105.3",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3828,
"outputTokens": 74,
"latencyMs": 2345.2203750000335
@@ -12732,7 +12732,7 @@
"model": "claude-haiku-4-5",
"expected": "6105.3",
"actual": "6105.3",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3414,
"outputTokens": 8,
"latencyMs": 1541.918249999988
@@ -12743,7 +12743,7 @@
"model": "gpt-5-nano",
"expected": "6105.3",
"actual": "6105.3",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2984,
"outputTokens": 138,
"latencyMs": 6126.976708000002
@@ -12754,7 +12754,7 @@
"model": "claude-haiku-4-5",
"expected": "6105.3",
"actual": "6105.3",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3109,
"outputTokens": 8,
"latencyMs": 1097.440709000046
@@ -12765,7 +12765,7 @@
"model": "gpt-5-nano",
"expected": "6528",
"actual": "6528",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3712,
"outputTokens": 264,
"latencyMs": 3404.643708999967
@@ -12776,7 +12776,7 @@
"model": "claude-haiku-4-5",
"expected": "6528",
"actual": "6528",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4080,
"outputTokens": 6,
"latencyMs": 1227.7047499999753
@@ -12787,7 +12787,7 @@
"model": "gpt-5-nano",
"expected": "6528",
"actual": "6528",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1563,
"outputTokens": 136,
"latencyMs": 2495.85037499998
@@ -12798,7 +12798,7 @@
"model": "claude-haiku-4-5",
"expected": "6528",
"actual": "6528",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1509,
"outputTokens": 6,
"latencyMs": 1048.344832999981
@@ -12809,7 +12809,7 @@
"model": "gpt-5-nano",
"expected": "6528",
"actual": "6528",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1441,
"outputTokens": 136,
"latencyMs": 3007.2462499999674
@@ -12820,7 +12820,7 @@
"model": "claude-haiku-4-5",
"expected": "6528",
"actual": "6528",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1445,
"outputTokens": 6,
"latencyMs": 840.0351669999654
@@ -12831,7 +12831,7 @@
"model": "gpt-5-nano",
"expected": "6528",
"actual": "6528",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3829,
"outputTokens": 328,
"latencyMs": 3149.872374999977
@@ -12842,7 +12842,7 @@
"model": "claude-haiku-4-5",
"expected": "6528",
"actual": "6528",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3415,
"outputTokens": 6,
"latencyMs": 973.716167000006
@@ -12853,7 +12853,7 @@
"model": "gpt-5-nano",
"expected": "6528",
"actual": "6528",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2985,
"outputTokens": 456,
"latencyMs": 5305.827791999967
@@ -12864,7 +12864,7 @@
"model": "claude-haiku-4-5",
"expected": "6528",
"actual": "6528",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3110,
"outputTokens": 6,
"latencyMs": 953.3122500000172
@@ -12875,7 +12875,7 @@
"model": "gpt-5-nano",
"expected": "1136.09",
"actual": "1136.09",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3711,
"outputTokens": 138,
"latencyMs": 3435.850167000026
@@ -12886,7 +12886,7 @@
"model": "claude-haiku-4-5",
"expected": "1136.09",
"actual": "1136.09",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4079,
"outputTokens": 8,
"latencyMs": 1110.8856249999953
@@ -12897,7 +12897,7 @@
"model": "gpt-5-nano",
"expected": "1136.09",
"actual": "1136.09",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1562,
"outputTokens": 266,
"latencyMs": 3303.3427500000107
@@ -12908,7 +12908,7 @@
"model": "claude-haiku-4-5",
"expected": "1136.09",
"actual": "1136.09",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1508,
"outputTokens": 8,
"latencyMs": 954.5857910000486
@@ -12919,7 +12919,7 @@
"model": "gpt-5-nano",
"expected": "1136.09",
"actual": "1136.09",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1440,
"outputTokens": 138,
"latencyMs": 5035.666582999984
@@ -12930,7 +12930,7 @@
"model": "claude-haiku-4-5",
"expected": "1136.09",
"actual": "1136.09",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1444,
"outputTokens": 8,
"latencyMs": 867.9529159999802
@@ -12941,7 +12941,7 @@
"model": "gpt-5-nano",
"expected": "1136.09",
"actual": "1136.09",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3828,
"outputTokens": 202,
"latencyMs": 2817.1118750000023
@@ -12952,7 +12952,7 @@
"model": "claude-haiku-4-5",
"expected": "1136.09",
"actual": "1136.09",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3414,
"outputTokens": 8,
"latencyMs": 1029.4406660000095
@@ -12963,7 +12963,7 @@
"model": "gpt-5-nano",
"expected": "1136.09",
"actual": "1136.09",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2984,
"outputTokens": 138,
"latencyMs": 2521.28145900002
@@ -12974,7 +12974,7 @@
"model": "claude-haiku-4-5",
"expected": "1136.09",
"actual": "1136.09",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3109,
"outputTokens": 8,
"latencyMs": 1266.9695000000065
@@ -12985,7 +12985,7 @@
"model": "gpt-5-nano",
"expected": "4689",
"actual": "4689",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3712,
"outputTokens": 72,
"latencyMs": 2383.6225830000476
@@ -12996,7 +12996,7 @@
"model": "claude-haiku-4-5",
"expected": "4689",
"actual": "4689",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4080,
"outputTokens": 6,
"latencyMs": 1100.3007499999949
@@ -13007,7 +13007,7 @@
"model": "gpt-5-nano",
"expected": "4689",
"actual": "4689",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1563,
"outputTokens": 200,
"latencyMs": 2816.252374999982
@@ -13018,7 +13018,7 @@
"model": "claude-haiku-4-5",
"expected": "4689",
"actual": "4689",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1509,
"outputTokens": 6,
"latencyMs": 1030.0248330000322
@@ -13029,7 +13029,7 @@
"model": "gpt-5-nano",
"expected": "4689",
"actual": "4689",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1441,
"outputTokens": 72,
"latencyMs": 1819.5161669999943
@@ -13040,7 +13040,7 @@
"model": "claude-haiku-4-5",
"expected": "4689",
"actual": "4689",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1445,
"outputTokens": 6,
"latencyMs": 1012.0581670000101
@@ -13051,7 +13051,7 @@
"model": "gpt-5-nano",
"expected": "4689",
"actual": "4689",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3829,
"outputTokens": 136,
"latencyMs": 2960.8910000000033
@@ -13062,7 +13062,7 @@
"model": "claude-haiku-4-5",
"expected": "4689",
"actual": "4689",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3415,
"outputTokens": 6,
"latencyMs": 1346.7110000000102
@@ -13073,7 +13073,7 @@
"model": "gpt-5-nano",
"expected": "4689",
"actual": "4689",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2985,
"outputTokens": 136,
"latencyMs": 3081.40625
@@ -13084,7 +13084,7 @@
"model": "claude-haiku-4-5",
"expected": "4689",
"actual": "4689",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3110,
"outputTokens": 6,
"latencyMs": 1485.0133330000099
@@ -13095,7 +13095,7 @@
"model": "gpt-5-nano",
"expected": "2637.73",
"actual": "2637.73",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3711,
"outputTokens": 138,
"latencyMs": 3632.860875000013
@@ -13106,7 +13106,7 @@
"model": "claude-haiku-4-5",
"expected": "2637.73",
"actual": "2637.73",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4079,
"outputTokens": 8,
"latencyMs": 1224.803750000021
@@ -13117,7 +13117,7 @@
"model": "gpt-5-nano",
"expected": "2637.73",
"actual": "2637.73",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1562,
"outputTokens": 138,
"latencyMs": 2323.675958000007
@@ -13128,7 +13128,7 @@
"model": "claude-haiku-4-5",
"expected": "2637.73",
"actual": "2637.73",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1508,
"outputTokens": 8,
"latencyMs": 1114.0831669999752
@@ -13139,7 +13139,7 @@
"model": "gpt-5-nano",
"expected": "2637.73",
"actual": "2637.73",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1440,
"outputTokens": 202,
"latencyMs": 3465.111333000008
@@ -13150,7 +13150,7 @@
"model": "claude-haiku-4-5",
"expected": "2637.73",
"actual": "2637.73",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1444,
"outputTokens": 8,
"latencyMs": 1082.4990419999813
@@ -13161,7 +13161,7 @@
"model": "gpt-5-nano",
"expected": "2637.73",
"actual": "2637.73",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3828,
"outputTokens": 138,
"latencyMs": 5648.285415999999
@@ -13172,7 +13172,7 @@
"model": "claude-haiku-4-5",
"expected": "2637.73",
"actual": "2637.73",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3414,
"outputTokens": 8,
"latencyMs": 1087.8757500000065
@@ -13183,7 +13183,7 @@
"model": "gpt-5-nano",
"expected": "2637.73",
"actual": "2637.73",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2984,
"outputTokens": 138,
"latencyMs": 4587.399166000017
@@ -13194,7 +13194,7 @@
"model": "claude-haiku-4-5",
"expected": "2637.73",
"actual": "2637.73",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3109,
"outputTokens": 8,
"latencyMs": 1007.4333340000012
@@ -13205,7 +13205,7 @@
"model": "gpt-5-nano",
"expected": "5685",
"actual": "5685",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3712,
"outputTokens": 72,
"latencyMs": 2307.9398339999607
@@ -13216,7 +13216,7 @@
"model": "claude-haiku-4-5",
"expected": "5685",
"actual": "5685",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4080,
"outputTokens": 6,
"latencyMs": 2368.3719580000034
@@ -13227,7 +13227,7 @@
"model": "gpt-5-nano",
"expected": "5685",
"actual": "5685",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1563,
"outputTokens": 200,
"latencyMs": 3587.720166999963
@@ -13238,7 +13238,7 @@
"model": "claude-haiku-4-5",
"expected": "5685",
"actual": "5685",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1509,
"outputTokens": 6,
"latencyMs": 1053.9867080000113
@@ -13249,7 +13249,7 @@
"model": "gpt-5-nano",
"expected": "5685",
"actual": "5685",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1441,
"outputTokens": 136,
"latencyMs": 1593.4699169999803
@@ -13260,7 +13260,7 @@
"model": "claude-haiku-4-5",
"expected": "5685",
"actual": "5685",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1445,
"outputTokens": 6,
"latencyMs": 2256.4729170000064
@@ -13271,7 +13271,7 @@
"model": "gpt-5-nano",
"expected": "5685",
"actual": "5685",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3829,
"outputTokens": 200,
"latencyMs": 4466.158916999993
@@ -13282,7 +13282,7 @@
"model": "claude-haiku-4-5",
"expected": "5685",
"actual": "5685",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3415,
"outputTokens": 6,
"latencyMs": 1305.1236670000362
@@ -13293,7 +13293,7 @@
"model": "gpt-5-nano",
"expected": "5685",
"actual": "5685",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2985,
"outputTokens": 136,
"latencyMs": 3014.9748339999933
@@ -13304,7 +13304,7 @@
"model": "claude-haiku-4-5",
"expected": "5685",
"actual": "5685",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3110,
"outputTokens": 6,
"latencyMs": 1421.9597920000087
@@ -13315,7 +13315,7 @@
"model": "gpt-5-nano",
"expected": "3421.06",
"actual": "3421.06",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3711,
"outputTokens": 202,
"latencyMs": 19503.25695900002
@@ -13326,7 +13326,7 @@
"model": "claude-haiku-4-5",
"expected": "3421.06",
"actual": "3421.06",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4079,
"outputTokens": 8,
"latencyMs": 1164.002959000005
@@ -13337,7 +13337,7 @@
"model": "gpt-5-nano",
"expected": "3421.06",
"actual": "3421.06",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1562,
"outputTokens": 330,
"latencyMs": 4662.637042000017
@@ -13348,7 +13348,7 @@
"model": "claude-haiku-4-5",
"expected": "3421.06",
"actual": "3421.06",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1508,
"outputTokens": 8,
"latencyMs": 1086.9569170000032
@@ -13359,7 +13359,7 @@
"model": "gpt-5-nano",
"expected": "3421.06",
"actual": "3421.06",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1440,
"outputTokens": 202,
"latencyMs": 2683.73904200003
@@ -13370,7 +13370,7 @@
"model": "claude-haiku-4-5",
"expected": "3421.06",
"actual": "3421.06",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1444,
"outputTokens": 8,
"latencyMs": 2289.0300419999985
@@ -13381,7 +13381,7 @@
"model": "gpt-5-nano",
"expected": "3421.06",
"actual": "3421.06",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3828,
"outputTokens": 74,
"latencyMs": 1877.1760409999988
@@ -13392,7 +13392,7 @@
"model": "claude-haiku-4-5",
"expected": "3421.06",
"actual": "3421.06",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3414,
"outputTokens": 8,
"latencyMs": 1460.1729160000104
@@ -13403,7 +13403,7 @@
"model": "gpt-5-nano",
"expected": "3421.06",
"actual": "3421.06",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2984,
"outputTokens": 138,
"latencyMs": 2582.983708999993
@@ -13414,7 +13414,7 @@
"model": "claude-haiku-4-5",
"expected": "3421.06",
"actual": "3421.06",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3109,
"outputTokens": 8,
"latencyMs": 1014.1320839999826
@@ -13425,7 +13425,7 @@
"model": "gpt-5-nano",
"expected": "344498",
"actual": "344498",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3709,
"outputTokens": 2376,
"latencyMs": 26290.846458000015
@@ -13436,7 +13436,7 @@
"model": "claude-haiku-4-5",
"expected": "344498",
"actual": "188,945",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 4077,
"outputTokens": 7,
"latencyMs": 1288.6627500000177
@@ -13447,7 +13447,7 @@
"model": "gpt-5-nano",
"expected": "344498",
"actual": "344498",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1560,
"outputTokens": 1736,
"latencyMs": 13565.930124999955
@@ -13458,7 +13458,7 @@
"model": "claude-haiku-4-5",
"expected": "344498",
"actual": "337,045",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 1506,
"outputTokens": 7,
"latencyMs": 1190.8501249999972
@@ -13469,7 +13469,7 @@
"model": "gpt-5-nano",
"expected": "344498",
"actual": "344498",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1438,
"outputTokens": 2888,
"latencyMs": 21377.612083000015
@@ -13480,7 +13480,7 @@
"model": "claude-haiku-4-5",
"expected": "344498",
"actual": "372,915",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 1442,
"outputTokens": 7,
"latencyMs": 931.349749999994
@@ -13491,7 +13491,7 @@
"model": "gpt-5-nano",
"expected": "344498",
"actual": "344498",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3826,
"outputTokens": 3208,
"latencyMs": 18997.804958999972
@@ -13502,7 +13502,7 @@
"model": "claude-haiku-4-5",
"expected": "344498",
"actual": "188,647",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 3412,
"outputTokens": 7,
"latencyMs": 1185.3518330000225
@@ -13513,7 +13513,7 @@
"model": "gpt-5-nano",
"expected": "344498",
"actual": "344498",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2982,
"outputTokens": 2184,
"latencyMs": 23924.366792000015
@@ -13524,7 +13524,7 @@
"model": "claude-haiku-4-5",
"expected": "344498",
"actual": "181,854",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 3107,
"outputTokens": 7,
"latencyMs": 2958.913666999957
@@ -13535,7 +13535,7 @@
"model": "gpt-5-nano",
"expected": "312818.50",
"actual": "312818.50",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3707,
"outputTokens": 4170,
"latencyMs": 29361.525874999992
@@ -13546,7 +13546,7 @@
"model": "claude-haiku-4-5",
"expected": "312818.50",
"actual": "287,745.89",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 4075,
"outputTokens": 9,
"latencyMs": 1325.5311249999795
@@ -13557,7 +13557,7 @@
"model": "gpt-5-nano",
"expected": "312818.50",
"actual": "312818.50",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1558,
"outputTokens": 4106,
"latencyMs": 37997.09958400001
@@ -13568,7 +13568,7 @@
"model": "claude-haiku-4-5",
"expected": "312818.50",
"actual": "487,891.45",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 1504,
"outputTokens": 9,
"latencyMs": 1184.0957090000156
@@ -13579,7 +13579,7 @@
"model": "gpt-5-nano",
"expected": "312818.50",
"actual": "312818.50",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1436,
"outputTokens": 3658,
"latencyMs": 26945.63508400001
@@ -13590,7 +13590,7 @@
"model": "claude-haiku-4-5",
"expected": "312818.50",
"actual": "487,891.89",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 1440,
"outputTokens": 9,
"latencyMs": 1162.16949999996
@@ -13601,7 +13601,7 @@
"model": "gpt-5-nano",
"expected": "312818.50",
"actual": "312818.50",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3824,
"outputTokens": 3722,
"latencyMs": 27321.698167000024
@@ -13612,7 +13612,7 @@
"model": "claude-haiku-4-5",
"expected": "312818.50",
"actual": "381,968.89",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 3410,
"outputTokens": 9,
"latencyMs": 2065.7583339999546
@@ -13623,7 +13623,7 @@
"model": "gpt-5-nano",
"expected": "312818.50",
"actual": "312818.50",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2980,
"outputTokens": 3658,
"latencyMs": 28778.99891600001
@@ -13634,7 +13634,7 @@
"model": "claude-haiku-4-5",
"expected": "312818.50",
"actual": "381,847.89",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 3105,
"outputTokens": 9,
"latencyMs": 1233.4267090000212
@@ -13645,7 +13645,7 @@
"model": "gpt-5-nano",
"expected": "1811",
"actual": "1811",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3709,
"outputTokens": 2568,
"latencyMs": 28626.692666999996
@@ -13656,7 +13656,7 @@
"model": "claude-haiku-4-5",
"expected": "1811",
"actual": "1,234",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 4078,
"outputTokens": 7,
"latencyMs": 1133.735584000009
@@ -13667,7 +13667,7 @@
"model": "gpt-5-nano",
"expected": "1811",
"actual": "1811",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1560,
"outputTokens": 1672,
"latencyMs": 14898.688125000044
@@ -13678,7 +13678,7 @@
"model": "claude-haiku-4-5",
"expected": "1811",
"actual": "1,945",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 1507,
"outputTokens": 7,
"latencyMs": 1178.2744999999995
@@ -13689,7 +13689,7 @@
"model": "gpt-5-nano",
"expected": "1811",
"actual": "1811",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1438,
"outputTokens": 1864,
"latencyMs": 15225.964540999965
@@ -13700,7 +13700,7 @@
"model": "claude-haiku-4-5",
"expected": "1811",
"actual": "1,945",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 1443,
"outputTokens": 7,
"latencyMs": 1077.2695419999654
@@ -13711,7 +13711,7 @@
"model": "gpt-5-nano",
"expected": "1811",
"actual": "1811",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3826,
"outputTokens": 1928,
"latencyMs": 14057.434583000024
@@ -13722,7 +13722,7 @@
"model": "claude-haiku-4-5",
"expected": "1811",
"actual": "1,454",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 3413,
"outputTokens": 7,
"latencyMs": 1177.537500000035
@@ -13733,7 +13733,7 @@
"model": "gpt-5-nano",
"expected": "1811",
"actual": "1811",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2982,
"outputTokens": 2312,
"latencyMs": 19125.74099999998
@@ -13744,7 +13744,7 @@
"model": "claude-haiku-4-5",
"expected": "1811",
"actual": "1,454",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 3108,
"outputTokens": 7,
"latencyMs": 1047.243833000015
@@ -13755,7 +13755,7 @@
"model": "gpt-5-nano",
"expected": "42",
"actual": "42",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3709,
"outputTokens": 1735,
"latencyMs": 14875.021707999986
@@ -13766,7 +13766,7 @@
"model": "claude-haiku-4-5",
"expected": "42",
"actual": "42",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4078,
"outputTokens": 5,
"latencyMs": 1076.5694999999832
@@ -13777,7 +13777,7 @@
"model": "gpt-5-nano",
"expected": "42",
"actual": "42",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1560,
"outputTokens": 2823,
"latencyMs": 22604.422416999994
@@ -13788,7 +13788,7 @@
"model": "claude-haiku-4-5",
"expected": "42",
"actual": "42",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1507,
"outputTokens": 5,
"latencyMs": 1451.705666999973
@@ -13799,7 +13799,7 @@
"model": "gpt-5-nano",
"expected": "42",
"actual": "42",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1438,
"outputTokens": 2183,
"latencyMs": 16916.007042000012
@@ -13810,7 +13810,7 @@
"model": "claude-haiku-4-5",
"expected": "42",
"actual": "42",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1443,
"outputTokens": 5,
"latencyMs": 1103.1098750000237
@@ -13821,7 +13821,7 @@
"model": "gpt-5-nano",
"expected": "42",
"actual": "42",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3826,
"outputTokens": 2055,
"latencyMs": 17162.629124999978
@@ -13832,7 +13832,7 @@
"model": "claude-haiku-4-5",
"expected": "42",
"actual": "47",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 3413,
"outputTokens": 5,
"latencyMs": 1150.0435000000289
@@ -13843,7 +13843,7 @@
"model": "gpt-5-nano",
"expected": "42",
"actual": "42",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2982,
"outputTokens": 1607,
"latencyMs": 14835.323333000008
@@ -13854,7 +13854,7 @@
"model": "claude-haiku-4-5",
"expected": "42",
"actual": "47",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 3108,
"outputTokens": 5,
"latencyMs": 1206.8219590000226
@@ -13865,7 +13865,7 @@
"model": "gpt-5-nano",
"expected": "28",
"actual": "28",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3709,
"outputTokens": 1479,
"latencyMs": 11560.967958000023
@@ -13876,7 +13876,7 @@
"model": "claude-haiku-4-5",
"expected": "28",
"actual": "24",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 4078,
"outputTokens": 5,
"latencyMs": 1151.9984169999952
@@ -13887,7 +13887,7 @@
"model": "gpt-5-nano",
"expected": "28",
"actual": "28",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1560,
"outputTokens": 1927,
"latencyMs": 15431.08262499998
@@ -13898,7 +13898,7 @@
"model": "claude-haiku-4-5",
"expected": "28",
"actual": "26",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 1507,
"outputTokens": 5,
"latencyMs": 1032.7485419999575
@@ -13909,7 +13909,7 @@
"model": "gpt-5-nano",
"expected": "28",
"actual": "28",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1438,
"outputTokens": 1607,
"latencyMs": 9425.883957999991
@@ -13920,7 +13920,7 @@
"model": "claude-haiku-4-5",
"expected": "28",
"actual": "23",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 1443,
"outputTokens": 5,
"latencyMs": 943.5942919999943
@@ -13931,7 +13931,7 @@
"model": "gpt-5-nano",
"expected": "28",
"actual": "28",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3826,
"outputTokens": 1927,
"latencyMs": 16529.66529199999
@@ -13942,7 +13942,7 @@
"model": "claude-haiku-4-5",
"expected": "28",
"actual": "24",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 3413,
"outputTokens": 5,
"latencyMs": 1107.5635419999599
@@ -13953,7 +13953,7 @@
"model": "gpt-5-nano",
"expected": "28",
"actual": "28",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2982,
"outputTokens": 1863,
"latencyMs": 21071.067082999973
@@ -13964,7 +13964,7 @@
"model": "claude-haiku-4-5",
"expected": "28",
"actual": "23",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 3108,
"outputTokens": 5,
"latencyMs": 1018.46212500002
@@ -13975,7 +13975,7 @@
"model": "gpt-5-nano",
"expected": "11",
"actual": "11",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3709,
"outputTokens": 1223,
"latencyMs": 8242.37608300004
@@ -13986,7 +13986,7 @@
"model": "claude-haiku-4-5",
"expected": "11",
"actual": "11",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 4078,
"outputTokens": 5,
"latencyMs": 1052.7201249999925
@@ -13997,7 +13997,7 @@
"model": "gpt-5-nano",
"expected": "11",
"actual": "11",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1560,
"outputTokens": 903,
"latencyMs": 5430.806291999994
@@ -14008,7 +14008,7 @@
"model": "claude-haiku-4-5",
"expected": "11",
"actual": "12",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 1507,
"outputTokens": 5,
"latencyMs": 2354.328999999969
@@ -14019,7 +14019,7 @@
"model": "gpt-5-nano",
"expected": "11",
"actual": "11",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1438,
"outputTokens": 1607,
"latencyMs": 21944.211458000005
@@ -14030,7 +14030,7 @@
"model": "claude-haiku-4-5",
"expected": "11",
"actual": "11",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1443,
"outputTokens": 5,
"latencyMs": 1249.9959590000217
@@ -14041,7 +14041,7 @@
"model": "gpt-5-nano",
"expected": "11",
"actual": "11",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3826,
"outputTokens": 1415,
"latencyMs": 15465.409875000012
@@ -14052,7 +14052,7 @@
"model": "claude-haiku-4-5",
"expected": "11",
"actual": "11",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3413,
"outputTokens": 5,
"latencyMs": 1131.9575830000103
@@ -14063,7 +14063,7 @@
"model": "gpt-5-nano",
"expected": "11",
"actual": "11",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2982,
"outputTokens": 2503,
"latencyMs": 24744.971958999988
@@ -14074,7 +14074,7 @@
"model": "claude-haiku-4-5",
"expected": "11",
"actual": "11",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3108,
"outputTokens": 5,
"latencyMs": 1274.6952499999898
@@ -14085,7 +14085,7 @@
"model": "gpt-5-nano",
"expected": "58",
"actual": "58",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3708,
"outputTokens": 1351,
"latencyMs": 12546.867542000022
@@ -14096,7 +14096,7 @@
"model": "claude-haiku-4-5",
"expected": "58",
"actual": "50",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 4078,
"outputTokens": 5,
"latencyMs": 1231.453749999986
@@ -14107,7 +14107,7 @@
"model": "gpt-5-nano",
"expected": "58",
"actual": "58",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1559,
"outputTokens": 1543,
"latencyMs": 16593.402166999993
@@ -14118,7 +14118,7 @@
"model": "claude-haiku-4-5",
"expected": "58",
"actual": "47",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 1507,
"outputTokens": 5,
"latencyMs": 1079.0991659999709
@@ -14129,7 +14129,7 @@
"model": "gpt-5-nano",
"expected": "58",
"actual": "58",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1437,
"outputTokens": 1543,
"latencyMs": 10956.456084000005
@@ -14140,7 +14140,7 @@
"model": "claude-haiku-4-5",
"expected": "58",
"actual": "54",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 1443,
"outputTokens": 5,
"latencyMs": 2018.3774170000106
@@ -14151,7 +14151,7 @@
"model": "gpt-5-nano",
"expected": "58",
"actual": "58",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3825,
"outputTokens": 1351,
"latencyMs": 10537.598500000022
@@ -14162,7 +14162,7 @@
"model": "claude-haiku-4-5",
"expected": "58",
"actual": "47",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 3413,
"outputTokens": 5,
"latencyMs": 1039.2452080000076
@@ -14173,7 +14173,7 @@
"model": "gpt-5-nano",
"expected": "58",
"actual": "58",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2981,
"outputTokens": 839,
"latencyMs": 8039.237708000001
@@ -14184,7 +14184,7 @@
"model": "claude-haiku-4-5",
"expected": "58",
"actual": "54",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 3108,
"outputTokens": 5,
"latencyMs": 1264.6740829999908
@@ -14195,7 +14195,7 @@
"model": "gpt-5-nano",
"expected": "41",
"actual": "41",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3708,
"outputTokens": 1863,
"latencyMs": 14310.697374999989
@@ -14206,7 +14206,7 @@
"model": "claude-haiku-4-5",
"expected": "41",
"actual": "31",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 4078,
"outputTokens": 5,
"latencyMs": 1138.4443339999998
@@ -14217,7 +14217,7 @@
"model": "gpt-5-nano",
"expected": "41",
"actual": "41",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1559,
"outputTokens": 1927,
"latencyMs": 16487.508375000034
@@ -14228,7 +14228,7 @@
"model": "claude-haiku-4-5",
"expected": "41",
"actual": "38",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 1507,
"outputTokens": 5,
"latencyMs": 1104.2365410000202
@@ -14239,7 +14239,7 @@
"model": "gpt-5-nano",
"expected": "41",
"actual": "41",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1437,
"outputTokens": 3015,
"latencyMs": 23688.737208999984
@@ -14250,7 +14250,7 @@
"model": "claude-haiku-4-5",
"expected": "41",
"actual": "38",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 1443,
"outputTokens": 5,
"latencyMs": 1026.8166249999776
@@ -14261,7 +14261,7 @@
"model": "gpt-5-nano",
"expected": "41",
"actual": "41",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3825,
"outputTokens": 1671,
"latencyMs": 12415.87070899998
@@ -14272,7 +14272,7 @@
"model": "claude-haiku-4-5",
"expected": "41",
"actual": "31",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 3413,
"outputTokens": 5,
"latencyMs": 1062.2278749999823
@@ -14283,7 +14283,7 @@
"model": "gpt-5-nano",
"expected": "41",
"actual": "41",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2981,
"outputTokens": 1799,
"latencyMs": 15901.829415999993
@@ -14294,7 +14294,7 @@
"model": "claude-haiku-4-5",
"expected": "41",
"actual": "31",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 3108,
"outputTokens": 5,
"latencyMs": 1051.6962910000002
@@ -14305,7 +14305,7 @@
"model": "gpt-5-nano",
"expected": "23",
"actual": "23",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3708,
"outputTokens": 1863,
"latencyMs": 15216.926500000001
@@ -14316,7 +14316,7 @@
"model": "claude-haiku-4-5",
"expected": "23",
"actual": "20",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 4078,
"outputTokens": 5,
"latencyMs": 1460.9212079999852
@@ -14327,7 +14327,7 @@
"model": "gpt-5-nano",
"expected": "23",
"actual": "23",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1559,
"outputTokens": 2567,
"latencyMs": 27103.083999999973
@@ -14338,7 +14338,7 @@
"model": "claude-haiku-4-5",
"expected": "23",
"actual": "20",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 1507,
"outputTokens": 5,
"latencyMs": 1101.5416669999831
@@ -14349,7 +14349,7 @@
"model": "gpt-5-nano",
"expected": "23",
"actual": "23",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 1437,
"outputTokens": 1543,
"latencyMs": 14598.558207999973
@@ -14360,7 +14360,7 @@
"model": "claude-haiku-4-5",
"expected": "23",
"actual": "20",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 1443,
"outputTokens": 5,
"latencyMs": 1270.7722910000011
@@ -14371,7 +14371,7 @@
"model": "gpt-5-nano",
"expected": "23",
"actual": "23",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 3825,
"outputTokens": 1415,
"latencyMs": 14102.604708999977
@@ -14382,7 +14382,7 @@
"model": "claude-haiku-4-5",
"expected": "23",
"actual": "21",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 3413,
"outputTokens": 5,
"latencyMs": 1251.4159170000348
@@ -14393,7 +14393,7 @@
"model": "gpt-5-nano",
"expected": "23",
"actual": "23",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 2981,
"outputTokens": 1799,
"latencyMs": 18696.684999999998
@@ -14404,7 +14404,7 @@
"model": "claude-haiku-4-5",
"expected": "23",
"actual": "21",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 3108,
"outputTokens": 5,
"latencyMs": 1170.9401669999934
@@ -14415,7 +14415,7 @@
"model": "gpt-5-nano",
"expected": "430828",
"actual": "430828",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15187,
"outputTokens": 136,
"latencyMs": 2872.1482499999693
@@ -14426,7 +14426,7 @@
"model": "claude-haiku-4-5",
"expected": "430828",
"actual": "430828",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17409,
"outputTokens": 6,
"latencyMs": 1382.586333000043
@@ -14437,7 +14437,7 @@
"model": "gpt-5-nano",
"expected": "430828",
"actual": "430828",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8788,
"outputTokens": 904,
"latencyMs": 9130.657125000027
@@ -14448,7 +14448,7 @@
"model": "claude-haiku-4-5",
"expected": "430828",
"actual": "430828",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9279,
"outputTokens": 6,
"latencyMs": 1164.3372080000117
@@ -14459,7 +14459,7 @@
"model": "gpt-5-nano",
"expected": "430828",
"actual": "430828",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8556,
"outputTokens": 648,
"latencyMs": 7763.659999999974
@@ -14470,7 +14470,7 @@
"model": "claude-haiku-4-5",
"expected": "430828",
"actual": "430828",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9125,
"outputTokens": 6,
"latencyMs": 1331.3139999999548
@@ -14481,7 +14481,7 @@
"model": "gpt-5-nano",
"expected": "430828",
"actual": "430828",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15481,
"outputTokens": 584,
"latencyMs": 9411.661499999987
@@ -14492,7 +14492,7 @@
"model": "claude-haiku-4-5",
"expected": "430828",
"actual": "430828",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15367,
"outputTokens": 6,
"latencyMs": 1272.1991249999846
@@ -14503,7 +14503,7 @@
"model": "gpt-5-nano",
"expected": "430828",
"actual": "430828",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13171,
"outputTokens": 200,
"latencyMs": 3587.8712090000045
@@ -14514,7 +14514,7 @@
"model": "claude-haiku-4-5",
"expected": "430828",
"actual": "430828",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14483,
"outputTokens": 6,
"latencyMs": 1710.5899999999674
@@ -14525,7 +14525,7 @@
"model": "gpt-5-nano",
"expected": "11798",
"actual": "11798",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15189,
"outputTokens": 328,
"latencyMs": 3625.780167000019
@@ -14536,7 +14536,7 @@
"model": "claude-haiku-4-5",
"expected": "11798",
"actual": "11798",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17410,
"outputTokens": 6,
"latencyMs": 1785.2782080000034
@@ -14547,7 +14547,7 @@
"model": "gpt-5-nano",
"expected": "11798",
"actual": "11798",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8790,
"outputTokens": 712,
"latencyMs": 6381.770374999964
@@ -14558,7 +14558,7 @@
"model": "claude-haiku-4-5",
"expected": "11798",
"actual": "11798",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9280,
"outputTokens": 6,
"latencyMs": 1352.5436660000123
@@ -14569,7 +14569,7 @@
"model": "gpt-5-nano",
"expected": "11798",
"actual": "11798",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8558,
"outputTokens": 520,
"latencyMs": 27916.417874999985
@@ -14580,7 +14580,7 @@
"model": "claude-haiku-4-5",
"expected": "11798",
"actual": "11798",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9126,
"outputTokens": 6,
"latencyMs": 2073.8068330000388
@@ -14591,7 +14591,7 @@
"model": "gpt-5-nano",
"expected": "11798",
"actual": "11798",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15483,
"outputTokens": 328,
"latencyMs": 5943.872542000026
@@ -14602,7 +14602,7 @@
"model": "claude-haiku-4-5",
"expected": "11798",
"actual": "11798",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15368,
"outputTokens": 6,
"latencyMs": 1767.4393339999951
@@ -14613,7 +14613,7 @@
"model": "gpt-5-nano",
"expected": "11798",
"actual": "11798",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13173,
"outputTokens": 264,
"latencyMs": 3115.895124999981
@@ -14624,7 +14624,7 @@
"model": "claude-haiku-4-5",
"expected": "11798",
"actual": "11798",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14484,
"outputTokens": 6,
"latencyMs": 1183.2249999999767
@@ -14635,7 +14635,7 @@
"model": "gpt-5-nano",
"expected": "183631",
"actual": "183631",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15192,
"outputTokens": 392,
"latencyMs": 4991.646125000028
@@ -14646,7 +14646,7 @@
"model": "claude-haiku-4-5",
"expected": "183631",
"actual": "183631",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17412,
"outputTokens": 6,
"latencyMs": 1835.4077919999836
@@ -14657,7 +14657,7 @@
"model": "gpt-5-nano",
"expected": "183631",
"actual": "183631",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8793,
"outputTokens": 712,
"latencyMs": 7788.013291999989
@@ -14668,7 +14668,7 @@
"model": "claude-haiku-4-5",
"expected": "183631",
"actual": "183631",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9282,
"outputTokens": 6,
"latencyMs": 1082.4066669999738
@@ -14679,7 +14679,7 @@
"model": "gpt-5-nano",
"expected": "183631",
"actual": "183631",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8561,
"outputTokens": 520,
"latencyMs": 5664.896500000032
@@ -14690,7 +14690,7 @@
"model": "claude-haiku-4-5",
"expected": "183631",
"actual": "183631",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9128,
"outputTokens": 6,
"latencyMs": 1215.8875830000034
@@ -14701,7 +14701,7 @@
"model": "gpt-5-nano",
"expected": "183631",
"actual": "183631",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15486,
"outputTokens": 456,
"latencyMs": 5141.449292000034
@@ -14712,7 +14712,7 @@
"model": "claude-haiku-4-5",
"expected": "183631",
"actual": "183631",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15370,
"outputTokens": 6,
"latencyMs": 1483.2090420000022
@@ -14723,7 +14723,7 @@
"model": "gpt-5-nano",
"expected": "183631",
"actual": "183631",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13176,
"outputTokens": 328,
"latencyMs": 7532.760624999995
@@ -14734,7 +14734,7 @@
"model": "claude-haiku-4-5",
"expected": "183631",
"actual": "183631",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14486,
"outputTokens": 6,
"latencyMs": 1458.0657500000088
@@ -14745,7 +14745,7 @@
"model": "gpt-5-nano",
"expected": "29246",
"actual": "29246",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15191,
"outputTokens": 392,
"latencyMs": 7922.4705829999875
@@ -14756,7 +14756,7 @@
"model": "claude-haiku-4-5",
"expected": "29246",
"actual": "29246",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17412,
"outputTokens": 6,
"latencyMs": 1510.0054579999996
@@ -14767,7 +14767,7 @@
"model": "gpt-5-nano",
"expected": "29246",
"actual": "29246",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8792,
"outputTokens": 776,
"latencyMs": 8475.77466699999
@@ -14778,7 +14778,7 @@
"model": "claude-haiku-4-5",
"expected": "29246",
"actual": "29246",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9282,
"outputTokens": 6,
"latencyMs": 1203.3620419999934
@@ -14789,7 +14789,7 @@
"model": "gpt-5-nano",
"expected": "29246",
"actual": "29246",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8560,
"outputTokens": 776,
"latencyMs": 7283.84258300002
@@ -14800,7 +14800,7 @@
"model": "claude-haiku-4-5",
"expected": "29246",
"actual": "29246",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9128,
"outputTokens": 6,
"latencyMs": 1365.2434169999906
@@ -14811,7 +14811,7 @@
"model": "gpt-5-nano",
"expected": "29246",
"actual": "29246",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15485,
"outputTokens": 520,
"latencyMs": 5846.538916999998
@@ -14822,7 +14822,7 @@
"model": "claude-haiku-4-5",
"expected": "29246",
"actual": "29246",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15370,
"outputTokens": 6,
"latencyMs": 1203.6220829999656
@@ -14833,7 +14833,7 @@
"model": "gpt-5-nano",
"expected": "29246",
"actual": "29246",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13175,
"outputTokens": 456,
"latencyMs": 5973.848832999996
@@ -14844,7 +14844,7 @@
"model": "claude-haiku-4-5",
"expected": "29246",
"actual": "29246",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14486,
"outputTokens": 6,
"latencyMs": 1189.811875000014
@@ -14855,7 +14855,7 @@
"model": "gpt-5-nano",
"expected": "135306",
"actual": "135306",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15187,
"outputTokens": 328,
"latencyMs": 8872.252957999997
@@ -14866,7 +14866,7 @@
"model": "claude-haiku-4-5",
"expected": "135306",
"actual": "135306",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17407,
"outputTokens": 6,
"latencyMs": 1775.476083000016
@@ -14877,7 +14877,7 @@
"model": "gpt-5-nano",
"expected": "135306",
"actual": "135306",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8788,
"outputTokens": 648,
"latencyMs": 7149.649291000038
@@ -14888,7 +14888,7 @@
"model": "claude-haiku-4-5",
"expected": "135306",
"actual": "135306",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9277,
"outputTokens": 6,
"latencyMs": 1577.2079999999842
@@ -14899,7 +14899,7 @@
"model": "gpt-5-nano",
"expected": "135306",
"actual": "135306",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8556,
"outputTokens": 1288,
"latencyMs": 11344.462834000005
@@ -14910,7 +14910,7 @@
"model": "claude-haiku-4-5",
"expected": "135306",
"actual": "135306",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9123,
"outputTokens": 6,
"latencyMs": 1340.27887499996
@@ -14921,7 +14921,7 @@
"model": "gpt-5-nano",
"expected": "135306",
"actual": "135306",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15481,
"outputTokens": 392,
"latencyMs": 6256.696250000037
@@ -14932,7 +14932,7 @@
"model": "claude-haiku-4-5",
"expected": "135306",
"actual": "135306",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15365,
"outputTokens": 6,
"latencyMs": 1604.6909999999916
@@ -14943,7 +14943,7 @@
"model": "gpt-5-nano",
"expected": "135306",
"actual": "135306",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13171,
"outputTokens": 456,
"latencyMs": 5982.022666999954
@@ -14954,7 +14954,7 @@
"model": "claude-haiku-4-5",
"expected": "135306",
"actual": "135306",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14481,
"outputTokens": 6,
"latencyMs": 1259.2409589999588
@@ -14965,7 +14965,7 @@
"model": "gpt-5-nano",
"expected": "24914",
"actual": "24914",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15186,
"outputTokens": 200,
"latencyMs": 2858.1693749999977
@@ -14976,7 +14976,7 @@
"model": "claude-haiku-4-5",
"expected": "24914",
"actual": "24914",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17408,
"outputTokens": 6,
"latencyMs": 1786.5725000000093
@@ -14987,7 +14987,7 @@
"model": "gpt-5-nano",
"expected": "24914",
"actual": "24914",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8787,
"outputTokens": 2696,
"latencyMs": 23868.72975
@@ -14998,7 +14998,7 @@
"model": "claude-haiku-4-5",
"expected": "24914",
"actual": "24914",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9278,
"outputTokens": 6,
"latencyMs": 1116.0275000000256
@@ -15009,7 +15009,7 @@
"model": "gpt-5-nano",
"expected": "24914",
"actual": "0",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8555,
"outputTokens": 1543,
"latencyMs": 17006.341916999954
@@ -15020,7 +15020,7 @@
"model": "claude-haiku-4-5",
"expected": "24914",
"actual": "24914",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9124,
"outputTokens": 6,
"latencyMs": 1425.7799160000286
@@ -15031,7 +15031,7 @@
"model": "gpt-5-nano",
"expected": "24914",
"actual": "24914",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15480,
"outputTokens": 648,
"latencyMs": 8414.583791000012
@@ -15042,7 +15042,7 @@
"model": "claude-haiku-4-5",
"expected": "24914",
"actual": "24914",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15366,
"outputTokens": 6,
"latencyMs": 1374.9217920000083
@@ -15053,7 +15053,7 @@
"model": "gpt-5-nano",
"expected": "24914",
"actual": "24914",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13170,
"outputTokens": 456,
"latencyMs": 6113.31808300002
@@ -15064,7 +15064,7 @@
"model": "claude-haiku-4-5",
"expected": "24914",
"actual": "24914",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14482,
"outputTokens": 6,
"latencyMs": 1374.9246660000063
@@ -15075,7 +15075,7 @@
"model": "gpt-5-nano",
"expected": "111683",
"actual": "111683",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15186,
"outputTokens": 392,
"latencyMs": 5410.596499999985
@@ -15086,7 +15086,7 @@
"model": "claude-haiku-4-5",
"expected": "111683",
"actual": "111683",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17407,
"outputTokens": 6,
"latencyMs": 1607.6261659999727
@@ -15097,7 +15097,7 @@
"model": "gpt-5-nano",
"expected": "111683",
"actual": "111683",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8787,
"outputTokens": 520,
"latencyMs": 6469.81479199999
@@ -15108,7 +15108,7 @@
"model": "claude-haiku-4-5",
"expected": "111683",
"actual": "111683",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9277,
"outputTokens": 6,
"latencyMs": 1103.9521250000107
@@ -15119,7 +15119,7 @@
"model": "gpt-5-nano",
"expected": "111683",
"actual": "111683",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8555,
"outputTokens": 904,
"latencyMs": 8993.236791000003
@@ -15130,7 +15130,7 @@
"model": "claude-haiku-4-5",
"expected": "111683",
"actual": "111683",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9123,
"outputTokens": 6,
"latencyMs": 1118.0249590000021
@@ -15141,7 +15141,7 @@
"model": "gpt-5-nano",
"expected": "111683",
"actual": "111683",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15480,
"outputTokens": 392,
"latencyMs": 4705.902084000001
@@ -15152,7 +15152,7 @@
"model": "claude-haiku-4-5",
"expected": "111683",
"actual": "111683",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15365,
"outputTokens": 6,
"latencyMs": 1454.1250839999993
@@ -15163,7 +15163,7 @@
"model": "gpt-5-nano",
"expected": "111683",
"actual": "111683",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13170,
"outputTokens": 456,
"latencyMs": 5041.734750000003
@@ -15174,7 +15174,7 @@
"model": "claude-haiku-4-5",
"expected": "111683",
"actual": "111683",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14481,
"outputTokens": 6,
"latencyMs": 1199.9473330000183
@@ -15185,7 +15185,7 @@
"model": "gpt-5-nano",
"expected": "13364",
"actual": "13364",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15193,
"outputTokens": 328,
"latencyMs": 4364.900083000015
@@ -15196,7 +15196,7 @@
"model": "claude-haiku-4-5",
"expected": "13364",
"actual": "13364",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17412,
"outputTokens": 6,
"latencyMs": 1320.7056250000023
@@ -15207,7 +15207,7 @@
"model": "gpt-5-nano",
"expected": "13364",
"actual": "13364",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8794,
"outputTokens": 904,
"latencyMs": 8590.36599999998
@@ -15218,7 +15218,7 @@
"model": "claude-haiku-4-5",
"expected": "13364",
"actual": "13364",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9282,
"outputTokens": 6,
"latencyMs": 1166.0237089999719
@@ -15229,7 +15229,7 @@
"model": "gpt-5-nano",
"expected": "13364",
"actual": "13364",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8562,
"outputTokens": 648,
"latencyMs": 6442.057417000004
@@ -15240,7 +15240,7 @@
"model": "claude-haiku-4-5",
"expected": "13364",
"actual": "13364",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9128,
"outputTokens": 6,
"latencyMs": 1342.8652910000528
@@ -15251,7 +15251,7 @@
"model": "gpt-5-nano",
"expected": "13364",
"actual": "13364",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15487,
"outputTokens": 264,
"latencyMs": 4450.340833000024
@@ -15262,7 +15262,7 @@
"model": "claude-haiku-4-5",
"expected": "13364",
"actual": "13364",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15370,
"outputTokens": 6,
"latencyMs": 1551.4001249999856
@@ -15273,7 +15273,7 @@
"model": "gpt-5-nano",
"expected": "13364",
"actual": "13364",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13177,
"outputTokens": 520,
"latencyMs": 5858.679374999949
@@ -15284,7 +15284,7 @@
"model": "claude-haiku-4-5",
"expected": "13364",
"actual": "13364",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14486,
"outputTokens": 6,
"latencyMs": 1173.6422499999753
@@ -15295,7 +15295,7 @@
"model": "gpt-5-nano",
"expected": "98464",
"actual": "98464",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15185,
"outputTokens": 456,
"latencyMs": 6377.878708000004
@@ -15306,7 +15306,7 @@
"model": "claude-haiku-4-5",
"expected": "98464",
"actual": "98464",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17405,
"outputTokens": 6,
"latencyMs": 1312.9188750000321
@@ -15317,7 +15317,7 @@
"model": "gpt-5-nano",
"expected": "98464",
"actual": "98464",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8786,
"outputTokens": 4680,
"latencyMs": 36395.80937499995
@@ -15328,7 +15328,7 @@
"model": "claude-haiku-4-5",
"expected": "98464",
"actual": "98464",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9275,
"outputTokens": 6,
"latencyMs": 2024.6539580000099
@@ -15339,7 +15339,7 @@
"model": "gpt-5-nano",
"expected": "98464",
"actual": "98464",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8554,
"outputTokens": 3784,
"latencyMs": 30336.309707999986
@@ -15350,7 +15350,7 @@
"model": "claude-haiku-4-5",
"expected": "98464",
"actual": "98464",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9121,
"outputTokens": 6,
"latencyMs": 1237.6976249999716
@@ -15361,7 +15361,7 @@
"model": "gpt-5-nano",
"expected": "98464",
"actual": "98464",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15479,
"outputTokens": 264,
"latencyMs": 5297.444375000021
@@ -15372,7 +15372,7 @@
"model": "claude-haiku-4-5",
"expected": "98464",
"actual": "98464",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15363,
"outputTokens": 6,
"latencyMs": 1775.3334170000162
@@ -15383,7 +15383,7 @@
"model": "gpt-5-nano",
"expected": "98464",
"actual": "98464",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13169,
"outputTokens": 392,
"latencyMs": 8030.958958000003
@@ -15394,7 +15394,7 @@
"model": "claude-haiku-4-5",
"expected": "98464",
"actual": "98464",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14479,
"outputTokens": 6,
"latencyMs": 1401.1453330000513
@@ -15405,7 +15405,7 @@
"model": "gpt-5-nano",
"expected": "6378",
"actual": "6378",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15187,
"outputTokens": 264,
"latencyMs": 6193.845583000046
@@ -15416,7 +15416,7 @@
"model": "claude-haiku-4-5",
"expected": "6378",
"actual": "6378",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17408,
"outputTokens": 6,
"latencyMs": 2449.4082920000073
@@ -15427,7 +15427,7 @@
"model": "gpt-5-nano",
"expected": "6378",
"actual": "6378",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8788,
"outputTokens": 2568,
"latencyMs": 25386.850749999983
@@ -15438,7 +15438,7 @@
"model": "claude-haiku-4-5",
"expected": "6378",
"actual": "6378",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9278,
"outputTokens": 6,
"latencyMs": 1351.401165999996
@@ -15449,7 +15449,7 @@
"model": "gpt-5-nano",
"expected": "6378",
"actual": "6378",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8556,
"outputTokens": 456,
"latencyMs": 5087.453167000029
@@ -15460,7 +15460,7 @@
"model": "claude-haiku-4-5",
"expected": "6378",
"actual": "6378",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9124,
"outputTokens": 6,
"latencyMs": 1229.4187500000116
@@ -15471,7 +15471,7 @@
"model": "gpt-5-nano",
"expected": "6378",
"actual": "6378",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15481,
"outputTokens": 520,
"latencyMs": 6781.348249999981
@@ -15482,7 +15482,7 @@
"model": "claude-haiku-4-5",
"expected": "6378",
"actual": "6378",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15366,
"outputTokens": 6,
"latencyMs": 1411.0081670000218
@@ -15493,7 +15493,7 @@
"model": "gpt-5-nano",
"expected": "6378",
"actual": "6378",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13171,
"outputTokens": 328,
"latencyMs": 9405.325083000003
@@ -15504,7 +15504,7 @@
"model": "claude-haiku-4-5",
"expected": "6378",
"actual": "6378",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14482,
"outputTokens": 6,
"latencyMs": 1575.9942499999888
@@ -15515,7 +15515,7 @@
"model": "gpt-5-nano",
"expected": "254916",
"actual": "254916",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15189,
"outputTokens": 456,
"latencyMs": 7723.79820900003
@@ -15526,7 +15526,7 @@
"model": "claude-haiku-4-5",
"expected": "254916",
"actual": "254916",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17409,
"outputTokens": 6,
"latencyMs": 1496.878625000012
@@ -15537,7 +15537,7 @@
"model": "gpt-5-nano",
"expected": "254916",
"actual": "254916",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8790,
"outputTokens": 328,
"latencyMs": 5231.312959000003
@@ -15548,7 +15548,7 @@
"model": "claude-haiku-4-5",
"expected": "254916",
"actual": "254916",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9279,
"outputTokens": 6,
"latencyMs": 1145.5107919999864
@@ -15559,7 +15559,7 @@
"model": "gpt-5-nano",
"expected": "254916",
"actual": "254916",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8558,
"outputTokens": 392,
"latencyMs": 4585.943417000002
@@ -15570,7 +15570,7 @@
"model": "claude-haiku-4-5",
"expected": "254916",
"actual": "254916",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9125,
"outputTokens": 6,
"latencyMs": 1386.1237079999992
@@ -15581,7 +15581,7 @@
"model": "gpt-5-nano",
"expected": "254916",
"actual": "254916",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15483,
"outputTokens": 328,
"latencyMs": 9374.248917000019
@@ -15592,7 +15592,7 @@
"model": "claude-haiku-4-5",
"expected": "254916",
"actual": "254916",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15367,
"outputTokens": 6,
"latencyMs": 1332.4388340000296
@@ -15603,7 +15603,7 @@
"model": "gpt-5-nano",
"expected": "254916",
"actual": "254916",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13173,
"outputTokens": 200,
"latencyMs": 3953.8284580000327
@@ -15614,7 +15614,7 @@
"model": "claude-haiku-4-5",
"expected": "254916",
"actual": "254916",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14483,
"outputTokens": 6,
"latencyMs": 1294.3535840000259
@@ -15625,7 +15625,7 @@
"model": "gpt-5-nano",
"expected": "32413",
"actual": "32413",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15187,
"outputTokens": 584,
"latencyMs": 8515.676582999993
@@ -15636,7 +15636,7 @@
"model": "claude-haiku-4-5",
"expected": "32413",
"actual": "32413",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17410,
"outputTokens": 6,
"latencyMs": 2508.0940420000115
@@ -15647,7 +15647,7 @@
"model": "gpt-5-nano",
"expected": "32413",
"actual": "32413",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8788,
"outputTokens": 584,
"latencyMs": 6331.0320000000065
@@ -15658,7 +15658,7 @@
"model": "claude-haiku-4-5",
"expected": "32413",
"actual": "32413",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9280,
"outputTokens": 6,
"latencyMs": 1249.4856250000303
@@ -15669,7 +15669,7 @@
"model": "gpt-5-nano",
"expected": "32413",
"actual": "32413",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8556,
"outputTokens": 648,
"latencyMs": 8463.519499999995
@@ -15680,7 +15680,7 @@
"model": "claude-haiku-4-5",
"expected": "32413",
"actual": "32413",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9126,
"outputTokens": 6,
"latencyMs": 1035.4223750000237
@@ -15691,7 +15691,7 @@
"model": "gpt-5-nano",
"expected": "32413",
"actual": "32413",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15481,
"outputTokens": 520,
"latencyMs": 9625.975833999983
@@ -15702,7 +15702,7 @@
"model": "claude-haiku-4-5",
"expected": "32413",
"actual": "32413",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15368,
"outputTokens": 6,
"latencyMs": 1460.7396250000456
@@ -15713,7 +15713,7 @@
"model": "gpt-5-nano",
"expected": "32413",
"actual": "32413",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13171,
"outputTokens": 712,
"latencyMs": 7525.112709000008
@@ -15724,7 +15724,7 @@
"model": "claude-haiku-4-5",
"expected": "32413",
"actual": "32413",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14484,
"outputTokens": 6,
"latencyMs": 1488.0029170000344
@@ -15735,7 +15735,7 @@
"model": "gpt-5-nano",
"expected": "240059",
"actual": "not found",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 15185,
"outputTokens": 1352,
"latencyMs": 8303.157542
@@ -15746,7 +15746,7 @@
"model": "claude-haiku-4-5",
"expected": "240059",
"actual": "240059",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17405,
"outputTokens": 6,
"latencyMs": 1515.7900000000373
@@ -15757,7 +15757,7 @@
"model": "gpt-5-nano",
"expected": "240059",
"actual": "0",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8786,
"outputTokens": 2503,
"latencyMs": 20915.808583000035
@@ -15768,7 +15768,7 @@
"model": "claude-haiku-4-5",
"expected": "240059",
"actual": "240059",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9275,
"outputTokens": 6,
"latencyMs": 1193.4237079999875
@@ -15779,7 +15779,7 @@
"model": "gpt-5-nano",
"expected": "240059",
"actual": "240059",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8554,
"outputTokens": 4360,
"latencyMs": 34760.80329100002
@@ -15790,7 +15790,7 @@
"model": "claude-haiku-4-5",
"expected": "240059",
"actual": "240059",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9121,
"outputTokens": 6,
"latencyMs": 3022.242749999976
@@ -15801,7 +15801,7 @@
"model": "gpt-5-nano",
"expected": "240059",
"actual": "0",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 15479,
"outputTokens": 2567,
"latencyMs": 15901.546999999962
@@ -15812,7 +15812,7 @@
"model": "claude-haiku-4-5",
"expected": "240059",
"actual": "240059",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15363,
"outputTokens": 6,
"latencyMs": 1358.283374999999
@@ -15823,7 +15823,7 @@
"model": "gpt-5-nano",
"expected": "240059",
"actual": "240059",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13169,
"outputTokens": 584,
"latencyMs": 10520.349042000016
@@ -15834,7 +15834,7 @@
"model": "claude-haiku-4-5",
"expected": "240059",
"actual": "240059",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14479,
"outputTokens": 6,
"latencyMs": 1426.0678330000374
@@ -15845,7 +15845,7 @@
"model": "gpt-5-nano",
"expected": "48986",
"actual": "48986",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15186,
"outputTokens": 712,
"latencyMs": 7069.827042000019
@@ -15856,7 +15856,7 @@
"model": "claude-haiku-4-5",
"expected": "48986",
"actual": "48986",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17406,
"outputTokens": 6,
"latencyMs": 1507.9525419999845
@@ -15867,7 +15867,7 @@
"model": "gpt-5-nano",
"expected": "48986",
"actual": "undefined",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8787,
"outputTokens": 2311,
"latencyMs": 18257.385332999984
@@ -15878,7 +15878,7 @@
"model": "claude-haiku-4-5",
"expected": "48986",
"actual": "48986",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9276,
"outputTokens": 6,
"latencyMs": 1397.3040420000325
@@ -15889,7 +15889,7 @@
"model": "gpt-5-nano",
"expected": "48986",
"actual": "48986",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8555,
"outputTokens": 3976,
"latencyMs": 29865.140291999967
@@ -15900,7 +15900,7 @@
"model": "claude-haiku-4-5",
"expected": "48986",
"actual": "48986",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9122,
"outputTokens": 6,
"latencyMs": 1218.4357079999754
@@ -15911,7 +15911,7 @@
"model": "gpt-5-nano",
"expected": "48986",
"actual": "48986",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15480,
"outputTokens": 904,
"latencyMs": 8906.708750000049
@@ -15922,7 +15922,7 @@
"model": "claude-haiku-4-5",
"expected": "48986",
"actual": "48986",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15364,
"outputTokens": 6,
"latencyMs": 1917.3721249999944
@@ -15933,7 +15933,7 @@
"model": "gpt-5-nano",
"expected": "48986",
"actual": "48986",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13170,
"outputTokens": 1160,
"latencyMs": 9665.802708000003
@@ -15944,7 +15944,7 @@
"model": "claude-haiku-4-5",
"expected": "48986",
"actual": "48986",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14480,
"outputTokens": 6,
"latencyMs": 1342.7929170000134
@@ -15955,7 +15955,7 @@
"model": "gpt-5-nano",
"expected": "209624",
"actual": "209624",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15185,
"outputTokens": 648,
"latencyMs": 6259.387500000012
@@ -15966,7 +15966,7 @@
"model": "claude-haiku-4-5",
"expected": "209624",
"actual": "209624",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17405,
"outputTokens": 6,
"latencyMs": 1860.1597499999916
@@ -15977,7 +15977,7 @@
"model": "gpt-5-nano",
"expected": "209624",
"actual": "209624",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8786,
"outputTokens": 3336,
"latencyMs": 23288.63820799999
@@ -15988,7 +15988,7 @@
"model": "claude-haiku-4-5",
"expected": "209624",
"actual": "209624",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9275,
"outputTokens": 6,
"latencyMs": 1180.5804169999901
@@ -15999,7 +15999,7 @@
"model": "gpt-5-nano",
"expected": "209624",
"actual": "209624",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8554,
"outputTokens": 840,
"latencyMs": 6988.782166000048
@@ -16010,7 +16010,7 @@
"model": "claude-haiku-4-5",
"expected": "209624",
"actual": "209624",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9121,
"outputTokens": 6,
"latencyMs": 1391.326041000022
@@ -16021,7 +16021,7 @@
"model": "gpt-5-nano",
"expected": "209624",
"actual": "209624",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15479,
"outputTokens": 648,
"latencyMs": 6708.915624999965
@@ -16032,7 +16032,7 @@
"model": "claude-haiku-4-5",
"expected": "209624",
"actual": "209624",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15363,
"outputTokens": 6,
"latencyMs": 1364.766833999951
@@ -16043,7 +16043,7 @@
"model": "gpt-5-nano",
"expected": "209624",
"actual": "209624",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13169,
"outputTokens": 328,
"latencyMs": 3396.199416999996
@@ -16054,7 +16054,7 @@
"model": "claude-haiku-4-5",
"expected": "209624",
"actual": "209624",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14479,
"outputTokens": 6,
"latencyMs": 1378.3461249999818
@@ -16065,7 +16065,7 @@
"model": "gpt-5-nano",
"expected": "58023",
"actual": "58023",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15185,
"outputTokens": 200,
"latencyMs": 2947.7053750000196
@@ -16076,7 +16076,7 @@
"model": "claude-haiku-4-5",
"expected": "58023",
"actual": "58023",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17406,
"outputTokens": 6,
"latencyMs": 1512.1218329999829
@@ -16087,7 +16087,7 @@
"model": "gpt-5-nano",
"expected": "58023",
"actual": "58023",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8786,
"outputTokens": 840,
"latencyMs": 7657.443458000023
@@ -16098,7 +16098,7 @@
"model": "claude-haiku-4-5",
"expected": "58023",
"actual": "58023",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9276,
"outputTokens": 6,
"latencyMs": 1119.6807499999995
@@ -16109,7 +16109,7 @@
"model": "gpt-5-nano",
"expected": "58023",
"actual": "58023",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8554,
"outputTokens": 392,
"latencyMs": 4410.906208000029
@@ -16120,7 +16120,7 @@
"model": "claude-haiku-4-5",
"expected": "58023",
"actual": "58023",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9122,
"outputTokens": 6,
"latencyMs": 1227.467249999987
@@ -16131,7 +16131,7 @@
"model": "gpt-5-nano",
"expected": "58023",
"actual": "58023",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15479,
"outputTokens": 328,
"latencyMs": 4168.014292000036
@@ -16142,7 +16142,7 @@
"model": "claude-haiku-4-5",
"expected": "58023",
"actual": "58023",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15364,
"outputTokens": 6,
"latencyMs": 1878.2624590000487
@@ -16153,7 +16153,7 @@
"model": "gpt-5-nano",
"expected": "58023",
"actual": "58023",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13169,
"outputTokens": 456,
"latencyMs": 4726.903416000016
@@ -16164,7 +16164,7 @@
"model": "claude-haiku-4-5",
"expected": "58023",
"actual": "58023",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14480,
"outputTokens": 6,
"latencyMs": 1665.950124999974
@@ -16175,7 +16175,7 @@
"model": "gpt-5-nano",
"expected": "196024",
"actual": "196024",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15188,
"outputTokens": 456,
"latencyMs": 5633.756834
@@ -16186,7 +16186,7 @@
"model": "claude-haiku-4-5",
"expected": "196024",
"actual": "196024",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17407,
"outputTokens": 6,
"latencyMs": 1482.6277910000063
@@ -16197,7 +16197,7 @@
"model": "gpt-5-nano",
"expected": "196024",
"actual": "196024",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8789,
"outputTokens": 1416,
"latencyMs": 11371.267457999988
@@ -16208,7 +16208,7 @@
"model": "claude-haiku-4-5",
"expected": "196024",
"actual": "196024",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9277,
"outputTokens": 6,
"latencyMs": 1690.2400420000195
@@ -16219,7 +16219,7 @@
"model": "gpt-5-nano",
"expected": "196024",
"actual": "Repo not found",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8557,
"outputTokens": 3273,
"latencyMs": 28731.530667000043
@@ -16230,7 +16230,7 @@
"model": "claude-haiku-4-5",
"expected": "196024",
"actual": "196024",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9123,
"outputTokens": 6,
"latencyMs": 1070.5141670000157
@@ -16241,7 +16241,7 @@
"model": "gpt-5-nano",
"expected": "196024",
"actual": "196024",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15482,
"outputTokens": 520,
"latencyMs": 7021.771125000028
@@ -16252,7 +16252,7 @@
"model": "claude-haiku-4-5",
"expected": "196024",
"actual": "196024",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15365,
"outputTokens": 6,
"latencyMs": 1243.7466250000289
@@ -16263,7 +16263,7 @@
"model": "gpt-5-nano",
"expected": "196024",
"actual": "196024",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13172,
"outputTokens": 456,
"latencyMs": 5286.169750000001
@@ -16274,7 +16274,7 @@
"model": "claude-haiku-4-5",
"expected": "196024",
"actual": "196024",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14481,
"outputTokens": 6,
"latencyMs": 1450.456957999966
@@ -16285,7 +16285,7 @@
"model": "gpt-5-nano",
"expected": "30919",
"actual": "30919",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15188,
"outputTokens": 456,
"latencyMs": 5440.864250000042
@@ -16296,7 +16296,7 @@
"model": "claude-haiku-4-5",
"expected": "30919",
"actual": "30919",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17408,
"outputTokens": 6,
"latencyMs": 1369.6618330000201
@@ -16307,7 +16307,7 @@
"model": "gpt-5-nano",
"expected": "30919",
"actual": "30919",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8789,
"outputTokens": 712,
"latencyMs": 6130.9379999999655
@@ -16318,7 +16318,7 @@
"model": "claude-haiku-4-5",
"expected": "30919",
"actual": "30919",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9278,
"outputTokens": 6,
"latencyMs": 1635.81579100003
@@ -16329,7 +16329,7 @@
"model": "gpt-5-nano",
"expected": "30919",
"actual": "N/A",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8557,
"outputTokens": 1288,
"latencyMs": 20319.653374999994
@@ -16340,7 +16340,7 @@
"model": "claude-haiku-4-5",
"expected": "30919",
"actual": "30919",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9124,
"outputTokens": 6,
"latencyMs": 1381.8252079999656
@@ -16351,7 +16351,7 @@
"model": "gpt-5-nano",
"expected": "30919",
"actual": "30919",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15482,
"outputTokens": 328,
"latencyMs": 5951.751374999993
@@ -16362,7 +16362,7 @@
"model": "claude-haiku-4-5",
"expected": "30919",
"actual": "30919",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15366,
"outputTokens": 6,
"latencyMs": 1367.1241670000018
@@ -16373,7 +16373,7 @@
"model": "gpt-5-nano",
"expected": "30919",
"actual": "30919",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13172,
"outputTokens": 328,
"latencyMs": 3499.136334000039
@@ -16384,7 +16384,7 @@
"model": "claude-haiku-4-5",
"expected": "30919",
"actual": "30919",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14482,
"outputTokens": 6,
"latencyMs": 1573.7027499999967
@@ -16395,7 +16395,7 @@
"model": "gpt-5-nano",
"expected": "192220",
"actual": "192220",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15187,
"outputTokens": 392,
"latencyMs": 7833.668625000049
@@ -16406,7 +16406,7 @@
"model": "claude-haiku-4-5",
"expected": "192220",
"actual": "192220",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17405,
"outputTokens": 6,
"latencyMs": 1477.048582999967
@@ -16417,7 +16417,7 @@
"model": "gpt-5-nano",
"expected": "192220",
"actual": "192220",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8788,
"outputTokens": 520,
"latencyMs": 4880.817959000007
@@ -16428,7 +16428,7 @@
"model": "claude-haiku-4-5",
"expected": "192220",
"actual": "192220",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9275,
"outputTokens": 6,
"latencyMs": 1081.6979169999831
@@ -16439,7 +16439,7 @@
"model": "gpt-5-nano",
"expected": "192220",
"actual": "192220",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8556,
"outputTokens": 1992,
"latencyMs": 14180.11841699999
@@ -16450,7 +16450,7 @@
"model": "claude-haiku-4-5",
"expected": "192220",
"actual": "192220",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9121,
"outputTokens": 6,
"latencyMs": 1393.665417000011
@@ -16461,7 +16461,7 @@
"model": "gpt-5-nano",
"expected": "192220",
"actual": "192220",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15481,
"outputTokens": 392,
"latencyMs": 4068.912416999985
@@ -16472,7 +16472,7 @@
"model": "claude-haiku-4-5",
"expected": "192220",
"actual": "192220",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15363,
"outputTokens": 6,
"latencyMs": 1687.0724170000176
@@ -16483,7 +16483,7 @@
"model": "gpt-5-nano",
"expected": "192220",
"actual": "192220",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13171,
"outputTokens": 392,
"latencyMs": 4048.8707089999807
@@ -16494,7 +16494,7 @@
"model": "claude-haiku-4-5",
"expected": "192220",
"actual": "192220",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14479,
"outputTokens": 6,
"latencyMs": 1441.8594579999917
@@ -16505,7 +16505,7 @@
"model": "gpt-5-nano",
"expected": "11763",
"actual": "11763",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15190,
"outputTokens": 392,
"latencyMs": 4563.366041000001
@@ -16516,7 +16516,7 @@
"model": "claude-haiku-4-5",
"expected": "11763",
"actual": "11763",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17414,
"outputTokens": 6,
"latencyMs": 1361.9952920000069
@@ -16527,7 +16527,7 @@
"model": "gpt-5-nano",
"expected": "11763",
"actual": "11763",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8791,
"outputTokens": 904,
"latencyMs": 9523.924416000023
@@ -16538,7 +16538,7 @@
"model": "claude-haiku-4-5",
"expected": "11763",
"actual": "11763",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9284,
"outputTokens": 6,
"latencyMs": 1235.863416999986
@@ -16549,7 +16549,7 @@
"model": "gpt-5-nano",
"expected": "11763",
"actual": "11763",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8559,
"outputTokens": 584,
"latencyMs": 5264.637583000003
@@ -16560,7 +16560,7 @@
"model": "claude-haiku-4-5",
"expected": "11763",
"actual": "11763",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9130,
"outputTokens": 6,
"latencyMs": 1307.1584169999696
@@ -16571,7 +16571,7 @@
"model": "gpt-5-nano",
"expected": "11763",
"actual": "11763",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15484,
"outputTokens": 328,
"latencyMs": 8621.355207999994
@@ -16582,7 +16582,7 @@
"model": "claude-haiku-4-5",
"expected": "11763",
"actual": "11763",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15372,
"outputTokens": 6,
"latencyMs": 1464.8200829999987
@@ -16593,7 +16593,7 @@
"model": "gpt-5-nano",
"expected": "11763",
"actual": "11763",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13174,
"outputTokens": 264,
"latencyMs": 3034.7359999999753
@@ -16604,7 +16604,7 @@
"model": "claude-haiku-4-5",
"expected": "11763",
"actual": "11763",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14488,
"outputTokens": 6,
"latencyMs": 1959.3285000000033
@@ -16615,7 +16615,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "100",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15187,
"outputTokens": 2055,
"latencyMs": 16430.930082999985
@@ -16626,7 +16626,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "0",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 17406,
"outputTokens": 5,
"latencyMs": 1730.124458999955
@@ -16637,7 +16637,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "100",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8788,
"outputTokens": 839,
"latencyMs": 7275.640458000009
@@ -16648,7 +16648,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "0",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9276,
"outputTokens": 5,
"latencyMs": 1286.8315839999705
@@ -16659,7 +16659,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "0",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8556,
"outputTokens": 2695,
"latencyMs": 24177.570000000007
@@ -16670,7 +16670,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "0",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9122,
"outputTokens": 5,
"latencyMs": 1102.5337500000023
@@ -16681,7 +16681,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "100",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15481,
"outputTokens": 1671,
"latencyMs": 14929.856415999995
@@ -16692,7 +16692,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "100",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15364,
"outputTokens": 5,
"latencyMs": 1227.103541999997
@@ -16703,7 +16703,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "0",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 13171,
"outputTokens": 583,
"latencyMs": 5785.248666999978
@@ -16714,7 +16714,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "0",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 14480,
"outputTokens": 5,
"latencyMs": 1959.456125000026
@@ -16725,7 +16725,7 @@
"model": "gpt-5-nano",
"expected": "15404143",
"actual": "19196630",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 15188,
"outputTokens": 13385,
"latencyMs": 239619.323125
@@ -16736,7 +16736,7 @@
"model": "claude-haiku-4-5",
"expected": "15404143",
"actual": "13,847,892",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 17407,
"outputTokens": 9,
"latencyMs": 1838.8340420000022
@@ -16747,7 +16747,7 @@
"model": "gpt-5-nano",
"expected": "15404143",
"actual": "15404143",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8789,
"outputTokens": 12169,
"latencyMs": 109453.991416
@@ -16758,7 +16758,7 @@
"model": "claude-haiku-4-5",
"expected": "15404143",
"actual": "13,847,892",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9277,
"outputTokens": 9,
"latencyMs": 1443.470417000004
@@ -16769,7 +16769,7 @@
"model": "gpt-5-nano",
"expected": "15404143",
"actual": "15404143",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8557,
"outputTokens": 6281,
"latencyMs": 45474.442209
@@ -16780,7 +16780,7 @@
"model": "claude-haiku-4-5",
"expected": "15404143",
"actual": "15,847,892",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9123,
"outputTokens": 9,
"latencyMs": 1361.6022089999751
@@ -16791,7 +16791,7 @@
"model": "gpt-5-nano",
"expected": "15404143",
"actual": "15404143",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15482,
"outputTokens": 4489,
"latencyMs": 29654.25554099999
@@ -16802,7 +16802,7 @@
"model": "claude-haiku-4-5",
"expected": "15404143",
"actual": "13,847,892",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 15365,
"outputTokens": 9,
"latencyMs": 1796.0902500000084
@@ -16813,7 +16813,7 @@
"model": "gpt-5-nano",
"expected": "15404143",
"actual": "15404143",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13172,
"outputTokens": 6409,
"latencyMs": 70234.84133299999
@@ -16824,7 +16824,7 @@
"model": "claude-haiku-4-5",
"expected": "15404143",
"actual": "13,847,892",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 14481,
"outputTokens": 9,
"latencyMs": 1965.7452919999487
@@ -16835,7 +16835,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "60",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 15188,
"outputTokens": 7495,
"latencyMs": 72992.43658400001
@@ -16846,7 +16846,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "100",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 17408,
"outputTokens": 5,
"latencyMs": 1772.3059999999823
@@ -16857,7 +16857,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "100",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8789,
"outputTokens": 2759,
"latencyMs": 19214.133417000005
@@ -16868,7 +16868,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "100",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9278,
"outputTokens": 5,
"latencyMs": 1115.5979170000064
@@ -16879,7 +16879,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "100",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8557,
"outputTokens": 2439,
"latencyMs": 27365.987334000005
@@ -16890,7 +16890,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "100",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 9124,
"outputTokens": 5,
"latencyMs": 1322.4322910000337
@@ -16901,7 +16901,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "100",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15482,
"outputTokens": 5767,
"latencyMs": 60524.90554200002
@@ -16912,7 +16912,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "100",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15366,
"outputTokens": 5,
"latencyMs": 1597.7364170000073
@@ -16923,7 +16923,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "100",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13172,
"outputTokens": 4039,
"latencyMs": 28819.869999999995
@@ -16934,7 +16934,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "100",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 14482,
"outputTokens": 5,
"latencyMs": 1798.9455409999937
@@ -16945,7 +16945,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "86",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 15188,
"outputTokens": 2375,
"latencyMs": 23963.549916999997
@@ -16956,7 +16956,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "71",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 17408,
"outputTokens": 5,
"latencyMs": 1836.1375000000116
@@ -16967,7 +16967,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "100",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8789,
"outputTokens": 3079,
"latencyMs": 26957.04420799995
@@ -16978,7 +16978,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "42",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9278,
"outputTokens": 5,
"latencyMs": 1209.7997920000344
@@ -16989,7 +16989,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "100",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8557,
"outputTokens": 2887,
"latencyMs": 27174.970375000034
@@ -17000,7 +17000,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "47",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9124,
"outputTokens": 5,
"latencyMs": 1293.6252920000115
@@ -17011,7 +17011,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "98",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 15482,
"outputTokens": 2567,
"latencyMs": 29565.065250000043
@@ -17022,7 +17022,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "71",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 15366,
"outputTokens": 5,
"latencyMs": 1230.7459160000435
@@ -17033,7 +17033,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "100",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 13172,
"outputTokens": 2695,
"latencyMs": 20706.84841700003
@@ -17044,7 +17044,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "71",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 14482,
"outputTokens": 5,
"latencyMs": 1743.1536249999772
@@ -17055,7 +17055,7 @@
"model": "gpt-5-nano",
"expected": "76",
"actual": "41",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 15188,
"outputTokens": 8263,
"latencyMs": 60899.858959000034
@@ -17066,7 +17066,7 @@
"model": "claude-haiku-4-5",
"expected": "76",
"actual": "100",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 17408,
"outputTokens": 5,
"latencyMs": 1350.1540420000092
@@ -17077,7 +17077,7 @@
"model": "gpt-5-nano",
"expected": "76",
"actual": "76",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8789,
"outputTokens": 3847,
"latencyMs": 30491.779582999996
@@ -17088,7 +17088,7 @@
"model": "claude-haiku-4-5",
"expected": "76",
"actual": "100",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9278,
"outputTokens": 5,
"latencyMs": 1513.2665410000482
@@ -17099,7 +17099,7 @@
"model": "gpt-5-nano",
"expected": "76",
"actual": "76",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8557,
"outputTokens": 3847,
"latencyMs": 25522.397125000018
@@ -17110,7 +17110,7 @@
"model": "claude-haiku-4-5",
"expected": "76",
"actual": "100",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9124,
"outputTokens": 5,
"latencyMs": 1150.7281660000444
@@ -17121,7 +17121,7 @@
"model": "gpt-5-nano",
"expected": "76",
"actual": "76",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15482,
"outputTokens": 2631,
"latencyMs": 22525.465083000017
@@ -17132,7 +17132,7 @@
"model": "claude-haiku-4-5",
"expected": "76",
"actual": "100",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 15366,
"outputTokens": 5,
"latencyMs": 1438.5829169999924
@@ -17143,7 +17143,7 @@
"model": "gpt-5-nano",
"expected": "76",
"actual": "62",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 13172,
"outputTokens": 1351,
"latencyMs": 11162.623291999975
@@ -17154,7 +17154,7 @@
"model": "claude-haiku-4-5",
"expected": "76",
"actual": "100",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 14482,
"outputTokens": 5,
"latencyMs": 1305.162249999994
@@ -17165,7 +17165,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "129",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 15188,
"outputTokens": 6599,
"latencyMs": 49590.68900000001
@@ -17176,7 +17176,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "89",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 17409,
"outputTokens": 5,
"latencyMs": 1750.9506249999977
@@ -17187,7 +17187,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "100",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8789,
"outputTokens": 8903,
"latencyMs": 68556.36550000001
@@ -17198,7 +17198,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "73",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9279,
"outputTokens": 5,
"latencyMs": 1148.3701669999864
@@ -17209,7 +17209,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "100",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8557,
"outputTokens": 3271,
"latencyMs": 36128.254709
@@ -17220,7 +17220,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "89",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9125,
"outputTokens": 5,
"latencyMs": 1137.2578750000102
@@ -17231,7 +17231,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "79",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 15482,
"outputTokens": 3527,
"latencyMs": 35526.23958300002
@@ -17242,7 +17242,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "95",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 15367,
"outputTokens": 5,
"latencyMs": 1501.6561670000083
@@ -17253,7 +17253,7 @@
"model": "gpt-5-nano",
"expected": "100",
"actual": "99",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 13172,
"outputTokens": 3143,
"latencyMs": 26700.229333000025
@@ -17264,7 +17264,7 @@
"model": "claude-haiku-4-5",
"expected": "100",
"actual": "95",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 14483,
"outputTokens": 5,
"latencyMs": 1159.0904580000206
@@ -17275,7 +17275,7 @@
"model": "gpt-5-nano",
"expected": "95",
"actual": "94",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 15188,
"outputTokens": 4999,
"latencyMs": 32710.407750000013
@@ -17286,7 +17286,7 @@
"model": "claude-haiku-4-5",
"expected": "95",
"actual": "42",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 17409,
"outputTokens": 5,
"latencyMs": 1451.6710420000018
@@ -17297,7 +17297,7 @@
"model": "gpt-5-nano",
"expected": "95",
"actual": "82",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8789,
"outputTokens": 3143,
"latencyMs": 18360.73424999998
@@ -17308,7 +17308,7 @@
"model": "claude-haiku-4-5",
"expected": "95",
"actual": "42",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9279,
"outputTokens": 5,
"latencyMs": 1035.2159160000156
@@ -17319,7 +17319,7 @@
"model": "gpt-5-nano",
"expected": "95",
"actual": "95",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8557,
"outputTokens": 4487,
"latencyMs": 28020.044915999984
@@ -17330,7 +17330,7 @@
"model": "claude-haiku-4-5",
"expected": "95",
"actual": "42",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9125,
"outputTokens": 5,
"latencyMs": 1175.8671249999898
@@ -17341,7 +17341,7 @@
"model": "gpt-5-nano",
"expected": "95",
"actual": "77",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 15482,
"outputTokens": 2887,
"latencyMs": 24031.185459
@@ -17352,7 +17352,7 @@
"model": "claude-haiku-4-5",
"expected": "95",
"actual": "47",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 15367,
"outputTokens": 5,
"latencyMs": 1724.9393750000163
@@ -17363,7 +17363,7 @@
"model": "gpt-5-nano",
"expected": "95",
"actual": "81",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 13172,
"outputTokens": 4359,
"latencyMs": 35723.19641699997
@@ -17374,7 +17374,7 @@
"model": "claude-haiku-4-5",
"expected": "95",
"actual": "47",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 14483,
"outputTokens": 5,
"latencyMs": 1663.259167000011
@@ -17385,7 +17385,7 @@
"model": "gpt-5-nano",
"expected": "83",
"actual": "71",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 15188,
"outputTokens": 2439,
"latencyMs": 18168.518166999973
@@ -17396,7 +17396,7 @@
"model": "claude-haiku-4-5",
"expected": "83",
"actual": "71",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 17409,
"outputTokens": 5,
"latencyMs": 1390.1757499999949
@@ -17407,7 +17407,7 @@
"model": "gpt-5-nano",
"expected": "83",
"actual": "57",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 8789,
"outputTokens": 4423,
"latencyMs": 41240.42016700003
@@ -17418,7 +17418,7 @@
"model": "claude-haiku-4-5",
"expected": "83",
"actual": "73",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9279,
"outputTokens": 5,
"latencyMs": 1066.675458999991
@@ -17429,7 +17429,7 @@
"model": "gpt-5-nano",
"expected": "83",
"actual": "83",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 8557,
"outputTokens": 5831,
"latencyMs": 40638.93858400005
@@ -17440,7 +17440,7 @@
"model": "claude-haiku-4-5",
"expected": "83",
"actual": "73",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 9125,
"outputTokens": 5,
"latencyMs": 1394.1952499999898
@@ -17451,7 +17451,7 @@
"model": "gpt-5-nano",
"expected": "83",
"actual": "83",
- "correct": true,
+ "isCorrect": true,
"inputTokens": 15482,
"outputTokens": 3591,
"latencyMs": 25356.36183400004
@@ -17462,7 +17462,7 @@
"model": "claude-haiku-4-5",
"expected": "83",
"actual": "71",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 15367,
"outputTokens": 5,
"latencyMs": 1238.0827089999802
@@ -17473,7 +17473,7 @@
"model": "gpt-5-nano",
"expected": "83",
"actual": "72",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 13172,
"outputTokens": 2567,
"latencyMs": 25124.520583999984
@@ -17484,7 +17484,7 @@
"model": "claude-haiku-4-5",
"expected": "83",
"actual": "71",
- "correct": false,
+ "isCorrect": false,
"inputTokens": 14483,
"outputTokens": 5,
"latencyMs": 2058.834957999992
diff --git a/benchmarks/results/accuracy/report.md b/benchmarks/results/accuracy/report.md
index a6f9a5c..b44276b 100644
--- a/benchmarks/results/accuracy/report.md
+++ b/benchmarks/results/accuracy/report.md
@@ -28,7 +28,7 @@ claude-haiku-4-5
##### Uniform employee records (TOON optimal format)
| Format | Accuracy | Tokens | Correct/Total |
-|--------|----------|--------|---------------|
+| ------ | -------- | ------ | ------------- |
| `toon` | 86.2% | 2.483 | 100/116 |
| `csv` | 80.2% | 2.337 | 93/116 |
| `yaml` | 82.8% | 4.969 | 96/116 |
@@ -38,7 +38,7 @@ claude-haiku-4-5
##### E-commerce orders with nested structures
| Format | Accuracy | Tokens | Correct/Total |
-|--------|----------|--------|---------------|
+| ------ | -------- | ------ | ------------- |
| `toon` | 90.9% | 5.967 | 80/88 |
| `csv` | 90.9% | 6.735 | 80/88 |
| `yaml` | 89.8% | 7.328 | 79/88 |
@@ -48,17 +48,17 @@ claude-haiku-4-5
##### Time-series analytics data
| Format | Accuracy | Tokens | Correct/Total |
-|--------|----------|--------|---------------|
+| ------ | -------- | ------ | ------------- |
| `csv` | 87.9% | 1.393 | 51/58 |
| `toon` | 86.2% | 1.515 | 50/58 |
| `yaml` | 86.2% | 2.938 | 50/58 |
| `json` | 87.9% | 3.665 | 51/58 |
| `markdown-kv` | 86.2% | 3.779 | 50/58 |
-##### Popular GitHub repositories
+##### Top 100 GitHub repositories
| Format | Accuracy | Tokens | Correct/Total |
-|--------|----------|--------|---------------|
+| ------ | -------- | ------ | ------------- |
| `csv` | 80.4% | 8.513 | 45/56 |
| `toon` | 80.4% | 8.745 | 45/56 |
| `yaml` | 78.6% | 13.129 | 44/56 |
@@ -70,7 +70,7 @@ claude-haiku-4-5
##### gpt-5-nano
| Format | Accuracy | Correct/Total |
-|--------|----------|---------------|
+| ------ | -------- | ------------- |
| `toon` | 97.5% | 155/159 |
| `markdown-kv` | 95.6% | 152/159 |
| `yaml` | 94.3% | 150/159 |
@@ -80,7 +80,7 @@ claude-haiku-4-5
##### claude-haiku-4-5
| Format | Accuracy | Correct/Total |
-|--------|----------|---------------|
+| ------ | -------- | ------------- |
| `markdown-kv` | 76.7% | 122/159 |
| `toon` | 75.5% | 120/159 |
| `json` | 75.5% | 120/159 |
diff --git a/benchmarks/results/accuracy/summary.json b/benchmarks/results/accuracy/summary.json
index a49a81a..dbbd353 100644
--- a/benchmarks/results/accuracy/summary.json
+++ b/benchmarks/results/accuracy/summary.json
@@ -61,7 +61,7 @@
},
{
"name": "github",
- "description": "Popular GitHub repositories"
+ "description": "Top 100 GitHub repositories"
}
],
"tokenCounts": {
@@ -86,5 +86,5 @@
"yaml-analytics": 2938,
"yaml-github": 13129
},
- "timestamp": "2025-10-27T12:43:38.288Z"
+ "timestamp": "2025-10-27T13:04:50.634Z"
}
diff --git a/benchmarks/scripts/accuracy-benchmark.ts b/benchmarks/scripts/accuracy-benchmark.ts
index e2a4c1c..b467c63 100644
--- a/benchmarks/scripts/accuracy-benchmark.ts
+++ b/benchmarks/scripts/accuracy-benchmark.ts
@@ -81,6 +81,7 @@ else {
// Format datasets once (reuse for all questions)
const formattedDatasets: Record> = {}
+
for (const [formatName, formatter] of Object.entries(formatters)) {
formattedDatasets[formatName] ??= {}
@@ -91,6 +92,7 @@ else {
// Generate evaluation tasks
const tasks: { question: Question, formatName: string, modelName: string }[] = []
+
for (const question of questions) {
for (const [formatName] of Object.entries(formatters)) {
for (const [modelName] of Object.entries(activeModels)) {
@@ -100,7 +102,6 @@ else {
}
const total = tasks.length
-
consola.start(`Running ${total} evaluations with concurrency: ${DEFAULT_CONCURRENCY}`)
// Evaluate all tasks in parallel
@@ -110,16 +111,15 @@ else {
const formattedData = formattedDatasets[task.formatName]![task.question.dataset]!
const model = activeModels[task.modelName as keyof typeof activeModels]!
- const result = await evaluateQuestion(
- task.question,
- task.formatName,
+ const result = await evaluateQuestion({
+ question: task.question,
+ formatName: task.formatName,
formattedData,
model,
- task.modelName,
- )
+ })
- // Progress update
- if ((index + 1) % 10 === 0) {
+ // Progress update after task completes
+ if ((index + 1) % 10 === 0 || (index + 1) === total) {
const percent = (((index + 1) / total) * 100).toFixed(1)
consola.start(`Progress: ${index + 1}/${total} (${percent}%)`)
}
@@ -133,6 +133,7 @@ else {
}
// Generate/regenerate markdown report
+consola.start('Generating report and saving results…')
const formatResults = calculateFormatResults(results, tokenCounts)
await saveResults(results, formatResults, questions, tokenCounts)
diff --git a/benchmarks/scripts/token-efficiency-benchmark.ts b/benchmarks/scripts/token-efficiency-benchmark.ts
index 1b9f7d6..f498f3e 100644
--- a/benchmarks/scripts/token-efficiency-benchmark.ts
+++ b/benchmarks/scripts/token-efficiency-benchmark.ts
@@ -46,7 +46,7 @@ const BENCHMARK_EXAMPLES = [
{
name: 'E-commerce Order',
emoji: '🛒',
- description: 'Nested order with customer and items',
+ description: 'Single nested order with customer and items',
getData: generateOrder,
showDetailed: false,
},
diff --git a/benchmarks/src/constants.ts b/benchmarks/src/constants.ts
index e146db0..6434dde 100644
--- a/benchmarks/src/constants.ts
+++ b/benchmarks/src/constants.ts
@@ -5,8 +5,9 @@ export const ROOT_DIR: string = url.fileURLToPath(new URL('../../', import.meta.
export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.meta.url))
/**
- * Benchmark execution configuration
+ * Default concurrency for parallel evaluations
*/
+export const DEFAULT_CONCURRENCY = 20
/**
* Enable dry run mode for quick testing with limited AI requests
@@ -27,13 +28,3 @@ export const DRY_RUN_LIMITS = {
/** Models to use in dry run */
allowedModels: [] as string[],
}
-
-/**
- * Default concurrency for parallel evaluations
- */
-export const DEFAULT_CONCURRENCY = 20
-
-/**
- * Delay between API requests to avoid rate limiting (in milliseconds)
- */
-export const RATE_LIMIT_DELAY_MS = 100
diff --git a/benchmarks/src/datasets.ts b/benchmarks/src/datasets.ts
index 87643f2..0fbb65c 100644
--- a/benchmarks/src/datasets.ts
+++ b/benchmarks/src/datasets.ts
@@ -122,16 +122,16 @@ const analyticsDataset: Dataset = {
}
/**
- * GitHub dataset: Popular repositories
+ * Real-world dataset: Top 100 starred GitHub repositories
*
* @remarks
- * Tests TOON's tabular format with real-world data
+ * Tests TOON's tabular format
*/
const githubDataset: Dataset = {
name: 'github',
- description: 'Popular GitHub repositories',
+ description: 'Top 100 GitHub repositories',
data: {
- repositories: githubRepos.slice(0, 200),
+ repositories: githubRepos,
},
}
diff --git a/benchmarks/src/evaluate.ts b/benchmarks/src/evaluate.ts
index 31642de..e6e490b 100644
--- a/benchmarks/src/evaluate.ts
+++ b/benchmarks/src/evaluate.ts
@@ -9,12 +9,10 @@
import type { LanguageModelV2 } from '@ai-sdk/provider'
import type { EvaluationResult, Question } from './types'
-import { setTimeout } from 'node:timers/promises'
import { anthropic } from '@ai-sdk/anthropic'
import { openai } from '@ai-sdk/openai'
import { generateText } from 'ai'
import { consola } from 'consola'
-import { RATE_LIMIT_DELAY_MS } from './constants'
/**
* Models used for evaluation
@@ -28,11 +26,8 @@ export const models: Record = {
* Evaluate a single question with a specific format and model
*/
export async function evaluateQuestion(
- question: Question,
- formatName: string,
- formattedData: string,
- model: LanguageModelV2,
- modelName: string,
+ { question, formatName, formattedData, model}:
+ { question: Question, formatName: string, formattedData: string, model: LanguageModelV2 },
): Promise {
const prompt = `Given the following data in ${formatName} format:
@@ -51,10 +46,8 @@ Provide only the direct answer, without any additional explanation or formatting
temperature: model.modelId.startsWith('gpt-') ? undefined : 0,
})
- await setTimeout(RATE_LIMIT_DELAY_MS)
-
const latencyMs = performance.now() - startTime
- const correct = await validateAnswer({
+ const isCorrect = await validateAnswer({
actual: text.trim(),
expected: question.groundTruth,
question: question.prompt,
@@ -63,10 +56,10 @@ Provide only the direct answer, without any additional explanation or formatting
return {
questionId: question.id,
format: formatName,
- model: modelName,
+ model: model.modelId,
expected: question.groundTruth,
actual: text.trim(),
- correct,
+ isCorrect,
inputTokens: usage.inputTokens,
outputTokens: usage.outputTokens,
latencyMs,
@@ -105,8 +98,6 @@ Respond with only "YES" or "NO".`
temperature: 0,
})
- await setTimeout(RATE_LIMIT_DELAY_MS)
-
return text.trim().toUpperCase() === 'YES'
}
catch (error) {
diff --git a/benchmarks/src/report.ts b/benchmarks/src/report.ts
index 43d1c23..35891af 100644
--- a/benchmarks/src/report.ts
+++ b/benchmarks/src/report.ts
@@ -3,7 +3,7 @@
*
* Handles:
* - Statistical analysis
- * - Twitter-ready markdown report generation with visual elements
+ * - Markdown report generation with visual elements
* - Per-dataset breakdowns
* - Cost analysis
* - Result file saving
@@ -28,7 +28,7 @@ export function calculateFormatResults(
return formatNames.map((formatName) => {
const formatResults = results.filter(r => r.format === formatName)
- const correctCount = formatResults.filter(r => r.correct).length
+ const correctCount = formatResults.filter(r => r.isCorrect).length
const totalCount = formatResults.length
const accuracy = correctCount / totalCount
@@ -59,24 +59,17 @@ export function generateMarkdownReport(
questions: Question[],
tokenCounts: Record,
): string {
- const lines: string[] = [
- '### Retrieval Accuracy',
- '',
- ]
-
const toon = formatResults.find(r => r.format === 'toon')
const json = formatResults.find(r => r.format === 'json')
- // Model-by-model breakdown with ASCII bars
+ // Build model-by-model breakdown with ASCII bars
const modelCount = Object.keys(models).length
- lines.push(`Tested across **${modelCount} ${modelCount === 1 ? 'LLM' : 'LLMs'}** with data retrieval tasks:`, '', '```')
-
const modelNames = Object.keys(models)
- for (let i = 0; i < modelNames.length; i++) {
- const modelName = modelNames[i]!
+
+ const modelBreakdown = modelNames.map((modelName, i) => {
const modelResults = formatResults.map((fr) => {
const modelFormatResults = results.filter(r => r.model === modelName && r.format === fr.format)
- const correctCount = modelFormatResults.filter(r => r.correct).length
+ const correctCount = modelFormatResults.filter(r => r.isCorrect).length
const totalCount = modelFormatResults.length
const accuracy = totalCount > 0 ? correctCount / totalCount : 0
@@ -88,34 +81,24 @@ export function generateMarkdownReport(
}
}).sort((a, b) => b.accuracy - a.accuracy)
- // Add blank line before model name, except for first model
- if (i > 0)
- lines.push('')
- lines.push(modelName)
- for (const result of modelResults) {
+ const formatLines = modelResults.map((result) => {
const bar = createProgressBar(result.accuracy, 1, 20)
const accuracyStr = `${(result.accuracy * 100).toFixed(1)}%`.padStart(6)
const countStr = `(${result.correctCount}/${result.totalCount})`
- lines.push(` ${result.format.padEnd(12)} ${bar} ${accuracyStr} ${countStr}`)
- }
- }
+ return ` ${result.format.padEnd(12)} ${bar} ${accuracyStr} ${countStr}`
+ }).join('\n')
- lines.push('```', '')
+ // Add blank line before model name, except for first model
+ return `${i > 0 ? '\n' : ''}${modelName}\n${formatLines}`
+ }).join('\n')
- // Summary comparison
- if (toon && json) {
- const tokenSavings = ((1 - toon.totalTokens / json.totalTokens) * 100).toFixed(1)
- lines.push(
- `**Tradeoff:** TOON achieves ${(toon.accuracy * 100).toFixed(1)}% accuracy (vs JSON's ${(json.accuracy * 100).toFixed(1)}%) while using ${tokenSavings}% fewer tokens.`,
- '',
- )
- }
-
- lines.push('', 'View detailed breakdown by dataset and model
', '', '#### Performance by Dataset', '')
-
- for (const dataset of datasets) {
- lines.push(`##### ${dataset.description}`, '')
+ // Build summary comparison
+ const summaryComparison = toon && json
+ ? `**Tradeoff:** TOON achieves ${(toon.accuracy * 100).toFixed(1)}% accuracy (vs JSON's ${(json.accuracy * 100).toFixed(1)}%) while using ${((1 - toon.totalTokens / json.totalTokens) * 100).toFixed(1)}% fewer tokens.`
+ : ''
+ // Build performance by dataset
+ const datasetBreakdown = datasets.map((dataset) => {
const datasetResults = formatResults.map((fr) => {
const datasetFormatResults = results.filter(r => r.questionId.includes(dataset.name) || questions.find(q => q.id === r.questionId)?.dataset === dataset.name)
if (datasetFormatResults.length === 0)
@@ -125,7 +108,7 @@ export function generateMarkdownReport(
if (formatDatasetResults.length === 0)
return undefined
- const correctCount = formatDatasetResults.filter(r => r.correct).length
+ const correctCount = formatDatasetResults.filter(r => r.isCorrect).length
const totalCount = formatDatasetResults.length
const accuracy = totalCount > 0 ? correctCount / totalCount : 0
@@ -143,7 +126,7 @@ export function generateMarkdownReport(
}).filter(Boolean) as { format: string, accuracy: number, tokens: number, correctCount: number, totalCount: number }[]
if (datasetResults.length === 0)
- continue
+ return ''
// Sort by efficiency
datasetResults.sort((a, b) => {
@@ -152,29 +135,24 @@ export function generateMarkdownReport(
return effB - effA
})
- lines.push(
- '| Format | Accuracy | Tokens | Correct/Total |',
- '|--------|----------|--------|---------------|',
- )
+ const tableRows = datasetResults.slice(0, 6).map(result =>
+ `| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.tokens.toLocaleString()} | ${result.correctCount}/${result.totalCount} |`,
+ ).join('\n')
- for (const result of datasetResults.slice(0, 6)) {
- lines.push(
- `| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.tokens.toLocaleString()} | ${result.correctCount}/${result.totalCount} |`,
- )
- }
+ return `
+##### ${dataset.description}
- lines.push('')
- }
-
- // Model breakdown
- lines.push('#### Performance by Model', '')
-
- for (const modelName of Object.keys(models)) {
- lines.push(`##### ${modelName}`, '')
+| Format | Accuracy | Tokens | Correct/Total |
+| ------ | -------- | ------ | ------------- |
+${tableRows}
+`.trimStart()
+ }).filter(Boolean).join('\n')
+ // Build performance by model
+ const modelPerformance = modelNames.map((modelName) => {
const modelResults = formatResults.map((fr) => {
const modelFormatResults = results.filter(r => r.model === modelName && r.format === fr.format)
- const correctCount = modelFormatResults.filter(r => r.correct).length
+ const correctCount = modelFormatResults.filter(r => r.isCorrect).length
const totalCount = modelFormatResults.length
const accuracy = correctCount / totalCount
@@ -186,36 +164,55 @@ export function generateMarkdownReport(
}
}).sort((a, b) => b.accuracy - a.accuracy)
- lines.push('| Format | Accuracy | Correct/Total |', '|--------|----------|---------------|')
+ const tableRows = modelResults.map(result =>
+ `| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.correctCount}/${result.totalCount} |`,
+ ).join('\n')
- for (const result of modelResults) {
- lines.push(`| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.correctCount}/${result.totalCount} |`)
- }
+ return `
+##### ${modelName}
- lines.push('')
- }
+| Format | Accuracy | Correct/Total |
+| ------ | -------- | ------------- |
+${tableRows}
+`.trimStart()
+ }).join('\n')
- // Methodology
- lines.push(
- '#### Methodology',
- '',
- '- **Semantic validation**: LLM-as-judge validates responses semantically (not exact string matching).',
- '- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding.',
- '- **Question types**: Field retrieval, aggregation, and filtering tasks.',
- '- **Real data**: Faker.js-generated datasets + GitHub repositories.',
- '',
- ' ',
- '',
- )
+ return `
+### Retrieval Accuracy
- return lines.join('\n')
+Tested across **${modelCount} ${modelCount === 1 ? 'LLM' : 'LLMs'}** with data retrieval tasks:
+
+\`\`\`
+${modelBreakdown}
+\`\`\`
+
+${summaryComparison}
+
+
+View detailed breakdown by dataset and model
+
+#### Performance by Dataset
+
+${datasetBreakdown}
+#### Performance by Model
+
+${modelPerformance}
+#### Methodology
+
+- **Semantic validation**: LLM-as-judge validates responses semantically (not exact string matching).
+- **Token counting**: Using \`gpt-tokenizer\` with \`o200k_base\` encoding.
+- **Question types**: Field retrieval, aggregation, and filtering tasks.
+- **Real data**: Faker.js-generated datasets + GitHub repositories.
+
+
+`.trimStart()
}
/**
* Calculate token counts for all format+dataset combinations
*/
export function calculateTokenCounts(
- formatters: Record string>,
+ formatters: Record string>,
): Record {
const tokenCounts: Record = {}
@@ -272,7 +269,7 @@ export async function saveResults(
}
/**
- * Generate visual progress bar using ASCII characters (█ for filled, ░ for empty)
+ * Generate visual progress bar using ASCII characters (`█` for filled, `░` for empty)
*/
function createProgressBar(tokens: number, maxTokens: number, width = 30): string {
const filled = Math.round((tokens / maxTokens) * width)
diff --git a/benchmarks/src/types.ts b/benchmarks/src/types.ts
index 11f8bcf..399a167 100644
--- a/benchmarks/src/types.ts
+++ b/benchmarks/src/types.ts
@@ -18,7 +18,7 @@ export interface EvaluationResult {
model: string
expected: string
actual: string
- correct: boolean
+ isCorrect: boolean
inputTokens?: number
outputTokens?: number
latencyMs: number