diff --git a/README.md b/README.md index ca05be5..00393ef 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,13 @@ users[2]{id,name,role}: 2,Bob,user ``` +
+Another reason + +[![xkcd: Standards](https://imgs.xkcd.com/comics/standards_2x.png)](https://xkcd.com/927/) + +
+ > [!NOTE] > I built TOON to save tokens when sending large datasets to LLMs at work, where I tend to have uniform arrays of objects that benefit from the tabular format. @@ -225,7 +232,7 @@ claude-haiku-4-5 ##### Uniform employee records (TOON optimal format) | Format | Accuracy | Tokens | Correct/Total | -|--------|----------|--------|---------------| +| ------ | -------- | ------ | ------------- | | `toon` | 86.2% | 2.483 | 100/116 | | `csv` | 80.2% | 2.337 | 93/116 | | `yaml` | 82.8% | 4.969 | 96/116 | @@ -235,7 +242,7 @@ claude-haiku-4-5 ##### E-commerce orders with nested structures | Format | Accuracy | Tokens | Correct/Total | -|--------|----------|--------|---------------| +| ------ | -------- | ------ | ------------- | | `toon` | 90.9% | 5.967 | 80/88 | | `csv` | 90.9% | 6.735 | 80/88 | | `yaml` | 89.8% | 7.328 | 79/88 | @@ -245,17 +252,17 @@ claude-haiku-4-5 ##### Time-series analytics data | Format | Accuracy | Tokens | Correct/Total | -|--------|----------|--------|---------------| +| ------ | -------- | ------ | ------------- | | `csv` | 87.9% | 1.393 | 51/58 | | `toon` | 86.2% | 1.515 | 50/58 | | `yaml` | 86.2% | 2.938 | 50/58 | | `json` | 87.9% | 3.665 | 51/58 | | `markdown-kv` | 86.2% | 3.779 | 50/58 | -##### Popular GitHub repositories +##### Top 100 GitHub repositories | Format | Accuracy | Tokens | Correct/Total | -|--------|----------|--------|---------------| +| ------ | -------- | ------ | ------------- | | `csv` | 80.4% | 8.513 | 45/56 | | `toon` | 80.4% | 8.745 | 45/56 | | `yaml` | 78.6% | 13.129 | 44/56 | @@ -267,7 +274,7 @@ claude-haiku-4-5 ##### gpt-5-nano | Format | Accuracy | Correct/Total | -|--------|----------|---------------| +| ------ | -------- | ------------- | | `toon` | 97.5% | 155/159 | | `markdown-kv` | 95.6% | 152/159 | | `yaml` | 94.3% | 150/159 | @@ -277,7 +284,7 @@ claude-haiku-4-5 ##### claude-haiku-4-5 | Format | Accuracy | Correct/Total | -|--------|----------|---------------| +| ------ | -------- | ------------- | | `markdown-kv` | 76.7% | 122/159 | | `toon` | 75.5% | 120/159 | | `json` | 75.5% | 120/159 | diff --git a/benchmarks/results/accuracy/raw-results.json b/benchmarks/results/accuracy/raw-results.json index adbe71c..f52e84f 100644 --- a/benchmarks/results/accuracy/raw-results.json +++ b/benchmarks/results/accuracy/raw-results.json @@ -5,7 +5,7 @@ "model": "gpt-5-nano", "expected": "56176", "actual": "56176", - "correct": true, + "isCorrect": true, "inputTokens": 6390, "outputTokens": 72, "latencyMs": 2221.390167 @@ -16,7 +16,7 @@ "model": "claude-haiku-4-5", "expected": "56176", "actual": "56176", - "correct": true, + "isCorrect": true, "inputTokens": 7870, "outputTokens": 6, "latencyMs": 1276.715333 @@ -27,7 +27,7 @@ "model": "gpt-5-nano", "expected": "56176", "actual": "56176", - "correct": true, + "isCorrect": true, "inputTokens": 2527, "outputTokens": 72, "latencyMs": 3718.250833 @@ -38,7 +38,7 @@ "model": "claude-haiku-4-5", "expected": "56176", "actual": "56176", - "correct": true, + "isCorrect": true, "inputTokens": 2982, "outputTokens": 6, "latencyMs": 1215.944708 @@ -49,7 +49,7 @@ "model": "gpt-5-nano", "expected": "56176", "actual": "56176", - "correct": true, + "isCorrect": true, "inputTokens": 2381, "outputTokens": 72, "latencyMs": 2417.306625 @@ -60,7 +60,7 @@ "model": "claude-haiku-4-5", "expected": "56176", "actual": "56176", - "correct": true, + "isCorrect": true, "inputTokens": 2856, "outputTokens": 6, "latencyMs": 1152.5258749999998 @@ -71,7 +71,7 @@ "model": "gpt-5-nano", "expected": "56176", "actual": "56176", - "correct": true, + "isCorrect": true, "inputTokens": 6316, "outputTokens": 72, "latencyMs": 4603.444417 @@ -82,7 +82,7 @@ "model": "claude-haiku-4-5", "expected": "56176", "actual": "56176", - "correct": true, + "isCorrect": true, "inputTokens": 6365, "outputTokens": 6, "latencyMs": 1390.011125 @@ -93,7 +93,7 @@ "model": "gpt-5-nano", "expected": "56176", "actual": "56176", - "correct": true, + "isCorrect": true, "inputTokens": 5012, "outputTokens": 8, "latencyMs": 4339.294459 @@ -104,7 +104,7 @@ "model": "claude-haiku-4-5", "expected": "56176", "actual": "56176", - "correct": true, + "isCorrect": true, "inputTokens": 5760, "outputTokens": 6, "latencyMs": 1374.47325 @@ -115,7 +115,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6390, "outputTokens": 135, "latencyMs": 2550.589042 @@ -126,7 +126,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 7869, "outputTokens": 4, "latencyMs": 1139.559917 @@ -137,7 +137,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2527, "outputTokens": 135, "latencyMs": 2422.8178749999997 @@ -148,7 +148,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2981, "outputTokens": 4, "latencyMs": 1135.579459 @@ -159,7 +159,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2381, "outputTokens": 71, "latencyMs": 4198.553583999999 @@ -170,7 +170,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2855, "outputTokens": 4, "latencyMs": 1147.9685829999999 @@ -181,7 +181,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6316, "outputTokens": 71, "latencyMs": 2594.702667 @@ -192,7 +192,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6364, "outputTokens": 4, "latencyMs": 1568.4054999999998 @@ -203,7 +203,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5012, "outputTokens": 71, "latencyMs": 2516.345875 @@ -214,7 +214,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5759, "outputTokens": 4, "latencyMs": 1633.5375000000001 @@ -225,7 +225,7 @@ "model": "gpt-5-nano", "expected": "lorenza.kunze@yahoo.com", "actual": "lorenza.kunze@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6392, "outputTokens": 76, "latencyMs": 2079.8442499999996 @@ -236,7 +236,7 @@ "model": "claude-haiku-4-5", "expected": "lorenza.kunze@yahoo.com", "actual": "lorenza.kunze@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 7874, "outputTokens": 12, "latencyMs": 1201.556458 @@ -247,7 +247,7 @@ "model": "gpt-5-nano", "expected": "lorenza.kunze@yahoo.com", "actual": "lorenza.kunze@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2529, "outputTokens": 140, "latencyMs": 2356.408 @@ -258,7 +258,7 @@ "model": "claude-haiku-4-5", "expected": "lorenza.kunze@yahoo.com", "actual": "lorenza.kunze@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2986, "outputTokens": 12, "latencyMs": 1113.255166 @@ -269,7 +269,7 @@ "model": "gpt-5-nano", "expected": "lorenza.kunze@yahoo.com", "actual": "lorenza.kunze@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2383, "outputTokens": 140, "latencyMs": 2188.5425419999997 @@ -280,7 +280,7 @@ "model": "claude-haiku-4-5", "expected": "lorenza.kunze@yahoo.com", "actual": "lorenza.kunze@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2860, "outputTokens": 12, "latencyMs": 1029.9496669999999 @@ -291,7 +291,7 @@ "model": "gpt-5-nano", "expected": "lorenza.kunze@yahoo.com", "actual": "lorenza.kunze@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6318, "outputTokens": 140, "latencyMs": 2605.8857080000002 @@ -302,7 +302,7 @@ "model": "claude-haiku-4-5", "expected": "lorenza.kunze@yahoo.com", "actual": "lorenza.kunze@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6369, "outputTokens": 12, "latencyMs": 1273.5997920000004 @@ -313,7 +313,7 @@ "model": "gpt-5-nano", "expected": "lorenza.kunze@yahoo.com", "actual": "lorenza.kunze@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 5014, "outputTokens": 140, "latencyMs": 2530.4294580000005 @@ -324,7 +324,7 @@ "model": "claude-haiku-4-5", "expected": "lorenza.kunze@yahoo.com", "actual": "lorenza.kunze@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 5764, "outputTokens": 12, "latencyMs": 1404.4837089999996 @@ -335,7 +335,7 @@ "model": "gpt-5-nano", "expected": "117381", "actual": "117381", - "correct": true, + "isCorrect": true, "inputTokens": 6390, "outputTokens": 72, "latencyMs": 2302.062125 @@ -346,7 +346,7 @@ "model": "claude-haiku-4-5", "expected": "117381", "actual": "117381", - "correct": true, + "isCorrect": true, "inputTokens": 7870, "outputTokens": 6, "latencyMs": 1114.0778329999998 @@ -357,7 +357,7 @@ "model": "gpt-5-nano", "expected": "117381", "actual": "117381", - "correct": true, + "isCorrect": true, "inputTokens": 2527, "outputTokens": 72, "latencyMs": 2006.7020830000001 @@ -368,7 +368,7 @@ "model": "claude-haiku-4-5", "expected": "117381", "actual": "117381", - "correct": true, + "isCorrect": true, "inputTokens": 2982, "outputTokens": 6, "latencyMs": 1641.5518749999997 @@ -379,7 +379,7 @@ "model": "gpt-5-nano", "expected": "117381", "actual": "117381", - "correct": true, + "isCorrect": true, "inputTokens": 2381, "outputTokens": 136, "latencyMs": 2850.351709 @@ -390,7 +390,7 @@ "model": "claude-haiku-4-5", "expected": "117381", "actual": "117381", - "correct": true, + "isCorrect": true, "inputTokens": 2856, "outputTokens": 6, "latencyMs": 1367.7319589999997 @@ -401,7 +401,7 @@ "model": "gpt-5-nano", "expected": "117381", "actual": "117381", - "correct": true, + "isCorrect": true, "inputTokens": 6316, "outputTokens": 72, "latencyMs": 2477.8365839999997 @@ -412,7 +412,7 @@ "model": "claude-haiku-4-5", "expected": "117381", "actual": "117381", - "correct": true, + "isCorrect": true, "inputTokens": 6365, "outputTokens": 6, "latencyMs": 1309.567083 @@ -423,7 +423,7 @@ "model": "gpt-5-nano", "expected": "117381", "actual": "117381", - "correct": true, + "isCorrect": true, "inputTokens": 5012, "outputTokens": 72, "latencyMs": 1794.2651250000008 @@ -434,7 +434,7 @@ "model": "claude-haiku-4-5", "expected": "117381", "actual": "117381", - "correct": true, + "isCorrect": true, "inputTokens": 5760, "outputTokens": 6, "latencyMs": 1177.5377079999998 @@ -445,7 +445,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6389, "outputTokens": 71, "latencyMs": 1963.9477500000003 @@ -456,7 +456,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 7868, "outputTokens": 4, "latencyMs": 1024.5166669999999 @@ -467,7 +467,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2526, "outputTokens": 135, "latencyMs": 2291.4288749999996 @@ -478,7 +478,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2980, "outputTokens": 4, "latencyMs": 1312.7111250000007 @@ -489,7 +489,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2380, "outputTokens": 135, "latencyMs": 1727.6371660000004 @@ -500,7 +500,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2854, "outputTokens": 4, "latencyMs": 1097.0443749999995 @@ -511,7 +511,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6315, "outputTokens": 135, "latencyMs": 2671.2276250000004 @@ -522,7 +522,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6363, "outputTokens": 4, "latencyMs": 1174.8639999999996 @@ -533,7 +533,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5011, "outputTokens": 71, "latencyMs": 2306.2642499999993 @@ -544,7 +544,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5758, "outputTokens": 4, "latencyMs": 2822.8963750000003 @@ -555,7 +555,7 @@ "model": "gpt-5-nano", "expected": "jayda60@hotmail.com", "actual": "jayda60@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 6390, "outputTokens": 139, "latencyMs": 2827.0400409999993 @@ -566,7 +566,7 @@ "model": "claude-haiku-4-5", "expected": "jayda60@hotmail.com", "actual": "jayda60@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 7871, "outputTokens": 11, "latencyMs": 1151.7215829999996 @@ -577,7 +577,7 @@ "model": "gpt-5-nano", "expected": "jayda60@hotmail.com", "actual": "jayda60@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2527, "outputTokens": 75, "latencyMs": 1714.2902919999997 @@ -588,7 +588,7 @@ "model": "claude-haiku-4-5", "expected": "jayda60@hotmail.com", "actual": "jayda60@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2983, "outputTokens": 11, "latencyMs": 1810.6344170000011 @@ -599,7 +599,7 @@ "model": "gpt-5-nano", "expected": "jayda60@hotmail.com", "actual": "jayda60@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2381, "outputTokens": 75, "latencyMs": 2548.0390000000007 @@ -610,7 +610,7 @@ "model": "claude-haiku-4-5", "expected": "jayda60@hotmail.com", "actual": "jayda60@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2857, "outputTokens": 11, "latencyMs": 1046.7650829999993 @@ -621,7 +621,7 @@ "model": "gpt-5-nano", "expected": "jayda60@hotmail.com", "actual": "jayda60@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 6316, "outputTokens": 139, "latencyMs": 2408.879916000001 @@ -632,7 +632,7 @@ "model": "claude-haiku-4-5", "expected": "jayda60@hotmail.com", "actual": "jayda60@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 6366, "outputTokens": 11, "latencyMs": 1186.5773750000008 @@ -643,7 +643,7 @@ "model": "gpt-5-nano", "expected": "jayda60@hotmail.com", "actual": "jayda60@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 5012, "outputTokens": 139, "latencyMs": 3157.9398329999995 @@ -654,7 +654,7 @@ "model": "claude-haiku-4-5", "expected": "jayda60@hotmail.com", "actual": "jayda60@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 5761, "outputTokens": 11, "latencyMs": 1129.6754170000004 @@ -665,7 +665,7 @@ "model": "gpt-5-nano", "expected": "92971", "actual": "92971", - "correct": true, + "isCorrect": true, "inputTokens": 6390, "outputTokens": 72, "latencyMs": 2893.3476250000003 @@ -676,7 +676,7 @@ "model": "claude-haiku-4-5", "expected": "92971", "actual": "92971", - "correct": true, + "isCorrect": true, "inputTokens": 7870, "outputTokens": 6, "latencyMs": 1288.7682919999988 @@ -687,7 +687,7 @@ "model": "gpt-5-nano", "expected": "92971", "actual": "92971", - "correct": true, + "isCorrect": true, "inputTokens": 2527, "outputTokens": 72, "latencyMs": 2324.6738330000007 @@ -698,7 +698,7 @@ "model": "claude-haiku-4-5", "expected": "92971", "actual": "92971", - "correct": true, + "isCorrect": true, "inputTokens": 2982, "outputTokens": 6, "latencyMs": 1095.704291 @@ -709,7 +709,7 @@ "model": "gpt-5-nano", "expected": "92971", "actual": "92971", - "correct": true, + "isCorrect": true, "inputTokens": 2381, "outputTokens": 136, "latencyMs": 3980.3727500000005 @@ -720,7 +720,7 @@ "model": "claude-haiku-4-5", "expected": "92971", "actual": "92971", - "correct": true, + "isCorrect": true, "inputTokens": 2856, "outputTokens": 6, "latencyMs": 1122.8730419999993 @@ -731,7 +731,7 @@ "model": "gpt-5-nano", "expected": "92971", "actual": "92971", - "correct": true, + "isCorrect": true, "inputTokens": 6316, "outputTokens": 72, "latencyMs": 2030.0818330000002 @@ -742,7 +742,7 @@ "model": "claude-haiku-4-5", "expected": "92971", "actual": "92971", - "correct": true, + "isCorrect": true, "inputTokens": 6365, "outputTokens": 6, "latencyMs": 1705.6364999999987 @@ -753,7 +753,7 @@ "model": "gpt-5-nano", "expected": "92971", "actual": "92971", - "correct": true, + "isCorrect": true, "inputTokens": 5012, "outputTokens": 72, "latencyMs": 1611.3567500000008 @@ -764,7 +764,7 @@ "model": "claude-haiku-4-5", "expected": "92971", "actual": "92971", - "correct": true, + "isCorrect": true, "inputTokens": 5760, "outputTokens": 6, "latencyMs": 1109.0094590000008 @@ -775,7 +775,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6390, "outputTokens": 199, "latencyMs": 3099.078125 @@ -786,7 +786,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 7871, "outputTokens": 4, "latencyMs": 1115.9911250000005 @@ -797,7 +797,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2527, "outputTokens": 135, "latencyMs": 2833.193875000001 @@ -808,7 +808,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2983, "outputTokens": 4, "latencyMs": 933.1444169999995 @@ -819,7 +819,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2381, "outputTokens": 199, "latencyMs": 2315.536 @@ -830,7 +830,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2857, "outputTokens": 4, "latencyMs": 1300.336792 @@ -841,7 +841,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6316, "outputTokens": 135, "latencyMs": 7016.997917000002 @@ -852,7 +852,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6366, "outputTokens": 4, "latencyMs": 1288.107333 @@ -863,7 +863,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5012, "outputTokens": 135, "latencyMs": 2474.8247499999998 @@ -874,7 +874,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5761, "outputTokens": 4, "latencyMs": 1027.9775420000005 @@ -885,7 +885,7 @@ "model": "gpt-5-nano", "expected": "terrance.hansen@yahoo.com", "actual": "terrance.hansen@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6392, "outputTokens": 652, "latencyMs": 8322.172416 @@ -896,7 +896,7 @@ "model": "claude-haiku-4-5", "expected": "terrance.hansen@yahoo.com", "actual": "terrance.hansen@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 7871, "outputTokens": 11, "latencyMs": 1066.3422090000004 @@ -907,7 +907,7 @@ "model": "gpt-5-nano", "expected": "terrance.hansen@yahoo.com", "actual": "terrance.hansen@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2529, "outputTokens": 76, "latencyMs": 2245.5604999999996 @@ -918,7 +918,7 @@ "model": "claude-haiku-4-5", "expected": "terrance.hansen@yahoo.com", "actual": "terrance.hansen@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2983, "outputTokens": 11, "latencyMs": 1179.7512079999997 @@ -929,7 +929,7 @@ "model": "gpt-5-nano", "expected": "terrance.hansen@yahoo.com", "actual": "terrance.hansen@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2383, "outputTokens": 204, "latencyMs": 2584.0723340000004 @@ -940,7 +940,7 @@ "model": "claude-haiku-4-5", "expected": "terrance.hansen@yahoo.com", "actual": "terrance.hansen@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2857, "outputTokens": 11, "latencyMs": 1204.6979589999992 @@ -951,7 +951,7 @@ "model": "gpt-5-nano", "expected": "terrance.hansen@yahoo.com", "actual": "terrance.hansen@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6318, "outputTokens": 396, "latencyMs": 3824.918375000001 @@ -962,7 +962,7 @@ "model": "claude-haiku-4-5", "expected": "terrance.hansen@yahoo.com", "actual": "terrance.hansen@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6366, "outputTokens": 11, "latencyMs": 1492.6765830000004 @@ -973,7 +973,7 @@ "model": "gpt-5-nano", "expected": "terrance.hansen@yahoo.com", "actual": "terrance.hansen@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 5014, "outputTokens": 76, "latencyMs": 1834.562 @@ -984,7 +984,7 @@ "model": "claude-haiku-4-5", "expected": "terrance.hansen@yahoo.com", "actual": "terrance.hansen@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 5761, "outputTokens": 11, "latencyMs": 1245.0000419999997 @@ -995,7 +995,7 @@ "model": "gpt-5-nano", "expected": "107744", "actual": "107744", - "correct": true, + "isCorrect": true, "inputTokens": 6391, "outputTokens": 136, "latencyMs": 2337.0652499999997 @@ -1006,7 +1006,7 @@ "model": "claude-haiku-4-5", "expected": "107744", "actual": "107744", - "correct": true, + "isCorrect": true, "inputTokens": 7870, "outputTokens": 6, "latencyMs": 1148.1971250000006 @@ -1017,7 +1017,7 @@ "model": "gpt-5-nano", "expected": "107744", "actual": "107744", - "correct": true, + "isCorrect": true, "inputTokens": 2528, "outputTokens": 72, "latencyMs": 2736.2375420000008 @@ -1028,7 +1028,7 @@ "model": "claude-haiku-4-5", "expected": "107744", "actual": "107744", - "correct": true, + "isCorrect": true, "inputTokens": 2982, "outputTokens": 6, "latencyMs": 1164.4291250000006 @@ -1039,7 +1039,7 @@ "model": "gpt-5-nano", "expected": "107744", "actual": "107744", - "correct": true, + "isCorrect": true, "inputTokens": 2382, "outputTokens": 72, "latencyMs": 2479.8535840000004 @@ -1050,7 +1050,7 @@ "model": "claude-haiku-4-5", "expected": "107744", "actual": "107744", - "correct": true, + "isCorrect": true, "inputTokens": 2856, "outputTokens": 6, "latencyMs": 1032.3198329999996 @@ -1061,7 +1061,7 @@ "model": "gpt-5-nano", "expected": "107744", "actual": "107744", - "correct": true, + "isCorrect": true, "inputTokens": 6317, "outputTokens": 136, "latencyMs": 2237.465583000001 @@ -1072,7 +1072,7 @@ "model": "claude-haiku-4-5", "expected": "107744", "actual": "107744", - "correct": true, + "isCorrect": true, "inputTokens": 6365, "outputTokens": 6, "latencyMs": 1254.3189160000002 @@ -1083,7 +1083,7 @@ "model": "gpt-5-nano", "expected": "107744", "actual": "107744", - "correct": true, + "isCorrect": true, "inputTokens": 5013, "outputTokens": 72, "latencyMs": 3753.917125 @@ -1094,7 +1094,7 @@ "model": "claude-haiku-4-5", "expected": "107744", "actual": "107744", - "correct": true, + "isCorrect": true, "inputTokens": 5760, "outputTokens": 6, "latencyMs": 1154.7003750000003 @@ -1105,7 +1105,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6390, "outputTokens": 135, "latencyMs": 2621.2275420000005 @@ -1116,7 +1116,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 7869, "outputTokens": 4, "latencyMs": 1222.843499999999 @@ -1127,7 +1127,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2527, "outputTokens": 71, "latencyMs": 1762.1339159999989 @@ -1138,7 +1138,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2981, "outputTokens": 4, "latencyMs": 1630.7307079999991 @@ -1149,7 +1149,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2381, "outputTokens": 71, "latencyMs": 1848.9775829999999 @@ -1160,7 +1160,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2855, "outputTokens": 4, "latencyMs": 1080.8682500000014 @@ -1171,7 +1171,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6316, "outputTokens": 135, "latencyMs": 26303.357959 @@ -1182,7 +1182,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6364, "outputTokens": 4, "latencyMs": 1354.007999999998 @@ -1193,7 +1193,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5012, "outputTokens": 71, "latencyMs": 1924.4625829999986 @@ -1204,7 +1204,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5759, "outputTokens": 4, "latencyMs": 1279.5235830000001 @@ -1215,7 +1215,7 @@ "model": "gpt-5-nano", "expected": "allan21@gmail.com", "actual": "allan21@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 6389, "outputTokens": 330, "latencyMs": 3997.3972079999985 @@ -1226,7 +1226,7 @@ "model": "claude-haiku-4-5", "expected": "allan21@gmail.com", "actual": "allan21@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 7867, "outputTokens": 9, "latencyMs": 1153.9412079999984 @@ -1237,7 +1237,7 @@ "model": "gpt-5-nano", "expected": "allan21@gmail.com", "actual": "allan21@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2526, "outputTokens": 138, "latencyMs": 2494.580582999999 @@ -1248,7 +1248,7 @@ "model": "claude-haiku-4-5", "expected": "allan21@gmail.com", "actual": "allan21@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2979, "outputTokens": 9, "latencyMs": 1350.1353750000017 @@ -1259,7 +1259,7 @@ "model": "gpt-5-nano", "expected": "allan21@gmail.com", "actual": "allan21@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2380, "outputTokens": 138, "latencyMs": 3024.4009160000023 @@ -1270,7 +1270,7 @@ "model": "claude-haiku-4-5", "expected": "allan21@gmail.com", "actual": "allan21@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2853, "outputTokens": 9, "latencyMs": 1199.3955830000014 @@ -1281,7 +1281,7 @@ "model": "gpt-5-nano", "expected": "allan21@gmail.com", "actual": "allan21@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 6315, "outputTokens": 138, "latencyMs": 5168.116582999999 @@ -1292,7 +1292,7 @@ "model": "claude-haiku-4-5", "expected": "allan21@gmail.com", "actual": "allan21@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 6362, "outputTokens": 9, "latencyMs": 1198.3554160000022 @@ -1303,7 +1303,7 @@ "model": "gpt-5-nano", "expected": "allan21@gmail.com", "actual": "allan21@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 5011, "outputTokens": 74, "latencyMs": 2632.998958999997 @@ -1314,7 +1314,7 @@ "model": "claude-haiku-4-5", "expected": "allan21@gmail.com", "actual": "allan21@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 5757, "outputTokens": 9, "latencyMs": 1124.5625419999997 @@ -1325,7 +1325,7 @@ "model": "gpt-5-nano", "expected": "145843", "actual": "145843", - "correct": true, + "isCorrect": true, "inputTokens": 6388, "outputTokens": 72, "latencyMs": 2357.2276249999995 @@ -1336,7 +1336,7 @@ "model": "claude-haiku-4-5", "expected": "145843", "actual": "145843", - "correct": true, + "isCorrect": true, "inputTokens": 7868, "outputTokens": 6, "latencyMs": 1267.960791999998 @@ -1347,7 +1347,7 @@ "model": "gpt-5-nano", "expected": "145843", "actual": "145843", - "correct": true, + "isCorrect": true, "inputTokens": 2525, "outputTokens": 136, "latencyMs": 2397.798125000001 @@ -1358,7 +1358,7 @@ "model": "claude-haiku-4-5", "expected": "145843", "actual": "145843", - "correct": true, + "isCorrect": true, "inputTokens": 2980, "outputTokens": 6, "latencyMs": 1170.6429580000004 @@ -1369,7 +1369,7 @@ "model": "gpt-5-nano", "expected": "145843", "actual": "145843", - "correct": true, + "isCorrect": true, "inputTokens": 2379, "outputTokens": 136, "latencyMs": 3227.198124999999 @@ -1380,7 +1380,7 @@ "model": "claude-haiku-4-5", "expected": "145843", "actual": "145843", - "correct": true, + "isCorrect": true, "inputTokens": 2854, "outputTokens": 6, "latencyMs": 1112.6066250000003 @@ -1391,7 +1391,7 @@ "model": "gpt-5-nano", "expected": "145843", "actual": "145843", - "correct": true, + "isCorrect": true, "inputTokens": 6314, "outputTokens": 72, "latencyMs": 2036.251791999999 @@ -1402,7 +1402,7 @@ "model": "claude-haiku-4-5", "expected": "145843", "actual": "145843", - "correct": true, + "isCorrect": true, "inputTokens": 6363, "outputTokens": 6, "latencyMs": 1290.7641250000015 @@ -1413,7 +1413,7 @@ "model": "gpt-5-nano", "expected": "145843", "actual": "145843", - "correct": true, + "isCorrect": true, "inputTokens": 5010, "outputTokens": 72, "latencyMs": 2262.8405840000014 @@ -1424,7 +1424,7 @@ "model": "claude-haiku-4-5", "expected": "145843", "actual": "145843", - "correct": true, + "isCorrect": true, "inputTokens": 5758, "outputTokens": 6, "latencyMs": 1193.2695419999982 @@ -1435,7 +1435,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6389, "outputTokens": 71, "latencyMs": 3198.2654159999984 @@ -1446,7 +1446,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 7868, "outputTokens": 4, "latencyMs": 1229.8644999999997 @@ -1457,7 +1457,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2526, "outputTokens": 71, "latencyMs": 3293.710084000002 @@ -1468,7 +1468,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2980, "outputTokens": 4, "latencyMs": 1121.200334000001 @@ -1479,7 +1479,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2380, "outputTokens": 71, "latencyMs": 2497.4451249999984 @@ -1490,7 +1490,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2854, "outputTokens": 4, "latencyMs": 1152.0107500000013 @@ -1501,7 +1501,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6315, "outputTokens": 71, "latencyMs": 3547.6399999999994 @@ -1512,7 +1512,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6363, "outputTokens": 4, "latencyMs": 2007.6731249999975 @@ -1523,7 +1523,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5011, "outputTokens": 71, "latencyMs": 7054.295208 @@ -1534,7 +1534,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5758, "outputTokens": 4, "latencyMs": 1230.5032920000012 @@ -1545,7 +1545,7 @@ "model": "gpt-5-nano", "expected": "alexandria61@gmail.com", "actual": "alexandria61@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 6390, "outputTokens": 76, "latencyMs": 2049.933416 @@ -1556,7 +1556,7 @@ "model": "claude-haiku-4-5", "expected": "alexandria61@gmail.com", "actual": "alexandria61@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 7869, "outputTokens": 9, "latencyMs": 1217.1906249999993 @@ -1567,7 +1567,7 @@ "model": "gpt-5-nano", "expected": "alexandria61@gmail.com", "actual": "alexandria61@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2527, "outputTokens": 204, "latencyMs": 2844.136208 @@ -1578,7 +1578,7 @@ "model": "claude-haiku-4-5", "expected": "alexandria61@gmail.com", "actual": "alexandria61@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2981, "outputTokens": 9, "latencyMs": 2166.8829589999987 @@ -1589,7 +1589,7 @@ "model": "gpt-5-nano", "expected": "alexandria61@gmail.com", "actual": "alexandria61@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2381, "outputTokens": 204, "latencyMs": 2726.5934579999994 @@ -1600,7 +1600,7 @@ "model": "claude-haiku-4-5", "expected": "alexandria61@gmail.com", "actual": "alexandria61@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2855, "outputTokens": 9, "latencyMs": 1107.4675410000018 @@ -1611,7 +1611,7 @@ "model": "gpt-5-nano", "expected": "alexandria61@gmail.com", "actual": "alexandria61@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 6316, "outputTokens": 76, "latencyMs": 2260.4548749999994 @@ -1622,7 +1622,7 @@ "model": "claude-haiku-4-5", "expected": "alexandria61@gmail.com", "actual": "alexandria61@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 6364, "outputTokens": 9, "latencyMs": 1257.2797080000018 @@ -1633,7 +1633,7 @@ "model": "gpt-5-nano", "expected": "alexandria61@gmail.com", "actual": "alexandria61@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 5012, "outputTokens": 140, "latencyMs": 2565.571791999999 @@ -1644,7 +1644,7 @@ "model": "claude-haiku-4-5", "expected": "alexandria61@gmail.com", "actual": "alexandria61@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 5759, "outputTokens": 9, "latencyMs": 1255.2880829999995 @@ -1655,7 +1655,7 @@ "model": "gpt-5-nano", "expected": "89436", "actual": "89436", - "correct": true, + "isCorrect": true, "inputTokens": 6389, "outputTokens": 136, "latencyMs": 2595.422042000002 @@ -1666,7 +1666,7 @@ "model": "claude-haiku-4-5", "expected": "89436", "actual": "89436", - "correct": true, + "isCorrect": true, "inputTokens": 7870, "outputTokens": 6, "latencyMs": 1090.4299170000013 @@ -1677,7 +1677,7 @@ "model": "gpt-5-nano", "expected": "89436", "actual": "89436", - "correct": true, + "isCorrect": true, "inputTokens": 2526, "outputTokens": 72, "latencyMs": 2985.3881250000013 @@ -1688,7 +1688,7 @@ "model": "claude-haiku-4-5", "expected": "89436", "actual": "89436", - "correct": true, + "isCorrect": true, "inputTokens": 2982, "outputTokens": 6, "latencyMs": 1521.227415999998 @@ -1699,7 +1699,7 @@ "model": "gpt-5-nano", "expected": "89436", "actual": "89436", - "correct": true, + "isCorrect": true, "inputTokens": 2380, "outputTokens": 72, "latencyMs": 2918.142082999999 @@ -1710,7 +1710,7 @@ "model": "claude-haiku-4-5", "expected": "89436", "actual": "89436", - "correct": true, + "isCorrect": true, "inputTokens": 2856, "outputTokens": 6, "latencyMs": 1049.085916 @@ -1721,7 +1721,7 @@ "model": "gpt-5-nano", "expected": "89436", "actual": "89436", - "correct": true, + "isCorrect": true, "inputTokens": 6315, "outputTokens": 136, "latencyMs": 2414.9711669999997 @@ -1732,7 +1732,7 @@ "model": "claude-haiku-4-5", "expected": "89436", "actual": "89436", - "correct": true, + "isCorrect": true, "inputTokens": 6365, "outputTokens": 6, "latencyMs": 1178.0064170000005 @@ -1743,7 +1743,7 @@ "model": "gpt-5-nano", "expected": "89436", "actual": "89436", - "correct": true, + "isCorrect": true, "inputTokens": 5011, "outputTokens": 72, "latencyMs": 1772.788625000001 @@ -1754,7 +1754,7 @@ "model": "claude-haiku-4-5", "expected": "89436", "actual": "89436", - "correct": true, + "isCorrect": true, "inputTokens": 5760, "outputTokens": 6, "latencyMs": 1134.7022499999985 @@ -1765,7 +1765,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6392, "outputTokens": 135, "latencyMs": 2528.6098330000023 @@ -1776,7 +1776,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 7872, "outputTokens": 4, "latencyMs": 1353.3026250000003 @@ -1787,7 +1787,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2529, "outputTokens": 71, "latencyMs": 2286.120999999999 @@ -1798,7 +1798,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2984, "outputTokens": 4, "latencyMs": 961.078292000002 @@ -1809,7 +1809,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2383, "outputTokens": 71, "latencyMs": 3445.204249999999 @@ -1820,7 +1820,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2858, "outputTokens": 4, "latencyMs": 1003.445125000002 @@ -1831,7 +1831,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6318, "outputTokens": 135, "latencyMs": 2696.166874999999 @@ -1842,7 +1842,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6367, "outputTokens": 4, "latencyMs": 1063.340791999999 @@ -1853,7 +1853,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5014, "outputTokens": 135, "latencyMs": 3367.6109579999975 @@ -1864,7 +1864,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5762, "outputTokens": 4, "latencyMs": 1322.4013339999983 @@ -1875,7 +1875,7 @@ "model": "gpt-5-nano", "expected": "kelvin54@yahoo.com", "actual": "kelvin54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6390, "outputTokens": 139, "latencyMs": 2745.6627499999995 @@ -1886,7 +1886,7 @@ "model": "claude-haiku-4-5", "expected": "kelvin54@yahoo.com", "actual": "kelvin54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 7871, "outputTokens": 10, "latencyMs": 1312.9286670000001 @@ -1897,7 +1897,7 @@ "model": "gpt-5-nano", "expected": "kelvin54@yahoo.com", "actual": "kelvin54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2527, "outputTokens": 1483, "latencyMs": 13678.859999999997 @@ -1908,7 +1908,7 @@ "model": "claude-haiku-4-5", "expected": "kelvin54@yahoo.com", "actual": "kelvin54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2983, "outputTokens": 10, "latencyMs": 1030.3843339999985 @@ -1919,7 +1919,7 @@ "model": "gpt-5-nano", "expected": "kelvin54@yahoo.com", "actual": "kelvin54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2381, "outputTokens": 139, "latencyMs": 2223.2737909999996 @@ -1930,7 +1930,7 @@ "model": "claude-haiku-4-5", "expected": "kelvin54@yahoo.com", "actual": "kelvin54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2857, "outputTokens": 10, "latencyMs": 1224.2647080000024 @@ -1941,7 +1941,7 @@ "model": "gpt-5-nano", "expected": "kelvin54@yahoo.com", "actual": "kelvin54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6316, "outputTokens": 139, "latencyMs": 3198.8672499999993 @@ -1952,7 +1952,7 @@ "model": "claude-haiku-4-5", "expected": "kelvin54@yahoo.com", "actual": "kelvin54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6366, "outputTokens": 10, "latencyMs": 1234.557084 @@ -1963,7 +1963,7 @@ "model": "gpt-5-nano", "expected": "kelvin54@yahoo.com", "actual": "kelvin54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 5012, "outputTokens": 139, "latencyMs": 2861.692708999999 @@ -1974,7 +1974,7 @@ "model": "claude-haiku-4-5", "expected": "kelvin54@yahoo.com", "actual": "kelvin54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 5761, "outputTokens": 10, "latencyMs": 1284.2591250000005 @@ -1985,7 +1985,7 @@ "model": "gpt-5-nano", "expected": "143365", "actual": "143365", - "correct": true, + "isCorrect": true, "inputTokens": 6390, "outputTokens": 136, "latencyMs": 2741.803499999998 @@ -1996,7 +1996,7 @@ "model": "claude-haiku-4-5", "expected": "143365", "actual": "143365", - "correct": true, + "isCorrect": true, "inputTokens": 7872, "outputTokens": 6, "latencyMs": 1096.6906249999993 @@ -2007,7 +2007,7 @@ "model": "gpt-5-nano", "expected": "143365", "actual": "143365", - "correct": true, + "isCorrect": true, "inputTokens": 2527, "outputTokens": 136, "latencyMs": 3692.904416999998 @@ -2018,7 +2018,7 @@ "model": "claude-haiku-4-5", "expected": "143365", "actual": "143365", - "correct": true, + "isCorrect": true, "inputTokens": 2984, "outputTokens": 6, "latencyMs": 1516.7794159999976 @@ -2029,7 +2029,7 @@ "model": "gpt-5-nano", "expected": "143365", "actual": "143365", - "correct": true, + "isCorrect": true, "inputTokens": 2381, "outputTokens": 392, "latencyMs": 5068.4152909999975 @@ -2040,7 +2040,7 @@ "model": "claude-haiku-4-5", "expected": "143365", "actual": "143365", - "correct": true, + "isCorrect": true, "inputTokens": 2858, "outputTokens": 6, "latencyMs": 1356.2728330000027 @@ -2051,7 +2051,7 @@ "model": "gpt-5-nano", "expected": "143365", "actual": "143365", - "correct": true, + "isCorrect": true, "inputTokens": 6316, "outputTokens": 136, "latencyMs": 2866.8642500000024 @@ -2062,7 +2062,7 @@ "model": "claude-haiku-4-5", "expected": "143365", "actual": "143365", - "correct": true, + "isCorrect": true, "inputTokens": 6367, "outputTokens": 6, "latencyMs": 1462.041624999998 @@ -2073,7 +2073,7 @@ "model": "gpt-5-nano", "expected": "143365", "actual": "143365", - "correct": true, + "isCorrect": true, "inputTokens": 5012, "outputTokens": 72, "latencyMs": 2320.320083999999 @@ -2084,7 +2084,7 @@ "model": "claude-haiku-4-5", "expected": "143365", "actual": "143365", - "correct": true, + "isCorrect": true, "inputTokens": 5762, "outputTokens": 6, "latencyMs": 1082.976666999999 @@ -2095,7 +2095,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6389, "outputTokens": 7, "latencyMs": 2427.6330409999973 @@ -2106,7 +2106,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 7868, "outputTokens": 4, "latencyMs": 1108.7309170000008 @@ -2117,7 +2117,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2526, "outputTokens": 71, "latencyMs": 4405.948458000003 @@ -2128,7 +2128,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2980, "outputTokens": 4, "latencyMs": 1235.6647919999996 @@ -2139,7 +2139,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2380, "outputTokens": 71, "latencyMs": 2528.553082999999 @@ -2150,7 +2150,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2854, "outputTokens": 4, "latencyMs": 974.1328329999997 @@ -2161,7 +2161,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6315, "outputTokens": 135, "latencyMs": 2243.1775420000013 @@ -2172,7 +2172,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6363, "outputTokens": 4, "latencyMs": 2416.867124999997 @@ -2183,7 +2183,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5011, "outputTokens": 135, "latencyMs": 2429.5548750000016 @@ -2194,7 +2194,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5758, "outputTokens": 4, "latencyMs": 1257.326083 @@ -2205,7 +2205,7 @@ "model": "gpt-5-nano", "expected": "dean19@gmail.com", "actual": "dean19@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 6393, "outputTokens": 203, "latencyMs": 4366.677041999996 @@ -2216,7 +2216,7 @@ "model": "claude-haiku-4-5", "expected": "dean19@gmail.com", "actual": "dean19@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 7876, "outputTokens": 9, "latencyMs": 1410.3295419999995 @@ -2227,7 +2227,7 @@ "model": "gpt-5-nano", "expected": "dean19@gmail.com", "actual": "dean19@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2530, "outputTokens": 75, "latencyMs": 2834.2883330000004 @@ -2238,7 +2238,7 @@ "model": "claude-haiku-4-5", "expected": "dean19@gmail.com", "actual": "dean19@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2988, "outputTokens": 9, "latencyMs": 1023.437750000001 @@ -2249,7 +2249,7 @@ "model": "gpt-5-nano", "expected": "dean19@gmail.com", "actual": "dean19@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2384, "outputTokens": 139, "latencyMs": 3091.7722909999975 @@ -2260,7 +2260,7 @@ "model": "claude-haiku-4-5", "expected": "dean19@gmail.com", "actual": "dean19@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2862, "outputTokens": 9, "latencyMs": 1910.5562920000011 @@ -2271,7 +2271,7 @@ "model": "gpt-5-nano", "expected": "dean19@gmail.com", "actual": "dean19@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 6319, "outputTokens": 75, "latencyMs": 2335.239207999999 @@ -2282,7 +2282,7 @@ "model": "claude-haiku-4-5", "expected": "dean19@gmail.com", "actual": "dean19@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 6371, "outputTokens": 9, "latencyMs": 1145.7144169999992 @@ -2293,7 +2293,7 @@ "model": "gpt-5-nano", "expected": "dean19@gmail.com", "actual": "dean19@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 5015, "outputTokens": 75, "latencyMs": 2204.0944169999966 @@ -2304,7 +2304,7 @@ "model": "claude-haiku-4-5", "expected": "dean19@gmail.com", "actual": "dean19@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 5766, "outputTokens": 9, "latencyMs": 1102.2122499999969 @@ -2315,7 +2315,7 @@ "model": "gpt-5-nano", "expected": "111314", "actual": "111314", - "correct": true, + "isCorrect": true, "inputTokens": 6391, "outputTokens": 200, "latencyMs": 3785.0480830000015 @@ -2326,7 +2326,7 @@ "model": "claude-haiku-4-5", "expected": "111314", "actual": "111314", - "correct": true, + "isCorrect": true, "inputTokens": 7871, "outputTokens": 6, "latencyMs": 1147.6056669999962 @@ -2337,7 +2337,7 @@ "model": "gpt-5-nano", "expected": "111314", "actual": "111314", - "correct": true, + "isCorrect": true, "inputTokens": 2528, "outputTokens": 72, "latencyMs": 3996.1190410000054 @@ -2348,7 +2348,7 @@ "model": "claude-haiku-4-5", "expected": "111314", "actual": "111314", - "correct": true, + "isCorrect": true, "inputTokens": 2983, "outputTokens": 6, "latencyMs": 1101.5621670000037 @@ -2359,7 +2359,7 @@ "model": "gpt-5-nano", "expected": "111314", "actual": "111314", - "correct": true, + "isCorrect": true, "inputTokens": 2382, "outputTokens": 136, "latencyMs": 2563.2732499999984 @@ -2370,7 +2370,7 @@ "model": "claude-haiku-4-5", "expected": "111314", "actual": "111314", - "correct": true, + "isCorrect": true, "inputTokens": 2857, "outputTokens": 6, "latencyMs": 1224.5424589999966 @@ -2381,7 +2381,7 @@ "model": "gpt-5-nano", "expected": "111314", "actual": "111314", - "correct": true, + "isCorrect": true, "inputTokens": 6317, "outputTokens": 136, "latencyMs": 2436.8848329999964 @@ -2392,7 +2392,7 @@ "model": "claude-haiku-4-5", "expected": "111314", "actual": "111314", - "correct": true, + "isCorrect": true, "inputTokens": 6366, "outputTokens": 6, "latencyMs": 1500.1066250000003 @@ -2403,7 +2403,7 @@ "model": "gpt-5-nano", "expected": "111314", "actual": "111314", - "correct": true, + "isCorrect": true, "inputTokens": 5013, "outputTokens": 72, "latencyMs": 2529.925833000001 @@ -2414,7 +2414,7 @@ "model": "claude-haiku-4-5", "expected": "111314", "actual": "111314", - "correct": true, + "isCorrect": true, "inputTokens": 5761, "outputTokens": 6, "latencyMs": 1701.0276660000018 @@ -2425,7 +2425,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6388, "outputTokens": 135, "latencyMs": 3078.5496249999997 @@ -2436,7 +2436,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 7868, "outputTokens": 4, "latencyMs": 1224.1848329999993 @@ -2447,7 +2447,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2525, "outputTokens": 71, "latencyMs": 2287.0156669999997 @@ -2458,7 +2458,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2980, "outputTokens": 4, "latencyMs": 1209.1454999999987 @@ -2469,7 +2469,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2379, "outputTokens": 71, "latencyMs": 2059.012499999997 @@ -2480,7 +2480,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2854, "outputTokens": 4, "latencyMs": 1393.596375000001 @@ -2491,7 +2491,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6314, "outputTokens": 71, "latencyMs": 1858.8989159999983 @@ -2502,7 +2502,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6363, "outputTokens": 4, "latencyMs": 1193.9375419999997 @@ -2513,7 +2513,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5010, "outputTokens": 135, "latencyMs": 2755.0157499999987 @@ -2524,7 +2524,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5758, "outputTokens": 4, "latencyMs": 1366.030666999999 @@ -2535,7 +2535,7 @@ "model": "gpt-5-nano", "expected": "laurel54@yahoo.com", "actual": "laurel54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6390, "outputTokens": 395, "latencyMs": 4352.137999999999 @@ -2546,7 +2546,7 @@ "model": "claude-haiku-4-5", "expected": "laurel54@yahoo.com", "actual": "laurel54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 7869, "outputTokens": 10, "latencyMs": 1093.9707500000004 @@ -2557,7 +2557,7 @@ "model": "gpt-5-nano", "expected": "laurel54@yahoo.com", "actual": "laurel54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2527, "outputTokens": 139, "latencyMs": 2481.934500000003 @@ -2568,7 +2568,7 @@ "model": "claude-haiku-4-5", "expected": "laurel54@yahoo.com", "actual": "laurel54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2981, "outputTokens": 10, "latencyMs": 1262.3894579999978 @@ -2579,7 +2579,7 @@ "model": "gpt-5-nano", "expected": "laurel54@yahoo.com", "actual": "laurel54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2381, "outputTokens": 75, "latencyMs": 2360.7159170000014 @@ -2590,7 +2590,7 @@ "model": "claude-haiku-4-5", "expected": "laurel54@yahoo.com", "actual": "laurel54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2855, "outputTokens": 10, "latencyMs": 1462.5894999999946 @@ -2601,7 +2601,7 @@ "model": "gpt-5-nano", "expected": "laurel54@yahoo.com", "actual": "laurel54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6316, "outputTokens": 75, "latencyMs": 3247.478041000002 @@ -2612,7 +2612,7 @@ "model": "claude-haiku-4-5", "expected": "laurel54@yahoo.com", "actual": "laurel54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6364, "outputTokens": 10, "latencyMs": 1693.1597089999996 @@ -2623,7 +2623,7 @@ "model": "gpt-5-nano", "expected": "laurel54@yahoo.com", "actual": "laurel54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 5012, "outputTokens": 75, "latencyMs": 1726.2765839999993 @@ -2634,7 +2634,7 @@ "model": "claude-haiku-4-5", "expected": "laurel54@yahoo.com", "actual": "laurel54@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 5759, "outputTokens": 10, "latencyMs": 1605.044458000004 @@ -2645,7 +2645,7 @@ "model": "gpt-5-nano", "expected": "89553", "actual": "89553", - "correct": true, + "isCorrect": true, "inputTokens": 6391, "outputTokens": 136, "latencyMs": 2263.1207090000025 @@ -2656,7 +2656,7 @@ "model": "claude-haiku-4-5", "expected": "89553", "actual": "89553", - "correct": true, + "isCorrect": true, "inputTokens": 7873, "outputTokens": 6, "latencyMs": 3789.016875000001 @@ -2667,7 +2667,7 @@ "model": "gpt-5-nano", "expected": "89553", "actual": "89553", - "correct": true, + "isCorrect": true, "inputTokens": 2528, "outputTokens": 72, "latencyMs": 1829.9641669999983 @@ -2678,7 +2678,7 @@ "model": "claude-haiku-4-5", "expected": "89553", "actual": "89553", - "correct": true, + "isCorrect": true, "inputTokens": 2985, "outputTokens": 6, "latencyMs": 989.6153750000012 @@ -2689,7 +2689,7 @@ "model": "gpt-5-nano", "expected": "89553", "actual": "89553", - "correct": true, + "isCorrect": true, "inputTokens": 2382, "outputTokens": 72, "latencyMs": 2717.4773339999956 @@ -2700,7 +2700,7 @@ "model": "claude-haiku-4-5", "expected": "89553", "actual": "89553", - "correct": true, + "isCorrect": true, "inputTokens": 2859, "outputTokens": 6, "latencyMs": 1717.8889999999956 @@ -2711,7 +2711,7 @@ "model": "gpt-5-nano", "expected": "89553", "actual": "46730", - "correct": false, + "isCorrect": false, "inputTokens": 6317, "outputTokens": 72, "latencyMs": 5490.572667 @@ -2722,7 +2722,7 @@ "model": "claude-haiku-4-5", "expected": "89553", "actual": "89553", - "correct": true, + "isCorrect": true, "inputTokens": 6368, "outputTokens": 6, "latencyMs": 1427.4055000000008 @@ -2733,7 +2733,7 @@ "model": "gpt-5-nano", "expected": "89553", "actual": "89553", - "correct": true, + "isCorrect": true, "inputTokens": 5013, "outputTokens": 264, "latencyMs": 4052.875957999997 @@ -2744,7 +2744,7 @@ "model": "claude-haiku-4-5", "expected": "89553", "actual": "89553", - "correct": true, + "isCorrect": true, "inputTokens": 5763, "outputTokens": 6, "latencyMs": 1586.255124999996 @@ -2755,7 +2755,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6388, "outputTokens": 135, "latencyMs": 3787.343541000002 @@ -2766,7 +2766,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 7866, "outputTokens": 4, "latencyMs": 1196.934000000001 @@ -2777,7 +2777,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2525, "outputTokens": 71, "latencyMs": 2172.2377080000006 @@ -2788,7 +2788,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2978, "outputTokens": 4, "latencyMs": 1112.6987080000035 @@ -2799,7 +2799,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2379, "outputTokens": 71, "latencyMs": 2074.6067919999987 @@ -2810,7 +2810,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2852, "outputTokens": 4, "latencyMs": 1202.2165000000023 @@ -2821,7 +2821,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6314, "outputTokens": 135, "latencyMs": 3257.5967080000046 @@ -2832,7 +2832,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6361, "outputTokens": 4, "latencyMs": 1316.7435000000041 @@ -2843,7 +2843,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5010, "outputTokens": 71, "latencyMs": 2391.9063749999987 @@ -2854,7 +2854,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5756, "outputTokens": 4, "latencyMs": 1208.8820829999968 @@ -2865,7 +2865,7 @@ "model": "gpt-5-nano", "expected": "jayme.kertzmann77@gmail.com", "actual": "jayme.kertzmann77@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 6391, "outputTokens": 142, "latencyMs": 2735.679790999995 @@ -2876,7 +2876,7 @@ "model": "claude-haiku-4-5", "expected": "jayme.kertzmann77@gmail.com", "actual": "jayme.kertzmann77@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 7871, "outputTokens": 14, "latencyMs": 1253.706624999999 @@ -2887,7 +2887,7 @@ "model": "gpt-5-nano", "expected": "jayme.kertzmann77@gmail.com", "actual": "jayme.kertzmann77@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2528, "outputTokens": 142, "latencyMs": 2471.819457999998 @@ -2898,7 +2898,7 @@ "model": "claude-haiku-4-5", "expected": "jayme.kertzmann77@gmail.com", "actual": "jayme.kertzmann77@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2983, "outputTokens": 14, "latencyMs": 1063.2195409999986 @@ -2909,7 +2909,7 @@ "model": "gpt-5-nano", "expected": "jayme.kertzmann77@gmail.com", "actual": "jayme.kertzmann77@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2382, "outputTokens": 142, "latencyMs": 2061.6382500000036 @@ -2920,7 +2920,7 @@ "model": "claude-haiku-4-5", "expected": "jayme.kertzmann77@gmail.com", "actual": "jayme.kertzmann77@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2857, "outputTokens": 14, "latencyMs": 1877.579082999997 @@ -2931,7 +2931,7 @@ "model": "gpt-5-nano", "expected": "jayme.kertzmann77@gmail.com", "actual": "jayme.kertzmann77@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 6317, "outputTokens": 142, "latencyMs": 3448.810375000001 @@ -2942,7 +2942,7 @@ "model": "claude-haiku-4-5", "expected": "jayme.kertzmann77@gmail.com", "actual": "jayme.kertzmann77@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 6366, "outputTokens": 14, "latencyMs": 1265.9410419999986 @@ -2953,7 +2953,7 @@ "model": "gpt-5-nano", "expected": "jayme.kertzmann77@gmail.com", "actual": "jayme.kertzmann77@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 5013, "outputTokens": 78, "latencyMs": 2152.5591669999994 @@ -2964,7 +2964,7 @@ "model": "claude-haiku-4-5", "expected": "jayme.kertzmann77@gmail.com", "actual": "jayme.kertzmann77@gmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 5761, "outputTokens": 14, "latencyMs": 1432.513583 @@ -2975,7 +2975,7 @@ "model": "gpt-5-nano", "expected": "104053", "actual": "104053", - "correct": true, + "isCorrect": true, "inputTokens": 6390, "outputTokens": 136, "latencyMs": 2707.4454169999954 @@ -2986,7 +2986,7 @@ "model": "claude-haiku-4-5", "expected": "104053", "actual": "104053", - "correct": true, + "isCorrect": true, "inputTokens": 7871, "outputTokens": 6, "latencyMs": 1568.5869169999933 @@ -2997,7 +2997,7 @@ "model": "gpt-5-nano", "expected": "104053", "actual": "104053", - "correct": true, + "isCorrect": true, "inputTokens": 2527, "outputTokens": 136, "latencyMs": 2373.4566669999986 @@ -3008,7 +3008,7 @@ "model": "claude-haiku-4-5", "expected": "104053", "actual": "104053", - "correct": true, + "isCorrect": true, "inputTokens": 2983, "outputTokens": 6, "latencyMs": 1525.172749999998 @@ -3019,7 +3019,7 @@ "model": "gpt-5-nano", "expected": "104053", "actual": "104053", - "correct": true, + "isCorrect": true, "inputTokens": 2381, "outputTokens": 136, "latencyMs": 9347.989583000002 @@ -3030,7 +3030,7 @@ "model": "claude-haiku-4-5", "expected": "104053", "actual": "104053", - "correct": true, + "isCorrect": true, "inputTokens": 2857, "outputTokens": 6, "latencyMs": 1748.783334000007 @@ -3041,7 +3041,7 @@ "model": "gpt-5-nano", "expected": "104053", "actual": "104053", - "correct": true, + "isCorrect": true, "inputTokens": 6316, "outputTokens": 72, "latencyMs": 1929.517458000002 @@ -3052,7 +3052,7 @@ "model": "claude-haiku-4-5", "expected": "104053", "actual": "104053", - "correct": true, + "isCorrect": true, "inputTokens": 6366, "outputTokens": 6, "latencyMs": 1022.1345000000001 @@ -3063,7 +3063,7 @@ "model": "gpt-5-nano", "expected": "104053", "actual": "104053", - "correct": true, + "isCorrect": true, "inputTokens": 5012, "outputTokens": 136, "latencyMs": 2102.925624999996 @@ -3074,7 +3074,7 @@ "model": "claude-haiku-4-5", "expected": "104053", "actual": "104053", - "correct": true, + "isCorrect": true, "inputTokens": 5761, "outputTokens": 6, "latencyMs": 1471.7255839999998 @@ -3085,7 +3085,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6391, "outputTokens": 71, "latencyMs": 1983.693041999999 @@ -3096,7 +3096,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 7872, "outputTokens": 4, "latencyMs": 1077.2119579999999 @@ -3107,7 +3107,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2528, "outputTokens": 71, "latencyMs": 2549.1221250000017 @@ -3118,7 +3118,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2984, "outputTokens": 4, "latencyMs": 921.1110840000038 @@ -3129,7 +3129,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2382, "outputTokens": 135, "latencyMs": 4070.615666999998 @@ -3140,7 +3140,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2858, "outputTokens": 4, "latencyMs": 974.754832999999 @@ -3151,7 +3151,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6317, "outputTokens": 135, "latencyMs": 2665.842083000003 @@ -3162,7 +3162,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6367, "outputTokens": 4, "latencyMs": 1081.2904160000035 @@ -3173,7 +3173,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5013, "outputTokens": 135, "latencyMs": 2897.919332999998 @@ -3184,7 +3184,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5762, "outputTokens": 4, "latencyMs": 1341.0955420000028 @@ -3195,7 +3195,7 @@ "model": "gpt-5-nano", "expected": "carley.bauch@yahoo.com", "actual": "carley.bauch@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6390, "outputTokens": 204, "latencyMs": 3231.9646249999932 @@ -3206,7 +3206,7 @@ "model": "claude-haiku-4-5", "expected": "carley.bauch@yahoo.com", "actual": "carley.bauch@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 7869, "outputTokens": 12, "latencyMs": 1288.5363330000037 @@ -3217,7 +3217,7 @@ "model": "gpt-5-nano", "expected": "carley.bauch@yahoo.com", "actual": "carley.bauch@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2527, "outputTokens": 76, "latencyMs": 2581.508915999999 @@ -3228,7 +3228,7 @@ "model": "claude-haiku-4-5", "expected": "carley.bauch@yahoo.com", "actual": "carley.bauch@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2981, "outputTokens": 12, "latencyMs": 1183.8337079999983 @@ -3239,7 +3239,7 @@ "model": "gpt-5-nano", "expected": "carley.bauch@yahoo.com", "actual": "carley.bauch@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2381, "outputTokens": 140, "latencyMs": 2073.944792000002 @@ -3250,7 +3250,7 @@ "model": "claude-haiku-4-5", "expected": "carley.bauch@yahoo.com", "actual": "carley.bauch@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2855, "outputTokens": 12, "latencyMs": 1302.5857499999984 @@ -3261,7 +3261,7 @@ "model": "gpt-5-nano", "expected": "carley.bauch@yahoo.com", "actual": "carley.bauch@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6316, "outputTokens": 204, "latencyMs": 3076.5304590000014 @@ -3272,7 +3272,7 @@ "model": "claude-haiku-4-5", "expected": "carley.bauch@yahoo.com", "actual": "carley.bauch@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6364, "outputTokens": 12, "latencyMs": 1110.9787920000017 @@ -3283,7 +3283,7 @@ "model": "gpt-5-nano", "expected": "carley.bauch@yahoo.com", "actual": "carley.bauch@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 5012, "outputTokens": 76, "latencyMs": 3381.732917000001 @@ -3294,7 +3294,7 @@ "model": "claude-haiku-4-5", "expected": "carley.bauch@yahoo.com", "actual": "carley.bauch@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 5759, "outputTokens": 12, "latencyMs": 1198.1488329999993 @@ -3305,7 +3305,7 @@ "model": "gpt-5-nano", "expected": "142029", "actual": "142029", - "correct": true, + "isCorrect": true, "inputTokens": 6393, "outputTokens": 136, "latencyMs": 2687.965959000001 @@ -3316,7 +3316,7 @@ "model": "claude-haiku-4-5", "expected": "142029", "actual": "142029", - "correct": true, + "isCorrect": true, "inputTokens": 7874, "outputTokens": 6, "latencyMs": 2615.956250000003 @@ -3327,7 +3327,7 @@ "model": "gpt-5-nano", "expected": "142029", "actual": "142029", - "correct": true, + "isCorrect": true, "inputTokens": 2530, "outputTokens": 136, "latencyMs": 2132.413249999998 @@ -3338,7 +3338,7 @@ "model": "claude-haiku-4-5", "expected": "142029", "actual": "142029", - "correct": true, + "isCorrect": true, "inputTokens": 2986, "outputTokens": 6, "latencyMs": 1091.060666999998 @@ -3349,7 +3349,7 @@ "model": "gpt-5-nano", "expected": "142029", "actual": "142029", - "correct": true, + "isCorrect": true, "inputTokens": 2384, "outputTokens": 72, "latencyMs": 2074.8201670000053 @@ -3360,7 +3360,7 @@ "model": "claude-haiku-4-5", "expected": "142029", "actual": "142029", - "correct": true, + "isCorrect": true, "inputTokens": 2860, "outputTokens": 6, "latencyMs": 1622.2757499999934 @@ -3371,7 +3371,7 @@ "model": "gpt-5-nano", "expected": "142029", "actual": "142029", - "correct": true, + "isCorrect": true, "inputTokens": 6319, "outputTokens": 200, "latencyMs": 3122.3756670000002 @@ -3382,7 +3382,7 @@ "model": "claude-haiku-4-5", "expected": "142029", "actual": "142029", - "correct": true, + "isCorrect": true, "inputTokens": 6369, "outputTokens": 6, "latencyMs": 1175.7301249999946 @@ -3393,7 +3393,7 @@ "model": "gpt-5-nano", "expected": "142029", "actual": "142029", - "correct": true, + "isCorrect": true, "inputTokens": 5015, "outputTokens": 136, "latencyMs": 2601.074916999998 @@ -3404,7 +3404,7 @@ "model": "claude-haiku-4-5", "expected": "142029", "actual": "142029", - "correct": true, + "isCorrect": true, "inputTokens": 5764, "outputTokens": 6, "latencyMs": 1089.4757079999981 @@ -3415,7 +3415,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6389, "outputTokens": 135, "latencyMs": 6939.617750000005 @@ -3426,7 +3426,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 7869, "outputTokens": 4, "latencyMs": 1207.9619999999995 @@ -3437,7 +3437,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2526, "outputTokens": 135, "latencyMs": 2784.063166 @@ -3448,7 +3448,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2981, "outputTokens": 4, "latencyMs": 1011.0956670000014 @@ -3459,7 +3459,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2380, "outputTokens": 135, "latencyMs": 3098.7147909999985 @@ -3470,7 +3470,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2855, "outputTokens": 4, "latencyMs": 983.9449170000007 @@ -3481,7 +3481,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6315, "outputTokens": 135, "latencyMs": 3889.572291999997 @@ -3492,7 +3492,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6364, "outputTokens": 4, "latencyMs": 1096.1613339999967 @@ -3503,7 +3503,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5011, "outputTokens": 71, "latencyMs": 2484.078917000006 @@ -3514,7 +3514,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5759, "outputTokens": 4, "latencyMs": 1150.418792000004 @@ -3525,7 +3525,7 @@ "model": "gpt-5-nano", "expected": "cheyenne_skiles@hotmail.com", "actual": "cheyenne_skiles@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 6393, "outputTokens": 140, "latencyMs": 2221.4447079999954 @@ -3536,7 +3536,7 @@ "model": "claude-haiku-4-5", "expected": "cheyenne_skiles@hotmail.com", "actual": "cheyenne_skiles@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 7872, "outputTokens": 14, "latencyMs": 1193.9583749999947 @@ -3547,7 +3547,7 @@ "model": "gpt-5-nano", "expected": "cheyenne_skiles@hotmail.com", "actual": "cheyenne_skiles@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2530, "outputTokens": 76, "latencyMs": 2170.8865829999995 @@ -3558,7 +3558,7 @@ "model": "claude-haiku-4-5", "expected": "cheyenne_skiles@hotmail.com", "actual": "cheyenne_skiles@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2984, "outputTokens": 14, "latencyMs": 1247.6116660000043 @@ -3569,7 +3569,7 @@ "model": "gpt-5-nano", "expected": "cheyenne_skiles@hotmail.com", "actual": "cheyenne_skiles@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2384, "outputTokens": 76, "latencyMs": 3827.705667000002 @@ -3580,7 +3580,7 @@ "model": "claude-haiku-4-5", "expected": "cheyenne_skiles@hotmail.com", "actual": "cheyenne_skiles@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 2858, "outputTokens": 14, "latencyMs": 1084.8218339999949 @@ -3591,7 +3591,7 @@ "model": "gpt-5-nano", "expected": "cheyenne_skiles@hotmail.com", "actual": "cheyenne_skiles@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 6319, "outputTokens": 140, "latencyMs": 3311.8220839999994 @@ -3602,7 +3602,7 @@ "model": "claude-haiku-4-5", "expected": "cheyenne_skiles@hotmail.com", "actual": "cheyenne_skiles@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 6367, "outputTokens": 14, "latencyMs": 1269.2092920000068 @@ -3613,7 +3613,7 @@ "model": "gpt-5-nano", "expected": "cheyenne_skiles@hotmail.com", "actual": "cheyenne_skiles@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 5015, "outputTokens": 140, "latencyMs": 2648.3102500000023 @@ -3624,7 +3624,7 @@ "model": "claude-haiku-4-5", "expected": "cheyenne_skiles@hotmail.com", "actual": "cheyenne_skiles@hotmail.com", - "correct": true, + "isCorrect": true, "inputTokens": 5762, "outputTokens": 14, "latencyMs": 1278.0403750000041 @@ -3635,7 +3635,7 @@ "model": "gpt-5-nano", "expected": "84650", "actual": "84650", - "correct": true, + "isCorrect": true, "inputTokens": 6391, "outputTokens": 136, "latencyMs": 3555.1511670000036 @@ -3646,7 +3646,7 @@ "model": "claude-haiku-4-5", "expected": "84650", "actual": "84650", - "correct": true, + "isCorrect": true, "inputTokens": 7871, "outputTokens": 6, "latencyMs": 1317.5797499999971 @@ -3657,7 +3657,7 @@ "model": "gpt-5-nano", "expected": "84650", "actual": "84650", - "correct": true, + "isCorrect": true, "inputTokens": 2528, "outputTokens": 136, "latencyMs": 2291.943041999999 @@ -3668,7 +3668,7 @@ "model": "claude-haiku-4-5", "expected": "84650", "actual": "84650", - "correct": true, + "isCorrect": true, "inputTokens": 2983, "outputTokens": 6, "latencyMs": 2081.3947499999995 @@ -3679,7 +3679,7 @@ "model": "gpt-5-nano", "expected": "84650", "actual": "84650", - "correct": true, + "isCorrect": true, "inputTokens": 2382, "outputTokens": 72, "latencyMs": 2067.9348329999993 @@ -3690,7 +3690,7 @@ "model": "claude-haiku-4-5", "expected": "84650", "actual": "84650", - "correct": true, + "isCorrect": true, "inputTokens": 2857, "outputTokens": 6, "latencyMs": 1192.6603340000001 @@ -3701,7 +3701,7 @@ "model": "gpt-5-nano", "expected": "84650", "actual": "84650", - "correct": true, + "isCorrect": true, "inputTokens": 6317, "outputTokens": 200, "latencyMs": 3044.592457999999 @@ -3712,7 +3712,7 @@ "model": "claude-haiku-4-5", "expected": "84650", "actual": "84650", - "correct": true, + "isCorrect": true, "inputTokens": 6366, "outputTokens": 6, "latencyMs": 1106.2235409999994 @@ -3723,7 +3723,7 @@ "model": "gpt-5-nano", "expected": "84650", "actual": "84650", - "correct": true, + "isCorrect": true, "inputTokens": 5013, "outputTokens": 136, "latencyMs": 2627.8240000000005 @@ -3734,7 +3734,7 @@ "model": "claude-haiku-4-5", "expected": "84650", "actual": "84650", - "correct": true, + "isCorrect": true, "inputTokens": 5761, "outputTokens": 6, "latencyMs": 1379.9015 @@ -3745,7 +3745,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6390, "outputTokens": 263, "latencyMs": 3705.3900829999984 @@ -3756,7 +3756,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 7871, "outputTokens": 4, "latencyMs": 1909.4442500000005 @@ -3767,7 +3767,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2527, "outputTokens": 135, "latencyMs": 2173.6019589999996 @@ -3778,7 +3778,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2983, "outputTokens": 4, "latencyMs": 1063.8584580000024 @@ -3789,7 +3789,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2381, "outputTokens": 71, "latencyMs": 1800.4930420000019 @@ -3800,7 +3800,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2857, "outputTokens": 4, "latencyMs": 1011.3969579999975 @@ -3811,7 +3811,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6316, "outputTokens": 135, "latencyMs": 2562.2492500000008 @@ -3822,7 +3822,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6366, "outputTokens": 4, "latencyMs": 1349.1809170000051 @@ -3833,7 +3833,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5012, "outputTokens": 71, "latencyMs": 1883.7523750000037 @@ -3844,7 +3844,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5761, "outputTokens": 4, "latencyMs": 1135.412292000001 @@ -3855,7 +3855,7 @@ "model": "gpt-5-nano", "expected": "macey.gottlieb5@yahoo.com", "actual": "macey.gottlieb5@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6389, "outputTokens": 334, "latencyMs": 4067.161957999997 @@ -3866,7 +3866,7 @@ "model": "claude-haiku-4-5", "expected": "macey.gottlieb5@yahoo.com", "actual": "macey.gottlieb5@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 7869, "outputTokens": 14, "latencyMs": 1333.0713749999995 @@ -3877,7 +3877,7 @@ "model": "gpt-5-nano", "expected": "macey.gottlieb5@yahoo.com", "actual": "macey.gottlieb5@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2526, "outputTokens": 142, "latencyMs": 2081.8315000000002 @@ -3888,7 +3888,7 @@ "model": "claude-haiku-4-5", "expected": "macey.gottlieb5@yahoo.com", "actual": "macey.gottlieb5@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2981, "outputTokens": 14, "latencyMs": 1231.0224579999995 @@ -3899,7 +3899,7 @@ "model": "gpt-5-nano", "expected": "macey.gottlieb5@yahoo.com", "actual": "macey.gottlieb5@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2380, "outputTokens": 78, "latencyMs": 2333.0360409999994 @@ -3910,7 +3910,7 @@ "model": "claude-haiku-4-5", "expected": "macey.gottlieb5@yahoo.com", "actual": "macey.gottlieb5@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2855, "outputTokens": 14, "latencyMs": 1175.1937500000058 @@ -3921,7 +3921,7 @@ "model": "gpt-5-nano", "expected": "macey.gottlieb5@yahoo.com", "actual": "macey.gottlieb5@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6315, "outputTokens": 206, "latencyMs": 7391.094749999997 @@ -3932,7 +3932,7 @@ "model": "claude-haiku-4-5", "expected": "macey.gottlieb5@yahoo.com", "actual": "macey.gottlieb5@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6364, "outputTokens": 14, "latencyMs": 1843.981458000002 @@ -3943,7 +3943,7 @@ "model": "gpt-5-nano", "expected": "macey.gottlieb5@yahoo.com", "actual": "macey.gottlieb5@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 5011, "outputTokens": 142, "latencyMs": 2386.8134589999972 @@ -3954,7 +3954,7 @@ "model": "claude-haiku-4-5", "expected": "macey.gottlieb5@yahoo.com", "actual": "macey.gottlieb5@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 5759, "outputTokens": 14, "latencyMs": 1449.751750000003 @@ -3965,7 +3965,7 @@ "model": "gpt-5-nano", "expected": "89773", "actual": "89773", - "correct": true, + "isCorrect": true, "inputTokens": 6389, "outputTokens": 136, "latencyMs": 4075.600666999999 @@ -3976,7 +3976,7 @@ "model": "claude-haiku-4-5", "expected": "89773", "actual": "89773", - "correct": true, + "isCorrect": true, "inputTokens": 7868, "outputTokens": 6, "latencyMs": 985.1729999999952 @@ -3987,7 +3987,7 @@ "model": "gpt-5-nano", "expected": "89773", "actual": "89773", - "correct": true, + "isCorrect": true, "inputTokens": 2526, "outputTokens": 136, "latencyMs": 2891.2602079999997 @@ -3998,7 +3998,7 @@ "model": "claude-haiku-4-5", "expected": "89773", "actual": "89773", - "correct": true, + "isCorrect": true, "inputTokens": 2980, "outputTokens": 6, "latencyMs": 2073.129000000001 @@ -4009,7 +4009,7 @@ "model": "gpt-5-nano", "expected": "89773", "actual": "89773", - "correct": true, + "isCorrect": true, "inputTokens": 2380, "outputTokens": 72, "latencyMs": 1894.3316669999986 @@ -4020,7 +4020,7 @@ "model": "claude-haiku-4-5", "expected": "89773", "actual": "89773", - "correct": true, + "isCorrect": true, "inputTokens": 2854, "outputTokens": 6, "latencyMs": 1172.3735000000015 @@ -4031,7 +4031,7 @@ "model": "gpt-5-nano", "expected": "89773", "actual": "89773", - "correct": true, + "isCorrect": true, "inputTokens": 6315, "outputTokens": 72, "latencyMs": 2456.6511249999967 @@ -4042,7 +4042,7 @@ "model": "claude-haiku-4-5", "expected": "89773", "actual": "89773", - "correct": true, + "isCorrect": true, "inputTokens": 6363, "outputTokens": 6, "latencyMs": 1298.1367079999982 @@ -4053,7 +4053,7 @@ "model": "gpt-5-nano", "expected": "89773", "actual": "89773", - "correct": true, + "isCorrect": true, "inputTokens": 5011, "outputTokens": 136, "latencyMs": 6018.304375 @@ -4064,7 +4064,7 @@ "model": "claude-haiku-4-5", "expected": "89773", "actual": "89773", - "correct": true, + "isCorrect": true, "inputTokens": 5758, "outputTokens": 6, "latencyMs": 1103.9152499999982 @@ -4075,7 +4075,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6389, "outputTokens": 71, "latencyMs": 3867.303832999998 @@ -4086,7 +4086,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 7868, "outputTokens": 4, "latencyMs": 1287.7528749999983 @@ -4097,7 +4097,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2526, "outputTokens": 135, "latencyMs": 2355.0305829999998 @@ -4108,7 +4108,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2980, "outputTokens": 4, "latencyMs": 1086.8424579999992 @@ -4119,7 +4119,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2380, "outputTokens": 71, "latencyMs": 3472.6323339999944 @@ -4130,7 +4130,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 2854, "outputTokens": 4, "latencyMs": 948.3086249999978 @@ -4141,7 +4141,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6315, "outputTokens": 71, "latencyMs": 3343.3446659999972 @@ -4152,7 +4152,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 6363, "outputTokens": 4, "latencyMs": 1048.567959 @@ -4163,7 +4163,7 @@ "model": "gpt-5-nano", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5011, "outputTokens": 71, "latencyMs": 3761.141875000001 @@ -4174,7 +4174,7 @@ "model": "claude-haiku-4-5", "expected": "Marketing", "actual": "Marketing", - "correct": true, + "isCorrect": true, "inputTokens": 5758, "outputTokens": 4, "latencyMs": 1130.9393339999951 @@ -4185,7 +4185,7 @@ "model": "gpt-5-nano", "expected": "georgianna_renner@yahoo.com", "actual": "georgianna_renner@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6389, "outputTokens": 79, "latencyMs": 4200.215792000003 @@ -4196,7 +4196,7 @@ "model": "claude-haiku-4-5", "expected": "georgianna_renner@yahoo.com", "actual": "georgianna_renner@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 7869, "outputTokens": 13, "latencyMs": 1351.981166999998 @@ -4207,7 +4207,7 @@ "model": "gpt-5-nano", "expected": "georgianna_renner@yahoo.com", "actual": "georgianna_renner@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2526, "outputTokens": 143, "latencyMs": 2465.4245840000003 @@ -4218,7 +4218,7 @@ "model": "claude-haiku-4-5", "expected": "georgianna_renner@yahoo.com", "actual": "georgianna_renner@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2981, "outputTokens": 13, "latencyMs": 885.4770840000056 @@ -4229,7 +4229,7 @@ "model": "gpt-5-nano", "expected": "georgianna_renner@yahoo.com", "actual": "georgianna_renner@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2380, "outputTokens": 143, "latencyMs": 2903.201958000005 @@ -4240,7 +4240,7 @@ "model": "claude-haiku-4-5", "expected": "georgianna_renner@yahoo.com", "actual": "georgianna_renner@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 2855, "outputTokens": 13, "latencyMs": 1006.1219579999961 @@ -4251,7 +4251,7 @@ "model": "gpt-5-nano", "expected": "georgianna_renner@yahoo.com", "actual": "georgianna_renner@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6315, "outputTokens": 207, "latencyMs": 3253.900333999998 @@ -4262,7 +4262,7 @@ "model": "claude-haiku-4-5", "expected": "georgianna_renner@yahoo.com", "actual": "georgianna_renner@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 6364, "outputTokens": 13, "latencyMs": 1219.713582999997 @@ -4273,7 +4273,7 @@ "model": "gpt-5-nano", "expected": "georgianna_renner@yahoo.com", "actual": "georgianna_renner@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 5011, "outputTokens": 143, "latencyMs": 2335.6635000000024 @@ -4284,7 +4284,7 @@ "model": "claude-haiku-4-5", "expected": "georgianna_renner@yahoo.com", "actual": "georgianna_renner@yahoo.com", - "correct": true, + "isCorrect": true, "inputTokens": 5759, "outputTokens": 13, "latencyMs": 1334.1358330000003 @@ -4295,7 +4295,7 @@ "model": "gpt-5-nano", "expected": "49741", "actual": "49741", - "correct": true, + "isCorrect": true, "inputTokens": 6390, "outputTokens": 136, "latencyMs": 1912.2536669999972 @@ -4306,7 +4306,7 @@ "model": "claude-haiku-4-5", "expected": "49741", "actual": "49741", - "correct": true, + "isCorrect": true, "inputTokens": 7871, "outputTokens": 6, "latencyMs": 1104.4684160000033 @@ -4317,7 +4317,7 @@ "model": "gpt-5-nano", "expected": "49741", "actual": "49741", - "correct": true, + "isCorrect": true, "inputTokens": 2527, "outputTokens": 72, "latencyMs": 2648.919750000001 @@ -4328,7 +4328,7 @@ "model": "claude-haiku-4-5", "expected": "49741", "actual": "49741", - "correct": true, + "isCorrect": true, "inputTokens": 2983, "outputTokens": 6, "latencyMs": 1525.6309170000022 @@ -4339,7 +4339,7 @@ "model": "gpt-5-nano", "expected": "49741", "actual": "49741", - "correct": true, + "isCorrect": true, "inputTokens": 2381, "outputTokens": 136, "latencyMs": 2736.3283749999973 @@ -4350,7 +4350,7 @@ "model": "claude-haiku-4-5", "expected": "49741", "actual": "144426", - "correct": false, + "isCorrect": false, "inputTokens": 2857, "outputTokens": 6, "latencyMs": 1077.766334 @@ -4361,7 +4361,7 @@ "model": "gpt-5-nano", "expected": "49741", "actual": "49741", - "correct": true, + "isCorrect": true, "inputTokens": 6316, "outputTokens": 72, "latencyMs": 2116.5284170000014 @@ -4372,7 +4372,7 @@ "model": "claude-haiku-4-5", "expected": "49741", "actual": "49741", - "correct": true, + "isCorrect": true, "inputTokens": 6366, "outputTokens": 6, "latencyMs": 1159.7744170000005 @@ -4383,7 +4383,7 @@ "model": "gpt-5-nano", "expected": "49741", "actual": "49741", - "correct": true, + "isCorrect": true, "inputTokens": 5012, "outputTokens": 72, "latencyMs": 2529.7074160000047 @@ -4394,7 +4394,7 @@ "model": "claude-haiku-4-5", "expected": "49741", "actual": "49741", - "correct": true, + "isCorrect": true, "inputTokens": 5761, "outputTokens": 6, "latencyMs": 1604.601791999994 @@ -4405,7 +4405,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 6387, "outputTokens": 967, "latencyMs": 8300.216583000001 @@ -4416,7 +4416,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 7865, "outputTokens": 5, "latencyMs": 1204.089749999992 @@ -4427,7 +4427,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 2524, "outputTokens": 455, "latencyMs": 5231.604541000001 @@ -4438,7 +4438,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 2977, "outputTokens": 5, "latencyMs": 1168.508707999994 @@ -4449,7 +4449,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 2378, "outputTokens": 967, "latencyMs": 8396.912500000006 @@ -4460,7 +4460,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 2851, "outputTokens": 5, "latencyMs": 1060.6276250000083 @@ -4471,7 +4471,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 6313, "outputTokens": 775, "latencyMs": 9340.763791999998 @@ -4482,7 +4482,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 6360, "outputTokens": 5, "latencyMs": 1020.8827080000046 @@ -4493,7 +4493,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 5009, "outputTokens": 903, "latencyMs": 8792.062000000005 @@ -4504,7 +4504,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 5755, "outputTokens": 5, "latencyMs": 1459.8301659999997 @@ -4515,7 +4515,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 6387, "outputTokens": 519, "latencyMs": 6439.622583000004 @@ -4526,7 +4526,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 7865, "outputTokens": 5, "latencyMs": 1416.1659170000057 @@ -4537,7 +4537,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 2524, "outputTokens": 903, "latencyMs": 8064.398499999996 @@ -4548,7 +4548,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "14", - "correct": false, + "isCorrect": false, "inputTokens": 2977, "outputTokens": 5, "latencyMs": 998.3781250000029 @@ -4559,7 +4559,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 2378, "outputTokens": 647, "latencyMs": 5498.786500000002 @@ -4570,7 +4570,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 2851, "outputTokens": 5, "latencyMs": 1343.9632910000073 @@ -4581,7 +4581,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 6313, "outputTokens": 647, "latencyMs": 7565.158291 @@ -4592,7 +4592,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "14", - "correct": false, + "isCorrect": false, "inputTokens": 6360, "outputTokens": 5, "latencyMs": 1320.9714169999934 @@ -4603,7 +4603,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 5009, "outputTokens": 839, "latencyMs": 10626.395499999999 @@ -4614,7 +4614,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 5755, "outputTokens": 5, "latencyMs": 3227.584917 @@ -4625,7 +4625,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 6387, "outputTokens": 583, "latencyMs": 6690.373416000002 @@ -4636,7 +4636,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 7865, "outputTokens": 5, "latencyMs": 1187.1296250000014 @@ -4647,7 +4647,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 2524, "outputTokens": 519, "latencyMs": 5081.884875000003 @@ -4658,7 +4658,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 2977, "outputTokens": 5, "latencyMs": 1576.2339999999967 @@ -4669,7 +4669,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 2378, "outputTokens": 1031, "latencyMs": 9927.5775 @@ -4680,7 +4680,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 2851, "outputTokens": 5, "latencyMs": 1169.6451669999951 @@ -4691,7 +4691,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 6313, "outputTokens": 519, "latencyMs": 6772.954291999995 @@ -4702,7 +4702,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 6360, "outputTokens": 5, "latencyMs": 1905.9189590000024 @@ -4713,7 +4713,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 5009, "outputTokens": 455, "latencyMs": 6827.424666999999 @@ -4724,7 +4724,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 5755, "outputTokens": 5, "latencyMs": 2121.3979160000017 @@ -4735,7 +4735,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 6387, "outputTokens": 519, "latencyMs": 15235.099042000002 @@ -4746,7 +4746,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 7865, "outputTokens": 5, "latencyMs": 1182.0669170000037 @@ -4757,7 +4757,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 2524, "outputTokens": 583, "latencyMs": 6872.47600000001 @@ -4768,7 +4768,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 2977, "outputTokens": 5, "latencyMs": 931.0203749999928 @@ -4779,7 +4779,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 2378, "outputTokens": 2311, "latencyMs": 17952.683875000002 @@ -4790,7 +4790,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 2851, "outputTokens": 5, "latencyMs": 1167.8899999999994 @@ -4801,7 +4801,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 6313, "outputTokens": 455, "latencyMs": 6896.831916999989 @@ -4812,7 +4812,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "10", - "correct": false, + "isCorrect": false, "inputTokens": 6360, "outputTokens": 5, "latencyMs": 1401.859083000003 @@ -4823,7 +4823,7 @@ "model": "gpt-5-nano", "expected": "17", "actual": "17", - "correct": true, + "isCorrect": true, "inputTokens": 5009, "outputTokens": 647, "latencyMs": 5266.956917000003 @@ -4834,7 +4834,7 @@ "model": "claude-haiku-4-5", "expected": "17", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 5755, "outputTokens": 5, "latencyMs": 1100.9057919999905 @@ -4845,7 +4845,7 @@ "model": "gpt-5-nano", "expected": "16", "actual": "16", - "correct": true, + "isCorrect": true, "inputTokens": 6387, "outputTokens": 1095, "latencyMs": 15621.264291999993 @@ -4856,7 +4856,7 @@ "model": "claude-haiku-4-5", "expected": "16", "actual": "12", - "correct": false, + "isCorrect": false, "inputTokens": 7865, "outputTokens": 5, "latencyMs": 1063.5868750000081 @@ -4867,7 +4867,7 @@ "model": "gpt-5-nano", "expected": "16", "actual": "16", - "correct": true, + "isCorrect": true, "inputTokens": 2524, "outputTokens": 455, "latencyMs": 5703.061916000006 @@ -4878,7 +4878,7 @@ "model": "claude-haiku-4-5", "expected": "16", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 2977, "outputTokens": 5, "latencyMs": 1113.9432499999966 @@ -4889,7 +4889,7 @@ "model": "gpt-5-nano", "expected": "16", "actual": "16", - "correct": true, + "isCorrect": true, "inputTokens": 2378, "outputTokens": 3015, "latencyMs": 22321.357124999995 @@ -4900,7 +4900,7 @@ "model": "claude-haiku-4-5", "expected": "16", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 2851, "outputTokens": 5, "latencyMs": 968.0936249999941 @@ -4911,7 +4911,7 @@ "model": "gpt-5-nano", "expected": "16", "actual": "16", - "correct": true, + "isCorrect": true, "inputTokens": 6313, "outputTokens": 1287, "latencyMs": 14521.080749999994 @@ -4922,7 +4922,7 @@ "model": "claude-haiku-4-5", "expected": "16", "actual": "12", - "correct": false, + "isCorrect": false, "inputTokens": 6360, "outputTokens": 5, "latencyMs": 1228.1847500000003 @@ -4933,7 +4933,7 @@ "model": "gpt-5-nano", "expected": "16", "actual": "16", - "correct": true, + "isCorrect": true, "inputTokens": 5009, "outputTokens": 455, "latencyMs": 5216.268042000011 @@ -4944,7 +4944,7 @@ "model": "claude-haiku-4-5", "expected": "16", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 5755, "outputTokens": 5, "latencyMs": 1026.5127079999947 @@ -4955,7 +4955,7 @@ "model": "gpt-5-nano", "expected": "16", "actual": "16", - "correct": true, + "isCorrect": true, "inputTokens": 6387, "outputTokens": 391, "latencyMs": 4335.125541000001 @@ -4966,7 +4966,7 @@ "model": "claude-haiku-4-5", "expected": "16", "actual": "10", - "correct": false, + "isCorrect": false, "inputTokens": 7865, "outputTokens": 5, "latencyMs": 1116.4177909999999 @@ -4977,7 +4977,7 @@ "model": "gpt-5-nano", "expected": "16", "actual": "16", - "correct": true, + "isCorrect": true, "inputTokens": 2524, "outputTokens": 583, "latencyMs": 4128.823499999999 @@ -4988,7 +4988,7 @@ "model": "claude-haiku-4-5", "expected": "16", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 2977, "outputTokens": 5, "latencyMs": 1105.622457999998 @@ -4999,7 +4999,7 @@ "model": "gpt-5-nano", "expected": "16", "actual": "16", - "correct": true, + "isCorrect": true, "inputTokens": 2378, "outputTokens": 839, "latencyMs": 6542.58583299999 @@ -5010,7 +5010,7 @@ "model": "claude-haiku-4-5", "expected": "16", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 2851, "outputTokens": 5, "latencyMs": 1084.2237909999967 @@ -5021,7 +5021,7 @@ "model": "gpt-5-nano", "expected": "16", "actual": "16", - "correct": true, + "isCorrect": true, "inputTokens": 6313, "outputTokens": 455, "latencyMs": 5050.133375000005 @@ -5032,7 +5032,7 @@ "model": "claude-haiku-4-5", "expected": "16", "actual": "10", - "correct": false, + "isCorrect": false, "inputTokens": 6360, "outputTokens": 5, "latencyMs": 1075.023709000001 @@ -5043,7 +5043,7 @@ "model": "gpt-5-nano", "expected": "16", "actual": "16", - "correct": true, + "isCorrect": true, "inputTokens": 5009, "outputTokens": 711, "latencyMs": 9237.985791 @@ -5054,7 +5054,7 @@ "model": "claude-haiku-4-5", "expected": "16", "actual": "12", - "correct": false, + "isCorrect": false, "inputTokens": 5755, "outputTokens": 5, "latencyMs": 1346.3510000000097 @@ -5065,7 +5065,7 @@ "model": "gpt-5-nano", "expected": "91", "actual": "91", - "correct": true, + "isCorrect": true, "inputTokens": 6392, "outputTokens": 2375, "latencyMs": 27655.89520900001 @@ -5076,7 +5076,7 @@ "model": "claude-haiku-4-5", "expected": "91", "actual": "89", - "correct": false, + "isCorrect": false, "inputTokens": 7870, "outputTokens": 5, "latencyMs": 1315.7111659999937 @@ -5087,7 +5087,7 @@ "model": "gpt-5-nano", "expected": "91", "actual": "91", - "correct": true, + "isCorrect": true, "inputTokens": 2529, "outputTokens": 2695, "latencyMs": 26482.504707999993 @@ -5098,7 +5098,7 @@ "model": "claude-haiku-4-5", "expected": "91", "actual": "85", - "correct": false, + "isCorrect": false, "inputTokens": 2982, "outputTokens": 5, "latencyMs": 1368.221916999988 @@ -5109,7 +5109,7 @@ "model": "gpt-5-nano", "expected": "91", "actual": "91", - "correct": true, + "isCorrect": true, "inputTokens": 2383, "outputTokens": 1671, "latencyMs": 18249.434333000012 @@ -5120,7 +5120,7 @@ "model": "claude-haiku-4-5", "expected": "91", "actual": "85", - "correct": false, + "isCorrect": false, "inputTokens": 2856, "outputTokens": 5, "latencyMs": 1051.9521660000028 @@ -5131,7 +5131,7 @@ "model": "gpt-5-nano", "expected": "91", "actual": "91", - "correct": true, + "isCorrect": true, "inputTokens": 6318, "outputTokens": 1799, "latencyMs": 15867.284083999999 @@ -5142,7 +5142,7 @@ "model": "claude-haiku-4-5", "expected": "91", "actual": "89", - "correct": false, + "isCorrect": false, "inputTokens": 6365, "outputTokens": 5, "latencyMs": 1831.3835839999956 @@ -5153,7 +5153,7 @@ "model": "gpt-5-nano", "expected": "91", "actual": "91", - "correct": true, + "isCorrect": true, "inputTokens": 5014, "outputTokens": 2247, "latencyMs": 19254.821666999997 @@ -5164,7 +5164,7 @@ "model": "claude-haiku-4-5", "expected": "91", "actual": "89", - "correct": false, + "isCorrect": false, "inputTokens": 5760, "outputTokens": 5, "latencyMs": 1762.2908329999918 @@ -5175,7 +5175,7 @@ "model": "gpt-5-nano", "expected": "67", "actual": "67", - "correct": true, + "isCorrect": true, "inputTokens": 6392, "outputTokens": 1479, "latencyMs": 13444.104542000001 @@ -5186,7 +5186,7 @@ "model": "claude-haiku-4-5", "expected": "67", "actual": "57", - "correct": false, + "isCorrect": false, "inputTokens": 7870, "outputTokens": 5, "latencyMs": 1182.2523340000043 @@ -5197,7 +5197,7 @@ "model": "gpt-5-nano", "expected": "67", "actual": "67", - "correct": true, + "isCorrect": true, "inputTokens": 2529, "outputTokens": 2183, "latencyMs": 19257.86050000001 @@ -5208,7 +5208,7 @@ "model": "claude-haiku-4-5", "expected": "67", "actual": "47", - "correct": false, + "isCorrect": false, "inputTokens": 2982, "outputTokens": 5, "latencyMs": 1081.3142080000107 @@ -5219,7 +5219,7 @@ "model": "gpt-5-nano", "expected": "67", "actual": "67", - "correct": true, + "isCorrect": true, "inputTokens": 2383, "outputTokens": 3463, "latencyMs": 21384.707542000004 @@ -5230,7 +5230,7 @@ "model": "claude-haiku-4-5", "expected": "67", "actual": "47", - "correct": false, + "isCorrect": false, "inputTokens": 2856, "outputTokens": 5, "latencyMs": 1051.6647080000112 @@ -5241,7 +5241,7 @@ "model": "gpt-5-nano", "expected": "67", "actual": "67", - "correct": true, + "isCorrect": true, "inputTokens": 6318, "outputTokens": 2439, "latencyMs": 19519.416207999995 @@ -5252,7 +5252,7 @@ "model": "claude-haiku-4-5", "expected": "67", "actual": "47", - "correct": false, + "isCorrect": false, "inputTokens": 6365, "outputTokens": 5, "latencyMs": 1060.1008749999892 @@ -5263,7 +5263,7 @@ "model": "gpt-5-nano", "expected": "67", "actual": "66", - "correct": false, + "isCorrect": false, "inputTokens": 5014, "outputTokens": 1991, "latencyMs": 15234.403459000008 @@ -5274,7 +5274,7 @@ "model": "claude-haiku-4-5", "expected": "67", "actual": "57", - "correct": false, + "isCorrect": false, "inputTokens": 5760, "outputTokens": 5, "latencyMs": 1208.8559589999932 @@ -5285,7 +5285,7 @@ "model": "gpt-5-nano", "expected": "41", "actual": "41", - "correct": true, + "isCorrect": true, "inputTokens": 6392, "outputTokens": 1415, "latencyMs": 14119.885540999996 @@ -5296,7 +5296,7 @@ "model": "claude-haiku-4-5", "expected": "41", "actual": "31", - "correct": false, + "isCorrect": false, "inputTokens": 7870, "outputTokens": 5, "latencyMs": 1428.8373750000028 @@ -5307,7 +5307,7 @@ "model": "gpt-5-nano", "expected": "41", "actual": "41", - "correct": true, + "isCorrect": true, "inputTokens": 2529, "outputTokens": 1607, "latencyMs": 13997.297709000006 @@ -5318,7 +5318,7 @@ "model": "claude-haiku-4-5", "expected": "41", "actual": "27", - "correct": false, + "isCorrect": false, "inputTokens": 2982, "outputTokens": 5, "latencyMs": 1270.4412920000032 @@ -5329,7 +5329,7 @@ "model": "gpt-5-nano", "expected": "41", "actual": "41", - "correct": true, + "isCorrect": true, "inputTokens": 2383, "outputTokens": 1415, "latencyMs": 13861.177167000002 @@ -5340,7 +5340,7 @@ "model": "claude-haiku-4-5", "expected": "41", "actual": "31", - "correct": false, + "isCorrect": false, "inputTokens": 2856, "outputTokens": 5, "latencyMs": 916.5238340000069 @@ -5351,7 +5351,7 @@ "model": "gpt-5-nano", "expected": "41", "actual": "42", - "correct": false, + "isCorrect": false, "inputTokens": 6318, "outputTokens": 1799, "latencyMs": 16007.06925 @@ -5362,7 +5362,7 @@ "model": "claude-haiku-4-5", "expected": "41", "actual": "27", - "correct": false, + "isCorrect": false, "inputTokens": 6365, "outputTokens": 5, "latencyMs": 1426.0594579999888 @@ -5373,7 +5373,7 @@ "model": "gpt-5-nano", "expected": "41", "actual": "41", - "correct": true, + "isCorrect": true, "inputTokens": 5014, "outputTokens": 2055, "latencyMs": 22966.680624999994 @@ -5384,7 +5384,7 @@ "model": "claude-haiku-4-5", "expected": "41", "actual": "31", - "correct": false, + "isCorrect": false, "inputTokens": 5760, "outputTokens": 5, "latencyMs": 1044.6609999999928 @@ -5395,7 +5395,7 @@ "model": "gpt-5-nano", "expected": "26", "actual": "26", - "correct": true, + "isCorrect": true, "inputTokens": 6392, "outputTokens": 1159, "latencyMs": 10799.117333000002 @@ -5406,7 +5406,7 @@ "model": "claude-haiku-4-5", "expected": "26", "actual": "20", - "correct": false, + "isCorrect": false, "inputTokens": 7870, "outputTokens": 5, "latencyMs": 1359.5568330000096 @@ -5417,7 +5417,7 @@ "model": "gpt-5-nano", "expected": "26", "actual": "26", - "correct": true, + "isCorrect": true, "inputTokens": 2529, "outputTokens": 1543, "latencyMs": 13702.052542000005 @@ -5428,7 +5428,7 @@ "model": "claude-haiku-4-5", "expected": "26", "actual": "16", - "correct": false, + "isCorrect": false, "inputTokens": 2982, "outputTokens": 5, "latencyMs": 967.0454159999936 @@ -5439,7 +5439,7 @@ "model": "gpt-5-nano", "expected": "26", "actual": "26", - "correct": true, + "isCorrect": true, "inputTokens": 2383, "outputTokens": 1671, "latencyMs": 13116.871958000003 @@ -5450,7 +5450,7 @@ "model": "claude-haiku-4-5", "expected": "26", "actual": "16", - "correct": false, + "isCorrect": false, "inputTokens": 2856, "outputTokens": 5, "latencyMs": 1088.8372910000035 @@ -5461,7 +5461,7 @@ "model": "gpt-5-nano", "expected": "26", "actual": "26", - "correct": true, + "isCorrect": true, "inputTokens": 6318, "outputTokens": 1543, "latencyMs": 14387.148624999987 @@ -5472,7 +5472,7 @@ "model": "claude-haiku-4-5", "expected": "26", "actual": "16", - "correct": false, + "isCorrect": false, "inputTokens": 6365, "outputTokens": 5, "latencyMs": 1273.9564170000085 @@ -5483,7 +5483,7 @@ "model": "gpt-5-nano", "expected": "26", "actual": "26", - "correct": true, + "isCorrect": true, "inputTokens": 5014, "outputTokens": 1223, "latencyMs": 12143.083792000005 @@ -5494,7 +5494,7 @@ "model": "claude-haiku-4-5", "expected": "26", "actual": "20", - "correct": false, + "isCorrect": false, "inputTokens": 5760, "outputTokens": 5, "latencyMs": 1032.9807079999882 @@ -5505,7 +5505,7 @@ "model": "gpt-5-nano", "expected": "78", "actual": "78", - "correct": true, + "isCorrect": true, "inputTokens": 6386, "outputTokens": 2631, "latencyMs": 23077.678417000003 @@ -5516,7 +5516,7 @@ "model": "claude-haiku-4-5", "expected": "78", "actual": "81", - "correct": false, + "isCorrect": false, "inputTokens": 7864, "outputTokens": 5, "latencyMs": 1281.171417000005 @@ -5527,7 +5527,7 @@ "model": "gpt-5-nano", "expected": "78", "actual": "78", - "correct": true, + "isCorrect": true, "inputTokens": 2523, "outputTokens": 2759, "latencyMs": 20331.962667 @@ -5538,7 +5538,7 @@ "model": "claude-haiku-4-5", "expected": "78", "actual": "78", - "correct": true, + "isCorrect": true, "inputTokens": 2976, "outputTokens": 5, "latencyMs": 1014.3847079999978 @@ -5549,7 +5549,7 @@ "model": "gpt-5-nano", "expected": "78", "actual": "81", - "correct": false, + "isCorrect": false, "inputTokens": 2377, "outputTokens": 3335, "latencyMs": 18037.630208000002 @@ -5560,7 +5560,7 @@ "model": "claude-haiku-4-5", "expected": "78", "actual": "73", - "correct": false, + "isCorrect": false, "inputTokens": 2850, "outputTokens": 5, "latencyMs": 918.3078749999986 @@ -5571,7 +5571,7 @@ "model": "gpt-5-nano", "expected": "78", "actual": "78", - "correct": true, + "isCorrect": true, "inputTokens": 6312, "outputTokens": 1991, "latencyMs": 15660.232958000008 @@ -5582,7 +5582,7 @@ "model": "claude-haiku-4-5", "expected": "78", "actual": "78", - "correct": true, + "isCorrect": true, "inputTokens": 6359, "outputTokens": 5, "latencyMs": 1033.7647080000024 @@ -5593,7 +5593,7 @@ "model": "gpt-5-nano", "expected": "78", "actual": "78", - "correct": true, + "isCorrect": true, "inputTokens": 5008, "outputTokens": 4295, "latencyMs": 26817.97 @@ -5604,7 +5604,7 @@ "model": "claude-haiku-4-5", "expected": "78", "actual": "77", - "correct": false, + "isCorrect": false, "inputTokens": 5754, "outputTokens": 5, "latencyMs": 1348.084750000009 @@ -5615,7 +5615,7 @@ "model": "gpt-5-nano", "expected": "22", "actual": "22", - "correct": true, + "isCorrect": true, "inputTokens": 6386, "outputTokens": 1223, "latencyMs": 10273.866540999996 @@ -5626,7 +5626,7 @@ "model": "claude-haiku-4-5", "expected": "22", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 7864, "outputTokens": 5, "latencyMs": 1081.604707999999 @@ -5637,7 +5637,7 @@ "model": "gpt-5-nano", "expected": "22", "actual": "22", - "correct": true, + "isCorrect": true, "inputTokens": 2523, "outputTokens": 903, "latencyMs": 13862.020499999999 @@ -5648,7 +5648,7 @@ "model": "claude-haiku-4-5", "expected": "22", "actual": "16", - "correct": false, + "isCorrect": false, "inputTokens": 2976, "outputTokens": 5, "latencyMs": 965.817916 @@ -5659,7 +5659,7 @@ "model": "gpt-5-nano", "expected": "22", "actual": "21", - "correct": false, + "isCorrect": false, "inputTokens": 2377, "outputTokens": 2631, "latencyMs": 24254.82570799999 @@ -5670,7 +5670,7 @@ "model": "claude-haiku-4-5", "expected": "22", "actual": "20", - "correct": false, + "isCorrect": false, "inputTokens": 2850, "outputTokens": 5, "latencyMs": 998.7978339999972 @@ -5681,7 +5681,7 @@ "model": "gpt-5-nano", "expected": "22", "actual": "22", - "correct": true, + "isCorrect": true, "inputTokens": 6312, "outputTokens": 1095, "latencyMs": 10401.351500000004 @@ -5692,7 +5692,7 @@ "model": "claude-haiku-4-5", "expected": "22", "actual": "15", - "correct": false, + "isCorrect": false, "inputTokens": 6359, "outputTokens": 5, "latencyMs": 1479.388791999998 @@ -5703,7 +5703,7 @@ "model": "gpt-5-nano", "expected": "22", "actual": "22", - "correct": true, + "isCorrect": true, "inputTokens": 5008, "outputTokens": 839, "latencyMs": 8160.454833999989 @@ -5714,7 +5714,7 @@ "model": "claude-haiku-4-5", "expected": "22", "actual": "16", - "correct": false, + "isCorrect": false, "inputTokens": 5754, "outputTokens": 5, "latencyMs": 1763.230291999993 @@ -5725,7 +5725,7 @@ "model": "gpt-5-nano", "expected": "12", "actual": "12", - "correct": true, + "isCorrect": true, "inputTokens": 6394, "outputTokens": 1671, "latencyMs": 14807.253333 @@ -5736,7 +5736,7 @@ "model": "claude-haiku-4-5", "expected": "12", "actual": "9", - "correct": false, + "isCorrect": false, "inputTokens": 7872, "outputTokens": 5, "latencyMs": 1185.018333 @@ -5747,7 +5747,7 @@ "model": "gpt-5-nano", "expected": "12", "actual": "12", - "correct": true, + "isCorrect": true, "inputTokens": 2531, "outputTokens": 1607, "latencyMs": 13592.477832999997 @@ -5758,7 +5758,7 @@ "model": "claude-haiku-4-5", "expected": "12", "actual": "9", - "correct": false, + "isCorrect": false, "inputTokens": 2984, "outputTokens": 5, "latencyMs": 947.2789590000029 @@ -5769,7 +5769,7 @@ "model": "gpt-5-nano", "expected": "12", "actual": "12", - "correct": true, + "isCorrect": true, "inputTokens": 2385, "outputTokens": 2759, "latencyMs": 22718.536041999992 @@ -5780,7 +5780,7 @@ "model": "claude-haiku-4-5", "expected": "12", "actual": "10", - "correct": false, + "isCorrect": false, "inputTokens": 2858, "outputTokens": 5, "latencyMs": 973.4814580000093 @@ -5791,7 +5791,7 @@ "model": "gpt-5-nano", "expected": "12", "actual": "12", - "correct": true, + "isCorrect": true, "inputTokens": 6320, "outputTokens": 1031, "latencyMs": 10025.186000000002 @@ -5802,7 +5802,7 @@ "model": "claude-haiku-4-5", "expected": "12", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 6367, "outputTokens": 5, "latencyMs": 1038.4732499999955 @@ -5813,7 +5813,7 @@ "model": "gpt-5-nano", "expected": "12", "actual": "12", - "correct": true, + "isCorrect": true, "inputTokens": 5016, "outputTokens": 903, "latencyMs": 12459.619915999996 @@ -5824,7 +5824,7 @@ "model": "claude-haiku-4-5", "expected": "12", "actual": "10", - "correct": false, + "isCorrect": false, "inputTokens": 5762, "outputTokens": 5, "latencyMs": 1448.7940839999937 @@ -5835,7 +5835,7 @@ "model": "gpt-5-nano", "expected": "11", "actual": "11", - "correct": true, + "isCorrect": true, "inputTokens": 6394, "outputTokens": 1415, "latencyMs": 13094.547666999992 @@ -5846,7 +5846,7 @@ "model": "claude-haiku-4-5", "expected": "11", "actual": "7", - "correct": false, + "isCorrect": false, "inputTokens": 7872, "outputTokens": 5, "latencyMs": 1241.7239169999957 @@ -5857,7 +5857,7 @@ "model": "gpt-5-nano", "expected": "11", "actual": "11", - "correct": true, + "isCorrect": true, "inputTokens": 2531, "outputTokens": 1031, "latencyMs": 10610.864084 @@ -5868,7 +5868,7 @@ "model": "claude-haiku-4-5", "expected": "11", "actual": "6", - "correct": false, + "isCorrect": false, "inputTokens": 2984, "outputTokens": 5, "latencyMs": 1100.7670829999988 @@ -5879,7 +5879,7 @@ "model": "gpt-5-nano", "expected": "11", "actual": "11", - "correct": true, + "isCorrect": true, "inputTokens": 2385, "outputTokens": 1095, "latencyMs": 11523.293417000008 @@ -5890,7 +5890,7 @@ "model": "claude-haiku-4-5", "expected": "11", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 2858, "outputTokens": 5, "latencyMs": 980.1522499999992 @@ -5901,7 +5901,7 @@ "model": "gpt-5-nano", "expected": "11", "actual": "11", - "correct": true, + "isCorrect": true, "inputTokens": 6320, "outputTokens": 1095, "latencyMs": 8184.143375 @@ -5912,7 +5912,7 @@ "model": "claude-haiku-4-5", "expected": "11", "actual": "6", - "correct": false, + "isCorrect": false, "inputTokens": 6367, "outputTokens": 5, "latencyMs": 1175.0723330000037 @@ -5923,7 +5923,7 @@ "model": "gpt-5-nano", "expected": "11", "actual": "11", - "correct": true, + "isCorrect": true, "inputTokens": 5016, "outputTokens": 1159, "latencyMs": 13082.53912500001 @@ -5934,7 +5934,7 @@ "model": "claude-haiku-4-5", "expected": "11", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 5762, "outputTokens": 5, "latencyMs": 1020.4026659999945 @@ -5945,7 +5945,7 @@ "model": "gpt-5-nano", "expected": "11", "actual": "11", - "correct": true, + "isCorrect": true, "inputTokens": 6394, "outputTokens": 1223, "latencyMs": 13166.679334 @@ -5956,7 +5956,7 @@ "model": "claude-haiku-4-5", "expected": "11", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 7872, "outputTokens": 5, "latencyMs": 1090.0060839999933 @@ -5967,7 +5967,7 @@ "model": "gpt-5-nano", "expected": "11", "actual": "11", - "correct": true, + "isCorrect": true, "inputTokens": 2531, "outputTokens": 1287, "latencyMs": 11181.234958000001 @@ -5978,7 +5978,7 @@ "model": "claude-haiku-4-5", "expected": "11", "actual": "7", - "correct": false, + "isCorrect": false, "inputTokens": 2984, "outputTokens": 5, "latencyMs": 1365.1262080000015 @@ -5989,7 +5989,7 @@ "model": "gpt-5-nano", "expected": "11", "actual": "11", - "correct": true, + "isCorrect": true, "inputTokens": 2385, "outputTokens": 967, "latencyMs": 9549.427916999994 @@ -6000,7 +6000,7 @@ "model": "claude-haiku-4-5", "expected": "11", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 2858, "outputTokens": 5, "latencyMs": 981.8662500000064 @@ -6011,7 +6011,7 @@ "model": "gpt-5-nano", "expected": "11", "actual": "11", - "correct": true, + "isCorrect": true, "inputTokens": 6320, "outputTokens": 1223, "latencyMs": 11591.030333000002 @@ -6022,7 +6022,7 @@ "model": "claude-haiku-4-5", "expected": "11", "actual": "7", - "correct": false, + "isCorrect": false, "inputTokens": 6367, "outputTokens": 5, "latencyMs": 1430.038750000007 @@ -6033,7 +6033,7 @@ "model": "gpt-5-nano", "expected": "11", "actual": "10", - "correct": false, + "isCorrect": false, "inputTokens": 5016, "outputTokens": 1735, "latencyMs": 11458.303500000009 @@ -6044,7 +6044,7 @@ "model": "claude-haiku-4-5", "expected": "11", "actual": "9", - "correct": false, + "isCorrect": false, "inputTokens": 5762, "outputTokens": 5, "latencyMs": 1103.2402909999946 @@ -6055,7 +6055,7 @@ "model": "gpt-5-nano", "expected": "12", "actual": "11", - "correct": false, + "isCorrect": false, "inputTokens": 6394, "outputTokens": 2631, "latencyMs": 16900.63120799999 @@ -6066,7 +6066,7 @@ "model": "claude-haiku-4-5", "expected": "12", "actual": "7", - "correct": false, + "isCorrect": false, "inputTokens": 7872, "outputTokens": 5, "latencyMs": 1043.442332999999 @@ -6077,7 +6077,7 @@ "model": "gpt-5-nano", "expected": "12", "actual": "12", - "correct": true, + "isCorrect": true, "inputTokens": 2531, "outputTokens": 839, "latencyMs": 7278.612083 @@ -6088,7 +6088,7 @@ "model": "claude-haiku-4-5", "expected": "12", "actual": "6", - "correct": false, + "isCorrect": false, "inputTokens": 2984, "outputTokens": 5, "latencyMs": 1705.2114999999903 @@ -6099,7 +6099,7 @@ "model": "gpt-5-nano", "expected": "12", "actual": "11", - "correct": false, + "isCorrect": false, "inputTokens": 2385, "outputTokens": 1415, "latencyMs": 10625.603375000006 @@ -6110,7 +6110,7 @@ "model": "claude-haiku-4-5", "expected": "12", "actual": "7", - "correct": false, + "isCorrect": false, "inputTokens": 2858, "outputTokens": 5, "latencyMs": 1081.0501670000085 @@ -6121,7 +6121,7 @@ "model": "gpt-5-nano", "expected": "12", "actual": "12", - "correct": true, + "isCorrect": true, "inputTokens": 6320, "outputTokens": 2055, "latencyMs": 17548.71483299999 @@ -6132,7 +6132,7 @@ "model": "claude-haiku-4-5", "expected": "12", "actual": "7", - "correct": false, + "isCorrect": false, "inputTokens": 6367, "outputTokens": 5, "latencyMs": 2302.2003750000003 @@ -6143,7 +6143,7 @@ "model": "gpt-5-nano", "expected": "12", "actual": "11", - "correct": false, + "isCorrect": false, "inputTokens": 5016, "outputTokens": 1287, "latencyMs": 13187.201000000015 @@ -6154,7 +6154,7 @@ "model": "claude-haiku-4-5", "expected": "12", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 5762, "outputTokens": 5, "latencyMs": 2621.4970829999947 @@ -6165,7 +6165,7 @@ "model": "gpt-5-nano", "expected": "62", "actual": "62", - "correct": true, + "isCorrect": true, "inputTokens": 6393, "outputTokens": 3783, "latencyMs": 29393.69395799999 @@ -6176,7 +6176,7 @@ "model": "claude-haiku-4-5", "expected": "62", "actual": "62", - "correct": true, + "isCorrect": true, "inputTokens": 7872, "outputTokens": 5, "latencyMs": 1402.049291999996 @@ -6187,7 +6187,7 @@ "model": "gpt-5-nano", "expected": "62", "actual": "62", - "correct": true, + "isCorrect": true, "inputTokens": 2530, "outputTokens": 2823, "latencyMs": 23696.75 @@ -6198,7 +6198,7 @@ "model": "claude-haiku-4-5", "expected": "62", "actual": "62", - "correct": true, + "isCorrect": true, "inputTokens": 2984, "outputTokens": 5, "latencyMs": 1064.7778749999998 @@ -6209,7 +6209,7 @@ "model": "gpt-5-nano", "expected": "62", "actual": "64", - "correct": false, + "isCorrect": false, "inputTokens": 2384, "outputTokens": 3143, "latencyMs": 28384.533249999993 @@ -6220,7 +6220,7 @@ "model": "claude-haiku-4-5", "expected": "62", "actual": "62", - "correct": true, + "isCorrect": true, "inputTokens": 2858, "outputTokens": 5, "latencyMs": 889.2725839999912 @@ -6231,7 +6231,7 @@ "model": "gpt-5-nano", "expected": "62", "actual": "62", - "correct": true, + "isCorrect": true, "inputTokens": 6319, "outputTokens": 6663, "latencyMs": 50113.09675 @@ -6242,7 +6242,7 @@ "model": "claude-haiku-4-5", "expected": "62", "actual": "62", - "correct": true, + "isCorrect": true, "inputTokens": 6367, "outputTokens": 5, "latencyMs": 1074.8158330000006 @@ -6253,7 +6253,7 @@ "model": "gpt-5-nano", "expected": "62", "actual": "62", - "correct": true, + "isCorrect": true, "inputTokens": 5015, "outputTokens": 2631, "latencyMs": 23841.036083999992 @@ -6264,7 +6264,7 @@ "model": "claude-haiku-4-5", "expected": "62", "actual": "62", - "correct": true, + "isCorrect": true, "inputTokens": 5762, "outputTokens": 5, "latencyMs": 1010.4629169999971 @@ -6275,7 +6275,7 @@ "model": "gpt-5-nano", "expected": "45", "actual": "45", - "correct": true, + "isCorrect": true, "inputTokens": 6393, "outputTokens": 2247, "latencyMs": 18818.030874999997 @@ -6286,7 +6286,7 @@ "model": "claude-haiku-4-5", "expected": "45", "actual": "42", - "correct": false, + "isCorrect": false, "inputTokens": 7872, "outputTokens": 5, "latencyMs": 1203.152833 @@ -6297,7 +6297,7 @@ "model": "gpt-5-nano", "expected": "45", "actual": "45", - "correct": true, + "isCorrect": true, "inputTokens": 2530, "outputTokens": 2631, "latencyMs": 21987.539915999994 @@ -6308,7 +6308,7 @@ "model": "claude-haiku-4-5", "expected": "45", "actual": "42", - "correct": false, + "isCorrect": false, "inputTokens": 2984, "outputTokens": 5, "latencyMs": 1000.0181669999874 @@ -6319,7 +6319,7 @@ "model": "gpt-5-nano", "expected": "45", "actual": "46", - "correct": false, + "isCorrect": false, "inputTokens": 2384, "outputTokens": 3079, "latencyMs": 24534.847250000006 @@ -6330,7 +6330,7 @@ "model": "claude-haiku-4-5", "expected": "45", "actual": "42", - "correct": false, + "isCorrect": false, "inputTokens": 2858, "outputTokens": 5, "latencyMs": 1125.7029999999795 @@ -6341,7 +6341,7 @@ "model": "gpt-5-nano", "expected": "45", "actual": "45", - "correct": true, + "isCorrect": true, "inputTokens": 6319, "outputTokens": 2823, "latencyMs": 27053.90824999998 @@ -6352,7 +6352,7 @@ "model": "claude-haiku-4-5", "expected": "45", "actual": "42", - "correct": false, + "isCorrect": false, "inputTokens": 6367, "outputTokens": 5, "latencyMs": 1474.1193330000096 @@ -6363,7 +6363,7 @@ "model": "gpt-5-nano", "expected": "45", "actual": "45", - "correct": true, + "isCorrect": true, "inputTokens": 5015, "outputTokens": 2567, "latencyMs": 21642.824207999976 @@ -6374,7 +6374,7 @@ "model": "claude-haiku-4-5", "expected": "45", "actual": "38", - "correct": false, + "isCorrect": false, "inputTokens": 5762, "outputTokens": 5, "latencyMs": 1170.1535830000066 @@ -6385,7 +6385,7 @@ "model": "gpt-5-nano", "expected": "96.17", "actual": "96.17", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 73, "latencyMs": 2340.6126670000085 @@ -6396,7 +6396,7 @@ "model": "claude-haiku-4-5", "expected": "96.17", "actual": "96.17", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 7, "latencyMs": 1337.4746670000022 @@ -6407,7 +6407,7 @@ "model": "gpt-5-nano", "expected": "96.17", "actual": "96.17", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 137, "latencyMs": 2275.1715830000176 @@ -6418,7 +6418,7 @@ "model": "claude-haiku-4-5", "expected": "96.17", "actual": "96.17", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 7, "latencyMs": 1086.9557499999937 @@ -6429,7 +6429,7 @@ "model": "gpt-5-nano", "expected": "96.17", "actual": "96.17", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 137, "latencyMs": 2881.4037499999977 @@ -6440,7 +6440,7 @@ "model": "claude-haiku-4-5", "expected": "96.17", "actual": "96.17", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 7, "latencyMs": 1172.774000000005 @@ -6451,7 +6451,7 @@ "model": "gpt-5-nano", "expected": "96.17", "actual": "96.17", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 201, "latencyMs": 7706.478582999989 @@ -6462,7 +6462,7 @@ "model": "claude-haiku-4-5", "expected": "96.17", "actual": "96.17", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 7, "latencyMs": 1106.0717920000025 @@ -6473,7 +6473,7 @@ "model": "gpt-5-nano", "expected": "96.17", "actual": "96.17", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 137, "latencyMs": 6185.161250000005 @@ -6484,7 +6484,7 @@ "model": "claude-haiku-4-5", "expected": "96.17", "actual": "96.17", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 7, "latencyMs": 1388.4410000000207 @@ -6495,7 +6495,7 @@ "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 9738, "outputTokens": 136, "latencyMs": 6699.9394589999865 @@ -6506,7 +6506,7 @@ "model": "claude-haiku-4-5", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 11906, "outputTokens": 4, "latencyMs": 1152.8117919999931 @@ -6517,7 +6517,7 @@ "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 6012, "outputTokens": 136, "latencyMs": 2446.019666999986 @@ -6528,7 +6528,7 @@ "model": "claude-haiku-4-5", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 6992, "outputTokens": 4, "latencyMs": 1046.3494580000115 @@ -6539,7 +6539,7 @@ "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 6780, "outputTokens": 200, "latencyMs": 6084.429165999987 @@ -6550,7 +6550,7 @@ "model": "claude-haiku-4-5", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 8413, "outputTokens": 4, "latencyMs": 1787.2428749999963 @@ -6561,7 +6561,7 @@ "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 9157, "outputTokens": 264, "latencyMs": 5364.3007919999945 @@ -6572,7 +6572,7 @@ "model": "claude-haiku-4-5", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 9288, "outputTokens": 4, "latencyMs": 1269.2162499999977 @@ -6583,7 +6583,7 @@ "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 7372, "outputTokens": 72, "latencyMs": 2381.514374999999 @@ -6594,7 +6594,7 @@ "model": "claude-haiku-4-5", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 8384, "outputTokens": 4, "latencyMs": 1222.1361669999897 @@ -6605,7 +6605,7 @@ "model": "gpt-5-nano", "expected": "599.39", "actual": "599.39", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 201, "latencyMs": 3641.536167000013 @@ -6616,7 +6616,7 @@ "model": "claude-haiku-4-5", "expected": "599.39", "actual": "599.39", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 7, "latencyMs": 2457.5752079999947 @@ -6627,7 +6627,7 @@ "model": "gpt-5-nano", "expected": "599.39", "actual": "599.39", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 201, "latencyMs": 3384.6115839999984 @@ -6638,7 +6638,7 @@ "model": "claude-haiku-4-5", "expected": "599.39", "actual": "599.39", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 7, "latencyMs": 1372.8756669999857 @@ -6649,7 +6649,7 @@ "model": "gpt-5-nano", "expected": "599.39", "actual": "599.39", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 265, "latencyMs": 5826.962750000006 @@ -6660,7 +6660,7 @@ "model": "claude-haiku-4-5", "expected": "599.39", "actual": "599.39", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 7, "latencyMs": 1303.1691670000146 @@ -6671,7 +6671,7 @@ "model": "gpt-5-nano", "expected": "599.39", "actual": "599.39", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 265, "latencyMs": 3602.1091250000172 @@ -6682,7 +6682,7 @@ "model": "claude-haiku-4-5", "expected": "599.39", "actual": "599.39", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 7, "latencyMs": 1451.1585410000116 @@ -6693,7 +6693,7 @@ "model": "gpt-5-nano", "expected": "599.39", "actual": "599.39", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 137, "latencyMs": 2453.183083000011 @@ -6704,7 +6704,7 @@ "model": "claude-haiku-4-5", "expected": "599.39", "actual": "599.39", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 7, "latencyMs": 1152.136541999993 @@ -6715,7 +6715,7 @@ "model": "gpt-5-nano", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 9738, "outputTokens": 199, "latencyMs": 5025.56916699998 @@ -6726,7 +6726,7 @@ "model": "claude-haiku-4-5", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 11906, "outputTokens": 4, "latencyMs": 1111.5014169999922 @@ -6737,7 +6737,7 @@ "model": "gpt-5-nano", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 6012, "outputTokens": 199, "latencyMs": 3548.9061660000007 @@ -6748,7 +6748,7 @@ "model": "claude-haiku-4-5", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 6992, "outputTokens": 4, "latencyMs": 1404.0692500000005 @@ -6759,7 +6759,7 @@ "model": "gpt-5-nano", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 6780, "outputTokens": 135, "latencyMs": 2879.9619169999787 @@ -6770,7 +6770,7 @@ "model": "claude-haiku-4-5", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 8413, "outputTokens": 4, "latencyMs": 1258.860249999998 @@ -6781,7 +6781,7 @@ "model": "gpt-5-nano", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 9157, "outputTokens": 263, "latencyMs": 7819.738958000002 @@ -6792,7 +6792,7 @@ "model": "claude-haiku-4-5", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 9288, "outputTokens": 4, "latencyMs": 1495.973915999988 @@ -6803,7 +6803,7 @@ "model": "gpt-5-nano", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 7372, "outputTokens": 135, "latencyMs": 3092.4329169999983 @@ -6814,7 +6814,7 @@ "model": "claude-haiku-4-5", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 8384, "outputTokens": 4, "latencyMs": 1268.1641250000102 @@ -6825,7 +6825,7 @@ "model": "gpt-5-nano", "expected": "528.71", "actual": "528.71", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 265, "latencyMs": 4409.96212500002 @@ -6836,7 +6836,7 @@ "model": "claude-haiku-4-5", "expected": "528.71", "actual": "528.71", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 7, "latencyMs": 1422.6079999999783 @@ -6847,7 +6847,7 @@ "model": "gpt-5-nano", "expected": "528.71", "actual": "528.71", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 329, "latencyMs": 3593.100334000017 @@ -6858,7 +6858,7 @@ "model": "claude-haiku-4-5", "expected": "528.71", "actual": "528.71", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 7, "latencyMs": 1474.3911249999946 @@ -6869,7 +6869,7 @@ "model": "gpt-5-nano", "expected": "528.71", "actual": "528.71", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 265, "latencyMs": 5419.795374999987 @@ -6880,7 +6880,7 @@ "model": "claude-haiku-4-5", "expected": "528.71", "actual": "528.71", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 7, "latencyMs": 1059.3489999999874 @@ -6891,7 +6891,7 @@ "model": "gpt-5-nano", "expected": "528.71", "actual": "528.71", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 265, "latencyMs": 4783.504167000006 @@ -6902,7 +6902,7 @@ "model": "claude-haiku-4-5", "expected": "528.71", "actual": "528.71", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 7, "latencyMs": 1340.6675410000025 @@ -6913,7 +6913,7 @@ "model": "gpt-5-nano", "expected": "528.71", "actual": "528.71", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 329, "latencyMs": 4222.140958000004 @@ -6924,7 +6924,7 @@ "model": "claude-haiku-4-5", "expected": "528.71", "actual": "528.71", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 7, "latencyMs": 1169.892125000013 @@ -6935,7 +6935,7 @@ "model": "gpt-5-nano", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 9738, "outputTokens": 135, "latencyMs": 2854.8382500000007 @@ -6946,7 +6946,7 @@ "model": "claude-haiku-4-5", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 11906, "outputTokens": 4, "latencyMs": 1077.335374999995 @@ -6957,7 +6957,7 @@ "model": "gpt-5-nano", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 6012, "outputTokens": 135, "latencyMs": 2525.2092499999853 @@ -6968,7 +6968,7 @@ "model": "claude-haiku-4-5", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 6992, "outputTokens": 4, "latencyMs": 2100.2050000000163 @@ -6979,7 +6979,7 @@ "model": "gpt-5-nano", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 6780, "outputTokens": 263, "latencyMs": 5882.592499999999 @@ -6990,7 +6990,7 @@ "model": "claude-haiku-4-5", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 8413, "outputTokens": 4, "latencyMs": 1168.5295410000253 @@ -7001,7 +7001,7 @@ "model": "gpt-5-nano", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 9157, "outputTokens": 263, "latencyMs": 3944.433083000011 @@ -7012,7 +7012,7 @@ "model": "claude-haiku-4-5", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 9288, "outputTokens": 4, "latencyMs": 1882.1263749999925 @@ -7023,7 +7023,7 @@ "model": "gpt-5-nano", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 7372, "outputTokens": 135, "latencyMs": 1657.7255829999922 @@ -7034,7 +7034,7 @@ "model": "claude-haiku-4-5", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 8384, "outputTokens": 4, "latencyMs": 1056.5719169999938 @@ -7045,7 +7045,7 @@ "model": "gpt-5-nano", "expected": "1687.82", "actual": "1687.82", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 266, "latencyMs": 5764.2531250000175 @@ -7056,7 +7056,7 @@ "model": "claude-haiku-4-5", "expected": "1687.82", "actual": "1687.82", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 8, "latencyMs": 1241.8239590000012 @@ -7067,7 +7067,7 @@ "model": "gpt-5-nano", "expected": "1687.82", "actual": "1687.82", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 266, "latencyMs": 3203.148416000011 @@ -7078,7 +7078,7 @@ "model": "claude-haiku-4-5", "expected": "1687.82", "actual": "1687.82", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 8, "latencyMs": 1395.2265419999894 @@ -7089,7 +7089,7 @@ "model": "gpt-5-nano", "expected": "1687.82", "actual": "1687.82", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 330, "latencyMs": 3854.1738750000077 @@ -7100,7 +7100,7 @@ "model": "claude-haiku-4-5", "expected": "1687.82", "actual": "1687.82", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 8, "latencyMs": 1868.680457999988 @@ -7111,7 +7111,7 @@ "model": "gpt-5-nano", "expected": "1687.82", "actual": "1687.82", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 330, "latencyMs": 4486.571708000003 @@ -7122,7 +7122,7 @@ "model": "claude-haiku-4-5", "expected": "1687.82", "actual": "1687.82", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 8, "latencyMs": 1336.9320829999924 @@ -7133,7 +7133,7 @@ "model": "gpt-5-nano", "expected": "1687.82", "actual": "1687.82", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 266, "latencyMs": 3571.6664579999924 @@ -7144,7 +7144,7 @@ "model": "claude-haiku-4-5", "expected": "1687.82", "actual": "1687.82", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 8, "latencyMs": 1179.5032920000085 @@ -7155,7 +7155,7 @@ "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 9738, "outputTokens": 200, "latencyMs": 3395.709499999997 @@ -7166,7 +7166,7 @@ "model": "claude-haiku-4-5", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 11906, "outputTokens": 4, "latencyMs": 1374.4573329999985 @@ -7177,7 +7177,7 @@ "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 6012, "outputTokens": 200, "latencyMs": 3162.779542000004 @@ -7188,7 +7188,7 @@ "model": "claude-haiku-4-5", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 6992, "outputTokens": 4, "latencyMs": 1010.6076670000039 @@ -7199,7 +7199,7 @@ "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 6780, "outputTokens": 328, "latencyMs": 3606.7964999999967 @@ -7210,7 +7210,7 @@ "model": "claude-haiku-4-5", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 8413, "outputTokens": 4, "latencyMs": 1432.5227920000034 @@ -7221,7 +7221,7 @@ "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 9157, "outputTokens": 328, "latencyMs": 2916.351958000014 @@ -7232,7 +7232,7 @@ "model": "claude-haiku-4-5", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 9288, "outputTokens": 4, "latencyMs": 1207.7237920000043 @@ -7243,7 +7243,7 @@ "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 7372, "outputTokens": 136, "latencyMs": 2741.256458000018 @@ -7254,7 +7254,7 @@ "model": "claude-haiku-4-5", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 8384, "outputTokens": 4, "latencyMs": 1385.7817920000234 @@ -7265,7 +7265,7 @@ "model": "gpt-5-nano", "expected": "423.6", "actual": "423.6", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 201, "latencyMs": 4731.81024999998 @@ -7276,7 +7276,7 @@ "model": "claude-haiku-4-5", "expected": "423.6", "actual": "423.6", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 7, "latencyMs": 1572.4971659999865 @@ -7287,7 +7287,7 @@ "model": "gpt-5-nano", "expected": "423.6", "actual": "423.6", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 137, "latencyMs": 2684.556333000015 @@ -7298,7 +7298,7 @@ "model": "claude-haiku-4-5", "expected": "423.6", "actual": "423.6", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 7, "latencyMs": 1314.9989999999816 @@ -7309,7 +7309,7 @@ "model": "gpt-5-nano", "expected": "423.6", "actual": "423.6", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 137, "latencyMs": 2746.457541999989 @@ -7320,7 +7320,7 @@ "model": "claude-haiku-4-5", "expected": "423.6", "actual": "423.6", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 7, "latencyMs": 1254.8903329999885 @@ -7331,7 +7331,7 @@ "model": "gpt-5-nano", "expected": "423.6", "actual": "423.6", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 137, "latencyMs": 4298.293416 @@ -7342,7 +7342,7 @@ "model": "claude-haiku-4-5", "expected": "423.6", "actual": "423.6", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 7, "latencyMs": 1346.4980839999916 @@ -7353,7 +7353,7 @@ "model": "gpt-5-nano", "expected": "423.6", "actual": "423.6", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 265, "latencyMs": 3634.2565419999883 @@ -7364,7 +7364,7 @@ "model": "claude-haiku-4-5", "expected": "423.6", "actual": "423.6", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 7, "latencyMs": 1363.8280410000007 @@ -7375,7 +7375,7 @@ "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 9738, "outputTokens": 392, "latencyMs": 3933.217000000004 @@ -7386,7 +7386,7 @@ "model": "claude-haiku-4-5", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 11906, "outputTokens": 4, "latencyMs": 1229.9339579999796 @@ -7397,7 +7397,7 @@ "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 6012, "outputTokens": 136, "latencyMs": 2728.4598340000084 @@ -7408,7 +7408,7 @@ "model": "claude-haiku-4-5", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 6992, "outputTokens": 4, "latencyMs": 1427.2494170000136 @@ -7419,7 +7419,7 @@ "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 6780, "outputTokens": 200, "latencyMs": 3187.385666999995 @@ -7430,7 +7430,7 @@ "model": "claude-haiku-4-5", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 8413, "outputTokens": 4, "latencyMs": 1482.2487079999992 @@ -7441,7 +7441,7 @@ "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 9157, "outputTokens": 264, "latencyMs": 3429.744458000001 @@ -7452,7 +7452,7 @@ "model": "claude-haiku-4-5", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 9288, "outputTokens": 4, "latencyMs": 1100.8814589999965 @@ -7463,7 +7463,7 @@ "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 7372, "outputTokens": 72, "latencyMs": 1993.443707999977 @@ -7474,7 +7474,7 @@ "model": "claude-haiku-4-5", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 8384, "outputTokens": 4, "latencyMs": 1105.5260419999831 @@ -7485,7 +7485,7 @@ "model": "gpt-5-nano", "expected": "784.03", "actual": "784.03", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 137, "latencyMs": 3255.3775840000017 @@ -7496,7 +7496,7 @@ "model": "claude-haiku-4-5", "expected": "784.03", "actual": "784.03", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 7, "latencyMs": 1274.000417000003 @@ -7507,7 +7507,7 @@ "model": "gpt-5-nano", "expected": "784.03", "actual": "784.03", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 265, "latencyMs": 3098.326624999987 @@ -7518,7 +7518,7 @@ "model": "claude-haiku-4-5", "expected": "784.03", "actual": "784.03", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 7, "latencyMs": 1057.8637079999899 @@ -7529,7 +7529,7 @@ "model": "gpt-5-nano", "expected": "784.03", "actual": "784.03", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 201, "latencyMs": 3651.3826249999984 @@ -7540,7 +7540,7 @@ "model": "claude-haiku-4-5", "expected": "784.03", "actual": "784.03", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 7, "latencyMs": 1404.9795829999784 @@ -7551,7 +7551,7 @@ "model": "gpt-5-nano", "expected": "784.03", "actual": "784.03", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 201, "latencyMs": 4157.148833000014 @@ -7562,7 +7562,7 @@ "model": "claude-haiku-4-5", "expected": "784.03", "actual": "784.03", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 7, "latencyMs": 1607.9431249999907 @@ -7573,7 +7573,7 @@ "model": "gpt-5-nano", "expected": "784.03", "actual": "784.03", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 329, "latencyMs": 4582.246665999992 @@ -7584,7 +7584,7 @@ "model": "claude-haiku-4-5", "expected": "784.03", "actual": "784.03", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 7, "latencyMs": 1458.8513329999987 @@ -7595,7 +7595,7 @@ "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 9738, "outputTokens": 200, "latencyMs": 3341.994207999989 @@ -7606,7 +7606,7 @@ "model": "claude-haiku-4-5", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 11906, "outputTokens": 4, "latencyMs": 1144.3136670000094 @@ -7617,7 +7617,7 @@ "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 6012, "outputTokens": 392, "latencyMs": 6067.672458999994 @@ -7628,7 +7628,7 @@ "model": "claude-haiku-4-5", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 6992, "outputTokens": 4, "latencyMs": 1325.0467500000086 @@ -7639,7 +7639,7 @@ "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 6780, "outputTokens": 200, "latencyMs": 2847.485000000015 @@ -7650,7 +7650,7 @@ "model": "claude-haiku-4-5", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 8413, "outputTokens": 4, "latencyMs": 1212.1944169999915 @@ -7661,7 +7661,7 @@ "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 9157, "outputTokens": 456, "latencyMs": 5099.853499999997 @@ -7672,7 +7672,7 @@ "model": "claude-haiku-4-5", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 9288, "outputTokens": 4, "latencyMs": 1284.708416999987 @@ -7683,7 +7683,7 @@ "model": "gpt-5-nano", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 7372, "outputTokens": 200, "latencyMs": 2745.7869170000195 @@ -7694,7 +7694,7 @@ "model": "claude-haiku-4-5", "expected": "shipped", "actual": "shipped", - "correct": true, + "isCorrect": true, "inputTokens": 8384, "outputTokens": 4, "latencyMs": 1114.6338329999999 @@ -7705,7 +7705,7 @@ "model": "gpt-5-nano", "expected": "645.88", "actual": "645.88", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 265, "latencyMs": 3482.8154170000053 @@ -7716,7 +7716,7 @@ "model": "claude-haiku-4-5", "expected": "645.88", "actual": "645.88", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 7, "latencyMs": 1156.5491669999901 @@ -7727,7 +7727,7 @@ "model": "gpt-5-nano", "expected": "645.88", "actual": "645.88", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 201, "latencyMs": 2970.104541000008 @@ -7738,7 +7738,7 @@ "model": "claude-haiku-4-5", "expected": "645.88", "actual": "645.88", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 7, "latencyMs": 1297.768374999985 @@ -7749,7 +7749,7 @@ "model": "gpt-5-nano", "expected": "645.88", "actual": "645.88", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 201, "latencyMs": 3475.6895419999782 @@ -7760,7 +7760,7 @@ "model": "claude-haiku-4-5", "expected": "645.88", "actual": "645.88", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 7, "latencyMs": 1469.7436250000028 @@ -7771,7 +7771,7 @@ "model": "gpt-5-nano", "expected": "645.88", "actual": "645.88", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 265, "latencyMs": 4107.424582999985 @@ -7782,7 +7782,7 @@ "model": "claude-haiku-4-5", "expected": "645.88", "actual": "645.88", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 7, "latencyMs": 1070.4507500000182 @@ -7793,7 +7793,7 @@ "model": "gpt-5-nano", "expected": "645.88", "actual": "645.88", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 265, "latencyMs": 3768.3023749999993 @@ -7804,7 +7804,7 @@ "model": "claude-haiku-4-5", "expected": "645.88", "actual": "645.88", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 7, "latencyMs": 1111.744915999996 @@ -7815,7 +7815,7 @@ "model": "gpt-5-nano", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 9738, "outputTokens": 263, "latencyMs": 3199.3634999999776 @@ -7826,7 +7826,7 @@ "model": "claude-haiku-4-5", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 11906, "outputTokens": 4, "latencyMs": 1232.4811659999832 @@ -7837,7 +7837,7 @@ "model": "gpt-5-nano", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 6012, "outputTokens": 263, "latencyMs": 5616.989999999991 @@ -7848,7 +7848,7 @@ "model": "claude-haiku-4-5", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 6992, "outputTokens": 4, "latencyMs": 1697.3162920000032 @@ -7859,7 +7859,7 @@ "model": "gpt-5-nano", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 6780, "outputTokens": 199, "latencyMs": 2781.3399999999965 @@ -7870,7 +7870,7 @@ "model": "claude-haiku-4-5", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 8413, "outputTokens": 4, "latencyMs": 1162.0402089999989 @@ -7881,7 +7881,7 @@ "model": "gpt-5-nano", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 9157, "outputTokens": 199, "latencyMs": 3651.1349579999805 @@ -7892,7 +7892,7 @@ "model": "claude-haiku-4-5", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 9288, "outputTokens": 4, "latencyMs": 1132.3132920000062 @@ -7903,7 +7903,7 @@ "model": "gpt-5-nano", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 7372, "outputTokens": 135, "latencyMs": 3017.5073749999865 @@ -7914,7 +7914,7 @@ "model": "claude-haiku-4-5", "expected": "processing", "actual": "processing", - "correct": true, + "isCorrect": true, "inputTokens": 8384, "outputTokens": 4, "latencyMs": 1294.688374999998 @@ -7925,7 +7925,7 @@ "model": "gpt-5-nano", "expected": "371.91", "actual": "371.91", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 201, "latencyMs": 3591.221499999985 @@ -7936,7 +7936,7 @@ "model": "claude-haiku-4-5", "expected": "371.91", "actual": "371.91", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 7, "latencyMs": 1329.419332999998 @@ -7947,7 +7947,7 @@ "model": "gpt-5-nano", "expected": "371.91", "actual": "371.91", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 137, "latencyMs": 2655.557792000007 @@ -7958,7 +7958,7 @@ "model": "claude-haiku-4-5", "expected": "371.91", "actual": "371.91", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 7, "latencyMs": 1446.9020000000019 @@ -7969,7 +7969,7 @@ "model": "gpt-5-nano", "expected": "371.91", "actual": "371.91", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 201, "latencyMs": 3450.5822500000068 @@ -7980,7 +7980,7 @@ "model": "claude-haiku-4-5", "expected": "371.91", "actual": "371.91", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 7, "latencyMs": 1291.2180410000146 @@ -7991,7 +7991,7 @@ "model": "gpt-5-nano", "expected": "371.91", "actual": "371.91", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 201, "latencyMs": 2803.9767500000016 @@ -8002,7 +8002,7 @@ "model": "claude-haiku-4-5", "expected": "371.91", "actual": "371.91", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 7, "latencyMs": 1098.5968749999884 @@ -8013,7 +8013,7 @@ "model": "gpt-5-nano", "expected": "371.91", "actual": "371.91", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 201, "latencyMs": 3047.8699999999953 @@ -8024,7 +8024,7 @@ "model": "claude-haiku-4-5", "expected": "371.91", "actual": "371.91", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 7, "latencyMs": 1800.6882080000069 @@ -8035,7 +8035,7 @@ "model": "gpt-5-nano", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 9738, "outputTokens": 199, "latencyMs": 2957.2203330000048 @@ -8046,7 +8046,7 @@ "model": "claude-haiku-4-5", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 11906, "outputTokens": 4, "latencyMs": 1165.7748750000028 @@ -8057,7 +8057,7 @@ "model": "gpt-5-nano", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 6012, "outputTokens": 135, "latencyMs": 2362.283208000008 @@ -8068,7 +8068,7 @@ "model": "claude-haiku-4-5", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 6992, "outputTokens": 4, "latencyMs": 1871.7275829999999 @@ -8079,7 +8079,7 @@ "model": "gpt-5-nano", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 6780, "outputTokens": 263, "latencyMs": 4747.243208 @@ -8090,7 +8090,7 @@ "model": "claude-haiku-4-5", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 8413, "outputTokens": 4, "latencyMs": 1275.342082999996 @@ -8101,7 +8101,7 @@ "model": "gpt-5-nano", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 9157, "outputTokens": 199, "latencyMs": 3180.0179160000116 @@ -8112,7 +8112,7 @@ "model": "claude-haiku-4-5", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 9288, "outputTokens": 4, "latencyMs": 2343.5514580000017 @@ -8123,7 +8123,7 @@ "model": "gpt-5-nano", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 7372, "outputTokens": 135, "latencyMs": 2362.525915999984 @@ -8134,7 +8134,7 @@ "model": "claude-haiku-4-5", "expected": "pending", "actual": "pending", - "correct": true, + "isCorrect": true, "inputTokens": 8384, "outputTokens": 4, "latencyMs": 1231.4291669999948 @@ -8145,7 +8145,7 @@ "model": "gpt-5-nano", "expected": "1066", "actual": "1066", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 200, "latencyMs": 3091.9045840000035 @@ -8156,7 +8156,7 @@ "model": "claude-haiku-4-5", "expected": "1066", "actual": "1066", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 6, "latencyMs": 1111.9695000000065 @@ -8167,7 +8167,7 @@ "model": "gpt-5-nano", "expected": "1066", "actual": "1066", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 264, "latencyMs": 3977.5146669999813 @@ -8178,7 +8178,7 @@ "model": "claude-haiku-4-5", "expected": "1066", "actual": "1066", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 6, "latencyMs": 1195.262208 @@ -8189,7 +8189,7 @@ "model": "gpt-5-nano", "expected": "1066", "actual": "1066", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 328, "latencyMs": 3839.0627499999828 @@ -8200,7 +8200,7 @@ "model": "claude-haiku-4-5", "expected": "1066", "actual": "1066", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 6, "latencyMs": 2186.8021250000165 @@ -8211,7 +8211,7 @@ "model": "gpt-5-nano", "expected": "1066", "actual": "1066", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 328, "latencyMs": 6945.004667000001 @@ -8222,7 +8222,7 @@ "model": "claude-haiku-4-5", "expected": "1066", "actual": "1066", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 6, "latencyMs": 1103.6762919999892 @@ -8233,7 +8233,7 @@ "model": "gpt-5-nano", "expected": "1066", "actual": "1066", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 264, "latencyMs": 3924.5181250000023 @@ -8244,7 +8244,7 @@ "model": "claude-haiku-4-5", "expected": "1066", "actual": "1066", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 6, "latencyMs": 1023.334583000018 @@ -8255,7 +8255,7 @@ "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 9738, "outputTokens": 264, "latencyMs": 4017.931666999997 @@ -8266,7 +8266,7 @@ "model": "claude-haiku-4-5", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 11906, "outputTokens": 4, "latencyMs": 1278.6839580000087 @@ -8277,7 +8277,7 @@ "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 6012, "outputTokens": 200, "latencyMs": 2566.9374580000003 @@ -8288,7 +8288,7 @@ "model": "claude-haiku-4-5", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 6992, "outputTokens": 4, "latencyMs": 958.4104159999988 @@ -8299,7 +8299,7 @@ "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 6780, "outputTokens": 264, "latencyMs": 3640.0960409999825 @@ -8310,7 +8310,7 @@ "model": "claude-haiku-4-5", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 8413, "outputTokens": 4, "latencyMs": 1534.7306249999965 @@ -8321,7 +8321,7 @@ "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 9157, "outputTokens": 328, "latencyMs": 3905.6711249999935 @@ -8332,7 +8332,7 @@ "model": "claude-haiku-4-5", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 9288, "outputTokens": 4, "latencyMs": 2067.435375000001 @@ -8343,7 +8343,7 @@ "model": "gpt-5-nano", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 7372, "outputTokens": 264, "latencyMs": 3613.7146249999932 @@ -8354,7 +8354,7 @@ "model": "claude-haiku-4-5", "expected": "cancelled", "actual": "cancelled", - "correct": true, + "isCorrect": true, "inputTokens": 8384, "outputTokens": 4, "latencyMs": 1154.955958000006 @@ -8365,7 +8365,7 @@ "model": "gpt-5-nano", "expected": "1697.4", "actual": "1697.4", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 330, "latencyMs": 3904.2146250000224 @@ -8376,7 +8376,7 @@ "model": "claude-haiku-4-5", "expected": "1697.4", "actual": "1697.4", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 8, "latencyMs": 1618.7487079999992 @@ -8387,7 +8387,7 @@ "model": "gpt-5-nano", "expected": "1697.4", "actual": "1697.4", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 202, "latencyMs": 2906.194541999983 @@ -8398,7 +8398,7 @@ "model": "claude-haiku-4-5", "expected": "1697.4", "actual": "1697.4", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 8, "latencyMs": 1481.559333000012 @@ -8409,7 +8409,7 @@ "model": "gpt-5-nano", "expected": "1697.4", "actual": "1697.4", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 266, "latencyMs": 3879.7539999999863 @@ -8420,7 +8420,7 @@ "model": "claude-haiku-4-5", "expected": "1697.4", "actual": "1697.4", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 8, "latencyMs": 1809.5822499999776 @@ -8431,7 +8431,7 @@ "model": "gpt-5-nano", "expected": "1697.4", "actual": "1697.4", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 202, "latencyMs": 3147.330500000011 @@ -8442,7 +8442,7 @@ "model": "claude-haiku-4-5", "expected": "1697.4", "actual": "1697.4", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 8, "latencyMs": 1297.2377080000006 @@ -8453,7 +8453,7 @@ "model": "gpt-5-nano", "expected": "1697.4", "actual": "1697.4", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 394, "latencyMs": 3710.157500000001 @@ -8464,7 +8464,7 @@ "model": "claude-haiku-4-5", "expected": "1697.4", "actual": "1697.4", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 8, "latencyMs": 1238.5442500000063 @@ -8475,7 +8475,7 @@ "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 9738, "outputTokens": 392, "latencyMs": 4101.743083999987 @@ -8486,7 +8486,7 @@ "model": "claude-haiku-4-5", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 11906, "outputTokens": 4, "latencyMs": 1170.750417000003 @@ -8497,7 +8497,7 @@ "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 6012, "outputTokens": 264, "latencyMs": 8324.009665999998 @@ -8508,7 +8508,7 @@ "model": "claude-haiku-4-5", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 6992, "outputTokens": 4, "latencyMs": 1173.343790999992 @@ -8519,7 +8519,7 @@ "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 6780, "outputTokens": 264, "latencyMs": 3005.4394999999786 @@ -8530,7 +8530,7 @@ "model": "claude-haiku-4-5", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 8413, "outputTokens": 4, "latencyMs": 1376.5506659999955 @@ -8541,7 +8541,7 @@ "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 9157, "outputTokens": 136, "latencyMs": 3209.5317499999946 @@ -8552,7 +8552,7 @@ "model": "claude-haiku-4-5", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 9288, "outputTokens": 4, "latencyMs": 1299.4064170000202 @@ -8563,7 +8563,7 @@ "model": "gpt-5-nano", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 7372, "outputTokens": 264, "latencyMs": 3753.726042000024 @@ -8574,7 +8574,7 @@ "model": "claude-haiku-4-5", "expected": "delivered", "actual": "delivered", - "correct": true, + "isCorrect": true, "inputTokens": 8384, "outputTokens": 4, "latencyMs": 1134.558416999993 @@ -8585,7 +8585,7 @@ "model": "gpt-5-nano", "expected": "Valerie Braun", "actual": "Valerie Braun", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 73, "latencyMs": 2494.451874999999 @@ -8596,7 +8596,7 @@ "model": "claude-haiku-4-5", "expected": "Valerie Braun", "actual": "Valerie Braun", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 9, "latencyMs": 1270.5290410000016 @@ -8607,7 +8607,7 @@ "model": "gpt-5-nano", "expected": "Valerie Braun", "actual": "Valerie Braun", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 137, "latencyMs": 2403.4134579999954 @@ -8618,7 +8618,7 @@ "model": "claude-haiku-4-5", "expected": "Valerie Braun", "actual": "Valerie Braun", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 9, "latencyMs": 1673.0169579999929 @@ -8629,7 +8629,7 @@ "model": "gpt-5-nano", "expected": "Valerie Braun", "actual": "Valerie Braun", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 73, "latencyMs": 1704.8420409999962 @@ -8640,7 +8640,7 @@ "model": "claude-haiku-4-5", "expected": "Valerie Braun", "actual": "Valerie Braun", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 9, "latencyMs": 1447.5210840000072 @@ -8651,7 +8651,7 @@ "model": "gpt-5-nano", "expected": "Valerie Braun", "actual": "Valerie Braun", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 73, "latencyMs": 1638.756207999977 @@ -8662,7 +8662,7 @@ "model": "claude-haiku-4-5", "expected": "Valerie Braun", "actual": "Valerie Braun", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 9, "latencyMs": 1504.7892920000013 @@ -8673,7 +8673,7 @@ "model": "gpt-5-nano", "expected": "Valerie Braun", "actual": "Valerie Braun", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 137, "latencyMs": 2409.509625000006 @@ -8684,7 +8684,7 @@ "model": "claude-haiku-4-5", "expected": "Valerie Braun", "actual": "Valerie Braun", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 9, "latencyMs": 1318.699833999999 @@ -8695,7 +8695,7 @@ "model": "gpt-5-nano", "expected": "Anita Kozey", "actual": "Anita Kozey", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 138, "latencyMs": 2616.233749999985 @@ -8706,7 +8706,7 @@ "model": "claude-haiku-4-5", "expected": "Anita Kozey", "actual": "Anita Kozey", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 9, "latencyMs": 1314.3836249999877 @@ -8717,7 +8717,7 @@ "model": "gpt-5-nano", "expected": "Anita Kozey", "actual": "Anita Kozey", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 138, "latencyMs": 2722.7087499999907 @@ -8728,7 +8728,7 @@ "model": "claude-haiku-4-5", "expected": "Anita Kozey", "actual": "Anita Kozey", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 9, "latencyMs": 1190.632500000007 @@ -8739,7 +8739,7 @@ "model": "gpt-5-nano", "expected": "Anita Kozey", "actual": "Anita Kozey", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 330, "latencyMs": 4346.388291999989 @@ -8750,7 +8750,7 @@ "model": "claude-haiku-4-5", "expected": "Anita Kozey", "actual": "Anita Kozey", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 9, "latencyMs": 1327.8158750000002 @@ -8761,7 +8761,7 @@ "model": "gpt-5-nano", "expected": "Anita Kozey", "actual": "Anita Kozey", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 74, "latencyMs": 2443.0598340000142 @@ -8772,7 +8772,7 @@ "model": "claude-haiku-4-5", "expected": "Anita Kozey", "actual": "Anita Kozey", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 9, "latencyMs": 1396.4260829999985 @@ -8783,7 +8783,7 @@ "model": "gpt-5-nano", "expected": "Anita Kozey", "actual": "Anita Kozey", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 266, "latencyMs": 4886.8007919999945 @@ -8794,7 +8794,7 @@ "model": "claude-haiku-4-5", "expected": "Anita Kozey", "actual": "Anita Kozey", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 9, "latencyMs": 1469.287249999994 @@ -8805,7 +8805,7 @@ "model": "gpt-5-nano", "expected": "Elmer Kub PhD", "actual": "Elmer Kub PhD", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 139, "latencyMs": 2891.1199170000036 @@ -8816,7 +8816,7 @@ "model": "claude-haiku-4-5", "expected": "Elmer Kub PhD", "actual": "Elmer Kub PhD", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 10, "latencyMs": 1342.1902079999854 @@ -8827,7 +8827,7 @@ "model": "gpt-5-nano", "expected": "Elmer Kub PhD", "actual": "Elmer Kub PhD", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 139, "latencyMs": 2846.046624999988 @@ -8838,7 +8838,7 @@ "model": "claude-haiku-4-5", "expected": "Elmer Kub PhD", "actual": "Elmer Kub PhD", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 10, "latencyMs": 1327.919499999989 @@ -8849,7 +8849,7 @@ "model": "gpt-5-nano", "expected": "Elmer Kub PhD", "actual": "Elmer Kub PhD", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 139, "latencyMs": 4302.444041999988 @@ -8860,7 +8860,7 @@ "model": "claude-haiku-4-5", "expected": "Elmer Kub PhD", "actual": "Elmer Kub PhD", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 10, "latencyMs": 1207.6207500000019 @@ -8871,7 +8871,7 @@ "model": "gpt-5-nano", "expected": "Elmer Kub PhD", "actual": "Elmer Kub PhD", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 267, "latencyMs": 3389.5046659999934 @@ -8882,7 +8882,7 @@ "model": "claude-haiku-4-5", "expected": "Elmer Kub PhD", "actual": "Elmer Kub PhD", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 10, "latencyMs": 1236.2248340000224 @@ -8893,7 +8893,7 @@ "model": "gpt-5-nano", "expected": "Elmer Kub PhD", "actual": "Elmer Kub PhD", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 139, "latencyMs": 2138.4831669999985 @@ -8904,7 +8904,7 @@ "model": "claude-haiku-4-5", "expected": "Elmer Kub PhD", "actual": "Elmer Kub PhD", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 10, "latencyMs": 1233.3828330000106 @@ -8915,7 +8915,7 @@ "model": "gpt-5-nano", "expected": "Maxine Zemlak", "actual": "Maxine Zemlak", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 138, "latencyMs": 3346.8621669999848 @@ -8926,7 +8926,7 @@ "model": "claude-haiku-4-5", "expected": "Maxine Zemlak", "actual": "Maxine Zemlak", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 10, "latencyMs": 1321.650082999986 @@ -8937,7 +8937,7 @@ "model": "gpt-5-nano", "expected": "Maxine Zemlak", "actual": "Maxine Zemlak", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 138, "latencyMs": 2395.766499999998 @@ -8948,7 +8948,7 @@ "model": "claude-haiku-4-5", "expected": "Maxine Zemlak", "actual": "Maxine Zemlak", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 10, "latencyMs": 1749.51670800001 @@ -8959,7 +8959,7 @@ "model": "gpt-5-nano", "expected": "Maxine Zemlak", "actual": "Maxine Zemlak", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 330, "latencyMs": 4207.4487500000105 @@ -8970,7 +8970,7 @@ "model": "claude-haiku-4-5", "expected": "Maxine Zemlak", "actual": "Maxine Zemlak", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 10, "latencyMs": 1495.846125000011 @@ -8981,7 +8981,7 @@ "model": "gpt-5-nano", "expected": "Maxine Zemlak", "actual": "Maxine Zemlak", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 266, "latencyMs": 4258.881374999997 @@ -8992,7 +8992,7 @@ "model": "claude-haiku-4-5", "expected": "Maxine Zemlak", "actual": "Maxine Zemlak", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 10, "latencyMs": 1113.9782499999856 @@ -9003,7 +9003,7 @@ "model": "gpt-5-nano", "expected": "Maxine Zemlak", "actual": "Maxine Zemlak", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 74, "latencyMs": 1841.1115829999908 @@ -9014,7 +9014,7 @@ "model": "claude-haiku-4-5", "expected": "Maxine Zemlak", "actual": "Maxine Zemlak", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 10, "latencyMs": 1350.6631249999919 @@ -9025,7 +9025,7 @@ "model": "gpt-5-nano", "expected": "Emanuel Littel", "actual": "Emanuel Littel", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 138, "latencyMs": 2322.9531669999997 @@ -9036,7 +9036,7 @@ "model": "claude-haiku-4-5", "expected": "Emanuel Littel", "actual": "Emanuel Littel", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 7, "latencyMs": 1556.4763749999984 @@ -9047,7 +9047,7 @@ "model": "gpt-5-nano", "expected": "Emanuel Littel", "actual": "Emanuel Littel", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 74, "latencyMs": 2354.004667000001 @@ -9058,7 +9058,7 @@ "model": "claude-haiku-4-5", "expected": "Emanuel Littel", "actual": "Emanuel Littel", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 7, "latencyMs": 1314.1952909999818 @@ -9069,7 +9069,7 @@ "model": "gpt-5-nano", "expected": "Emanuel Littel", "actual": "Emanuel Littel", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 138, "latencyMs": 3437.8392080000194 @@ -9080,7 +9080,7 @@ "model": "claude-haiku-4-5", "expected": "Emanuel Littel", "actual": "Emanuel Littel", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 7, "latencyMs": 1131.0356249999895 @@ -9091,7 +9091,7 @@ "model": "gpt-5-nano", "expected": "Emanuel Littel", "actual": "Emanuel Littel", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 138, "latencyMs": 3209.646000000008 @@ -9102,7 +9102,7 @@ "model": "claude-haiku-4-5", "expected": "Emanuel Littel", "actual": "Emanuel Littel", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 7, "latencyMs": 1175.6475829999836 @@ -9113,7 +9113,7 @@ "model": "gpt-5-nano", "expected": "Emanuel Littel", "actual": "Emanuel Littel", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 266, "latencyMs": 3785.0792920000094 @@ -9124,7 +9124,7 @@ "model": "claude-haiku-4-5", "expected": "Emanuel Littel", "actual": "Emanuel Littel", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 7, "latencyMs": 1314.7905420000025 @@ -9135,7 +9135,7 @@ "model": "gpt-5-nano", "expected": "Andrew Kling", "actual": "Andrew Kling", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 72, "latencyMs": 2562.896166999999 @@ -9146,7 +9146,7 @@ "model": "claude-haiku-4-5", "expected": "Andrew Kling", "actual": "Andrew Kling", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 7, "latencyMs": 3205.178583000001 @@ -9157,7 +9157,7 @@ "model": "gpt-5-nano", "expected": "Andrew Kling", "actual": "Andrew Kling", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 136, "latencyMs": 3746.9874170000257 @@ -9168,7 +9168,7 @@ "model": "claude-haiku-4-5", "expected": "Andrew Kling", "actual": "Andrew Kling", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 7, "latencyMs": 1159.280584000022 @@ -9179,7 +9179,7 @@ "model": "gpt-5-nano", "expected": "Andrew Kling", "actual": "Marvin Thiel", - "correct": false, + "isCorrect": false, "inputTokens": 6781, "outputTokens": 202, "latencyMs": 2584.499542000005 @@ -9190,7 +9190,7 @@ "model": "claude-haiku-4-5", "expected": "Andrew Kling", "actual": "Andrew Kling", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 7, "latencyMs": 1249.9375 @@ -9201,7 +9201,7 @@ "model": "gpt-5-nano", "expected": "Andrew Kling", "actual": "Andrew Kling", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 136, "latencyMs": 2068.6956669999927 @@ -9212,7 +9212,7 @@ "model": "claude-haiku-4-5", "expected": "Andrew Kling", "actual": "Andrew Kling", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 7, "latencyMs": 1733.235834000021 @@ -9223,7 +9223,7 @@ "model": "gpt-5-nano", "expected": "Andrew Kling", "actual": "Andrew Kling", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 200, "latencyMs": 3831.721124999982 @@ -9234,7 +9234,7 @@ "model": "claude-haiku-4-5", "expected": "Andrew Kling", "actual": "Andrew Kling", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 7, "latencyMs": 1311.1745419999934 @@ -9245,7 +9245,7 @@ "model": "gpt-5-nano", "expected": "Morris O'Hara", "actual": "Morris O'Hara", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 139, "latencyMs": 5464.460791999998 @@ -9256,7 +9256,7 @@ "model": "claude-haiku-4-5", "expected": "Morris O'Hara", "actual": "Morris O'Hara", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 9, "latencyMs": 1266.8881249999977 @@ -9267,7 +9267,7 @@ "model": "gpt-5-nano", "expected": "Morris O'Hara", "actual": "Morris O'Hara", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 203, "latencyMs": 2957.0821250000154 @@ -9278,7 +9278,7 @@ "model": "claude-haiku-4-5", "expected": "Morris O'Hara", "actual": "Morris O'Hara", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 9, "latencyMs": 1264.50791700001 @@ -9289,7 +9289,7 @@ "model": "gpt-5-nano", "expected": "Morris O'Hara", "actual": "Morris O'Hara", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 331, "latencyMs": 3740.643666000018 @@ -9300,7 +9300,7 @@ "model": "claude-haiku-4-5", "expected": "Morris O'Hara", "actual": "Morris O'Hara", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 9, "latencyMs": 1310.5358749999723 @@ -9311,7 +9311,7 @@ "model": "gpt-5-nano", "expected": "Morris O'Hara", "actual": "Morris O'Hara", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 139, "latencyMs": 2979.4539579999982 @@ -9322,7 +9322,7 @@ "model": "claude-haiku-4-5", "expected": "Morris O'Hara", "actual": "Morris O'Hara", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 9, "latencyMs": 2026.8683329999913 @@ -9333,7 +9333,7 @@ "model": "gpt-5-nano", "expected": "Morris O'Hara", "actual": "Morris O'Hara", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 139, "latencyMs": 2932.0294159999758 @@ -9344,7 +9344,7 @@ "model": "claude-haiku-4-5", "expected": "Morris O'Hara", "actual": "Morris O'Hara", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 9, "latencyMs": 1130.2447079999838 @@ -9355,7 +9355,7 @@ "model": "gpt-5-nano", "expected": "Elijah Franecki", "actual": "Elijah Franecki", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 203, "latencyMs": 2576.945458000002 @@ -9366,7 +9366,7 @@ "model": "claude-haiku-4-5", "expected": "Elijah Franecki", "actual": "Elijah Franecki", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 9, "latencyMs": 1214.6620409999741 @@ -9377,7 +9377,7 @@ "model": "gpt-5-nano", "expected": "Elijah Franecki", "actual": "Elijah Franecki", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 203, "latencyMs": 3718.371167000005 @@ -9388,7 +9388,7 @@ "model": "claude-haiku-4-5", "expected": "Elijah Franecki", "actual": "Elijah Franecki", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 9, "latencyMs": 1374.984832999995 @@ -9399,7 +9399,7 @@ "model": "gpt-5-nano", "expected": "Elijah Franecki", "actual": "Elijah Franecki", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 139, "latencyMs": 2313.5867499999877 @@ -9410,7 +9410,7 @@ "model": "claude-haiku-4-5", "expected": "Elijah Franecki", "actual": "Elijah Franecki", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 9, "latencyMs": 1325.0793330000015 @@ -9421,7 +9421,7 @@ "model": "gpt-5-nano", "expected": "Elijah Franecki", "actual": "Elijah Franecki", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 139, "latencyMs": 2777.8669999999984 @@ -9432,7 +9432,7 @@ "model": "claude-haiku-4-5", "expected": "Elijah Franecki", "actual": "Elijah Franecki", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 9, "latencyMs": 1246.2134589999914 @@ -9443,7 +9443,7 @@ "model": "gpt-5-nano", "expected": "Elijah Franecki", "actual": "Elijah Franecki", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 75, "latencyMs": 2246.8254580000066 @@ -9454,7 +9454,7 @@ "model": "claude-haiku-4-5", "expected": "Elijah Franecki", "actual": "Elijah Franecki", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 9, "latencyMs": 1573.5733749999781 @@ -9465,7 +9465,7 @@ "model": "gpt-5-nano", "expected": "Malcolm Erdman", "actual": "Malcolm Erdman", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 74, "latencyMs": 2494.7630000000063 @@ -9476,7 +9476,7 @@ "model": "claude-haiku-4-5", "expected": "Malcolm Erdman", "actual": "Malcolm Erdman", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 7, "latencyMs": 1135.412083000003 @@ -9487,7 +9487,7 @@ "model": "gpt-5-nano", "expected": "Malcolm Erdman", "actual": "Malcolm Erdman", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 138, "latencyMs": 2332.6303330000082 @@ -9498,7 +9498,7 @@ "model": "claude-haiku-4-5", "expected": "Malcolm Erdman", "actual": "Malcolm Erdman", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 7, "latencyMs": 1175.6766249999928 @@ -9509,7 +9509,7 @@ "model": "gpt-5-nano", "expected": "Malcolm Erdman", "actual": "Malcolm Erdman", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 458, "latencyMs": 4252.623416000017 @@ -9520,7 +9520,7 @@ "model": "claude-haiku-4-5", "expected": "Malcolm Erdman", "actual": "Malcolm Erdman", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 7, "latencyMs": 1297.546416999976 @@ -9531,7 +9531,7 @@ "model": "gpt-5-nano", "expected": "Malcolm Erdman", "actual": "Malcolm Erdman", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 74, "latencyMs": 2264.2770829999936 @@ -9542,7 +9542,7 @@ "model": "claude-haiku-4-5", "expected": "Malcolm Erdman", "actual": "Malcolm Erdman", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 7, "latencyMs": 1055.0764170000039 @@ -9553,7 +9553,7 @@ "model": "gpt-5-nano", "expected": "Malcolm Erdman", "actual": "Malcolm Erdman", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 138, "latencyMs": 3193.2753749999974 @@ -9564,7 +9564,7 @@ "model": "claude-haiku-4-5", "expected": "Malcolm Erdman", "actual": "Malcolm Erdman", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 7, "latencyMs": 1912.7229999999981 @@ -9575,7 +9575,7 @@ "model": "gpt-5-nano", "expected": "Fannie Skiles", "actual": "Fannie Skiles", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 138, "latencyMs": 2147.5894160000025 @@ -9586,7 +9586,7 @@ "model": "claude-haiku-4-5", "expected": "Fannie Skiles", "actual": "Fannie Skiles", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 9, "latencyMs": 1377.5190409999923 @@ -9597,7 +9597,7 @@ "model": "gpt-5-nano", "expected": "Fannie Skiles", "actual": "Fannie Skiles", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 202, "latencyMs": 4472.317459000013 @@ -9608,7 +9608,7 @@ "model": "claude-haiku-4-5", "expected": "Fannie Skiles", "actual": "Fannie Skiles", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 9, "latencyMs": 1376.0682919999817 @@ -9619,7 +9619,7 @@ "model": "gpt-5-nano", "expected": "Fannie Skiles", "actual": "Fannie Skiles", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 202, "latencyMs": 6952.122459000006 @@ -9630,7 +9630,7 @@ "model": "claude-haiku-4-5", "expected": "Fannie Skiles", "actual": "Fannie Skiles", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 9, "latencyMs": 1178.8732909999962 @@ -9641,7 +9641,7 @@ "model": "gpt-5-nano", "expected": "Fannie Skiles", "actual": "Fannie Skiles", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 266, "latencyMs": 3619.214917000005 @@ -9652,7 +9652,7 @@ "model": "claude-haiku-4-5", "expected": "Fannie Skiles", "actual": "Fannie Skiles", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 9, "latencyMs": 1212.3732920000039 @@ -9663,7 +9663,7 @@ "model": "gpt-5-nano", "expected": "Fannie Skiles", "actual": "Fannie Skiles", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 202, "latencyMs": 5169.327332999994 @@ -9674,7 +9674,7 @@ "model": "claude-haiku-4-5", "expected": "Fannie Skiles", "actual": "Fannie Skiles", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 9, "latencyMs": 1452.6941670000087 @@ -9685,7 +9685,7 @@ "model": "gpt-5-nano", "expected": "Sonja Emmerich", "actual": "Sonja Emmerich", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 395, "latencyMs": 3384.798125000001 @@ -9696,7 +9696,7 @@ "model": "claude-haiku-4-5", "expected": "Sonja Emmerich", "actual": "Sonja Emmerich", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 10, "latencyMs": 1241.960665999999 @@ -9707,7 +9707,7 @@ "model": "gpt-5-nano", "expected": "Sonja Emmerich", "actual": "Sonja Emmerich", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 331, "latencyMs": 4747.914124999981 @@ -9718,7 +9718,7 @@ "model": "claude-haiku-4-5", "expected": "Sonja Emmerich", "actual": "Sonja Emmerich", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 10, "latencyMs": 1302.8907080000208 @@ -9729,7 +9729,7 @@ "model": "gpt-5-nano", "expected": "Sonja Emmerich", "actual": "Sonja Emmerich", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 331, "latencyMs": 3532.4660830000066 @@ -9740,7 +9740,7 @@ "model": "claude-haiku-4-5", "expected": "Sonja Emmerich", "actual": "Sonja Emmerich", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 10, "latencyMs": 1203.086540999997 @@ -9751,7 +9751,7 @@ "model": "gpt-5-nano", "expected": "Sonja Emmerich", "actual": "Sonja Emmerich", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 331, "latencyMs": 4074.5077089999977 @@ -9762,7 +9762,7 @@ "model": "claude-haiku-4-5", "expected": "Sonja Emmerich", "actual": "Sonja Emmerich", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 10, "latencyMs": 1345.891499999998 @@ -9773,7 +9773,7 @@ "model": "gpt-5-nano", "expected": "Sonja Emmerich", "actual": "Sonja Emmerich", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 75, "latencyMs": 1885.0838330000115 @@ -9784,7 +9784,7 @@ "model": "claude-haiku-4-5", "expected": "Sonja Emmerich", "actual": "Sonja Emmerich", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 10, "latencyMs": 1182.5891669999983 @@ -9795,7 +9795,7 @@ "model": "gpt-5-nano", "expected": "Frank Emmerich DVM", "actual": "Frank Emmerich DVM", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 140, "latencyMs": 2772.3258339999884 @@ -9806,7 +9806,7 @@ "model": "claude-haiku-4-5", "expected": "Frank Emmerich DVM", "actual": "Frank Emmerich DVM", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 10, "latencyMs": 1424.9674579999992 @@ -9817,7 +9817,7 @@ "model": "gpt-5-nano", "expected": "Frank Emmerich DVM", "actual": "Frank Emmerich DVM", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 204, "latencyMs": 2900.4731660000107 @@ -9828,7 +9828,7 @@ "model": "claude-haiku-4-5", "expected": "Frank Emmerich DVM", "actual": "Frank Emmerich DVM", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 10, "latencyMs": 2815.817249999993 @@ -9839,7 +9839,7 @@ "model": "gpt-5-nano", "expected": "Frank Emmerich DVM", "actual": "Frank Emmerich DVM", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 268, "latencyMs": 3637.2442089999968 @@ -9850,7 +9850,7 @@ "model": "claude-haiku-4-5", "expected": "Frank Emmerich DVM", "actual": "Frank Emmerich DVM", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 10, "latencyMs": 1104.2333339999896 @@ -9861,7 +9861,7 @@ "model": "gpt-5-nano", "expected": "Frank Emmerich DVM", "actual": "Frank Emmerich DVM", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 396, "latencyMs": 8213.703791999986 @@ -9872,7 +9872,7 @@ "model": "claude-haiku-4-5", "expected": "Frank Emmerich DVM", "actual": "Frank Emmerich DVM", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 10, "latencyMs": 2875.9923749999725 @@ -9883,7 +9883,7 @@ "model": "gpt-5-nano", "expected": "Frank Emmerich DVM", "actual": "Frank Emmerich DVM", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 140, "latencyMs": 2809.8342080000148 @@ -9894,7 +9894,7 @@ "model": "claude-haiku-4-5", "expected": "Frank Emmerich DVM", "actual": "Frank Emmerich DVM", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 10, "latencyMs": 1306.0824999999895 @@ -9905,7 +9905,7 @@ "model": "gpt-5-nano", "expected": "Ronald Collins", "actual": "Ronald Collins", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 265, "latencyMs": 3632.680000000022 @@ -9916,7 +9916,7 @@ "model": "claude-haiku-4-5", "expected": "Ronald Collins", "actual": "Ronald Collins", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 5, "latencyMs": 1446.0535420000087 @@ -9927,7 +9927,7 @@ "model": "gpt-5-nano", "expected": "Ronald Collins", "actual": "Ronald Collins", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 201, "latencyMs": 2629.6447500000068 @@ -9938,7 +9938,7 @@ "model": "claude-haiku-4-5", "expected": "Ronald Collins", "actual": "Ronald Collins", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 5, "latencyMs": 1387.298958999978 @@ -9949,7 +9949,7 @@ "model": "gpt-5-nano", "expected": "Ronald Collins", "actual": "Ronald Collins", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 457, "latencyMs": 8303.644042 @@ -9960,7 +9960,7 @@ "model": "claude-haiku-4-5", "expected": "Ronald Collins", "actual": "Ronald Collins", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 5, "latencyMs": 1178.2771250000224 @@ -9971,7 +9971,7 @@ "model": "gpt-5-nano", "expected": "Ronald Collins", "actual": "Ronald Collins", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 329, "latencyMs": 3967.7135410000046 @@ -9982,7 +9982,7 @@ "model": "claude-haiku-4-5", "expected": "Ronald Collins", "actual": "Ronald Collins", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 5, "latencyMs": 1278.0479160000104 @@ -9993,7 +9993,7 @@ "model": "gpt-5-nano", "expected": "Ronald Collins", "actual": "Ronald Collins", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 73, "latencyMs": 1974.7658750000119 @@ -10004,7 +10004,7 @@ "model": "claude-haiku-4-5", "expected": "Ronald Collins", "actual": "Ronald Collins", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 5, "latencyMs": 1496.9746670000022 @@ -10015,7 +10015,7 @@ "model": "gpt-5-nano", "expected": "Jeannie Klein", "actual": "Jeannie Klein", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 201, "latencyMs": 4246.4962499999965 @@ -10026,7 +10026,7 @@ "model": "claude-haiku-4-5", "expected": "Jeannie Klein", "actual": "Jeannie Klein", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 8, "latencyMs": 1322.2766660000198 @@ -10037,7 +10037,7 @@ "model": "gpt-5-nano", "expected": "Jeannie Klein", "actual": "Jeannie Klein", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 137, "latencyMs": 2135.097083999979 @@ -10048,7 +10048,7 @@ "model": "claude-haiku-4-5", "expected": "Jeannie Klein", "actual": "Jeannie Klein", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 8, "latencyMs": 1213.9765000000189 @@ -10059,7 +10059,7 @@ "model": "gpt-5-nano", "expected": "Jeannie Klein", "actual": "Jeannie Klein", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 265, "latencyMs": 3583.0762920000125 @@ -10070,7 +10070,7 @@ "model": "claude-haiku-4-5", "expected": "Jeannie Klein", "actual": "Jeannie Klein", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 8, "latencyMs": 1353.168249999988 @@ -10081,7 +10081,7 @@ "model": "gpt-5-nano", "expected": "Jeannie Klein", "actual": "Jeannie Klein", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 201, "latencyMs": 3724.366249999992 @@ -10092,7 +10092,7 @@ "model": "claude-haiku-4-5", "expected": "Jeannie Klein", "actual": "Jeannie Klein", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 8, "latencyMs": 1239.5215000000026 @@ -10103,7 +10103,7 @@ "model": "gpt-5-nano", "expected": "Jeannie Klein", "actual": "Jeannie Klein", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 137, "latencyMs": 2863.772667000012 @@ -10114,7 +10114,7 @@ "model": "claude-haiku-4-5", "expected": "Jeannie Klein", "actual": "Jeannie Klein", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 8, "latencyMs": 1297.5507919999945 @@ -10125,7 +10125,7 @@ "model": "gpt-5-nano", "expected": "Joshua Watsica", "actual": "Joshua Watsica", - "correct": true, + "isCorrect": true, "inputTokens": 9739, "outputTokens": 202, "latencyMs": 2533.5459160000028 @@ -10136,7 +10136,7 @@ "model": "claude-haiku-4-5", "expected": "Joshua Watsica", "actual": "Joshua Watsica", - "correct": true, + "isCorrect": true, "inputTokens": 11907, "outputTokens": 8, "latencyMs": 1313.4649999999965 @@ -10147,7 +10147,7 @@ "model": "gpt-5-nano", "expected": "Joshua Watsica", "actual": "Joshua Watsica", - "correct": true, + "isCorrect": true, "inputTokens": 6013, "outputTokens": 74, "latencyMs": 1609.448166999995 @@ -10158,7 +10158,7 @@ "model": "claude-haiku-4-5", "expected": "Joshua Watsica", "actual": "Joshua Watsica", - "correct": true, + "isCorrect": true, "inputTokens": 6993, "outputTokens": 8, "latencyMs": 1257.2229999999981 @@ -10169,7 +10169,7 @@ "model": "gpt-5-nano", "expected": "Joshua Watsica", "actual": "Joshua Watsica", - "correct": true, + "isCorrect": true, "inputTokens": 6781, "outputTokens": 458, "latencyMs": 5294.154332999984 @@ -10180,7 +10180,7 @@ "model": "claude-haiku-4-5", "expected": "Joshua Watsica", "actual": "Joshua Watsica", - "correct": true, + "isCorrect": true, "inputTokens": 8414, "outputTokens": 8, "latencyMs": 1363.172208999982 @@ -10191,7 +10191,7 @@ "model": "gpt-5-nano", "expected": "Joshua Watsica", "actual": "Joshua Watsica", - "correct": true, + "isCorrect": true, "inputTokens": 9158, "outputTokens": 74, "latencyMs": 2154.742499999993 @@ -10202,7 +10202,7 @@ "model": "claude-haiku-4-5", "expected": "Joshua Watsica", "actual": "Joshua Watsica", - "correct": true, + "isCorrect": true, "inputTokens": 9289, "outputTokens": 8, "latencyMs": 1509.8229580000043 @@ -10213,7 +10213,7 @@ "model": "gpt-5-nano", "expected": "Joshua Watsica", "actual": "Joshua Watsica", - "correct": true, + "isCorrect": true, "inputTokens": 7373, "outputTokens": 74, "latencyMs": 2010.5185419999762 @@ -10224,7 +10224,7 @@ "model": "claude-haiku-4-5", "expected": "Joshua Watsica", "actual": "Joshua Watsica", - "correct": true, + "isCorrect": true, "inputTokens": 8385, "outputTokens": 8, "latencyMs": 1193.5151659999974 @@ -10235,7 +10235,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 9735, "outputTokens": 1031, "latencyMs": 9550.510582999996 @@ -10246,7 +10246,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 11902, "outputTokens": 5, "latencyMs": 1146.0822499999776 @@ -10257,7 +10257,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 6009, "outputTokens": 775, "latencyMs": 6479.700542000006 @@ -10268,7 +10268,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 6988, "outputTokens": 5, "latencyMs": 1329.610708000022 @@ -10279,7 +10279,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 6777, "outputTokens": 967, "latencyMs": 15240.216207999998 @@ -10290,7 +10290,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 8409, "outputTokens": 5, "latencyMs": 1203.151125000004 @@ -10301,7 +10301,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 9154, "outputTokens": 583, "latencyMs": 6073.186583000002 @@ -10312,7 +10312,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 9284, "outputTokens": 5, "latencyMs": 1452.6655419999734 @@ -10323,7 +10323,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 7369, "outputTokens": 647, "latencyMs": 7084.941665999999 @@ -10334,7 +10334,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 8380, "outputTokens": 5, "latencyMs": 1120.7099159999925 @@ -10345,7 +10345,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 9735, "outputTokens": 903, "latencyMs": 8906.334791000001 @@ -10356,7 +10356,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 11902, "outputTokens": 5, "latencyMs": 1109.434333000012 @@ -10367,7 +10367,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 6009, "outputTokens": 391, "latencyMs": 4955.000415999995 @@ -10378,7 +10378,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "7", - "correct": false, + "isCorrect": false, "inputTokens": 6988, "outputTokens": 5, "latencyMs": 1040.817624999996 @@ -10389,7 +10389,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 6777, "outputTokens": 775, "latencyMs": 8308.952791000018 @@ -10400,7 +10400,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 8409, "outputTokens": 5, "latencyMs": 1128.542833000014 @@ -10411,7 +10411,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 9154, "outputTokens": 775, "latencyMs": 7118.855291000014 @@ -10422,7 +10422,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 9284, "outputTokens": 5, "latencyMs": 1232.1081249999988 @@ -10433,7 +10433,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 7369, "outputTokens": 647, "latencyMs": 6776.706208000018 @@ -10444,7 +10444,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 8380, "outputTokens": 5, "latencyMs": 1677.1033330000064 @@ -10455,7 +10455,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 9736, "outputTokens": 583, "latencyMs": 5866.636624999985 @@ -10466,7 +10466,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 11902, "outputTokens": 5, "latencyMs": 1574.224125000008 @@ -10477,7 +10477,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 6010, "outputTokens": 711, "latencyMs": 7998.43637499999 @@ -10488,7 +10488,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "7", - "correct": false, + "isCorrect": false, "inputTokens": 6988, "outputTokens": 5, "latencyMs": 1175.3050419999927 @@ -10499,7 +10499,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 6778, "outputTokens": 647, "latencyMs": 6424.974583000003 @@ -10510,7 +10510,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 8409, "outputTokens": 5, "latencyMs": 1352.1832500000019 @@ -10521,7 +10521,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 9155, "outputTokens": 647, "latencyMs": 6132.921792000008 @@ -10532,7 +10532,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 9284, "outputTokens": 5, "latencyMs": 1241.7496250000258 @@ -10543,7 +10543,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 7370, "outputTokens": 455, "latencyMs": 8074.935457999993 @@ -10554,7 +10554,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "7", - "correct": false, + "isCorrect": false, "inputTokens": 8380, "outputTokens": 5, "latencyMs": 1294.4225830000069 @@ -10565,7 +10565,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 9736, "outputTokens": 775, "latencyMs": 7724.665375000011 @@ -10576,7 +10576,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 11902, "outputTokens": 5, "latencyMs": 1450.864333000005 @@ -10587,7 +10587,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 6010, "outputTokens": 711, "latencyMs": 5055.026333999995 @@ -10598,7 +10598,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 6988, "outputTokens": 5, "latencyMs": 1177.2059999999765 @@ -10609,7 +10609,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 6778, "outputTokens": 839, "latencyMs": 7951.241416999983 @@ -10620,7 +10620,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 8409, "outputTokens": 5, "latencyMs": 1537.2077500000014 @@ -10631,7 +10631,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 9155, "outputTokens": 519, "latencyMs": 9752.917709000001 @@ -10642,7 +10642,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 9284, "outputTokens": 5, "latencyMs": 1101.1202090000152 @@ -10653,7 +10653,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 7370, "outputTokens": 647, "latencyMs": 5711.038375000004 @@ -10664,7 +10664,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 8380, "outputTokens": 5, "latencyMs": 1208.3837910000002 @@ -10675,7 +10675,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 9736, "outputTokens": 775, "latencyMs": 6578.005040999997 @@ -10686,7 +10686,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 11902, "outputTokens": 5, "latencyMs": 1351.4712499999732 @@ -10697,7 +10697,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 6010, "outputTokens": 583, "latencyMs": 6437.821874999994 @@ -10708,7 +10708,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 6988, "outputTokens": 5, "latencyMs": 1155.7898750000168 @@ -10719,7 +10719,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 6778, "outputTokens": 647, "latencyMs": 6673.183250000002 @@ -10730,7 +10730,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 8409, "outputTokens": 5, "latencyMs": 1359.994417000009 @@ -10741,7 +10741,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 9155, "outputTokens": 647, "latencyMs": 5806.33679099998 @@ -10752,7 +10752,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 9284, "outputTokens": 5, "latencyMs": 1339.4869999999937 @@ -10763,7 +10763,7 @@ "model": "gpt-5-nano", "expected": "10", "actual": "10", - "correct": true, + "isCorrect": true, "inputTokens": 7370, "outputTokens": 519, "latencyMs": 6011.0411669999885 @@ -10774,7 +10774,7 @@ "model": "claude-haiku-4-5", "expected": "10", "actual": "8", - "correct": false, + "isCorrect": false, "inputTokens": 8380, "outputTokens": 5, "latencyMs": 1305.6029999999737 @@ -10785,7 +10785,7 @@ "model": "gpt-5-nano", "expected": "42342.25", "actual": "41001.14", - "correct": false, + "isCorrect": false, "inputTokens": 9736, "outputTokens": 1226, "latencyMs": 11276.714458000002 @@ -10796,7 +10796,7 @@ "model": "claude-haiku-4-5", "expected": "42342.25", "actual": "48,847.66", - "correct": false, + "isCorrect": false, "inputTokens": 11902, "outputTokens": 9, "latencyMs": 1400.5162910000072 @@ -10807,7 +10807,7 @@ "model": "gpt-5-nano", "expected": "42342.25", "actual": "42342.25", - "correct": true, + "isCorrect": true, "inputTokens": 6010, "outputTokens": 5962, "latencyMs": 50971.727667 @@ -10818,7 +10818,7 @@ "model": "claude-haiku-4-5", "expected": "42342.25", "actual": "41,847.47", - "correct": false, + "isCorrect": false, "inputTokens": 6988, "outputTokens": 9, "latencyMs": 1118.9986250000075 @@ -10829,7 +10829,7 @@ "model": "gpt-5-nano", "expected": "42342.25", "actual": "42342.25", - "correct": true, + "isCorrect": true, "inputTokens": 6778, "outputTokens": 3082, "latencyMs": 22816.508165999985 @@ -10840,7 +10840,7 @@ "model": "claude-haiku-4-5", "expected": "42342.25", "actual": "48,847.47", - "correct": false, + "isCorrect": false, "inputTokens": 8409, "outputTokens": 9, "latencyMs": 1104.31912499998 @@ -10851,7 +10851,7 @@ "model": "gpt-5-nano", "expected": "42342.25", "actual": "42425.97", - "correct": false, + "isCorrect": false, "inputTokens": 9155, "outputTokens": 2762, "latencyMs": 17412.623583000008 @@ -10862,7 +10862,7 @@ "model": "claude-haiku-4-5", "expected": "42342.25", "actual": "47,847.47", - "correct": false, + "isCorrect": false, "inputTokens": 9284, "outputTokens": 9, "latencyMs": 1435.553082999977 @@ -10873,7 +10873,7 @@ "model": "gpt-5-nano", "expected": "42342.25", "actual": "42342.25", - "correct": true, + "isCorrect": true, "inputTokens": 7370, "outputTokens": 3402, "latencyMs": 26299.00112500001 @@ -10884,7 +10884,7 @@ "model": "claude-haiku-4-5", "expected": "42342.25", "actual": "41,847.47", - "correct": false, + "isCorrect": false, "inputTokens": 8380, "outputTokens": 9, "latencyMs": 1272.4541250000184 @@ -10895,7 +10895,7 @@ "model": "gpt-5-nano", "expected": "44", "actual": "44", - "correct": true, + "isCorrect": true, "inputTokens": 9738, "outputTokens": 1351, "latencyMs": 13461.932250000013 @@ -10906,7 +10906,7 @@ "model": "claude-haiku-4-5", "expected": "44", "actual": "48", - "correct": false, + "isCorrect": false, "inputTokens": 11904, "outputTokens": 5, "latencyMs": 1772.9891250000219 @@ -10917,7 +10917,7 @@ "model": "gpt-5-nano", "expected": "44", "actual": "44", - "correct": true, + "isCorrect": true, "inputTokens": 6012, "outputTokens": 1735, "latencyMs": 14196.807250000013 @@ -10928,7 +10928,7 @@ "model": "claude-haiku-4-5", "expected": "44", "actual": "47", - "correct": false, + "isCorrect": false, "inputTokens": 6990, "outputTokens": 5, "latencyMs": 1749.7322920000006 @@ -10939,7 +10939,7 @@ "model": "gpt-5-nano", "expected": "44", "actual": "44", - "correct": true, + "isCorrect": true, "inputTokens": 6780, "outputTokens": 1863, "latencyMs": 14291.044916999992 @@ -10950,7 +10950,7 @@ "model": "claude-haiku-4-5", "expected": "44", "actual": "47", - "correct": false, + "isCorrect": false, "inputTokens": 8411, "outputTokens": 5, "latencyMs": 1453.1822079999838 @@ -10961,7 +10961,7 @@ "model": "gpt-5-nano", "expected": "44", "actual": "44", - "correct": true, + "isCorrect": true, "inputTokens": 9157, "outputTokens": 1799, "latencyMs": 16012.806332999986 @@ -10972,7 +10972,7 @@ "model": "claude-haiku-4-5", "expected": "44", "actual": "48", - "correct": false, + "isCorrect": false, "inputTokens": 9286, "outputTokens": 5, "latencyMs": 1761.131041000015 @@ -10983,7 +10983,7 @@ "model": "gpt-5-nano", "expected": "44", "actual": "44", - "correct": true, + "isCorrect": true, "inputTokens": 7372, "outputTokens": 1415, "latencyMs": 12218.14491599999 @@ -10994,7 +10994,7 @@ "model": "claude-haiku-4-5", "expected": "44", "actual": "45", - "correct": false, + "isCorrect": false, "inputTokens": 8382, "outputTokens": 5, "latencyMs": 1255.681917000009 @@ -11005,7 +11005,7 @@ "model": "gpt-5-nano", "expected": "39", "actual": "39", - "correct": true, + "isCorrect": true, "inputTokens": 9738, "outputTokens": 2311, "latencyMs": 22316.87704199998 @@ -11016,7 +11016,7 @@ "model": "claude-haiku-4-5", "expected": "39", "actual": "38", - "correct": false, + "isCorrect": false, "inputTokens": 11904, "outputTokens": 5, "latencyMs": 1090.176792000013 @@ -11027,7 +11027,7 @@ "model": "gpt-5-nano", "expected": "39", "actual": "39", - "correct": true, + "isCorrect": true, "inputTokens": 6012, "outputTokens": 1095, "latencyMs": 7211.767082999984 @@ -11038,7 +11038,7 @@ "model": "claude-haiku-4-5", "expected": "39", "actual": "38", - "correct": false, + "isCorrect": false, "inputTokens": 6990, "outputTokens": 5, "latencyMs": 1129.9290000000037 @@ -11049,7 +11049,7 @@ "model": "gpt-5-nano", "expected": "39", "actual": "39", - "correct": true, + "isCorrect": true, "inputTokens": 6780, "outputTokens": 1415, "latencyMs": 15701.471499999985 @@ -11060,7 +11060,7 @@ "model": "claude-haiku-4-5", "expected": "39", "actual": "38", - "correct": false, + "isCorrect": false, "inputTokens": 8411, "outputTokens": 5, "latencyMs": 1251.5472500000033 @@ -11071,7 +11071,7 @@ "model": "gpt-5-nano", "expected": "39", "actual": "39", - "correct": true, + "isCorrect": true, "inputTokens": 9157, "outputTokens": 1799, "latencyMs": 16689.30345800001 @@ -11082,7 +11082,7 @@ "model": "claude-haiku-4-5", "expected": "39", "actual": "41", - "correct": false, + "isCorrect": false, "inputTokens": 9286, "outputTokens": 5, "latencyMs": 1168.8190419999883 @@ -11093,7 +11093,7 @@ "model": "gpt-5-nano", "expected": "39", "actual": "39", - "correct": true, + "isCorrect": true, "inputTokens": 7372, "outputTokens": 1863, "latencyMs": 14505.393958999979 @@ -11104,7 +11104,7 @@ "model": "claude-haiku-4-5", "expected": "39", "actual": "38", - "correct": false, + "isCorrect": false, "inputTokens": 8382, "outputTokens": 5, "latencyMs": 1149.8783330000006 @@ -11115,7 +11115,7 @@ "model": "gpt-5-nano", "expected": "32", "actual": "32", - "correct": true, + "isCorrect": true, "inputTokens": 9738, "outputTokens": 1607, "latencyMs": 13945.93979200002 @@ -11126,7 +11126,7 @@ "model": "claude-haiku-4-5", "expected": "32", "actual": "28", - "correct": false, + "isCorrect": false, "inputTokens": 11904, "outputTokens": 5, "latencyMs": 1175.8143749999872 @@ -11137,7 +11137,7 @@ "model": "gpt-5-nano", "expected": "32", "actual": "32", - "correct": true, + "isCorrect": true, "inputTokens": 6012, "outputTokens": 1351, "latencyMs": 11991.764750000002 @@ -11148,7 +11148,7 @@ "model": "claude-haiku-4-5", "expected": "32", "actual": "26", - "correct": false, + "isCorrect": false, "inputTokens": 6990, "outputTokens": 5, "latencyMs": 1643.4279169999936 @@ -11159,7 +11159,7 @@ "model": "gpt-5-nano", "expected": "32", "actual": "32", - "correct": true, + "isCorrect": true, "inputTokens": 6780, "outputTokens": 1799, "latencyMs": 17324.695000000007 @@ -11170,7 +11170,7 @@ "model": "claude-haiku-4-5", "expected": "32", "actual": "28", - "correct": false, + "isCorrect": false, "inputTokens": 8411, "outputTokens": 5, "latencyMs": 1197.7254160000011 @@ -11181,7 +11181,7 @@ "model": "gpt-5-nano", "expected": "32", "actual": "32", - "correct": true, + "isCorrect": true, "inputTokens": 9157, "outputTokens": 1607, "latencyMs": 22426.01029199999 @@ -11192,7 +11192,7 @@ "model": "claude-haiku-4-5", "expected": "32", "actual": "28", - "correct": false, + "isCorrect": false, "inputTokens": 9286, "outputTokens": 5, "latencyMs": 1065.6509170000209 @@ -11203,7 +11203,7 @@ "model": "gpt-5-nano", "expected": "32", "actual": "31", - "correct": false, + "isCorrect": false, "inputTokens": 7372, "outputTokens": 1543, "latencyMs": 12786.843416999996 @@ -11214,7 +11214,7 @@ "model": "claude-haiku-4-5", "expected": "32", "actual": "26", - "correct": false, + "isCorrect": false, "inputTokens": 8382, "outputTokens": 5, "latencyMs": 2054.993749999994 @@ -11225,7 +11225,7 @@ "model": "gpt-5-nano", "expected": "6975", "actual": "6975", - "correct": true, + "isCorrect": true, "inputTokens": 3712, "outputTokens": 72, "latencyMs": 2244.986208999995 @@ -11236,7 +11236,7 @@ "model": "claude-haiku-4-5", "expected": "6975", "actual": "6975", - "correct": true, + "isCorrect": true, "inputTokens": 4080, "outputTokens": 6, "latencyMs": 1162.9390420000127 @@ -11247,7 +11247,7 @@ "model": "gpt-5-nano", "expected": "6975", "actual": "6975", - "correct": true, + "isCorrect": true, "inputTokens": 1563, "outputTokens": 136, "latencyMs": 2179.3558330000087 @@ -11258,7 +11258,7 @@ "model": "claude-haiku-4-5", "expected": "6975", "actual": "6975", - "correct": true, + "isCorrect": true, "inputTokens": 1509, "outputTokens": 6, "latencyMs": 1013.4975409999897 @@ -11269,7 +11269,7 @@ "model": "gpt-5-nano", "expected": "6975", "actual": "6975", - "correct": true, + "isCorrect": true, "inputTokens": 1441, "outputTokens": 72, "latencyMs": 4859.720833999978 @@ -11280,7 +11280,7 @@ "model": "claude-haiku-4-5", "expected": "6975", "actual": "6975", - "correct": true, + "isCorrect": true, "inputTokens": 1445, "outputTokens": 6, "latencyMs": 1437.758375000005 @@ -11291,7 +11291,7 @@ "model": "gpt-5-nano", "expected": "6975", "actual": "6975", - "correct": true, + "isCorrect": true, "inputTokens": 3829, "outputTokens": 72, "latencyMs": 3120.702874999988 @@ -11302,7 +11302,7 @@ "model": "claude-haiku-4-5", "expected": "6975", "actual": "6975", - "correct": true, + "isCorrect": true, "inputTokens": 3415, "outputTokens": 6, "latencyMs": 1051.775708000001 @@ -11313,7 +11313,7 @@ "model": "gpt-5-nano", "expected": "6975", "actual": "6975", - "correct": true, + "isCorrect": true, "inputTokens": 2985, "outputTokens": 72, "latencyMs": 2182.880084000004 @@ -11324,7 +11324,7 @@ "model": "claude-haiku-4-5", "expected": "6975", "actual": "6975", - "correct": true, + "isCorrect": true, "inputTokens": 3110, "outputTokens": 6, "latencyMs": 1045.2009580000013 @@ -11335,7 +11335,7 @@ "model": "gpt-5-nano", "expected": "6686.23", "actual": "6686.23", - "correct": true, + "isCorrect": true, "inputTokens": 3711, "outputTokens": 138, "latencyMs": 5291.923750000016 @@ -11346,7 +11346,7 @@ "model": "claude-haiku-4-5", "expected": "6686.23", "actual": "6686.23", - "correct": true, + "isCorrect": true, "inputTokens": 4079, "outputTokens": 8, "latencyMs": 1009.6958750000049 @@ -11357,7 +11357,7 @@ "model": "gpt-5-nano", "expected": "6686.23", "actual": "6686.23", - "correct": true, + "isCorrect": true, "inputTokens": 1562, "outputTokens": 74, "latencyMs": 2582.2320419999887 @@ -11368,7 +11368,7 @@ "model": "claude-haiku-4-5", "expected": "6686.23", "actual": "6686.23", - "correct": true, + "isCorrect": true, "inputTokens": 1508, "outputTokens": 8, "latencyMs": 1203.816542000015 @@ -11379,7 +11379,7 @@ "model": "gpt-5-nano", "expected": "6686.23", "actual": "6686.23", - "correct": true, + "isCorrect": true, "inputTokens": 1440, "outputTokens": 138, "latencyMs": 2774.835167000012 @@ -11390,7 +11390,7 @@ "model": "claude-haiku-4-5", "expected": "6686.23", "actual": "6686.23", - "correct": true, + "isCorrect": true, "inputTokens": 1444, "outputTokens": 8, "latencyMs": 979.9191669999855 @@ -11401,7 +11401,7 @@ "model": "gpt-5-nano", "expected": "6686.23", "actual": "6686.23", - "correct": true, + "isCorrect": true, "inputTokens": 3828, "outputTokens": 138, "latencyMs": 2616.684333000012 @@ -11412,7 +11412,7 @@ "model": "claude-haiku-4-5", "expected": "6686.23", "actual": "6686.23", - "correct": true, + "isCorrect": true, "inputTokens": 3414, "outputTokens": 8, "latencyMs": 1253.4844169999997 @@ -11423,7 +11423,7 @@ "model": "gpt-5-nano", "expected": "6686.23", "actual": "6686.23", - "correct": true, + "isCorrect": true, "inputTokens": 2984, "outputTokens": 74, "latencyMs": 2267.1155000000144 @@ -11434,7 +11434,7 @@ "model": "claude-haiku-4-5", "expected": "6686.23", "actual": "6686.23", - "correct": true, + "isCorrect": true, "inputTokens": 3109, "outputTokens": 8, "latencyMs": 1185.4212080000143 @@ -11445,7 +11445,7 @@ "model": "gpt-5-nano", "expected": "7500", "actual": "7500", - "correct": true, + "isCorrect": true, "inputTokens": 3712, "outputTokens": 136, "latencyMs": 2905.6011250000156 @@ -11456,7 +11456,7 @@ "model": "claude-haiku-4-5", "expected": "7500", "actual": "7500", - "correct": true, + "isCorrect": true, "inputTokens": 4080, "outputTokens": 6, "latencyMs": 1571.1469999999972 @@ -11467,7 +11467,7 @@ "model": "gpt-5-nano", "expected": "7500", "actual": "7500", - "correct": true, + "isCorrect": true, "inputTokens": 1563, "outputTokens": 328, "latencyMs": 3884.65858399999 @@ -11478,7 +11478,7 @@ "model": "claude-haiku-4-5", "expected": "7500", "actual": "7500", - "correct": true, + "isCorrect": true, "inputTokens": 1509, "outputTokens": 6, "latencyMs": 1207.1518330000108 @@ -11489,7 +11489,7 @@ "model": "gpt-5-nano", "expected": "7500", "actual": "7500", - "correct": true, + "isCorrect": true, "inputTokens": 1441, "outputTokens": 72, "latencyMs": 1995.0557919999992 @@ -11500,7 +11500,7 @@ "model": "claude-haiku-4-5", "expected": "7500", "actual": "7500", - "correct": true, + "isCorrect": true, "inputTokens": 1445, "outputTokens": 6, "latencyMs": 1238.8113749999902 @@ -11511,7 +11511,7 @@ "model": "gpt-5-nano", "expected": "7500", "actual": "7500", - "correct": true, + "isCorrect": true, "inputTokens": 3829, "outputTokens": 136, "latencyMs": 5824.06574999998 @@ -11522,7 +11522,7 @@ "model": "claude-haiku-4-5", "expected": "7500", "actual": "7500", - "correct": true, + "isCorrect": true, "inputTokens": 3415, "outputTokens": 6, "latencyMs": 1337.474749999994 @@ -11533,7 +11533,7 @@ "model": "gpt-5-nano", "expected": "7500", "actual": "7500", - "correct": true, + "isCorrect": true, "inputTokens": 2985, "outputTokens": 136, "latencyMs": 2286.1839580000087 @@ -11544,7 +11544,7 @@ "model": "claude-haiku-4-5", "expected": "7500", "actual": "7500", - "correct": true, + "isCorrect": true, "inputTokens": 3110, "outputTokens": 6, "latencyMs": 1326.3640000000014 @@ -11555,7 +11555,7 @@ "model": "gpt-5-nano", "expected": "14297.05", "actual": "14297.05", - "correct": true, + "isCorrect": true, "inputTokens": 3711, "outputTokens": 138, "latencyMs": 3801.309249999991 @@ -11566,7 +11566,7 @@ "model": "claude-haiku-4-5", "expected": "14297.05", "actual": "14297.05", - "correct": true, + "isCorrect": true, "inputTokens": 4079, "outputTokens": 8, "latencyMs": 1054.8991249999963 @@ -11577,7 +11577,7 @@ "model": "gpt-5-nano", "expected": "14297.05", "actual": "14297.05", - "correct": true, + "isCorrect": true, "inputTokens": 1562, "outputTokens": 74, "latencyMs": 3338.1347499999974 @@ -11588,7 +11588,7 @@ "model": "claude-haiku-4-5", "expected": "14297.05", "actual": "14297.05", - "correct": true, + "isCorrect": true, "inputTokens": 1508, "outputTokens": 8, "latencyMs": 1393.589082999999 @@ -11599,7 +11599,7 @@ "model": "gpt-5-nano", "expected": "14297.05", "actual": "14297.05", - "correct": true, + "isCorrect": true, "inputTokens": 1440, "outputTokens": 202, "latencyMs": 3719.6092089999875 @@ -11610,7 +11610,7 @@ "model": "claude-haiku-4-5", "expected": "14297.05", "actual": "14297.05", - "correct": true, + "isCorrect": true, "inputTokens": 1444, "outputTokens": 8, "latencyMs": 1030.9656669999822 @@ -11621,7 +11621,7 @@ "model": "gpt-5-nano", "expected": "14297.05", "actual": "14297.05", - "correct": true, + "isCorrect": true, "inputTokens": 3828, "outputTokens": 74, "latencyMs": 2226.628250000009 @@ -11632,7 +11632,7 @@ "model": "claude-haiku-4-5", "expected": "14297.05", "actual": "14297.05", - "correct": true, + "isCorrect": true, "inputTokens": 3414, "outputTokens": 8, "latencyMs": 1154.132540999999 @@ -11643,7 +11643,7 @@ "model": "gpt-5-nano", "expected": "14297.05", "actual": "14297.05", - "correct": true, + "isCorrect": true, "inputTokens": 2984, "outputTokens": 138, "latencyMs": 2922.2590830000117 @@ -11654,7 +11654,7 @@ "model": "claude-haiku-4-5", "expected": "14297.05", "actual": "14297.05", - "correct": true, + "isCorrect": true, "inputTokens": 3109, "outputTokens": 8, "latencyMs": 2048.011916999996 @@ -11665,7 +11665,7 @@ "model": "gpt-5-nano", "expected": "6692", "actual": "6692", - "correct": true, + "isCorrect": true, "inputTokens": 3712, "outputTokens": 200, "latencyMs": 2520.5313329999917 @@ -11676,7 +11676,7 @@ "model": "claude-haiku-4-5", "expected": "6692", "actual": "6692", - "correct": true, + "isCorrect": true, "inputTokens": 4080, "outputTokens": 6, "latencyMs": 943.3422089999949 @@ -11687,7 +11687,7 @@ "model": "gpt-5-nano", "expected": "6692", "actual": "6692", - "correct": true, + "isCorrect": true, "inputTokens": 1563, "outputTokens": 136, "latencyMs": 2300.8406249999825 @@ -11698,7 +11698,7 @@ "model": "claude-haiku-4-5", "expected": "6692", "actual": "6692", - "correct": true, + "isCorrect": true, "inputTokens": 1509, "outputTokens": 6, "latencyMs": 1128.4146670000046 @@ -11709,7 +11709,7 @@ "model": "gpt-5-nano", "expected": "6692", "actual": "6692", - "correct": true, + "isCorrect": true, "inputTokens": 1441, "outputTokens": 200, "latencyMs": 2929.585208000004 @@ -11720,7 +11720,7 @@ "model": "claude-haiku-4-5", "expected": "6692", "actual": "6692", - "correct": true, + "isCorrect": true, "inputTokens": 1445, "outputTokens": 6, "latencyMs": 1230.4635420000122 @@ -11731,7 +11731,7 @@ "model": "gpt-5-nano", "expected": "6692", "actual": "6692", - "correct": true, + "isCorrect": true, "inputTokens": 3829, "outputTokens": 136, "latencyMs": 3650.3654169999936 @@ -11742,7 +11742,7 @@ "model": "claude-haiku-4-5", "expected": "6692", "actual": "6692", - "correct": true, + "isCorrect": true, "inputTokens": 3415, "outputTokens": 6, "latencyMs": 985.8184590000019 @@ -11753,7 +11753,7 @@ "model": "gpt-5-nano", "expected": "6692", "actual": "6692", - "correct": true, + "isCorrect": true, "inputTokens": 2985, "outputTokens": 328, "latencyMs": 3772.2553330000082 @@ -11764,7 +11764,7 @@ "model": "claude-haiku-4-5", "expected": "6692", "actual": "6692", - "correct": true, + "isCorrect": true, "inputTokens": 3110, "outputTokens": 6, "latencyMs": 1311.8630419999827 @@ -11775,7 +11775,7 @@ "model": "gpt-5-nano", "expected": "9302.76", "actual": "9302.76", - "correct": true, + "isCorrect": true, "inputTokens": 3711, "outputTokens": 138, "latencyMs": 2935.785124999995 @@ -11786,7 +11786,7 @@ "model": "claude-haiku-4-5", "expected": "9302.76", "actual": "9302.76", - "correct": true, + "isCorrect": true, "inputTokens": 4079, "outputTokens": 8, "latencyMs": 1391.9168749999953 @@ -11797,7 +11797,7 @@ "model": "gpt-5-nano", "expected": "9302.76", "actual": "9302.76", - "correct": true, + "isCorrect": true, "inputTokens": 1562, "outputTokens": 138, "latencyMs": 5759.15529200001 @@ -11808,7 +11808,7 @@ "model": "claude-haiku-4-5", "expected": "9302.76", "actual": "9302.76", - "correct": true, + "isCorrect": true, "inputTokens": 1508, "outputTokens": 8, "latencyMs": 1064.3980420000153 @@ -11819,7 +11819,7 @@ "model": "gpt-5-nano", "expected": "9302.76", "actual": "9302.76", - "correct": true, + "isCorrect": true, "inputTokens": 1440, "outputTokens": 74, "latencyMs": 3640.193708000006 @@ -11830,7 +11830,7 @@ "model": "claude-haiku-4-5", "expected": "9302.76", "actual": "9302.76", - "correct": true, + "isCorrect": true, "inputTokens": 1444, "outputTokens": 8, "latencyMs": 983.806166000024 @@ -11841,7 +11841,7 @@ "model": "gpt-5-nano", "expected": "9302.76", "actual": "9302.76", - "correct": true, + "isCorrect": true, "inputTokens": 3828, "outputTokens": 266, "latencyMs": 2604.2135000000126 @@ -11852,7 +11852,7 @@ "model": "claude-haiku-4-5", "expected": "9302.76", "actual": "9302.76", - "correct": true, + "isCorrect": true, "inputTokens": 3414, "outputTokens": 8, "latencyMs": 1128.6182499999995 @@ -11863,7 +11863,7 @@ "model": "gpt-5-nano", "expected": "9302.76", "actual": "9302.76", - "correct": true, + "isCorrect": true, "inputTokens": 2984, "outputTokens": 138, "latencyMs": 2548.5608749999956 @@ -11874,7 +11874,7 @@ "model": "claude-haiku-4-5", "expected": "9302.76", "actual": "9302.76", - "correct": true, + "isCorrect": true, "inputTokens": 3109, "outputTokens": 8, "latencyMs": 1029.5365000000165 @@ -11885,7 +11885,7 @@ "model": "gpt-5-nano", "expected": "3285", "actual": "3285", - "correct": true, + "isCorrect": true, "inputTokens": 3712, "outputTokens": 136, "latencyMs": 3983.6009170000034 @@ -11896,7 +11896,7 @@ "model": "claude-haiku-4-5", "expected": "3285", "actual": "3285", - "correct": true, + "isCorrect": true, "inputTokens": 4080, "outputTokens": 6, "latencyMs": 1095.2366250000196 @@ -11907,7 +11907,7 @@ "model": "gpt-5-nano", "expected": "3285", "actual": "3285", - "correct": true, + "isCorrect": true, "inputTokens": 1563, "outputTokens": 72, "latencyMs": 2207.884417000023 @@ -11918,7 +11918,7 @@ "model": "claude-haiku-4-5", "expected": "3285", "actual": "3285", - "correct": true, + "isCorrect": true, "inputTokens": 1509, "outputTokens": 6, "latencyMs": 2292.4111660000053 @@ -11929,7 +11929,7 @@ "model": "gpt-5-nano", "expected": "3285", "actual": "3285", - "correct": true, + "isCorrect": true, "inputTokens": 1441, "outputTokens": 136, "latencyMs": 2749.430541000009 @@ -11940,7 +11940,7 @@ "model": "claude-haiku-4-5", "expected": "3285", "actual": "3285", - "correct": true, + "isCorrect": true, "inputTokens": 1445, "outputTokens": 6, "latencyMs": 1215.8329999999842 @@ -11951,7 +11951,7 @@ "model": "gpt-5-nano", "expected": "3285", "actual": "3285", - "correct": true, + "isCorrect": true, "inputTokens": 3829, "outputTokens": 136, "latencyMs": 2086.6161659999925 @@ -11962,7 +11962,7 @@ "model": "claude-haiku-4-5", "expected": "3285", "actual": "3285", - "correct": true, + "isCorrect": true, "inputTokens": 3415, "outputTokens": 6, "latencyMs": 1299.715790999995 @@ -11973,7 +11973,7 @@ "model": "gpt-5-nano", "expected": "3285", "actual": "3285", - "correct": true, + "isCorrect": true, "inputTokens": 2985, "outputTokens": 136, "latencyMs": 7107.394916999998 @@ -11984,7 +11984,7 @@ "model": "claude-haiku-4-5", "expected": "3285", "actual": "3285", - "correct": true, + "isCorrect": true, "inputTokens": 3110, "outputTokens": 6, "latencyMs": 899.2319579999894 @@ -11995,7 +11995,7 @@ "model": "gpt-5-nano", "expected": "3826.93", "actual": "3826.93", - "correct": true, + "isCorrect": true, "inputTokens": 3711, "outputTokens": 138, "latencyMs": 2810.5213330000115 @@ -12006,7 +12006,7 @@ "model": "claude-haiku-4-5", "expected": "3826.93", "actual": "3826.93", - "correct": true, + "isCorrect": true, "inputTokens": 4079, "outputTokens": 8, "latencyMs": 989.2326659999962 @@ -12017,7 +12017,7 @@ "model": "gpt-5-nano", "expected": "3826.93", "actual": "3826.93", - "correct": true, + "isCorrect": true, "inputTokens": 1562, "outputTokens": 138, "latencyMs": 2622.7841670000053 @@ -12028,7 +12028,7 @@ "model": "claude-haiku-4-5", "expected": "3826.93", "actual": "3826.93", - "correct": true, + "isCorrect": true, "inputTokens": 1508, "outputTokens": 8, "latencyMs": 850.1227920000092 @@ -12039,7 +12039,7 @@ "model": "gpt-5-nano", "expected": "3826.93", "actual": "3826.93", - "correct": true, + "isCorrect": true, "inputTokens": 1440, "outputTokens": 138, "latencyMs": 3057.1578750000044 @@ -12050,7 +12050,7 @@ "model": "claude-haiku-4-5", "expected": "3826.93", "actual": "3826.93", - "correct": true, + "isCorrect": true, "inputTokens": 1444, "outputTokens": 8, "latencyMs": 1261.3340000000026 @@ -12061,7 +12061,7 @@ "model": "gpt-5-nano", "expected": "3826.93", "actual": "3826.93", - "correct": true, + "isCorrect": true, "inputTokens": 3828, "outputTokens": 202, "latencyMs": 3061.791499999992 @@ -12072,7 +12072,7 @@ "model": "claude-haiku-4-5", "expected": "3826.93", "actual": "3826.93", - "correct": true, + "isCorrect": true, "inputTokens": 3414, "outputTokens": 8, "latencyMs": 1196.6509999999835 @@ -12083,7 +12083,7 @@ "model": "gpt-5-nano", "expected": "3826.93", "actual": "3826.93", - "correct": true, + "isCorrect": true, "inputTokens": 2984, "outputTokens": 138, "latencyMs": 3567.4540839999972 @@ -12094,7 +12094,7 @@ "model": "claude-haiku-4-5", "expected": "3826.93", "actual": "3826.93", - "correct": true, + "isCorrect": true, "inputTokens": 3109, "outputTokens": 8, "latencyMs": 1033.8556249999965 @@ -12105,7 +12105,7 @@ "model": "gpt-5-nano", "expected": "6191", "actual": "6191", - "correct": true, + "isCorrect": true, "inputTokens": 3712, "outputTokens": 136, "latencyMs": 2842.961707999988 @@ -12116,7 +12116,7 @@ "model": "claude-haiku-4-5", "expected": "6191", "actual": "6191", - "correct": true, + "isCorrect": true, "inputTokens": 4080, "outputTokens": 6, "latencyMs": 1258.130582999991 @@ -12127,7 +12127,7 @@ "model": "gpt-5-nano", "expected": "6191", "actual": "6191", - "correct": true, + "isCorrect": true, "inputTokens": 1563, "outputTokens": 456, "latencyMs": 5828.652415999997 @@ -12138,7 +12138,7 @@ "model": "claude-haiku-4-5", "expected": "6191", "actual": "6191", - "correct": true, + "isCorrect": true, "inputTokens": 1509, "outputTokens": 6, "latencyMs": 1004.821958000015 @@ -12149,7 +12149,7 @@ "model": "gpt-5-nano", "expected": "6191", "actual": "6191", - "correct": true, + "isCorrect": true, "inputTokens": 1441, "outputTokens": 72, "latencyMs": 3102.38612499999 @@ -12160,7 +12160,7 @@ "model": "claude-haiku-4-5", "expected": "6191", "actual": "6191", - "correct": true, + "isCorrect": true, "inputTokens": 1445, "outputTokens": 6, "latencyMs": 1454.8658750000177 @@ -12171,7 +12171,7 @@ "model": "gpt-5-nano", "expected": "6191", "actual": "6191", - "correct": true, + "isCorrect": true, "inputTokens": 3829, "outputTokens": 136, "latencyMs": 2018.8434999999881 @@ -12182,7 +12182,7 @@ "model": "claude-haiku-4-5", "expected": "6191", "actual": "6191", - "correct": true, + "isCorrect": true, "inputTokens": 3415, "outputTokens": 6, "latencyMs": 1237.4057080000057 @@ -12193,7 +12193,7 @@ "model": "gpt-5-nano", "expected": "6191", "actual": "6191", - "correct": true, + "isCorrect": true, "inputTokens": 2985, "outputTokens": 136, "latencyMs": 3670.7451670000155 @@ -12204,7 +12204,7 @@ "model": "claude-haiku-4-5", "expected": "6191", "actual": "6191", - "correct": true, + "isCorrect": true, "inputTokens": 3110, "outputTokens": 6, "latencyMs": 1070.646584000002 @@ -12215,7 +12215,7 @@ "model": "gpt-5-nano", "expected": "1854.66", "actual": "1854.66", - "correct": true, + "isCorrect": true, "inputTokens": 3711, "outputTokens": 202, "latencyMs": 3731.3879579999775 @@ -12226,7 +12226,7 @@ "model": "claude-haiku-4-5", "expected": "1854.66", "actual": "1854.66", - "correct": true, + "isCorrect": true, "inputTokens": 4079, "outputTokens": 8, "latencyMs": 1387.9798329999903 @@ -12237,7 +12237,7 @@ "model": "gpt-5-nano", "expected": "1854.66", "actual": "1854.66", - "correct": true, + "isCorrect": true, "inputTokens": 1562, "outputTokens": 394, "latencyMs": 5560.397957999987 @@ -12248,7 +12248,7 @@ "model": "claude-haiku-4-5", "expected": "1854.66", "actual": "1854.66", - "correct": true, + "isCorrect": true, "inputTokens": 1508, "outputTokens": 8, "latencyMs": 1552.963958999986 @@ -12259,7 +12259,7 @@ "model": "gpt-5-nano", "expected": "1854.66", "actual": "1854.66", - "correct": true, + "isCorrect": true, "inputTokens": 1440, "outputTokens": 138, "latencyMs": 21759.84366700001 @@ -12270,7 +12270,7 @@ "model": "claude-haiku-4-5", "expected": "1854.66", "actual": "1854.66", - "correct": true, + "isCorrect": true, "inputTokens": 1444, "outputTokens": 8, "latencyMs": 1132.519083000021 @@ -12281,7 +12281,7 @@ "model": "gpt-5-nano", "expected": "1854.66", "actual": "1854.66", - "correct": true, + "isCorrect": true, "inputTokens": 3828, "outputTokens": 138, "latencyMs": 2277.2652499999967 @@ -12292,7 +12292,7 @@ "model": "claude-haiku-4-5", "expected": "1854.66", "actual": "1854.66", - "correct": true, + "isCorrect": true, "inputTokens": 3414, "outputTokens": 8, "latencyMs": 1098.0825420000183 @@ -12303,7 +12303,7 @@ "model": "gpt-5-nano", "expected": "1854.66", "actual": "1854.66", - "correct": true, + "isCorrect": true, "inputTokens": 2984, "outputTokens": 202, "latencyMs": 2813.10504200001 @@ -12314,7 +12314,7 @@ "model": "claude-haiku-4-5", "expected": "1854.66", "actual": "1854.66", - "correct": true, + "isCorrect": true, "inputTokens": 3109, "outputTokens": 8, "latencyMs": 1131.9674159999995 @@ -12325,7 +12325,7 @@ "model": "gpt-5-nano", "expected": "4696", "actual": "4696", - "correct": true, + "isCorrect": true, "inputTokens": 3712, "outputTokens": 136, "latencyMs": 6657.446207999979 @@ -12336,7 +12336,7 @@ "model": "claude-haiku-4-5", "expected": "4696", "actual": "4696", - "correct": true, + "isCorrect": true, "inputTokens": 4080, "outputTokens": 6, "latencyMs": 1265.4548749999958 @@ -12347,7 +12347,7 @@ "model": "gpt-5-nano", "expected": "4696", "actual": "4696", - "correct": true, + "isCorrect": true, "inputTokens": 1563, "outputTokens": 136, "latencyMs": 3299.298792000016 @@ -12358,7 +12358,7 @@ "model": "claude-haiku-4-5", "expected": "4696", "actual": "4696", - "correct": true, + "isCorrect": true, "inputTokens": 1509, "outputTokens": 6, "latencyMs": 1618.5091249999823 @@ -12369,7 +12369,7 @@ "model": "gpt-5-nano", "expected": "4696", "actual": "4696", - "correct": true, + "isCorrect": true, "inputTokens": 1441, "outputTokens": 136, "latencyMs": 5353.29241699999 @@ -12380,7 +12380,7 @@ "model": "claude-haiku-4-5", "expected": "4696", "actual": "4696", - "correct": true, + "isCorrect": true, "inputTokens": 1445, "outputTokens": 6, "latencyMs": 870.5113749999728 @@ -12391,7 +12391,7 @@ "model": "gpt-5-nano", "expected": "4696", "actual": "4696", - "correct": true, + "isCorrect": true, "inputTokens": 3829, "outputTokens": 200, "latencyMs": 2780.5659159999923 @@ -12402,7 +12402,7 @@ "model": "claude-haiku-4-5", "expected": "4696", "actual": "4696", - "correct": true, + "isCorrect": true, "inputTokens": 3415, "outputTokens": 6, "latencyMs": 1069.2415409999958 @@ -12413,7 +12413,7 @@ "model": "gpt-5-nano", "expected": "4696", "actual": "4696", - "correct": true, + "isCorrect": true, "inputTokens": 2985, "outputTokens": 200, "latencyMs": 3036.145666999975 @@ -12424,7 +12424,7 @@ "model": "claude-haiku-4-5", "expected": "4696", "actual": "4696", - "correct": true, + "isCorrect": true, "inputTokens": 3110, "outputTokens": 6, "latencyMs": 1252.9633329999924 @@ -12435,7 +12435,7 @@ "model": "gpt-5-nano", "expected": "4211.6", "actual": "4211.6", - "correct": true, + "isCorrect": true, "inputTokens": 3711, "outputTokens": 138, "latencyMs": 2617.047249999974 @@ -12446,7 +12446,7 @@ "model": "claude-haiku-4-5", "expected": "4211.6", "actual": "4211.6", - "correct": true, + "isCorrect": true, "inputTokens": 4079, "outputTokens": 8, "latencyMs": 1261.9117079999996 @@ -12457,7 +12457,7 @@ "model": "gpt-5-nano", "expected": "4211.6", "actual": "4211.6", - "correct": true, + "isCorrect": true, "inputTokens": 1562, "outputTokens": 202, "latencyMs": 6192.06358300001 @@ -12468,7 +12468,7 @@ "model": "claude-haiku-4-5", "expected": "4211.6", "actual": "4211.6", - "correct": true, + "isCorrect": true, "inputTokens": 1508, "outputTokens": 8, "latencyMs": 1158.3806249999907 @@ -12479,7 +12479,7 @@ "model": "gpt-5-nano", "expected": "4211.6", "actual": "4211.6", - "correct": true, + "isCorrect": true, "inputTokens": 1440, "outputTokens": 138, "latencyMs": 2867.840083999996 @@ -12490,7 +12490,7 @@ "model": "claude-haiku-4-5", "expected": "4211.6", "actual": "4211.6", - "correct": true, + "isCorrect": true, "inputTokens": 1444, "outputTokens": 8, "latencyMs": 856.2939580000238 @@ -12501,7 +12501,7 @@ "model": "gpt-5-nano", "expected": "4211.6", "actual": "4211.6", - "correct": true, + "isCorrect": true, "inputTokens": 3828, "outputTokens": 138, "latencyMs": 2329.6339579999913 @@ -12512,7 +12512,7 @@ "model": "claude-haiku-4-5", "expected": "4211.6", "actual": "4211.6", - "correct": true, + "isCorrect": true, "inputTokens": 3414, "outputTokens": 8, "latencyMs": 1106.5591669999994 @@ -12523,7 +12523,7 @@ "model": "gpt-5-nano", "expected": "4211.6", "actual": "4211.6", - "correct": true, + "isCorrect": true, "inputTokens": 2984, "outputTokens": 138, "latencyMs": 2590.7533330000006 @@ -12534,7 +12534,7 @@ "model": "claude-haiku-4-5", "expected": "4211.6", "actual": "4211.6", - "correct": true, + "isCorrect": true, "inputTokens": 3109, "outputTokens": 8, "latencyMs": 1007.0892920000188 @@ -12545,7 +12545,7 @@ "model": "gpt-5-nano", "expected": "6196", "actual": "6196", - "correct": true, + "isCorrect": true, "inputTokens": 3712, "outputTokens": 200, "latencyMs": 3839.2745000000286 @@ -12556,7 +12556,7 @@ "model": "claude-haiku-4-5", "expected": "6196", "actual": "6196", - "correct": true, + "isCorrect": true, "inputTokens": 4080, "outputTokens": 6, "latencyMs": 1388.2399160000205 @@ -12567,7 +12567,7 @@ "model": "gpt-5-nano", "expected": "6196", "actual": "6196", - "correct": true, + "isCorrect": true, "inputTokens": 1563, "outputTokens": 200, "latencyMs": 3955.22095800002 @@ -12578,7 +12578,7 @@ "model": "claude-haiku-4-5", "expected": "6196", "actual": "6196", - "correct": true, + "isCorrect": true, "inputTokens": 1509, "outputTokens": 6, "latencyMs": 1036.567458000005 @@ -12589,7 +12589,7 @@ "model": "gpt-5-nano", "expected": "6196", "actual": "6196", - "correct": true, + "isCorrect": true, "inputTokens": 1441, "outputTokens": 200, "latencyMs": 5566.705209000007 @@ -12600,7 +12600,7 @@ "model": "claude-haiku-4-5", "expected": "6196", "actual": "6196", - "correct": true, + "isCorrect": true, "inputTokens": 1445, "outputTokens": 6, "latencyMs": 1078.5011670000094 @@ -12611,7 +12611,7 @@ "model": "gpt-5-nano", "expected": "6196", "actual": "6196", - "correct": true, + "isCorrect": true, "inputTokens": 3829, "outputTokens": 200, "latencyMs": 2956.9618330000376 @@ -12622,7 +12622,7 @@ "model": "claude-haiku-4-5", "expected": "6196", "actual": "6196", - "correct": true, + "isCorrect": true, "inputTokens": 3415, "outputTokens": 6, "latencyMs": 1797.4496250000084 @@ -12633,7 +12633,7 @@ "model": "gpt-5-nano", "expected": "6196", "actual": "6196", - "correct": true, + "isCorrect": true, "inputTokens": 2985, "outputTokens": 136, "latencyMs": 2647.741832999978 @@ -12644,7 +12644,7 @@ "model": "claude-haiku-4-5", "expected": "6196", "actual": "6196", - "correct": true, + "isCorrect": true, "inputTokens": 3110, "outputTokens": 6, "latencyMs": 1221.9055410000146 @@ -12655,7 +12655,7 @@ "model": "gpt-5-nano", "expected": "6105.3", "actual": "6105.3", - "correct": true, + "isCorrect": true, "inputTokens": 3711, "outputTokens": 138, "latencyMs": 3783.334333000006 @@ -12666,7 +12666,7 @@ "model": "claude-haiku-4-5", "expected": "6105.3", "actual": "6105.30", - "correct": true, + "isCorrect": true, "inputTokens": 4079, "outputTokens": 8, "latencyMs": 1135.7771670000511 @@ -12677,7 +12677,7 @@ "model": "gpt-5-nano", "expected": "6105.3", "actual": "6105.3", - "correct": true, + "isCorrect": true, "inputTokens": 1562, "outputTokens": 266, "latencyMs": 3364.4232920000213 @@ -12688,7 +12688,7 @@ "model": "claude-haiku-4-5", "expected": "6105.3", "actual": "6105.3", - "correct": true, + "isCorrect": true, "inputTokens": 1508, "outputTokens": 8, "latencyMs": 1161.263666999992 @@ -12699,7 +12699,7 @@ "model": "gpt-5-nano", "expected": "6105.3", "actual": "6105.3", - "correct": true, + "isCorrect": true, "inputTokens": 1440, "outputTokens": 74, "latencyMs": 3646.0659589999705 @@ -12710,7 +12710,7 @@ "model": "claude-haiku-4-5", "expected": "6105.3", "actual": "6105.3", - "correct": true, + "isCorrect": true, "inputTokens": 1444, "outputTokens": 8, "latencyMs": 955.7597500000265 @@ -12721,7 +12721,7 @@ "model": "gpt-5-nano", "expected": "6105.3", "actual": "6105.3", - "correct": true, + "isCorrect": true, "inputTokens": 3828, "outputTokens": 74, "latencyMs": 2345.2203750000335 @@ -12732,7 +12732,7 @@ "model": "claude-haiku-4-5", "expected": "6105.3", "actual": "6105.3", - "correct": true, + "isCorrect": true, "inputTokens": 3414, "outputTokens": 8, "latencyMs": 1541.918249999988 @@ -12743,7 +12743,7 @@ "model": "gpt-5-nano", "expected": "6105.3", "actual": "6105.3", - "correct": true, + "isCorrect": true, "inputTokens": 2984, "outputTokens": 138, "latencyMs": 6126.976708000002 @@ -12754,7 +12754,7 @@ "model": "claude-haiku-4-5", "expected": "6105.3", "actual": "6105.3", - "correct": true, + "isCorrect": true, "inputTokens": 3109, "outputTokens": 8, "latencyMs": 1097.440709000046 @@ -12765,7 +12765,7 @@ "model": "gpt-5-nano", "expected": "6528", "actual": "6528", - "correct": true, + "isCorrect": true, "inputTokens": 3712, "outputTokens": 264, "latencyMs": 3404.643708999967 @@ -12776,7 +12776,7 @@ "model": "claude-haiku-4-5", "expected": "6528", "actual": "6528", - "correct": true, + "isCorrect": true, "inputTokens": 4080, "outputTokens": 6, "latencyMs": 1227.7047499999753 @@ -12787,7 +12787,7 @@ "model": "gpt-5-nano", "expected": "6528", "actual": "6528", - "correct": true, + "isCorrect": true, "inputTokens": 1563, "outputTokens": 136, "latencyMs": 2495.85037499998 @@ -12798,7 +12798,7 @@ "model": "claude-haiku-4-5", "expected": "6528", "actual": "6528", - "correct": true, + "isCorrect": true, "inputTokens": 1509, "outputTokens": 6, "latencyMs": 1048.344832999981 @@ -12809,7 +12809,7 @@ "model": "gpt-5-nano", "expected": "6528", "actual": "6528", - "correct": true, + "isCorrect": true, "inputTokens": 1441, "outputTokens": 136, "latencyMs": 3007.2462499999674 @@ -12820,7 +12820,7 @@ "model": "claude-haiku-4-5", "expected": "6528", "actual": "6528", - "correct": true, + "isCorrect": true, "inputTokens": 1445, "outputTokens": 6, "latencyMs": 840.0351669999654 @@ -12831,7 +12831,7 @@ "model": "gpt-5-nano", "expected": "6528", "actual": "6528", - "correct": true, + "isCorrect": true, "inputTokens": 3829, "outputTokens": 328, "latencyMs": 3149.872374999977 @@ -12842,7 +12842,7 @@ "model": "claude-haiku-4-5", "expected": "6528", "actual": "6528", - "correct": true, + "isCorrect": true, "inputTokens": 3415, "outputTokens": 6, "latencyMs": 973.716167000006 @@ -12853,7 +12853,7 @@ "model": "gpt-5-nano", "expected": "6528", "actual": "6528", - "correct": true, + "isCorrect": true, "inputTokens": 2985, "outputTokens": 456, "latencyMs": 5305.827791999967 @@ -12864,7 +12864,7 @@ "model": "claude-haiku-4-5", "expected": "6528", "actual": "6528", - "correct": true, + "isCorrect": true, "inputTokens": 3110, "outputTokens": 6, "latencyMs": 953.3122500000172 @@ -12875,7 +12875,7 @@ "model": "gpt-5-nano", "expected": "1136.09", "actual": "1136.09", - "correct": true, + "isCorrect": true, "inputTokens": 3711, "outputTokens": 138, "latencyMs": 3435.850167000026 @@ -12886,7 +12886,7 @@ "model": "claude-haiku-4-5", "expected": "1136.09", "actual": "1136.09", - "correct": true, + "isCorrect": true, "inputTokens": 4079, "outputTokens": 8, "latencyMs": 1110.8856249999953 @@ -12897,7 +12897,7 @@ "model": "gpt-5-nano", "expected": "1136.09", "actual": "1136.09", - "correct": true, + "isCorrect": true, "inputTokens": 1562, "outputTokens": 266, "latencyMs": 3303.3427500000107 @@ -12908,7 +12908,7 @@ "model": "claude-haiku-4-5", "expected": "1136.09", "actual": "1136.09", - "correct": true, + "isCorrect": true, "inputTokens": 1508, "outputTokens": 8, "latencyMs": 954.5857910000486 @@ -12919,7 +12919,7 @@ "model": "gpt-5-nano", "expected": "1136.09", "actual": "1136.09", - "correct": true, + "isCorrect": true, "inputTokens": 1440, "outputTokens": 138, "latencyMs": 5035.666582999984 @@ -12930,7 +12930,7 @@ "model": "claude-haiku-4-5", "expected": "1136.09", "actual": "1136.09", - "correct": true, + "isCorrect": true, "inputTokens": 1444, "outputTokens": 8, "latencyMs": 867.9529159999802 @@ -12941,7 +12941,7 @@ "model": "gpt-5-nano", "expected": "1136.09", "actual": "1136.09", - "correct": true, + "isCorrect": true, "inputTokens": 3828, "outputTokens": 202, "latencyMs": 2817.1118750000023 @@ -12952,7 +12952,7 @@ "model": "claude-haiku-4-5", "expected": "1136.09", "actual": "1136.09", - "correct": true, + "isCorrect": true, "inputTokens": 3414, "outputTokens": 8, "latencyMs": 1029.4406660000095 @@ -12963,7 +12963,7 @@ "model": "gpt-5-nano", "expected": "1136.09", "actual": "1136.09", - "correct": true, + "isCorrect": true, "inputTokens": 2984, "outputTokens": 138, "latencyMs": 2521.28145900002 @@ -12974,7 +12974,7 @@ "model": "claude-haiku-4-5", "expected": "1136.09", "actual": "1136.09", - "correct": true, + "isCorrect": true, "inputTokens": 3109, "outputTokens": 8, "latencyMs": 1266.9695000000065 @@ -12985,7 +12985,7 @@ "model": "gpt-5-nano", "expected": "4689", "actual": "4689", - "correct": true, + "isCorrect": true, "inputTokens": 3712, "outputTokens": 72, "latencyMs": 2383.6225830000476 @@ -12996,7 +12996,7 @@ "model": "claude-haiku-4-5", "expected": "4689", "actual": "4689", - "correct": true, + "isCorrect": true, "inputTokens": 4080, "outputTokens": 6, "latencyMs": 1100.3007499999949 @@ -13007,7 +13007,7 @@ "model": "gpt-5-nano", "expected": "4689", "actual": "4689", - "correct": true, + "isCorrect": true, "inputTokens": 1563, "outputTokens": 200, "latencyMs": 2816.252374999982 @@ -13018,7 +13018,7 @@ "model": "claude-haiku-4-5", "expected": "4689", "actual": "4689", - "correct": true, + "isCorrect": true, "inputTokens": 1509, "outputTokens": 6, "latencyMs": 1030.0248330000322 @@ -13029,7 +13029,7 @@ "model": "gpt-5-nano", "expected": "4689", "actual": "4689", - "correct": true, + "isCorrect": true, "inputTokens": 1441, "outputTokens": 72, "latencyMs": 1819.5161669999943 @@ -13040,7 +13040,7 @@ "model": "claude-haiku-4-5", "expected": "4689", "actual": "4689", - "correct": true, + "isCorrect": true, "inputTokens": 1445, "outputTokens": 6, "latencyMs": 1012.0581670000101 @@ -13051,7 +13051,7 @@ "model": "gpt-5-nano", "expected": "4689", "actual": "4689", - "correct": true, + "isCorrect": true, "inputTokens": 3829, "outputTokens": 136, "latencyMs": 2960.8910000000033 @@ -13062,7 +13062,7 @@ "model": "claude-haiku-4-5", "expected": "4689", "actual": "4689", - "correct": true, + "isCorrect": true, "inputTokens": 3415, "outputTokens": 6, "latencyMs": 1346.7110000000102 @@ -13073,7 +13073,7 @@ "model": "gpt-5-nano", "expected": "4689", "actual": "4689", - "correct": true, + "isCorrect": true, "inputTokens": 2985, "outputTokens": 136, "latencyMs": 3081.40625 @@ -13084,7 +13084,7 @@ "model": "claude-haiku-4-5", "expected": "4689", "actual": "4689", - "correct": true, + "isCorrect": true, "inputTokens": 3110, "outputTokens": 6, "latencyMs": 1485.0133330000099 @@ -13095,7 +13095,7 @@ "model": "gpt-5-nano", "expected": "2637.73", "actual": "2637.73", - "correct": true, + "isCorrect": true, "inputTokens": 3711, "outputTokens": 138, "latencyMs": 3632.860875000013 @@ -13106,7 +13106,7 @@ "model": "claude-haiku-4-5", "expected": "2637.73", "actual": "2637.73", - "correct": true, + "isCorrect": true, "inputTokens": 4079, "outputTokens": 8, "latencyMs": 1224.803750000021 @@ -13117,7 +13117,7 @@ "model": "gpt-5-nano", "expected": "2637.73", "actual": "2637.73", - "correct": true, + "isCorrect": true, "inputTokens": 1562, "outputTokens": 138, "latencyMs": 2323.675958000007 @@ -13128,7 +13128,7 @@ "model": "claude-haiku-4-5", "expected": "2637.73", "actual": "2637.73", - "correct": true, + "isCorrect": true, "inputTokens": 1508, "outputTokens": 8, "latencyMs": 1114.0831669999752 @@ -13139,7 +13139,7 @@ "model": "gpt-5-nano", "expected": "2637.73", "actual": "2637.73", - "correct": true, + "isCorrect": true, "inputTokens": 1440, "outputTokens": 202, "latencyMs": 3465.111333000008 @@ -13150,7 +13150,7 @@ "model": "claude-haiku-4-5", "expected": "2637.73", "actual": "2637.73", - "correct": true, + "isCorrect": true, "inputTokens": 1444, "outputTokens": 8, "latencyMs": 1082.4990419999813 @@ -13161,7 +13161,7 @@ "model": "gpt-5-nano", "expected": "2637.73", "actual": "2637.73", - "correct": true, + "isCorrect": true, "inputTokens": 3828, "outputTokens": 138, "latencyMs": 5648.285415999999 @@ -13172,7 +13172,7 @@ "model": "claude-haiku-4-5", "expected": "2637.73", "actual": "2637.73", - "correct": true, + "isCorrect": true, "inputTokens": 3414, "outputTokens": 8, "latencyMs": 1087.8757500000065 @@ -13183,7 +13183,7 @@ "model": "gpt-5-nano", "expected": "2637.73", "actual": "2637.73", - "correct": true, + "isCorrect": true, "inputTokens": 2984, "outputTokens": 138, "latencyMs": 4587.399166000017 @@ -13194,7 +13194,7 @@ "model": "claude-haiku-4-5", "expected": "2637.73", "actual": "2637.73", - "correct": true, + "isCorrect": true, "inputTokens": 3109, "outputTokens": 8, "latencyMs": 1007.4333340000012 @@ -13205,7 +13205,7 @@ "model": "gpt-5-nano", "expected": "5685", "actual": "5685", - "correct": true, + "isCorrect": true, "inputTokens": 3712, "outputTokens": 72, "latencyMs": 2307.9398339999607 @@ -13216,7 +13216,7 @@ "model": "claude-haiku-4-5", "expected": "5685", "actual": "5685", - "correct": true, + "isCorrect": true, "inputTokens": 4080, "outputTokens": 6, "latencyMs": 2368.3719580000034 @@ -13227,7 +13227,7 @@ "model": "gpt-5-nano", "expected": "5685", "actual": "5685", - "correct": true, + "isCorrect": true, "inputTokens": 1563, "outputTokens": 200, "latencyMs": 3587.720166999963 @@ -13238,7 +13238,7 @@ "model": "claude-haiku-4-5", "expected": "5685", "actual": "5685", - "correct": true, + "isCorrect": true, "inputTokens": 1509, "outputTokens": 6, "latencyMs": 1053.9867080000113 @@ -13249,7 +13249,7 @@ "model": "gpt-5-nano", "expected": "5685", "actual": "5685", - "correct": true, + "isCorrect": true, "inputTokens": 1441, "outputTokens": 136, "latencyMs": 1593.4699169999803 @@ -13260,7 +13260,7 @@ "model": "claude-haiku-4-5", "expected": "5685", "actual": "5685", - "correct": true, + "isCorrect": true, "inputTokens": 1445, "outputTokens": 6, "latencyMs": 2256.4729170000064 @@ -13271,7 +13271,7 @@ "model": "gpt-5-nano", "expected": "5685", "actual": "5685", - "correct": true, + "isCorrect": true, "inputTokens": 3829, "outputTokens": 200, "latencyMs": 4466.158916999993 @@ -13282,7 +13282,7 @@ "model": "claude-haiku-4-5", "expected": "5685", "actual": "5685", - "correct": true, + "isCorrect": true, "inputTokens": 3415, "outputTokens": 6, "latencyMs": 1305.1236670000362 @@ -13293,7 +13293,7 @@ "model": "gpt-5-nano", "expected": "5685", "actual": "5685", - "correct": true, + "isCorrect": true, "inputTokens": 2985, "outputTokens": 136, "latencyMs": 3014.9748339999933 @@ -13304,7 +13304,7 @@ "model": "claude-haiku-4-5", "expected": "5685", "actual": "5685", - "correct": true, + "isCorrect": true, "inputTokens": 3110, "outputTokens": 6, "latencyMs": 1421.9597920000087 @@ -13315,7 +13315,7 @@ "model": "gpt-5-nano", "expected": "3421.06", "actual": "3421.06", - "correct": true, + "isCorrect": true, "inputTokens": 3711, "outputTokens": 202, "latencyMs": 19503.25695900002 @@ -13326,7 +13326,7 @@ "model": "claude-haiku-4-5", "expected": "3421.06", "actual": "3421.06", - "correct": true, + "isCorrect": true, "inputTokens": 4079, "outputTokens": 8, "latencyMs": 1164.002959000005 @@ -13337,7 +13337,7 @@ "model": "gpt-5-nano", "expected": "3421.06", "actual": "3421.06", - "correct": true, + "isCorrect": true, "inputTokens": 1562, "outputTokens": 330, "latencyMs": 4662.637042000017 @@ -13348,7 +13348,7 @@ "model": "claude-haiku-4-5", "expected": "3421.06", "actual": "3421.06", - "correct": true, + "isCorrect": true, "inputTokens": 1508, "outputTokens": 8, "latencyMs": 1086.9569170000032 @@ -13359,7 +13359,7 @@ "model": "gpt-5-nano", "expected": "3421.06", "actual": "3421.06", - "correct": true, + "isCorrect": true, "inputTokens": 1440, "outputTokens": 202, "latencyMs": 2683.73904200003 @@ -13370,7 +13370,7 @@ "model": "claude-haiku-4-5", "expected": "3421.06", "actual": "3421.06", - "correct": true, + "isCorrect": true, "inputTokens": 1444, "outputTokens": 8, "latencyMs": 2289.0300419999985 @@ -13381,7 +13381,7 @@ "model": "gpt-5-nano", "expected": "3421.06", "actual": "3421.06", - "correct": true, + "isCorrect": true, "inputTokens": 3828, "outputTokens": 74, "latencyMs": 1877.1760409999988 @@ -13392,7 +13392,7 @@ "model": "claude-haiku-4-5", "expected": "3421.06", "actual": "3421.06", - "correct": true, + "isCorrect": true, "inputTokens": 3414, "outputTokens": 8, "latencyMs": 1460.1729160000104 @@ -13403,7 +13403,7 @@ "model": "gpt-5-nano", "expected": "3421.06", "actual": "3421.06", - "correct": true, + "isCorrect": true, "inputTokens": 2984, "outputTokens": 138, "latencyMs": 2582.983708999993 @@ -13414,7 +13414,7 @@ "model": "claude-haiku-4-5", "expected": "3421.06", "actual": "3421.06", - "correct": true, + "isCorrect": true, "inputTokens": 3109, "outputTokens": 8, "latencyMs": 1014.1320839999826 @@ -13425,7 +13425,7 @@ "model": "gpt-5-nano", "expected": "344498", "actual": "344498", - "correct": true, + "isCorrect": true, "inputTokens": 3709, "outputTokens": 2376, "latencyMs": 26290.846458000015 @@ -13436,7 +13436,7 @@ "model": "claude-haiku-4-5", "expected": "344498", "actual": "188,945", - "correct": false, + "isCorrect": false, "inputTokens": 4077, "outputTokens": 7, "latencyMs": 1288.6627500000177 @@ -13447,7 +13447,7 @@ "model": "gpt-5-nano", "expected": "344498", "actual": "344498", - "correct": true, + "isCorrect": true, "inputTokens": 1560, "outputTokens": 1736, "latencyMs": 13565.930124999955 @@ -13458,7 +13458,7 @@ "model": "claude-haiku-4-5", "expected": "344498", "actual": "337,045", - "correct": false, + "isCorrect": false, "inputTokens": 1506, "outputTokens": 7, "latencyMs": 1190.8501249999972 @@ -13469,7 +13469,7 @@ "model": "gpt-5-nano", "expected": "344498", "actual": "344498", - "correct": true, + "isCorrect": true, "inputTokens": 1438, "outputTokens": 2888, "latencyMs": 21377.612083000015 @@ -13480,7 +13480,7 @@ "model": "claude-haiku-4-5", "expected": "344498", "actual": "372,915", - "correct": false, + "isCorrect": false, "inputTokens": 1442, "outputTokens": 7, "latencyMs": 931.349749999994 @@ -13491,7 +13491,7 @@ "model": "gpt-5-nano", "expected": "344498", "actual": "344498", - "correct": true, + "isCorrect": true, "inputTokens": 3826, "outputTokens": 3208, "latencyMs": 18997.804958999972 @@ -13502,7 +13502,7 @@ "model": "claude-haiku-4-5", "expected": "344498", "actual": "188,647", - "correct": false, + "isCorrect": false, "inputTokens": 3412, "outputTokens": 7, "latencyMs": 1185.3518330000225 @@ -13513,7 +13513,7 @@ "model": "gpt-5-nano", "expected": "344498", "actual": "344498", - "correct": true, + "isCorrect": true, "inputTokens": 2982, "outputTokens": 2184, "latencyMs": 23924.366792000015 @@ -13524,7 +13524,7 @@ "model": "claude-haiku-4-5", "expected": "344498", "actual": "181,854", - "correct": false, + "isCorrect": false, "inputTokens": 3107, "outputTokens": 7, "latencyMs": 2958.913666999957 @@ -13535,7 +13535,7 @@ "model": "gpt-5-nano", "expected": "312818.50", "actual": "312818.50", - "correct": true, + "isCorrect": true, "inputTokens": 3707, "outputTokens": 4170, "latencyMs": 29361.525874999992 @@ -13546,7 +13546,7 @@ "model": "claude-haiku-4-5", "expected": "312818.50", "actual": "287,745.89", - "correct": false, + "isCorrect": false, "inputTokens": 4075, "outputTokens": 9, "latencyMs": 1325.5311249999795 @@ -13557,7 +13557,7 @@ "model": "gpt-5-nano", "expected": "312818.50", "actual": "312818.50", - "correct": true, + "isCorrect": true, "inputTokens": 1558, "outputTokens": 4106, "latencyMs": 37997.09958400001 @@ -13568,7 +13568,7 @@ "model": "claude-haiku-4-5", "expected": "312818.50", "actual": "487,891.45", - "correct": false, + "isCorrect": false, "inputTokens": 1504, "outputTokens": 9, "latencyMs": 1184.0957090000156 @@ -13579,7 +13579,7 @@ "model": "gpt-5-nano", "expected": "312818.50", "actual": "312818.50", - "correct": true, + "isCorrect": true, "inputTokens": 1436, "outputTokens": 3658, "latencyMs": 26945.63508400001 @@ -13590,7 +13590,7 @@ "model": "claude-haiku-4-5", "expected": "312818.50", "actual": "487,891.89", - "correct": false, + "isCorrect": false, "inputTokens": 1440, "outputTokens": 9, "latencyMs": 1162.16949999996 @@ -13601,7 +13601,7 @@ "model": "gpt-5-nano", "expected": "312818.50", "actual": "312818.50", - "correct": true, + "isCorrect": true, "inputTokens": 3824, "outputTokens": 3722, "latencyMs": 27321.698167000024 @@ -13612,7 +13612,7 @@ "model": "claude-haiku-4-5", "expected": "312818.50", "actual": "381,968.89", - "correct": false, + "isCorrect": false, "inputTokens": 3410, "outputTokens": 9, "latencyMs": 2065.7583339999546 @@ -13623,7 +13623,7 @@ "model": "gpt-5-nano", "expected": "312818.50", "actual": "312818.50", - "correct": true, + "isCorrect": true, "inputTokens": 2980, "outputTokens": 3658, "latencyMs": 28778.99891600001 @@ -13634,7 +13634,7 @@ "model": "claude-haiku-4-5", "expected": "312818.50", "actual": "381,847.89", - "correct": false, + "isCorrect": false, "inputTokens": 3105, "outputTokens": 9, "latencyMs": 1233.4267090000212 @@ -13645,7 +13645,7 @@ "model": "gpt-5-nano", "expected": "1811", "actual": "1811", - "correct": true, + "isCorrect": true, "inputTokens": 3709, "outputTokens": 2568, "latencyMs": 28626.692666999996 @@ -13656,7 +13656,7 @@ "model": "claude-haiku-4-5", "expected": "1811", "actual": "1,234", - "correct": false, + "isCorrect": false, "inputTokens": 4078, "outputTokens": 7, "latencyMs": 1133.735584000009 @@ -13667,7 +13667,7 @@ "model": "gpt-5-nano", "expected": "1811", "actual": "1811", - "correct": true, + "isCorrect": true, "inputTokens": 1560, "outputTokens": 1672, "latencyMs": 14898.688125000044 @@ -13678,7 +13678,7 @@ "model": "claude-haiku-4-5", "expected": "1811", "actual": "1,945", - "correct": false, + "isCorrect": false, "inputTokens": 1507, "outputTokens": 7, "latencyMs": 1178.2744999999995 @@ -13689,7 +13689,7 @@ "model": "gpt-5-nano", "expected": "1811", "actual": "1811", - "correct": true, + "isCorrect": true, "inputTokens": 1438, "outputTokens": 1864, "latencyMs": 15225.964540999965 @@ -13700,7 +13700,7 @@ "model": "claude-haiku-4-5", "expected": "1811", "actual": "1,945", - "correct": false, + "isCorrect": false, "inputTokens": 1443, "outputTokens": 7, "latencyMs": 1077.2695419999654 @@ -13711,7 +13711,7 @@ "model": "gpt-5-nano", "expected": "1811", "actual": "1811", - "correct": true, + "isCorrect": true, "inputTokens": 3826, "outputTokens": 1928, "latencyMs": 14057.434583000024 @@ -13722,7 +13722,7 @@ "model": "claude-haiku-4-5", "expected": "1811", "actual": "1,454", - "correct": false, + "isCorrect": false, "inputTokens": 3413, "outputTokens": 7, "latencyMs": 1177.537500000035 @@ -13733,7 +13733,7 @@ "model": "gpt-5-nano", "expected": "1811", "actual": "1811", - "correct": true, + "isCorrect": true, "inputTokens": 2982, "outputTokens": 2312, "latencyMs": 19125.74099999998 @@ -13744,7 +13744,7 @@ "model": "claude-haiku-4-5", "expected": "1811", "actual": "1,454", - "correct": false, + "isCorrect": false, "inputTokens": 3108, "outputTokens": 7, "latencyMs": 1047.243833000015 @@ -13755,7 +13755,7 @@ "model": "gpt-5-nano", "expected": "42", "actual": "42", - "correct": true, + "isCorrect": true, "inputTokens": 3709, "outputTokens": 1735, "latencyMs": 14875.021707999986 @@ -13766,7 +13766,7 @@ "model": "claude-haiku-4-5", "expected": "42", "actual": "42", - "correct": true, + "isCorrect": true, "inputTokens": 4078, "outputTokens": 5, "latencyMs": 1076.5694999999832 @@ -13777,7 +13777,7 @@ "model": "gpt-5-nano", "expected": "42", "actual": "42", - "correct": true, + "isCorrect": true, "inputTokens": 1560, "outputTokens": 2823, "latencyMs": 22604.422416999994 @@ -13788,7 +13788,7 @@ "model": "claude-haiku-4-5", "expected": "42", "actual": "42", - "correct": true, + "isCorrect": true, "inputTokens": 1507, "outputTokens": 5, "latencyMs": 1451.705666999973 @@ -13799,7 +13799,7 @@ "model": "gpt-5-nano", "expected": "42", "actual": "42", - "correct": true, + "isCorrect": true, "inputTokens": 1438, "outputTokens": 2183, "latencyMs": 16916.007042000012 @@ -13810,7 +13810,7 @@ "model": "claude-haiku-4-5", "expected": "42", "actual": "42", - "correct": true, + "isCorrect": true, "inputTokens": 1443, "outputTokens": 5, "latencyMs": 1103.1098750000237 @@ -13821,7 +13821,7 @@ "model": "gpt-5-nano", "expected": "42", "actual": "42", - "correct": true, + "isCorrect": true, "inputTokens": 3826, "outputTokens": 2055, "latencyMs": 17162.629124999978 @@ -13832,7 +13832,7 @@ "model": "claude-haiku-4-5", "expected": "42", "actual": "47", - "correct": false, + "isCorrect": false, "inputTokens": 3413, "outputTokens": 5, "latencyMs": 1150.0435000000289 @@ -13843,7 +13843,7 @@ "model": "gpt-5-nano", "expected": "42", "actual": "42", - "correct": true, + "isCorrect": true, "inputTokens": 2982, "outputTokens": 1607, "latencyMs": 14835.323333000008 @@ -13854,7 +13854,7 @@ "model": "claude-haiku-4-5", "expected": "42", "actual": "47", - "correct": false, + "isCorrect": false, "inputTokens": 3108, "outputTokens": 5, "latencyMs": 1206.8219590000226 @@ -13865,7 +13865,7 @@ "model": "gpt-5-nano", "expected": "28", "actual": "28", - "correct": true, + "isCorrect": true, "inputTokens": 3709, "outputTokens": 1479, "latencyMs": 11560.967958000023 @@ -13876,7 +13876,7 @@ "model": "claude-haiku-4-5", "expected": "28", "actual": "24", - "correct": false, + "isCorrect": false, "inputTokens": 4078, "outputTokens": 5, "latencyMs": 1151.9984169999952 @@ -13887,7 +13887,7 @@ "model": "gpt-5-nano", "expected": "28", "actual": "28", - "correct": true, + "isCorrect": true, "inputTokens": 1560, "outputTokens": 1927, "latencyMs": 15431.08262499998 @@ -13898,7 +13898,7 @@ "model": "claude-haiku-4-5", "expected": "28", "actual": "26", - "correct": false, + "isCorrect": false, "inputTokens": 1507, "outputTokens": 5, "latencyMs": 1032.7485419999575 @@ -13909,7 +13909,7 @@ "model": "gpt-5-nano", "expected": "28", "actual": "28", - "correct": true, + "isCorrect": true, "inputTokens": 1438, "outputTokens": 1607, "latencyMs": 9425.883957999991 @@ -13920,7 +13920,7 @@ "model": "claude-haiku-4-5", "expected": "28", "actual": "23", - "correct": false, + "isCorrect": false, "inputTokens": 1443, "outputTokens": 5, "latencyMs": 943.5942919999943 @@ -13931,7 +13931,7 @@ "model": "gpt-5-nano", "expected": "28", "actual": "28", - "correct": true, + "isCorrect": true, "inputTokens": 3826, "outputTokens": 1927, "latencyMs": 16529.66529199999 @@ -13942,7 +13942,7 @@ "model": "claude-haiku-4-5", "expected": "28", "actual": "24", - "correct": false, + "isCorrect": false, "inputTokens": 3413, "outputTokens": 5, "latencyMs": 1107.5635419999599 @@ -13953,7 +13953,7 @@ "model": "gpt-5-nano", "expected": "28", "actual": "28", - "correct": true, + "isCorrect": true, "inputTokens": 2982, "outputTokens": 1863, "latencyMs": 21071.067082999973 @@ -13964,7 +13964,7 @@ "model": "claude-haiku-4-5", "expected": "28", "actual": "23", - "correct": false, + "isCorrect": false, "inputTokens": 3108, "outputTokens": 5, "latencyMs": 1018.46212500002 @@ -13975,7 +13975,7 @@ "model": "gpt-5-nano", "expected": "11", "actual": "11", - "correct": true, + "isCorrect": true, "inputTokens": 3709, "outputTokens": 1223, "latencyMs": 8242.37608300004 @@ -13986,7 +13986,7 @@ "model": "claude-haiku-4-5", "expected": "11", "actual": "11", - "correct": true, + "isCorrect": true, "inputTokens": 4078, "outputTokens": 5, "latencyMs": 1052.7201249999925 @@ -13997,7 +13997,7 @@ "model": "gpt-5-nano", "expected": "11", "actual": "11", - "correct": true, + "isCorrect": true, "inputTokens": 1560, "outputTokens": 903, "latencyMs": 5430.806291999994 @@ -14008,7 +14008,7 @@ "model": "claude-haiku-4-5", "expected": "11", "actual": "12", - "correct": false, + "isCorrect": false, "inputTokens": 1507, "outputTokens": 5, "latencyMs": 2354.328999999969 @@ -14019,7 +14019,7 @@ "model": "gpt-5-nano", "expected": "11", "actual": "11", - "correct": true, + "isCorrect": true, "inputTokens": 1438, "outputTokens": 1607, "latencyMs": 21944.211458000005 @@ -14030,7 +14030,7 @@ "model": "claude-haiku-4-5", "expected": "11", "actual": "11", - "correct": true, + "isCorrect": true, "inputTokens": 1443, "outputTokens": 5, "latencyMs": 1249.9959590000217 @@ -14041,7 +14041,7 @@ "model": "gpt-5-nano", "expected": "11", "actual": "11", - "correct": true, + "isCorrect": true, "inputTokens": 3826, "outputTokens": 1415, "latencyMs": 15465.409875000012 @@ -14052,7 +14052,7 @@ "model": "claude-haiku-4-5", "expected": "11", "actual": "11", - "correct": true, + "isCorrect": true, "inputTokens": 3413, "outputTokens": 5, "latencyMs": 1131.9575830000103 @@ -14063,7 +14063,7 @@ "model": "gpt-5-nano", "expected": "11", "actual": "11", - "correct": true, + "isCorrect": true, "inputTokens": 2982, "outputTokens": 2503, "latencyMs": 24744.971958999988 @@ -14074,7 +14074,7 @@ "model": "claude-haiku-4-5", "expected": "11", "actual": "11", - "correct": true, + "isCorrect": true, "inputTokens": 3108, "outputTokens": 5, "latencyMs": 1274.6952499999898 @@ -14085,7 +14085,7 @@ "model": "gpt-5-nano", "expected": "58", "actual": "58", - "correct": true, + "isCorrect": true, "inputTokens": 3708, "outputTokens": 1351, "latencyMs": 12546.867542000022 @@ -14096,7 +14096,7 @@ "model": "claude-haiku-4-5", "expected": "58", "actual": "50", - "correct": false, + "isCorrect": false, "inputTokens": 4078, "outputTokens": 5, "latencyMs": 1231.453749999986 @@ -14107,7 +14107,7 @@ "model": "gpt-5-nano", "expected": "58", "actual": "58", - "correct": true, + "isCorrect": true, "inputTokens": 1559, "outputTokens": 1543, "latencyMs": 16593.402166999993 @@ -14118,7 +14118,7 @@ "model": "claude-haiku-4-5", "expected": "58", "actual": "47", - "correct": false, + "isCorrect": false, "inputTokens": 1507, "outputTokens": 5, "latencyMs": 1079.0991659999709 @@ -14129,7 +14129,7 @@ "model": "gpt-5-nano", "expected": "58", "actual": "58", - "correct": true, + "isCorrect": true, "inputTokens": 1437, "outputTokens": 1543, "latencyMs": 10956.456084000005 @@ -14140,7 +14140,7 @@ "model": "claude-haiku-4-5", "expected": "58", "actual": "54", - "correct": false, + "isCorrect": false, "inputTokens": 1443, "outputTokens": 5, "latencyMs": 2018.3774170000106 @@ -14151,7 +14151,7 @@ "model": "gpt-5-nano", "expected": "58", "actual": "58", - "correct": true, + "isCorrect": true, "inputTokens": 3825, "outputTokens": 1351, "latencyMs": 10537.598500000022 @@ -14162,7 +14162,7 @@ "model": "claude-haiku-4-5", "expected": "58", "actual": "47", - "correct": false, + "isCorrect": false, "inputTokens": 3413, "outputTokens": 5, "latencyMs": 1039.2452080000076 @@ -14173,7 +14173,7 @@ "model": "gpt-5-nano", "expected": "58", "actual": "58", - "correct": true, + "isCorrect": true, "inputTokens": 2981, "outputTokens": 839, "latencyMs": 8039.237708000001 @@ -14184,7 +14184,7 @@ "model": "claude-haiku-4-5", "expected": "58", "actual": "54", - "correct": false, + "isCorrect": false, "inputTokens": 3108, "outputTokens": 5, "latencyMs": 1264.6740829999908 @@ -14195,7 +14195,7 @@ "model": "gpt-5-nano", "expected": "41", "actual": "41", - "correct": true, + "isCorrect": true, "inputTokens": 3708, "outputTokens": 1863, "latencyMs": 14310.697374999989 @@ -14206,7 +14206,7 @@ "model": "claude-haiku-4-5", "expected": "41", "actual": "31", - "correct": false, + "isCorrect": false, "inputTokens": 4078, "outputTokens": 5, "latencyMs": 1138.4443339999998 @@ -14217,7 +14217,7 @@ "model": "gpt-5-nano", "expected": "41", "actual": "41", - "correct": true, + "isCorrect": true, "inputTokens": 1559, "outputTokens": 1927, "latencyMs": 16487.508375000034 @@ -14228,7 +14228,7 @@ "model": "claude-haiku-4-5", "expected": "41", "actual": "38", - "correct": false, + "isCorrect": false, "inputTokens": 1507, "outputTokens": 5, "latencyMs": 1104.2365410000202 @@ -14239,7 +14239,7 @@ "model": "gpt-5-nano", "expected": "41", "actual": "41", - "correct": true, + "isCorrect": true, "inputTokens": 1437, "outputTokens": 3015, "latencyMs": 23688.737208999984 @@ -14250,7 +14250,7 @@ "model": "claude-haiku-4-5", "expected": "41", "actual": "38", - "correct": false, + "isCorrect": false, "inputTokens": 1443, "outputTokens": 5, "latencyMs": 1026.8166249999776 @@ -14261,7 +14261,7 @@ "model": "gpt-5-nano", "expected": "41", "actual": "41", - "correct": true, + "isCorrect": true, "inputTokens": 3825, "outputTokens": 1671, "latencyMs": 12415.87070899998 @@ -14272,7 +14272,7 @@ "model": "claude-haiku-4-5", "expected": "41", "actual": "31", - "correct": false, + "isCorrect": false, "inputTokens": 3413, "outputTokens": 5, "latencyMs": 1062.2278749999823 @@ -14283,7 +14283,7 @@ "model": "gpt-5-nano", "expected": "41", "actual": "41", - "correct": true, + "isCorrect": true, "inputTokens": 2981, "outputTokens": 1799, "latencyMs": 15901.829415999993 @@ -14294,7 +14294,7 @@ "model": "claude-haiku-4-5", "expected": "41", "actual": "31", - "correct": false, + "isCorrect": false, "inputTokens": 3108, "outputTokens": 5, "latencyMs": 1051.6962910000002 @@ -14305,7 +14305,7 @@ "model": "gpt-5-nano", "expected": "23", "actual": "23", - "correct": true, + "isCorrect": true, "inputTokens": 3708, "outputTokens": 1863, "latencyMs": 15216.926500000001 @@ -14316,7 +14316,7 @@ "model": "claude-haiku-4-5", "expected": "23", "actual": "20", - "correct": false, + "isCorrect": false, "inputTokens": 4078, "outputTokens": 5, "latencyMs": 1460.9212079999852 @@ -14327,7 +14327,7 @@ "model": "gpt-5-nano", "expected": "23", "actual": "23", - "correct": true, + "isCorrect": true, "inputTokens": 1559, "outputTokens": 2567, "latencyMs": 27103.083999999973 @@ -14338,7 +14338,7 @@ "model": "claude-haiku-4-5", "expected": "23", "actual": "20", - "correct": false, + "isCorrect": false, "inputTokens": 1507, "outputTokens": 5, "latencyMs": 1101.5416669999831 @@ -14349,7 +14349,7 @@ "model": "gpt-5-nano", "expected": "23", "actual": "23", - "correct": true, + "isCorrect": true, "inputTokens": 1437, "outputTokens": 1543, "latencyMs": 14598.558207999973 @@ -14360,7 +14360,7 @@ "model": "claude-haiku-4-5", "expected": "23", "actual": "20", - "correct": false, + "isCorrect": false, "inputTokens": 1443, "outputTokens": 5, "latencyMs": 1270.7722910000011 @@ -14371,7 +14371,7 @@ "model": "gpt-5-nano", "expected": "23", "actual": "23", - "correct": true, + "isCorrect": true, "inputTokens": 3825, "outputTokens": 1415, "latencyMs": 14102.604708999977 @@ -14382,7 +14382,7 @@ "model": "claude-haiku-4-5", "expected": "23", "actual": "21", - "correct": false, + "isCorrect": false, "inputTokens": 3413, "outputTokens": 5, "latencyMs": 1251.4159170000348 @@ -14393,7 +14393,7 @@ "model": "gpt-5-nano", "expected": "23", "actual": "23", - "correct": true, + "isCorrect": true, "inputTokens": 2981, "outputTokens": 1799, "latencyMs": 18696.684999999998 @@ -14404,7 +14404,7 @@ "model": "claude-haiku-4-5", "expected": "23", "actual": "21", - "correct": false, + "isCorrect": false, "inputTokens": 3108, "outputTokens": 5, "latencyMs": 1170.9401669999934 @@ -14415,7 +14415,7 @@ "model": "gpt-5-nano", "expected": "430828", "actual": "430828", - "correct": true, + "isCorrect": true, "inputTokens": 15187, "outputTokens": 136, "latencyMs": 2872.1482499999693 @@ -14426,7 +14426,7 @@ "model": "claude-haiku-4-5", "expected": "430828", "actual": "430828", - "correct": true, + "isCorrect": true, "inputTokens": 17409, "outputTokens": 6, "latencyMs": 1382.586333000043 @@ -14437,7 +14437,7 @@ "model": "gpt-5-nano", "expected": "430828", "actual": "430828", - "correct": true, + "isCorrect": true, "inputTokens": 8788, "outputTokens": 904, "latencyMs": 9130.657125000027 @@ -14448,7 +14448,7 @@ "model": "claude-haiku-4-5", "expected": "430828", "actual": "430828", - "correct": true, + "isCorrect": true, "inputTokens": 9279, "outputTokens": 6, "latencyMs": 1164.3372080000117 @@ -14459,7 +14459,7 @@ "model": "gpt-5-nano", "expected": "430828", "actual": "430828", - "correct": true, + "isCorrect": true, "inputTokens": 8556, "outputTokens": 648, "latencyMs": 7763.659999999974 @@ -14470,7 +14470,7 @@ "model": "claude-haiku-4-5", "expected": "430828", "actual": "430828", - "correct": true, + "isCorrect": true, "inputTokens": 9125, "outputTokens": 6, "latencyMs": 1331.3139999999548 @@ -14481,7 +14481,7 @@ "model": "gpt-5-nano", "expected": "430828", "actual": "430828", - "correct": true, + "isCorrect": true, "inputTokens": 15481, "outputTokens": 584, "latencyMs": 9411.661499999987 @@ -14492,7 +14492,7 @@ "model": "claude-haiku-4-5", "expected": "430828", "actual": "430828", - "correct": true, + "isCorrect": true, "inputTokens": 15367, "outputTokens": 6, "latencyMs": 1272.1991249999846 @@ -14503,7 +14503,7 @@ "model": "gpt-5-nano", "expected": "430828", "actual": "430828", - "correct": true, + "isCorrect": true, "inputTokens": 13171, "outputTokens": 200, "latencyMs": 3587.8712090000045 @@ -14514,7 +14514,7 @@ "model": "claude-haiku-4-5", "expected": "430828", "actual": "430828", - "correct": true, + "isCorrect": true, "inputTokens": 14483, "outputTokens": 6, "latencyMs": 1710.5899999999674 @@ -14525,7 +14525,7 @@ "model": "gpt-5-nano", "expected": "11798", "actual": "11798", - "correct": true, + "isCorrect": true, "inputTokens": 15189, "outputTokens": 328, "latencyMs": 3625.780167000019 @@ -14536,7 +14536,7 @@ "model": "claude-haiku-4-5", "expected": "11798", "actual": "11798", - "correct": true, + "isCorrect": true, "inputTokens": 17410, "outputTokens": 6, "latencyMs": 1785.2782080000034 @@ -14547,7 +14547,7 @@ "model": "gpt-5-nano", "expected": "11798", "actual": "11798", - "correct": true, + "isCorrect": true, "inputTokens": 8790, "outputTokens": 712, "latencyMs": 6381.770374999964 @@ -14558,7 +14558,7 @@ "model": "claude-haiku-4-5", "expected": "11798", "actual": "11798", - "correct": true, + "isCorrect": true, "inputTokens": 9280, "outputTokens": 6, "latencyMs": 1352.5436660000123 @@ -14569,7 +14569,7 @@ "model": "gpt-5-nano", "expected": "11798", "actual": "11798", - "correct": true, + "isCorrect": true, "inputTokens": 8558, "outputTokens": 520, "latencyMs": 27916.417874999985 @@ -14580,7 +14580,7 @@ "model": "claude-haiku-4-5", "expected": "11798", "actual": "11798", - "correct": true, + "isCorrect": true, "inputTokens": 9126, "outputTokens": 6, "latencyMs": 2073.8068330000388 @@ -14591,7 +14591,7 @@ "model": "gpt-5-nano", "expected": "11798", "actual": "11798", - "correct": true, + "isCorrect": true, "inputTokens": 15483, "outputTokens": 328, "latencyMs": 5943.872542000026 @@ -14602,7 +14602,7 @@ "model": "claude-haiku-4-5", "expected": "11798", "actual": "11798", - "correct": true, + "isCorrect": true, "inputTokens": 15368, "outputTokens": 6, "latencyMs": 1767.4393339999951 @@ -14613,7 +14613,7 @@ "model": "gpt-5-nano", "expected": "11798", "actual": "11798", - "correct": true, + "isCorrect": true, "inputTokens": 13173, "outputTokens": 264, "latencyMs": 3115.895124999981 @@ -14624,7 +14624,7 @@ "model": "claude-haiku-4-5", "expected": "11798", "actual": "11798", - "correct": true, + "isCorrect": true, "inputTokens": 14484, "outputTokens": 6, "latencyMs": 1183.2249999999767 @@ -14635,7 +14635,7 @@ "model": "gpt-5-nano", "expected": "183631", "actual": "183631", - "correct": true, + "isCorrect": true, "inputTokens": 15192, "outputTokens": 392, "latencyMs": 4991.646125000028 @@ -14646,7 +14646,7 @@ "model": "claude-haiku-4-5", "expected": "183631", "actual": "183631", - "correct": true, + "isCorrect": true, "inputTokens": 17412, "outputTokens": 6, "latencyMs": 1835.4077919999836 @@ -14657,7 +14657,7 @@ "model": "gpt-5-nano", "expected": "183631", "actual": "183631", - "correct": true, + "isCorrect": true, "inputTokens": 8793, "outputTokens": 712, "latencyMs": 7788.013291999989 @@ -14668,7 +14668,7 @@ "model": "claude-haiku-4-5", "expected": "183631", "actual": "183631", - "correct": true, + "isCorrect": true, "inputTokens": 9282, "outputTokens": 6, "latencyMs": 1082.4066669999738 @@ -14679,7 +14679,7 @@ "model": "gpt-5-nano", "expected": "183631", "actual": "183631", - "correct": true, + "isCorrect": true, "inputTokens": 8561, "outputTokens": 520, "latencyMs": 5664.896500000032 @@ -14690,7 +14690,7 @@ "model": "claude-haiku-4-5", "expected": "183631", "actual": "183631", - "correct": true, + "isCorrect": true, "inputTokens": 9128, "outputTokens": 6, "latencyMs": 1215.8875830000034 @@ -14701,7 +14701,7 @@ "model": "gpt-5-nano", "expected": "183631", "actual": "183631", - "correct": true, + "isCorrect": true, "inputTokens": 15486, "outputTokens": 456, "latencyMs": 5141.449292000034 @@ -14712,7 +14712,7 @@ "model": "claude-haiku-4-5", "expected": "183631", "actual": "183631", - "correct": true, + "isCorrect": true, "inputTokens": 15370, "outputTokens": 6, "latencyMs": 1483.2090420000022 @@ -14723,7 +14723,7 @@ "model": "gpt-5-nano", "expected": "183631", "actual": "183631", - "correct": true, + "isCorrect": true, "inputTokens": 13176, "outputTokens": 328, "latencyMs": 7532.760624999995 @@ -14734,7 +14734,7 @@ "model": "claude-haiku-4-5", "expected": "183631", "actual": "183631", - "correct": true, + "isCorrect": true, "inputTokens": 14486, "outputTokens": 6, "latencyMs": 1458.0657500000088 @@ -14745,7 +14745,7 @@ "model": "gpt-5-nano", "expected": "29246", "actual": "29246", - "correct": true, + "isCorrect": true, "inputTokens": 15191, "outputTokens": 392, "latencyMs": 7922.4705829999875 @@ -14756,7 +14756,7 @@ "model": "claude-haiku-4-5", "expected": "29246", "actual": "29246", - "correct": true, + "isCorrect": true, "inputTokens": 17412, "outputTokens": 6, "latencyMs": 1510.0054579999996 @@ -14767,7 +14767,7 @@ "model": "gpt-5-nano", "expected": "29246", "actual": "29246", - "correct": true, + "isCorrect": true, "inputTokens": 8792, "outputTokens": 776, "latencyMs": 8475.77466699999 @@ -14778,7 +14778,7 @@ "model": "claude-haiku-4-5", "expected": "29246", "actual": "29246", - "correct": true, + "isCorrect": true, "inputTokens": 9282, "outputTokens": 6, "latencyMs": 1203.3620419999934 @@ -14789,7 +14789,7 @@ "model": "gpt-5-nano", "expected": "29246", "actual": "29246", - "correct": true, + "isCorrect": true, "inputTokens": 8560, "outputTokens": 776, "latencyMs": 7283.84258300002 @@ -14800,7 +14800,7 @@ "model": "claude-haiku-4-5", "expected": "29246", "actual": "29246", - "correct": true, + "isCorrect": true, "inputTokens": 9128, "outputTokens": 6, "latencyMs": 1365.2434169999906 @@ -14811,7 +14811,7 @@ "model": "gpt-5-nano", "expected": "29246", "actual": "29246", - "correct": true, + "isCorrect": true, "inputTokens": 15485, "outputTokens": 520, "latencyMs": 5846.538916999998 @@ -14822,7 +14822,7 @@ "model": "claude-haiku-4-5", "expected": "29246", "actual": "29246", - "correct": true, + "isCorrect": true, "inputTokens": 15370, "outputTokens": 6, "latencyMs": 1203.6220829999656 @@ -14833,7 +14833,7 @@ "model": "gpt-5-nano", "expected": "29246", "actual": "29246", - "correct": true, + "isCorrect": true, "inputTokens": 13175, "outputTokens": 456, "latencyMs": 5973.848832999996 @@ -14844,7 +14844,7 @@ "model": "claude-haiku-4-5", "expected": "29246", "actual": "29246", - "correct": true, + "isCorrect": true, "inputTokens": 14486, "outputTokens": 6, "latencyMs": 1189.811875000014 @@ -14855,7 +14855,7 @@ "model": "gpt-5-nano", "expected": "135306", "actual": "135306", - "correct": true, + "isCorrect": true, "inputTokens": 15187, "outputTokens": 328, "latencyMs": 8872.252957999997 @@ -14866,7 +14866,7 @@ "model": "claude-haiku-4-5", "expected": "135306", "actual": "135306", - "correct": true, + "isCorrect": true, "inputTokens": 17407, "outputTokens": 6, "latencyMs": 1775.476083000016 @@ -14877,7 +14877,7 @@ "model": "gpt-5-nano", "expected": "135306", "actual": "135306", - "correct": true, + "isCorrect": true, "inputTokens": 8788, "outputTokens": 648, "latencyMs": 7149.649291000038 @@ -14888,7 +14888,7 @@ "model": "claude-haiku-4-5", "expected": "135306", "actual": "135306", - "correct": true, + "isCorrect": true, "inputTokens": 9277, "outputTokens": 6, "latencyMs": 1577.2079999999842 @@ -14899,7 +14899,7 @@ "model": "gpt-5-nano", "expected": "135306", "actual": "135306", - "correct": true, + "isCorrect": true, "inputTokens": 8556, "outputTokens": 1288, "latencyMs": 11344.462834000005 @@ -14910,7 +14910,7 @@ "model": "claude-haiku-4-5", "expected": "135306", "actual": "135306", - "correct": true, + "isCorrect": true, "inputTokens": 9123, "outputTokens": 6, "latencyMs": 1340.27887499996 @@ -14921,7 +14921,7 @@ "model": "gpt-5-nano", "expected": "135306", "actual": "135306", - "correct": true, + "isCorrect": true, "inputTokens": 15481, "outputTokens": 392, "latencyMs": 6256.696250000037 @@ -14932,7 +14932,7 @@ "model": "claude-haiku-4-5", "expected": "135306", "actual": "135306", - "correct": true, + "isCorrect": true, "inputTokens": 15365, "outputTokens": 6, "latencyMs": 1604.6909999999916 @@ -14943,7 +14943,7 @@ "model": "gpt-5-nano", "expected": "135306", "actual": "135306", - "correct": true, + "isCorrect": true, "inputTokens": 13171, "outputTokens": 456, "latencyMs": 5982.022666999954 @@ -14954,7 +14954,7 @@ "model": "claude-haiku-4-5", "expected": "135306", "actual": "135306", - "correct": true, + "isCorrect": true, "inputTokens": 14481, "outputTokens": 6, "latencyMs": 1259.2409589999588 @@ -14965,7 +14965,7 @@ "model": "gpt-5-nano", "expected": "24914", "actual": "24914", - "correct": true, + "isCorrect": true, "inputTokens": 15186, "outputTokens": 200, "latencyMs": 2858.1693749999977 @@ -14976,7 +14976,7 @@ "model": "claude-haiku-4-5", "expected": "24914", "actual": "24914", - "correct": true, + "isCorrect": true, "inputTokens": 17408, "outputTokens": 6, "latencyMs": 1786.5725000000093 @@ -14987,7 +14987,7 @@ "model": "gpt-5-nano", "expected": "24914", "actual": "24914", - "correct": true, + "isCorrect": true, "inputTokens": 8787, "outputTokens": 2696, "latencyMs": 23868.72975 @@ -14998,7 +14998,7 @@ "model": "claude-haiku-4-5", "expected": "24914", "actual": "24914", - "correct": true, + "isCorrect": true, "inputTokens": 9278, "outputTokens": 6, "latencyMs": 1116.0275000000256 @@ -15009,7 +15009,7 @@ "model": "gpt-5-nano", "expected": "24914", "actual": "0", - "correct": false, + "isCorrect": false, "inputTokens": 8555, "outputTokens": 1543, "latencyMs": 17006.341916999954 @@ -15020,7 +15020,7 @@ "model": "claude-haiku-4-5", "expected": "24914", "actual": "24914", - "correct": true, + "isCorrect": true, "inputTokens": 9124, "outputTokens": 6, "latencyMs": 1425.7799160000286 @@ -15031,7 +15031,7 @@ "model": "gpt-5-nano", "expected": "24914", "actual": "24914", - "correct": true, + "isCorrect": true, "inputTokens": 15480, "outputTokens": 648, "latencyMs": 8414.583791000012 @@ -15042,7 +15042,7 @@ "model": "claude-haiku-4-5", "expected": "24914", "actual": "24914", - "correct": true, + "isCorrect": true, "inputTokens": 15366, "outputTokens": 6, "latencyMs": 1374.9217920000083 @@ -15053,7 +15053,7 @@ "model": "gpt-5-nano", "expected": "24914", "actual": "24914", - "correct": true, + "isCorrect": true, "inputTokens": 13170, "outputTokens": 456, "latencyMs": 6113.31808300002 @@ -15064,7 +15064,7 @@ "model": "claude-haiku-4-5", "expected": "24914", "actual": "24914", - "correct": true, + "isCorrect": true, "inputTokens": 14482, "outputTokens": 6, "latencyMs": 1374.9246660000063 @@ -15075,7 +15075,7 @@ "model": "gpt-5-nano", "expected": "111683", "actual": "111683", - "correct": true, + "isCorrect": true, "inputTokens": 15186, "outputTokens": 392, "latencyMs": 5410.596499999985 @@ -15086,7 +15086,7 @@ "model": "claude-haiku-4-5", "expected": "111683", "actual": "111683", - "correct": true, + "isCorrect": true, "inputTokens": 17407, "outputTokens": 6, "latencyMs": 1607.6261659999727 @@ -15097,7 +15097,7 @@ "model": "gpt-5-nano", "expected": "111683", "actual": "111683", - "correct": true, + "isCorrect": true, "inputTokens": 8787, "outputTokens": 520, "latencyMs": 6469.81479199999 @@ -15108,7 +15108,7 @@ "model": "claude-haiku-4-5", "expected": "111683", "actual": "111683", - "correct": true, + "isCorrect": true, "inputTokens": 9277, "outputTokens": 6, "latencyMs": 1103.9521250000107 @@ -15119,7 +15119,7 @@ "model": "gpt-5-nano", "expected": "111683", "actual": "111683", - "correct": true, + "isCorrect": true, "inputTokens": 8555, "outputTokens": 904, "latencyMs": 8993.236791000003 @@ -15130,7 +15130,7 @@ "model": "claude-haiku-4-5", "expected": "111683", "actual": "111683", - "correct": true, + "isCorrect": true, "inputTokens": 9123, "outputTokens": 6, "latencyMs": 1118.0249590000021 @@ -15141,7 +15141,7 @@ "model": "gpt-5-nano", "expected": "111683", "actual": "111683", - "correct": true, + "isCorrect": true, "inputTokens": 15480, "outputTokens": 392, "latencyMs": 4705.902084000001 @@ -15152,7 +15152,7 @@ "model": "claude-haiku-4-5", "expected": "111683", "actual": "111683", - "correct": true, + "isCorrect": true, "inputTokens": 15365, "outputTokens": 6, "latencyMs": 1454.1250839999993 @@ -15163,7 +15163,7 @@ "model": "gpt-5-nano", "expected": "111683", "actual": "111683", - "correct": true, + "isCorrect": true, "inputTokens": 13170, "outputTokens": 456, "latencyMs": 5041.734750000003 @@ -15174,7 +15174,7 @@ "model": "claude-haiku-4-5", "expected": "111683", "actual": "111683", - "correct": true, + "isCorrect": true, "inputTokens": 14481, "outputTokens": 6, "latencyMs": 1199.9473330000183 @@ -15185,7 +15185,7 @@ "model": "gpt-5-nano", "expected": "13364", "actual": "13364", - "correct": true, + "isCorrect": true, "inputTokens": 15193, "outputTokens": 328, "latencyMs": 4364.900083000015 @@ -15196,7 +15196,7 @@ "model": "claude-haiku-4-5", "expected": "13364", "actual": "13364", - "correct": true, + "isCorrect": true, "inputTokens": 17412, "outputTokens": 6, "latencyMs": 1320.7056250000023 @@ -15207,7 +15207,7 @@ "model": "gpt-5-nano", "expected": "13364", "actual": "13364", - "correct": true, + "isCorrect": true, "inputTokens": 8794, "outputTokens": 904, "latencyMs": 8590.36599999998 @@ -15218,7 +15218,7 @@ "model": "claude-haiku-4-5", "expected": "13364", "actual": "13364", - "correct": true, + "isCorrect": true, "inputTokens": 9282, "outputTokens": 6, "latencyMs": 1166.0237089999719 @@ -15229,7 +15229,7 @@ "model": "gpt-5-nano", "expected": "13364", "actual": "13364", - "correct": true, + "isCorrect": true, "inputTokens": 8562, "outputTokens": 648, "latencyMs": 6442.057417000004 @@ -15240,7 +15240,7 @@ "model": "claude-haiku-4-5", "expected": "13364", "actual": "13364", - "correct": true, + "isCorrect": true, "inputTokens": 9128, "outputTokens": 6, "latencyMs": 1342.8652910000528 @@ -15251,7 +15251,7 @@ "model": "gpt-5-nano", "expected": "13364", "actual": "13364", - "correct": true, + "isCorrect": true, "inputTokens": 15487, "outputTokens": 264, "latencyMs": 4450.340833000024 @@ -15262,7 +15262,7 @@ "model": "claude-haiku-4-5", "expected": "13364", "actual": "13364", - "correct": true, + "isCorrect": true, "inputTokens": 15370, "outputTokens": 6, "latencyMs": 1551.4001249999856 @@ -15273,7 +15273,7 @@ "model": "gpt-5-nano", "expected": "13364", "actual": "13364", - "correct": true, + "isCorrect": true, "inputTokens": 13177, "outputTokens": 520, "latencyMs": 5858.679374999949 @@ -15284,7 +15284,7 @@ "model": "claude-haiku-4-5", "expected": "13364", "actual": "13364", - "correct": true, + "isCorrect": true, "inputTokens": 14486, "outputTokens": 6, "latencyMs": 1173.6422499999753 @@ -15295,7 +15295,7 @@ "model": "gpt-5-nano", "expected": "98464", "actual": "98464", - "correct": true, + "isCorrect": true, "inputTokens": 15185, "outputTokens": 456, "latencyMs": 6377.878708000004 @@ -15306,7 +15306,7 @@ "model": "claude-haiku-4-5", "expected": "98464", "actual": "98464", - "correct": true, + "isCorrect": true, "inputTokens": 17405, "outputTokens": 6, "latencyMs": 1312.9188750000321 @@ -15317,7 +15317,7 @@ "model": "gpt-5-nano", "expected": "98464", "actual": "98464", - "correct": true, + "isCorrect": true, "inputTokens": 8786, "outputTokens": 4680, "latencyMs": 36395.80937499995 @@ -15328,7 +15328,7 @@ "model": "claude-haiku-4-5", "expected": "98464", "actual": "98464", - "correct": true, + "isCorrect": true, "inputTokens": 9275, "outputTokens": 6, "latencyMs": 2024.6539580000099 @@ -15339,7 +15339,7 @@ "model": "gpt-5-nano", "expected": "98464", "actual": "98464", - "correct": true, + "isCorrect": true, "inputTokens": 8554, "outputTokens": 3784, "latencyMs": 30336.309707999986 @@ -15350,7 +15350,7 @@ "model": "claude-haiku-4-5", "expected": "98464", "actual": "98464", - "correct": true, + "isCorrect": true, "inputTokens": 9121, "outputTokens": 6, "latencyMs": 1237.6976249999716 @@ -15361,7 +15361,7 @@ "model": "gpt-5-nano", "expected": "98464", "actual": "98464", - "correct": true, + "isCorrect": true, "inputTokens": 15479, "outputTokens": 264, "latencyMs": 5297.444375000021 @@ -15372,7 +15372,7 @@ "model": "claude-haiku-4-5", "expected": "98464", "actual": "98464", - "correct": true, + "isCorrect": true, "inputTokens": 15363, "outputTokens": 6, "latencyMs": 1775.3334170000162 @@ -15383,7 +15383,7 @@ "model": "gpt-5-nano", "expected": "98464", "actual": "98464", - "correct": true, + "isCorrect": true, "inputTokens": 13169, "outputTokens": 392, "latencyMs": 8030.958958000003 @@ -15394,7 +15394,7 @@ "model": "claude-haiku-4-5", "expected": "98464", "actual": "98464", - "correct": true, + "isCorrect": true, "inputTokens": 14479, "outputTokens": 6, "latencyMs": 1401.1453330000513 @@ -15405,7 +15405,7 @@ "model": "gpt-5-nano", "expected": "6378", "actual": "6378", - "correct": true, + "isCorrect": true, "inputTokens": 15187, "outputTokens": 264, "latencyMs": 6193.845583000046 @@ -15416,7 +15416,7 @@ "model": "claude-haiku-4-5", "expected": "6378", "actual": "6378", - "correct": true, + "isCorrect": true, "inputTokens": 17408, "outputTokens": 6, "latencyMs": 2449.4082920000073 @@ -15427,7 +15427,7 @@ "model": "gpt-5-nano", "expected": "6378", "actual": "6378", - "correct": true, + "isCorrect": true, "inputTokens": 8788, "outputTokens": 2568, "latencyMs": 25386.850749999983 @@ -15438,7 +15438,7 @@ "model": "claude-haiku-4-5", "expected": "6378", "actual": "6378", - "correct": true, + "isCorrect": true, "inputTokens": 9278, "outputTokens": 6, "latencyMs": 1351.401165999996 @@ -15449,7 +15449,7 @@ "model": "gpt-5-nano", "expected": "6378", "actual": "6378", - "correct": true, + "isCorrect": true, "inputTokens": 8556, "outputTokens": 456, "latencyMs": 5087.453167000029 @@ -15460,7 +15460,7 @@ "model": "claude-haiku-4-5", "expected": "6378", "actual": "6378", - "correct": true, + "isCorrect": true, "inputTokens": 9124, "outputTokens": 6, "latencyMs": 1229.4187500000116 @@ -15471,7 +15471,7 @@ "model": "gpt-5-nano", "expected": "6378", "actual": "6378", - "correct": true, + "isCorrect": true, "inputTokens": 15481, "outputTokens": 520, "latencyMs": 6781.348249999981 @@ -15482,7 +15482,7 @@ "model": "claude-haiku-4-5", "expected": "6378", "actual": "6378", - "correct": true, + "isCorrect": true, "inputTokens": 15366, "outputTokens": 6, "latencyMs": 1411.0081670000218 @@ -15493,7 +15493,7 @@ "model": "gpt-5-nano", "expected": "6378", "actual": "6378", - "correct": true, + "isCorrect": true, "inputTokens": 13171, "outputTokens": 328, "latencyMs": 9405.325083000003 @@ -15504,7 +15504,7 @@ "model": "claude-haiku-4-5", "expected": "6378", "actual": "6378", - "correct": true, + "isCorrect": true, "inputTokens": 14482, "outputTokens": 6, "latencyMs": 1575.9942499999888 @@ -15515,7 +15515,7 @@ "model": "gpt-5-nano", "expected": "254916", "actual": "254916", - "correct": true, + "isCorrect": true, "inputTokens": 15189, "outputTokens": 456, "latencyMs": 7723.79820900003 @@ -15526,7 +15526,7 @@ "model": "claude-haiku-4-5", "expected": "254916", "actual": "254916", - "correct": true, + "isCorrect": true, "inputTokens": 17409, "outputTokens": 6, "latencyMs": 1496.878625000012 @@ -15537,7 +15537,7 @@ "model": "gpt-5-nano", "expected": "254916", "actual": "254916", - "correct": true, + "isCorrect": true, "inputTokens": 8790, "outputTokens": 328, "latencyMs": 5231.312959000003 @@ -15548,7 +15548,7 @@ "model": "claude-haiku-4-5", "expected": "254916", "actual": "254916", - "correct": true, + "isCorrect": true, "inputTokens": 9279, "outputTokens": 6, "latencyMs": 1145.5107919999864 @@ -15559,7 +15559,7 @@ "model": "gpt-5-nano", "expected": "254916", "actual": "254916", - "correct": true, + "isCorrect": true, "inputTokens": 8558, "outputTokens": 392, "latencyMs": 4585.943417000002 @@ -15570,7 +15570,7 @@ "model": "claude-haiku-4-5", "expected": "254916", "actual": "254916", - "correct": true, + "isCorrect": true, "inputTokens": 9125, "outputTokens": 6, "latencyMs": 1386.1237079999992 @@ -15581,7 +15581,7 @@ "model": "gpt-5-nano", "expected": "254916", "actual": "254916", - "correct": true, + "isCorrect": true, "inputTokens": 15483, "outputTokens": 328, "latencyMs": 9374.248917000019 @@ -15592,7 +15592,7 @@ "model": "claude-haiku-4-5", "expected": "254916", "actual": "254916", - "correct": true, + "isCorrect": true, "inputTokens": 15367, "outputTokens": 6, "latencyMs": 1332.4388340000296 @@ -15603,7 +15603,7 @@ "model": "gpt-5-nano", "expected": "254916", "actual": "254916", - "correct": true, + "isCorrect": true, "inputTokens": 13173, "outputTokens": 200, "latencyMs": 3953.8284580000327 @@ -15614,7 +15614,7 @@ "model": "claude-haiku-4-5", "expected": "254916", "actual": "254916", - "correct": true, + "isCorrect": true, "inputTokens": 14483, "outputTokens": 6, "latencyMs": 1294.3535840000259 @@ -15625,7 +15625,7 @@ "model": "gpt-5-nano", "expected": "32413", "actual": "32413", - "correct": true, + "isCorrect": true, "inputTokens": 15187, "outputTokens": 584, "latencyMs": 8515.676582999993 @@ -15636,7 +15636,7 @@ "model": "claude-haiku-4-5", "expected": "32413", "actual": "32413", - "correct": true, + "isCorrect": true, "inputTokens": 17410, "outputTokens": 6, "latencyMs": 2508.0940420000115 @@ -15647,7 +15647,7 @@ "model": "gpt-5-nano", "expected": "32413", "actual": "32413", - "correct": true, + "isCorrect": true, "inputTokens": 8788, "outputTokens": 584, "latencyMs": 6331.0320000000065 @@ -15658,7 +15658,7 @@ "model": "claude-haiku-4-5", "expected": "32413", "actual": "32413", - "correct": true, + "isCorrect": true, "inputTokens": 9280, "outputTokens": 6, "latencyMs": 1249.4856250000303 @@ -15669,7 +15669,7 @@ "model": "gpt-5-nano", "expected": "32413", "actual": "32413", - "correct": true, + "isCorrect": true, "inputTokens": 8556, "outputTokens": 648, "latencyMs": 8463.519499999995 @@ -15680,7 +15680,7 @@ "model": "claude-haiku-4-5", "expected": "32413", "actual": "32413", - "correct": true, + "isCorrect": true, "inputTokens": 9126, "outputTokens": 6, "latencyMs": 1035.4223750000237 @@ -15691,7 +15691,7 @@ "model": "gpt-5-nano", "expected": "32413", "actual": "32413", - "correct": true, + "isCorrect": true, "inputTokens": 15481, "outputTokens": 520, "latencyMs": 9625.975833999983 @@ -15702,7 +15702,7 @@ "model": "claude-haiku-4-5", "expected": "32413", "actual": "32413", - "correct": true, + "isCorrect": true, "inputTokens": 15368, "outputTokens": 6, "latencyMs": 1460.7396250000456 @@ -15713,7 +15713,7 @@ "model": "gpt-5-nano", "expected": "32413", "actual": "32413", - "correct": true, + "isCorrect": true, "inputTokens": 13171, "outputTokens": 712, "latencyMs": 7525.112709000008 @@ -15724,7 +15724,7 @@ "model": "claude-haiku-4-5", "expected": "32413", "actual": "32413", - "correct": true, + "isCorrect": true, "inputTokens": 14484, "outputTokens": 6, "latencyMs": 1488.0029170000344 @@ -15735,7 +15735,7 @@ "model": "gpt-5-nano", "expected": "240059", "actual": "not found", - "correct": false, + "isCorrect": false, "inputTokens": 15185, "outputTokens": 1352, "latencyMs": 8303.157542 @@ -15746,7 +15746,7 @@ "model": "claude-haiku-4-5", "expected": "240059", "actual": "240059", - "correct": true, + "isCorrect": true, "inputTokens": 17405, "outputTokens": 6, "latencyMs": 1515.7900000000373 @@ -15757,7 +15757,7 @@ "model": "gpt-5-nano", "expected": "240059", "actual": "0", - "correct": false, + "isCorrect": false, "inputTokens": 8786, "outputTokens": 2503, "latencyMs": 20915.808583000035 @@ -15768,7 +15768,7 @@ "model": "claude-haiku-4-5", "expected": "240059", "actual": "240059", - "correct": true, + "isCorrect": true, "inputTokens": 9275, "outputTokens": 6, "latencyMs": 1193.4237079999875 @@ -15779,7 +15779,7 @@ "model": "gpt-5-nano", "expected": "240059", "actual": "240059", - "correct": true, + "isCorrect": true, "inputTokens": 8554, "outputTokens": 4360, "latencyMs": 34760.80329100002 @@ -15790,7 +15790,7 @@ "model": "claude-haiku-4-5", "expected": "240059", "actual": "240059", - "correct": true, + "isCorrect": true, "inputTokens": 9121, "outputTokens": 6, "latencyMs": 3022.242749999976 @@ -15801,7 +15801,7 @@ "model": "gpt-5-nano", "expected": "240059", "actual": "0", - "correct": false, + "isCorrect": false, "inputTokens": 15479, "outputTokens": 2567, "latencyMs": 15901.546999999962 @@ -15812,7 +15812,7 @@ "model": "claude-haiku-4-5", "expected": "240059", "actual": "240059", - "correct": true, + "isCorrect": true, "inputTokens": 15363, "outputTokens": 6, "latencyMs": 1358.283374999999 @@ -15823,7 +15823,7 @@ "model": "gpt-5-nano", "expected": "240059", "actual": "240059", - "correct": true, + "isCorrect": true, "inputTokens": 13169, "outputTokens": 584, "latencyMs": 10520.349042000016 @@ -15834,7 +15834,7 @@ "model": "claude-haiku-4-5", "expected": "240059", "actual": "240059", - "correct": true, + "isCorrect": true, "inputTokens": 14479, "outputTokens": 6, "latencyMs": 1426.0678330000374 @@ -15845,7 +15845,7 @@ "model": "gpt-5-nano", "expected": "48986", "actual": "48986", - "correct": true, + "isCorrect": true, "inputTokens": 15186, "outputTokens": 712, "latencyMs": 7069.827042000019 @@ -15856,7 +15856,7 @@ "model": "claude-haiku-4-5", "expected": "48986", "actual": "48986", - "correct": true, + "isCorrect": true, "inputTokens": 17406, "outputTokens": 6, "latencyMs": 1507.9525419999845 @@ -15867,7 +15867,7 @@ "model": "gpt-5-nano", "expected": "48986", "actual": "undefined", - "correct": false, + "isCorrect": false, "inputTokens": 8787, "outputTokens": 2311, "latencyMs": 18257.385332999984 @@ -15878,7 +15878,7 @@ "model": "claude-haiku-4-5", "expected": "48986", "actual": "48986", - "correct": true, + "isCorrect": true, "inputTokens": 9276, "outputTokens": 6, "latencyMs": 1397.3040420000325 @@ -15889,7 +15889,7 @@ "model": "gpt-5-nano", "expected": "48986", "actual": "48986", - "correct": true, + "isCorrect": true, "inputTokens": 8555, "outputTokens": 3976, "latencyMs": 29865.140291999967 @@ -15900,7 +15900,7 @@ "model": "claude-haiku-4-5", "expected": "48986", "actual": "48986", - "correct": true, + "isCorrect": true, "inputTokens": 9122, "outputTokens": 6, "latencyMs": 1218.4357079999754 @@ -15911,7 +15911,7 @@ "model": "gpt-5-nano", "expected": "48986", "actual": "48986", - "correct": true, + "isCorrect": true, "inputTokens": 15480, "outputTokens": 904, "latencyMs": 8906.708750000049 @@ -15922,7 +15922,7 @@ "model": "claude-haiku-4-5", "expected": "48986", "actual": "48986", - "correct": true, + "isCorrect": true, "inputTokens": 15364, "outputTokens": 6, "latencyMs": 1917.3721249999944 @@ -15933,7 +15933,7 @@ "model": "gpt-5-nano", "expected": "48986", "actual": "48986", - "correct": true, + "isCorrect": true, "inputTokens": 13170, "outputTokens": 1160, "latencyMs": 9665.802708000003 @@ -15944,7 +15944,7 @@ "model": "claude-haiku-4-5", "expected": "48986", "actual": "48986", - "correct": true, + "isCorrect": true, "inputTokens": 14480, "outputTokens": 6, "latencyMs": 1342.7929170000134 @@ -15955,7 +15955,7 @@ "model": "gpt-5-nano", "expected": "209624", "actual": "209624", - "correct": true, + "isCorrect": true, "inputTokens": 15185, "outputTokens": 648, "latencyMs": 6259.387500000012 @@ -15966,7 +15966,7 @@ "model": "claude-haiku-4-5", "expected": "209624", "actual": "209624", - "correct": true, + "isCorrect": true, "inputTokens": 17405, "outputTokens": 6, "latencyMs": 1860.1597499999916 @@ -15977,7 +15977,7 @@ "model": "gpt-5-nano", "expected": "209624", "actual": "209624", - "correct": true, + "isCorrect": true, "inputTokens": 8786, "outputTokens": 3336, "latencyMs": 23288.63820799999 @@ -15988,7 +15988,7 @@ "model": "claude-haiku-4-5", "expected": "209624", "actual": "209624", - "correct": true, + "isCorrect": true, "inputTokens": 9275, "outputTokens": 6, "latencyMs": 1180.5804169999901 @@ -15999,7 +15999,7 @@ "model": "gpt-5-nano", "expected": "209624", "actual": "209624", - "correct": true, + "isCorrect": true, "inputTokens": 8554, "outputTokens": 840, "latencyMs": 6988.782166000048 @@ -16010,7 +16010,7 @@ "model": "claude-haiku-4-5", "expected": "209624", "actual": "209624", - "correct": true, + "isCorrect": true, "inputTokens": 9121, "outputTokens": 6, "latencyMs": 1391.326041000022 @@ -16021,7 +16021,7 @@ "model": "gpt-5-nano", "expected": "209624", "actual": "209624", - "correct": true, + "isCorrect": true, "inputTokens": 15479, "outputTokens": 648, "latencyMs": 6708.915624999965 @@ -16032,7 +16032,7 @@ "model": "claude-haiku-4-5", "expected": "209624", "actual": "209624", - "correct": true, + "isCorrect": true, "inputTokens": 15363, "outputTokens": 6, "latencyMs": 1364.766833999951 @@ -16043,7 +16043,7 @@ "model": "gpt-5-nano", "expected": "209624", "actual": "209624", - "correct": true, + "isCorrect": true, "inputTokens": 13169, "outputTokens": 328, "latencyMs": 3396.199416999996 @@ -16054,7 +16054,7 @@ "model": "claude-haiku-4-5", "expected": "209624", "actual": "209624", - "correct": true, + "isCorrect": true, "inputTokens": 14479, "outputTokens": 6, "latencyMs": 1378.3461249999818 @@ -16065,7 +16065,7 @@ "model": "gpt-5-nano", "expected": "58023", "actual": "58023", - "correct": true, + "isCorrect": true, "inputTokens": 15185, "outputTokens": 200, "latencyMs": 2947.7053750000196 @@ -16076,7 +16076,7 @@ "model": "claude-haiku-4-5", "expected": "58023", "actual": "58023", - "correct": true, + "isCorrect": true, "inputTokens": 17406, "outputTokens": 6, "latencyMs": 1512.1218329999829 @@ -16087,7 +16087,7 @@ "model": "gpt-5-nano", "expected": "58023", "actual": "58023", - "correct": true, + "isCorrect": true, "inputTokens": 8786, "outputTokens": 840, "latencyMs": 7657.443458000023 @@ -16098,7 +16098,7 @@ "model": "claude-haiku-4-5", "expected": "58023", "actual": "58023", - "correct": true, + "isCorrect": true, "inputTokens": 9276, "outputTokens": 6, "latencyMs": 1119.6807499999995 @@ -16109,7 +16109,7 @@ "model": "gpt-5-nano", "expected": "58023", "actual": "58023", - "correct": true, + "isCorrect": true, "inputTokens": 8554, "outputTokens": 392, "latencyMs": 4410.906208000029 @@ -16120,7 +16120,7 @@ "model": "claude-haiku-4-5", "expected": "58023", "actual": "58023", - "correct": true, + "isCorrect": true, "inputTokens": 9122, "outputTokens": 6, "latencyMs": 1227.467249999987 @@ -16131,7 +16131,7 @@ "model": "gpt-5-nano", "expected": "58023", "actual": "58023", - "correct": true, + "isCorrect": true, "inputTokens": 15479, "outputTokens": 328, "latencyMs": 4168.014292000036 @@ -16142,7 +16142,7 @@ "model": "claude-haiku-4-5", "expected": "58023", "actual": "58023", - "correct": true, + "isCorrect": true, "inputTokens": 15364, "outputTokens": 6, "latencyMs": 1878.2624590000487 @@ -16153,7 +16153,7 @@ "model": "gpt-5-nano", "expected": "58023", "actual": "58023", - "correct": true, + "isCorrect": true, "inputTokens": 13169, "outputTokens": 456, "latencyMs": 4726.903416000016 @@ -16164,7 +16164,7 @@ "model": "claude-haiku-4-5", "expected": "58023", "actual": "58023", - "correct": true, + "isCorrect": true, "inputTokens": 14480, "outputTokens": 6, "latencyMs": 1665.950124999974 @@ -16175,7 +16175,7 @@ "model": "gpt-5-nano", "expected": "196024", "actual": "196024", - "correct": true, + "isCorrect": true, "inputTokens": 15188, "outputTokens": 456, "latencyMs": 5633.756834 @@ -16186,7 +16186,7 @@ "model": "claude-haiku-4-5", "expected": "196024", "actual": "196024", - "correct": true, + "isCorrect": true, "inputTokens": 17407, "outputTokens": 6, "latencyMs": 1482.6277910000063 @@ -16197,7 +16197,7 @@ "model": "gpt-5-nano", "expected": "196024", "actual": "196024", - "correct": true, + "isCorrect": true, "inputTokens": 8789, "outputTokens": 1416, "latencyMs": 11371.267457999988 @@ -16208,7 +16208,7 @@ "model": "claude-haiku-4-5", "expected": "196024", "actual": "196024", - "correct": true, + "isCorrect": true, "inputTokens": 9277, "outputTokens": 6, "latencyMs": 1690.2400420000195 @@ -16219,7 +16219,7 @@ "model": "gpt-5-nano", "expected": "196024", "actual": "Repo not found", - "correct": false, + "isCorrect": false, "inputTokens": 8557, "outputTokens": 3273, "latencyMs": 28731.530667000043 @@ -16230,7 +16230,7 @@ "model": "claude-haiku-4-5", "expected": "196024", "actual": "196024", - "correct": true, + "isCorrect": true, "inputTokens": 9123, "outputTokens": 6, "latencyMs": 1070.5141670000157 @@ -16241,7 +16241,7 @@ "model": "gpt-5-nano", "expected": "196024", "actual": "196024", - "correct": true, + "isCorrect": true, "inputTokens": 15482, "outputTokens": 520, "latencyMs": 7021.771125000028 @@ -16252,7 +16252,7 @@ "model": "claude-haiku-4-5", "expected": "196024", "actual": "196024", - "correct": true, + "isCorrect": true, "inputTokens": 15365, "outputTokens": 6, "latencyMs": 1243.7466250000289 @@ -16263,7 +16263,7 @@ "model": "gpt-5-nano", "expected": "196024", "actual": "196024", - "correct": true, + "isCorrect": true, "inputTokens": 13172, "outputTokens": 456, "latencyMs": 5286.169750000001 @@ -16274,7 +16274,7 @@ "model": "claude-haiku-4-5", "expected": "196024", "actual": "196024", - "correct": true, + "isCorrect": true, "inputTokens": 14481, "outputTokens": 6, "latencyMs": 1450.456957999966 @@ -16285,7 +16285,7 @@ "model": "gpt-5-nano", "expected": "30919", "actual": "30919", - "correct": true, + "isCorrect": true, "inputTokens": 15188, "outputTokens": 456, "latencyMs": 5440.864250000042 @@ -16296,7 +16296,7 @@ "model": "claude-haiku-4-5", "expected": "30919", "actual": "30919", - "correct": true, + "isCorrect": true, "inputTokens": 17408, "outputTokens": 6, "latencyMs": 1369.6618330000201 @@ -16307,7 +16307,7 @@ "model": "gpt-5-nano", "expected": "30919", "actual": "30919", - "correct": true, + "isCorrect": true, "inputTokens": 8789, "outputTokens": 712, "latencyMs": 6130.9379999999655 @@ -16318,7 +16318,7 @@ "model": "claude-haiku-4-5", "expected": "30919", "actual": "30919", - "correct": true, + "isCorrect": true, "inputTokens": 9278, "outputTokens": 6, "latencyMs": 1635.81579100003 @@ -16329,7 +16329,7 @@ "model": "gpt-5-nano", "expected": "30919", "actual": "N/A", - "correct": false, + "isCorrect": false, "inputTokens": 8557, "outputTokens": 1288, "latencyMs": 20319.653374999994 @@ -16340,7 +16340,7 @@ "model": "claude-haiku-4-5", "expected": "30919", "actual": "30919", - "correct": true, + "isCorrect": true, "inputTokens": 9124, "outputTokens": 6, "latencyMs": 1381.8252079999656 @@ -16351,7 +16351,7 @@ "model": "gpt-5-nano", "expected": "30919", "actual": "30919", - "correct": true, + "isCorrect": true, "inputTokens": 15482, "outputTokens": 328, "latencyMs": 5951.751374999993 @@ -16362,7 +16362,7 @@ "model": "claude-haiku-4-5", "expected": "30919", "actual": "30919", - "correct": true, + "isCorrect": true, "inputTokens": 15366, "outputTokens": 6, "latencyMs": 1367.1241670000018 @@ -16373,7 +16373,7 @@ "model": "gpt-5-nano", "expected": "30919", "actual": "30919", - "correct": true, + "isCorrect": true, "inputTokens": 13172, "outputTokens": 328, "latencyMs": 3499.136334000039 @@ -16384,7 +16384,7 @@ "model": "claude-haiku-4-5", "expected": "30919", "actual": "30919", - "correct": true, + "isCorrect": true, "inputTokens": 14482, "outputTokens": 6, "latencyMs": 1573.7027499999967 @@ -16395,7 +16395,7 @@ "model": "gpt-5-nano", "expected": "192220", "actual": "192220", - "correct": true, + "isCorrect": true, "inputTokens": 15187, "outputTokens": 392, "latencyMs": 7833.668625000049 @@ -16406,7 +16406,7 @@ "model": "claude-haiku-4-5", "expected": "192220", "actual": "192220", - "correct": true, + "isCorrect": true, "inputTokens": 17405, "outputTokens": 6, "latencyMs": 1477.048582999967 @@ -16417,7 +16417,7 @@ "model": "gpt-5-nano", "expected": "192220", "actual": "192220", - "correct": true, + "isCorrect": true, "inputTokens": 8788, "outputTokens": 520, "latencyMs": 4880.817959000007 @@ -16428,7 +16428,7 @@ "model": "claude-haiku-4-5", "expected": "192220", "actual": "192220", - "correct": true, + "isCorrect": true, "inputTokens": 9275, "outputTokens": 6, "latencyMs": 1081.6979169999831 @@ -16439,7 +16439,7 @@ "model": "gpt-5-nano", "expected": "192220", "actual": "192220", - "correct": true, + "isCorrect": true, "inputTokens": 8556, "outputTokens": 1992, "latencyMs": 14180.11841699999 @@ -16450,7 +16450,7 @@ "model": "claude-haiku-4-5", "expected": "192220", "actual": "192220", - "correct": true, + "isCorrect": true, "inputTokens": 9121, "outputTokens": 6, "latencyMs": 1393.665417000011 @@ -16461,7 +16461,7 @@ "model": "gpt-5-nano", "expected": "192220", "actual": "192220", - "correct": true, + "isCorrect": true, "inputTokens": 15481, "outputTokens": 392, "latencyMs": 4068.912416999985 @@ -16472,7 +16472,7 @@ "model": "claude-haiku-4-5", "expected": "192220", "actual": "192220", - "correct": true, + "isCorrect": true, "inputTokens": 15363, "outputTokens": 6, "latencyMs": 1687.0724170000176 @@ -16483,7 +16483,7 @@ "model": "gpt-5-nano", "expected": "192220", "actual": "192220", - "correct": true, + "isCorrect": true, "inputTokens": 13171, "outputTokens": 392, "latencyMs": 4048.8707089999807 @@ -16494,7 +16494,7 @@ "model": "claude-haiku-4-5", "expected": "192220", "actual": "192220", - "correct": true, + "isCorrect": true, "inputTokens": 14479, "outputTokens": 6, "latencyMs": 1441.8594579999917 @@ -16505,7 +16505,7 @@ "model": "gpt-5-nano", "expected": "11763", "actual": "11763", - "correct": true, + "isCorrect": true, "inputTokens": 15190, "outputTokens": 392, "latencyMs": 4563.366041000001 @@ -16516,7 +16516,7 @@ "model": "claude-haiku-4-5", "expected": "11763", "actual": "11763", - "correct": true, + "isCorrect": true, "inputTokens": 17414, "outputTokens": 6, "latencyMs": 1361.9952920000069 @@ -16527,7 +16527,7 @@ "model": "gpt-5-nano", "expected": "11763", "actual": "11763", - "correct": true, + "isCorrect": true, "inputTokens": 8791, "outputTokens": 904, "latencyMs": 9523.924416000023 @@ -16538,7 +16538,7 @@ "model": "claude-haiku-4-5", "expected": "11763", "actual": "11763", - "correct": true, + "isCorrect": true, "inputTokens": 9284, "outputTokens": 6, "latencyMs": 1235.863416999986 @@ -16549,7 +16549,7 @@ "model": "gpt-5-nano", "expected": "11763", "actual": "11763", - "correct": true, + "isCorrect": true, "inputTokens": 8559, "outputTokens": 584, "latencyMs": 5264.637583000003 @@ -16560,7 +16560,7 @@ "model": "claude-haiku-4-5", "expected": "11763", "actual": "11763", - "correct": true, + "isCorrect": true, "inputTokens": 9130, "outputTokens": 6, "latencyMs": 1307.1584169999696 @@ -16571,7 +16571,7 @@ "model": "gpt-5-nano", "expected": "11763", "actual": "11763", - "correct": true, + "isCorrect": true, "inputTokens": 15484, "outputTokens": 328, "latencyMs": 8621.355207999994 @@ -16582,7 +16582,7 @@ "model": "claude-haiku-4-5", "expected": "11763", "actual": "11763", - "correct": true, + "isCorrect": true, "inputTokens": 15372, "outputTokens": 6, "latencyMs": 1464.8200829999987 @@ -16593,7 +16593,7 @@ "model": "gpt-5-nano", "expected": "11763", "actual": "11763", - "correct": true, + "isCorrect": true, "inputTokens": 13174, "outputTokens": 264, "latencyMs": 3034.7359999999753 @@ -16604,7 +16604,7 @@ "model": "claude-haiku-4-5", "expected": "11763", "actual": "11763", - "correct": true, + "isCorrect": true, "inputTokens": 14488, "outputTokens": 6, "latencyMs": 1959.3285000000033 @@ -16615,7 +16615,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "100", - "correct": true, + "isCorrect": true, "inputTokens": 15187, "outputTokens": 2055, "latencyMs": 16430.930082999985 @@ -16626,7 +16626,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "0", - "correct": false, + "isCorrect": false, "inputTokens": 17406, "outputTokens": 5, "latencyMs": 1730.124458999955 @@ -16637,7 +16637,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "100", - "correct": true, + "isCorrect": true, "inputTokens": 8788, "outputTokens": 839, "latencyMs": 7275.640458000009 @@ -16648,7 +16648,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "0", - "correct": false, + "isCorrect": false, "inputTokens": 9276, "outputTokens": 5, "latencyMs": 1286.8315839999705 @@ -16659,7 +16659,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "0", - "correct": false, + "isCorrect": false, "inputTokens": 8556, "outputTokens": 2695, "latencyMs": 24177.570000000007 @@ -16670,7 +16670,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "0", - "correct": false, + "isCorrect": false, "inputTokens": 9122, "outputTokens": 5, "latencyMs": 1102.5337500000023 @@ -16681,7 +16681,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "100", - "correct": true, + "isCorrect": true, "inputTokens": 15481, "outputTokens": 1671, "latencyMs": 14929.856415999995 @@ -16692,7 +16692,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "100", - "correct": true, + "isCorrect": true, "inputTokens": 15364, "outputTokens": 5, "latencyMs": 1227.103541999997 @@ -16703,7 +16703,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "0", - "correct": false, + "isCorrect": false, "inputTokens": 13171, "outputTokens": 583, "latencyMs": 5785.248666999978 @@ -16714,7 +16714,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "0", - "correct": false, + "isCorrect": false, "inputTokens": 14480, "outputTokens": 5, "latencyMs": 1959.456125000026 @@ -16725,7 +16725,7 @@ "model": "gpt-5-nano", "expected": "15404143", "actual": "19196630", - "correct": false, + "isCorrect": false, "inputTokens": 15188, "outputTokens": 13385, "latencyMs": 239619.323125 @@ -16736,7 +16736,7 @@ "model": "claude-haiku-4-5", "expected": "15404143", "actual": "13,847,892", - "correct": false, + "isCorrect": false, "inputTokens": 17407, "outputTokens": 9, "latencyMs": 1838.8340420000022 @@ -16747,7 +16747,7 @@ "model": "gpt-5-nano", "expected": "15404143", "actual": "15404143", - "correct": true, + "isCorrect": true, "inputTokens": 8789, "outputTokens": 12169, "latencyMs": 109453.991416 @@ -16758,7 +16758,7 @@ "model": "claude-haiku-4-5", "expected": "15404143", "actual": "13,847,892", - "correct": false, + "isCorrect": false, "inputTokens": 9277, "outputTokens": 9, "latencyMs": 1443.470417000004 @@ -16769,7 +16769,7 @@ "model": "gpt-5-nano", "expected": "15404143", "actual": "15404143", - "correct": true, + "isCorrect": true, "inputTokens": 8557, "outputTokens": 6281, "latencyMs": 45474.442209 @@ -16780,7 +16780,7 @@ "model": "claude-haiku-4-5", "expected": "15404143", "actual": "15,847,892", - "correct": false, + "isCorrect": false, "inputTokens": 9123, "outputTokens": 9, "latencyMs": 1361.6022089999751 @@ -16791,7 +16791,7 @@ "model": "gpt-5-nano", "expected": "15404143", "actual": "15404143", - "correct": true, + "isCorrect": true, "inputTokens": 15482, "outputTokens": 4489, "latencyMs": 29654.25554099999 @@ -16802,7 +16802,7 @@ "model": "claude-haiku-4-5", "expected": "15404143", "actual": "13,847,892", - "correct": false, + "isCorrect": false, "inputTokens": 15365, "outputTokens": 9, "latencyMs": 1796.0902500000084 @@ -16813,7 +16813,7 @@ "model": "gpt-5-nano", "expected": "15404143", "actual": "15404143", - "correct": true, + "isCorrect": true, "inputTokens": 13172, "outputTokens": 6409, "latencyMs": 70234.84133299999 @@ -16824,7 +16824,7 @@ "model": "claude-haiku-4-5", "expected": "15404143", "actual": "13,847,892", - "correct": false, + "isCorrect": false, "inputTokens": 14481, "outputTokens": 9, "latencyMs": 1965.7452919999487 @@ -16835,7 +16835,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "60", - "correct": false, + "isCorrect": false, "inputTokens": 15188, "outputTokens": 7495, "latencyMs": 72992.43658400001 @@ -16846,7 +16846,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "100", - "correct": true, + "isCorrect": true, "inputTokens": 17408, "outputTokens": 5, "latencyMs": 1772.3059999999823 @@ -16857,7 +16857,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "100", - "correct": true, + "isCorrect": true, "inputTokens": 8789, "outputTokens": 2759, "latencyMs": 19214.133417000005 @@ -16868,7 +16868,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "100", - "correct": true, + "isCorrect": true, "inputTokens": 9278, "outputTokens": 5, "latencyMs": 1115.5979170000064 @@ -16879,7 +16879,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "100", - "correct": true, + "isCorrect": true, "inputTokens": 8557, "outputTokens": 2439, "latencyMs": 27365.987334000005 @@ -16890,7 +16890,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "100", - "correct": true, + "isCorrect": true, "inputTokens": 9124, "outputTokens": 5, "latencyMs": 1322.4322910000337 @@ -16901,7 +16901,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "100", - "correct": true, + "isCorrect": true, "inputTokens": 15482, "outputTokens": 5767, "latencyMs": 60524.90554200002 @@ -16912,7 +16912,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "100", - "correct": true, + "isCorrect": true, "inputTokens": 15366, "outputTokens": 5, "latencyMs": 1597.7364170000073 @@ -16923,7 +16923,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "100", - "correct": true, + "isCorrect": true, "inputTokens": 13172, "outputTokens": 4039, "latencyMs": 28819.869999999995 @@ -16934,7 +16934,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "100", - "correct": true, + "isCorrect": true, "inputTokens": 14482, "outputTokens": 5, "latencyMs": 1798.9455409999937 @@ -16945,7 +16945,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "86", - "correct": false, + "isCorrect": false, "inputTokens": 15188, "outputTokens": 2375, "latencyMs": 23963.549916999997 @@ -16956,7 +16956,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "71", - "correct": false, + "isCorrect": false, "inputTokens": 17408, "outputTokens": 5, "latencyMs": 1836.1375000000116 @@ -16967,7 +16967,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "100", - "correct": true, + "isCorrect": true, "inputTokens": 8789, "outputTokens": 3079, "latencyMs": 26957.04420799995 @@ -16978,7 +16978,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "42", - "correct": false, + "isCorrect": false, "inputTokens": 9278, "outputTokens": 5, "latencyMs": 1209.7997920000344 @@ -16989,7 +16989,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "100", - "correct": true, + "isCorrect": true, "inputTokens": 8557, "outputTokens": 2887, "latencyMs": 27174.970375000034 @@ -17000,7 +17000,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "47", - "correct": false, + "isCorrect": false, "inputTokens": 9124, "outputTokens": 5, "latencyMs": 1293.6252920000115 @@ -17011,7 +17011,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "98", - "correct": false, + "isCorrect": false, "inputTokens": 15482, "outputTokens": 2567, "latencyMs": 29565.065250000043 @@ -17022,7 +17022,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "71", - "correct": false, + "isCorrect": false, "inputTokens": 15366, "outputTokens": 5, "latencyMs": 1230.7459160000435 @@ -17033,7 +17033,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "100", - "correct": true, + "isCorrect": true, "inputTokens": 13172, "outputTokens": 2695, "latencyMs": 20706.84841700003 @@ -17044,7 +17044,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "71", - "correct": false, + "isCorrect": false, "inputTokens": 14482, "outputTokens": 5, "latencyMs": 1743.1536249999772 @@ -17055,7 +17055,7 @@ "model": "gpt-5-nano", "expected": "76", "actual": "41", - "correct": false, + "isCorrect": false, "inputTokens": 15188, "outputTokens": 8263, "latencyMs": 60899.858959000034 @@ -17066,7 +17066,7 @@ "model": "claude-haiku-4-5", "expected": "76", "actual": "100", - "correct": false, + "isCorrect": false, "inputTokens": 17408, "outputTokens": 5, "latencyMs": 1350.1540420000092 @@ -17077,7 +17077,7 @@ "model": "gpt-5-nano", "expected": "76", "actual": "76", - "correct": true, + "isCorrect": true, "inputTokens": 8789, "outputTokens": 3847, "latencyMs": 30491.779582999996 @@ -17088,7 +17088,7 @@ "model": "claude-haiku-4-5", "expected": "76", "actual": "100", - "correct": false, + "isCorrect": false, "inputTokens": 9278, "outputTokens": 5, "latencyMs": 1513.2665410000482 @@ -17099,7 +17099,7 @@ "model": "gpt-5-nano", "expected": "76", "actual": "76", - "correct": true, + "isCorrect": true, "inputTokens": 8557, "outputTokens": 3847, "latencyMs": 25522.397125000018 @@ -17110,7 +17110,7 @@ "model": "claude-haiku-4-5", "expected": "76", "actual": "100", - "correct": false, + "isCorrect": false, "inputTokens": 9124, "outputTokens": 5, "latencyMs": 1150.7281660000444 @@ -17121,7 +17121,7 @@ "model": "gpt-5-nano", "expected": "76", "actual": "76", - "correct": true, + "isCorrect": true, "inputTokens": 15482, "outputTokens": 2631, "latencyMs": 22525.465083000017 @@ -17132,7 +17132,7 @@ "model": "claude-haiku-4-5", "expected": "76", "actual": "100", - "correct": false, + "isCorrect": false, "inputTokens": 15366, "outputTokens": 5, "latencyMs": 1438.5829169999924 @@ -17143,7 +17143,7 @@ "model": "gpt-5-nano", "expected": "76", "actual": "62", - "correct": false, + "isCorrect": false, "inputTokens": 13172, "outputTokens": 1351, "latencyMs": 11162.623291999975 @@ -17154,7 +17154,7 @@ "model": "claude-haiku-4-5", "expected": "76", "actual": "100", - "correct": false, + "isCorrect": false, "inputTokens": 14482, "outputTokens": 5, "latencyMs": 1305.162249999994 @@ -17165,7 +17165,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "129", - "correct": false, + "isCorrect": false, "inputTokens": 15188, "outputTokens": 6599, "latencyMs": 49590.68900000001 @@ -17176,7 +17176,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "89", - "correct": false, + "isCorrect": false, "inputTokens": 17409, "outputTokens": 5, "latencyMs": 1750.9506249999977 @@ -17187,7 +17187,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "100", - "correct": true, + "isCorrect": true, "inputTokens": 8789, "outputTokens": 8903, "latencyMs": 68556.36550000001 @@ -17198,7 +17198,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "73", - "correct": false, + "isCorrect": false, "inputTokens": 9279, "outputTokens": 5, "latencyMs": 1148.3701669999864 @@ -17209,7 +17209,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "100", - "correct": true, + "isCorrect": true, "inputTokens": 8557, "outputTokens": 3271, "latencyMs": 36128.254709 @@ -17220,7 +17220,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "89", - "correct": false, + "isCorrect": false, "inputTokens": 9125, "outputTokens": 5, "latencyMs": 1137.2578750000102 @@ -17231,7 +17231,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "79", - "correct": false, + "isCorrect": false, "inputTokens": 15482, "outputTokens": 3527, "latencyMs": 35526.23958300002 @@ -17242,7 +17242,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "95", - "correct": false, + "isCorrect": false, "inputTokens": 15367, "outputTokens": 5, "latencyMs": 1501.6561670000083 @@ -17253,7 +17253,7 @@ "model": "gpt-5-nano", "expected": "100", "actual": "99", - "correct": false, + "isCorrect": false, "inputTokens": 13172, "outputTokens": 3143, "latencyMs": 26700.229333000025 @@ -17264,7 +17264,7 @@ "model": "claude-haiku-4-5", "expected": "100", "actual": "95", - "correct": false, + "isCorrect": false, "inputTokens": 14483, "outputTokens": 5, "latencyMs": 1159.0904580000206 @@ -17275,7 +17275,7 @@ "model": "gpt-5-nano", "expected": "95", "actual": "94", - "correct": false, + "isCorrect": false, "inputTokens": 15188, "outputTokens": 4999, "latencyMs": 32710.407750000013 @@ -17286,7 +17286,7 @@ "model": "claude-haiku-4-5", "expected": "95", "actual": "42", - "correct": false, + "isCorrect": false, "inputTokens": 17409, "outputTokens": 5, "latencyMs": 1451.6710420000018 @@ -17297,7 +17297,7 @@ "model": "gpt-5-nano", "expected": "95", "actual": "82", - "correct": false, + "isCorrect": false, "inputTokens": 8789, "outputTokens": 3143, "latencyMs": 18360.73424999998 @@ -17308,7 +17308,7 @@ "model": "claude-haiku-4-5", "expected": "95", "actual": "42", - "correct": false, + "isCorrect": false, "inputTokens": 9279, "outputTokens": 5, "latencyMs": 1035.2159160000156 @@ -17319,7 +17319,7 @@ "model": "gpt-5-nano", "expected": "95", "actual": "95", - "correct": true, + "isCorrect": true, "inputTokens": 8557, "outputTokens": 4487, "latencyMs": 28020.044915999984 @@ -17330,7 +17330,7 @@ "model": "claude-haiku-4-5", "expected": "95", "actual": "42", - "correct": false, + "isCorrect": false, "inputTokens": 9125, "outputTokens": 5, "latencyMs": 1175.8671249999898 @@ -17341,7 +17341,7 @@ "model": "gpt-5-nano", "expected": "95", "actual": "77", - "correct": false, + "isCorrect": false, "inputTokens": 15482, "outputTokens": 2887, "latencyMs": 24031.185459 @@ -17352,7 +17352,7 @@ "model": "claude-haiku-4-5", "expected": "95", "actual": "47", - "correct": false, + "isCorrect": false, "inputTokens": 15367, "outputTokens": 5, "latencyMs": 1724.9393750000163 @@ -17363,7 +17363,7 @@ "model": "gpt-5-nano", "expected": "95", "actual": "81", - "correct": false, + "isCorrect": false, "inputTokens": 13172, "outputTokens": 4359, "latencyMs": 35723.19641699997 @@ -17374,7 +17374,7 @@ "model": "claude-haiku-4-5", "expected": "95", "actual": "47", - "correct": false, + "isCorrect": false, "inputTokens": 14483, "outputTokens": 5, "latencyMs": 1663.259167000011 @@ -17385,7 +17385,7 @@ "model": "gpt-5-nano", "expected": "83", "actual": "71", - "correct": false, + "isCorrect": false, "inputTokens": 15188, "outputTokens": 2439, "latencyMs": 18168.518166999973 @@ -17396,7 +17396,7 @@ "model": "claude-haiku-4-5", "expected": "83", "actual": "71", - "correct": false, + "isCorrect": false, "inputTokens": 17409, "outputTokens": 5, "latencyMs": 1390.1757499999949 @@ -17407,7 +17407,7 @@ "model": "gpt-5-nano", "expected": "83", "actual": "57", - "correct": false, + "isCorrect": false, "inputTokens": 8789, "outputTokens": 4423, "latencyMs": 41240.42016700003 @@ -17418,7 +17418,7 @@ "model": "claude-haiku-4-5", "expected": "83", "actual": "73", - "correct": false, + "isCorrect": false, "inputTokens": 9279, "outputTokens": 5, "latencyMs": 1066.675458999991 @@ -17429,7 +17429,7 @@ "model": "gpt-5-nano", "expected": "83", "actual": "83", - "correct": true, + "isCorrect": true, "inputTokens": 8557, "outputTokens": 5831, "latencyMs": 40638.93858400005 @@ -17440,7 +17440,7 @@ "model": "claude-haiku-4-5", "expected": "83", "actual": "73", - "correct": false, + "isCorrect": false, "inputTokens": 9125, "outputTokens": 5, "latencyMs": 1394.1952499999898 @@ -17451,7 +17451,7 @@ "model": "gpt-5-nano", "expected": "83", "actual": "83", - "correct": true, + "isCorrect": true, "inputTokens": 15482, "outputTokens": 3591, "latencyMs": 25356.36183400004 @@ -17462,7 +17462,7 @@ "model": "claude-haiku-4-5", "expected": "83", "actual": "71", - "correct": false, + "isCorrect": false, "inputTokens": 15367, "outputTokens": 5, "latencyMs": 1238.0827089999802 @@ -17473,7 +17473,7 @@ "model": "gpt-5-nano", "expected": "83", "actual": "72", - "correct": false, + "isCorrect": false, "inputTokens": 13172, "outputTokens": 2567, "latencyMs": 25124.520583999984 @@ -17484,7 +17484,7 @@ "model": "claude-haiku-4-5", "expected": "83", "actual": "71", - "correct": false, + "isCorrect": false, "inputTokens": 14483, "outputTokens": 5, "latencyMs": 2058.834957999992 diff --git a/benchmarks/results/accuracy/report.md b/benchmarks/results/accuracy/report.md index a6f9a5c..b44276b 100644 --- a/benchmarks/results/accuracy/report.md +++ b/benchmarks/results/accuracy/report.md @@ -28,7 +28,7 @@ claude-haiku-4-5 ##### Uniform employee records (TOON optimal format) | Format | Accuracy | Tokens | Correct/Total | -|--------|----------|--------|---------------| +| ------ | -------- | ------ | ------------- | | `toon` | 86.2% | 2.483 | 100/116 | | `csv` | 80.2% | 2.337 | 93/116 | | `yaml` | 82.8% | 4.969 | 96/116 | @@ -38,7 +38,7 @@ claude-haiku-4-5 ##### E-commerce orders with nested structures | Format | Accuracy | Tokens | Correct/Total | -|--------|----------|--------|---------------| +| ------ | -------- | ------ | ------------- | | `toon` | 90.9% | 5.967 | 80/88 | | `csv` | 90.9% | 6.735 | 80/88 | | `yaml` | 89.8% | 7.328 | 79/88 | @@ -48,17 +48,17 @@ claude-haiku-4-5 ##### Time-series analytics data | Format | Accuracy | Tokens | Correct/Total | -|--------|----------|--------|---------------| +| ------ | -------- | ------ | ------------- | | `csv` | 87.9% | 1.393 | 51/58 | | `toon` | 86.2% | 1.515 | 50/58 | | `yaml` | 86.2% | 2.938 | 50/58 | | `json` | 87.9% | 3.665 | 51/58 | | `markdown-kv` | 86.2% | 3.779 | 50/58 | -##### Popular GitHub repositories +##### Top 100 GitHub repositories | Format | Accuracy | Tokens | Correct/Total | -|--------|----------|--------|---------------| +| ------ | -------- | ------ | ------------- | | `csv` | 80.4% | 8.513 | 45/56 | | `toon` | 80.4% | 8.745 | 45/56 | | `yaml` | 78.6% | 13.129 | 44/56 | @@ -70,7 +70,7 @@ claude-haiku-4-5 ##### gpt-5-nano | Format | Accuracy | Correct/Total | -|--------|----------|---------------| +| ------ | -------- | ------------- | | `toon` | 97.5% | 155/159 | | `markdown-kv` | 95.6% | 152/159 | | `yaml` | 94.3% | 150/159 | @@ -80,7 +80,7 @@ claude-haiku-4-5 ##### claude-haiku-4-5 | Format | Accuracy | Correct/Total | -|--------|----------|---------------| +| ------ | -------- | ------------- | | `markdown-kv` | 76.7% | 122/159 | | `toon` | 75.5% | 120/159 | | `json` | 75.5% | 120/159 | diff --git a/benchmarks/results/accuracy/summary.json b/benchmarks/results/accuracy/summary.json index a49a81a..dbbd353 100644 --- a/benchmarks/results/accuracy/summary.json +++ b/benchmarks/results/accuracy/summary.json @@ -61,7 +61,7 @@ }, { "name": "github", - "description": "Popular GitHub repositories" + "description": "Top 100 GitHub repositories" } ], "tokenCounts": { @@ -86,5 +86,5 @@ "yaml-analytics": 2938, "yaml-github": 13129 }, - "timestamp": "2025-10-27T12:43:38.288Z" + "timestamp": "2025-10-27T13:04:50.634Z" } diff --git a/benchmarks/scripts/accuracy-benchmark.ts b/benchmarks/scripts/accuracy-benchmark.ts index e2a4c1c..b467c63 100644 --- a/benchmarks/scripts/accuracy-benchmark.ts +++ b/benchmarks/scripts/accuracy-benchmark.ts @@ -81,6 +81,7 @@ else { // Format datasets once (reuse for all questions) const formattedDatasets: Record> = {} + for (const [formatName, formatter] of Object.entries(formatters)) { formattedDatasets[formatName] ??= {} @@ -91,6 +92,7 @@ else { // Generate evaluation tasks const tasks: { question: Question, formatName: string, modelName: string }[] = [] + for (const question of questions) { for (const [formatName] of Object.entries(formatters)) { for (const [modelName] of Object.entries(activeModels)) { @@ -100,7 +102,6 @@ else { } const total = tasks.length - consola.start(`Running ${total} evaluations with concurrency: ${DEFAULT_CONCURRENCY}`) // Evaluate all tasks in parallel @@ -110,16 +111,15 @@ else { const formattedData = formattedDatasets[task.formatName]![task.question.dataset]! const model = activeModels[task.modelName as keyof typeof activeModels]! - const result = await evaluateQuestion( - task.question, - task.formatName, + const result = await evaluateQuestion({ + question: task.question, + formatName: task.formatName, formattedData, model, - task.modelName, - ) + }) - // Progress update - if ((index + 1) % 10 === 0) { + // Progress update after task completes + if ((index + 1) % 10 === 0 || (index + 1) === total) { const percent = (((index + 1) / total) * 100).toFixed(1) consola.start(`Progress: ${index + 1}/${total} (${percent}%)`) } @@ -133,6 +133,7 @@ else { } // Generate/regenerate markdown report +consola.start('Generating report and saving results…') const formatResults = calculateFormatResults(results, tokenCounts) await saveResults(results, formatResults, questions, tokenCounts) diff --git a/benchmarks/scripts/token-efficiency-benchmark.ts b/benchmarks/scripts/token-efficiency-benchmark.ts index 1b9f7d6..f498f3e 100644 --- a/benchmarks/scripts/token-efficiency-benchmark.ts +++ b/benchmarks/scripts/token-efficiency-benchmark.ts @@ -46,7 +46,7 @@ const BENCHMARK_EXAMPLES = [ { name: 'E-commerce Order', emoji: '🛒', - description: 'Nested order with customer and items', + description: 'Single nested order with customer and items', getData: generateOrder, showDetailed: false, }, diff --git a/benchmarks/src/constants.ts b/benchmarks/src/constants.ts index e146db0..6434dde 100644 --- a/benchmarks/src/constants.ts +++ b/benchmarks/src/constants.ts @@ -5,8 +5,9 @@ export const ROOT_DIR: string = url.fileURLToPath(new URL('../../', import.meta. export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.meta.url)) /** - * Benchmark execution configuration + * Default concurrency for parallel evaluations */ +export const DEFAULT_CONCURRENCY = 20 /** * Enable dry run mode for quick testing with limited AI requests @@ -27,13 +28,3 @@ export const DRY_RUN_LIMITS = { /** Models to use in dry run */ allowedModels: [] as string[], } - -/** - * Default concurrency for parallel evaluations - */ -export const DEFAULT_CONCURRENCY = 20 - -/** - * Delay between API requests to avoid rate limiting (in milliseconds) - */ -export const RATE_LIMIT_DELAY_MS = 100 diff --git a/benchmarks/src/datasets.ts b/benchmarks/src/datasets.ts index 87643f2..0fbb65c 100644 --- a/benchmarks/src/datasets.ts +++ b/benchmarks/src/datasets.ts @@ -122,16 +122,16 @@ const analyticsDataset: Dataset = { } /** - * GitHub dataset: Popular repositories + * Real-world dataset: Top 100 starred GitHub repositories * * @remarks - * Tests TOON's tabular format with real-world data + * Tests TOON's tabular format */ const githubDataset: Dataset = { name: 'github', - description: 'Popular GitHub repositories', + description: 'Top 100 GitHub repositories', data: { - repositories: githubRepos.slice(0, 200), + repositories: githubRepos, }, } diff --git a/benchmarks/src/evaluate.ts b/benchmarks/src/evaluate.ts index 31642de..e6e490b 100644 --- a/benchmarks/src/evaluate.ts +++ b/benchmarks/src/evaluate.ts @@ -9,12 +9,10 @@ import type { LanguageModelV2 } from '@ai-sdk/provider' import type { EvaluationResult, Question } from './types' -import { setTimeout } from 'node:timers/promises' import { anthropic } from '@ai-sdk/anthropic' import { openai } from '@ai-sdk/openai' import { generateText } from 'ai' import { consola } from 'consola' -import { RATE_LIMIT_DELAY_MS } from './constants' /** * Models used for evaluation @@ -28,11 +26,8 @@ export const models: Record = { * Evaluate a single question with a specific format and model */ export async function evaluateQuestion( - question: Question, - formatName: string, - formattedData: string, - model: LanguageModelV2, - modelName: string, + { question, formatName, formattedData, model}: + { question: Question, formatName: string, formattedData: string, model: LanguageModelV2 }, ): Promise { const prompt = `Given the following data in ${formatName} format: @@ -51,10 +46,8 @@ Provide only the direct answer, without any additional explanation or formatting temperature: model.modelId.startsWith('gpt-') ? undefined : 0, }) - await setTimeout(RATE_LIMIT_DELAY_MS) - const latencyMs = performance.now() - startTime - const correct = await validateAnswer({ + const isCorrect = await validateAnswer({ actual: text.trim(), expected: question.groundTruth, question: question.prompt, @@ -63,10 +56,10 @@ Provide only the direct answer, without any additional explanation or formatting return { questionId: question.id, format: formatName, - model: modelName, + model: model.modelId, expected: question.groundTruth, actual: text.trim(), - correct, + isCorrect, inputTokens: usage.inputTokens, outputTokens: usage.outputTokens, latencyMs, @@ -105,8 +98,6 @@ Respond with only "YES" or "NO".` temperature: 0, }) - await setTimeout(RATE_LIMIT_DELAY_MS) - return text.trim().toUpperCase() === 'YES' } catch (error) { diff --git a/benchmarks/src/report.ts b/benchmarks/src/report.ts index 43d1c23..35891af 100644 --- a/benchmarks/src/report.ts +++ b/benchmarks/src/report.ts @@ -3,7 +3,7 @@ * * Handles: * - Statistical analysis - * - Twitter-ready markdown report generation with visual elements + * - Markdown report generation with visual elements * - Per-dataset breakdowns * - Cost analysis * - Result file saving @@ -28,7 +28,7 @@ export function calculateFormatResults( return formatNames.map((formatName) => { const formatResults = results.filter(r => r.format === formatName) - const correctCount = formatResults.filter(r => r.correct).length + const correctCount = formatResults.filter(r => r.isCorrect).length const totalCount = formatResults.length const accuracy = correctCount / totalCount @@ -59,24 +59,17 @@ export function generateMarkdownReport( questions: Question[], tokenCounts: Record, ): string { - const lines: string[] = [ - '### Retrieval Accuracy', - '', - ] - const toon = formatResults.find(r => r.format === 'toon') const json = formatResults.find(r => r.format === 'json') - // Model-by-model breakdown with ASCII bars + // Build model-by-model breakdown with ASCII bars const modelCount = Object.keys(models).length - lines.push(`Tested across **${modelCount} ${modelCount === 1 ? 'LLM' : 'LLMs'}** with data retrieval tasks:`, '', '```') - const modelNames = Object.keys(models) - for (let i = 0; i < modelNames.length; i++) { - const modelName = modelNames[i]! + + const modelBreakdown = modelNames.map((modelName, i) => { const modelResults = formatResults.map((fr) => { const modelFormatResults = results.filter(r => r.model === modelName && r.format === fr.format) - const correctCount = modelFormatResults.filter(r => r.correct).length + const correctCount = modelFormatResults.filter(r => r.isCorrect).length const totalCount = modelFormatResults.length const accuracy = totalCount > 0 ? correctCount / totalCount : 0 @@ -88,34 +81,24 @@ export function generateMarkdownReport( } }).sort((a, b) => b.accuracy - a.accuracy) - // Add blank line before model name, except for first model - if (i > 0) - lines.push('') - lines.push(modelName) - for (const result of modelResults) { + const formatLines = modelResults.map((result) => { const bar = createProgressBar(result.accuracy, 1, 20) const accuracyStr = `${(result.accuracy * 100).toFixed(1)}%`.padStart(6) const countStr = `(${result.correctCount}/${result.totalCount})` - lines.push(` ${result.format.padEnd(12)} ${bar} ${accuracyStr} ${countStr}`) - } - } + return ` ${result.format.padEnd(12)} ${bar} ${accuracyStr} ${countStr}` + }).join('\n') - lines.push('```', '') + // Add blank line before model name, except for first model + return `${i > 0 ? '\n' : ''}${modelName}\n${formatLines}` + }).join('\n') - // Summary comparison - if (toon && json) { - const tokenSavings = ((1 - toon.totalTokens / json.totalTokens) * 100).toFixed(1) - lines.push( - `**Tradeoff:** TOON achieves ${(toon.accuracy * 100).toFixed(1)}% accuracy (vs JSON's ${(json.accuracy * 100).toFixed(1)}%) while using ${tokenSavings}% fewer tokens.`, - '', - ) - } - - lines.push('
', 'View detailed breakdown by dataset and model', '', '#### Performance by Dataset', '') - - for (const dataset of datasets) { - lines.push(`##### ${dataset.description}`, '') + // Build summary comparison + const summaryComparison = toon && json + ? `**Tradeoff:** TOON achieves ${(toon.accuracy * 100).toFixed(1)}% accuracy (vs JSON's ${(json.accuracy * 100).toFixed(1)}%) while using ${((1 - toon.totalTokens / json.totalTokens) * 100).toFixed(1)}% fewer tokens.` + : '' + // Build performance by dataset + const datasetBreakdown = datasets.map((dataset) => { const datasetResults = formatResults.map((fr) => { const datasetFormatResults = results.filter(r => r.questionId.includes(dataset.name) || questions.find(q => q.id === r.questionId)?.dataset === dataset.name) if (datasetFormatResults.length === 0) @@ -125,7 +108,7 @@ export function generateMarkdownReport( if (formatDatasetResults.length === 0) return undefined - const correctCount = formatDatasetResults.filter(r => r.correct).length + const correctCount = formatDatasetResults.filter(r => r.isCorrect).length const totalCount = formatDatasetResults.length const accuracy = totalCount > 0 ? correctCount / totalCount : 0 @@ -143,7 +126,7 @@ export function generateMarkdownReport( }).filter(Boolean) as { format: string, accuracy: number, tokens: number, correctCount: number, totalCount: number }[] if (datasetResults.length === 0) - continue + return '' // Sort by efficiency datasetResults.sort((a, b) => { @@ -152,29 +135,24 @@ export function generateMarkdownReport( return effB - effA }) - lines.push( - '| Format | Accuracy | Tokens | Correct/Total |', - '|--------|----------|--------|---------------|', - ) + const tableRows = datasetResults.slice(0, 6).map(result => + `| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.tokens.toLocaleString()} | ${result.correctCount}/${result.totalCount} |`, + ).join('\n') - for (const result of datasetResults.slice(0, 6)) { - lines.push( - `| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.tokens.toLocaleString()} | ${result.correctCount}/${result.totalCount} |`, - ) - } + return ` +##### ${dataset.description} - lines.push('') - } - - // Model breakdown - lines.push('#### Performance by Model', '') - - for (const modelName of Object.keys(models)) { - lines.push(`##### ${modelName}`, '') +| Format | Accuracy | Tokens | Correct/Total | +| ------ | -------- | ------ | ------------- | +${tableRows} +`.trimStart() + }).filter(Boolean).join('\n') + // Build performance by model + const modelPerformance = modelNames.map((modelName) => { const modelResults = formatResults.map((fr) => { const modelFormatResults = results.filter(r => r.model === modelName && r.format === fr.format) - const correctCount = modelFormatResults.filter(r => r.correct).length + const correctCount = modelFormatResults.filter(r => r.isCorrect).length const totalCount = modelFormatResults.length const accuracy = correctCount / totalCount @@ -186,36 +164,55 @@ export function generateMarkdownReport( } }).sort((a, b) => b.accuracy - a.accuracy) - lines.push('| Format | Accuracy | Correct/Total |', '|--------|----------|---------------|') + const tableRows = modelResults.map(result => + `| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.correctCount}/${result.totalCount} |`, + ).join('\n') - for (const result of modelResults) { - lines.push(`| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.correctCount}/${result.totalCount} |`) - } + return ` +##### ${modelName} - lines.push('') - } +| Format | Accuracy | Correct/Total | +| ------ | -------- | ------------- | +${tableRows} +`.trimStart() + }).join('\n') - // Methodology - lines.push( - '#### Methodology', - '', - '- **Semantic validation**: LLM-as-judge validates responses semantically (not exact string matching).', - '- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding.', - '- **Question types**: Field retrieval, aggregation, and filtering tasks.', - '- **Real data**: Faker.js-generated datasets + GitHub repositories.', - '', - '
', - '', - ) + return ` +### Retrieval Accuracy - return lines.join('\n') +Tested across **${modelCount} ${modelCount === 1 ? 'LLM' : 'LLMs'}** with data retrieval tasks: + +\`\`\` +${modelBreakdown} +\`\`\` + +${summaryComparison} + +
+View detailed breakdown by dataset and model + +#### Performance by Dataset + +${datasetBreakdown} +#### Performance by Model + +${modelPerformance} +#### Methodology + +- **Semantic validation**: LLM-as-judge validates responses semantically (not exact string matching). +- **Token counting**: Using \`gpt-tokenizer\` with \`o200k_base\` encoding. +- **Question types**: Field retrieval, aggregation, and filtering tasks. +- **Real data**: Faker.js-generated datasets + GitHub repositories. + +
+`.trimStart() } /** * Calculate token counts for all format+dataset combinations */ export function calculateTokenCounts( - formatters: Record string>, + formatters: Record string>, ): Record { const tokenCounts: Record = {} @@ -272,7 +269,7 @@ export async function saveResults( } /** - * Generate visual progress bar using ASCII characters (█ for filled, ░ for empty) + * Generate visual progress bar using ASCII characters (`█` for filled, `░` for empty) */ function createProgressBar(tokens: number, maxTokens: number, width = 30): string { const filled = Math.round((tokens / maxTokens) * width) diff --git a/benchmarks/src/types.ts b/benchmarks/src/types.ts index 11f8bcf..399a167 100644 --- a/benchmarks/src/types.ts +++ b/benchmarks/src/types.ts @@ -18,7 +18,7 @@ export interface EvaluationResult { model: string expected: string actual: string - correct: boolean + isCorrect: boolean inputTokens?: number outputTokens?: number latencyMs: number