mirror of
https://github.com/voson-wang/toon.git
synced 2026-01-29 15:24:10 +08:00
chore(benchmarks): replace LLM-as-judge, new structural validation
This commit is contained in:
414
README.md
414
README.md
@@ -75,7 +75,7 @@ See [benchmarks](#benchmarks) for concrete comparisons across different data str
|
|||||||
|
|
||||||
## Key Features
|
## Key Features
|
||||||
|
|
||||||
- 💸 **Token-efficient:** typically 30–60% fewer tokens than JSON[^1]
|
- 💸 **Token-efficient:** typically 30–60% fewer tokens on large uniform arrays vs formatted JSON[^1]
|
||||||
- 🤿 **LLM-friendly guardrails:** explicit lengths and fields enable validation
|
- 🤿 **LLM-friendly guardrails:** explicit lengths and fields enable validation
|
||||||
- 🍱 **Minimal syntax:** removes redundant punctuation (braces, brackets, most quotes)
|
- 🍱 **Minimal syntax:** removes redundant punctuation (braces, brackets, most quotes)
|
||||||
- 📐 **Indentation-based structure:** like YAML, uses whitespace instead of braces
|
- 📐 **Indentation-based structure:** like YAML, uses whitespace instead of braces
|
||||||
@@ -108,19 +108,19 @@ Datasets with nested or semi-uniform structures. CSV excluded as it cannot prope
|
|||||||
```
|
```
|
||||||
🛒 E-commerce orders with nested structures ┊ Tabular: 33%
|
🛒 E-commerce orders with nested structures ┊ Tabular: 33%
|
||||||
│
|
│
|
||||||
TOON █████████████░░░░░░░ 72,743 tokens
|
TOON █████████████░░░░░░░ 72,771 tokens
|
||||||
├─ vs JSON (−33.1%) 108,731 tokens
|
├─ vs JSON (−33.1%) 108,806 tokens
|
||||||
├─ vs JSON compact (+5.5%) 68,936 tokens
|
├─ vs JSON compact (+5.5%) 68,975 tokens
|
||||||
├─ vs YAML (−14.1%) 84,724 tokens
|
├─ vs YAML (−14.2%) 84,780 tokens
|
||||||
└─ vs XML (−40.5%) 122,313 tokens
|
└─ vs XML (−40.5%) 122,406 tokens
|
||||||
|
|
||||||
🧾 Semi-uniform event logs ┊ Tabular: 50%
|
🧾 Semi-uniform event logs ┊ Tabular: 50%
|
||||||
│
|
│
|
||||||
TOON █████████████████░░░ 153,223 tokens
|
TOON █████████████████░░░ 153,211 tokens
|
||||||
├─ vs JSON (−15.0%) 180,196 tokens
|
├─ vs JSON (−15.0%) 180,176 tokens
|
||||||
├─ vs JSON compact (+19.9%) 127,740 tokens
|
├─ vs JSON compact (+19.9%) 127,731 tokens
|
||||||
├─ vs YAML (−0.8%) 154,514 tokens
|
├─ vs YAML (−0.8%) 154,505 tokens
|
||||||
└─ vs XML (−25.2%) 204,800 tokens
|
└─ vs XML (−25.2%) 204,777 tokens
|
||||||
|
|
||||||
🧩 Deeply nested configuration ┊ Tabular: 0%
|
🧩 Deeply nested configuration ┊ Tabular: 0%
|
||||||
│
|
│
|
||||||
@@ -131,11 +131,11 @@ Datasets with nested or semi-uniform structures. CSV excluded as it cannot prope
|
|||||||
└─ vs XML (−37.4%) 1,008 tokens
|
└─ vs XML (−37.4%) 1,008 tokens
|
||||||
|
|
||||||
──────────────────────────────────── Total ────────────────────────────────────
|
──────────────────────────────────── Total ────────────────────────────────────
|
||||||
TOON ████████████████░░░░ 226,597 tokens
|
TOON ████████████████░░░░ 226,613 tokens
|
||||||
├─ vs JSON (−21.8%) 289,846 tokens
|
├─ vs JSON (−21.8%) 289,901 tokens
|
||||||
├─ vs JSON compact (+14.9%) 197,240 tokens
|
├─ vs JSON compact (+14.9%) 197,270 tokens
|
||||||
├─ vs YAML (−5.5%) 239,911 tokens
|
├─ vs YAML (−5.6%) 239,958 tokens
|
||||||
└─ vs XML (−30.9%) 328,121 tokens
|
└─ vs XML (−31.0%) 328,191 tokens
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Flat-Only Track
|
#### Flat-Only Track
|
||||||
@@ -145,21 +145,21 @@ Datasets with flat tabular structures where CSV is applicable.
|
|||||||
```
|
```
|
||||||
👥 Uniform employee records ┊ Tabular: 100%
|
👥 Uniform employee records ┊ Tabular: 100%
|
||||||
│
|
│
|
||||||
CSV ███████████████████░ 46,956 tokens
|
CSV ███████████████████░ 46,954 tokens
|
||||||
TOON ████████████████████ 49,827 tokens (+6.1% vs CSV)
|
TOON ████████████████████ 49,831 tokens (+6.1% vs CSV)
|
||||||
├─ vs JSON (−60.7%) 126,854 tokens
|
├─ vs JSON (−60.7%) 126,860 tokens
|
||||||
├─ vs JSON compact (−36.8%) 78,850 tokens
|
├─ vs JSON compact (−36.8%) 78,856 tokens
|
||||||
├─ vs YAML (−50.0%) 99,701 tokens
|
├─ vs YAML (−50.0%) 99,706 tokens
|
||||||
└─ vs XML (−66.0%) 146,440 tokens
|
└─ vs XML (−66.0%) 146,444 tokens
|
||||||
|
|
||||||
📈 Time-series analytics data ┊ Tabular: 100%
|
📈 Time-series analytics data ┊ Tabular: 100%
|
||||||
│
|
│
|
||||||
CSV ██████████████████░░ 8,396 tokens
|
CSV ██████████████████░░ 8,388 tokens
|
||||||
TOON ████████████████████ 9,128 tokens (+8.7% vs CSV)
|
TOON ████████████████████ 9,120 tokens (+8.7% vs CSV)
|
||||||
├─ vs JSON (−59.0%) 22,258 tokens
|
├─ vs JSON (−59.0%) 22,250 tokens
|
||||||
├─ vs JSON compact (−35.8%) 14,224 tokens
|
├─ vs JSON compact (−35.8%) 14,216 tokens
|
||||||
├─ vs YAML (−48.9%) 17,871 tokens
|
├─ vs YAML (−48.9%) 17,863 tokens
|
||||||
└─ vs XML (−65.7%) 26,629 tokens
|
└─ vs XML (−65.7%) 26,621 tokens
|
||||||
|
|
||||||
⭐ Top 100 GitHub repositories ┊ Tabular: 100%
|
⭐ Top 100 GitHub repositories ┊ Tabular: 100%
|
||||||
│
|
│
|
||||||
@@ -171,12 +171,12 @@ Datasets with flat tabular structures where CSV is applicable.
|
|||||||
└─ vs XML (−48.8%) 17,095 tokens
|
└─ vs XML (−48.8%) 17,095 tokens
|
||||||
|
|
||||||
──────────────────────────────────── Total ────────────────────────────────────
|
──────────────────────────────────── Total ────────────────────────────────────
|
||||||
CSV ███████████████████░ 63,865 tokens
|
CSV ███████████████████░ 63,855 tokens
|
||||||
TOON ████████████████████ 67,700 tokens (+6.0% vs CSV)
|
TOON ████████████████████ 67,696 tokens (+6.0% vs CSV)
|
||||||
├─ vs JSON (−58.8%) 164,257 tokens
|
├─ vs JSON (−58.8%) 164,255 tokens
|
||||||
├─ vs JSON compact (−35.2%) 104,529 tokens
|
├─ vs JSON compact (−35.2%) 104,527 tokens
|
||||||
├─ vs YAML (−48.2%) 130,701 tokens
|
├─ vs YAML (−48.2%) 130,698 tokens
|
||||||
└─ vs XML (−64.4%) 190,164 tokens
|
└─ vs XML (−64.4%) 190,160 tokens
|
||||||
```
|
```
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
@@ -186,64 +186,64 @@ Datasets with flat tabular structures where CSV is applicable.
|
|||||||
|
|
||||||
**Savings:** 13,130 tokens (59.0% reduction vs JSON)
|
**Savings:** 13,130 tokens (59.0% reduction vs JSON)
|
||||||
|
|
||||||
**JSON** (22,258 tokens):
|
**JSON** (22,250 tokens):
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"metrics": [
|
"metrics": [
|
||||||
{
|
{
|
||||||
"date": "2025-01-01",
|
"date": "2025-01-01",
|
||||||
"views": 7708,
|
"views": 5715,
|
||||||
"clicks": 595,
|
"clicks": 211,
|
||||||
"conversions": 69,
|
"conversions": 28,
|
||||||
"revenue": 15369.93,
|
"revenue": 7976.46,
|
||||||
"bounceRate": 0.35
|
"bounceRate": 0.47
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"date": "2025-01-02",
|
"date": "2025-01-02",
|
||||||
"views": 5894,
|
"views": 7103,
|
||||||
"clicks": 381,
|
"clicks": 393,
|
||||||
"conversions": 21,
|
"conversions": 28,
|
||||||
"revenue": 2112.12,
|
"revenue": 8360.53,
|
||||||
"bounceRate": 0.3
|
"bounceRate": 0.32
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"date": "2025-01-03",
|
"date": "2025-01-03",
|
||||||
"views": 6835,
|
"views": 7248,
|
||||||
"clicks": 422,
|
"clicks": 378,
|
||||||
"conversions": 35,
|
"conversions": 24,
|
||||||
"revenue": 4525.73,
|
"revenue": 3212.57,
|
||||||
"bounceRate": 0.5
|
"bounceRate": 0.5
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"date": "2025-01-04",
|
"date": "2025-01-04",
|
||||||
"views": 5325,
|
"views": 2927,
|
||||||
"clicks": 305,
|
"clicks": 77,
|
||||||
"conversions": 22,
|
"conversions": 11,
|
||||||
"revenue": 2445.3,
|
"revenue": 1211.69,
|
||||||
"bounceRate": 0.44
|
"bounceRate": 0.62
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"date": "2025-01-05",
|
"date": "2025-01-05",
|
||||||
"views": 2974,
|
"views": 3530,
|
||||||
"clicks": 61,
|
"clicks": 82,
|
||||||
"conversions": 6,
|
"conversions": 8,
|
||||||
"revenue": 956.57,
|
"revenue": 462.77,
|
||||||
"bounceRate": 0.47
|
"bounceRate": 0.56
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
**TOON** (9,128 tokens):
|
**TOON** (9,120 tokens):
|
||||||
|
|
||||||
```
|
```
|
||||||
metrics[5]{date,views,clicks,conversions,revenue,bounceRate}:
|
metrics[5]{date,views,clicks,conversions,revenue,bounceRate}:
|
||||||
2025-01-01,7708,595,69,15369.93,0.35
|
2025-01-01,5715,211,28,7976.46,0.47
|
||||||
2025-01-02,5894,381,21,2112.12,0.3
|
2025-01-02,7103,393,28,8360.53,0.32
|
||||||
2025-01-03,6835,422,35,4525.73,0.5
|
2025-01-03,7248,378,24,3212.57,0.5
|
||||||
2025-01-04,5325,305,22,2445.3,0.44
|
2025-01-04,2927,77,11,1211.69,0.62
|
||||||
2025-01-05,2974,61,6,956.57,0.47
|
2025-01-05,3530,82,8,462.77,0.56
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -317,7 +317,7 @@ repositories[3]{id,name,repo,description,createdAt,updatedAt,pushedAt,stars,watc
|
|||||||
|
|
||||||
<!-- automd:file src="./benchmarks/results/retrieval-accuracy.md" -->
|
<!-- automd:file src="./benchmarks/results/retrieval-accuracy.md" -->
|
||||||
|
|
||||||
Benchmarks test LLM comprehension across different input formats using 204 data retrieval questions on 4 models.
|
Benchmarks test LLM comprehension across different input formats using 209 data retrieval questions on 4 models.
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><strong>Show Dataset Catalog</strong></summary>
|
<summary><strong>Show Dataset Catalog</strong></summary>
|
||||||
@@ -332,6 +332,11 @@ Benchmarks test LLM comprehension across different input formats using 204 data
|
|||||||
| Top 100 GitHub repositories | 100 | uniform | ✓ | 100% |
|
| Top 100 GitHub repositories | 100 | uniform | ✓ | 100% |
|
||||||
| Semi-uniform event logs | 75 | semi-uniform | ✗ | 50% |
|
| Semi-uniform event logs | 75 | semi-uniform | ✗ | 50% |
|
||||||
| Deeply nested configuration | 11 | deep | ✗ | 0% |
|
| Deeply nested configuration | 11 | deep | ✗ | 0% |
|
||||||
|
| Valid complete dataset (control) | 20 | uniform | ✓ | 100% |
|
||||||
|
| Array truncated: 3 rows removed from end | 17 | uniform | ✓ | 100% |
|
||||||
|
| Extra rows added beyond declared length | 23 | uniform | ✓ | 100% |
|
||||||
|
| Inconsistent field count (missing salary in row 10) | 20 | uniform | ✓ | 100% |
|
||||||
|
| Missing required fields (no email in multiple rows) | 20 | uniform | ✓ | 100% |
|
||||||
|
|
||||||
**Structure classes:**
|
**Structure classes:**
|
||||||
- **uniform**: All objects have identical fields with primitive values
|
- **uniform**: All objects have identical fields with primitive values
|
||||||
@@ -350,67 +355,69 @@ Benchmarks test LLM comprehension across different input formats using 204 data
|
|||||||
Each format's overall performance, balancing accuracy against token cost:
|
Each format's overall performance, balancing accuracy against token cost:
|
||||||
|
|
||||||
```
|
```
|
||||||
TOON ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ 17.2 │ 75.5% acc │ 4,389 tokens
|
TOON ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ 26.9 │ 73.9% acc │ 2,744 tokens
|
||||||
CSV ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓░ 16.6 │ 67.8% acc │ 4,080 tokens
|
JSON compact ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓░░░ 22.9 │ 70.7% acc │ 3,081 tokens
|
||||||
JSON compact ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓░░░ 14.7 │ 73.3% acc │ 4,982 tokens
|
YAML ▓▓▓▓▓▓▓▓▓▓▓▓▓▓░░░░░░ 18.6 │ 69.0% acc │ 3,719 tokens
|
||||||
YAML ▓▓▓▓▓▓▓▓▓▓▓▓▓▓░░░░░░ 12.1 │ 72.4% acc │ 5,976 tokens
|
JSON ▓▓▓▓▓▓▓▓▓▓▓░░░░░░░░░ 15.3 │ 69.7% acc │ 4,545 tokens
|
||||||
JSON ▓▓▓▓▓▓▓▓▓▓▓▓░░░░░░░░ 10.0 │ 72.4% acc │ 7,260 tokens
|
XML ▓▓▓▓▓▓▓▓▓▓░░░░░░░░░░ 13.0 │ 67.1% acc │ 5,167 tokens
|
||||||
XML ▓▓▓▓▓▓▓▓▓▓░░░░░░░░░░ 8.4 │ 69.0% acc │ 8,251 tokens
|
|
||||||
```
|
```
|
||||||
|
|
||||||
TOON achieves **75.5%** accuracy (vs JSON's 72.4%) while using **39.5% fewer tokens**.
|
TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**.
|
||||||
|
|
||||||
|
**Note on CSV:** Excluded from ranking as it only supports 109/209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.
|
||||||
|
|
||||||
#### Per-Model Accuracy
|
#### Per-Model Accuracy
|
||||||
|
|
||||||
Accuracy across 4 LLMs on 204 data retrieval questions:
|
Accuracy across 4 LLMs on 209 data retrieval questions:
|
||||||
|
|
||||||
```
|
```
|
||||||
claude-haiku-4-5-20251001
|
claude-haiku-4-5-20251001
|
||||||
→ TOON ████████████░░░░░░░░ 62.3% (127/204)
|
→ TOON ████████████░░░░░░░░ 59.8% (125/209)
|
||||||
JSON ███████████░░░░░░░░░ 56.9% (116/204)
|
JSON ███████████░░░░░░░░░ 57.4% (120/209)
|
||||||
YAML ███████████░░░░░░░░░ 55.9% (114/204)
|
YAML ███████████░░░░░░░░░ 56.0% (117/209)
|
||||||
JSON compact ███████████░░░░░░░░░ 54.9% (112/204)
|
XML ███████████░░░░░░░░░ 55.5% (116/209)
|
||||||
XML ███████████░░░░░░░░░ 54.9% (112/204)
|
JSON compact ███████████░░░░░░░░░ 55.0% (115/209)
|
||||||
CSV █████████░░░░░░░░░░░ 47.1% (49/104)
|
CSV ██████████░░░░░░░░░░ 50.5% (55/109)
|
||||||
|
|
||||||
gemini-2.5-flash
|
gemini-2.5-flash
|
||||||
→ TOON ██████████████████░░ 91.2% (186/204)
|
→ TOON ██████████████████░░ 87.6% (183/209)
|
||||||
YAML ██████████████████░░ 89.7% (183/204)
|
CSV █████████████████░░░ 86.2% (94/109)
|
||||||
JSON compact ██████████████████░░ 87.7% (179/204)
|
JSON compact ████████████████░░░░ 82.3% (172/209)
|
||||||
JSON ██████████████████░░ 87.7% (179/204)
|
YAML ████████████████░░░░ 79.4% (166/209)
|
||||||
XML █████████████████░░░ 87.3% (178/204)
|
XML ████████████████░░░░ 79.4% (166/209)
|
||||||
CSV █████████████████░░░ 85.6% (89/104)
|
JSON ███████████████░░░░░ 77.0% (161/209)
|
||||||
|
|
||||||
gpt-5-nano
|
gpt-5-nano
|
||||||
JSON compact ███████████████████░ 93.6% (191/204)
|
→ TOON ██████████████████░░ 90.9% (190/209)
|
||||||
CSV ██████████████████░░ 90.4% (94/104)
|
JSON compact ██████████████████░░ 90.9% (190/209)
|
||||||
JSON ██████████████████░░ 89.7% (183/204)
|
JSON ██████████████████░░ 89.0% (186/209)
|
||||||
→ TOON ██████████████████░░ 89.2% (182/204)
|
CSV ██████████████████░░ 89.0% (97/109)
|
||||||
YAML ██████████████████░░ 89.2% (182/204)
|
YAML █████████████████░░░ 87.1% (182/209)
|
||||||
XML ████████████████░░░░ 81.4% (166/204)
|
XML ████████████████░░░░ 80.9% (169/209)
|
||||||
|
|
||||||
grok-4-fast-non-reasoning
|
grok-4-fast-non-reasoning
|
||||||
→ TOON ████████████░░░░░░░░ 59.3% (121/204)
|
→ TOON ███████████░░░░░░░░░ 57.4% (120/209)
|
||||||
JSON compact ███████████░░░░░░░░░ 56.9% (116/204)
|
JSON ███████████░░░░░░░░░ 55.5% (116/209)
|
||||||
JSON ███████████░░░░░░░░░ 55.4% (113/204)
|
JSON compact ███████████░░░░░░░░░ 54.5% (114/209)
|
||||||
YAML ███████████░░░░░░░░░ 54.9% (112/204)
|
YAML ███████████░░░░░░░░░ 53.6% (112/209)
|
||||||
XML ██████████░░░░░░░░░░ 52.5% (107/204)
|
XML ███████████░░░░░░░░░ 52.6% (110/209)
|
||||||
CSV ██████████░░░░░░░░░░ 48.1% (50/104)
|
CSV ██████████░░░░░░░░░░ 52.3% (57/109)
|
||||||
```
|
```
|
||||||
|
|
||||||
**Key tradeoff:** TOON achieves **75.5% accuracy** (vs JSON's 72.4%) while using **39.5% fewer tokens** on these datasets.
|
**Key tradeoff:** TOON achieves **73.9% accuracy** (vs JSON's 69.7%) while using **39.6% fewer tokens** on these datasets.
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><strong>Performance by dataset, model, and question type</strong></summary>
|
<summary><strong>Performance by dataset, model, and question type</strong></summary>
|
||||||
|
|
||||||
#### Performance by Question Type
|
#### Performance by Question Type
|
||||||
|
|
||||||
| Question Type | TOON | JSON compact | JSON | YAML | XML | CSV |
|
| Question Type | TOON | JSON compact | JSON | CSV | YAML | XML |
|
||||||
| ------------- | ---- | ---- | ---- | ---- | ---- | ---- |
|
| ------------- | ---- | ---- | ---- | ---- | ---- | ---- |
|
||||||
| Field Retrieval | 100.0% | 98.9% | 99.6% | 99.3% | 98.5% | 100.0% |
|
| Field Retrieval | 99.6% | 99.3% | 99.3% | 100.0% | 98.2% | 98.9% |
|
||||||
| Aggregation | 56.3% | 52.4% | 53.2% | 53.2% | 47.2% | 40.5% |
|
| Aggregation | 54.4% | 47.2% | 48.8% | 44.0% | 47.6% | 41.3% |
|
||||||
| Filtering | 58.9% | 58.3% | 54.2% | 53.1% | 50.5% | 49.1% |
|
| Filtering | 56.3% | 57.3% | 50.5% | 49.1% | 51.0% | 47.9% |
|
||||||
| Structure Awareness | 89.0% | 85.0% | 82.0% | 85.0% | 79.0% | 84.4% |
|
| Structure Awareness | 88.0% | 83.0% | 83.0% | 85.9% | 80.0% | 80.0% |
|
||||||
|
| Structural Validation | 70.0% | 45.0% | 50.0% | 80.0% | 60.0% | 80.0% |
|
||||||
|
|
||||||
#### Performance by Dataset
|
#### Performance by Dataset
|
||||||
|
|
||||||
@@ -418,64 +425,119 @@ grok-4-fast-non-reasoning
|
|||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `csv` | 70.7% | 2,337 | 116/164 |
|
| `csv` | 72.0% | 2,352 | 118/164 |
|
||||||
| `toon` | 72.0% | 2,483 | 118/164 |
|
| `toon` | 73.8% | 2,518 | 121/164 |
|
||||||
| `json-compact` | 71.3% | 3,943 | 117/164 |
|
| `json-compact` | 69.5% | 3,953 | 114/164 |
|
||||||
| `yaml` | 70.1% | 4,969 | 115/164 |
|
| `yaml` | 68.3% | 4,982 | 112/164 |
|
||||||
| `json-pretty` | 72.6% | 6,347 | 119/164 |
|
| `json-pretty` | 68.3% | 6,360 | 112/164 |
|
||||||
| `xml` | 70.7% | 7,314 | 116/164 |
|
| `xml` | 69.5% | 7,324 | 114/164 |
|
||||||
|
|
||||||
##### E-commerce orders with nested structures
|
##### E-commerce orders with nested structures
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `toon` | 83.5% | 7,197 | 137/164 |
|
| `toon` | 81.1% | 7,232 | 133/164 |
|
||||||
| `json-compact` | 79.3% | 6,784 | 130/164 |
|
| `json-compact` | 76.8% | 6,794 | 126/164 |
|
||||||
| `yaml` | 78.7% | 8,334 | 129/164 |
|
| `yaml` | 75.6% | 8,347 | 124/164 |
|
||||||
| `json-pretty` | 78.7% | 10,700 | 129/164 |
|
| `json-pretty` | 76.2% | 10,713 | 125/164 |
|
||||||
| `xml` | 73.8% | 12,013 | 121/164 |
|
| `xml` | 74.4% | 12,023 | 122/164 |
|
||||||
|
|
||||||
##### Time-series analytics data
|
##### Time-series analytics data
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `toon` | 75.8% | 1,513 | 91/120 |
|
| `csv` | 73.3% | 1,406 | 88/120 |
|
||||||
| `csv` | 72.5% | 1,391 | 87/120 |
|
| `toon` | 72.5% | 1,548 | 87/120 |
|
||||||
| `json-compact` | 70.0% | 2,339 | 84/120 |
|
| `json-compact` | 71.7% | 2,349 | 86/120 |
|
||||||
| `yaml` | 70.0% | 2,936 | 84/120 |
|
| `yaml` | 71.7% | 2,949 | 86/120 |
|
||||||
| `json-pretty` | 71.7% | 3,663 | 86/120 |
|
| `json-pretty` | 68.3% | 3,676 | 82/120 |
|
||||||
| `xml` | 71.7% | 4,374 | 86/120 |
|
| `xml` | 68.3% | 4,384 | 82/120 |
|
||||||
|
|
||||||
##### Top 100 GitHub repositories
|
##### Top 100 GitHub repositories
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `toon` | 64.4% | 8,745 | 85/132 |
|
| `toon` | 62.9% | 8,780 | 83/132 |
|
||||||
| `csv` | 59.8% | 8,513 | 79/132 |
|
| `csv` | 61.4% | 8,528 | 81/132 |
|
||||||
| `json-compact` | 60.6% | 11,455 | 80/132 |
|
| `yaml` | 59.8% | 13,142 | 79/132 |
|
||||||
| `yaml` | 61.4% | 13,129 | 81/132 |
|
| `json-compact` | 55.3% | 11,465 | 73/132 |
|
||||||
| `json-pretty` | 59.1% | 15,145 | 78/132 |
|
| `json-pretty` | 56.1% | 15,158 | 74/132 |
|
||||||
| `xml` | 51.5% | 17,095 | 68/132 |
|
| `xml` | 48.5% | 17,105 | 64/132 |
|
||||||
|
|
||||||
##### Semi-uniform event logs
|
##### Semi-uniform event logs
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `json-compact` | 67.5% | 4,809 | 81/120 |
|
| `json-compact` | 63.3% | 4,819 | 76/120 |
|
||||||
| `yaml` | 63.3% | 5,814 | 76/120 |
|
| `toon` | 57.5% | 5,799 | 69/120 |
|
||||||
| `toon` | 62.5% | 5,764 | 75/120 |
|
| `json-pretty` | 59.2% | 6,797 | 71/120 |
|
||||||
| `json-pretty` | 59.2% | 6,784 | 71/120 |
|
| `yaml` | 48.3% | 5,827 | 58/120 |
|
||||||
| `xml` | 55.0% | 7,699 | 66/120 |
|
| `xml` | 46.7% | 7,709 | 56/120 |
|
||||||
|
|
||||||
##### Deeply nested configuration
|
##### Deeply nested configuration
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `json-compact` | 91.4% | 564 | 106/116 |
|
| `json-compact` | 92.2% | 574 | 107/116 |
|
||||||
| `toon` | 94.8% | 631 | 110/116 |
|
| `toon` | 95.7% | 666 | 111/116 |
|
||||||
| `yaml` | 91.4% | 673 | 106/116 |
|
| `yaml` | 91.4% | 686 | 106/116 |
|
||||||
| `json-pretty` | 93.1% | 919 | 108/116 |
|
| `json-pretty` | 94.0% | 932 | 109/116 |
|
||||||
| `xml` | 91.4% | 1,008 | 106/116 |
|
| `xml` | 92.2% | 1,018 | 107/116 |
|
||||||
|
|
||||||
|
##### Valid complete dataset (control)
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `toon` | 100.0% | 544 | 4/4 |
|
||||||
|
| `json-compact` | 100.0% | 795 | 4/4 |
|
||||||
|
| `yaml` | 100.0% | 1,003 | 4/4 |
|
||||||
|
| `json-pretty` | 100.0% | 1,282 | 4/4 |
|
||||||
|
| `csv` | 25.0% | 492 | 1/4 |
|
||||||
|
| `xml` | 0.0% | 1,467 | 0/4 |
|
||||||
|
|
||||||
|
##### Array truncated: 3 rows removed from end
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `csv` | 100.0% | 425 | 4/4 |
|
||||||
|
| `xml` | 100.0% | 1,251 | 4/4 |
|
||||||
|
| `toon` | 0.0% | 474 | 0/4 |
|
||||||
|
| `json-compact` | 0.0% | 681 | 0/4 |
|
||||||
|
| `json-pretty` | 0.0% | 1,096 | 0/4 |
|
||||||
|
| `yaml` | 0.0% | 859 | 0/4 |
|
||||||
|
|
||||||
|
##### Extra rows added beyond declared length
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `csv` | 100.0% | 566 | 4/4 |
|
||||||
|
| `toon` | 75.0% | 621 | 3/4 |
|
||||||
|
| `xml` | 100.0% | 1,692 | 4/4 |
|
||||||
|
| `yaml` | 75.0% | 1,157 | 3/4 |
|
||||||
|
| `json-compact` | 50.0% | 917 | 2/4 |
|
||||||
|
| `json-pretty` | 50.0% | 1,476 | 2/4 |
|
||||||
|
|
||||||
|
##### Inconsistent field count (missing salary in row 10)
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `csv` | 75.0% | 489 | 3/4 |
|
||||||
|
| `yaml` | 100.0% | 996 | 4/4 |
|
||||||
|
| `toon` | 100.0% | 1,019 | 4/4 |
|
||||||
|
| `json-compact` | 75.0% | 790 | 3/4 |
|
||||||
|
| `xml` | 100.0% | 1,458 | 4/4 |
|
||||||
|
| `json-pretty` | 75.0% | 1,274 | 3/4 |
|
||||||
|
|
||||||
|
##### Missing required fields (no email in multiple rows)
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `csv` | 100.0% | 329 | 4/4 |
|
||||||
|
| `xml` | 100.0% | 1,411 | 4/4 |
|
||||||
|
| `toon` | 75.0% | 983 | 3/4 |
|
||||||
|
| `yaml` | 25.0% | 960 | 1/4 |
|
||||||
|
| `json-pretty` | 25.0% | 1,230 | 1/4 |
|
||||||
|
| `json-compact` | 0.0% | 755 | 0/4 |
|
||||||
|
|
||||||
#### Performance by Model
|
#### Performance by Model
|
||||||
|
|
||||||
@@ -483,45 +545,45 @@ grok-4-fast-non-reasoning
|
|||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `toon` | 62.3% | 127/204 |
|
| `toon` | 59.8% | 125/209 |
|
||||||
| `json-pretty` | 56.9% | 116/204 |
|
| `json-pretty` | 57.4% | 120/209 |
|
||||||
| `yaml` | 55.9% | 114/204 |
|
| `yaml` | 56.0% | 117/209 |
|
||||||
| `json-compact` | 54.9% | 112/204 |
|
| `xml` | 55.5% | 116/209 |
|
||||||
| `xml` | 54.9% | 112/204 |
|
| `json-compact` | 55.0% | 115/209 |
|
||||||
| `csv` | 47.1% | 49/104 |
|
| `csv` | 50.5% | 55/109 |
|
||||||
|
|
||||||
##### gemini-2.5-flash
|
##### gemini-2.5-flash
|
||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `toon` | 91.2% | 186/204 |
|
| `toon` | 87.6% | 183/209 |
|
||||||
| `yaml` | 89.7% | 183/204 |
|
| `csv` | 86.2% | 94/109 |
|
||||||
| `json-compact` | 87.7% | 179/204 |
|
| `json-compact` | 82.3% | 172/209 |
|
||||||
| `json-pretty` | 87.7% | 179/204 |
|
| `yaml` | 79.4% | 166/209 |
|
||||||
| `xml` | 87.3% | 178/204 |
|
| `xml` | 79.4% | 166/209 |
|
||||||
| `csv` | 85.6% | 89/104 |
|
| `json-pretty` | 77.0% | 161/209 |
|
||||||
|
|
||||||
##### gpt-5-nano
|
##### gpt-5-nano
|
||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `json-compact` | 93.6% | 191/204 |
|
| `toon` | 90.9% | 190/209 |
|
||||||
| `csv` | 90.4% | 94/104 |
|
| `json-compact` | 90.9% | 190/209 |
|
||||||
| `json-pretty` | 89.7% | 183/204 |
|
| `json-pretty` | 89.0% | 186/209 |
|
||||||
| `toon` | 89.2% | 182/204 |
|
| `csv` | 89.0% | 97/109 |
|
||||||
| `yaml` | 89.2% | 182/204 |
|
| `yaml` | 87.1% | 182/209 |
|
||||||
| `xml` | 81.4% | 166/204 |
|
| `xml` | 80.9% | 169/209 |
|
||||||
|
|
||||||
##### grok-4-fast-non-reasoning
|
##### grok-4-fast-non-reasoning
|
||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `toon` | 59.3% | 121/204 |
|
| `toon` | 57.4% | 120/209 |
|
||||||
| `json-compact` | 56.9% | 116/204 |
|
| `json-pretty` | 55.5% | 116/209 |
|
||||||
| `json-pretty` | 55.4% | 113/204 |
|
| `json-compact` | 54.5% | 114/209 |
|
||||||
| `yaml` | 54.9% | 112/204 |
|
| `yaml` | 53.6% | 112/209 |
|
||||||
| `xml` | 52.5% | 107/204 |
|
| `xml` | 52.6% | 110/209 |
|
||||||
| `csv` | 48.1% | 50/104 |
|
| `csv` | 52.3% | 57/109 |
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
@@ -534,8 +596,9 @@ This benchmark tests **LLM comprehension and data retrieval accuracy** across di
|
|||||||
|
|
||||||
#### Datasets Tested
|
#### Datasets Tested
|
||||||
|
|
||||||
Six datasets designed to test different structural patterns:
|
Eleven datasets designed to test different structural patterns and validation capabilities:
|
||||||
|
|
||||||
|
**Primary datasets:**
|
||||||
1. **Tabular** (100 employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
|
1. **Tabular** (100 employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
|
||||||
2. **Nested** (50 e-commerce orders): Complex structures with nested customer objects and item arrays.
|
2. **Nested** (50 e-commerce orders): Complex structures with nested customer objects and item arrays.
|
||||||
3. **Analytics** (60 days of metrics): Time-series data with dates and numeric values.
|
3. **Analytics** (60 days of metrics): Time-series data with dates and numeric values.
|
||||||
@@ -543,21 +606,28 @@ Six datasets designed to test different structural patterns:
|
|||||||
5. **Event Logs** (75 logs): Semi-uniform data with ~50% flat logs and ~50% with nested error objects.
|
5. **Event Logs** (75 logs): Semi-uniform data with ~50% flat logs and ~50% with nested error objects.
|
||||||
6. **Nested Config** (1 configuration): Deeply nested configuration with minimal tabular eligibility.
|
6. **Nested Config** (1 configuration): Deeply nested configuration with minimal tabular eligibility.
|
||||||
|
|
||||||
|
**Structural validation datasets:**
|
||||||
|
7. **Control**: Valid complete dataset (baseline for validation)
|
||||||
|
8. **Truncated**: Array with 3 rows removed from end (tests [N] length detection)
|
||||||
|
9. **Extra rows**: Array with 3 additional rows beyond declared length
|
||||||
|
10. **Width mismatch**: Inconsistent field count (missing salary in row 10)
|
||||||
|
11. **Missing fields**: Systematic field omissions (no email in multiple rows)
|
||||||
|
|
||||||
#### Question Types
|
#### Question Types
|
||||||
|
|
||||||
204 questions are generated dynamically across four categories:
|
209 questions are generated dynamically across five categories:
|
||||||
|
|
||||||
- **Field retrieval (33%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
- **Field retrieval (33%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
||||||
- Example: "What is Alice's salary?" → `75000`
|
- Example: "What is Alice's salary?" → `75000`
|
||||||
- Example: "How many items are in order ORD-0042?" → `3`
|
- Example: "How many items are in order ORD-0042?" → `3`
|
||||||
- Example: "What is the customer name for order ORD-0042?" → `John Doe`
|
- Example: "What is the customer name for order ORD-0042?" → `John Doe`
|
||||||
|
|
||||||
- **Aggregation (31%)**: Dataset-level totals and averages plus single-condition filters (counts, sums, min/max comparisons)
|
- **Aggregation (30%)**: Dataset-level totals and averages plus single-condition filters (counts, sums, min/max comparisons)
|
||||||
- Example: "How many employees work in Engineering?" → `17`
|
- Example: "How many employees work in Engineering?" → `17`
|
||||||
- Example: "What is the total revenue across all orders?" → `45123.50`
|
- Example: "What is the total revenue across all orders?" → `45123.50`
|
||||||
- Example: "How many employees have salary > 80000?" → `23`
|
- Example: "How many employees have salary > 80000?" → `23`
|
||||||
|
|
||||||
- **Filtering (24%)**: Multi-condition queries requiring compound logic (AND constraints across fields)
|
- **Filtering (23%)**: Multi-condition queries requiring compound logic (AND constraints across fields)
|
||||||
- Example: "How many employees in Sales have salary > 80000?" → `5`
|
- Example: "How many employees in Sales have salary > 80000?" → `5`
|
||||||
- Example: "How many active employees have more than 10 years of experience?" → `8`
|
- Example: "How many active employees have more than 10 years of experience?" → `8`
|
||||||
|
|
||||||
@@ -566,18 +636,23 @@ Six datasets designed to test different structural patterns:
|
|||||||
- Example: "List the field names for employees" → `id, name, email, department, salary, yearsExperience, active`
|
- Example: "List the field names for employees" → `id, name, email, department, salary, yearsExperience, active`
|
||||||
- Example: "What is the department of the last employee?" → `Sales`
|
- Example: "What is the department of the last employee?" → `Sales`
|
||||||
|
|
||||||
|
- **Structural validation (2%)**: Tests ability to detect incomplete, truncated, or corrupted data using structural metadata
|
||||||
|
- Example: "Is this data complete and valid?" → `YES` (control dataset) or `NO` (corrupted datasets)
|
||||||
|
- Tests TOON's [N] length validation and {fields} consistency checking
|
||||||
|
- Demonstrates CSV's lack of structural validation capabilities
|
||||||
|
|
||||||
#### Evaluation Process
|
#### Evaluation Process
|
||||||
|
|
||||||
1. **Format conversion**: Each dataset is converted to all 6 formats (TOON, JSON compact, JSON, YAML, XML, CSV).
|
1. **Format conversion**: Each dataset is converted to all 6 formats (TOON, JSON compact, JSON, CSV, YAML, XML).
|
||||||
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
||||||
3. **Validate with LLM-as-judge**: `gpt-5-nano` validates if the answer is semantically correct (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`).
|
3. **Validate deterministically**: Answers are validated using type-aware comparison (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`) without requiring an LLM judge.
|
||||||
|
|
||||||
#### Models & Configuration
|
#### Models & Configuration
|
||||||
|
|
||||||
- **Models tested**: `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `gpt-5-nano`, `grok-4-fast-non-reasoning`
|
- **Models tested**: `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `gpt-5-nano`, `grok-4-fast-non-reasoning`
|
||||||
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
|
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
|
||||||
- **Temperature**: Not set (models use their defaults)
|
- **Temperature**: Not set (models use their defaults)
|
||||||
- **Total evaluations**: 204 questions × 6 formats × 4 models = 4,896 LLM calls
|
- **Total evaluations**: 209 questions × 6 formats × 4 models = 5,016 LLM calls
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
@@ -782,6 +857,9 @@ items[1]:
|
|||||||
status: active
|
status: active
|
||||||
```
|
```
|
||||||
|
|
||||||
|
> [!NOTE]
|
||||||
|
> Tabular format requires identical field sets across all objects (same keys, order doesn't matter) and primitive values only (strings, numbers, booleans, null).
|
||||||
|
|
||||||
#### Mixed and Non-Uniform Arrays
|
#### Mixed and Non-Uniform Arrays
|
||||||
|
|
||||||
Arrays that don't meet the tabular requirements use list format:
|
Arrays that don't meet the tabular requirements use list format:
|
||||||
|
|||||||
@@ -34,10 +34,10 @@ Results are saved to `results/token-efficiency.md`.
|
|||||||
|
|
||||||
Tests how well LLMs can answer questions about data in different formats (TOON, JSON, JSON compact, XML, YAML, CSV):
|
Tests how well LLMs can answer questions about data in different formats (TOON, JSON, JSON compact, XML, YAML, CSV):
|
||||||
|
|
||||||
1. Generate ~200 questions across 6 datasets (CSV only included for datasets with flat/tabular structure)
|
1. Generate 209 questions across 11 datasets (6 primary + 5 structural validation; CSV only included for datasets with flat/tabular structure)
|
||||||
2. Convert each dataset to all supported formats
|
2. Convert each dataset to all supported formats
|
||||||
3. Query each LLM with formatted data + question
|
3. Query each LLM with formatted data + question
|
||||||
4. Validate answers using `gpt-5-nano` as judge
|
4. Validate answers deterministically using type-aware comparison (no LLM judge needed)
|
||||||
5. Aggregate metrics and generate report
|
5. Aggregate metrics and generate report
|
||||||
|
|
||||||
### Setup
|
### Setup
|
||||||
@@ -95,10 +95,22 @@ src/
|
|||||||
├── datasets.ts # Test data generators
|
├── datasets.ts # Test data generators
|
||||||
├── evaluate.ts # LLM evaluation
|
├── evaluate.ts # LLM evaluation
|
||||||
├── formatters.ts # Format converters
|
├── formatters.ts # Format converters
|
||||||
├── questions.ts # Question generation
|
├── normalize.ts # Answer normalization
|
||||||
├── report.ts # Markdown reports
|
├── report.ts # Markdown reports
|
||||||
├── storage.ts # Result caching
|
├── storage.ts # Result caching
|
||||||
└── utils.ts # Helpers
|
├── types.ts # Type definitions
|
||||||
|
├── utils.ts # Helpers
|
||||||
|
└── questions/ # Question generators
|
||||||
|
├── analytics.ts
|
||||||
|
├── event-logs.ts
|
||||||
|
├── github.ts
|
||||||
|
├── index.ts
|
||||||
|
├── nested-config.ts
|
||||||
|
├── nested.ts
|
||||||
|
├── structural-validation.ts
|
||||||
|
├── structure.ts
|
||||||
|
├── tabular.ts
|
||||||
|
└── utils.ts
|
||||||
data/
|
data/
|
||||||
└── github-repos.json # Top 100 GitHub repos
|
└── github-repos.json # Top 100 GitHub repos
|
||||||
results/
|
results/
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,4 +1,4 @@
|
|||||||
Benchmarks test LLM comprehension across different input formats using 204 data retrieval questions on 4 models.
|
Benchmarks test LLM comprehension across different input formats using 209 data retrieval questions on 4 models.
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><strong>Show Dataset Catalog</strong></summary>
|
<summary><strong>Show Dataset Catalog</strong></summary>
|
||||||
@@ -13,6 +13,11 @@ Benchmarks test LLM comprehension across different input formats using 204 data
|
|||||||
| Top 100 GitHub repositories | 100 | uniform | ✓ | 100% |
|
| Top 100 GitHub repositories | 100 | uniform | ✓ | 100% |
|
||||||
| Semi-uniform event logs | 75 | semi-uniform | ✗ | 50% |
|
| Semi-uniform event logs | 75 | semi-uniform | ✗ | 50% |
|
||||||
| Deeply nested configuration | 11 | deep | ✗ | 0% |
|
| Deeply nested configuration | 11 | deep | ✗ | 0% |
|
||||||
|
| Valid complete dataset (control) | 20 | uniform | ✓ | 100% |
|
||||||
|
| Array truncated: 3 rows removed from end | 17 | uniform | ✓ | 100% |
|
||||||
|
| Extra rows added beyond declared length | 23 | uniform | ✓ | 100% |
|
||||||
|
| Inconsistent field count (missing salary in row 10) | 20 | uniform | ✓ | 100% |
|
||||||
|
| Missing required fields (no email in multiple rows) | 20 | uniform | ✓ | 100% |
|
||||||
|
|
||||||
**Structure classes:**
|
**Structure classes:**
|
||||||
- **uniform**: All objects have identical fields with primitive values
|
- **uniform**: All objects have identical fields with primitive values
|
||||||
@@ -31,67 +36,69 @@ Benchmarks test LLM comprehension across different input formats using 204 data
|
|||||||
Each format's overall performance, balancing accuracy against token cost:
|
Each format's overall performance, balancing accuracy against token cost:
|
||||||
|
|
||||||
```
|
```
|
||||||
TOON ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ 17.2 │ 75.5% acc │ 4,389 tokens
|
TOON ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ 26.9 │ 73.9% acc │ 2,744 tokens
|
||||||
CSV ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓░ 16.6 │ 67.8% acc │ 4,080 tokens
|
JSON compact ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓░░░ 22.9 │ 70.7% acc │ 3,081 tokens
|
||||||
JSON compact ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓░░░ 14.7 │ 73.3% acc │ 4,982 tokens
|
YAML ▓▓▓▓▓▓▓▓▓▓▓▓▓▓░░░░░░ 18.6 │ 69.0% acc │ 3,719 tokens
|
||||||
YAML ▓▓▓▓▓▓▓▓▓▓▓▓▓▓░░░░░░ 12.1 │ 72.4% acc │ 5,976 tokens
|
JSON ▓▓▓▓▓▓▓▓▓▓▓░░░░░░░░░ 15.3 │ 69.7% acc │ 4,545 tokens
|
||||||
JSON ▓▓▓▓▓▓▓▓▓▓▓▓░░░░░░░░ 10.0 │ 72.4% acc │ 7,260 tokens
|
XML ▓▓▓▓▓▓▓▓▓▓░░░░░░░░░░ 13.0 │ 67.1% acc │ 5,167 tokens
|
||||||
XML ▓▓▓▓▓▓▓▓▓▓░░░░░░░░░░ 8.4 │ 69.0% acc │ 8,251 tokens
|
|
||||||
```
|
```
|
||||||
|
|
||||||
TOON achieves **75.5%** accuracy (vs JSON's 72.4%) while using **39.5% fewer tokens**.
|
TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**.
|
||||||
|
|
||||||
|
**Note on CSV:** Excluded from ranking as it only supports 109/209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.
|
||||||
|
|
||||||
#### Per-Model Accuracy
|
#### Per-Model Accuracy
|
||||||
|
|
||||||
Accuracy across 4 LLMs on 204 data retrieval questions:
|
Accuracy across 4 LLMs on 209 data retrieval questions:
|
||||||
|
|
||||||
```
|
```
|
||||||
claude-haiku-4-5-20251001
|
claude-haiku-4-5-20251001
|
||||||
→ TOON ████████████░░░░░░░░ 62.3% (127/204)
|
→ TOON ████████████░░░░░░░░ 59.8% (125/209)
|
||||||
JSON ███████████░░░░░░░░░ 56.9% (116/204)
|
JSON ███████████░░░░░░░░░ 57.4% (120/209)
|
||||||
YAML ███████████░░░░░░░░░ 55.9% (114/204)
|
YAML ███████████░░░░░░░░░ 56.0% (117/209)
|
||||||
JSON compact ███████████░░░░░░░░░ 54.9% (112/204)
|
XML ███████████░░░░░░░░░ 55.5% (116/209)
|
||||||
XML ███████████░░░░░░░░░ 54.9% (112/204)
|
JSON compact ███████████░░░░░░░░░ 55.0% (115/209)
|
||||||
CSV █████████░░░░░░░░░░░ 47.1% (49/104)
|
CSV ██████████░░░░░░░░░░ 50.5% (55/109)
|
||||||
|
|
||||||
gemini-2.5-flash
|
gemini-2.5-flash
|
||||||
→ TOON ██████████████████░░ 91.2% (186/204)
|
→ TOON ██████████████████░░ 87.6% (183/209)
|
||||||
YAML ██████████████████░░ 89.7% (183/204)
|
CSV █████████████████░░░ 86.2% (94/109)
|
||||||
JSON compact ██████████████████░░ 87.7% (179/204)
|
JSON compact ████████████████░░░░ 82.3% (172/209)
|
||||||
JSON ██████████████████░░ 87.7% (179/204)
|
YAML ████████████████░░░░ 79.4% (166/209)
|
||||||
XML █████████████████░░░ 87.3% (178/204)
|
XML ████████████████░░░░ 79.4% (166/209)
|
||||||
CSV █████████████████░░░ 85.6% (89/104)
|
JSON ███████████████░░░░░ 77.0% (161/209)
|
||||||
|
|
||||||
gpt-5-nano
|
gpt-5-nano
|
||||||
JSON compact ███████████████████░ 93.6% (191/204)
|
→ TOON ██████████████████░░ 90.9% (190/209)
|
||||||
CSV ██████████████████░░ 90.4% (94/104)
|
JSON compact ██████████████████░░ 90.9% (190/209)
|
||||||
JSON ██████████████████░░ 89.7% (183/204)
|
JSON ██████████████████░░ 89.0% (186/209)
|
||||||
→ TOON ██████████████████░░ 89.2% (182/204)
|
CSV ██████████████████░░ 89.0% (97/109)
|
||||||
YAML ██████████████████░░ 89.2% (182/204)
|
YAML █████████████████░░░ 87.1% (182/209)
|
||||||
XML ████████████████░░░░ 81.4% (166/204)
|
XML ████████████████░░░░ 80.9% (169/209)
|
||||||
|
|
||||||
grok-4-fast-non-reasoning
|
grok-4-fast-non-reasoning
|
||||||
→ TOON ████████████░░░░░░░░ 59.3% (121/204)
|
→ TOON ███████████░░░░░░░░░ 57.4% (120/209)
|
||||||
JSON compact ███████████░░░░░░░░░ 56.9% (116/204)
|
JSON ███████████░░░░░░░░░ 55.5% (116/209)
|
||||||
JSON ███████████░░░░░░░░░ 55.4% (113/204)
|
JSON compact ███████████░░░░░░░░░ 54.5% (114/209)
|
||||||
YAML ███████████░░░░░░░░░ 54.9% (112/204)
|
YAML ███████████░░░░░░░░░ 53.6% (112/209)
|
||||||
XML ██████████░░░░░░░░░░ 52.5% (107/204)
|
XML ███████████░░░░░░░░░ 52.6% (110/209)
|
||||||
CSV ██████████░░░░░░░░░░ 48.1% (50/104)
|
CSV ██████████░░░░░░░░░░ 52.3% (57/109)
|
||||||
```
|
```
|
||||||
|
|
||||||
**Key tradeoff:** TOON achieves **75.5% accuracy** (vs JSON's 72.4%) while using **39.5% fewer tokens** on these datasets.
|
**Key tradeoff:** TOON achieves **73.9% accuracy** (vs JSON's 69.7%) while using **39.6% fewer tokens** on these datasets.
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><strong>Performance by dataset, model, and question type</strong></summary>
|
<summary><strong>Performance by dataset, model, and question type</strong></summary>
|
||||||
|
|
||||||
#### Performance by Question Type
|
#### Performance by Question Type
|
||||||
|
|
||||||
| Question Type | TOON | JSON compact | JSON | YAML | XML | CSV |
|
| Question Type | TOON | JSON compact | JSON | CSV | YAML | XML |
|
||||||
| ------------- | ---- | ---- | ---- | ---- | ---- | ---- |
|
| ------------- | ---- | ---- | ---- | ---- | ---- | ---- |
|
||||||
| Field Retrieval | 100.0% | 98.9% | 99.6% | 99.3% | 98.5% | 100.0% |
|
| Field Retrieval | 99.6% | 99.3% | 99.3% | 100.0% | 98.2% | 98.9% |
|
||||||
| Aggregation | 56.3% | 52.4% | 53.2% | 53.2% | 47.2% | 40.5% |
|
| Aggregation | 54.4% | 47.2% | 48.8% | 44.0% | 47.6% | 41.3% |
|
||||||
| Filtering | 58.9% | 58.3% | 54.2% | 53.1% | 50.5% | 49.1% |
|
| Filtering | 56.3% | 57.3% | 50.5% | 49.1% | 51.0% | 47.9% |
|
||||||
| Structure Awareness | 89.0% | 85.0% | 82.0% | 85.0% | 79.0% | 84.4% |
|
| Structure Awareness | 88.0% | 83.0% | 83.0% | 85.9% | 80.0% | 80.0% |
|
||||||
|
| Structural Validation | 70.0% | 45.0% | 50.0% | 80.0% | 60.0% | 80.0% |
|
||||||
|
|
||||||
#### Performance by Dataset
|
#### Performance by Dataset
|
||||||
|
|
||||||
@@ -99,64 +106,119 @@ grok-4-fast-non-reasoning
|
|||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `csv` | 70.7% | 2,337 | 116/164 |
|
| `csv` | 72.0% | 2,352 | 118/164 |
|
||||||
| `toon` | 72.0% | 2,483 | 118/164 |
|
| `toon` | 73.8% | 2,518 | 121/164 |
|
||||||
| `json-compact` | 71.3% | 3,943 | 117/164 |
|
| `json-compact` | 69.5% | 3,953 | 114/164 |
|
||||||
| `yaml` | 70.1% | 4,969 | 115/164 |
|
| `yaml` | 68.3% | 4,982 | 112/164 |
|
||||||
| `json-pretty` | 72.6% | 6,347 | 119/164 |
|
| `json-pretty` | 68.3% | 6,360 | 112/164 |
|
||||||
| `xml` | 70.7% | 7,314 | 116/164 |
|
| `xml` | 69.5% | 7,324 | 114/164 |
|
||||||
|
|
||||||
##### E-commerce orders with nested structures
|
##### E-commerce orders with nested structures
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `toon` | 83.5% | 7,197 | 137/164 |
|
| `toon` | 81.1% | 7,232 | 133/164 |
|
||||||
| `json-compact` | 79.3% | 6,784 | 130/164 |
|
| `json-compact` | 76.8% | 6,794 | 126/164 |
|
||||||
| `yaml` | 78.7% | 8,334 | 129/164 |
|
| `yaml` | 75.6% | 8,347 | 124/164 |
|
||||||
| `json-pretty` | 78.7% | 10,700 | 129/164 |
|
| `json-pretty` | 76.2% | 10,713 | 125/164 |
|
||||||
| `xml` | 73.8% | 12,013 | 121/164 |
|
| `xml` | 74.4% | 12,023 | 122/164 |
|
||||||
|
|
||||||
##### Time-series analytics data
|
##### Time-series analytics data
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `toon` | 75.8% | 1,513 | 91/120 |
|
| `csv` | 73.3% | 1,406 | 88/120 |
|
||||||
| `csv` | 72.5% | 1,391 | 87/120 |
|
| `toon` | 72.5% | 1,548 | 87/120 |
|
||||||
| `json-compact` | 70.0% | 2,339 | 84/120 |
|
| `json-compact` | 71.7% | 2,349 | 86/120 |
|
||||||
| `yaml` | 70.0% | 2,936 | 84/120 |
|
| `yaml` | 71.7% | 2,949 | 86/120 |
|
||||||
| `json-pretty` | 71.7% | 3,663 | 86/120 |
|
| `json-pretty` | 68.3% | 3,676 | 82/120 |
|
||||||
| `xml` | 71.7% | 4,374 | 86/120 |
|
| `xml` | 68.3% | 4,384 | 82/120 |
|
||||||
|
|
||||||
##### Top 100 GitHub repositories
|
##### Top 100 GitHub repositories
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `toon` | 64.4% | 8,745 | 85/132 |
|
| `toon` | 62.9% | 8,780 | 83/132 |
|
||||||
| `csv` | 59.8% | 8,513 | 79/132 |
|
| `csv` | 61.4% | 8,528 | 81/132 |
|
||||||
| `json-compact` | 60.6% | 11,455 | 80/132 |
|
| `yaml` | 59.8% | 13,142 | 79/132 |
|
||||||
| `yaml` | 61.4% | 13,129 | 81/132 |
|
| `json-compact` | 55.3% | 11,465 | 73/132 |
|
||||||
| `json-pretty` | 59.1% | 15,145 | 78/132 |
|
| `json-pretty` | 56.1% | 15,158 | 74/132 |
|
||||||
| `xml` | 51.5% | 17,095 | 68/132 |
|
| `xml` | 48.5% | 17,105 | 64/132 |
|
||||||
|
|
||||||
##### Semi-uniform event logs
|
##### Semi-uniform event logs
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `json-compact` | 67.5% | 4,809 | 81/120 |
|
| `json-compact` | 63.3% | 4,819 | 76/120 |
|
||||||
| `yaml` | 63.3% | 5,814 | 76/120 |
|
| `toon` | 57.5% | 5,799 | 69/120 |
|
||||||
| `toon` | 62.5% | 5,764 | 75/120 |
|
| `json-pretty` | 59.2% | 6,797 | 71/120 |
|
||||||
| `json-pretty` | 59.2% | 6,784 | 71/120 |
|
| `yaml` | 48.3% | 5,827 | 58/120 |
|
||||||
| `xml` | 55.0% | 7,699 | 66/120 |
|
| `xml` | 46.7% | 7,709 | 56/120 |
|
||||||
|
|
||||||
##### Deeply nested configuration
|
##### Deeply nested configuration
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `json-compact` | 91.4% | 564 | 106/116 |
|
| `json-compact` | 92.2% | 574 | 107/116 |
|
||||||
| `toon` | 94.8% | 631 | 110/116 |
|
| `toon` | 95.7% | 666 | 111/116 |
|
||||||
| `yaml` | 91.4% | 673 | 106/116 |
|
| `yaml` | 91.4% | 686 | 106/116 |
|
||||||
| `json-pretty` | 93.1% | 919 | 108/116 |
|
| `json-pretty` | 94.0% | 932 | 109/116 |
|
||||||
| `xml` | 91.4% | 1,008 | 106/116 |
|
| `xml` | 92.2% | 1,018 | 107/116 |
|
||||||
|
|
||||||
|
##### Valid complete dataset (control)
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `toon` | 100.0% | 544 | 4/4 |
|
||||||
|
| `json-compact` | 100.0% | 795 | 4/4 |
|
||||||
|
| `yaml` | 100.0% | 1,003 | 4/4 |
|
||||||
|
| `json-pretty` | 100.0% | 1,282 | 4/4 |
|
||||||
|
| `csv` | 25.0% | 492 | 1/4 |
|
||||||
|
| `xml` | 0.0% | 1,467 | 0/4 |
|
||||||
|
|
||||||
|
##### Array truncated: 3 rows removed from end
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `csv` | 100.0% | 425 | 4/4 |
|
||||||
|
| `xml` | 100.0% | 1,251 | 4/4 |
|
||||||
|
| `toon` | 0.0% | 474 | 0/4 |
|
||||||
|
| `json-compact` | 0.0% | 681 | 0/4 |
|
||||||
|
| `json-pretty` | 0.0% | 1,096 | 0/4 |
|
||||||
|
| `yaml` | 0.0% | 859 | 0/4 |
|
||||||
|
|
||||||
|
##### Extra rows added beyond declared length
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `csv` | 100.0% | 566 | 4/4 |
|
||||||
|
| `toon` | 75.0% | 621 | 3/4 |
|
||||||
|
| `xml` | 100.0% | 1,692 | 4/4 |
|
||||||
|
| `yaml` | 75.0% | 1,157 | 3/4 |
|
||||||
|
| `json-compact` | 50.0% | 917 | 2/4 |
|
||||||
|
| `json-pretty` | 50.0% | 1,476 | 2/4 |
|
||||||
|
|
||||||
|
##### Inconsistent field count (missing salary in row 10)
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `csv` | 75.0% | 489 | 3/4 |
|
||||||
|
| `yaml` | 100.0% | 996 | 4/4 |
|
||||||
|
| `toon` | 100.0% | 1,019 | 4/4 |
|
||||||
|
| `json-compact` | 75.0% | 790 | 3/4 |
|
||||||
|
| `xml` | 100.0% | 1,458 | 4/4 |
|
||||||
|
| `json-pretty` | 75.0% | 1,274 | 3/4 |
|
||||||
|
|
||||||
|
##### Missing required fields (no email in multiple rows)
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `csv` | 100.0% | 329 | 4/4 |
|
||||||
|
| `xml` | 100.0% | 1,411 | 4/4 |
|
||||||
|
| `toon` | 75.0% | 983 | 3/4 |
|
||||||
|
| `yaml` | 25.0% | 960 | 1/4 |
|
||||||
|
| `json-pretty` | 25.0% | 1,230 | 1/4 |
|
||||||
|
| `json-compact` | 0.0% | 755 | 0/4 |
|
||||||
|
|
||||||
#### Performance by Model
|
#### Performance by Model
|
||||||
|
|
||||||
@@ -164,45 +226,45 @@ grok-4-fast-non-reasoning
|
|||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `toon` | 62.3% | 127/204 |
|
| `toon` | 59.8% | 125/209 |
|
||||||
| `json-pretty` | 56.9% | 116/204 |
|
| `json-pretty` | 57.4% | 120/209 |
|
||||||
| `yaml` | 55.9% | 114/204 |
|
| `yaml` | 56.0% | 117/209 |
|
||||||
| `json-compact` | 54.9% | 112/204 |
|
| `xml` | 55.5% | 116/209 |
|
||||||
| `xml` | 54.9% | 112/204 |
|
| `json-compact` | 55.0% | 115/209 |
|
||||||
| `csv` | 47.1% | 49/104 |
|
| `csv` | 50.5% | 55/109 |
|
||||||
|
|
||||||
##### gemini-2.5-flash
|
##### gemini-2.5-flash
|
||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `toon` | 91.2% | 186/204 |
|
| `toon` | 87.6% | 183/209 |
|
||||||
| `yaml` | 89.7% | 183/204 |
|
| `csv` | 86.2% | 94/109 |
|
||||||
| `json-compact` | 87.7% | 179/204 |
|
| `json-compact` | 82.3% | 172/209 |
|
||||||
| `json-pretty` | 87.7% | 179/204 |
|
| `yaml` | 79.4% | 166/209 |
|
||||||
| `xml` | 87.3% | 178/204 |
|
| `xml` | 79.4% | 166/209 |
|
||||||
| `csv` | 85.6% | 89/104 |
|
| `json-pretty` | 77.0% | 161/209 |
|
||||||
|
|
||||||
##### gpt-5-nano
|
##### gpt-5-nano
|
||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `json-compact` | 93.6% | 191/204 |
|
| `toon` | 90.9% | 190/209 |
|
||||||
| `csv` | 90.4% | 94/104 |
|
| `json-compact` | 90.9% | 190/209 |
|
||||||
| `json-pretty` | 89.7% | 183/204 |
|
| `json-pretty` | 89.0% | 186/209 |
|
||||||
| `toon` | 89.2% | 182/204 |
|
| `csv` | 89.0% | 97/109 |
|
||||||
| `yaml` | 89.2% | 182/204 |
|
| `yaml` | 87.1% | 182/209 |
|
||||||
| `xml` | 81.4% | 166/204 |
|
| `xml` | 80.9% | 169/209 |
|
||||||
|
|
||||||
##### grok-4-fast-non-reasoning
|
##### grok-4-fast-non-reasoning
|
||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `toon` | 59.3% | 121/204 |
|
| `toon` | 57.4% | 120/209 |
|
||||||
| `json-compact` | 56.9% | 116/204 |
|
| `json-pretty` | 55.5% | 116/209 |
|
||||||
| `json-pretty` | 55.4% | 113/204 |
|
| `json-compact` | 54.5% | 114/209 |
|
||||||
| `yaml` | 54.9% | 112/204 |
|
| `yaml` | 53.6% | 112/209 |
|
||||||
| `xml` | 52.5% | 107/204 |
|
| `xml` | 52.6% | 110/209 |
|
||||||
| `csv` | 48.1% | 50/104 |
|
| `csv` | 52.3% | 57/109 |
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
@@ -215,8 +277,9 @@ This benchmark tests **LLM comprehension and data retrieval accuracy** across di
|
|||||||
|
|
||||||
#### Datasets Tested
|
#### Datasets Tested
|
||||||
|
|
||||||
Six datasets designed to test different structural patterns:
|
Eleven datasets designed to test different structural patterns and validation capabilities:
|
||||||
|
|
||||||
|
**Primary datasets:**
|
||||||
1. **Tabular** (100 employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
|
1. **Tabular** (100 employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
|
||||||
2. **Nested** (50 e-commerce orders): Complex structures with nested customer objects and item arrays.
|
2. **Nested** (50 e-commerce orders): Complex structures with nested customer objects and item arrays.
|
||||||
3. **Analytics** (60 days of metrics): Time-series data with dates and numeric values.
|
3. **Analytics** (60 days of metrics): Time-series data with dates and numeric values.
|
||||||
@@ -224,21 +287,28 @@ Six datasets designed to test different structural patterns:
|
|||||||
5. **Event Logs** (75 logs): Semi-uniform data with ~50% flat logs and ~50% with nested error objects.
|
5. **Event Logs** (75 logs): Semi-uniform data with ~50% flat logs and ~50% with nested error objects.
|
||||||
6. **Nested Config** (1 configuration): Deeply nested configuration with minimal tabular eligibility.
|
6. **Nested Config** (1 configuration): Deeply nested configuration with minimal tabular eligibility.
|
||||||
|
|
||||||
|
**Structural validation datasets:**
|
||||||
|
7. **Control**: Valid complete dataset (baseline for validation)
|
||||||
|
8. **Truncated**: Array with 3 rows removed from end (tests [N] length detection)
|
||||||
|
9. **Extra rows**: Array with 3 additional rows beyond declared length
|
||||||
|
10. **Width mismatch**: Inconsistent field count (missing salary in row 10)
|
||||||
|
11. **Missing fields**: Systematic field omissions (no email in multiple rows)
|
||||||
|
|
||||||
#### Question Types
|
#### Question Types
|
||||||
|
|
||||||
204 questions are generated dynamically across four categories:
|
209 questions are generated dynamically across five categories:
|
||||||
|
|
||||||
- **Field retrieval (33%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
- **Field retrieval (33%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
||||||
- Example: "What is Alice's salary?" → `75000`
|
- Example: "What is Alice's salary?" → `75000`
|
||||||
- Example: "How many items are in order ORD-0042?" → `3`
|
- Example: "How many items are in order ORD-0042?" → `3`
|
||||||
- Example: "What is the customer name for order ORD-0042?" → `John Doe`
|
- Example: "What is the customer name for order ORD-0042?" → `John Doe`
|
||||||
|
|
||||||
- **Aggregation (31%)**: Dataset-level totals and averages plus single-condition filters (counts, sums, min/max comparisons)
|
- **Aggregation (30%)**: Dataset-level totals and averages plus single-condition filters (counts, sums, min/max comparisons)
|
||||||
- Example: "How many employees work in Engineering?" → `17`
|
- Example: "How many employees work in Engineering?" → `17`
|
||||||
- Example: "What is the total revenue across all orders?" → `45123.50`
|
- Example: "What is the total revenue across all orders?" → `45123.50`
|
||||||
- Example: "How many employees have salary > 80000?" → `23`
|
- Example: "How many employees have salary > 80000?" → `23`
|
||||||
|
|
||||||
- **Filtering (24%)**: Multi-condition queries requiring compound logic (AND constraints across fields)
|
- **Filtering (23%)**: Multi-condition queries requiring compound logic (AND constraints across fields)
|
||||||
- Example: "How many employees in Sales have salary > 80000?" → `5`
|
- Example: "How many employees in Sales have salary > 80000?" → `5`
|
||||||
- Example: "How many active employees have more than 10 years of experience?" → `8`
|
- Example: "How many active employees have more than 10 years of experience?" → `8`
|
||||||
|
|
||||||
@@ -247,17 +317,22 @@ Six datasets designed to test different structural patterns:
|
|||||||
- Example: "List the field names for employees" → `id, name, email, department, salary, yearsExperience, active`
|
- Example: "List the field names for employees" → `id, name, email, department, salary, yearsExperience, active`
|
||||||
- Example: "What is the department of the last employee?" → `Sales`
|
- Example: "What is the department of the last employee?" → `Sales`
|
||||||
|
|
||||||
|
- **Structural validation (2%)**: Tests ability to detect incomplete, truncated, or corrupted data using structural metadata
|
||||||
|
- Example: "Is this data complete and valid?" → `YES` (control dataset) or `NO` (corrupted datasets)
|
||||||
|
- Tests TOON's [N] length validation and {fields} consistency checking
|
||||||
|
- Demonstrates CSV's lack of structural validation capabilities
|
||||||
|
|
||||||
#### Evaluation Process
|
#### Evaluation Process
|
||||||
|
|
||||||
1. **Format conversion**: Each dataset is converted to all 6 formats (TOON, JSON compact, JSON, YAML, XML, CSV).
|
1. **Format conversion**: Each dataset is converted to all 6 formats (TOON, JSON compact, JSON, CSV, YAML, XML).
|
||||||
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
||||||
3. **Validate with LLM-as-judge**: `gpt-5-nano` validates if the answer is semantically correct (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`).
|
3. **Validate deterministically**: Answers are validated using type-aware comparison (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`) without requiring an LLM judge.
|
||||||
|
|
||||||
#### Models & Configuration
|
#### Models & Configuration
|
||||||
|
|
||||||
- **Models tested**: `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `gpt-5-nano`, `grok-4-fast-non-reasoning`
|
- **Models tested**: `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `gpt-5-nano`, `grok-4-fast-non-reasoning`
|
||||||
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
|
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
|
||||||
- **Temperature**: Not set (models use their defaults)
|
- **Temperature**: Not set (models use their defaults)
|
||||||
- **Total evaluations**: 204 questions × 6 formats × 4 models = 4,896 LLM calls
|
- **Total evaluations**: 209 questions × 6 formats × 4 models = 5,016 LLM calls
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|||||||
@@ -5,19 +5,19 @@ Datasets with nested or semi-uniform structures. CSV excluded as it cannot prope
|
|||||||
```
|
```
|
||||||
🛒 E-commerce orders with nested structures ┊ Tabular: 33%
|
🛒 E-commerce orders with nested structures ┊ Tabular: 33%
|
||||||
│
|
│
|
||||||
TOON █████████████░░░░░░░ 72,743 tokens
|
TOON █████████████░░░░░░░ 72,771 tokens
|
||||||
├─ vs JSON (−33.1%) 108,731 tokens
|
├─ vs JSON (−33.1%) 108,806 tokens
|
||||||
├─ vs JSON compact (+5.5%) 68,936 tokens
|
├─ vs JSON compact (+5.5%) 68,975 tokens
|
||||||
├─ vs YAML (−14.1%) 84,724 tokens
|
├─ vs YAML (−14.2%) 84,780 tokens
|
||||||
└─ vs XML (−40.5%) 122,313 tokens
|
└─ vs XML (−40.5%) 122,406 tokens
|
||||||
|
|
||||||
🧾 Semi-uniform event logs ┊ Tabular: 50%
|
🧾 Semi-uniform event logs ┊ Tabular: 50%
|
||||||
│
|
│
|
||||||
TOON █████████████████░░░ 153,223 tokens
|
TOON █████████████████░░░ 153,211 tokens
|
||||||
├─ vs JSON (−15.0%) 180,196 tokens
|
├─ vs JSON (−15.0%) 180,176 tokens
|
||||||
├─ vs JSON compact (+19.9%) 127,740 tokens
|
├─ vs JSON compact (+19.9%) 127,731 tokens
|
||||||
├─ vs YAML (−0.8%) 154,514 tokens
|
├─ vs YAML (−0.8%) 154,505 tokens
|
||||||
└─ vs XML (−25.2%) 204,800 tokens
|
└─ vs XML (−25.2%) 204,777 tokens
|
||||||
|
|
||||||
🧩 Deeply nested configuration ┊ Tabular: 0%
|
🧩 Deeply nested configuration ┊ Tabular: 0%
|
||||||
│
|
│
|
||||||
@@ -28,11 +28,11 @@ Datasets with nested or semi-uniform structures. CSV excluded as it cannot prope
|
|||||||
└─ vs XML (−37.4%) 1,008 tokens
|
└─ vs XML (−37.4%) 1,008 tokens
|
||||||
|
|
||||||
──────────────────────────────────── Total ────────────────────────────────────
|
──────────────────────────────────── Total ────────────────────────────────────
|
||||||
TOON ████████████████░░░░ 226,597 tokens
|
TOON ████████████████░░░░ 226,613 tokens
|
||||||
├─ vs JSON (−21.8%) 289,846 tokens
|
├─ vs JSON (−21.8%) 289,901 tokens
|
||||||
├─ vs JSON compact (+14.9%) 197,240 tokens
|
├─ vs JSON compact (+14.9%) 197,270 tokens
|
||||||
├─ vs YAML (−5.5%) 239,911 tokens
|
├─ vs YAML (−5.6%) 239,958 tokens
|
||||||
└─ vs XML (−30.9%) 328,121 tokens
|
└─ vs XML (−31.0%) 328,191 tokens
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Flat-Only Track
|
#### Flat-Only Track
|
||||||
@@ -42,21 +42,21 @@ Datasets with flat tabular structures where CSV is applicable.
|
|||||||
```
|
```
|
||||||
👥 Uniform employee records ┊ Tabular: 100%
|
👥 Uniform employee records ┊ Tabular: 100%
|
||||||
│
|
│
|
||||||
CSV ███████████████████░ 46,956 tokens
|
CSV ███████████████████░ 46,954 tokens
|
||||||
TOON ████████████████████ 49,827 tokens (+6.1% vs CSV)
|
TOON ████████████████████ 49,831 tokens (+6.1% vs CSV)
|
||||||
├─ vs JSON (−60.7%) 126,854 tokens
|
├─ vs JSON (−60.7%) 126,860 tokens
|
||||||
├─ vs JSON compact (−36.8%) 78,850 tokens
|
├─ vs JSON compact (−36.8%) 78,856 tokens
|
||||||
├─ vs YAML (−50.0%) 99,701 tokens
|
├─ vs YAML (−50.0%) 99,706 tokens
|
||||||
└─ vs XML (−66.0%) 146,440 tokens
|
└─ vs XML (−66.0%) 146,444 tokens
|
||||||
|
|
||||||
📈 Time-series analytics data ┊ Tabular: 100%
|
📈 Time-series analytics data ┊ Tabular: 100%
|
||||||
│
|
│
|
||||||
CSV ██████████████████░░ 8,396 tokens
|
CSV ██████████████████░░ 8,388 tokens
|
||||||
TOON ████████████████████ 9,128 tokens (+8.7% vs CSV)
|
TOON ████████████████████ 9,120 tokens (+8.7% vs CSV)
|
||||||
├─ vs JSON (−59.0%) 22,258 tokens
|
├─ vs JSON (−59.0%) 22,250 tokens
|
||||||
├─ vs JSON compact (−35.8%) 14,224 tokens
|
├─ vs JSON compact (−35.8%) 14,216 tokens
|
||||||
├─ vs YAML (−48.9%) 17,871 tokens
|
├─ vs YAML (−48.9%) 17,863 tokens
|
||||||
└─ vs XML (−65.7%) 26,629 tokens
|
└─ vs XML (−65.7%) 26,621 tokens
|
||||||
|
|
||||||
⭐ Top 100 GitHub repositories ┊ Tabular: 100%
|
⭐ Top 100 GitHub repositories ┊ Tabular: 100%
|
||||||
│
|
│
|
||||||
@@ -68,12 +68,12 @@ Datasets with flat tabular structures where CSV is applicable.
|
|||||||
└─ vs XML (−48.8%) 17,095 tokens
|
└─ vs XML (−48.8%) 17,095 tokens
|
||||||
|
|
||||||
──────────────────────────────────── Total ────────────────────────────────────
|
──────────────────────────────────── Total ────────────────────────────────────
|
||||||
CSV ███████████████████░ 63,865 tokens
|
CSV ███████████████████░ 63,855 tokens
|
||||||
TOON ████████████████████ 67,700 tokens (+6.0% vs CSV)
|
TOON ████████████████████ 67,696 tokens (+6.0% vs CSV)
|
||||||
├─ vs JSON (−58.8%) 164,257 tokens
|
├─ vs JSON (−58.8%) 164,255 tokens
|
||||||
├─ vs JSON compact (−35.2%) 104,529 tokens
|
├─ vs JSON compact (−35.2%) 104,527 tokens
|
||||||
├─ vs YAML (−48.2%) 130,701 tokens
|
├─ vs YAML (−48.2%) 130,698 tokens
|
||||||
└─ vs XML (−64.4%) 190,164 tokens
|
└─ vs XML (−64.4%) 190,160 tokens
|
||||||
```
|
```
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
@@ -83,64 +83,64 @@ Datasets with flat tabular structures where CSV is applicable.
|
|||||||
|
|
||||||
**Savings:** 13,130 tokens (59.0% reduction vs JSON)
|
**Savings:** 13,130 tokens (59.0% reduction vs JSON)
|
||||||
|
|
||||||
**JSON** (22,258 tokens):
|
**JSON** (22,250 tokens):
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"metrics": [
|
"metrics": [
|
||||||
{
|
{
|
||||||
"date": "2025-01-01",
|
"date": "2025-01-01",
|
||||||
"views": 7708,
|
"views": 5715,
|
||||||
"clicks": 595,
|
"clicks": 211,
|
||||||
"conversions": 69,
|
"conversions": 28,
|
||||||
"revenue": 15369.93,
|
"revenue": 7976.46,
|
||||||
"bounceRate": 0.35
|
"bounceRate": 0.47
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"date": "2025-01-02",
|
"date": "2025-01-02",
|
||||||
"views": 5894,
|
"views": 7103,
|
||||||
"clicks": 381,
|
"clicks": 393,
|
||||||
"conversions": 21,
|
"conversions": 28,
|
||||||
"revenue": 2112.12,
|
"revenue": 8360.53,
|
||||||
"bounceRate": 0.3
|
"bounceRate": 0.32
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"date": "2025-01-03",
|
"date": "2025-01-03",
|
||||||
"views": 6835,
|
"views": 7248,
|
||||||
"clicks": 422,
|
"clicks": 378,
|
||||||
"conversions": 35,
|
"conversions": 24,
|
||||||
"revenue": 4525.73,
|
"revenue": 3212.57,
|
||||||
"bounceRate": 0.5
|
"bounceRate": 0.5
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"date": "2025-01-04",
|
"date": "2025-01-04",
|
||||||
"views": 5325,
|
"views": 2927,
|
||||||
"clicks": 305,
|
"clicks": 77,
|
||||||
"conversions": 22,
|
"conversions": 11,
|
||||||
"revenue": 2445.3,
|
"revenue": 1211.69,
|
||||||
"bounceRate": 0.44
|
"bounceRate": 0.62
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"date": "2025-01-05",
|
"date": "2025-01-05",
|
||||||
"views": 2974,
|
"views": 3530,
|
||||||
"clicks": 61,
|
"clicks": 82,
|
||||||
"conversions": 6,
|
"conversions": 8,
|
||||||
"revenue": 956.57,
|
"revenue": 462.77,
|
||||||
"bounceRate": 0.47
|
"bounceRate": 0.56
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
**TOON** (9,128 tokens):
|
**TOON** (9,120 tokens):
|
||||||
|
|
||||||
```
|
```
|
||||||
metrics[5]{date,views,clicks,conversions,revenue,bounceRate}:
|
metrics[5]{date,views,clicks,conversions,revenue,bounceRate}:
|
||||||
2025-01-01,7708,595,69,15369.93,0.35
|
2025-01-01,5715,211,28,7976.46,0.47
|
||||||
2025-01-02,5894,381,21,2112.12,0.3
|
2025-01-02,7103,393,28,8360.53,0.32
|
||||||
2025-01-03,6835,422,35,4525.73,0.5
|
2025-01-03,7248,378,24,3212.57,0.5
|
||||||
2025-01-04,5325,305,22,2445.3,0.44
|
2025-01-04,2927,77,11,1211.69,0.62
|
||||||
2025-01-05,2974,61,6,956.57,0.47
|
2025-01-05,3530,82,8,462.77,0.56
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|||||||
@@ -56,9 +56,11 @@ export const FORMATTER_DISPLAY_NAMES: Record<string, string> = {
|
|||||||
*/
|
*/
|
||||||
export const QUESTION_TYPES = [
|
export const QUESTION_TYPES = [
|
||||||
'field-retrieval',
|
'field-retrieval',
|
||||||
|
'retrieval',
|
||||||
'aggregation',
|
'aggregation',
|
||||||
'filtering',
|
'filtering',
|
||||||
'structure-awareness',
|
'structure-awareness',
|
||||||
|
'structural-validation',
|
||||||
] as const
|
] as const
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -66,9 +68,11 @@ export const QUESTION_TYPES = [
|
|||||||
*/
|
*/
|
||||||
export const QUESTION_TYPE_LABELS = {
|
export const QUESTION_TYPE_LABELS = {
|
||||||
'field-retrieval': 'Field Retrieval',
|
'field-retrieval': 'Field Retrieval',
|
||||||
|
'retrieval': 'Retrieval',
|
||||||
'aggregation': 'Aggregation',
|
'aggregation': 'Aggregation',
|
||||||
'filtering': 'Filtering',
|
'filtering': 'Filtering',
|
||||||
'structure-awareness': 'Structure Awareness',
|
'structure-awareness': 'Structure Awareness',
|
||||||
|
'structural-validation': 'Structural Validation',
|
||||||
} as const
|
} as const
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -81,6 +85,12 @@ export const DATASET_NAMES = [
|
|||||||
'github',
|
'github',
|
||||||
'event-logs',
|
'event-logs',
|
||||||
'nested-config',
|
'nested-config',
|
||||||
|
'large-uniform',
|
||||||
|
'structural-validation-control',
|
||||||
|
'structural-validation-truncated',
|
||||||
|
'structural-validation-extra-rows',
|
||||||
|
'structural-validation-width-mismatch',
|
||||||
|
'structural-validation-missing-fields',
|
||||||
] as const
|
] as const
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -144,6 +144,30 @@ export interface NestedConfig {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Product structure for large uniform arrays
|
||||||
|
*/
|
||||||
|
export interface Product {
|
||||||
|
sku: string
|
||||||
|
name: string
|
||||||
|
category: string
|
||||||
|
price: number
|
||||||
|
qty: number
|
||||||
|
lastUpdated: string
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Internal types for structural validation pattern generation
|
||||||
|
*/
|
||||||
|
type StructuralValidationType = 'truncated' | 'extra-rows' | 'width-mismatch' | 'missing-fields'
|
||||||
|
|
||||||
|
interface StructuralValidationFixture {
|
||||||
|
type: StructuralValidationType
|
||||||
|
description: string
|
||||||
|
data: Record<string, unknown>
|
||||||
|
isValid: boolean
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate analytics time-series data
|
* Generate analytics time-series data
|
||||||
*/
|
*/
|
||||||
@@ -505,6 +529,100 @@ export function generateNestedConfig(): NestedConfig {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate large uniform product array (5000+ rows)
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Tests TOON's token efficiency and structural reliability at scale.
|
||||||
|
*/
|
||||||
|
export function generateProducts(count: number): { products: Product[] } {
|
||||||
|
const categories = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books', 'Toys'] as const
|
||||||
|
|
||||||
|
return {
|
||||||
|
products: Array.from({ length: count }, (_, i): Product => ({
|
||||||
|
sku: `SKU-${String(i + 1).padStart(6, '0')}`,
|
||||||
|
name: faker.commerce.productName(),
|
||||||
|
category: categories[i % categories.length]!,
|
||||||
|
price: Number(faker.commerce.price({ min: 5, max: 500 })),
|
||||||
|
qty: faker.number.int({ min: 0, max: 1000 }),
|
||||||
|
lastUpdated: faker.date.recent({ days: 30 }).toISOString().split('T')[0]!,
|
||||||
|
})),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate structural validation fixtures from employee data
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Creates deliberately corrupted datasets to test TOON's structural validation
|
||||||
|
* capabilities via [N] length declarations and {fields} headers.
|
||||||
|
* Internal function used to generate structural validation datasets.
|
||||||
|
*/
|
||||||
|
function generateStructuralValidationFixtures(): StructuralValidationFixture[] {
|
||||||
|
const baseData = generateEmployees(20)
|
||||||
|
|
||||||
|
return [
|
||||||
|
// Valid baseline
|
||||||
|
{
|
||||||
|
type: 'truncated' as const,
|
||||||
|
description: 'Valid complete dataset (control)',
|
||||||
|
data: { employees: baseData.employees },
|
||||||
|
isValid: true,
|
||||||
|
},
|
||||||
|
// Truncated array (missing last 3 rows)
|
||||||
|
{
|
||||||
|
type: 'truncated' as const,
|
||||||
|
description: 'Array truncated: 3 rows removed from end',
|
||||||
|
data: { employees: baseData.employees.slice(0, -3) },
|
||||||
|
isValid: false, // [N] won't match actual row count in TOON
|
||||||
|
},
|
||||||
|
// Extra rows (3 more than original)
|
||||||
|
{
|
||||||
|
type: 'extra-rows' as const,
|
||||||
|
description: 'Extra rows added beyond declared length',
|
||||||
|
data: {
|
||||||
|
employees: [
|
||||||
|
...baseData.employees,
|
||||||
|
...generateEmployees(3).employees,
|
||||||
|
],
|
||||||
|
},
|
||||||
|
isValid: false, // [N] won't match actual row count in TOON
|
||||||
|
},
|
||||||
|
// Width mismatch (inconsistent field count)
|
||||||
|
{
|
||||||
|
type: 'width-mismatch' as const,
|
||||||
|
description: 'Inconsistent field count (missing salary in row 10)',
|
||||||
|
data: {
|
||||||
|
employees: baseData.employees.map((emp, i) => {
|
||||||
|
if (i === 9) {
|
||||||
|
// Row 10, missing salary field
|
||||||
|
const { salary, ...rest } = emp
|
||||||
|
return rest
|
||||||
|
}
|
||||||
|
return emp
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
isValid: false, // Not all objects have same fields (tabular requirement)
|
||||||
|
},
|
||||||
|
// Missing required fields
|
||||||
|
{
|
||||||
|
type: 'missing-fields' as const,
|
||||||
|
description: 'Missing required fields (no email in multiple rows)',
|
||||||
|
data: {
|
||||||
|
employees: baseData.employees.map((emp, i) => {
|
||||||
|
if (i % 5 === 0) {
|
||||||
|
// Every 5th row, missing email
|
||||||
|
const { email, ...rest } = emp
|
||||||
|
return rest
|
||||||
|
}
|
||||||
|
return emp
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
isValid: false, // Not all objects have same fields (tabular requirement)
|
||||||
|
},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Event logs dataset: Semi-uniform structure
|
* Event logs dataset: Semi-uniform structure
|
||||||
*
|
*
|
||||||
@@ -539,6 +657,34 @@ const nestedConfigDataset: Dataset = {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Structural validation datasets: Tests ability to detect incomplete, truncated, or corrupted data
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* These datasets test TOON's structural validation advantages via [N] length declarations
|
||||||
|
* and {fields} headers. CSV is included to demonstrate its lack of structural metadata.
|
||||||
|
*/
|
||||||
|
const structuralValidationDatasets: Dataset[] = generateStructuralValidationFixtures().map((fixture, index) => {
|
||||||
|
const datasetNames = [
|
||||||
|
'structural-validation-control',
|
||||||
|
'structural-validation-truncated',
|
||||||
|
'structural-validation-extra-rows',
|
||||||
|
'structural-validation-width-mismatch',
|
||||||
|
'structural-validation-missing-fields',
|
||||||
|
] as const
|
||||||
|
|
||||||
|
return {
|
||||||
|
name: datasetNames[index]!,
|
||||||
|
description: fixture.description,
|
||||||
|
data: fixture.data,
|
||||||
|
metadata: {
|
||||||
|
supportsCSV: true, // Include CSV to show it can't validate structure
|
||||||
|
structureClass: 'uniform',
|
||||||
|
tabularEligibility: 100,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Datasets for accuracy benchmarks (smaller sizes for faster evaluation)
|
* Datasets for accuracy benchmarks (smaller sizes for faster evaluation)
|
||||||
*/
|
*/
|
||||||
@@ -549,6 +695,7 @@ export const ACCURACY_DATASETS: Dataset[] = [
|
|||||||
githubDataset, // 100 repos
|
githubDataset, // 100 repos
|
||||||
eventLogsDataset, // 75 logs
|
eventLogsDataset, // 75 logs
|
||||||
nestedConfigDataset, // 1 config
|
nestedConfigDataset, // 1 config
|
||||||
|
...structuralValidationDatasets, // 5 validation fixtures
|
||||||
]
|
]
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import { google } from '@ai-sdk/google'
|
|||||||
import { openai } from '@ai-sdk/openai'
|
import { openai } from '@ai-sdk/openai'
|
||||||
import { xai } from '@ai-sdk/xai'
|
import { xai } from '@ai-sdk/xai'
|
||||||
import { generateText } from 'ai'
|
import { generateText } from 'ai'
|
||||||
|
import { compareAnswers } from './normalize'
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Models used for evaluation
|
* Models used for evaluation
|
||||||
@@ -74,7 +75,13 @@ ${formattedData}
|
|||||||
|
|
||||||
Question: ${question.prompt}
|
Question: ${question.prompt}
|
||||||
|
|
||||||
Provide only the direct answer, without any additional explanation or formatting.
|
Answer format requirements:
|
||||||
|
- Provide only the value itself, no explanation
|
||||||
|
- For numbers: output digits only (no commas, currency symbols, or units)
|
||||||
|
- For dates/field names: use the exact string from the data
|
||||||
|
- For lists: output comma-separated values with no spaces
|
||||||
|
|
||||||
|
Answer:
|
||||||
`.trim()
|
`.trim()
|
||||||
|
|
||||||
const startTime = performance.now()
|
const startTime = performance.now()
|
||||||
@@ -83,11 +90,13 @@ Provide only the direct answer, without any additional explanation or formatting
|
|||||||
const actual = text.trim()
|
const actual = text.trim()
|
||||||
const latencyMs = performance.now() - startTime
|
const latencyMs = performance.now() - startTime
|
||||||
|
|
||||||
const isCorrect = await validateAnswer({
|
const comparisonResult = compareAnswers(
|
||||||
actual,
|
actual,
|
||||||
expected: question.groundTruth,
|
question.groundTruth,
|
||||||
question: question.prompt,
|
question.answerType ?? 'string',
|
||||||
})
|
question.normalizationOptions,
|
||||||
|
)
|
||||||
|
const isCorrect = comparisonResult.match
|
||||||
|
|
||||||
return {
|
return {
|
||||||
questionId: question.id,
|
questionId: question.id,
|
||||||
@@ -101,42 +110,3 @@ Provide only the direct answer, without any additional explanation or formatting
|
|||||||
latencyMs,
|
latencyMs,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Validate an answer using LLM-as-judge approach
|
|
||||||
*/
|
|
||||||
async function validateAnswer(
|
|
||||||
{
|
|
||||||
actual,
|
|
||||||
expected,
|
|
||||||
question,
|
|
||||||
}:
|
|
||||||
{
|
|
||||||
actual: string
|
|
||||||
expected: string
|
|
||||||
question: string
|
|
||||||
},
|
|
||||||
): Promise<boolean> {
|
|
||||||
const prompt = `
|
|
||||||
You are validating answers to questions about structured data.
|
|
||||||
|
|
||||||
Question: ${question}
|
|
||||||
Expected answer: ${expected}
|
|
||||||
Actual answer: ${actual}
|
|
||||||
|
|
||||||
Is the actual answer correct? Consider:
|
|
||||||
- Exact matches are correct
|
|
||||||
- Semantically equivalent answers are correct (e.g., "50000" vs "$50,000" vs "50000 dollars")
|
|
||||||
- Minor formatting differences are acceptable
|
|
||||||
- Case-insensitive comparison for text
|
|
||||||
|
|
||||||
Respond with only "YES" or "NO".
|
|
||||||
`.trim()
|
|
||||||
|
|
||||||
const { text } = await generateText({
|
|
||||||
model: models.find(m => m.modelId === 'gpt-5-nano')!,
|
|
||||||
prompt,
|
|
||||||
})
|
|
||||||
|
|
||||||
return text.trim().toUpperCase() === 'YES'
|
|
||||||
}
|
|
||||||
|
|||||||
386
benchmarks/src/normalize.ts
Normal file
386
benchmarks/src/normalize.ts
Normal file
@@ -0,0 +1,386 @@
|
|||||||
|
/**
|
||||||
|
* Type of expected answer for deterministic comparison
|
||||||
|
*/
|
||||||
|
export type AnswerType
|
||||||
|
= | 'integer'
|
||||||
|
| 'number'
|
||||||
|
| 'boolean'
|
||||||
|
| 'date'
|
||||||
|
| 'string'
|
||||||
|
| 'csv-list-ordered'
|
||||||
|
| 'csv-list-unordered'
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Options for answer normalization and comparison
|
||||||
|
*/
|
||||||
|
export interface NormalizationOptions {
|
||||||
|
/**
|
||||||
|
* Tolerance for floating-point number comparison (e.g., 1e-6).
|
||||||
|
* @default 1e-6
|
||||||
|
*/
|
||||||
|
tolerance?: number
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Whether string comparison should be case-sensitive.
|
||||||
|
* @default false
|
||||||
|
*/
|
||||||
|
caseSensitive?: boolean
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Allow currency symbols ($, €, etc.) in number extraction.
|
||||||
|
* @default true
|
||||||
|
*/
|
||||||
|
allowCurrency?: boolean
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Allow percent signs (%) in number extraction (will divide by 100).
|
||||||
|
* @default true
|
||||||
|
*/
|
||||||
|
allowPercent?: boolean
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of decimal places to round to for number comparison.
|
||||||
|
* If specified, overrides tolerance-based comparison.
|
||||||
|
*/
|
||||||
|
decimalPlaces?: number
|
||||||
|
}
|
||||||
|
|
||||||
|
interface NormalizedResult {
|
||||||
|
success: boolean
|
||||||
|
value?: unknown
|
||||||
|
error?: string
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default normalization options
|
||||||
|
*/
|
||||||
|
const DEFAULT_OPTIONS: Required<NormalizationOptions> = {
|
||||||
|
tolerance: 1e-6,
|
||||||
|
caseSensitive: false,
|
||||||
|
allowCurrency: true,
|
||||||
|
allowPercent: true,
|
||||||
|
decimalPlaces: undefined!,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Regex pattern constants
|
||||||
|
const INTEGER_PATTERN_WITH_CURRENCY = /[$€£¥]?\s*-?\d[\d,]*/
|
||||||
|
const INTEGER_PATTERN = /-?\d[\d,]*/
|
||||||
|
const NUMBER_PATTERN_WITH_CURRENCY = /[$€£¥]?\s*-?\d[\d,]*(?:\.\d+)?(?:e[+-]?\d+)?%?/i
|
||||||
|
const NUMBER_PATTERN = /-?\d[\d,]*(?:\.\d+)?(?:e[+-]?\d+)?%?/i
|
||||||
|
const WRAPPING_QUOTES_PATTERN = /^["']|["']$/g
|
||||||
|
const CODE_FENCE_PATTERN = /^```[\s\S]*?```$/g
|
||||||
|
const LANGUAGE_IDENTIFIER_PATTERN = /^\w+\n/
|
||||||
|
const CURRENCY_AND_FORMATTING_CHARS = /[$€£¥,\s]/g
|
||||||
|
const NUMBER_CLEANUP_CHARS = /[$€£¥,%\s]/g
|
||||||
|
|
||||||
|
// Boolean value constants
|
||||||
|
const TRUE_VALUES = new Set(['true', 'yes', 'y', '1'])
|
||||||
|
const FALSE_VALUES = new Set(['false', 'no', 'n', '0'])
|
||||||
|
|
||||||
|
// Numeric constants
|
||||||
|
const PERCENTAGE_DIVISOR = 100
|
||||||
|
const DECIMAL_BASE = 10
|
||||||
|
const MONTH_OFFSET = 1 // JavaScript months are 0-indexed
|
||||||
|
const DATE_COMPONENT_WIDTH = 2
|
||||||
|
const DATE_PAD_CHAR = '0'
|
||||||
|
|
||||||
|
// String constants
|
||||||
|
const CSV_DELIMITER = ','
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Strip wrapping quotes from a string
|
||||||
|
*/
|
||||||
|
function stripWrappingQuotes(text: string): string {
|
||||||
|
return text.trim().replace(WRAPPING_QUOTES_PATTERN, '')
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract and normalize an integer from a string
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Handles: "42", "1,234", "$5,678", " -99 ", "The answer is 42."
|
||||||
|
*/
|
||||||
|
function normalizeInteger(text: string, options: Required<NormalizationOptions>): NormalizedResult {
|
||||||
|
// Strip common formatting, extract first integer-like token
|
||||||
|
const pattern = options.allowCurrency
|
||||||
|
? INTEGER_PATTERN_WITH_CURRENCY
|
||||||
|
: INTEGER_PATTERN
|
||||||
|
|
||||||
|
const match = text.match(pattern)
|
||||||
|
if (!match)
|
||||||
|
return { success: false, error: `No integer found in: "${text}"` }
|
||||||
|
|
||||||
|
// Remove currency symbols, spaces, and thousand separators
|
||||||
|
const normalizedValue = match[0].replace(CURRENCY_AND_FORMATTING_CHARS, '')
|
||||||
|
const parsedNumber = Number.parseInt(normalizedValue, DECIMAL_BASE)
|
||||||
|
|
||||||
|
if (Number.isNaN(parsedNumber))
|
||||||
|
return { success: false, error: `Failed to parse integer: "${match[0]}"` }
|
||||||
|
|
||||||
|
return { success: true, value: parsedNumber }
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract and normalize a floating-point number from a string
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Handles: "3.14", "1,234.56", "$5,678.90", "42%", "1.5e-3", "Price: $99.99"
|
||||||
|
*/
|
||||||
|
function normalizeNumber(text: string, options: Required<NormalizationOptions>): NormalizedResult {
|
||||||
|
// Extract first number-like token (supports scientific notation)
|
||||||
|
const pattern = options.allowCurrency
|
||||||
|
? NUMBER_PATTERN_WITH_CURRENCY
|
||||||
|
: NUMBER_PATTERN
|
||||||
|
|
||||||
|
const match = text.match(pattern)
|
||||||
|
if (!match)
|
||||||
|
return { success: false, error: `No number found in: "${text}"` }
|
||||||
|
|
||||||
|
const token = match[0]
|
||||||
|
const hasPercentSign = options.allowPercent && token.endsWith('%')
|
||||||
|
|
||||||
|
// Remove currency, commas, spaces, and percent sign
|
||||||
|
const normalizedToken = token.replace(NUMBER_CLEANUP_CHARS, '')
|
||||||
|
let parsedNumber = Number.parseFloat(normalizedToken)
|
||||||
|
|
||||||
|
if (Number.isNaN(parsedNumber))
|
||||||
|
return { success: false, error: `Failed to parse number: "${token}"` }
|
||||||
|
|
||||||
|
// Convert percentage to decimal if present
|
||||||
|
if (hasPercentSign)
|
||||||
|
parsedNumber = parsedNumber / PERCENTAGE_DIVISOR
|
||||||
|
|
||||||
|
// Round to specified decimal places if requested
|
||||||
|
if (options.decimalPlaces !== undefined) {
|
||||||
|
const factor = DECIMAL_BASE ** options.decimalPlaces
|
||||||
|
parsedNumber = Math.round(parsedNumber * factor) / factor
|
||||||
|
}
|
||||||
|
|
||||||
|
return { success: true, value: parsedNumber }
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize a boolean/yes-no answer
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Handles: "true", "false", "yes", "no", "y", "n", "1", "0" (case-insensitive)
|
||||||
|
*/
|
||||||
|
function normalizeBoolean(text: string): NormalizedResult {
|
||||||
|
const normalizedValue = text.trim().toLowerCase()
|
||||||
|
|
||||||
|
if (TRUE_VALUES.has(normalizedValue))
|
||||||
|
return { success: true, value: true }
|
||||||
|
|
||||||
|
if (FALSE_VALUES.has(normalizedValue))
|
||||||
|
return { success: true, value: false }
|
||||||
|
|
||||||
|
return { success: false, error: `Not a boolean: "${text}"` }
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize a date string to YYYY-MM-DD format
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Handles: ISO dates, "Nov 1, 2025", "2025-11-01", RFC 2822, etc.
|
||||||
|
*/
|
||||||
|
function normalizeDate(text: string): NormalizedResult {
|
||||||
|
const cleaned = stripWrappingQuotes(text)
|
||||||
|
|
||||||
|
// Try parsing as date
|
||||||
|
const parsedDate = new Date(cleaned)
|
||||||
|
if (Number.isNaN(parsedDate.getTime()))
|
||||||
|
return { success: false, error: `Invalid date: "${text}"` }
|
||||||
|
|
||||||
|
// Normalize to YYYY-MM-DD (UTC)
|
||||||
|
const year = parsedDate.getUTCFullYear()
|
||||||
|
const monthPadded = String(parsedDate.getUTCMonth() + MONTH_OFFSET).padStart(DATE_COMPONENT_WIDTH, DATE_PAD_CHAR)
|
||||||
|
const dayPadded = String(parsedDate.getUTCDate()).padStart(DATE_COMPONENT_WIDTH, DATE_PAD_CHAR)
|
||||||
|
const normalized = `${year}-${monthPadded}-${dayPadded}`
|
||||||
|
|
||||||
|
return { success: true, value: normalized }
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize a string (trim, optionally case-insensitive)
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Handles wrapping quotes and code fences.
|
||||||
|
*/
|
||||||
|
function normalizeString(text: string, options: Required<NormalizationOptions>): NormalizedResult {
|
||||||
|
let trimmedText = text.trim()
|
||||||
|
|
||||||
|
// Strip wrapping quotes
|
||||||
|
trimmedText = trimmedText.replace(WRAPPING_QUOTES_PATTERN, '')
|
||||||
|
|
||||||
|
// Strip code fences (```...```)
|
||||||
|
trimmedText = trimmedText.replace(CODE_FENCE_PATTERN, (match) => {
|
||||||
|
const inner = match.slice(3, -3).trim()
|
||||||
|
// Remove language identifier if present (e.g., ```json)
|
||||||
|
return inner.replace(LANGUAGE_IDENTIFIER_PATTERN, '')
|
||||||
|
})
|
||||||
|
|
||||||
|
trimmedText = trimmedText.trim()
|
||||||
|
|
||||||
|
const value = options.caseSensitive ? trimmedText : trimmedText.toLowerCase()
|
||||||
|
return { success: true, value }
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize a comma-separated list (ordered)
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Handles: "a,b,c", "a, b, c", " a , b , c "
|
||||||
|
*/
|
||||||
|
function normalizeCsvListOrdered(text: string, options: Required<NormalizationOptions>): NormalizedResult {
|
||||||
|
const strippedText = stripWrappingQuotes(text)
|
||||||
|
const items = strippedText
|
||||||
|
.split(CSV_DELIMITER)
|
||||||
|
.map(item => item.trim())
|
||||||
|
.filter(item => item.length > 0)
|
||||||
|
|
||||||
|
const normalizedItems = items.map(item =>
|
||||||
|
options.caseSensitive ? item : item.toLowerCase(),
|
||||||
|
)
|
||||||
|
|
||||||
|
return { success: true, value: normalizedItems }
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize a comma-separated list (unordered, compare as sets)
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Handles: "c,a,b" equals "a,b,c"
|
||||||
|
*/
|
||||||
|
function normalizeCsvListUnordered(text: string, options: Required<NormalizationOptions>): NormalizedResult {
|
||||||
|
const result = normalizeCsvListOrdered(text, options)
|
||||||
|
if (!result.success)
|
||||||
|
return result
|
||||||
|
|
||||||
|
// Type guard: ensure result.value is an array
|
||||||
|
if (!Array.isArray(result.value))
|
||||||
|
return { success: false, error: 'Expected array result from normalizeCsvListOrdered' }
|
||||||
|
|
||||||
|
// Sort for deterministic comparison
|
||||||
|
const sorted = [...result.value].sort()
|
||||||
|
return { success: true, value: sorted }
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize a value based on its expected kind
|
||||||
|
*/
|
||||||
|
export function normalizeAnswer(
|
||||||
|
text: string,
|
||||||
|
kind: AnswerType,
|
||||||
|
options: Partial<NormalizationOptions> = {},
|
||||||
|
): NormalizedResult {
|
||||||
|
const resolvedOptions: Required<NormalizationOptions> = { ...DEFAULT_OPTIONS, ...options }
|
||||||
|
|
||||||
|
switch (kind) {
|
||||||
|
case 'integer':
|
||||||
|
return normalizeInteger(text, resolvedOptions)
|
||||||
|
case 'number':
|
||||||
|
return normalizeNumber(text, resolvedOptions)
|
||||||
|
case 'boolean':
|
||||||
|
return normalizeBoolean(text)
|
||||||
|
case 'date':
|
||||||
|
return normalizeDate(text)
|
||||||
|
case 'string':
|
||||||
|
return normalizeString(text, resolvedOptions)
|
||||||
|
case 'csv-list-ordered':
|
||||||
|
return normalizeCsvListOrdered(text, resolvedOptions)
|
||||||
|
case 'csv-list-unordered':
|
||||||
|
return normalizeCsvListUnordered(text, resolvedOptions)
|
||||||
|
default:
|
||||||
|
return { success: false, error: `Unknown answer kind: ${kind}` }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compare two normalized values based on answer kind
|
||||||
|
*/
|
||||||
|
function compareValues(
|
||||||
|
actual: unknown,
|
||||||
|
expected: unknown,
|
||||||
|
kind: AnswerType,
|
||||||
|
options: Required<NormalizationOptions>,
|
||||||
|
): boolean {
|
||||||
|
switch (kind) {
|
||||||
|
case 'integer':
|
||||||
|
case 'boolean':
|
||||||
|
case 'date':
|
||||||
|
case 'string':
|
||||||
|
return actual === expected
|
||||||
|
|
||||||
|
case 'number':
|
||||||
|
if (typeof actual !== 'number' || typeof expected !== 'number')
|
||||||
|
return false
|
||||||
|
|
||||||
|
if (options.decimalPlaces !== undefined) {
|
||||||
|
// Already rounded during normalization
|
||||||
|
return actual === expected
|
||||||
|
}
|
||||||
|
return Math.abs(actual - expected) <= options.tolerance
|
||||||
|
|
||||||
|
case 'csv-list-ordered':
|
||||||
|
if (!Array.isArray(actual) || !Array.isArray(expected))
|
||||||
|
return false
|
||||||
|
if (actual.length !== expected.length)
|
||||||
|
return false
|
||||||
|
return actual.every((item, i) => item === expected[i])
|
||||||
|
|
||||||
|
case 'csv-list-unordered':
|
||||||
|
if (!Array.isArray(actual) || !Array.isArray(expected))
|
||||||
|
return false
|
||||||
|
if (actual.length !== expected.length)
|
||||||
|
return false
|
||||||
|
// Already sorted during normalization
|
||||||
|
return actual.every((item, i) => item === expected[i])
|
||||||
|
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compare actual and expected answers with deterministic, type-aware normalization
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Returns true if answers match within the specified tolerance/rules.
|
||||||
|
*/
|
||||||
|
export function compareAnswers(
|
||||||
|
actual: string,
|
||||||
|
expected: string,
|
||||||
|
kind: AnswerType,
|
||||||
|
options: Partial<NormalizationOptions> = {},
|
||||||
|
): { match: boolean, details?: string } {
|
||||||
|
const resolvedOptions: Required<NormalizationOptions> = { ...DEFAULT_OPTIONS, ...options }
|
||||||
|
|
||||||
|
// Normalize both answers
|
||||||
|
const actualResult = normalizeAnswer(actual, kind, resolvedOptions)
|
||||||
|
const expectedResult = normalizeAnswer(expected, kind, resolvedOptions)
|
||||||
|
|
||||||
|
// If either normalization failed, return false with details
|
||||||
|
if (!actualResult.success) {
|
||||||
|
return {
|
||||||
|
match: false,
|
||||||
|
details: `Failed to normalize actual answer: ${actualResult.error}`,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!expectedResult.success) {
|
||||||
|
return {
|
||||||
|
match: false,
|
||||||
|
details: `Failed to normalize expected answer: ${expectedResult.error}`,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compare normalized values
|
||||||
|
const match = compareValues(actualResult.value, expectedResult.value, kind, resolvedOptions)
|
||||||
|
|
||||||
|
return {
|
||||||
|
match,
|
||||||
|
details: match
|
||||||
|
? undefined
|
||||||
|
: `Mismatch: actual="${actualResult.value}" vs expected="${expectedResult.value}"`,
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -17,6 +17,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(metric.views))
|
.groundTruth(String(metric.views))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
(metric, getId) => new QuestionBuilder()
|
(metric, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -24,6 +25,8 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(metric.revenue))
|
.groundTruth(String(metric.revenue))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('number')
|
||||||
|
.normalize({ decimalPlaces: 2 })
|
||||||
.build(),
|
.build(),
|
||||||
(metric, getId) => new QuestionBuilder()
|
(metric, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -31,6 +34,8 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(metric.bounceRate))
|
.groundTruth(String(metric.bounceRate))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('number')
|
||||||
|
.normalize({ decimalPlaces: 2 })
|
||||||
.build(),
|
.build(),
|
||||||
(metric, getId) => new QuestionBuilder()
|
(metric, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -38,6 +43,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(metric.conversions))
|
.groundTruth(String(metric.conversions))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -63,6 +69,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(totalDays))
|
.groundTruth(String(totalDays))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -70,6 +77,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(totalViews))
|
.groundTruth(String(totalViews))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -77,6 +85,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(totalConversions))
|
.groundTruth(String(totalConversions))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -84,6 +93,8 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(totalRevenue.toFixed(2)))
|
.groundTruth(String(totalRevenue.toFixed(2)))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('number')
|
||||||
|
.normalize({ decimalPlaces: 2 })
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -91,6 +102,8 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(avgBounceRate.toFixed(2)))
|
.groundTruth(String(avgBounceRate.toFixed(2)))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('number')
|
||||||
|
.normalize({ decimalPlaces: 2 })
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -104,6 +117,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -117,6 +131,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -133,6 +148,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -149,6 +165,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -165,6 +182,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -181,6 +199,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(log.level)
|
.groundTruth(log.level)
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
(log, getId) => new QuestionBuilder()
|
(log, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -24,6 +25,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(log.endpoint)
|
.groundTruth(log.endpoint)
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
(log, getId) => new QuestionBuilder()
|
(log, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -31,6 +33,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(log.statusCode))
|
.groundTruth(String(log.statusCode))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
(log, getId) => new QuestionBuilder()
|
(log, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -38,6 +41,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(log.responseTime))
|
.groundTruth(String(log.responseTime))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -60,6 +64,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(totalLogs))
|
.groundTruth(String(totalLogs))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -67,6 +72,8 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(avgResponseTime.toFixed(2)))
|
.groundTruth(String(avgResponseTime.toFixed(2)))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('number')
|
||||||
|
.normalize({ decimalPlaces: 2 })
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -81,6 +88,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -96,6 +104,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -111,6 +120,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(errorCount))
|
.groundTruth(String(errorCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -118,6 +128,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(successCount))
|
.groundTruth(String(successCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -130,6 +141,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(retryableErrorCount))
|
.groundTruth(String(retryableErrorCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -147,6 +159,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -161,6 +174,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -175,6 +189,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(repo.stars))
|
.groundTruth(String(repo.stars))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
(repo, getId) => new QuestionBuilder()
|
(repo, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -24,6 +25,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(repo.forks))
|
.groundTruth(String(repo.forks))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
(repo, getId) => new QuestionBuilder()
|
(repo, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -31,6 +33,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(repo.watchers))
|
.groundTruth(String(repo.watchers))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
(repo, getId) => new QuestionBuilder()
|
(repo, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -38,6 +41,8 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(repo.defaultBranch)
|
.groundTruth(repo.defaultBranch)
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('string')
|
||||||
|
.normalize({ caseSensitive: true })
|
||||||
.build(),
|
.build(),
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -62,6 +67,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(totalRepos))
|
.groundTruth(String(totalRepos))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -69,6 +75,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(totalStars))
|
.groundTruth(String(totalStars))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -76,6 +83,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(totalForks))
|
.groundTruth(String(totalForks))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -83,6 +91,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(Math.round(avgStars)))
|
.groundTruth(String(Math.round(avgStars)))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -97,6 +106,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -111,6 +121,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -125,6 +136,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -139,6 +151,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -155,6 +168,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -171,6 +185,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import { generateEventLogsQuestions } from './event-logs'
|
|||||||
import { generateGithubQuestions } from './github'
|
import { generateGithubQuestions } from './github'
|
||||||
import { generateNestedQuestions } from './nested'
|
import { generateNestedQuestions } from './nested'
|
||||||
import { generateNestedConfigQuestions } from './nested-config'
|
import { generateNestedConfigQuestions } from './nested-config'
|
||||||
|
import { generateStructuralValidationQuestions } from './structural-validation'
|
||||||
import { generateStructureQuestions } from './structure'
|
import { generateStructureQuestions } from './structure'
|
||||||
import { generateTabularQuestions } from './tabular'
|
import { generateTabularQuestions } from './tabular'
|
||||||
import { createIdGenerator } from './utils'
|
import { createIdGenerator } from './utils'
|
||||||
@@ -47,5 +48,8 @@ export function generateQuestions(): Question[] {
|
|||||||
// Generate structure-awareness questions (tests format-native affordances)
|
// Generate structure-awareness questions (tests format-native affordances)
|
||||||
questions.push(...generateStructureQuestions(tabular, nested, analytics, github, eventLogs, getId))
|
questions.push(...generateStructureQuestions(tabular, nested, analytics, github, eventLogs, getId))
|
||||||
|
|
||||||
|
// Generate structural-validation questions (tests ability to detect corrupted data)
|
||||||
|
questions.push(...generateStructuralValidationQuestions(getId))
|
||||||
|
|
||||||
return questions
|
return questions
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,42 +17,52 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
{
|
{
|
||||||
prompt: 'What is the environment in the configuration?',
|
prompt: 'What is the environment in the configuration?',
|
||||||
groundTruth: config.environment,
|
groundTruth: config.environment,
|
||||||
|
answerType: 'string' as const,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
prompt: 'What is the database host?',
|
prompt: 'What is the database host?',
|
||||||
groundTruth: config.database.host,
|
groundTruth: config.database.host,
|
||||||
|
answerType: 'string' as const,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
prompt: 'What is the database port?',
|
prompt: 'What is the database port?',
|
||||||
groundTruth: String(config.database.port),
|
groundTruth: String(config.database.port),
|
||||||
|
answerType: 'integer' as const,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
prompt: 'What is the maximum connection pool size?',
|
prompt: 'What is the maximum connection pool size?',
|
||||||
groundTruth: String(config.database.pool.max),
|
groundTruth: String(config.database.pool.max),
|
||||||
|
answerType: 'integer' as const,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
prompt: 'What is the session duration?',
|
prompt: 'What is the session duration?',
|
||||||
groundTruth: String(config.authentication.session.duration),
|
groundTruth: String(config.authentication.session.duration),
|
||||||
|
answerType: 'integer' as const,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
prompt: 'What is the minimum connection pool size?',
|
prompt: 'What is the minimum connection pool size?',
|
||||||
groundTruth: String(config.database.pool.min),
|
groundTruth: String(config.database.pool.min),
|
||||||
|
answerType: 'integer' as const,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
prompt: 'What is the connection pool idle timeout?',
|
prompt: 'What is the connection pool idle timeout?',
|
||||||
groundTruth: String(config.database.pool.idleTimeout),
|
groundTruth: String(config.database.pool.idleTimeout),
|
||||||
|
answerType: 'integer' as const,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
prompt: 'What is the database name?',
|
prompt: 'What is the database name?',
|
||||||
groundTruth: config.database.name,
|
groundTruth: config.database.name,
|
||||||
|
answerType: 'string' as const,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
prompt: 'What is the session refresh threshold?',
|
prompt: 'What is the session refresh threshold?',
|
||||||
groundTruth: String(config.authentication.session.refreshThreshold),
|
groundTruth: String(config.authentication.session.refreshThreshold),
|
||||||
|
answerType: 'integer' as const,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
prompt: 'What is the version in the configuration?',
|
prompt: 'What is the version in the configuration?',
|
||||||
groundTruth: config.version,
|
groundTruth: config.version,
|
||||||
|
answerType: 'string' as const,
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -64,6 +74,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(q.groundTruth)
|
.groundTruth(q.groundTruth)
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType(q.answerType)
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -82,6 +93,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(roleCount))
|
.groundTruth(String(roleCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -89,6 +101,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(groupCount))
|
.groundTruth(String(groupCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -96,6 +109,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(providerCount))
|
.groundTruth(String(providerCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -103,6 +117,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(featureCount))
|
.groundTruth(String(featureCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -110,6 +125,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(replicaCount))
|
.groundTruth(String(replicaCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -122,6 +138,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(adminScopeProviderCount))
|
.groundTruth(String(adminScopeProviderCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -134,6 +151,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(enabledFeatures))
|
.groundTruth(String(enabledFeatures))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -146,6 +164,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(adminPermissions))
|
.groundTruth(String(adminPermissions))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -164,6 +183,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(totalPermissions))
|
.groundTruth(String(totalPermissions))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -171,6 +191,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(distinctPermissions))
|
.groundTruth(String(distinctPermissions))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -178,6 +199,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(totalVariants))
|
.groundTruth(String(totalVariants))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -185,6 +207,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(highPriorityReplicas))
|
.groundTruth(String(highPriorityReplicas))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -192,6 +215,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(featuresWithHighRollout))
|
.groundTruth(String(featuresWithHighRollout))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -199,6 +223,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(groupsWithMultipleRoles))
|
.groundTruth(String(groupsWithMultipleRoles))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -249,6 +274,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(q.groundTruth)
|
.groundTruth(q.groundTruth)
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,6 +17,8 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(order.total))
|
.groundTruth(String(order.total))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('number')
|
||||||
|
.normalize({ decimalPlaces: 2 })
|
||||||
.build(),
|
.build(),
|
||||||
(order, getId) => new QuestionBuilder()
|
(order, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -24,6 +26,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(order.status)
|
.groundTruth(order.status)
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -43,6 +46,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(order.customer.name)
|
.groundTruth(order.customer.name)
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
(order, getId) => new QuestionBuilder()
|
(order, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -50,6 +54,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(order.customer.email)
|
.groundTruth(order.customer.email)
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
(order, getId) => new QuestionBuilder()
|
(order, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -57,6 +62,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(order.orderDate || '')
|
.groundTruth(order.orderDate || '')
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
(order, getId) => new QuestionBuilder()
|
(order, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -64,6 +70,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(order.items.length))
|
.groundTruth(String(order.items.length))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -94,6 +101,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -105,6 +113,8 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(totalRevenue.toFixed(2)))
|
.groundTruth(String(totalRevenue.toFixed(2)))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('number')
|
||||||
|
.normalize({ decimalPlaces: 2 })
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -112,6 +122,8 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(avgOrderValue.toFixed(2)))
|
.groundTruth(String(avgOrderValue.toFixed(2)))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('number')
|
||||||
|
.normalize({ decimalPlaces: 2 })
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -119,6 +131,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(totalOrders))
|
.groundTruth(String(totalOrders))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -126,6 +139,8 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(maxOrderValue.toFixed(2)))
|
.groundTruth(String(maxOrderValue.toFixed(2)))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('number')
|
||||||
|
.normalize({ decimalPlaces: 2 })
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -139,6 +154,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -156,6 +172,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -172,6 +189,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -188,6 +206,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|||||||
44
benchmarks/src/questions/structural-validation.ts
Normal file
44
benchmarks/src/questions/structural-validation.ts
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
import type { Question } from '../types'
|
||||||
|
import { QuestionBuilder } from './utils'
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate structural validation questions for all incompleteness fixtures
|
||||||
|
*
|
||||||
|
* These questions test the ability to detect incomplete, truncated, or corrupted data
|
||||||
|
* by validating structural metadata (TOON's [N] length declarations and {fields} headers).
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* - TOON's advantage: Explicit [N] and {fields} enable validation
|
||||||
|
* - CSV disadvantage: No structural metadata to validate against
|
||||||
|
* - JSON/YAML disadvantage: Require manual counting and schema inference
|
||||||
|
*/
|
||||||
|
export function generateStructuralValidationQuestions(
|
||||||
|
getId: () => string,
|
||||||
|
): Question[] {
|
||||||
|
const questions: Question[] = []
|
||||||
|
|
||||||
|
// Dataset names and their expected validity
|
||||||
|
const validationFixtures = [
|
||||||
|
{ dataset: 'structural-validation-control', isValid: true, description: 'Valid complete dataset (control)' },
|
||||||
|
{ dataset: 'structural-validation-truncated', isValid: false, description: 'Array truncated: 3 rows removed from end' },
|
||||||
|
{ dataset: 'structural-validation-extra-rows', isValid: false, description: 'Extra rows added beyond declared length' },
|
||||||
|
{ dataset: 'structural-validation-width-mismatch', isValid: false, description: 'Inconsistent field count (missing salary in row 10)' },
|
||||||
|
{ dataset: 'structural-validation-missing-fields', isValid: false, description: 'Missing required fields (no email in multiple rows)' },
|
||||||
|
] as const
|
||||||
|
|
||||||
|
// Generate one validation question per fixture
|
||||||
|
for (const fixture of validationFixtures) {
|
||||||
|
questions.push(
|
||||||
|
new QuestionBuilder()
|
||||||
|
.id(getId())
|
||||||
|
.prompt('Is this data complete and valid? Answer only YES or NO.')
|
||||||
|
.groundTruth(fixture.isValid ? 'YES' : 'NO')
|
||||||
|
.type('structural-validation')
|
||||||
|
.dataset(fixture.dataset)
|
||||||
|
.answerType('boolean')
|
||||||
|
.build(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
return questions
|
||||||
|
}
|
||||||
@@ -30,6 +30,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(String(employees.length))
|
.groundTruth(String(employees.length))
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -42,6 +43,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(employeeFields)
|
.groundTruth(employeeFields)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('csv-list-ordered')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -53,6 +55,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth('email')
|
.groundTruth('email')
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -65,6 +68,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(lastEmployee.department)
|
.groundTruth(lastEmployee.department)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -76,6 +80,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(lastEmployee.name)
|
.groundTruth(lastEmployee.name)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -87,6 +92,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth('7')
|
.groundTruth('7')
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -100,6 +106,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(String(orders.length))
|
.groundTruth(String(orders.length))
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -112,6 +119,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(orderFields)
|
.groundTruth(orderFields)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('csv-list-ordered')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -126,6 +134,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(String(orderWithManyItems.items.length))
|
.groundTruth(String(orderWithManyItems.items.length))
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -138,6 +147,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(itemFields)
|
.groundTruth(itemFields)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('csv-list-ordered')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -150,6 +160,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(lastOrder.status)
|
.groundTruth(lastOrder.status)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -162,6 +173,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(customerFields)
|
.groundTruth(customerFields)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('csv-list-ordered')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -175,6 +187,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(String(metrics.length))
|
.groundTruth(String(metrics.length))
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -187,6 +200,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(metricFields)
|
.groundTruth(metricFields)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('csv-list-ordered')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -198,6 +212,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth('revenue')
|
.groundTruth('revenue')
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -210,6 +225,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(lastMetric.date)
|
.groundTruth(lastMetric.date)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -221,6 +237,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth('6')
|
.groundTruth('6')
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -234,6 +251,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(String(repos.length))
|
.groundTruth(String(repos.length))
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -246,6 +264,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(repoFields)
|
.groundTruth(repoFields)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('csv-list-ordered')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -257,6 +276,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth('forks')
|
.groundTruth('forks')
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -269,6 +289,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(lastRepo.name)
|
.groundTruth(lastRepo.name)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -280,6 +301,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth('11')
|
.groundTruth('11')
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -293,6 +315,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(String(logs.length))
|
.groundTruth(String(logs.length))
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -305,6 +328,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(logFields)
|
.groundTruth(logFields)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('csv-list-unordered')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -317,6 +341,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(lastLog.level)
|
.groundTruth(lastLog.level)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(emp.salary))
|
.groundTruth(String(emp.salary))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
(emp, getId) => new QuestionBuilder()
|
(emp, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -24,6 +25,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(emp.department)
|
.groundTruth(emp.department)
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
(emp, getId) => new QuestionBuilder()
|
(emp, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -31,6 +33,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(emp.email)
|
.groundTruth(emp.email)
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
(emp, getId) => new QuestionBuilder()
|
(emp, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -38,6 +41,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(emp.yearsExperience))
|
.groundTruth(String(emp.yearsExperience))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
(emp, getId) => new QuestionBuilder()
|
(emp, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -45,6 +49,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(emp.active ? 'yes' : 'no')
|
.groundTruth(emp.active ? 'yes' : 'no')
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('boolean')
|
||||||
.build(),
|
.build(),
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -67,6 +72,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -81,6 +87,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -98,6 +105,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(totalEmployees))
|
.groundTruth(String(totalEmployees))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -105,6 +113,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(avgSalary))
|
.groundTruth(String(avgSalary))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -112,6 +121,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(activeCount))
|
.groundTruth(String(activeCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -119,6 +129,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(inactiveCount))
|
.groundTruth(String(inactiveCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -134,6 +145,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -148,6 +160,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -164,6 +177,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -178,6 +192,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import type { AnswerType, NormalizationOptions } from '../normalize'
|
||||||
import type { Question } from '../types'
|
import type { Question } from '../types'
|
||||||
|
|
||||||
// Constants for sampling strides
|
// Constants for sampling strides
|
||||||
@@ -52,10 +53,21 @@ export class QuestionBuilder {
|
|||||||
return this
|
return this
|
||||||
}
|
}
|
||||||
|
|
||||||
|
answerType(kind: AnswerType): this {
|
||||||
|
this.question.answerType = kind
|
||||||
|
return this
|
||||||
|
}
|
||||||
|
|
||||||
|
normalize(options: Partial<NormalizationOptions>): this {
|
||||||
|
this.question.normalizationOptions = options
|
||||||
|
return this
|
||||||
|
}
|
||||||
|
|
||||||
build(): Question {
|
build(): Question {
|
||||||
if (!this.question.id || !this.question.prompt || !this.question.groundTruth || !this.question.type || !this.question.dataset) {
|
if (!this.question.id || !this.question.prompt || !this.question.groundTruth || !this.question.type || !this.question.dataset) {
|
||||||
throw new Error('Incomplete question')
|
throw new Error('Incomplete question')
|
||||||
}
|
}
|
||||||
|
|
||||||
return this.question as Question
|
return this.question as Question
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -65,7 +77,7 @@ export class QuestionBuilder {
|
|||||||
*/
|
*/
|
||||||
export function rotateQuestions<T>(
|
export function rotateQuestions<T>(
|
||||||
items: T[],
|
items: T[],
|
||||||
generators: Array<(item: T, getId: () => string) => Question>,
|
generators: ((item: T, getId: () => string) => Question)[],
|
||||||
limit: number,
|
limit: number,
|
||||||
stride: number,
|
stride: number,
|
||||||
getId: () => string,
|
getId: () => string,
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import type { Dataset, EfficiencyRanking, EvaluationResult, FormatResult, Question } from './types'
|
import type { Dataset, EfficiencyRanking, EvaluationResult, FormatResult, Question } from './types'
|
||||||
import { FORMATTER_DISPLAY_NAMES, QUESTION_TYPE_LABELS, QUESTION_TYPES } from './constants'
|
import { FORMATTER_DISPLAY_NAMES, QUESTION_TYPE_LABELS, QUESTION_TYPES } from './constants'
|
||||||
import { ACCURACY_DATASETS } from './datasets'
|
import { ACCURACY_DATASETS } from './datasets'
|
||||||
import { models } from './evaluate'
|
import { models, PRIMERS } from './evaluate'
|
||||||
import { supportsCSV } from './formatters'
|
import { supportsCSV } from './formatters'
|
||||||
import { generateQuestions } from './questions'
|
import { generateQuestions } from './questions'
|
||||||
import { createProgressBar, tokenize } from './utils'
|
import { createProgressBar, tokenize } from './utils'
|
||||||
@@ -10,6 +10,9 @@ const EFFICIENCY_CHART_STYLE: 'vertical' | 'horizontal' = 'horizontal'
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Calculate token counts for all format+dataset combinations
|
* Calculate token counts for all format+dataset combinations
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Includes primer tokens for fairer comparison across formats
|
||||||
*/
|
*/
|
||||||
export function calculateTokenCounts(
|
export function calculateTokenCounts(
|
||||||
formatters: Record<string, (data: unknown) => string>,
|
formatters: Record<string, (data: unknown) => string>,
|
||||||
@@ -23,8 +26,11 @@ export function calculateTokenCounts(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
const formattedData = formatter(dataset.data)
|
const formattedData = formatter(dataset.data)
|
||||||
|
const primer = PRIMERS[formatName] ?? ''
|
||||||
|
// Include primer in token count for fair comparison
|
||||||
|
const fullPrompt = primer ? `${primer}\n\n${formattedData}` : formattedData
|
||||||
const key = `${formatName}-${dataset.name}`
|
const key = `${formatName}-${dataset.name}`
|
||||||
tokenCounts[key] = tokenize(formattedData)
|
tokenCounts[key] = tokenize(fullPrompt)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -137,9 +143,12 @@ function generateEfficiencyRankingReport(
|
|||||||
): string {
|
): string {
|
||||||
const toon = formatResults.find(r => r.format === 'toon')
|
const toon = formatResults.find(r => r.format === 'toon')
|
||||||
const json = formatResults.find(r => r.format === 'json-pretty')
|
const json = formatResults.find(r => r.format === 'json-pretty')
|
||||||
|
const csv = formatResults.find(r => r.format === 'csv')
|
||||||
|
|
||||||
// Build efficiency ranking (accuracy per 1k tokens)
|
// Build efficiency ranking (accuracy per 1k tokens)
|
||||||
const efficiencyRanking = formatResults
|
const efficiencyRanking = formatResults
|
||||||
|
// Exclude CSV since it only supports a subset of datasets (~half the questions)
|
||||||
|
.filter(fr => fr.format !== 'csv')
|
||||||
.map((fr) => {
|
.map((fr) => {
|
||||||
const efficiency = (fr.accuracy * 100) / (fr.totalTokens / 1000)
|
const efficiency = (fr.accuracy * 100) / (fr.totalTokens / 1000)
|
||||||
return {
|
return {
|
||||||
@@ -163,6 +172,12 @@ function generateEfficiencyRankingReport(
|
|||||||
summary = `TOON achieves ${toonVsJson} while using ${tokenSavings}.`
|
summary = `TOON achieves ${toonVsJson} while using ${tokenSavings}.`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add CSV note if available
|
||||||
|
let csvNote = ''
|
||||||
|
if (csv) {
|
||||||
|
csvNote = `\n\n**Note on CSV:** Excluded from ranking as it only supports ${csv.totalCount}/209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.`
|
||||||
|
}
|
||||||
|
|
||||||
return `
|
return `
|
||||||
Each format's overall performance, balancing accuracy against token cost:
|
Each format's overall performance, balancing accuracy against token cost:
|
||||||
|
|
||||||
@@ -170,7 +185,7 @@ Each format's overall performance, balancing accuracy against token cost:
|
|||||||
${efficiencyChart}
|
${efficiencyChart}
|
||||||
\`\`\`
|
\`\`\`
|
||||||
|
|
||||||
${summary}
|
${summary}${csvNote}
|
||||||
`.trim()
|
`.trim()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -210,11 +225,13 @@ function generateDetailedAccuracyReport(
|
|||||||
const aggregationCount = questions.filter(q => q.type === 'aggregation').length
|
const aggregationCount = questions.filter(q => q.type === 'aggregation').length
|
||||||
const filteringCount = questions.filter(q => q.type === 'filtering').length
|
const filteringCount = questions.filter(q => q.type === 'filtering').length
|
||||||
const structureAwarenessCount = questions.filter(q => q.type === 'structure-awareness').length
|
const structureAwarenessCount = questions.filter(q => q.type === 'structure-awareness').length
|
||||||
|
const structuralValidationCount = questions.filter(q => q.type === 'structural-validation').length
|
||||||
|
|
||||||
const fieldRetrievalPercent = ((fieldRetrievalCount / totalQuestions) * 100).toFixed(0)
|
const fieldRetrievalPercent = ((fieldRetrievalCount / totalQuestions) * 100).toFixed(0)
|
||||||
const aggregationPercent = ((aggregationCount / totalQuestions) * 100).toFixed(0)
|
const aggregationPercent = ((aggregationCount / totalQuestions) * 100).toFixed(0)
|
||||||
const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)
|
const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)
|
||||||
const structureAwarenessPercent = ((structureAwarenessCount / totalQuestions) * 100).toFixed(0)
|
const structureAwarenessPercent = ((structureAwarenessCount / totalQuestions) * 100).toFixed(0)
|
||||||
|
const structuralValidationPercent = ((structuralValidationCount / totalQuestions) * 100).toFixed(0)
|
||||||
|
|
||||||
// Calculate dataset sizes
|
// Calculate dataset sizes
|
||||||
const tabularSize = ACCURACY_DATASETS.find(d => d.name === 'tabular')?.data.employees?.length || 0
|
const tabularSize = ACCURACY_DATASETS.find(d => d.name === 'tabular')?.data.employees?.length || 0
|
||||||
@@ -263,8 +280,9 @@ This benchmark tests **LLM comprehension and data retrieval accuracy** across di
|
|||||||
|
|
||||||
#### Datasets Tested
|
#### Datasets Tested
|
||||||
|
|
||||||
Six datasets designed to test different structural patterns:
|
Eleven datasets designed to test different structural patterns and validation capabilities:
|
||||||
|
|
||||||
|
**Primary datasets:**
|
||||||
1. **Tabular** (${tabularSize} employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
|
1. **Tabular** (${tabularSize} employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
|
||||||
2. **Nested** (${nestedSize} e-commerce orders): Complex structures with nested customer objects and item arrays.
|
2. **Nested** (${nestedSize} e-commerce orders): Complex structures with nested customer objects and item arrays.
|
||||||
3. **Analytics** (${analyticsSize} days of metrics): Time-series data with dates and numeric values.
|
3. **Analytics** (${analyticsSize} days of metrics): Time-series data with dates and numeric values.
|
||||||
@@ -272,9 +290,16 @@ Six datasets designed to test different structural patterns:
|
|||||||
5. **Event Logs** (${eventLogsSize} logs): Semi-uniform data with ~50% flat logs and ~50% with nested error objects.
|
5. **Event Logs** (${eventLogsSize} logs): Semi-uniform data with ~50% flat logs and ~50% with nested error objects.
|
||||||
6. **Nested Config** (${nestedConfigSize} configuration): Deeply nested configuration with minimal tabular eligibility.
|
6. **Nested Config** (${nestedConfigSize} configuration): Deeply nested configuration with minimal tabular eligibility.
|
||||||
|
|
||||||
|
**Structural validation datasets:**
|
||||||
|
7. **Control**: Valid complete dataset (baseline for validation)
|
||||||
|
8. **Truncated**: Array with 3 rows removed from end (tests [N] length detection)
|
||||||
|
9. **Extra rows**: Array with 3 additional rows beyond declared length
|
||||||
|
10. **Width mismatch**: Inconsistent field count (missing salary in row 10)
|
||||||
|
11. **Missing fields**: Systematic field omissions (no email in multiple rows)
|
||||||
|
|
||||||
#### Question Types
|
#### Question Types
|
||||||
|
|
||||||
${totalQuestions} questions are generated dynamically across four categories:
|
${totalQuestions} questions are generated dynamically across five categories:
|
||||||
|
|
||||||
- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
||||||
- Example: "What is Alice's salary?" → \`75000\`
|
- Example: "What is Alice's salary?" → \`75000\`
|
||||||
@@ -295,11 +320,16 @@ ${totalQuestions} questions are generated dynamically across four categories:
|
|||||||
- Example: "List the field names for employees" → \`id, name, email, department, salary, yearsExperience, active\`
|
- Example: "List the field names for employees" → \`id, name, email, department, salary, yearsExperience, active\`
|
||||||
- Example: "What is the department of the last employee?" → \`Sales\`
|
- Example: "What is the department of the last employee?" → \`Sales\`
|
||||||
|
|
||||||
|
- **Structural validation (${structuralValidationPercent}%)**: Tests ability to detect incomplete, truncated, or corrupted data using structural metadata
|
||||||
|
- Example: "Is this data complete and valid?" → \`YES\` (control dataset) or \`NO\` (corrupted datasets)
|
||||||
|
- Tests TOON's [N] length validation and {fields} consistency checking
|
||||||
|
- Demonstrates CSV's lack of structural validation capabilities
|
||||||
|
|
||||||
#### Evaluation Process
|
#### Evaluation Process
|
||||||
|
|
||||||
1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => FORMATTER_DISPLAY_NAMES[f.format] || f.format).join(', ')}).
|
1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => FORMATTER_DISPLAY_NAMES[f.format] || f.format).join(', ')}).
|
||||||
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
||||||
3. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).
|
3. **Validate deterministically**: Answers are validated using type-aware comparison (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`) without requiring an LLM judge.
|
||||||
|
|
||||||
#### Models & Configuration
|
#### Models & Configuration
|
||||||
|
|
||||||
@@ -376,9 +406,12 @@ function generateDatasetBreakdown(
|
|||||||
questions: Question[],
|
questions: Question[],
|
||||||
tokenCounts: Record<string, number>,
|
tokenCounts: Record<string, number>,
|
||||||
): string {
|
): string {
|
||||||
|
// Build question ID to dataset mapping for O(1) lookups
|
||||||
|
const questionDatasetMap = new Map(questions.map(q => [q.id, q.dataset]))
|
||||||
|
|
||||||
return ACCURACY_DATASETS.map((dataset) => {
|
return ACCURACY_DATASETS.map((dataset) => {
|
||||||
const datasetResults = formatResults.map((fr) => {
|
const datasetResults = formatResults.map((fr) => {
|
||||||
const datasetFormatResults = results.filter(r => r.questionId.includes(dataset.name) || questions.find(q => q.id === r.questionId)?.dataset === dataset.name)
|
const datasetFormatResults = results.filter(r => questionDatasetMap.get(r.questionId) === dataset.name)
|
||||||
if (datasetFormatResults.length === 0)
|
if (datasetFormatResults.length === 0)
|
||||||
return undefined
|
return undefined
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import type { DATASET_NAMES, QUESTION_TYPES, STRUCTURE_CLASSES } from './constants'
|
import type { DATASET_NAMES, QUESTION_TYPES, STRUCTURE_CLASSES } from './constants'
|
||||||
|
import type { AnswerType, NormalizationOptions } from './normalize'
|
||||||
|
|
||||||
export type QuestionType = typeof QUESTION_TYPES[number]
|
export type QuestionType = typeof QUESTION_TYPES[number]
|
||||||
export type DatasetName = typeof DATASET_NAMES[number]
|
export type DatasetName = typeof DATASET_NAMES[number]
|
||||||
@@ -23,6 +24,15 @@ export interface Question {
|
|||||||
groundTruth: string
|
groundTruth: string
|
||||||
type: QuestionType
|
type: QuestionType
|
||||||
dataset: DatasetName
|
dataset: DatasetName
|
||||||
|
/**
|
||||||
|
* Expected answer kind for deterministic comparison.
|
||||||
|
* @default 'string'
|
||||||
|
*/
|
||||||
|
answerType?: AnswerType
|
||||||
|
/**
|
||||||
|
* Options for answer normalization and comparison.
|
||||||
|
*/
|
||||||
|
normalizationOptions?: Partial<NormalizationOptions>
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface EvaluationResult {
|
export interface EvaluationResult {
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
"type": "module",
|
"type": "module",
|
||||||
"version": "0.8.0",
|
"version": "0.8.0",
|
||||||
"packageManager": "pnpm@10.20.0",
|
"packageManager": "pnpm@10.20.0",
|
||||||
"description": "Token-Oriented Object Notation (TOON) – a token-efficient JSON alternative for LLM prompts",
|
"description": "Token-Oriented Object Notation (TOON) – A compact, deterministic JSON format for LLM prompts",
|
||||||
"author": "Johann Schopplich <hello@johannschopplich.com>",
|
"author": "Johann Schopplich <hello@johannschopplich.com>",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"homepage": "https://toonformat.dev",
|
"homepage": "https://toonformat.dev",
|
||||||
|
|||||||
Reference in New Issue
Block a user