mirror of
https://github.com/voson-wang/toon.git
synced 2026-01-29 15:24:10 +08:00
chore(benchmarks): replace LLM-as-judge, new structural validation
This commit is contained in:
414
README.md
414
README.md
@@ -75,7 +75,7 @@ See [benchmarks](#benchmarks) for concrete comparisons across different data str
|
|||||||
|
|
||||||
## Key Features
|
## Key Features
|
||||||
|
|
||||||
- 💸 **Token-efficient:** typically 30–60% fewer tokens than JSON[^1]
|
- 💸 **Token-efficient:** typically 30–60% fewer tokens on large uniform arrays vs formatted JSON[^1]
|
||||||
- 🤿 **LLM-friendly guardrails:** explicit lengths and fields enable validation
|
- 🤿 **LLM-friendly guardrails:** explicit lengths and fields enable validation
|
||||||
- 🍱 **Minimal syntax:** removes redundant punctuation (braces, brackets, most quotes)
|
- 🍱 **Minimal syntax:** removes redundant punctuation (braces, brackets, most quotes)
|
||||||
- 📐 **Indentation-based structure:** like YAML, uses whitespace instead of braces
|
- 📐 **Indentation-based structure:** like YAML, uses whitespace instead of braces
|
||||||
@@ -108,19 +108,19 @@ Datasets with nested or semi-uniform structures. CSV excluded as it cannot prope
|
|||||||
```
|
```
|
||||||
🛒 E-commerce orders with nested structures ┊ Tabular: 33%
|
🛒 E-commerce orders with nested structures ┊ Tabular: 33%
|
||||||
│
|
│
|
||||||
TOON █████████████░░░░░░░ 72,743 tokens
|
TOON █████████████░░░░░░░ 72,771 tokens
|
||||||
├─ vs JSON (−33.1%) 108,731 tokens
|
├─ vs JSON (−33.1%) 108,806 tokens
|
||||||
├─ vs JSON compact (+5.5%) 68,936 tokens
|
├─ vs JSON compact (+5.5%) 68,975 tokens
|
||||||
├─ vs YAML (−14.1%) 84,724 tokens
|
├─ vs YAML (−14.2%) 84,780 tokens
|
||||||
└─ vs XML (−40.5%) 122,313 tokens
|
└─ vs XML (−40.5%) 122,406 tokens
|
||||||
|
|
||||||
🧾 Semi-uniform event logs ┊ Tabular: 50%
|
🧾 Semi-uniform event logs ┊ Tabular: 50%
|
||||||
│
|
│
|
||||||
TOON █████████████████░░░ 153,223 tokens
|
TOON █████████████████░░░ 153,211 tokens
|
||||||
├─ vs JSON (−15.0%) 180,196 tokens
|
├─ vs JSON (−15.0%) 180,176 tokens
|
||||||
├─ vs JSON compact (+19.9%) 127,740 tokens
|
├─ vs JSON compact (+19.9%) 127,731 tokens
|
||||||
├─ vs YAML (−0.8%) 154,514 tokens
|
├─ vs YAML (−0.8%) 154,505 tokens
|
||||||
└─ vs XML (−25.2%) 204,800 tokens
|
└─ vs XML (−25.2%) 204,777 tokens
|
||||||
|
|
||||||
🧩 Deeply nested configuration ┊ Tabular: 0%
|
🧩 Deeply nested configuration ┊ Tabular: 0%
|
||||||
│
|
│
|
||||||
@@ -131,11 +131,11 @@ Datasets with nested or semi-uniform structures. CSV excluded as it cannot prope
|
|||||||
└─ vs XML (−37.4%) 1,008 tokens
|
└─ vs XML (−37.4%) 1,008 tokens
|
||||||
|
|
||||||
──────────────────────────────────── Total ────────────────────────────────────
|
──────────────────────────────────── Total ────────────────────────────────────
|
||||||
TOON ████████████████░░░░ 226,597 tokens
|
TOON ████████████████░░░░ 226,613 tokens
|
||||||
├─ vs JSON (−21.8%) 289,846 tokens
|
├─ vs JSON (−21.8%) 289,901 tokens
|
||||||
├─ vs JSON compact (+14.9%) 197,240 tokens
|
├─ vs JSON compact (+14.9%) 197,270 tokens
|
||||||
├─ vs YAML (−5.5%) 239,911 tokens
|
├─ vs YAML (−5.6%) 239,958 tokens
|
||||||
└─ vs XML (−30.9%) 328,121 tokens
|
└─ vs XML (−31.0%) 328,191 tokens
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Flat-Only Track
|
#### Flat-Only Track
|
||||||
@@ -145,21 +145,21 @@ Datasets with flat tabular structures where CSV is applicable.
|
|||||||
```
|
```
|
||||||
👥 Uniform employee records ┊ Tabular: 100%
|
👥 Uniform employee records ┊ Tabular: 100%
|
||||||
│
|
│
|
||||||
CSV ███████████████████░ 46,956 tokens
|
CSV ███████████████████░ 46,954 tokens
|
||||||
TOON ████████████████████ 49,827 tokens (+6.1% vs CSV)
|
TOON ████████████████████ 49,831 tokens (+6.1% vs CSV)
|
||||||
├─ vs JSON (−60.7%) 126,854 tokens
|
├─ vs JSON (−60.7%) 126,860 tokens
|
||||||
├─ vs JSON compact (−36.8%) 78,850 tokens
|
├─ vs JSON compact (−36.8%) 78,856 tokens
|
||||||
├─ vs YAML (−50.0%) 99,701 tokens
|
├─ vs YAML (−50.0%) 99,706 tokens
|
||||||
└─ vs XML (−66.0%) 146,440 tokens
|
└─ vs XML (−66.0%) 146,444 tokens
|
||||||
|
|
||||||
📈 Time-series analytics data ┊ Tabular: 100%
|
📈 Time-series analytics data ┊ Tabular: 100%
|
||||||
│
|
│
|
||||||
CSV ██████████████████░░ 8,396 tokens
|
CSV ██████████████████░░ 8,388 tokens
|
||||||
TOON ████████████████████ 9,128 tokens (+8.7% vs CSV)
|
TOON ████████████████████ 9,120 tokens (+8.7% vs CSV)
|
||||||
├─ vs JSON (−59.0%) 22,258 tokens
|
├─ vs JSON (−59.0%) 22,250 tokens
|
||||||
├─ vs JSON compact (−35.8%) 14,224 tokens
|
├─ vs JSON compact (−35.8%) 14,216 tokens
|
||||||
├─ vs YAML (−48.9%) 17,871 tokens
|
├─ vs YAML (−48.9%) 17,863 tokens
|
||||||
└─ vs XML (−65.7%) 26,629 tokens
|
└─ vs XML (−65.7%) 26,621 tokens
|
||||||
|
|
||||||
⭐ Top 100 GitHub repositories ┊ Tabular: 100%
|
⭐ Top 100 GitHub repositories ┊ Tabular: 100%
|
||||||
│
|
│
|
||||||
@@ -171,12 +171,12 @@ Datasets with flat tabular structures where CSV is applicable.
|
|||||||
└─ vs XML (−48.8%) 17,095 tokens
|
└─ vs XML (−48.8%) 17,095 tokens
|
||||||
|
|
||||||
──────────────────────────────────── Total ────────────────────────────────────
|
──────────────────────────────────── Total ────────────────────────────────────
|
||||||
CSV ███████████████████░ 63,865 tokens
|
CSV ███████████████████░ 63,855 tokens
|
||||||
TOON ████████████████████ 67,700 tokens (+6.0% vs CSV)
|
TOON ████████████████████ 67,696 tokens (+6.0% vs CSV)
|
||||||
├─ vs JSON (−58.8%) 164,257 tokens
|
├─ vs JSON (−58.8%) 164,255 tokens
|
||||||
├─ vs JSON compact (−35.2%) 104,529 tokens
|
├─ vs JSON compact (−35.2%) 104,527 tokens
|
||||||
├─ vs YAML (−48.2%) 130,701 tokens
|
├─ vs YAML (−48.2%) 130,698 tokens
|
||||||
└─ vs XML (−64.4%) 190,164 tokens
|
└─ vs XML (−64.4%) 190,160 tokens
|
||||||
```
|
```
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
@@ -186,64 +186,64 @@ Datasets with flat tabular structures where CSV is applicable.
|
|||||||
|
|
||||||
**Savings:** 13,130 tokens (59.0% reduction vs JSON)
|
**Savings:** 13,130 tokens (59.0% reduction vs JSON)
|
||||||
|
|
||||||
**JSON** (22,258 tokens):
|
**JSON** (22,250 tokens):
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"metrics": [
|
"metrics": [
|
||||||
{
|
{
|
||||||
"date": "2025-01-01",
|
"date": "2025-01-01",
|
||||||
"views": 7708,
|
"views": 5715,
|
||||||
"clicks": 595,
|
"clicks": 211,
|
||||||
"conversions": 69,
|
"conversions": 28,
|
||||||
"revenue": 15369.93,
|
"revenue": 7976.46,
|
||||||
"bounceRate": 0.35
|
"bounceRate": 0.47
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"date": "2025-01-02",
|
"date": "2025-01-02",
|
||||||
"views": 5894,
|
"views": 7103,
|
||||||
"clicks": 381,
|
"clicks": 393,
|
||||||
"conversions": 21,
|
"conversions": 28,
|
||||||
"revenue": 2112.12,
|
"revenue": 8360.53,
|
||||||
"bounceRate": 0.3
|
"bounceRate": 0.32
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"date": "2025-01-03",
|
"date": "2025-01-03",
|
||||||
"views": 6835,
|
"views": 7248,
|
||||||
"clicks": 422,
|
"clicks": 378,
|
||||||
"conversions": 35,
|
"conversions": 24,
|
||||||
"revenue": 4525.73,
|
"revenue": 3212.57,
|
||||||
"bounceRate": 0.5
|
"bounceRate": 0.5
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"date": "2025-01-04",
|
"date": "2025-01-04",
|
||||||
"views": 5325,
|
"views": 2927,
|
||||||
"clicks": 305,
|
"clicks": 77,
|
||||||
"conversions": 22,
|
"conversions": 11,
|
||||||
"revenue": 2445.3,
|
"revenue": 1211.69,
|
||||||
"bounceRate": 0.44
|
"bounceRate": 0.62
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"date": "2025-01-05",
|
"date": "2025-01-05",
|
||||||
"views": 2974,
|
"views": 3530,
|
||||||
"clicks": 61,
|
"clicks": 82,
|
||||||
"conversions": 6,
|
"conversions": 8,
|
||||||
"revenue": 956.57,
|
"revenue": 462.77,
|
||||||
"bounceRate": 0.47
|
"bounceRate": 0.56
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
**TOON** (9,128 tokens):
|
**TOON** (9,120 tokens):
|
||||||
|
|
||||||
```
|
```
|
||||||
metrics[5]{date,views,clicks,conversions,revenue,bounceRate}:
|
metrics[5]{date,views,clicks,conversions,revenue,bounceRate}:
|
||||||
2025-01-01,7708,595,69,15369.93,0.35
|
2025-01-01,5715,211,28,7976.46,0.47
|
||||||
2025-01-02,5894,381,21,2112.12,0.3
|
2025-01-02,7103,393,28,8360.53,0.32
|
||||||
2025-01-03,6835,422,35,4525.73,0.5
|
2025-01-03,7248,378,24,3212.57,0.5
|
||||||
2025-01-04,5325,305,22,2445.3,0.44
|
2025-01-04,2927,77,11,1211.69,0.62
|
||||||
2025-01-05,2974,61,6,956.57,0.47
|
2025-01-05,3530,82,8,462.77,0.56
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -317,7 +317,7 @@ repositories[3]{id,name,repo,description,createdAt,updatedAt,pushedAt,stars,watc
|
|||||||
|
|
||||||
<!-- automd:file src="./benchmarks/results/retrieval-accuracy.md" -->
|
<!-- automd:file src="./benchmarks/results/retrieval-accuracy.md" -->
|
||||||
|
|
||||||
Benchmarks test LLM comprehension across different input formats using 204 data retrieval questions on 4 models.
|
Benchmarks test LLM comprehension across different input formats using 209 data retrieval questions on 4 models.
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><strong>Show Dataset Catalog</strong></summary>
|
<summary><strong>Show Dataset Catalog</strong></summary>
|
||||||
@@ -332,6 +332,11 @@ Benchmarks test LLM comprehension across different input formats using 204 data
|
|||||||
| Top 100 GitHub repositories | 100 | uniform | ✓ | 100% |
|
| Top 100 GitHub repositories | 100 | uniform | ✓ | 100% |
|
||||||
| Semi-uniform event logs | 75 | semi-uniform | ✗ | 50% |
|
| Semi-uniform event logs | 75 | semi-uniform | ✗ | 50% |
|
||||||
| Deeply nested configuration | 11 | deep | ✗ | 0% |
|
| Deeply nested configuration | 11 | deep | ✗ | 0% |
|
||||||
|
| Valid complete dataset (control) | 20 | uniform | ✓ | 100% |
|
||||||
|
| Array truncated: 3 rows removed from end | 17 | uniform | ✓ | 100% |
|
||||||
|
| Extra rows added beyond declared length | 23 | uniform | ✓ | 100% |
|
||||||
|
| Inconsistent field count (missing salary in row 10) | 20 | uniform | ✓ | 100% |
|
||||||
|
| Missing required fields (no email in multiple rows) | 20 | uniform | ✓ | 100% |
|
||||||
|
|
||||||
**Structure classes:**
|
**Structure classes:**
|
||||||
- **uniform**: All objects have identical fields with primitive values
|
- **uniform**: All objects have identical fields with primitive values
|
||||||
@@ -350,67 +355,69 @@ Benchmarks test LLM comprehension across different input formats using 204 data
|
|||||||
Each format's overall performance, balancing accuracy against token cost:
|
Each format's overall performance, balancing accuracy against token cost:
|
||||||
|
|
||||||
```
|
```
|
||||||
TOON ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ 17.2 │ 75.5% acc │ 4,389 tokens
|
TOON ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ 26.9 │ 73.9% acc │ 2,744 tokens
|
||||||
CSV ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓░ 16.6 │ 67.8% acc │ 4,080 tokens
|
JSON compact ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓░░░ 22.9 │ 70.7% acc │ 3,081 tokens
|
||||||
JSON compact ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓░░░ 14.7 │ 73.3% acc │ 4,982 tokens
|
YAML ▓▓▓▓▓▓▓▓▓▓▓▓▓▓░░░░░░ 18.6 │ 69.0% acc │ 3,719 tokens
|
||||||
YAML ▓▓▓▓▓▓▓▓▓▓▓▓▓▓░░░░░░ 12.1 │ 72.4% acc │ 5,976 tokens
|
JSON ▓▓▓▓▓▓▓▓▓▓▓░░░░░░░░░ 15.3 │ 69.7% acc │ 4,545 tokens
|
||||||
JSON ▓▓▓▓▓▓▓▓▓▓▓▓░░░░░░░░ 10.0 │ 72.4% acc │ 7,260 tokens
|
XML ▓▓▓▓▓▓▓▓▓▓░░░░░░░░░░ 13.0 │ 67.1% acc │ 5,167 tokens
|
||||||
XML ▓▓▓▓▓▓▓▓▓▓░░░░░░░░░░ 8.4 │ 69.0% acc │ 8,251 tokens
|
|
||||||
```
|
```
|
||||||
|
|
||||||
TOON achieves **75.5%** accuracy (vs JSON's 72.4%) while using **39.5% fewer tokens**.
|
TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**.
|
||||||
|
|
||||||
|
**Note on CSV:** Excluded from ranking as it only supports 109/209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.
|
||||||
|
|
||||||
#### Per-Model Accuracy
|
#### Per-Model Accuracy
|
||||||
|
|
||||||
Accuracy across 4 LLMs on 204 data retrieval questions:
|
Accuracy across 4 LLMs on 209 data retrieval questions:
|
||||||
|
|
||||||
```
|
```
|
||||||
claude-haiku-4-5-20251001
|
claude-haiku-4-5-20251001
|
||||||
→ TOON ████████████░░░░░░░░ 62.3% (127/204)
|
→ TOON ████████████░░░░░░░░ 59.8% (125/209)
|
||||||
JSON ███████████░░░░░░░░░ 56.9% (116/204)
|
JSON ███████████░░░░░░░░░ 57.4% (120/209)
|
||||||
YAML ███████████░░░░░░░░░ 55.9% (114/204)
|
YAML ███████████░░░░░░░░░ 56.0% (117/209)
|
||||||
JSON compact ███████████░░░░░░░░░ 54.9% (112/204)
|
XML ███████████░░░░░░░░░ 55.5% (116/209)
|
||||||
XML ███████████░░░░░░░░░ 54.9% (112/204)
|
JSON compact ███████████░░░░░░░░░ 55.0% (115/209)
|
||||||
CSV █████████░░░░░░░░░░░ 47.1% (49/104)
|
CSV ██████████░░░░░░░░░░ 50.5% (55/109)
|
||||||
|
|
||||||
gemini-2.5-flash
|
gemini-2.5-flash
|
||||||
→ TOON ██████████████████░░ 91.2% (186/204)
|
→ TOON ██████████████████░░ 87.6% (183/209)
|
||||||
YAML ██████████████████░░ 89.7% (183/204)
|
CSV █████████████████░░░ 86.2% (94/109)
|
||||||
JSON compact ██████████████████░░ 87.7% (179/204)
|
JSON compact ████████████████░░░░ 82.3% (172/209)
|
||||||
JSON ██████████████████░░ 87.7% (179/204)
|
YAML ████████████████░░░░ 79.4% (166/209)
|
||||||
XML █████████████████░░░ 87.3% (178/204)
|
XML ████████████████░░░░ 79.4% (166/209)
|
||||||
CSV █████████████████░░░ 85.6% (89/104)
|
JSON ███████████████░░░░░ 77.0% (161/209)
|
||||||
|
|
||||||
gpt-5-nano
|
gpt-5-nano
|
||||||
JSON compact ███████████████████░ 93.6% (191/204)
|
→ TOON ██████████████████░░ 90.9% (190/209)
|
||||||
CSV ██████████████████░░ 90.4% (94/104)
|
JSON compact ██████████████████░░ 90.9% (190/209)
|
||||||
JSON ██████████████████░░ 89.7% (183/204)
|
JSON ██████████████████░░ 89.0% (186/209)
|
||||||
→ TOON ██████████████████░░ 89.2% (182/204)
|
CSV ██████████████████░░ 89.0% (97/109)
|
||||||
YAML ██████████████████░░ 89.2% (182/204)
|
YAML █████████████████░░░ 87.1% (182/209)
|
||||||
XML ████████████████░░░░ 81.4% (166/204)
|
XML ████████████████░░░░ 80.9% (169/209)
|
||||||
|
|
||||||
grok-4-fast-non-reasoning
|
grok-4-fast-non-reasoning
|
||||||
→ TOON ████████████░░░░░░░░ 59.3% (121/204)
|
→ TOON ███████████░░░░░░░░░ 57.4% (120/209)
|
||||||
JSON compact ███████████░░░░░░░░░ 56.9% (116/204)
|
JSON ███████████░░░░░░░░░ 55.5% (116/209)
|
||||||
JSON ███████████░░░░░░░░░ 55.4% (113/204)
|
JSON compact ███████████░░░░░░░░░ 54.5% (114/209)
|
||||||
YAML ███████████░░░░░░░░░ 54.9% (112/204)
|
YAML ███████████░░░░░░░░░ 53.6% (112/209)
|
||||||
XML ██████████░░░░░░░░░░ 52.5% (107/204)
|
XML ███████████░░░░░░░░░ 52.6% (110/209)
|
||||||
CSV ██████████░░░░░░░░░░ 48.1% (50/104)
|
CSV ██████████░░░░░░░░░░ 52.3% (57/109)
|
||||||
```
|
```
|
||||||
|
|
||||||
**Key tradeoff:** TOON achieves **75.5% accuracy** (vs JSON's 72.4%) while using **39.5% fewer tokens** on these datasets.
|
**Key tradeoff:** TOON achieves **73.9% accuracy** (vs JSON's 69.7%) while using **39.6% fewer tokens** on these datasets.
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><strong>Performance by dataset, model, and question type</strong></summary>
|
<summary><strong>Performance by dataset, model, and question type</strong></summary>
|
||||||
|
|
||||||
#### Performance by Question Type
|
#### Performance by Question Type
|
||||||
|
|
||||||
| Question Type | TOON | JSON compact | JSON | YAML | XML | CSV |
|
| Question Type | TOON | JSON compact | JSON | CSV | YAML | XML |
|
||||||
| ------------- | ---- | ---- | ---- | ---- | ---- | ---- |
|
| ------------- | ---- | ---- | ---- | ---- | ---- | ---- |
|
||||||
| Field Retrieval | 100.0% | 98.9% | 99.6% | 99.3% | 98.5% | 100.0% |
|
| Field Retrieval | 99.6% | 99.3% | 99.3% | 100.0% | 98.2% | 98.9% |
|
||||||
| Aggregation | 56.3% | 52.4% | 53.2% | 53.2% | 47.2% | 40.5% |
|
| Aggregation | 54.4% | 47.2% | 48.8% | 44.0% | 47.6% | 41.3% |
|
||||||
| Filtering | 58.9% | 58.3% | 54.2% | 53.1% | 50.5% | 49.1% |
|
| Filtering | 56.3% | 57.3% | 50.5% | 49.1% | 51.0% | 47.9% |
|
||||||
| Structure Awareness | 89.0% | 85.0% | 82.0% | 85.0% | 79.0% | 84.4% |
|
| Structure Awareness | 88.0% | 83.0% | 83.0% | 85.9% | 80.0% | 80.0% |
|
||||||
|
| Structural Validation | 70.0% | 45.0% | 50.0% | 80.0% | 60.0% | 80.0% |
|
||||||
|
|
||||||
#### Performance by Dataset
|
#### Performance by Dataset
|
||||||
|
|
||||||
@@ -418,64 +425,119 @@ grok-4-fast-non-reasoning
|
|||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `csv` | 70.7% | 2,337 | 116/164 |
|
| `csv` | 72.0% | 2,352 | 118/164 |
|
||||||
| `toon` | 72.0% | 2,483 | 118/164 |
|
| `toon` | 73.8% | 2,518 | 121/164 |
|
||||||
| `json-compact` | 71.3% | 3,943 | 117/164 |
|
| `json-compact` | 69.5% | 3,953 | 114/164 |
|
||||||
| `yaml` | 70.1% | 4,969 | 115/164 |
|
| `yaml` | 68.3% | 4,982 | 112/164 |
|
||||||
| `json-pretty` | 72.6% | 6,347 | 119/164 |
|
| `json-pretty` | 68.3% | 6,360 | 112/164 |
|
||||||
| `xml` | 70.7% | 7,314 | 116/164 |
|
| `xml` | 69.5% | 7,324 | 114/164 |
|
||||||
|
|
||||||
##### E-commerce orders with nested structures
|
##### E-commerce orders with nested structures
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `toon` | 83.5% | 7,197 | 137/164 |
|
| `toon` | 81.1% | 7,232 | 133/164 |
|
||||||
| `json-compact` | 79.3% | 6,784 | 130/164 |
|
| `json-compact` | 76.8% | 6,794 | 126/164 |
|
||||||
| `yaml` | 78.7% | 8,334 | 129/164 |
|
| `yaml` | 75.6% | 8,347 | 124/164 |
|
||||||
| `json-pretty` | 78.7% | 10,700 | 129/164 |
|
| `json-pretty` | 76.2% | 10,713 | 125/164 |
|
||||||
| `xml` | 73.8% | 12,013 | 121/164 |
|
| `xml` | 74.4% | 12,023 | 122/164 |
|
||||||
|
|
||||||
##### Time-series analytics data
|
##### Time-series analytics data
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `toon` | 75.8% | 1,513 | 91/120 |
|
| `csv` | 73.3% | 1,406 | 88/120 |
|
||||||
| `csv` | 72.5% | 1,391 | 87/120 |
|
| `toon` | 72.5% | 1,548 | 87/120 |
|
||||||
| `json-compact` | 70.0% | 2,339 | 84/120 |
|
| `json-compact` | 71.7% | 2,349 | 86/120 |
|
||||||
| `yaml` | 70.0% | 2,936 | 84/120 |
|
| `yaml` | 71.7% | 2,949 | 86/120 |
|
||||||
| `json-pretty` | 71.7% | 3,663 | 86/120 |
|
| `json-pretty` | 68.3% | 3,676 | 82/120 |
|
||||||
| `xml` | 71.7% | 4,374 | 86/120 |
|
| `xml` | 68.3% | 4,384 | 82/120 |
|
||||||
|
|
||||||
##### Top 100 GitHub repositories
|
##### Top 100 GitHub repositories
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `toon` | 64.4% | 8,745 | 85/132 |
|
| `toon` | 62.9% | 8,780 | 83/132 |
|
||||||
| `csv` | 59.8% | 8,513 | 79/132 |
|
| `csv` | 61.4% | 8,528 | 81/132 |
|
||||||
| `json-compact` | 60.6% | 11,455 | 80/132 |
|
| `yaml` | 59.8% | 13,142 | 79/132 |
|
||||||
| `yaml` | 61.4% | 13,129 | 81/132 |
|
| `json-compact` | 55.3% | 11,465 | 73/132 |
|
||||||
| `json-pretty` | 59.1% | 15,145 | 78/132 |
|
| `json-pretty` | 56.1% | 15,158 | 74/132 |
|
||||||
| `xml` | 51.5% | 17,095 | 68/132 |
|
| `xml` | 48.5% | 17,105 | 64/132 |
|
||||||
|
|
||||||
##### Semi-uniform event logs
|
##### Semi-uniform event logs
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `json-compact` | 67.5% | 4,809 | 81/120 |
|
| `json-compact` | 63.3% | 4,819 | 76/120 |
|
||||||
| `yaml` | 63.3% | 5,814 | 76/120 |
|
| `toon` | 57.5% | 5,799 | 69/120 |
|
||||||
| `toon` | 62.5% | 5,764 | 75/120 |
|
| `json-pretty` | 59.2% | 6,797 | 71/120 |
|
||||||
| `json-pretty` | 59.2% | 6,784 | 71/120 |
|
| `yaml` | 48.3% | 5,827 | 58/120 |
|
||||||
| `xml` | 55.0% | 7,699 | 66/120 |
|
| `xml` | 46.7% | 7,709 | 56/120 |
|
||||||
|
|
||||||
##### Deeply nested configuration
|
##### Deeply nested configuration
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `json-compact` | 91.4% | 564 | 106/116 |
|
| `json-compact` | 92.2% | 574 | 107/116 |
|
||||||
| `toon` | 94.8% | 631 | 110/116 |
|
| `toon` | 95.7% | 666 | 111/116 |
|
||||||
| `yaml` | 91.4% | 673 | 106/116 |
|
| `yaml` | 91.4% | 686 | 106/116 |
|
||||||
| `json-pretty` | 93.1% | 919 | 108/116 |
|
| `json-pretty` | 94.0% | 932 | 109/116 |
|
||||||
| `xml` | 91.4% | 1,008 | 106/116 |
|
| `xml` | 92.2% | 1,018 | 107/116 |
|
||||||
|
|
||||||
|
##### Valid complete dataset (control)
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `toon` | 100.0% | 544 | 4/4 |
|
||||||
|
| `json-compact` | 100.0% | 795 | 4/4 |
|
||||||
|
| `yaml` | 100.0% | 1,003 | 4/4 |
|
||||||
|
| `json-pretty` | 100.0% | 1,282 | 4/4 |
|
||||||
|
| `csv` | 25.0% | 492 | 1/4 |
|
||||||
|
| `xml` | 0.0% | 1,467 | 0/4 |
|
||||||
|
|
||||||
|
##### Array truncated: 3 rows removed from end
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `csv` | 100.0% | 425 | 4/4 |
|
||||||
|
| `xml` | 100.0% | 1,251 | 4/4 |
|
||||||
|
| `toon` | 0.0% | 474 | 0/4 |
|
||||||
|
| `json-compact` | 0.0% | 681 | 0/4 |
|
||||||
|
| `json-pretty` | 0.0% | 1,096 | 0/4 |
|
||||||
|
| `yaml` | 0.0% | 859 | 0/4 |
|
||||||
|
|
||||||
|
##### Extra rows added beyond declared length
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `csv` | 100.0% | 566 | 4/4 |
|
||||||
|
| `toon` | 75.0% | 621 | 3/4 |
|
||||||
|
| `xml` | 100.0% | 1,692 | 4/4 |
|
||||||
|
| `yaml` | 75.0% | 1,157 | 3/4 |
|
||||||
|
| `json-compact` | 50.0% | 917 | 2/4 |
|
||||||
|
| `json-pretty` | 50.0% | 1,476 | 2/4 |
|
||||||
|
|
||||||
|
##### Inconsistent field count (missing salary in row 10)
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `csv` | 75.0% | 489 | 3/4 |
|
||||||
|
| `yaml` | 100.0% | 996 | 4/4 |
|
||||||
|
| `toon` | 100.0% | 1,019 | 4/4 |
|
||||||
|
| `json-compact` | 75.0% | 790 | 3/4 |
|
||||||
|
| `xml` | 100.0% | 1,458 | 4/4 |
|
||||||
|
| `json-pretty` | 75.0% | 1,274 | 3/4 |
|
||||||
|
|
||||||
|
##### Missing required fields (no email in multiple rows)
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `csv` | 100.0% | 329 | 4/4 |
|
||||||
|
| `xml` | 100.0% | 1,411 | 4/4 |
|
||||||
|
| `toon` | 75.0% | 983 | 3/4 |
|
||||||
|
| `yaml` | 25.0% | 960 | 1/4 |
|
||||||
|
| `json-pretty` | 25.0% | 1,230 | 1/4 |
|
||||||
|
| `json-compact` | 0.0% | 755 | 0/4 |
|
||||||
|
|
||||||
#### Performance by Model
|
#### Performance by Model
|
||||||
|
|
||||||
@@ -483,45 +545,45 @@ grok-4-fast-non-reasoning
|
|||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `toon` | 62.3% | 127/204 |
|
| `toon` | 59.8% | 125/209 |
|
||||||
| `json-pretty` | 56.9% | 116/204 |
|
| `json-pretty` | 57.4% | 120/209 |
|
||||||
| `yaml` | 55.9% | 114/204 |
|
| `yaml` | 56.0% | 117/209 |
|
||||||
| `json-compact` | 54.9% | 112/204 |
|
| `xml` | 55.5% | 116/209 |
|
||||||
| `xml` | 54.9% | 112/204 |
|
| `json-compact` | 55.0% | 115/209 |
|
||||||
| `csv` | 47.1% | 49/104 |
|
| `csv` | 50.5% | 55/109 |
|
||||||
|
|
||||||
##### gemini-2.5-flash
|
##### gemini-2.5-flash
|
||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `toon` | 91.2% | 186/204 |
|
| `toon` | 87.6% | 183/209 |
|
||||||
| `yaml` | 89.7% | 183/204 |
|
| `csv` | 86.2% | 94/109 |
|
||||||
| `json-compact` | 87.7% | 179/204 |
|
| `json-compact` | 82.3% | 172/209 |
|
||||||
| `json-pretty` | 87.7% | 179/204 |
|
| `yaml` | 79.4% | 166/209 |
|
||||||
| `xml` | 87.3% | 178/204 |
|
| `xml` | 79.4% | 166/209 |
|
||||||
| `csv` | 85.6% | 89/104 |
|
| `json-pretty` | 77.0% | 161/209 |
|
||||||
|
|
||||||
##### gpt-5-nano
|
##### gpt-5-nano
|
||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `json-compact` | 93.6% | 191/204 |
|
| `toon` | 90.9% | 190/209 |
|
||||||
| `csv` | 90.4% | 94/104 |
|
| `json-compact` | 90.9% | 190/209 |
|
||||||
| `json-pretty` | 89.7% | 183/204 |
|
| `json-pretty` | 89.0% | 186/209 |
|
||||||
| `toon` | 89.2% | 182/204 |
|
| `csv` | 89.0% | 97/109 |
|
||||||
| `yaml` | 89.2% | 182/204 |
|
| `yaml` | 87.1% | 182/209 |
|
||||||
| `xml` | 81.4% | 166/204 |
|
| `xml` | 80.9% | 169/209 |
|
||||||
|
|
||||||
##### grok-4-fast-non-reasoning
|
##### grok-4-fast-non-reasoning
|
||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `toon` | 59.3% | 121/204 |
|
| `toon` | 57.4% | 120/209 |
|
||||||
| `json-compact` | 56.9% | 116/204 |
|
| `json-pretty` | 55.5% | 116/209 |
|
||||||
| `json-pretty` | 55.4% | 113/204 |
|
| `json-compact` | 54.5% | 114/209 |
|
||||||
| `yaml` | 54.9% | 112/204 |
|
| `yaml` | 53.6% | 112/209 |
|
||||||
| `xml` | 52.5% | 107/204 |
|
| `xml` | 52.6% | 110/209 |
|
||||||
| `csv` | 48.1% | 50/104 |
|
| `csv` | 52.3% | 57/109 |
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
@@ -534,8 +596,9 @@ This benchmark tests **LLM comprehension and data retrieval accuracy** across di
|
|||||||
|
|
||||||
#### Datasets Tested
|
#### Datasets Tested
|
||||||
|
|
||||||
Six datasets designed to test different structural patterns:
|
Eleven datasets designed to test different structural patterns and validation capabilities:
|
||||||
|
|
||||||
|
**Primary datasets:**
|
||||||
1. **Tabular** (100 employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
|
1. **Tabular** (100 employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
|
||||||
2. **Nested** (50 e-commerce orders): Complex structures with nested customer objects and item arrays.
|
2. **Nested** (50 e-commerce orders): Complex structures with nested customer objects and item arrays.
|
||||||
3. **Analytics** (60 days of metrics): Time-series data with dates and numeric values.
|
3. **Analytics** (60 days of metrics): Time-series data with dates and numeric values.
|
||||||
@@ -543,21 +606,28 @@ Six datasets designed to test different structural patterns:
|
|||||||
5. **Event Logs** (75 logs): Semi-uniform data with ~50% flat logs and ~50% with nested error objects.
|
5. **Event Logs** (75 logs): Semi-uniform data with ~50% flat logs and ~50% with nested error objects.
|
||||||
6. **Nested Config** (1 configuration): Deeply nested configuration with minimal tabular eligibility.
|
6. **Nested Config** (1 configuration): Deeply nested configuration with minimal tabular eligibility.
|
||||||
|
|
||||||
|
**Structural validation datasets:**
|
||||||
|
7. **Control**: Valid complete dataset (baseline for validation)
|
||||||
|
8. **Truncated**: Array with 3 rows removed from end (tests [N] length detection)
|
||||||
|
9. **Extra rows**: Array with 3 additional rows beyond declared length
|
||||||
|
10. **Width mismatch**: Inconsistent field count (missing salary in row 10)
|
||||||
|
11. **Missing fields**: Systematic field omissions (no email in multiple rows)
|
||||||
|
|
||||||
#### Question Types
|
#### Question Types
|
||||||
|
|
||||||
204 questions are generated dynamically across four categories:
|
209 questions are generated dynamically across five categories:
|
||||||
|
|
||||||
- **Field retrieval (33%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
- **Field retrieval (33%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
||||||
- Example: "What is Alice's salary?" → `75000`
|
- Example: "What is Alice's salary?" → `75000`
|
||||||
- Example: "How many items are in order ORD-0042?" → `3`
|
- Example: "How many items are in order ORD-0042?" → `3`
|
||||||
- Example: "What is the customer name for order ORD-0042?" → `John Doe`
|
- Example: "What is the customer name for order ORD-0042?" → `John Doe`
|
||||||
|
|
||||||
- **Aggregation (31%)**: Dataset-level totals and averages plus single-condition filters (counts, sums, min/max comparisons)
|
- **Aggregation (30%)**: Dataset-level totals and averages plus single-condition filters (counts, sums, min/max comparisons)
|
||||||
- Example: "How many employees work in Engineering?" → `17`
|
- Example: "How many employees work in Engineering?" → `17`
|
||||||
- Example: "What is the total revenue across all orders?" → `45123.50`
|
- Example: "What is the total revenue across all orders?" → `45123.50`
|
||||||
- Example: "How many employees have salary > 80000?" → `23`
|
- Example: "How many employees have salary > 80000?" → `23`
|
||||||
|
|
||||||
- **Filtering (24%)**: Multi-condition queries requiring compound logic (AND constraints across fields)
|
- **Filtering (23%)**: Multi-condition queries requiring compound logic (AND constraints across fields)
|
||||||
- Example: "How many employees in Sales have salary > 80000?" → `5`
|
- Example: "How many employees in Sales have salary > 80000?" → `5`
|
||||||
- Example: "How many active employees have more than 10 years of experience?" → `8`
|
- Example: "How many active employees have more than 10 years of experience?" → `8`
|
||||||
|
|
||||||
@@ -566,18 +636,23 @@ Six datasets designed to test different structural patterns:
|
|||||||
- Example: "List the field names for employees" → `id, name, email, department, salary, yearsExperience, active`
|
- Example: "List the field names for employees" → `id, name, email, department, salary, yearsExperience, active`
|
||||||
- Example: "What is the department of the last employee?" → `Sales`
|
- Example: "What is the department of the last employee?" → `Sales`
|
||||||
|
|
||||||
|
- **Structural validation (2%)**: Tests ability to detect incomplete, truncated, or corrupted data using structural metadata
|
||||||
|
- Example: "Is this data complete and valid?" → `YES` (control dataset) or `NO` (corrupted datasets)
|
||||||
|
- Tests TOON's [N] length validation and {fields} consistency checking
|
||||||
|
- Demonstrates CSV's lack of structural validation capabilities
|
||||||
|
|
||||||
#### Evaluation Process
|
#### Evaluation Process
|
||||||
|
|
||||||
1. **Format conversion**: Each dataset is converted to all 6 formats (TOON, JSON compact, JSON, YAML, XML, CSV).
|
1. **Format conversion**: Each dataset is converted to all 6 formats (TOON, JSON compact, JSON, CSV, YAML, XML).
|
||||||
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
||||||
3. **Validate with LLM-as-judge**: `gpt-5-nano` validates if the answer is semantically correct (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`).
|
3. **Validate deterministically**: Answers are validated using type-aware comparison (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`) without requiring an LLM judge.
|
||||||
|
|
||||||
#### Models & Configuration
|
#### Models & Configuration
|
||||||
|
|
||||||
- **Models tested**: `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `gpt-5-nano`, `grok-4-fast-non-reasoning`
|
- **Models tested**: `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `gpt-5-nano`, `grok-4-fast-non-reasoning`
|
||||||
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
|
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
|
||||||
- **Temperature**: Not set (models use their defaults)
|
- **Temperature**: Not set (models use their defaults)
|
||||||
- **Total evaluations**: 204 questions × 6 formats × 4 models = 4,896 LLM calls
|
- **Total evaluations**: 209 questions × 6 formats × 4 models = 5,016 LLM calls
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
@@ -782,6 +857,9 @@ items[1]:
|
|||||||
status: active
|
status: active
|
||||||
```
|
```
|
||||||
|
|
||||||
|
> [!NOTE]
|
||||||
|
> Tabular format requires identical field sets across all objects (same keys, order doesn't matter) and primitive values only (strings, numbers, booleans, null).
|
||||||
|
|
||||||
#### Mixed and Non-Uniform Arrays
|
#### Mixed and Non-Uniform Arrays
|
||||||
|
|
||||||
Arrays that don't meet the tabular requirements use list format:
|
Arrays that don't meet the tabular requirements use list format:
|
||||||
|
|||||||
@@ -34,10 +34,10 @@ Results are saved to `results/token-efficiency.md`.
|
|||||||
|
|
||||||
Tests how well LLMs can answer questions about data in different formats (TOON, JSON, JSON compact, XML, YAML, CSV):
|
Tests how well LLMs can answer questions about data in different formats (TOON, JSON, JSON compact, XML, YAML, CSV):
|
||||||
|
|
||||||
1. Generate ~200 questions across 6 datasets (CSV only included for datasets with flat/tabular structure)
|
1. Generate 209 questions across 11 datasets (6 primary + 5 structural validation; CSV only included for datasets with flat/tabular structure)
|
||||||
2. Convert each dataset to all supported formats
|
2. Convert each dataset to all supported formats
|
||||||
3. Query each LLM with formatted data + question
|
3. Query each LLM with formatted data + question
|
||||||
4. Validate answers using `gpt-5-nano` as judge
|
4. Validate answers deterministically using type-aware comparison (no LLM judge needed)
|
||||||
5. Aggregate metrics and generate report
|
5. Aggregate metrics and generate report
|
||||||
|
|
||||||
### Setup
|
### Setup
|
||||||
@@ -95,10 +95,22 @@ src/
|
|||||||
├── datasets.ts # Test data generators
|
├── datasets.ts # Test data generators
|
||||||
├── evaluate.ts # LLM evaluation
|
├── evaluate.ts # LLM evaluation
|
||||||
├── formatters.ts # Format converters
|
├── formatters.ts # Format converters
|
||||||
├── questions.ts # Question generation
|
├── normalize.ts # Answer normalization
|
||||||
├── report.ts # Markdown reports
|
├── report.ts # Markdown reports
|
||||||
├── storage.ts # Result caching
|
├── storage.ts # Result caching
|
||||||
└── utils.ts # Helpers
|
├── types.ts # Type definitions
|
||||||
|
├── utils.ts # Helpers
|
||||||
|
└── questions/ # Question generators
|
||||||
|
├── analytics.ts
|
||||||
|
├── event-logs.ts
|
||||||
|
├── github.ts
|
||||||
|
├── index.ts
|
||||||
|
├── nested-config.ts
|
||||||
|
├── nested.ts
|
||||||
|
├── structural-validation.ts
|
||||||
|
├── structure.ts
|
||||||
|
├── tabular.ts
|
||||||
|
└── utils.ts
|
||||||
data/
|
data/
|
||||||
└── github-repos.json # Top 100 GitHub repos
|
└── github-repos.json # Top 100 GitHub repos
|
||||||
results/
|
results/
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,4 +1,4 @@
|
|||||||
Benchmarks test LLM comprehension across different input formats using 204 data retrieval questions on 4 models.
|
Benchmarks test LLM comprehension across different input formats using 209 data retrieval questions on 4 models.
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><strong>Show Dataset Catalog</strong></summary>
|
<summary><strong>Show Dataset Catalog</strong></summary>
|
||||||
@@ -13,6 +13,11 @@ Benchmarks test LLM comprehension across different input formats using 204 data
|
|||||||
| Top 100 GitHub repositories | 100 | uniform | ✓ | 100% |
|
| Top 100 GitHub repositories | 100 | uniform | ✓ | 100% |
|
||||||
| Semi-uniform event logs | 75 | semi-uniform | ✗ | 50% |
|
| Semi-uniform event logs | 75 | semi-uniform | ✗ | 50% |
|
||||||
| Deeply nested configuration | 11 | deep | ✗ | 0% |
|
| Deeply nested configuration | 11 | deep | ✗ | 0% |
|
||||||
|
| Valid complete dataset (control) | 20 | uniform | ✓ | 100% |
|
||||||
|
| Array truncated: 3 rows removed from end | 17 | uniform | ✓ | 100% |
|
||||||
|
| Extra rows added beyond declared length | 23 | uniform | ✓ | 100% |
|
||||||
|
| Inconsistent field count (missing salary in row 10) | 20 | uniform | ✓ | 100% |
|
||||||
|
| Missing required fields (no email in multiple rows) | 20 | uniform | ✓ | 100% |
|
||||||
|
|
||||||
**Structure classes:**
|
**Structure classes:**
|
||||||
- **uniform**: All objects have identical fields with primitive values
|
- **uniform**: All objects have identical fields with primitive values
|
||||||
@@ -31,67 +36,69 @@ Benchmarks test LLM comprehension across different input formats using 204 data
|
|||||||
Each format's overall performance, balancing accuracy against token cost:
|
Each format's overall performance, balancing accuracy against token cost:
|
||||||
|
|
||||||
```
|
```
|
||||||
TOON ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ 17.2 │ 75.5% acc │ 4,389 tokens
|
TOON ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ 26.9 │ 73.9% acc │ 2,744 tokens
|
||||||
CSV ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓░ 16.6 │ 67.8% acc │ 4,080 tokens
|
JSON compact ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓░░░ 22.9 │ 70.7% acc │ 3,081 tokens
|
||||||
JSON compact ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓░░░ 14.7 │ 73.3% acc │ 4,982 tokens
|
YAML ▓▓▓▓▓▓▓▓▓▓▓▓▓▓░░░░░░ 18.6 │ 69.0% acc │ 3,719 tokens
|
||||||
YAML ▓▓▓▓▓▓▓▓▓▓▓▓▓▓░░░░░░ 12.1 │ 72.4% acc │ 5,976 tokens
|
JSON ▓▓▓▓▓▓▓▓▓▓▓░░░░░░░░░ 15.3 │ 69.7% acc │ 4,545 tokens
|
||||||
JSON ▓▓▓▓▓▓▓▓▓▓▓▓░░░░░░░░ 10.0 │ 72.4% acc │ 7,260 tokens
|
XML ▓▓▓▓▓▓▓▓▓▓░░░░░░░░░░ 13.0 │ 67.1% acc │ 5,167 tokens
|
||||||
XML ▓▓▓▓▓▓▓▓▓▓░░░░░░░░░░ 8.4 │ 69.0% acc │ 8,251 tokens
|
|
||||||
```
|
```
|
||||||
|
|
||||||
TOON achieves **75.5%** accuracy (vs JSON's 72.4%) while using **39.5% fewer tokens**.
|
TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**.
|
||||||
|
|
||||||
|
**Note on CSV:** Excluded from ranking as it only supports 109/209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.
|
||||||
|
|
||||||
#### Per-Model Accuracy
|
#### Per-Model Accuracy
|
||||||
|
|
||||||
Accuracy across 4 LLMs on 204 data retrieval questions:
|
Accuracy across 4 LLMs on 209 data retrieval questions:
|
||||||
|
|
||||||
```
|
```
|
||||||
claude-haiku-4-5-20251001
|
claude-haiku-4-5-20251001
|
||||||
→ TOON ████████████░░░░░░░░ 62.3% (127/204)
|
→ TOON ████████████░░░░░░░░ 59.8% (125/209)
|
||||||
JSON ███████████░░░░░░░░░ 56.9% (116/204)
|
JSON ███████████░░░░░░░░░ 57.4% (120/209)
|
||||||
YAML ███████████░░░░░░░░░ 55.9% (114/204)
|
YAML ███████████░░░░░░░░░ 56.0% (117/209)
|
||||||
JSON compact ███████████░░░░░░░░░ 54.9% (112/204)
|
XML ███████████░░░░░░░░░ 55.5% (116/209)
|
||||||
XML ███████████░░░░░░░░░ 54.9% (112/204)
|
JSON compact ███████████░░░░░░░░░ 55.0% (115/209)
|
||||||
CSV █████████░░░░░░░░░░░ 47.1% (49/104)
|
CSV ██████████░░░░░░░░░░ 50.5% (55/109)
|
||||||
|
|
||||||
gemini-2.5-flash
|
gemini-2.5-flash
|
||||||
→ TOON ██████████████████░░ 91.2% (186/204)
|
→ TOON ██████████████████░░ 87.6% (183/209)
|
||||||
YAML ██████████████████░░ 89.7% (183/204)
|
CSV █████████████████░░░ 86.2% (94/109)
|
||||||
JSON compact ██████████████████░░ 87.7% (179/204)
|
JSON compact ████████████████░░░░ 82.3% (172/209)
|
||||||
JSON ██████████████████░░ 87.7% (179/204)
|
YAML ████████████████░░░░ 79.4% (166/209)
|
||||||
XML █████████████████░░░ 87.3% (178/204)
|
XML ████████████████░░░░ 79.4% (166/209)
|
||||||
CSV █████████████████░░░ 85.6% (89/104)
|
JSON ███████████████░░░░░ 77.0% (161/209)
|
||||||
|
|
||||||
gpt-5-nano
|
gpt-5-nano
|
||||||
JSON compact ███████████████████░ 93.6% (191/204)
|
→ TOON ██████████████████░░ 90.9% (190/209)
|
||||||
CSV ██████████████████░░ 90.4% (94/104)
|
JSON compact ██████████████████░░ 90.9% (190/209)
|
||||||
JSON ██████████████████░░ 89.7% (183/204)
|
JSON ██████████████████░░ 89.0% (186/209)
|
||||||
→ TOON ██████████████████░░ 89.2% (182/204)
|
CSV ██████████████████░░ 89.0% (97/109)
|
||||||
YAML ██████████████████░░ 89.2% (182/204)
|
YAML █████████████████░░░ 87.1% (182/209)
|
||||||
XML ████████████████░░░░ 81.4% (166/204)
|
XML ████████████████░░░░ 80.9% (169/209)
|
||||||
|
|
||||||
grok-4-fast-non-reasoning
|
grok-4-fast-non-reasoning
|
||||||
→ TOON ████████████░░░░░░░░ 59.3% (121/204)
|
→ TOON ███████████░░░░░░░░░ 57.4% (120/209)
|
||||||
JSON compact ███████████░░░░░░░░░ 56.9% (116/204)
|
JSON ███████████░░░░░░░░░ 55.5% (116/209)
|
||||||
JSON ███████████░░░░░░░░░ 55.4% (113/204)
|
JSON compact ███████████░░░░░░░░░ 54.5% (114/209)
|
||||||
YAML ███████████░░░░░░░░░ 54.9% (112/204)
|
YAML ███████████░░░░░░░░░ 53.6% (112/209)
|
||||||
XML ██████████░░░░░░░░░░ 52.5% (107/204)
|
XML ███████████░░░░░░░░░ 52.6% (110/209)
|
||||||
CSV ██████████░░░░░░░░░░ 48.1% (50/104)
|
CSV ██████████░░░░░░░░░░ 52.3% (57/109)
|
||||||
```
|
```
|
||||||
|
|
||||||
**Key tradeoff:** TOON achieves **75.5% accuracy** (vs JSON's 72.4%) while using **39.5% fewer tokens** on these datasets.
|
**Key tradeoff:** TOON achieves **73.9% accuracy** (vs JSON's 69.7%) while using **39.6% fewer tokens** on these datasets.
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><strong>Performance by dataset, model, and question type</strong></summary>
|
<summary><strong>Performance by dataset, model, and question type</strong></summary>
|
||||||
|
|
||||||
#### Performance by Question Type
|
#### Performance by Question Type
|
||||||
|
|
||||||
| Question Type | TOON | JSON compact | JSON | YAML | XML | CSV |
|
| Question Type | TOON | JSON compact | JSON | CSV | YAML | XML |
|
||||||
| ------------- | ---- | ---- | ---- | ---- | ---- | ---- |
|
| ------------- | ---- | ---- | ---- | ---- | ---- | ---- |
|
||||||
| Field Retrieval | 100.0% | 98.9% | 99.6% | 99.3% | 98.5% | 100.0% |
|
| Field Retrieval | 99.6% | 99.3% | 99.3% | 100.0% | 98.2% | 98.9% |
|
||||||
| Aggregation | 56.3% | 52.4% | 53.2% | 53.2% | 47.2% | 40.5% |
|
| Aggregation | 54.4% | 47.2% | 48.8% | 44.0% | 47.6% | 41.3% |
|
||||||
| Filtering | 58.9% | 58.3% | 54.2% | 53.1% | 50.5% | 49.1% |
|
| Filtering | 56.3% | 57.3% | 50.5% | 49.1% | 51.0% | 47.9% |
|
||||||
| Structure Awareness | 89.0% | 85.0% | 82.0% | 85.0% | 79.0% | 84.4% |
|
| Structure Awareness | 88.0% | 83.0% | 83.0% | 85.9% | 80.0% | 80.0% |
|
||||||
|
| Structural Validation | 70.0% | 45.0% | 50.0% | 80.0% | 60.0% | 80.0% |
|
||||||
|
|
||||||
#### Performance by Dataset
|
#### Performance by Dataset
|
||||||
|
|
||||||
@@ -99,64 +106,119 @@ grok-4-fast-non-reasoning
|
|||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `csv` | 70.7% | 2,337 | 116/164 |
|
| `csv` | 72.0% | 2,352 | 118/164 |
|
||||||
| `toon` | 72.0% | 2,483 | 118/164 |
|
| `toon` | 73.8% | 2,518 | 121/164 |
|
||||||
| `json-compact` | 71.3% | 3,943 | 117/164 |
|
| `json-compact` | 69.5% | 3,953 | 114/164 |
|
||||||
| `yaml` | 70.1% | 4,969 | 115/164 |
|
| `yaml` | 68.3% | 4,982 | 112/164 |
|
||||||
| `json-pretty` | 72.6% | 6,347 | 119/164 |
|
| `json-pretty` | 68.3% | 6,360 | 112/164 |
|
||||||
| `xml` | 70.7% | 7,314 | 116/164 |
|
| `xml` | 69.5% | 7,324 | 114/164 |
|
||||||
|
|
||||||
##### E-commerce orders with nested structures
|
##### E-commerce orders with nested structures
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `toon` | 83.5% | 7,197 | 137/164 |
|
| `toon` | 81.1% | 7,232 | 133/164 |
|
||||||
| `json-compact` | 79.3% | 6,784 | 130/164 |
|
| `json-compact` | 76.8% | 6,794 | 126/164 |
|
||||||
| `yaml` | 78.7% | 8,334 | 129/164 |
|
| `yaml` | 75.6% | 8,347 | 124/164 |
|
||||||
| `json-pretty` | 78.7% | 10,700 | 129/164 |
|
| `json-pretty` | 76.2% | 10,713 | 125/164 |
|
||||||
| `xml` | 73.8% | 12,013 | 121/164 |
|
| `xml` | 74.4% | 12,023 | 122/164 |
|
||||||
|
|
||||||
##### Time-series analytics data
|
##### Time-series analytics data
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `toon` | 75.8% | 1,513 | 91/120 |
|
| `csv` | 73.3% | 1,406 | 88/120 |
|
||||||
| `csv` | 72.5% | 1,391 | 87/120 |
|
| `toon` | 72.5% | 1,548 | 87/120 |
|
||||||
| `json-compact` | 70.0% | 2,339 | 84/120 |
|
| `json-compact` | 71.7% | 2,349 | 86/120 |
|
||||||
| `yaml` | 70.0% | 2,936 | 84/120 |
|
| `yaml` | 71.7% | 2,949 | 86/120 |
|
||||||
| `json-pretty` | 71.7% | 3,663 | 86/120 |
|
| `json-pretty` | 68.3% | 3,676 | 82/120 |
|
||||||
| `xml` | 71.7% | 4,374 | 86/120 |
|
| `xml` | 68.3% | 4,384 | 82/120 |
|
||||||
|
|
||||||
##### Top 100 GitHub repositories
|
##### Top 100 GitHub repositories
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `toon` | 64.4% | 8,745 | 85/132 |
|
| `toon` | 62.9% | 8,780 | 83/132 |
|
||||||
| `csv` | 59.8% | 8,513 | 79/132 |
|
| `csv` | 61.4% | 8,528 | 81/132 |
|
||||||
| `json-compact` | 60.6% | 11,455 | 80/132 |
|
| `yaml` | 59.8% | 13,142 | 79/132 |
|
||||||
| `yaml` | 61.4% | 13,129 | 81/132 |
|
| `json-compact` | 55.3% | 11,465 | 73/132 |
|
||||||
| `json-pretty` | 59.1% | 15,145 | 78/132 |
|
| `json-pretty` | 56.1% | 15,158 | 74/132 |
|
||||||
| `xml` | 51.5% | 17,095 | 68/132 |
|
| `xml` | 48.5% | 17,105 | 64/132 |
|
||||||
|
|
||||||
##### Semi-uniform event logs
|
##### Semi-uniform event logs
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `json-compact` | 67.5% | 4,809 | 81/120 |
|
| `json-compact` | 63.3% | 4,819 | 76/120 |
|
||||||
| `yaml` | 63.3% | 5,814 | 76/120 |
|
| `toon` | 57.5% | 5,799 | 69/120 |
|
||||||
| `toon` | 62.5% | 5,764 | 75/120 |
|
| `json-pretty` | 59.2% | 6,797 | 71/120 |
|
||||||
| `json-pretty` | 59.2% | 6,784 | 71/120 |
|
| `yaml` | 48.3% | 5,827 | 58/120 |
|
||||||
| `xml` | 55.0% | 7,699 | 66/120 |
|
| `xml` | 46.7% | 7,709 | 56/120 |
|
||||||
|
|
||||||
##### Deeply nested configuration
|
##### Deeply nested configuration
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `json-compact` | 91.4% | 564 | 106/116 |
|
| `json-compact` | 92.2% | 574 | 107/116 |
|
||||||
| `toon` | 94.8% | 631 | 110/116 |
|
| `toon` | 95.7% | 666 | 111/116 |
|
||||||
| `yaml` | 91.4% | 673 | 106/116 |
|
| `yaml` | 91.4% | 686 | 106/116 |
|
||||||
| `json-pretty` | 93.1% | 919 | 108/116 |
|
| `json-pretty` | 94.0% | 932 | 109/116 |
|
||||||
| `xml` | 91.4% | 1,008 | 106/116 |
|
| `xml` | 92.2% | 1,018 | 107/116 |
|
||||||
|
|
||||||
|
##### Valid complete dataset (control)
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `toon` | 100.0% | 544 | 4/4 |
|
||||||
|
| `json-compact` | 100.0% | 795 | 4/4 |
|
||||||
|
| `yaml` | 100.0% | 1,003 | 4/4 |
|
||||||
|
| `json-pretty` | 100.0% | 1,282 | 4/4 |
|
||||||
|
| `csv` | 25.0% | 492 | 1/4 |
|
||||||
|
| `xml` | 0.0% | 1,467 | 0/4 |
|
||||||
|
|
||||||
|
##### Array truncated: 3 rows removed from end
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `csv` | 100.0% | 425 | 4/4 |
|
||||||
|
| `xml` | 100.0% | 1,251 | 4/4 |
|
||||||
|
| `toon` | 0.0% | 474 | 0/4 |
|
||||||
|
| `json-compact` | 0.0% | 681 | 0/4 |
|
||||||
|
| `json-pretty` | 0.0% | 1,096 | 0/4 |
|
||||||
|
| `yaml` | 0.0% | 859 | 0/4 |
|
||||||
|
|
||||||
|
##### Extra rows added beyond declared length
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `csv` | 100.0% | 566 | 4/4 |
|
||||||
|
| `toon` | 75.0% | 621 | 3/4 |
|
||||||
|
| `xml` | 100.0% | 1,692 | 4/4 |
|
||||||
|
| `yaml` | 75.0% | 1,157 | 3/4 |
|
||||||
|
| `json-compact` | 50.0% | 917 | 2/4 |
|
||||||
|
| `json-pretty` | 50.0% | 1,476 | 2/4 |
|
||||||
|
|
||||||
|
##### Inconsistent field count (missing salary in row 10)
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `csv` | 75.0% | 489 | 3/4 |
|
||||||
|
| `yaml` | 100.0% | 996 | 4/4 |
|
||||||
|
| `toon` | 100.0% | 1,019 | 4/4 |
|
||||||
|
| `json-compact` | 75.0% | 790 | 3/4 |
|
||||||
|
| `xml` | 100.0% | 1,458 | 4/4 |
|
||||||
|
| `json-pretty` | 75.0% | 1,274 | 3/4 |
|
||||||
|
|
||||||
|
##### Missing required fields (no email in multiple rows)
|
||||||
|
|
||||||
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
|
| ------ | -------- | ------ | ------------- |
|
||||||
|
| `csv` | 100.0% | 329 | 4/4 |
|
||||||
|
| `xml` | 100.0% | 1,411 | 4/4 |
|
||||||
|
| `toon` | 75.0% | 983 | 3/4 |
|
||||||
|
| `yaml` | 25.0% | 960 | 1/4 |
|
||||||
|
| `json-pretty` | 25.0% | 1,230 | 1/4 |
|
||||||
|
| `json-compact` | 0.0% | 755 | 0/4 |
|
||||||
|
|
||||||
#### Performance by Model
|
#### Performance by Model
|
||||||
|
|
||||||
@@ -164,45 +226,45 @@ grok-4-fast-non-reasoning
|
|||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `toon` | 62.3% | 127/204 |
|
| `toon` | 59.8% | 125/209 |
|
||||||
| `json-pretty` | 56.9% | 116/204 |
|
| `json-pretty` | 57.4% | 120/209 |
|
||||||
| `yaml` | 55.9% | 114/204 |
|
| `yaml` | 56.0% | 117/209 |
|
||||||
| `json-compact` | 54.9% | 112/204 |
|
| `xml` | 55.5% | 116/209 |
|
||||||
| `xml` | 54.9% | 112/204 |
|
| `json-compact` | 55.0% | 115/209 |
|
||||||
| `csv` | 47.1% | 49/104 |
|
| `csv` | 50.5% | 55/109 |
|
||||||
|
|
||||||
##### gemini-2.5-flash
|
##### gemini-2.5-flash
|
||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `toon` | 91.2% | 186/204 |
|
| `toon` | 87.6% | 183/209 |
|
||||||
| `yaml` | 89.7% | 183/204 |
|
| `csv` | 86.2% | 94/109 |
|
||||||
| `json-compact` | 87.7% | 179/204 |
|
| `json-compact` | 82.3% | 172/209 |
|
||||||
| `json-pretty` | 87.7% | 179/204 |
|
| `yaml` | 79.4% | 166/209 |
|
||||||
| `xml` | 87.3% | 178/204 |
|
| `xml` | 79.4% | 166/209 |
|
||||||
| `csv` | 85.6% | 89/104 |
|
| `json-pretty` | 77.0% | 161/209 |
|
||||||
|
|
||||||
##### gpt-5-nano
|
##### gpt-5-nano
|
||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `json-compact` | 93.6% | 191/204 |
|
| `toon` | 90.9% | 190/209 |
|
||||||
| `csv` | 90.4% | 94/104 |
|
| `json-compact` | 90.9% | 190/209 |
|
||||||
| `json-pretty` | 89.7% | 183/204 |
|
| `json-pretty` | 89.0% | 186/209 |
|
||||||
| `toon` | 89.2% | 182/204 |
|
| `csv` | 89.0% | 97/109 |
|
||||||
| `yaml` | 89.2% | 182/204 |
|
| `yaml` | 87.1% | 182/209 |
|
||||||
| `xml` | 81.4% | 166/204 |
|
| `xml` | 80.9% | 169/209 |
|
||||||
|
|
||||||
##### grok-4-fast-non-reasoning
|
##### grok-4-fast-non-reasoning
|
||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `toon` | 59.3% | 121/204 |
|
| `toon` | 57.4% | 120/209 |
|
||||||
| `json-compact` | 56.9% | 116/204 |
|
| `json-pretty` | 55.5% | 116/209 |
|
||||||
| `json-pretty` | 55.4% | 113/204 |
|
| `json-compact` | 54.5% | 114/209 |
|
||||||
| `yaml` | 54.9% | 112/204 |
|
| `yaml` | 53.6% | 112/209 |
|
||||||
| `xml` | 52.5% | 107/204 |
|
| `xml` | 52.6% | 110/209 |
|
||||||
| `csv` | 48.1% | 50/104 |
|
| `csv` | 52.3% | 57/109 |
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
@@ -215,8 +277,9 @@ This benchmark tests **LLM comprehension and data retrieval accuracy** across di
|
|||||||
|
|
||||||
#### Datasets Tested
|
#### Datasets Tested
|
||||||
|
|
||||||
Six datasets designed to test different structural patterns:
|
Eleven datasets designed to test different structural patterns and validation capabilities:
|
||||||
|
|
||||||
|
**Primary datasets:**
|
||||||
1. **Tabular** (100 employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
|
1. **Tabular** (100 employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
|
||||||
2. **Nested** (50 e-commerce orders): Complex structures with nested customer objects and item arrays.
|
2. **Nested** (50 e-commerce orders): Complex structures with nested customer objects and item arrays.
|
||||||
3. **Analytics** (60 days of metrics): Time-series data with dates and numeric values.
|
3. **Analytics** (60 days of metrics): Time-series data with dates and numeric values.
|
||||||
@@ -224,21 +287,28 @@ Six datasets designed to test different structural patterns:
|
|||||||
5. **Event Logs** (75 logs): Semi-uniform data with ~50% flat logs and ~50% with nested error objects.
|
5. **Event Logs** (75 logs): Semi-uniform data with ~50% flat logs and ~50% with nested error objects.
|
||||||
6. **Nested Config** (1 configuration): Deeply nested configuration with minimal tabular eligibility.
|
6. **Nested Config** (1 configuration): Deeply nested configuration with minimal tabular eligibility.
|
||||||
|
|
||||||
|
**Structural validation datasets:**
|
||||||
|
7. **Control**: Valid complete dataset (baseline for validation)
|
||||||
|
8. **Truncated**: Array with 3 rows removed from end (tests [N] length detection)
|
||||||
|
9. **Extra rows**: Array with 3 additional rows beyond declared length
|
||||||
|
10. **Width mismatch**: Inconsistent field count (missing salary in row 10)
|
||||||
|
11. **Missing fields**: Systematic field omissions (no email in multiple rows)
|
||||||
|
|
||||||
#### Question Types
|
#### Question Types
|
||||||
|
|
||||||
204 questions are generated dynamically across four categories:
|
209 questions are generated dynamically across five categories:
|
||||||
|
|
||||||
- **Field retrieval (33%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
- **Field retrieval (33%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
||||||
- Example: "What is Alice's salary?" → `75000`
|
- Example: "What is Alice's salary?" → `75000`
|
||||||
- Example: "How many items are in order ORD-0042?" → `3`
|
- Example: "How many items are in order ORD-0042?" → `3`
|
||||||
- Example: "What is the customer name for order ORD-0042?" → `John Doe`
|
- Example: "What is the customer name for order ORD-0042?" → `John Doe`
|
||||||
|
|
||||||
- **Aggregation (31%)**: Dataset-level totals and averages plus single-condition filters (counts, sums, min/max comparisons)
|
- **Aggregation (30%)**: Dataset-level totals and averages plus single-condition filters (counts, sums, min/max comparisons)
|
||||||
- Example: "How many employees work in Engineering?" → `17`
|
- Example: "How many employees work in Engineering?" → `17`
|
||||||
- Example: "What is the total revenue across all orders?" → `45123.50`
|
- Example: "What is the total revenue across all orders?" → `45123.50`
|
||||||
- Example: "How many employees have salary > 80000?" → `23`
|
- Example: "How many employees have salary > 80000?" → `23`
|
||||||
|
|
||||||
- **Filtering (24%)**: Multi-condition queries requiring compound logic (AND constraints across fields)
|
- **Filtering (23%)**: Multi-condition queries requiring compound logic (AND constraints across fields)
|
||||||
- Example: "How many employees in Sales have salary > 80000?" → `5`
|
- Example: "How many employees in Sales have salary > 80000?" → `5`
|
||||||
- Example: "How many active employees have more than 10 years of experience?" → `8`
|
- Example: "How many active employees have more than 10 years of experience?" → `8`
|
||||||
|
|
||||||
@@ -247,17 +317,22 @@ Six datasets designed to test different structural patterns:
|
|||||||
- Example: "List the field names for employees" → `id, name, email, department, salary, yearsExperience, active`
|
- Example: "List the field names for employees" → `id, name, email, department, salary, yearsExperience, active`
|
||||||
- Example: "What is the department of the last employee?" → `Sales`
|
- Example: "What is the department of the last employee?" → `Sales`
|
||||||
|
|
||||||
|
- **Structural validation (2%)**: Tests ability to detect incomplete, truncated, or corrupted data using structural metadata
|
||||||
|
- Example: "Is this data complete and valid?" → `YES` (control dataset) or `NO` (corrupted datasets)
|
||||||
|
- Tests TOON's [N] length validation and {fields} consistency checking
|
||||||
|
- Demonstrates CSV's lack of structural validation capabilities
|
||||||
|
|
||||||
#### Evaluation Process
|
#### Evaluation Process
|
||||||
|
|
||||||
1. **Format conversion**: Each dataset is converted to all 6 formats (TOON, JSON compact, JSON, YAML, XML, CSV).
|
1. **Format conversion**: Each dataset is converted to all 6 formats (TOON, JSON compact, JSON, CSV, YAML, XML).
|
||||||
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
||||||
3. **Validate with LLM-as-judge**: `gpt-5-nano` validates if the answer is semantically correct (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`).
|
3. **Validate deterministically**: Answers are validated using type-aware comparison (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`) without requiring an LLM judge.
|
||||||
|
|
||||||
#### Models & Configuration
|
#### Models & Configuration
|
||||||
|
|
||||||
- **Models tested**: `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `gpt-5-nano`, `grok-4-fast-non-reasoning`
|
- **Models tested**: `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `gpt-5-nano`, `grok-4-fast-non-reasoning`
|
||||||
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
|
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
|
||||||
- **Temperature**: Not set (models use their defaults)
|
- **Temperature**: Not set (models use their defaults)
|
||||||
- **Total evaluations**: 204 questions × 6 formats × 4 models = 4,896 LLM calls
|
- **Total evaluations**: 209 questions × 6 formats × 4 models = 5,016 LLM calls
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|||||||
@@ -5,19 +5,19 @@ Datasets with nested or semi-uniform structures. CSV excluded as it cannot prope
|
|||||||
```
|
```
|
||||||
🛒 E-commerce orders with nested structures ┊ Tabular: 33%
|
🛒 E-commerce orders with nested structures ┊ Tabular: 33%
|
||||||
│
|
│
|
||||||
TOON █████████████░░░░░░░ 72,743 tokens
|
TOON █████████████░░░░░░░ 72,771 tokens
|
||||||
├─ vs JSON (−33.1%) 108,731 tokens
|
├─ vs JSON (−33.1%) 108,806 tokens
|
||||||
├─ vs JSON compact (+5.5%) 68,936 tokens
|
├─ vs JSON compact (+5.5%) 68,975 tokens
|
||||||
├─ vs YAML (−14.1%) 84,724 tokens
|
├─ vs YAML (−14.2%) 84,780 tokens
|
||||||
└─ vs XML (−40.5%) 122,313 tokens
|
└─ vs XML (−40.5%) 122,406 tokens
|
||||||
|
|
||||||
🧾 Semi-uniform event logs ┊ Tabular: 50%
|
🧾 Semi-uniform event logs ┊ Tabular: 50%
|
||||||
│
|
│
|
||||||
TOON █████████████████░░░ 153,223 tokens
|
TOON █████████████████░░░ 153,211 tokens
|
||||||
├─ vs JSON (−15.0%) 180,196 tokens
|
├─ vs JSON (−15.0%) 180,176 tokens
|
||||||
├─ vs JSON compact (+19.9%) 127,740 tokens
|
├─ vs JSON compact (+19.9%) 127,731 tokens
|
||||||
├─ vs YAML (−0.8%) 154,514 tokens
|
├─ vs YAML (−0.8%) 154,505 tokens
|
||||||
└─ vs XML (−25.2%) 204,800 tokens
|
└─ vs XML (−25.2%) 204,777 tokens
|
||||||
|
|
||||||
🧩 Deeply nested configuration ┊ Tabular: 0%
|
🧩 Deeply nested configuration ┊ Tabular: 0%
|
||||||
│
|
│
|
||||||
@@ -28,11 +28,11 @@ Datasets with nested or semi-uniform structures. CSV excluded as it cannot prope
|
|||||||
└─ vs XML (−37.4%) 1,008 tokens
|
└─ vs XML (−37.4%) 1,008 tokens
|
||||||
|
|
||||||
──────────────────────────────────── Total ────────────────────────────────────
|
──────────────────────────────────── Total ────────────────────────────────────
|
||||||
TOON ████████████████░░░░ 226,597 tokens
|
TOON ████████████████░░░░ 226,613 tokens
|
||||||
├─ vs JSON (−21.8%) 289,846 tokens
|
├─ vs JSON (−21.8%) 289,901 tokens
|
||||||
├─ vs JSON compact (+14.9%) 197,240 tokens
|
├─ vs JSON compact (+14.9%) 197,270 tokens
|
||||||
├─ vs YAML (−5.5%) 239,911 tokens
|
├─ vs YAML (−5.6%) 239,958 tokens
|
||||||
└─ vs XML (−30.9%) 328,121 tokens
|
└─ vs XML (−31.0%) 328,191 tokens
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Flat-Only Track
|
#### Flat-Only Track
|
||||||
@@ -42,21 +42,21 @@ Datasets with flat tabular structures where CSV is applicable.
|
|||||||
```
|
```
|
||||||
👥 Uniform employee records ┊ Tabular: 100%
|
👥 Uniform employee records ┊ Tabular: 100%
|
||||||
│
|
│
|
||||||
CSV ███████████████████░ 46,956 tokens
|
CSV ███████████████████░ 46,954 tokens
|
||||||
TOON ████████████████████ 49,827 tokens (+6.1% vs CSV)
|
TOON ████████████████████ 49,831 tokens (+6.1% vs CSV)
|
||||||
├─ vs JSON (−60.7%) 126,854 tokens
|
├─ vs JSON (−60.7%) 126,860 tokens
|
||||||
├─ vs JSON compact (−36.8%) 78,850 tokens
|
├─ vs JSON compact (−36.8%) 78,856 tokens
|
||||||
├─ vs YAML (−50.0%) 99,701 tokens
|
├─ vs YAML (−50.0%) 99,706 tokens
|
||||||
└─ vs XML (−66.0%) 146,440 tokens
|
└─ vs XML (−66.0%) 146,444 tokens
|
||||||
|
|
||||||
📈 Time-series analytics data ┊ Tabular: 100%
|
📈 Time-series analytics data ┊ Tabular: 100%
|
||||||
│
|
│
|
||||||
CSV ██████████████████░░ 8,396 tokens
|
CSV ██████████████████░░ 8,388 tokens
|
||||||
TOON ████████████████████ 9,128 tokens (+8.7% vs CSV)
|
TOON ████████████████████ 9,120 tokens (+8.7% vs CSV)
|
||||||
├─ vs JSON (−59.0%) 22,258 tokens
|
├─ vs JSON (−59.0%) 22,250 tokens
|
||||||
├─ vs JSON compact (−35.8%) 14,224 tokens
|
├─ vs JSON compact (−35.8%) 14,216 tokens
|
||||||
├─ vs YAML (−48.9%) 17,871 tokens
|
├─ vs YAML (−48.9%) 17,863 tokens
|
||||||
└─ vs XML (−65.7%) 26,629 tokens
|
└─ vs XML (−65.7%) 26,621 tokens
|
||||||
|
|
||||||
⭐ Top 100 GitHub repositories ┊ Tabular: 100%
|
⭐ Top 100 GitHub repositories ┊ Tabular: 100%
|
||||||
│
|
│
|
||||||
@@ -68,12 +68,12 @@ Datasets with flat tabular structures where CSV is applicable.
|
|||||||
└─ vs XML (−48.8%) 17,095 tokens
|
└─ vs XML (−48.8%) 17,095 tokens
|
||||||
|
|
||||||
──────────────────────────────────── Total ────────────────────────────────────
|
──────────────────────────────────── Total ────────────────────────────────────
|
||||||
CSV ███████████████████░ 63,865 tokens
|
CSV ███████████████████░ 63,855 tokens
|
||||||
TOON ████████████████████ 67,700 tokens (+6.0% vs CSV)
|
TOON ████████████████████ 67,696 tokens (+6.0% vs CSV)
|
||||||
├─ vs JSON (−58.8%) 164,257 tokens
|
├─ vs JSON (−58.8%) 164,255 tokens
|
||||||
├─ vs JSON compact (−35.2%) 104,529 tokens
|
├─ vs JSON compact (−35.2%) 104,527 tokens
|
||||||
├─ vs YAML (−48.2%) 130,701 tokens
|
├─ vs YAML (−48.2%) 130,698 tokens
|
||||||
└─ vs XML (−64.4%) 190,164 tokens
|
└─ vs XML (−64.4%) 190,160 tokens
|
||||||
```
|
```
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
@@ -83,64 +83,64 @@ Datasets with flat tabular structures where CSV is applicable.
|
|||||||
|
|
||||||
**Savings:** 13,130 tokens (59.0% reduction vs JSON)
|
**Savings:** 13,130 tokens (59.0% reduction vs JSON)
|
||||||
|
|
||||||
**JSON** (22,258 tokens):
|
**JSON** (22,250 tokens):
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"metrics": [
|
"metrics": [
|
||||||
{
|
{
|
||||||
"date": "2025-01-01",
|
"date": "2025-01-01",
|
||||||
"views": 7708,
|
"views": 5715,
|
||||||
"clicks": 595,
|
"clicks": 211,
|
||||||
"conversions": 69,
|
"conversions": 28,
|
||||||
"revenue": 15369.93,
|
"revenue": 7976.46,
|
||||||
"bounceRate": 0.35
|
"bounceRate": 0.47
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"date": "2025-01-02",
|
"date": "2025-01-02",
|
||||||
"views": 5894,
|
"views": 7103,
|
||||||
"clicks": 381,
|
"clicks": 393,
|
||||||
"conversions": 21,
|
"conversions": 28,
|
||||||
"revenue": 2112.12,
|
"revenue": 8360.53,
|
||||||
"bounceRate": 0.3
|
"bounceRate": 0.32
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"date": "2025-01-03",
|
"date": "2025-01-03",
|
||||||
"views": 6835,
|
"views": 7248,
|
||||||
"clicks": 422,
|
"clicks": 378,
|
||||||
"conversions": 35,
|
"conversions": 24,
|
||||||
"revenue": 4525.73,
|
"revenue": 3212.57,
|
||||||
"bounceRate": 0.5
|
"bounceRate": 0.5
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"date": "2025-01-04",
|
"date": "2025-01-04",
|
||||||
"views": 5325,
|
"views": 2927,
|
||||||
"clicks": 305,
|
"clicks": 77,
|
||||||
"conversions": 22,
|
"conversions": 11,
|
||||||
"revenue": 2445.3,
|
"revenue": 1211.69,
|
||||||
"bounceRate": 0.44
|
"bounceRate": 0.62
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"date": "2025-01-05",
|
"date": "2025-01-05",
|
||||||
"views": 2974,
|
"views": 3530,
|
||||||
"clicks": 61,
|
"clicks": 82,
|
||||||
"conversions": 6,
|
"conversions": 8,
|
||||||
"revenue": 956.57,
|
"revenue": 462.77,
|
||||||
"bounceRate": 0.47
|
"bounceRate": 0.56
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
**TOON** (9,128 tokens):
|
**TOON** (9,120 tokens):
|
||||||
|
|
||||||
```
|
```
|
||||||
metrics[5]{date,views,clicks,conversions,revenue,bounceRate}:
|
metrics[5]{date,views,clicks,conversions,revenue,bounceRate}:
|
||||||
2025-01-01,7708,595,69,15369.93,0.35
|
2025-01-01,5715,211,28,7976.46,0.47
|
||||||
2025-01-02,5894,381,21,2112.12,0.3
|
2025-01-02,7103,393,28,8360.53,0.32
|
||||||
2025-01-03,6835,422,35,4525.73,0.5
|
2025-01-03,7248,378,24,3212.57,0.5
|
||||||
2025-01-04,5325,305,22,2445.3,0.44
|
2025-01-04,2927,77,11,1211.69,0.62
|
||||||
2025-01-05,2974,61,6,956.57,0.47
|
2025-01-05,3530,82,8,462.77,0.56
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|||||||
@@ -56,9 +56,11 @@ export const FORMATTER_DISPLAY_NAMES: Record<string, string> = {
|
|||||||
*/
|
*/
|
||||||
export const QUESTION_TYPES = [
|
export const QUESTION_TYPES = [
|
||||||
'field-retrieval',
|
'field-retrieval',
|
||||||
|
'retrieval',
|
||||||
'aggregation',
|
'aggregation',
|
||||||
'filtering',
|
'filtering',
|
||||||
'structure-awareness',
|
'structure-awareness',
|
||||||
|
'structural-validation',
|
||||||
] as const
|
] as const
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -66,9 +68,11 @@ export const QUESTION_TYPES = [
|
|||||||
*/
|
*/
|
||||||
export const QUESTION_TYPE_LABELS = {
|
export const QUESTION_TYPE_LABELS = {
|
||||||
'field-retrieval': 'Field Retrieval',
|
'field-retrieval': 'Field Retrieval',
|
||||||
|
'retrieval': 'Retrieval',
|
||||||
'aggregation': 'Aggregation',
|
'aggregation': 'Aggregation',
|
||||||
'filtering': 'Filtering',
|
'filtering': 'Filtering',
|
||||||
'structure-awareness': 'Structure Awareness',
|
'structure-awareness': 'Structure Awareness',
|
||||||
|
'structural-validation': 'Structural Validation',
|
||||||
} as const
|
} as const
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -81,6 +85,12 @@ export const DATASET_NAMES = [
|
|||||||
'github',
|
'github',
|
||||||
'event-logs',
|
'event-logs',
|
||||||
'nested-config',
|
'nested-config',
|
||||||
|
'large-uniform',
|
||||||
|
'structural-validation-control',
|
||||||
|
'structural-validation-truncated',
|
||||||
|
'structural-validation-extra-rows',
|
||||||
|
'structural-validation-width-mismatch',
|
||||||
|
'structural-validation-missing-fields',
|
||||||
] as const
|
] as const
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -144,6 +144,30 @@ export interface NestedConfig {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Product structure for large uniform arrays
|
||||||
|
*/
|
||||||
|
export interface Product {
|
||||||
|
sku: string
|
||||||
|
name: string
|
||||||
|
category: string
|
||||||
|
price: number
|
||||||
|
qty: number
|
||||||
|
lastUpdated: string
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Internal types for structural validation pattern generation
|
||||||
|
*/
|
||||||
|
type StructuralValidationType = 'truncated' | 'extra-rows' | 'width-mismatch' | 'missing-fields'
|
||||||
|
|
||||||
|
interface StructuralValidationFixture {
|
||||||
|
type: StructuralValidationType
|
||||||
|
description: string
|
||||||
|
data: Record<string, unknown>
|
||||||
|
isValid: boolean
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate analytics time-series data
|
* Generate analytics time-series data
|
||||||
*/
|
*/
|
||||||
@@ -505,6 +529,100 @@ export function generateNestedConfig(): NestedConfig {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate large uniform product array (5000+ rows)
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Tests TOON's token efficiency and structural reliability at scale.
|
||||||
|
*/
|
||||||
|
export function generateProducts(count: number): { products: Product[] } {
|
||||||
|
const categories = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books', 'Toys'] as const
|
||||||
|
|
||||||
|
return {
|
||||||
|
products: Array.from({ length: count }, (_, i): Product => ({
|
||||||
|
sku: `SKU-${String(i + 1).padStart(6, '0')}`,
|
||||||
|
name: faker.commerce.productName(),
|
||||||
|
category: categories[i % categories.length]!,
|
||||||
|
price: Number(faker.commerce.price({ min: 5, max: 500 })),
|
||||||
|
qty: faker.number.int({ min: 0, max: 1000 }),
|
||||||
|
lastUpdated: faker.date.recent({ days: 30 }).toISOString().split('T')[0]!,
|
||||||
|
})),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate structural validation fixtures from employee data
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Creates deliberately corrupted datasets to test TOON's structural validation
|
||||||
|
* capabilities via [N] length declarations and {fields} headers.
|
||||||
|
* Internal function used to generate structural validation datasets.
|
||||||
|
*/
|
||||||
|
function generateStructuralValidationFixtures(): StructuralValidationFixture[] {
|
||||||
|
const baseData = generateEmployees(20)
|
||||||
|
|
||||||
|
return [
|
||||||
|
// Valid baseline
|
||||||
|
{
|
||||||
|
type: 'truncated' as const,
|
||||||
|
description: 'Valid complete dataset (control)',
|
||||||
|
data: { employees: baseData.employees },
|
||||||
|
isValid: true,
|
||||||
|
},
|
||||||
|
// Truncated array (missing last 3 rows)
|
||||||
|
{
|
||||||
|
type: 'truncated' as const,
|
||||||
|
description: 'Array truncated: 3 rows removed from end',
|
||||||
|
data: { employees: baseData.employees.slice(0, -3) },
|
||||||
|
isValid: false, // [N] won't match actual row count in TOON
|
||||||
|
},
|
||||||
|
// Extra rows (3 more than original)
|
||||||
|
{
|
||||||
|
type: 'extra-rows' as const,
|
||||||
|
description: 'Extra rows added beyond declared length',
|
||||||
|
data: {
|
||||||
|
employees: [
|
||||||
|
...baseData.employees,
|
||||||
|
...generateEmployees(3).employees,
|
||||||
|
],
|
||||||
|
},
|
||||||
|
isValid: false, // [N] won't match actual row count in TOON
|
||||||
|
},
|
||||||
|
// Width mismatch (inconsistent field count)
|
||||||
|
{
|
||||||
|
type: 'width-mismatch' as const,
|
||||||
|
description: 'Inconsistent field count (missing salary in row 10)',
|
||||||
|
data: {
|
||||||
|
employees: baseData.employees.map((emp, i) => {
|
||||||
|
if (i === 9) {
|
||||||
|
// Row 10, missing salary field
|
||||||
|
const { salary, ...rest } = emp
|
||||||
|
return rest
|
||||||
|
}
|
||||||
|
return emp
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
isValid: false, // Not all objects have same fields (tabular requirement)
|
||||||
|
},
|
||||||
|
// Missing required fields
|
||||||
|
{
|
||||||
|
type: 'missing-fields' as const,
|
||||||
|
description: 'Missing required fields (no email in multiple rows)',
|
||||||
|
data: {
|
||||||
|
employees: baseData.employees.map((emp, i) => {
|
||||||
|
if (i % 5 === 0) {
|
||||||
|
// Every 5th row, missing email
|
||||||
|
const { email, ...rest } = emp
|
||||||
|
return rest
|
||||||
|
}
|
||||||
|
return emp
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
isValid: false, // Not all objects have same fields (tabular requirement)
|
||||||
|
},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Event logs dataset: Semi-uniform structure
|
* Event logs dataset: Semi-uniform structure
|
||||||
*
|
*
|
||||||
@@ -539,6 +657,34 @@ const nestedConfigDataset: Dataset = {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Structural validation datasets: Tests ability to detect incomplete, truncated, or corrupted data
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* These datasets test TOON's structural validation advantages via [N] length declarations
|
||||||
|
* and {fields} headers. CSV is included to demonstrate its lack of structural metadata.
|
||||||
|
*/
|
||||||
|
const structuralValidationDatasets: Dataset[] = generateStructuralValidationFixtures().map((fixture, index) => {
|
||||||
|
const datasetNames = [
|
||||||
|
'structural-validation-control',
|
||||||
|
'structural-validation-truncated',
|
||||||
|
'structural-validation-extra-rows',
|
||||||
|
'structural-validation-width-mismatch',
|
||||||
|
'structural-validation-missing-fields',
|
||||||
|
] as const
|
||||||
|
|
||||||
|
return {
|
||||||
|
name: datasetNames[index]!,
|
||||||
|
description: fixture.description,
|
||||||
|
data: fixture.data,
|
||||||
|
metadata: {
|
||||||
|
supportsCSV: true, // Include CSV to show it can't validate structure
|
||||||
|
structureClass: 'uniform',
|
||||||
|
tabularEligibility: 100,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Datasets for accuracy benchmarks (smaller sizes for faster evaluation)
|
* Datasets for accuracy benchmarks (smaller sizes for faster evaluation)
|
||||||
*/
|
*/
|
||||||
@@ -549,6 +695,7 @@ export const ACCURACY_DATASETS: Dataset[] = [
|
|||||||
githubDataset, // 100 repos
|
githubDataset, // 100 repos
|
||||||
eventLogsDataset, // 75 logs
|
eventLogsDataset, // 75 logs
|
||||||
nestedConfigDataset, // 1 config
|
nestedConfigDataset, // 1 config
|
||||||
|
...structuralValidationDatasets, // 5 validation fixtures
|
||||||
]
|
]
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import { google } from '@ai-sdk/google'
|
|||||||
import { openai } from '@ai-sdk/openai'
|
import { openai } from '@ai-sdk/openai'
|
||||||
import { xai } from '@ai-sdk/xai'
|
import { xai } from '@ai-sdk/xai'
|
||||||
import { generateText } from 'ai'
|
import { generateText } from 'ai'
|
||||||
|
import { compareAnswers } from './normalize'
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Models used for evaluation
|
* Models used for evaluation
|
||||||
@@ -74,7 +75,13 @@ ${formattedData}
|
|||||||
|
|
||||||
Question: ${question.prompt}
|
Question: ${question.prompt}
|
||||||
|
|
||||||
Provide only the direct answer, without any additional explanation or formatting.
|
Answer format requirements:
|
||||||
|
- Provide only the value itself, no explanation
|
||||||
|
- For numbers: output digits only (no commas, currency symbols, or units)
|
||||||
|
- For dates/field names: use the exact string from the data
|
||||||
|
- For lists: output comma-separated values with no spaces
|
||||||
|
|
||||||
|
Answer:
|
||||||
`.trim()
|
`.trim()
|
||||||
|
|
||||||
const startTime = performance.now()
|
const startTime = performance.now()
|
||||||
@@ -83,11 +90,13 @@ Provide only the direct answer, without any additional explanation or formatting
|
|||||||
const actual = text.trim()
|
const actual = text.trim()
|
||||||
const latencyMs = performance.now() - startTime
|
const latencyMs = performance.now() - startTime
|
||||||
|
|
||||||
const isCorrect = await validateAnswer({
|
const comparisonResult = compareAnswers(
|
||||||
actual,
|
actual,
|
||||||
expected: question.groundTruth,
|
question.groundTruth,
|
||||||
question: question.prompt,
|
question.answerType ?? 'string',
|
||||||
})
|
question.normalizationOptions,
|
||||||
|
)
|
||||||
|
const isCorrect = comparisonResult.match
|
||||||
|
|
||||||
return {
|
return {
|
||||||
questionId: question.id,
|
questionId: question.id,
|
||||||
@@ -101,42 +110,3 @@ Provide only the direct answer, without any additional explanation or formatting
|
|||||||
latencyMs,
|
latencyMs,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Validate an answer using LLM-as-judge approach
|
|
||||||
*/
|
|
||||||
async function validateAnswer(
|
|
||||||
{
|
|
||||||
actual,
|
|
||||||
expected,
|
|
||||||
question,
|
|
||||||
}:
|
|
||||||
{
|
|
||||||
actual: string
|
|
||||||
expected: string
|
|
||||||
question: string
|
|
||||||
},
|
|
||||||
): Promise<boolean> {
|
|
||||||
const prompt = `
|
|
||||||
You are validating answers to questions about structured data.
|
|
||||||
|
|
||||||
Question: ${question}
|
|
||||||
Expected answer: ${expected}
|
|
||||||
Actual answer: ${actual}
|
|
||||||
|
|
||||||
Is the actual answer correct? Consider:
|
|
||||||
- Exact matches are correct
|
|
||||||
- Semantically equivalent answers are correct (e.g., "50000" vs "$50,000" vs "50000 dollars")
|
|
||||||
- Minor formatting differences are acceptable
|
|
||||||
- Case-insensitive comparison for text
|
|
||||||
|
|
||||||
Respond with only "YES" or "NO".
|
|
||||||
`.trim()
|
|
||||||
|
|
||||||
const { text } = await generateText({
|
|
||||||
model: models.find(m => m.modelId === 'gpt-5-nano')!,
|
|
||||||
prompt,
|
|
||||||
})
|
|
||||||
|
|
||||||
return text.trim().toUpperCase() === 'YES'
|
|
||||||
}
|
|
||||||
|
|||||||
386
benchmarks/src/normalize.ts
Normal file
386
benchmarks/src/normalize.ts
Normal file
@@ -0,0 +1,386 @@
|
|||||||
|
/**
|
||||||
|
* Type of expected answer for deterministic comparison
|
||||||
|
*/
|
||||||
|
export type AnswerType
|
||||||
|
= | 'integer'
|
||||||
|
| 'number'
|
||||||
|
| 'boolean'
|
||||||
|
| 'date'
|
||||||
|
| 'string'
|
||||||
|
| 'csv-list-ordered'
|
||||||
|
| 'csv-list-unordered'
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Options for answer normalization and comparison
|
||||||
|
*/
|
||||||
|
export interface NormalizationOptions {
|
||||||
|
/**
|
||||||
|
* Tolerance for floating-point number comparison (e.g., 1e-6).
|
||||||
|
* @default 1e-6
|
||||||
|
*/
|
||||||
|
tolerance?: number
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Whether string comparison should be case-sensitive.
|
||||||
|
* @default false
|
||||||
|
*/
|
||||||
|
caseSensitive?: boolean
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Allow currency symbols ($, €, etc.) in number extraction.
|
||||||
|
* @default true
|
||||||
|
*/
|
||||||
|
allowCurrency?: boolean
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Allow percent signs (%) in number extraction (will divide by 100).
|
||||||
|
* @default true
|
||||||
|
*/
|
||||||
|
allowPercent?: boolean
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of decimal places to round to for number comparison.
|
||||||
|
* If specified, overrides tolerance-based comparison.
|
||||||
|
*/
|
||||||
|
decimalPlaces?: number
|
||||||
|
}
|
||||||
|
|
||||||
|
interface NormalizedResult {
|
||||||
|
success: boolean
|
||||||
|
value?: unknown
|
||||||
|
error?: string
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default normalization options
|
||||||
|
*/
|
||||||
|
const DEFAULT_OPTIONS: Required<NormalizationOptions> = {
|
||||||
|
tolerance: 1e-6,
|
||||||
|
caseSensitive: false,
|
||||||
|
allowCurrency: true,
|
||||||
|
allowPercent: true,
|
||||||
|
decimalPlaces: undefined!,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Regex pattern constants
|
||||||
|
const INTEGER_PATTERN_WITH_CURRENCY = /[$€£¥]?\s*-?\d[\d,]*/
|
||||||
|
const INTEGER_PATTERN = /-?\d[\d,]*/
|
||||||
|
const NUMBER_PATTERN_WITH_CURRENCY = /[$€£¥]?\s*-?\d[\d,]*(?:\.\d+)?(?:e[+-]?\d+)?%?/i
|
||||||
|
const NUMBER_PATTERN = /-?\d[\d,]*(?:\.\d+)?(?:e[+-]?\d+)?%?/i
|
||||||
|
const WRAPPING_QUOTES_PATTERN = /^["']|["']$/g
|
||||||
|
const CODE_FENCE_PATTERN = /^```[\s\S]*?```$/g
|
||||||
|
const LANGUAGE_IDENTIFIER_PATTERN = /^\w+\n/
|
||||||
|
const CURRENCY_AND_FORMATTING_CHARS = /[$€£¥,\s]/g
|
||||||
|
const NUMBER_CLEANUP_CHARS = /[$€£¥,%\s]/g
|
||||||
|
|
||||||
|
// Boolean value constants
|
||||||
|
const TRUE_VALUES = new Set(['true', 'yes', 'y', '1'])
|
||||||
|
const FALSE_VALUES = new Set(['false', 'no', 'n', '0'])
|
||||||
|
|
||||||
|
// Numeric constants
|
||||||
|
const PERCENTAGE_DIVISOR = 100
|
||||||
|
const DECIMAL_BASE = 10
|
||||||
|
const MONTH_OFFSET = 1 // JavaScript months are 0-indexed
|
||||||
|
const DATE_COMPONENT_WIDTH = 2
|
||||||
|
const DATE_PAD_CHAR = '0'
|
||||||
|
|
||||||
|
// String constants
|
||||||
|
const CSV_DELIMITER = ','
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Strip wrapping quotes from a string
|
||||||
|
*/
|
||||||
|
function stripWrappingQuotes(text: string): string {
|
||||||
|
return text.trim().replace(WRAPPING_QUOTES_PATTERN, '')
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract and normalize an integer from a string
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Handles: "42", "1,234", "$5,678", " -99 ", "The answer is 42."
|
||||||
|
*/
|
||||||
|
function normalizeInteger(text: string, options: Required<NormalizationOptions>): NormalizedResult {
|
||||||
|
// Strip common formatting, extract first integer-like token
|
||||||
|
const pattern = options.allowCurrency
|
||||||
|
? INTEGER_PATTERN_WITH_CURRENCY
|
||||||
|
: INTEGER_PATTERN
|
||||||
|
|
||||||
|
const match = text.match(pattern)
|
||||||
|
if (!match)
|
||||||
|
return { success: false, error: `No integer found in: "${text}"` }
|
||||||
|
|
||||||
|
// Remove currency symbols, spaces, and thousand separators
|
||||||
|
const normalizedValue = match[0].replace(CURRENCY_AND_FORMATTING_CHARS, '')
|
||||||
|
const parsedNumber = Number.parseInt(normalizedValue, DECIMAL_BASE)
|
||||||
|
|
||||||
|
if (Number.isNaN(parsedNumber))
|
||||||
|
return { success: false, error: `Failed to parse integer: "${match[0]}"` }
|
||||||
|
|
||||||
|
return { success: true, value: parsedNumber }
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract and normalize a floating-point number from a string
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Handles: "3.14", "1,234.56", "$5,678.90", "42%", "1.5e-3", "Price: $99.99"
|
||||||
|
*/
|
||||||
|
function normalizeNumber(text: string, options: Required<NormalizationOptions>): NormalizedResult {
|
||||||
|
// Extract first number-like token (supports scientific notation)
|
||||||
|
const pattern = options.allowCurrency
|
||||||
|
? NUMBER_PATTERN_WITH_CURRENCY
|
||||||
|
: NUMBER_PATTERN
|
||||||
|
|
||||||
|
const match = text.match(pattern)
|
||||||
|
if (!match)
|
||||||
|
return { success: false, error: `No number found in: "${text}"` }
|
||||||
|
|
||||||
|
const token = match[0]
|
||||||
|
const hasPercentSign = options.allowPercent && token.endsWith('%')
|
||||||
|
|
||||||
|
// Remove currency, commas, spaces, and percent sign
|
||||||
|
const normalizedToken = token.replace(NUMBER_CLEANUP_CHARS, '')
|
||||||
|
let parsedNumber = Number.parseFloat(normalizedToken)
|
||||||
|
|
||||||
|
if (Number.isNaN(parsedNumber))
|
||||||
|
return { success: false, error: `Failed to parse number: "${token}"` }
|
||||||
|
|
||||||
|
// Convert percentage to decimal if present
|
||||||
|
if (hasPercentSign)
|
||||||
|
parsedNumber = parsedNumber / PERCENTAGE_DIVISOR
|
||||||
|
|
||||||
|
// Round to specified decimal places if requested
|
||||||
|
if (options.decimalPlaces !== undefined) {
|
||||||
|
const factor = DECIMAL_BASE ** options.decimalPlaces
|
||||||
|
parsedNumber = Math.round(parsedNumber * factor) / factor
|
||||||
|
}
|
||||||
|
|
||||||
|
return { success: true, value: parsedNumber }
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize a boolean/yes-no answer
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Handles: "true", "false", "yes", "no", "y", "n", "1", "0" (case-insensitive)
|
||||||
|
*/
|
||||||
|
function normalizeBoolean(text: string): NormalizedResult {
|
||||||
|
const normalizedValue = text.trim().toLowerCase()
|
||||||
|
|
||||||
|
if (TRUE_VALUES.has(normalizedValue))
|
||||||
|
return { success: true, value: true }
|
||||||
|
|
||||||
|
if (FALSE_VALUES.has(normalizedValue))
|
||||||
|
return { success: true, value: false }
|
||||||
|
|
||||||
|
return { success: false, error: `Not a boolean: "${text}"` }
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize a date string to YYYY-MM-DD format
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Handles: ISO dates, "Nov 1, 2025", "2025-11-01", RFC 2822, etc.
|
||||||
|
*/
|
||||||
|
function normalizeDate(text: string): NormalizedResult {
|
||||||
|
const cleaned = stripWrappingQuotes(text)
|
||||||
|
|
||||||
|
// Try parsing as date
|
||||||
|
const parsedDate = new Date(cleaned)
|
||||||
|
if (Number.isNaN(parsedDate.getTime()))
|
||||||
|
return { success: false, error: `Invalid date: "${text}"` }
|
||||||
|
|
||||||
|
// Normalize to YYYY-MM-DD (UTC)
|
||||||
|
const year = parsedDate.getUTCFullYear()
|
||||||
|
const monthPadded = String(parsedDate.getUTCMonth() + MONTH_OFFSET).padStart(DATE_COMPONENT_WIDTH, DATE_PAD_CHAR)
|
||||||
|
const dayPadded = String(parsedDate.getUTCDate()).padStart(DATE_COMPONENT_WIDTH, DATE_PAD_CHAR)
|
||||||
|
const normalized = `${year}-${monthPadded}-${dayPadded}`
|
||||||
|
|
||||||
|
return { success: true, value: normalized }
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize a string (trim, optionally case-insensitive)
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Handles wrapping quotes and code fences.
|
||||||
|
*/
|
||||||
|
function normalizeString(text: string, options: Required<NormalizationOptions>): NormalizedResult {
|
||||||
|
let trimmedText = text.trim()
|
||||||
|
|
||||||
|
// Strip wrapping quotes
|
||||||
|
trimmedText = trimmedText.replace(WRAPPING_QUOTES_PATTERN, '')
|
||||||
|
|
||||||
|
// Strip code fences (```...```)
|
||||||
|
trimmedText = trimmedText.replace(CODE_FENCE_PATTERN, (match) => {
|
||||||
|
const inner = match.slice(3, -3).trim()
|
||||||
|
// Remove language identifier if present (e.g., ```json)
|
||||||
|
return inner.replace(LANGUAGE_IDENTIFIER_PATTERN, '')
|
||||||
|
})
|
||||||
|
|
||||||
|
trimmedText = trimmedText.trim()
|
||||||
|
|
||||||
|
const value = options.caseSensitive ? trimmedText : trimmedText.toLowerCase()
|
||||||
|
return { success: true, value }
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize a comma-separated list (ordered)
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Handles: "a,b,c", "a, b, c", " a , b , c "
|
||||||
|
*/
|
||||||
|
function normalizeCsvListOrdered(text: string, options: Required<NormalizationOptions>): NormalizedResult {
|
||||||
|
const strippedText = stripWrappingQuotes(text)
|
||||||
|
const items = strippedText
|
||||||
|
.split(CSV_DELIMITER)
|
||||||
|
.map(item => item.trim())
|
||||||
|
.filter(item => item.length > 0)
|
||||||
|
|
||||||
|
const normalizedItems = items.map(item =>
|
||||||
|
options.caseSensitive ? item : item.toLowerCase(),
|
||||||
|
)
|
||||||
|
|
||||||
|
return { success: true, value: normalizedItems }
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize a comma-separated list (unordered, compare as sets)
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Handles: "c,a,b" equals "a,b,c"
|
||||||
|
*/
|
||||||
|
function normalizeCsvListUnordered(text: string, options: Required<NormalizationOptions>): NormalizedResult {
|
||||||
|
const result = normalizeCsvListOrdered(text, options)
|
||||||
|
if (!result.success)
|
||||||
|
return result
|
||||||
|
|
||||||
|
// Type guard: ensure result.value is an array
|
||||||
|
if (!Array.isArray(result.value))
|
||||||
|
return { success: false, error: 'Expected array result from normalizeCsvListOrdered' }
|
||||||
|
|
||||||
|
// Sort for deterministic comparison
|
||||||
|
const sorted = [...result.value].sort()
|
||||||
|
return { success: true, value: sorted }
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize a value based on its expected kind
|
||||||
|
*/
|
||||||
|
export function normalizeAnswer(
|
||||||
|
text: string,
|
||||||
|
kind: AnswerType,
|
||||||
|
options: Partial<NormalizationOptions> = {},
|
||||||
|
): NormalizedResult {
|
||||||
|
const resolvedOptions: Required<NormalizationOptions> = { ...DEFAULT_OPTIONS, ...options }
|
||||||
|
|
||||||
|
switch (kind) {
|
||||||
|
case 'integer':
|
||||||
|
return normalizeInteger(text, resolvedOptions)
|
||||||
|
case 'number':
|
||||||
|
return normalizeNumber(text, resolvedOptions)
|
||||||
|
case 'boolean':
|
||||||
|
return normalizeBoolean(text)
|
||||||
|
case 'date':
|
||||||
|
return normalizeDate(text)
|
||||||
|
case 'string':
|
||||||
|
return normalizeString(text, resolvedOptions)
|
||||||
|
case 'csv-list-ordered':
|
||||||
|
return normalizeCsvListOrdered(text, resolvedOptions)
|
||||||
|
case 'csv-list-unordered':
|
||||||
|
return normalizeCsvListUnordered(text, resolvedOptions)
|
||||||
|
default:
|
||||||
|
return { success: false, error: `Unknown answer kind: ${kind}` }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compare two normalized values based on answer kind
|
||||||
|
*/
|
||||||
|
function compareValues(
|
||||||
|
actual: unknown,
|
||||||
|
expected: unknown,
|
||||||
|
kind: AnswerType,
|
||||||
|
options: Required<NormalizationOptions>,
|
||||||
|
): boolean {
|
||||||
|
switch (kind) {
|
||||||
|
case 'integer':
|
||||||
|
case 'boolean':
|
||||||
|
case 'date':
|
||||||
|
case 'string':
|
||||||
|
return actual === expected
|
||||||
|
|
||||||
|
case 'number':
|
||||||
|
if (typeof actual !== 'number' || typeof expected !== 'number')
|
||||||
|
return false
|
||||||
|
|
||||||
|
if (options.decimalPlaces !== undefined) {
|
||||||
|
// Already rounded during normalization
|
||||||
|
return actual === expected
|
||||||
|
}
|
||||||
|
return Math.abs(actual - expected) <= options.tolerance
|
||||||
|
|
||||||
|
case 'csv-list-ordered':
|
||||||
|
if (!Array.isArray(actual) || !Array.isArray(expected))
|
||||||
|
return false
|
||||||
|
if (actual.length !== expected.length)
|
||||||
|
return false
|
||||||
|
return actual.every((item, i) => item === expected[i])
|
||||||
|
|
||||||
|
case 'csv-list-unordered':
|
||||||
|
if (!Array.isArray(actual) || !Array.isArray(expected))
|
||||||
|
return false
|
||||||
|
if (actual.length !== expected.length)
|
||||||
|
return false
|
||||||
|
// Already sorted during normalization
|
||||||
|
return actual.every((item, i) => item === expected[i])
|
||||||
|
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compare actual and expected answers with deterministic, type-aware normalization
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Returns true if answers match within the specified tolerance/rules.
|
||||||
|
*/
|
||||||
|
export function compareAnswers(
|
||||||
|
actual: string,
|
||||||
|
expected: string,
|
||||||
|
kind: AnswerType,
|
||||||
|
options: Partial<NormalizationOptions> = {},
|
||||||
|
): { match: boolean, details?: string } {
|
||||||
|
const resolvedOptions: Required<NormalizationOptions> = { ...DEFAULT_OPTIONS, ...options }
|
||||||
|
|
||||||
|
// Normalize both answers
|
||||||
|
const actualResult = normalizeAnswer(actual, kind, resolvedOptions)
|
||||||
|
const expectedResult = normalizeAnswer(expected, kind, resolvedOptions)
|
||||||
|
|
||||||
|
// If either normalization failed, return false with details
|
||||||
|
if (!actualResult.success) {
|
||||||
|
return {
|
||||||
|
match: false,
|
||||||
|
details: `Failed to normalize actual answer: ${actualResult.error}`,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!expectedResult.success) {
|
||||||
|
return {
|
||||||
|
match: false,
|
||||||
|
details: `Failed to normalize expected answer: ${expectedResult.error}`,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compare normalized values
|
||||||
|
const match = compareValues(actualResult.value, expectedResult.value, kind, resolvedOptions)
|
||||||
|
|
||||||
|
return {
|
||||||
|
match,
|
||||||
|
details: match
|
||||||
|
? undefined
|
||||||
|
: `Mismatch: actual="${actualResult.value}" vs expected="${expectedResult.value}"`,
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -17,6 +17,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(metric.views))
|
.groundTruth(String(metric.views))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
(metric, getId) => new QuestionBuilder()
|
(metric, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -24,6 +25,8 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(metric.revenue))
|
.groundTruth(String(metric.revenue))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('number')
|
||||||
|
.normalize({ decimalPlaces: 2 })
|
||||||
.build(),
|
.build(),
|
||||||
(metric, getId) => new QuestionBuilder()
|
(metric, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -31,6 +34,8 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(metric.bounceRate))
|
.groundTruth(String(metric.bounceRate))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('number')
|
||||||
|
.normalize({ decimalPlaces: 2 })
|
||||||
.build(),
|
.build(),
|
||||||
(metric, getId) => new QuestionBuilder()
|
(metric, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -38,6 +43,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(metric.conversions))
|
.groundTruth(String(metric.conversions))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -63,6 +69,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(totalDays))
|
.groundTruth(String(totalDays))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -70,6 +77,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(totalViews))
|
.groundTruth(String(totalViews))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -77,6 +85,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(totalConversions))
|
.groundTruth(String(totalConversions))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -84,6 +93,8 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(totalRevenue.toFixed(2)))
|
.groundTruth(String(totalRevenue.toFixed(2)))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('number')
|
||||||
|
.normalize({ decimalPlaces: 2 })
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -91,6 +102,8 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(avgBounceRate.toFixed(2)))
|
.groundTruth(String(avgBounceRate.toFixed(2)))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('number')
|
||||||
|
.normalize({ decimalPlaces: 2 })
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -104,6 +117,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -117,6 +131,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -133,6 +148,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -149,6 +165,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -165,6 +182,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -181,6 +199,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(log.level)
|
.groundTruth(log.level)
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
(log, getId) => new QuestionBuilder()
|
(log, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -24,6 +25,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(log.endpoint)
|
.groundTruth(log.endpoint)
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
(log, getId) => new QuestionBuilder()
|
(log, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -31,6 +33,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(log.statusCode))
|
.groundTruth(String(log.statusCode))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
(log, getId) => new QuestionBuilder()
|
(log, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -38,6 +41,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(log.responseTime))
|
.groundTruth(String(log.responseTime))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -60,6 +64,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(totalLogs))
|
.groundTruth(String(totalLogs))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -67,6 +72,8 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(avgResponseTime.toFixed(2)))
|
.groundTruth(String(avgResponseTime.toFixed(2)))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('number')
|
||||||
|
.normalize({ decimalPlaces: 2 })
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -81,6 +88,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -96,6 +104,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -111,6 +120,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(errorCount))
|
.groundTruth(String(errorCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -118,6 +128,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(successCount))
|
.groundTruth(String(successCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -130,6 +141,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(retryableErrorCount))
|
.groundTruth(String(retryableErrorCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -147,6 +159,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -161,6 +174,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -175,6 +189,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(repo.stars))
|
.groundTruth(String(repo.stars))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
(repo, getId) => new QuestionBuilder()
|
(repo, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -24,6 +25,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(repo.forks))
|
.groundTruth(String(repo.forks))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
(repo, getId) => new QuestionBuilder()
|
(repo, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -31,6 +33,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(repo.watchers))
|
.groundTruth(String(repo.watchers))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
(repo, getId) => new QuestionBuilder()
|
(repo, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -38,6 +41,8 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(repo.defaultBranch)
|
.groundTruth(repo.defaultBranch)
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('string')
|
||||||
|
.normalize({ caseSensitive: true })
|
||||||
.build(),
|
.build(),
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -62,6 +67,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(totalRepos))
|
.groundTruth(String(totalRepos))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -69,6 +75,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(totalStars))
|
.groundTruth(String(totalStars))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -76,6 +83,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(totalForks))
|
.groundTruth(String(totalForks))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -83,6 +91,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(Math.round(avgStars)))
|
.groundTruth(String(Math.round(avgStars)))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -97,6 +106,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -111,6 +121,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -125,6 +136,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -139,6 +151,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -155,6 +168,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -171,6 +185,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import { generateEventLogsQuestions } from './event-logs'
|
|||||||
import { generateGithubQuestions } from './github'
|
import { generateGithubQuestions } from './github'
|
||||||
import { generateNestedQuestions } from './nested'
|
import { generateNestedQuestions } from './nested'
|
||||||
import { generateNestedConfigQuestions } from './nested-config'
|
import { generateNestedConfigQuestions } from './nested-config'
|
||||||
|
import { generateStructuralValidationQuestions } from './structural-validation'
|
||||||
import { generateStructureQuestions } from './structure'
|
import { generateStructureQuestions } from './structure'
|
||||||
import { generateTabularQuestions } from './tabular'
|
import { generateTabularQuestions } from './tabular'
|
||||||
import { createIdGenerator } from './utils'
|
import { createIdGenerator } from './utils'
|
||||||
@@ -47,5 +48,8 @@ export function generateQuestions(): Question[] {
|
|||||||
// Generate structure-awareness questions (tests format-native affordances)
|
// Generate structure-awareness questions (tests format-native affordances)
|
||||||
questions.push(...generateStructureQuestions(tabular, nested, analytics, github, eventLogs, getId))
|
questions.push(...generateStructureQuestions(tabular, nested, analytics, github, eventLogs, getId))
|
||||||
|
|
||||||
|
// Generate structural-validation questions (tests ability to detect corrupted data)
|
||||||
|
questions.push(...generateStructuralValidationQuestions(getId))
|
||||||
|
|
||||||
return questions
|
return questions
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,42 +17,52 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
{
|
{
|
||||||
prompt: 'What is the environment in the configuration?',
|
prompt: 'What is the environment in the configuration?',
|
||||||
groundTruth: config.environment,
|
groundTruth: config.environment,
|
||||||
|
answerType: 'string' as const,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
prompt: 'What is the database host?',
|
prompt: 'What is the database host?',
|
||||||
groundTruth: config.database.host,
|
groundTruth: config.database.host,
|
||||||
|
answerType: 'string' as const,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
prompt: 'What is the database port?',
|
prompt: 'What is the database port?',
|
||||||
groundTruth: String(config.database.port),
|
groundTruth: String(config.database.port),
|
||||||
|
answerType: 'integer' as const,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
prompt: 'What is the maximum connection pool size?',
|
prompt: 'What is the maximum connection pool size?',
|
||||||
groundTruth: String(config.database.pool.max),
|
groundTruth: String(config.database.pool.max),
|
||||||
|
answerType: 'integer' as const,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
prompt: 'What is the session duration?',
|
prompt: 'What is the session duration?',
|
||||||
groundTruth: String(config.authentication.session.duration),
|
groundTruth: String(config.authentication.session.duration),
|
||||||
|
answerType: 'integer' as const,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
prompt: 'What is the minimum connection pool size?',
|
prompt: 'What is the minimum connection pool size?',
|
||||||
groundTruth: String(config.database.pool.min),
|
groundTruth: String(config.database.pool.min),
|
||||||
|
answerType: 'integer' as const,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
prompt: 'What is the connection pool idle timeout?',
|
prompt: 'What is the connection pool idle timeout?',
|
||||||
groundTruth: String(config.database.pool.idleTimeout),
|
groundTruth: String(config.database.pool.idleTimeout),
|
||||||
|
answerType: 'integer' as const,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
prompt: 'What is the database name?',
|
prompt: 'What is the database name?',
|
||||||
groundTruth: config.database.name,
|
groundTruth: config.database.name,
|
||||||
|
answerType: 'string' as const,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
prompt: 'What is the session refresh threshold?',
|
prompt: 'What is the session refresh threshold?',
|
||||||
groundTruth: String(config.authentication.session.refreshThreshold),
|
groundTruth: String(config.authentication.session.refreshThreshold),
|
||||||
|
answerType: 'integer' as const,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
prompt: 'What is the version in the configuration?',
|
prompt: 'What is the version in the configuration?',
|
||||||
groundTruth: config.version,
|
groundTruth: config.version,
|
||||||
|
answerType: 'string' as const,
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -64,6 +74,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(q.groundTruth)
|
.groundTruth(q.groundTruth)
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType(q.answerType)
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -82,6 +93,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(roleCount))
|
.groundTruth(String(roleCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -89,6 +101,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(groupCount))
|
.groundTruth(String(groupCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -96,6 +109,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(providerCount))
|
.groundTruth(String(providerCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -103,6 +117,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(featureCount))
|
.groundTruth(String(featureCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -110,6 +125,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(replicaCount))
|
.groundTruth(String(replicaCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -122,6 +138,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(adminScopeProviderCount))
|
.groundTruth(String(adminScopeProviderCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -134,6 +151,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(enabledFeatures))
|
.groundTruth(String(enabledFeatures))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -146,6 +164,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(adminPermissions))
|
.groundTruth(String(adminPermissions))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -164,6 +183,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(totalPermissions))
|
.groundTruth(String(totalPermissions))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -171,6 +191,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(distinctPermissions))
|
.groundTruth(String(distinctPermissions))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -178,6 +199,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(totalVariants))
|
.groundTruth(String(totalVariants))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -185,6 +207,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(highPriorityReplicas))
|
.groundTruth(String(highPriorityReplicas))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -192,6 +215,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(featuresWithHighRollout))
|
.groundTruth(String(featuresWithHighRollout))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -199,6 +223,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(String(groupsWithMultipleRoles))
|
.groundTruth(String(groupsWithMultipleRoles))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -249,6 +274,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.groundTruth(q.groundTruth)
|
.groundTruth(q.groundTruth)
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,6 +17,8 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(order.total))
|
.groundTruth(String(order.total))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('number')
|
||||||
|
.normalize({ decimalPlaces: 2 })
|
||||||
.build(),
|
.build(),
|
||||||
(order, getId) => new QuestionBuilder()
|
(order, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -24,6 +26,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(order.status)
|
.groundTruth(order.status)
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -43,6 +46,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(order.customer.name)
|
.groundTruth(order.customer.name)
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
(order, getId) => new QuestionBuilder()
|
(order, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -50,6 +54,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(order.customer.email)
|
.groundTruth(order.customer.email)
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
(order, getId) => new QuestionBuilder()
|
(order, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -57,6 +62,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(order.orderDate || '')
|
.groundTruth(order.orderDate || '')
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
(order, getId) => new QuestionBuilder()
|
(order, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -64,6 +70,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(order.items.length))
|
.groundTruth(String(order.items.length))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -94,6 +101,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -105,6 +113,8 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(totalRevenue.toFixed(2)))
|
.groundTruth(String(totalRevenue.toFixed(2)))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('number')
|
||||||
|
.normalize({ decimalPlaces: 2 })
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -112,6 +122,8 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(avgOrderValue.toFixed(2)))
|
.groundTruth(String(avgOrderValue.toFixed(2)))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('number')
|
||||||
|
.normalize({ decimalPlaces: 2 })
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -119,6 +131,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(totalOrders))
|
.groundTruth(String(totalOrders))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -126,6 +139,8 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(maxOrderValue.toFixed(2)))
|
.groundTruth(String(maxOrderValue.toFixed(2)))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('number')
|
||||||
|
.normalize({ decimalPlaces: 2 })
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -139,6 +154,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -156,6 +172,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -172,6 +189,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -188,6 +206,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|||||||
44
benchmarks/src/questions/structural-validation.ts
Normal file
44
benchmarks/src/questions/structural-validation.ts
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
import type { Question } from '../types'
|
||||||
|
import { QuestionBuilder } from './utils'
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate structural validation questions for all incompleteness fixtures
|
||||||
|
*
|
||||||
|
* These questions test the ability to detect incomplete, truncated, or corrupted data
|
||||||
|
* by validating structural metadata (TOON's [N] length declarations and {fields} headers).
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* - TOON's advantage: Explicit [N] and {fields} enable validation
|
||||||
|
* - CSV disadvantage: No structural metadata to validate against
|
||||||
|
* - JSON/YAML disadvantage: Require manual counting and schema inference
|
||||||
|
*/
|
||||||
|
export function generateStructuralValidationQuestions(
|
||||||
|
getId: () => string,
|
||||||
|
): Question[] {
|
||||||
|
const questions: Question[] = []
|
||||||
|
|
||||||
|
// Dataset names and their expected validity
|
||||||
|
const validationFixtures = [
|
||||||
|
{ dataset: 'structural-validation-control', isValid: true, description: 'Valid complete dataset (control)' },
|
||||||
|
{ dataset: 'structural-validation-truncated', isValid: false, description: 'Array truncated: 3 rows removed from end' },
|
||||||
|
{ dataset: 'structural-validation-extra-rows', isValid: false, description: 'Extra rows added beyond declared length' },
|
||||||
|
{ dataset: 'structural-validation-width-mismatch', isValid: false, description: 'Inconsistent field count (missing salary in row 10)' },
|
||||||
|
{ dataset: 'structural-validation-missing-fields', isValid: false, description: 'Missing required fields (no email in multiple rows)' },
|
||||||
|
] as const
|
||||||
|
|
||||||
|
// Generate one validation question per fixture
|
||||||
|
for (const fixture of validationFixtures) {
|
||||||
|
questions.push(
|
||||||
|
new QuestionBuilder()
|
||||||
|
.id(getId())
|
||||||
|
.prompt('Is this data complete and valid? Answer only YES or NO.')
|
||||||
|
.groundTruth(fixture.isValid ? 'YES' : 'NO')
|
||||||
|
.type('structural-validation')
|
||||||
|
.dataset(fixture.dataset)
|
||||||
|
.answerType('boolean')
|
||||||
|
.build(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
return questions
|
||||||
|
}
|
||||||
@@ -30,6 +30,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(String(employees.length))
|
.groundTruth(String(employees.length))
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -42,6 +43,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(employeeFields)
|
.groundTruth(employeeFields)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('csv-list-ordered')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -53,6 +55,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth('email')
|
.groundTruth('email')
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -65,6 +68,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(lastEmployee.department)
|
.groundTruth(lastEmployee.department)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -76,6 +80,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(lastEmployee.name)
|
.groundTruth(lastEmployee.name)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -87,6 +92,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth('7')
|
.groundTruth('7')
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -100,6 +106,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(String(orders.length))
|
.groundTruth(String(orders.length))
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -112,6 +119,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(orderFields)
|
.groundTruth(orderFields)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('csv-list-ordered')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -126,6 +134,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(String(orderWithManyItems.items.length))
|
.groundTruth(String(orderWithManyItems.items.length))
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -138,6 +147,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(itemFields)
|
.groundTruth(itemFields)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('csv-list-ordered')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -150,6 +160,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(lastOrder.status)
|
.groundTruth(lastOrder.status)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -162,6 +173,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(customerFields)
|
.groundTruth(customerFields)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('nested')
|
.dataset('nested')
|
||||||
|
.answerType('csv-list-ordered')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -175,6 +187,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(String(metrics.length))
|
.groundTruth(String(metrics.length))
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -187,6 +200,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(metricFields)
|
.groundTruth(metricFields)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('csv-list-ordered')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -198,6 +212,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth('revenue')
|
.groundTruth('revenue')
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -210,6 +225,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(lastMetric.date)
|
.groundTruth(lastMetric.date)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -221,6 +237,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth('6')
|
.groundTruth('6')
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('analytics')
|
.dataset('analytics')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -234,6 +251,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(String(repos.length))
|
.groundTruth(String(repos.length))
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -246,6 +264,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(repoFields)
|
.groundTruth(repoFields)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('csv-list-ordered')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -257,6 +276,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth('forks')
|
.groundTruth('forks')
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -269,6 +289,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(lastRepo.name)
|
.groundTruth(lastRepo.name)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -280,6 +301,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth('11')
|
.groundTruth('11')
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('github')
|
.dataset('github')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -293,6 +315,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(String(logs.length))
|
.groundTruth(String(logs.length))
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -305,6 +328,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(logFields)
|
.groundTruth(logFields)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('csv-list-unordered')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -317,6 +341,7 @@ export function generateStructureQuestions(
|
|||||||
.groundTruth(lastLog.level)
|
.groundTruth(lastLog.level)
|
||||||
.type('structure-awareness')
|
.type('structure-awareness')
|
||||||
.dataset('event-logs')
|
.dataset('event-logs')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(emp.salary))
|
.groundTruth(String(emp.salary))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
(emp, getId) => new QuestionBuilder()
|
(emp, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -24,6 +25,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(emp.department)
|
.groundTruth(emp.department)
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
(emp, getId) => new QuestionBuilder()
|
(emp, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -31,6 +33,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(emp.email)
|
.groundTruth(emp.email)
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('string')
|
||||||
.build(),
|
.build(),
|
||||||
(emp, getId) => new QuestionBuilder()
|
(emp, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -38,6 +41,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(emp.yearsExperience))
|
.groundTruth(String(emp.yearsExperience))
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
(emp, getId) => new QuestionBuilder()
|
(emp, getId) => new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -45,6 +49,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(emp.active ? 'yes' : 'no')
|
.groundTruth(emp.active ? 'yes' : 'no')
|
||||||
.type('field-retrieval')
|
.type('field-retrieval')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('boolean')
|
||||||
.build(),
|
.build(),
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -67,6 +72,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -81,6 +87,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -98,6 +105,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(totalEmployees))
|
.groundTruth(String(totalEmployees))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -105,6 +113,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(avgSalary))
|
.groundTruth(String(avgSalary))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -112,6 +121,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(activeCount))
|
.groundTruth(String(activeCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -119,6 +129,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(inactiveCount))
|
.groundTruth(String(inactiveCount))
|
||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -134,6 +145,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -148,6 +160,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -164,6 +177,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -178,6 +192,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
.groundTruth(String(count))
|
.groundTruth(String(count))
|
||||||
.type('filtering')
|
.type('filtering')
|
||||||
.dataset('tabular')
|
.dataset('tabular')
|
||||||
|
.answerType('integer')
|
||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import type { AnswerType, NormalizationOptions } from '../normalize'
|
||||||
import type { Question } from '../types'
|
import type { Question } from '../types'
|
||||||
|
|
||||||
// Constants for sampling strides
|
// Constants for sampling strides
|
||||||
@@ -52,10 +53,21 @@ export class QuestionBuilder {
|
|||||||
return this
|
return this
|
||||||
}
|
}
|
||||||
|
|
||||||
|
answerType(kind: AnswerType): this {
|
||||||
|
this.question.answerType = kind
|
||||||
|
return this
|
||||||
|
}
|
||||||
|
|
||||||
|
normalize(options: Partial<NormalizationOptions>): this {
|
||||||
|
this.question.normalizationOptions = options
|
||||||
|
return this
|
||||||
|
}
|
||||||
|
|
||||||
build(): Question {
|
build(): Question {
|
||||||
if (!this.question.id || !this.question.prompt || !this.question.groundTruth || !this.question.type || !this.question.dataset) {
|
if (!this.question.id || !this.question.prompt || !this.question.groundTruth || !this.question.type || !this.question.dataset) {
|
||||||
throw new Error('Incomplete question')
|
throw new Error('Incomplete question')
|
||||||
}
|
}
|
||||||
|
|
||||||
return this.question as Question
|
return this.question as Question
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -65,7 +77,7 @@ export class QuestionBuilder {
|
|||||||
*/
|
*/
|
||||||
export function rotateQuestions<T>(
|
export function rotateQuestions<T>(
|
||||||
items: T[],
|
items: T[],
|
||||||
generators: Array<(item: T, getId: () => string) => Question>,
|
generators: ((item: T, getId: () => string) => Question)[],
|
||||||
limit: number,
|
limit: number,
|
||||||
stride: number,
|
stride: number,
|
||||||
getId: () => string,
|
getId: () => string,
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import type { Dataset, EfficiencyRanking, EvaluationResult, FormatResult, Question } from './types'
|
import type { Dataset, EfficiencyRanking, EvaluationResult, FormatResult, Question } from './types'
|
||||||
import { FORMATTER_DISPLAY_NAMES, QUESTION_TYPE_LABELS, QUESTION_TYPES } from './constants'
|
import { FORMATTER_DISPLAY_NAMES, QUESTION_TYPE_LABELS, QUESTION_TYPES } from './constants'
|
||||||
import { ACCURACY_DATASETS } from './datasets'
|
import { ACCURACY_DATASETS } from './datasets'
|
||||||
import { models } from './evaluate'
|
import { models, PRIMERS } from './evaluate'
|
||||||
import { supportsCSV } from './formatters'
|
import { supportsCSV } from './formatters'
|
||||||
import { generateQuestions } from './questions'
|
import { generateQuestions } from './questions'
|
||||||
import { createProgressBar, tokenize } from './utils'
|
import { createProgressBar, tokenize } from './utils'
|
||||||
@@ -10,6 +10,9 @@ const EFFICIENCY_CHART_STYLE: 'vertical' | 'horizontal' = 'horizontal'
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Calculate token counts for all format+dataset combinations
|
* Calculate token counts for all format+dataset combinations
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Includes primer tokens for fairer comparison across formats
|
||||||
*/
|
*/
|
||||||
export function calculateTokenCounts(
|
export function calculateTokenCounts(
|
||||||
formatters: Record<string, (data: unknown) => string>,
|
formatters: Record<string, (data: unknown) => string>,
|
||||||
@@ -23,8 +26,11 @@ export function calculateTokenCounts(
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
const formattedData = formatter(dataset.data)
|
const formattedData = formatter(dataset.data)
|
||||||
|
const primer = PRIMERS[formatName] ?? ''
|
||||||
|
// Include primer in token count for fair comparison
|
||||||
|
const fullPrompt = primer ? `${primer}\n\n${formattedData}` : formattedData
|
||||||
const key = `${formatName}-${dataset.name}`
|
const key = `${formatName}-${dataset.name}`
|
||||||
tokenCounts[key] = tokenize(formattedData)
|
tokenCounts[key] = tokenize(fullPrompt)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -137,9 +143,12 @@ function generateEfficiencyRankingReport(
|
|||||||
): string {
|
): string {
|
||||||
const toon = formatResults.find(r => r.format === 'toon')
|
const toon = formatResults.find(r => r.format === 'toon')
|
||||||
const json = formatResults.find(r => r.format === 'json-pretty')
|
const json = formatResults.find(r => r.format === 'json-pretty')
|
||||||
|
const csv = formatResults.find(r => r.format === 'csv')
|
||||||
|
|
||||||
// Build efficiency ranking (accuracy per 1k tokens)
|
// Build efficiency ranking (accuracy per 1k tokens)
|
||||||
const efficiencyRanking = formatResults
|
const efficiencyRanking = formatResults
|
||||||
|
// Exclude CSV since it only supports a subset of datasets (~half the questions)
|
||||||
|
.filter(fr => fr.format !== 'csv')
|
||||||
.map((fr) => {
|
.map((fr) => {
|
||||||
const efficiency = (fr.accuracy * 100) / (fr.totalTokens / 1000)
|
const efficiency = (fr.accuracy * 100) / (fr.totalTokens / 1000)
|
||||||
return {
|
return {
|
||||||
@@ -163,6 +172,12 @@ function generateEfficiencyRankingReport(
|
|||||||
summary = `TOON achieves ${toonVsJson} while using ${tokenSavings}.`
|
summary = `TOON achieves ${toonVsJson} while using ${tokenSavings}.`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add CSV note if available
|
||||||
|
let csvNote = ''
|
||||||
|
if (csv) {
|
||||||
|
csvNote = `\n\n**Note on CSV:** Excluded from ranking as it only supports ${csv.totalCount}/209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.`
|
||||||
|
}
|
||||||
|
|
||||||
return `
|
return `
|
||||||
Each format's overall performance, balancing accuracy against token cost:
|
Each format's overall performance, balancing accuracy against token cost:
|
||||||
|
|
||||||
@@ -170,7 +185,7 @@ Each format's overall performance, balancing accuracy against token cost:
|
|||||||
${efficiencyChart}
|
${efficiencyChart}
|
||||||
\`\`\`
|
\`\`\`
|
||||||
|
|
||||||
${summary}
|
${summary}${csvNote}
|
||||||
`.trim()
|
`.trim()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -210,11 +225,13 @@ function generateDetailedAccuracyReport(
|
|||||||
const aggregationCount = questions.filter(q => q.type === 'aggregation').length
|
const aggregationCount = questions.filter(q => q.type === 'aggregation').length
|
||||||
const filteringCount = questions.filter(q => q.type === 'filtering').length
|
const filteringCount = questions.filter(q => q.type === 'filtering').length
|
||||||
const structureAwarenessCount = questions.filter(q => q.type === 'structure-awareness').length
|
const structureAwarenessCount = questions.filter(q => q.type === 'structure-awareness').length
|
||||||
|
const structuralValidationCount = questions.filter(q => q.type === 'structural-validation').length
|
||||||
|
|
||||||
const fieldRetrievalPercent = ((fieldRetrievalCount / totalQuestions) * 100).toFixed(0)
|
const fieldRetrievalPercent = ((fieldRetrievalCount / totalQuestions) * 100).toFixed(0)
|
||||||
const aggregationPercent = ((aggregationCount / totalQuestions) * 100).toFixed(0)
|
const aggregationPercent = ((aggregationCount / totalQuestions) * 100).toFixed(0)
|
||||||
const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)
|
const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)
|
||||||
const structureAwarenessPercent = ((structureAwarenessCount / totalQuestions) * 100).toFixed(0)
|
const structureAwarenessPercent = ((structureAwarenessCount / totalQuestions) * 100).toFixed(0)
|
||||||
|
const structuralValidationPercent = ((structuralValidationCount / totalQuestions) * 100).toFixed(0)
|
||||||
|
|
||||||
// Calculate dataset sizes
|
// Calculate dataset sizes
|
||||||
const tabularSize = ACCURACY_DATASETS.find(d => d.name === 'tabular')?.data.employees?.length || 0
|
const tabularSize = ACCURACY_DATASETS.find(d => d.name === 'tabular')?.data.employees?.length || 0
|
||||||
@@ -263,8 +280,9 @@ This benchmark tests **LLM comprehension and data retrieval accuracy** across di
|
|||||||
|
|
||||||
#### Datasets Tested
|
#### Datasets Tested
|
||||||
|
|
||||||
Six datasets designed to test different structural patterns:
|
Eleven datasets designed to test different structural patterns and validation capabilities:
|
||||||
|
|
||||||
|
**Primary datasets:**
|
||||||
1. **Tabular** (${tabularSize} employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
|
1. **Tabular** (${tabularSize} employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
|
||||||
2. **Nested** (${nestedSize} e-commerce orders): Complex structures with nested customer objects and item arrays.
|
2. **Nested** (${nestedSize} e-commerce orders): Complex structures with nested customer objects and item arrays.
|
||||||
3. **Analytics** (${analyticsSize} days of metrics): Time-series data with dates and numeric values.
|
3. **Analytics** (${analyticsSize} days of metrics): Time-series data with dates and numeric values.
|
||||||
@@ -272,9 +290,16 @@ Six datasets designed to test different structural patterns:
|
|||||||
5. **Event Logs** (${eventLogsSize} logs): Semi-uniform data with ~50% flat logs and ~50% with nested error objects.
|
5. **Event Logs** (${eventLogsSize} logs): Semi-uniform data with ~50% flat logs and ~50% with nested error objects.
|
||||||
6. **Nested Config** (${nestedConfigSize} configuration): Deeply nested configuration with minimal tabular eligibility.
|
6. **Nested Config** (${nestedConfigSize} configuration): Deeply nested configuration with minimal tabular eligibility.
|
||||||
|
|
||||||
|
**Structural validation datasets:**
|
||||||
|
7. **Control**: Valid complete dataset (baseline for validation)
|
||||||
|
8. **Truncated**: Array with 3 rows removed from end (tests [N] length detection)
|
||||||
|
9. **Extra rows**: Array with 3 additional rows beyond declared length
|
||||||
|
10. **Width mismatch**: Inconsistent field count (missing salary in row 10)
|
||||||
|
11. **Missing fields**: Systematic field omissions (no email in multiple rows)
|
||||||
|
|
||||||
#### Question Types
|
#### Question Types
|
||||||
|
|
||||||
${totalQuestions} questions are generated dynamically across four categories:
|
${totalQuestions} questions are generated dynamically across five categories:
|
||||||
|
|
||||||
- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
||||||
- Example: "What is Alice's salary?" → \`75000\`
|
- Example: "What is Alice's salary?" → \`75000\`
|
||||||
@@ -295,11 +320,16 @@ ${totalQuestions} questions are generated dynamically across four categories:
|
|||||||
- Example: "List the field names for employees" → \`id, name, email, department, salary, yearsExperience, active\`
|
- Example: "List the field names for employees" → \`id, name, email, department, salary, yearsExperience, active\`
|
||||||
- Example: "What is the department of the last employee?" → \`Sales\`
|
- Example: "What is the department of the last employee?" → \`Sales\`
|
||||||
|
|
||||||
|
- **Structural validation (${structuralValidationPercent}%)**: Tests ability to detect incomplete, truncated, or corrupted data using structural metadata
|
||||||
|
- Example: "Is this data complete and valid?" → \`YES\` (control dataset) or \`NO\` (corrupted datasets)
|
||||||
|
- Tests TOON's [N] length validation and {fields} consistency checking
|
||||||
|
- Demonstrates CSV's lack of structural validation capabilities
|
||||||
|
|
||||||
#### Evaluation Process
|
#### Evaluation Process
|
||||||
|
|
||||||
1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => FORMATTER_DISPLAY_NAMES[f.format] || f.format).join(', ')}).
|
1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => FORMATTER_DISPLAY_NAMES[f.format] || f.format).join(', ')}).
|
||||||
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
||||||
3. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).
|
3. **Validate deterministically**: Answers are validated using type-aware comparison (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`) without requiring an LLM judge.
|
||||||
|
|
||||||
#### Models & Configuration
|
#### Models & Configuration
|
||||||
|
|
||||||
@@ -376,9 +406,12 @@ function generateDatasetBreakdown(
|
|||||||
questions: Question[],
|
questions: Question[],
|
||||||
tokenCounts: Record<string, number>,
|
tokenCounts: Record<string, number>,
|
||||||
): string {
|
): string {
|
||||||
|
// Build question ID to dataset mapping for O(1) lookups
|
||||||
|
const questionDatasetMap = new Map(questions.map(q => [q.id, q.dataset]))
|
||||||
|
|
||||||
return ACCURACY_DATASETS.map((dataset) => {
|
return ACCURACY_DATASETS.map((dataset) => {
|
||||||
const datasetResults = formatResults.map((fr) => {
|
const datasetResults = formatResults.map((fr) => {
|
||||||
const datasetFormatResults = results.filter(r => r.questionId.includes(dataset.name) || questions.find(q => q.id === r.questionId)?.dataset === dataset.name)
|
const datasetFormatResults = results.filter(r => questionDatasetMap.get(r.questionId) === dataset.name)
|
||||||
if (datasetFormatResults.length === 0)
|
if (datasetFormatResults.length === 0)
|
||||||
return undefined
|
return undefined
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import type { DATASET_NAMES, QUESTION_TYPES, STRUCTURE_CLASSES } from './constants'
|
import type { DATASET_NAMES, QUESTION_TYPES, STRUCTURE_CLASSES } from './constants'
|
||||||
|
import type { AnswerType, NormalizationOptions } from './normalize'
|
||||||
|
|
||||||
export type QuestionType = typeof QUESTION_TYPES[number]
|
export type QuestionType = typeof QUESTION_TYPES[number]
|
||||||
export type DatasetName = typeof DATASET_NAMES[number]
|
export type DatasetName = typeof DATASET_NAMES[number]
|
||||||
@@ -23,6 +24,15 @@ export interface Question {
|
|||||||
groundTruth: string
|
groundTruth: string
|
||||||
type: QuestionType
|
type: QuestionType
|
||||||
dataset: DatasetName
|
dataset: DatasetName
|
||||||
|
/**
|
||||||
|
* Expected answer kind for deterministic comparison.
|
||||||
|
* @default 'string'
|
||||||
|
*/
|
||||||
|
answerType?: AnswerType
|
||||||
|
/**
|
||||||
|
* Options for answer normalization and comparison.
|
||||||
|
*/
|
||||||
|
normalizationOptions?: Partial<NormalizationOptions>
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface EvaluationResult {
|
export interface EvaluationResult {
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
"type": "module",
|
"type": "module",
|
||||||
"version": "0.8.0",
|
"version": "0.8.0",
|
||||||
"packageManager": "pnpm@10.20.0",
|
"packageManager": "pnpm@10.20.0",
|
||||||
"description": "Token-Oriented Object Notation (TOON) – a token-efficient JSON alternative for LLM prompts",
|
"description": "Token-Oriented Object Notation (TOON) – A compact, deterministic JSON format for LLM prompts",
|
||||||
"author": "Johann Schopplich <hello@johannschopplich.com>",
|
"author": "Johann Schopplich <hello@johannschopplich.com>",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"homepage": "https://toonformat.dev",
|
"homepage": "https://toonformat.dev",
|
||||||
|
|||||||
Reference in New Issue
Block a user