mirror of
https://github.com/voson-wang/toon.git
synced 2026-01-29 15:24:10 +08:00
test: add benchmarks for compact vs. pretty JSON
This commit is contained in:
168
README.md
168
README.md
@@ -57,26 +57,30 @@ The benchmarks test datasets that favor TOON's strengths (uniform tabular data).
|
|||||||
### Token Efficiency
|
### Token Efficiency
|
||||||
|
|
||||||
```
|
```
|
||||||
⭐ GitHub Repositories ██████████████░░░░░░░░░░░ 8,745 tokens
|
⭐ GitHub Repositories ██████████████░░░░░░░░░░░ 8,745 tokens
|
||||||
vs JSON: 15,145 (-42.3%)
|
vs JSON (-42.3%) 15,145
|
||||||
vs YAML: 13,129 (-33.4%)
|
vs JSON compact (-23.7%) 11,455
|
||||||
vs XML: 17,095 (-48.8%)
|
vs YAML (-33.4%) 13,129
|
||||||
|
vs XML (-48.8%) 17,095
|
||||||
|
|
||||||
📈 Daily Analytics ██████████░░░░░░░░░░░░░░░ 4,507 tokens
|
📈 Daily Analytics ██████████░░░░░░░░░░░░░░░ 4,507 tokens
|
||||||
vs JSON: 10,977 (-58.9%)
|
vs JSON (-58.9%) 10,977
|
||||||
vs YAML: 8,810 (-48.8%)
|
vs JSON compact (-35.7%) 7,013
|
||||||
vs XML: 13,128 (-65.7%)
|
vs YAML (-48.8%) 8,810
|
||||||
|
vs XML (-65.7%) 13,128
|
||||||
|
|
||||||
🛒 E-Commerce Order ████████████████░░░░░░░░░ 166 tokens
|
🛒 E-Commerce Order ████████████████░░░░░░░░░ 166 tokens
|
||||||
vs JSON: 257 (-35.4%)
|
vs JSON (-35.4%) 257
|
||||||
vs YAML: 197 (-15.7%)
|
vs JSON compact (-2.9%) 171
|
||||||
vs XML: 271 (-38.7%)
|
vs YAML (-15.7%) 197
|
||||||
|
vs XML (-38.7%) 271
|
||||||
|
|
||||||
─────────────────────────────────────────────────────────────────────
|
─────────────────────────────────────────────────────────────────────
|
||||||
Total █████████████░░░░░░░░░░░░ 13,418 tokens
|
Total ██████████████░░░░░░░░░░░ 13,418 tokens
|
||||||
vs JSON: 26,379 (-49.1%)
|
vs JSON (-49.1%) 26,379
|
||||||
vs YAML: 22,136 (-39.4%)
|
vs JSON compact (-28.0%) 18,639
|
||||||
vs XML: 30,494 (-56.0%)
|
vs YAML (-39.4%) 22,136
|
||||||
|
vs XML (-56.0%) 30,494
|
||||||
```
|
```
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
@@ -218,7 +222,7 @@ metrics[5]{date,views,clicks,conversions,revenue,bounceRate}:
|
|||||||
<!-- /automd -->
|
<!-- /automd -->
|
||||||
|
|
||||||
> [!NOTE]
|
> [!NOTE]
|
||||||
> Measured with [`gpt-tokenizer`](https://github.com/niieani/gpt-tokenizer) using `o200k_base` encoding (used by GPT-5 and other modern models). Savings will vary across models and tokenizers.
|
> Token savings are measured against formatted JSON (2-space indentation) as the primary baseline; compact (minified) JSON, YAML, and XML are included as additional comparisons to give a broader view across common data formats. Token counts are measured with [`gpt-tokenizer`](https://github.com/niieani/gpt-tokenizer) using the `o200k_base` encoding (GPT-5 tokenizer). Actual savings vary by model and tokenizer.
|
||||||
|
|
||||||
<!-- automd:file src="./benchmarks/results/retrieval-accuracy.md" -->
|
<!-- automd:file src="./benchmarks/results/retrieval-accuracy.md" -->
|
||||||
|
|
||||||
@@ -228,35 +232,39 @@ Accuracy across **4 LLMs** on 154 data retrieval questions:
|
|||||||
|
|
||||||
```
|
```
|
||||||
gpt-5-nano
|
gpt-5-nano
|
||||||
→ toon ███████████████████░ 96.1% (148/154)
|
→ TOON ███████████████████░ 96.1% (148/154)
|
||||||
csv ██████████████████░░ 90.3% (139/154)
|
CSV ██████████████████░░ 91.6% (141/154)
|
||||||
yaml ██████████████████░░ 89.0% (137/154)
|
YAML ██████████████████░░ 91.6% (141/154)
|
||||||
json ██████████████████░░ 87.7% (135/154)
|
JSON compact ██████████████████░░ 91.6% (141/154)
|
||||||
xml █████████████████░░░ 83.8% (129/154)
|
XML █████████████████░░░ 87.0% (134/154)
|
||||||
|
JSON █████████████████░░░ 86.4% (133/154)
|
||||||
|
|
||||||
claude-haiku-4-5-20251001
|
claude-haiku-4-5-20251001
|
||||||
yaml ██████████░░░░░░░░░░ 49.4% (76/154)
|
JSON ██████████░░░░░░░░░░ 50.0% (77/154)
|
||||||
→ toon ██████████░░░░░░░░░░ 48.1% (74/154)
|
YAML ██████████░░░░░░░░░░ 49.4% (76/154)
|
||||||
csv ██████████░░░░░░░░░░ 48.1% (74/154)
|
→ TOON ██████████░░░░░░░░░░ 48.7% (75/154)
|
||||||
json █████████░░░░░░░░░░░ 47.4% (73/154)
|
XML ██████████░░░░░░░░░░ 48.1% (74/154)
|
||||||
xml █████████░░░░░░░░░░░ 46.8% (72/154)
|
CSV █████████░░░░░░░░░░░ 47.4% (73/154)
|
||||||
|
JSON compact █████████░░░░░░░░░░░ 44.2% (68/154)
|
||||||
|
|
||||||
gemini-2.5-flash
|
gemini-2.5-flash
|
||||||
csv ██████████████████░░ 87.7% (135/154)
|
CSV ██████████████████░░ 87.7% (135/154)
|
||||||
xml █████████████████░░░ 85.1% (131/154)
|
XML ██████████████████░░ 87.7% (135/154)
|
||||||
→ toon █████████████████░░░ 83.8% (129/154)
|
→ TOON █████████████████░░░ 86.4% (133/154)
|
||||||
json ████████████████░░░░ 78.6% (121/154)
|
YAML ████████████████░░░░ 79.9% (123/154)
|
||||||
yaml ███████████████░░░░░ 76.6% (118/154)
|
JSON compact ████████████████░░░░ 79.9% (123/154)
|
||||||
|
JSON ███████████████░░░░░ 76.6% (118/154)
|
||||||
|
|
||||||
grok-4-fast-non-reasoning
|
grok-4-fast-non-reasoning
|
||||||
→ toon ██████████░░░░░░░░░░ 48.7% (75/154)
|
→ TOON ██████████░░░░░░░░░░ 49.4% (76/154)
|
||||||
json ██████████░░░░░░░░░░ 48.1% (74/154)
|
JSON ██████████░░░░░░░░░░ 48.7% (75/154)
|
||||||
xml █████████░░░░░░░░░░░ 47.4% (73/154)
|
XML █████████░░░░░░░░░░░ 46.1% (71/154)
|
||||||
yaml █████████░░░░░░░░░░░ 46.8% (72/154)
|
YAML █████████░░░░░░░░░░░ 46.1% (71/154)
|
||||||
csv █████████░░░░░░░░░░░ 45.5% (70/154)
|
JSON compact █████████░░░░░░░░░░░ 45.5% (70/154)
|
||||||
|
CSV █████████░░░░░░░░░░░ 44.2% (68/154)
|
||||||
```
|
```
|
||||||
|
|
||||||
**Key tradeoff:** TOON achieves **69.2% accuracy** (vs JSON's 65.4%) while using **46.3% fewer tokens** on these datasets.
|
**Key tradeoff:** TOON achieves **70.1% accuracy** (vs formatted JSON's 65.4%) while using **46.3% fewer tokens** on these datasets.
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><strong>Performance by dataset and model</strong></summary>
|
<summary><strong>Performance by dataset and model</strong></summary>
|
||||||
@@ -267,41 +275,45 @@ grok-4-fast-non-reasoning
|
|||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `csv` | 67.0% | 2,337 | 134/200 |
|
| `csv` | 65.5% | 2,337 | 131/200 |
|
||||||
| `toon` | 66.5% | 2,483 | 133/200 |
|
| `toon` | 67.5% | 2,483 | 135/200 |
|
||||||
| `yaml` | 65.5% | 4,969 | 131/200 |
|
| `json-compact` | 65.5% | 3,943 | 131/200 |
|
||||||
| `json` | 63.5% | 6,347 | 127/200 |
|
| `yaml` | 68.5% | 4,969 | 137/200 |
|
||||||
| `xml` | 66.5% | 7,314 | 133/200 |
|
| `xml` | 69.5% | 7,314 | 139/200 |
|
||||||
|
| `json-pretty` | 64.5% | 6,347 | 129/200 |
|
||||||
|
|
||||||
##### E-commerce orders with nested structures
|
##### E-commerce orders with nested structures
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `toon` | 78.8% | 5,967 | 126/160 |
|
| `toon` | 78.8% | 5,967 | 126/160 |
|
||||||
| `csv` | 71.9% | 6,735 | 115/160 |
|
| `csv` | 76.3% | 6,735 | 122/160 |
|
||||||
| `yaml` | 71.9% | 7,328 | 115/160 |
|
| `json-compact` | 70.6% | 5,962 | 113/160 |
|
||||||
| `json` | 73.1% | 9,694 | 117/160 |
|
| `yaml` | 72.5% | 7,328 | 116/160 |
|
||||||
| `xml` | 73.8% | 10,992 | 118/160 |
|
| `json-pretty` | 76.9% | 9,694 | 123/160 |
|
||||||
|
| `xml` | 73.1% | 10,992 | 117/160 |
|
||||||
|
|
||||||
##### Time-series analytics data
|
##### Time-series analytics data
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `csv` | 67.6% | 1,393 | 92/136 |
|
| `toon` | 68.4% | 1,515 | 93/136 |
|
||||||
| `toon` | 67.6% | 1,515 | 92/136 |
|
| `csv` | 65.4% | 1,393 | 89/136 |
|
||||||
| `yaml` | 64.7% | 2,938 | 88/136 |
|
| `json-compact` | 64.7% | 2,341 | 88/136 |
|
||||||
| `json` | 68.4% | 3,665 | 93/136 |
|
| `yaml` | 66.2% | 2,938 | 90/136 |
|
||||||
| `xml` | 66.2% | 4,376 | 90/136 |
|
| `json-pretty` | 64.7% | 3,665 | 88/136 |
|
||||||
|
| `xml` | 66.9% | 4,376 | 91/136 |
|
||||||
|
|
||||||
##### Top 100 GitHub repositories
|
##### Top 100 GitHub repositories
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `csv` | 64.2% | 8,513 | 77/120 |
|
| `toon` | 65.0% | 8,745 | 78/120 |
|
||||||
| `toon` | 62.5% | 8,745 | 75/120 |
|
| `csv` | 62.5% | 8,513 | 75/120 |
|
||||||
| `yaml` | 57.5% | 13,129 | 69/120 |
|
| `json-compact` | 58.3% | 11,455 | 70/120 |
|
||||||
| `json` | 55.0% | 15,145 | 66/120 |
|
| `yaml` | 56.7% | 13,129 | 68/120 |
|
||||||
| `xml` | 53.3% | 17,095 | 64/120 |
|
| `xml` | 55.8% | 17,095 | 67/120 |
|
||||||
|
| `json-pretty` | 52.5% | 15,145 | 63/120 |
|
||||||
|
|
||||||
#### Performance by Model
|
#### Performance by Model
|
||||||
|
|
||||||
@@ -310,40 +322,44 @@ grok-4-fast-non-reasoning
|
|||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `toon` | 96.1% | 148/154 |
|
| `toon` | 96.1% | 148/154 |
|
||||||
| `csv` | 90.3% | 139/154 |
|
| `csv` | 91.6% | 141/154 |
|
||||||
| `yaml` | 89.0% | 137/154 |
|
| `yaml` | 91.6% | 141/154 |
|
||||||
| `json` | 87.7% | 135/154 |
|
| `json-compact` | 91.6% | 141/154 |
|
||||||
| `xml` | 83.8% | 129/154 |
|
| `xml` | 87.0% | 134/154 |
|
||||||
|
| `json-pretty` | 86.4% | 133/154 |
|
||||||
|
|
||||||
##### claude-haiku-4-5-20251001
|
##### claude-haiku-4-5-20251001
|
||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
|
| `json-pretty` | 50.0% | 77/154 |
|
||||||
| `yaml` | 49.4% | 76/154 |
|
| `yaml` | 49.4% | 76/154 |
|
||||||
| `toon` | 48.1% | 74/154 |
|
| `toon` | 48.7% | 75/154 |
|
||||||
| `csv` | 48.1% | 74/154 |
|
| `xml` | 48.1% | 74/154 |
|
||||||
| `json` | 47.4% | 73/154 |
|
| `csv` | 47.4% | 73/154 |
|
||||||
| `xml` | 46.8% | 72/154 |
|
| `json-compact` | 44.2% | 68/154 |
|
||||||
|
|
||||||
##### gemini-2.5-flash
|
##### gemini-2.5-flash
|
||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `csv` | 87.7% | 135/154 |
|
| `csv` | 87.7% | 135/154 |
|
||||||
| `xml` | 85.1% | 131/154 |
|
| `xml` | 87.7% | 135/154 |
|
||||||
| `toon` | 83.8% | 129/154 |
|
| `toon` | 86.4% | 133/154 |
|
||||||
| `json` | 78.6% | 121/154 |
|
| `yaml` | 79.9% | 123/154 |
|
||||||
| `yaml` | 76.6% | 118/154 |
|
| `json-compact` | 79.9% | 123/154 |
|
||||||
|
| `json-pretty` | 76.6% | 118/154 |
|
||||||
|
|
||||||
##### grok-4-fast-non-reasoning
|
##### grok-4-fast-non-reasoning
|
||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `toon` | 48.7% | 75/154 |
|
| `toon` | 49.4% | 76/154 |
|
||||||
| `json` | 48.1% | 74/154 |
|
| `json-pretty` | 48.7% | 75/154 |
|
||||||
| `xml` | 47.4% | 73/154 |
|
| `xml` | 46.1% | 71/154 |
|
||||||
| `yaml` | 46.8% | 72/154 |
|
| `yaml` | 46.1% | 71/154 |
|
||||||
| `csv` | 45.5% | 70/154 |
|
| `json-compact` | 45.5% | 70/154 |
|
||||||
|
| `csv` | 44.2% | 68/154 |
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
@@ -383,7 +399,7 @@ Four datasets designed to test different structural patterns (all contain arrays
|
|||||||
|
|
||||||
#### Evaluation Process
|
#### Evaluation Process
|
||||||
|
|
||||||
1. **Format conversion**: Each dataset is converted to all 5 formats (TOON, CSV, XML, JSON, YAML).
|
1. **Format conversion**: Each dataset is converted to all 6 formats (TOON, CSV, XML, YAML, JSON, JSON compact).
|
||||||
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
||||||
3. **Validate with LLM-as-judge**: `gpt-5-nano` validates if the answer is semantically correct (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`).
|
3. **Validate with LLM-as-judge**: `gpt-5-nano` validates if the answer is semantically correct (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`).
|
||||||
|
|
||||||
@@ -392,7 +408,7 @@ Four datasets designed to test different structural patterns (all contain arrays
|
|||||||
- **Models tested**: `gpt-5-nano`, `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `grok-4-fast-non-reasoning`
|
- **Models tested**: `gpt-5-nano`, `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `grok-4-fast-non-reasoning`
|
||||||
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
|
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
|
||||||
- **Temperature**: Not set (models use their defaults)
|
- **Temperature**: Not set (models use their defaults)
|
||||||
- **Total evaluations**: 154 questions × 5 formats × 4 models = 3,080 LLM calls
|
- **Total evaluations**: 154 questions × 6 formats × 4 models = 3,696 LLM calls
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -4,35 +4,39 @@ Accuracy across **4 LLMs** on 154 data retrieval questions:
|
|||||||
|
|
||||||
```
|
```
|
||||||
gpt-5-nano
|
gpt-5-nano
|
||||||
→ toon ███████████████████░ 96.1% (148/154)
|
→ TOON ███████████████████░ 96.1% (148/154)
|
||||||
csv ██████████████████░░ 90.3% (139/154)
|
CSV ██████████████████░░ 91.6% (141/154)
|
||||||
yaml ██████████████████░░ 89.0% (137/154)
|
YAML ██████████████████░░ 91.6% (141/154)
|
||||||
json ██████████████████░░ 87.7% (135/154)
|
JSON compact ██████████████████░░ 91.6% (141/154)
|
||||||
xml █████████████████░░░ 83.8% (129/154)
|
XML █████████████████░░░ 87.0% (134/154)
|
||||||
|
JSON █████████████████░░░ 86.4% (133/154)
|
||||||
|
|
||||||
claude-haiku-4-5-20251001
|
claude-haiku-4-5-20251001
|
||||||
yaml ██████████░░░░░░░░░░ 49.4% (76/154)
|
JSON ██████████░░░░░░░░░░ 50.0% (77/154)
|
||||||
→ toon ██████████░░░░░░░░░░ 48.1% (74/154)
|
YAML ██████████░░░░░░░░░░ 49.4% (76/154)
|
||||||
csv ██████████░░░░░░░░░░ 48.1% (74/154)
|
→ TOON ██████████░░░░░░░░░░ 48.7% (75/154)
|
||||||
json █████████░░░░░░░░░░░ 47.4% (73/154)
|
XML ██████████░░░░░░░░░░ 48.1% (74/154)
|
||||||
xml █████████░░░░░░░░░░░ 46.8% (72/154)
|
CSV █████████░░░░░░░░░░░ 47.4% (73/154)
|
||||||
|
JSON compact █████████░░░░░░░░░░░ 44.2% (68/154)
|
||||||
|
|
||||||
gemini-2.5-flash
|
gemini-2.5-flash
|
||||||
csv ██████████████████░░ 87.7% (135/154)
|
CSV ██████████████████░░ 87.7% (135/154)
|
||||||
xml █████████████████░░░ 85.1% (131/154)
|
XML ██████████████████░░ 87.7% (135/154)
|
||||||
→ toon █████████████████░░░ 83.8% (129/154)
|
→ TOON █████████████████░░░ 86.4% (133/154)
|
||||||
json ████████████████░░░░ 78.6% (121/154)
|
YAML ████████████████░░░░ 79.9% (123/154)
|
||||||
yaml ███████████████░░░░░ 76.6% (118/154)
|
JSON compact ████████████████░░░░ 79.9% (123/154)
|
||||||
|
JSON ███████████████░░░░░ 76.6% (118/154)
|
||||||
|
|
||||||
grok-4-fast-non-reasoning
|
grok-4-fast-non-reasoning
|
||||||
→ toon ██████████░░░░░░░░░░ 48.7% (75/154)
|
→ TOON ██████████░░░░░░░░░░ 49.4% (76/154)
|
||||||
json ██████████░░░░░░░░░░ 48.1% (74/154)
|
JSON ██████████░░░░░░░░░░ 48.7% (75/154)
|
||||||
xml █████████░░░░░░░░░░░ 47.4% (73/154)
|
XML █████████░░░░░░░░░░░ 46.1% (71/154)
|
||||||
yaml █████████░░░░░░░░░░░ 46.8% (72/154)
|
YAML █████████░░░░░░░░░░░ 46.1% (71/154)
|
||||||
csv █████████░░░░░░░░░░░ 45.5% (70/154)
|
JSON compact █████████░░░░░░░░░░░ 45.5% (70/154)
|
||||||
|
CSV █████████░░░░░░░░░░░ 44.2% (68/154)
|
||||||
```
|
```
|
||||||
|
|
||||||
**Key tradeoff:** TOON achieves **69.2% accuracy** (vs JSON's 65.4%) while using **46.3% fewer tokens** on these datasets.
|
**Key tradeoff:** TOON achieves **70.1% accuracy** (vs formatted JSON's 65.4%) while using **46.3% fewer tokens** on these datasets.
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><strong>Performance by dataset and model</strong></summary>
|
<summary><strong>Performance by dataset and model</strong></summary>
|
||||||
@@ -43,41 +47,45 @@ grok-4-fast-non-reasoning
|
|||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `csv` | 67.0% | 2,337 | 134/200 |
|
| `csv` | 65.5% | 2,337 | 131/200 |
|
||||||
| `toon` | 66.5% | 2,483 | 133/200 |
|
| `toon` | 67.5% | 2,483 | 135/200 |
|
||||||
| `yaml` | 65.5% | 4,969 | 131/200 |
|
| `json-compact` | 65.5% | 3,943 | 131/200 |
|
||||||
| `json` | 63.5% | 6,347 | 127/200 |
|
| `yaml` | 68.5% | 4,969 | 137/200 |
|
||||||
| `xml` | 66.5% | 7,314 | 133/200 |
|
| `xml` | 69.5% | 7,314 | 139/200 |
|
||||||
|
| `json-pretty` | 64.5% | 6,347 | 129/200 |
|
||||||
|
|
||||||
##### E-commerce orders with nested structures
|
##### E-commerce orders with nested structures
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `toon` | 78.8% | 5,967 | 126/160 |
|
| `toon` | 78.8% | 5,967 | 126/160 |
|
||||||
| `csv` | 71.9% | 6,735 | 115/160 |
|
| `csv` | 76.3% | 6,735 | 122/160 |
|
||||||
| `yaml` | 71.9% | 7,328 | 115/160 |
|
| `json-compact` | 70.6% | 5,962 | 113/160 |
|
||||||
| `json` | 73.1% | 9,694 | 117/160 |
|
| `yaml` | 72.5% | 7,328 | 116/160 |
|
||||||
| `xml` | 73.8% | 10,992 | 118/160 |
|
| `json-pretty` | 76.9% | 9,694 | 123/160 |
|
||||||
|
| `xml` | 73.1% | 10,992 | 117/160 |
|
||||||
|
|
||||||
##### Time-series analytics data
|
##### Time-series analytics data
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `csv` | 67.6% | 1,393 | 92/136 |
|
| `toon` | 68.4% | 1,515 | 93/136 |
|
||||||
| `toon` | 67.6% | 1,515 | 92/136 |
|
| `csv` | 65.4% | 1,393 | 89/136 |
|
||||||
| `yaml` | 64.7% | 2,938 | 88/136 |
|
| `json-compact` | 64.7% | 2,341 | 88/136 |
|
||||||
| `json` | 68.4% | 3,665 | 93/136 |
|
| `yaml` | 66.2% | 2,938 | 90/136 |
|
||||||
| `xml` | 66.2% | 4,376 | 90/136 |
|
| `json-pretty` | 64.7% | 3,665 | 88/136 |
|
||||||
|
| `xml` | 66.9% | 4,376 | 91/136 |
|
||||||
|
|
||||||
##### Top 100 GitHub repositories
|
##### Top 100 GitHub repositories
|
||||||
|
|
||||||
| Format | Accuracy | Tokens | Correct/Total |
|
| Format | Accuracy | Tokens | Correct/Total |
|
||||||
| ------ | -------- | ------ | ------------- |
|
| ------ | -------- | ------ | ------------- |
|
||||||
| `csv` | 64.2% | 8,513 | 77/120 |
|
| `toon` | 65.0% | 8,745 | 78/120 |
|
||||||
| `toon` | 62.5% | 8,745 | 75/120 |
|
| `csv` | 62.5% | 8,513 | 75/120 |
|
||||||
| `yaml` | 57.5% | 13,129 | 69/120 |
|
| `json-compact` | 58.3% | 11,455 | 70/120 |
|
||||||
| `json` | 55.0% | 15,145 | 66/120 |
|
| `yaml` | 56.7% | 13,129 | 68/120 |
|
||||||
| `xml` | 53.3% | 17,095 | 64/120 |
|
| `xml` | 55.8% | 17,095 | 67/120 |
|
||||||
|
| `json-pretty` | 52.5% | 15,145 | 63/120 |
|
||||||
|
|
||||||
#### Performance by Model
|
#### Performance by Model
|
||||||
|
|
||||||
@@ -86,40 +94,44 @@ grok-4-fast-non-reasoning
|
|||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `toon` | 96.1% | 148/154 |
|
| `toon` | 96.1% | 148/154 |
|
||||||
| `csv` | 90.3% | 139/154 |
|
| `csv` | 91.6% | 141/154 |
|
||||||
| `yaml` | 89.0% | 137/154 |
|
| `yaml` | 91.6% | 141/154 |
|
||||||
| `json` | 87.7% | 135/154 |
|
| `json-compact` | 91.6% | 141/154 |
|
||||||
| `xml` | 83.8% | 129/154 |
|
| `xml` | 87.0% | 134/154 |
|
||||||
|
| `json-pretty` | 86.4% | 133/154 |
|
||||||
|
|
||||||
##### claude-haiku-4-5-20251001
|
##### claude-haiku-4-5-20251001
|
||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
|
| `json-pretty` | 50.0% | 77/154 |
|
||||||
| `yaml` | 49.4% | 76/154 |
|
| `yaml` | 49.4% | 76/154 |
|
||||||
| `toon` | 48.1% | 74/154 |
|
| `toon` | 48.7% | 75/154 |
|
||||||
| `csv` | 48.1% | 74/154 |
|
| `xml` | 48.1% | 74/154 |
|
||||||
| `json` | 47.4% | 73/154 |
|
| `csv` | 47.4% | 73/154 |
|
||||||
| `xml` | 46.8% | 72/154 |
|
| `json-compact` | 44.2% | 68/154 |
|
||||||
|
|
||||||
##### gemini-2.5-flash
|
##### gemini-2.5-flash
|
||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `csv` | 87.7% | 135/154 |
|
| `csv` | 87.7% | 135/154 |
|
||||||
| `xml` | 85.1% | 131/154 |
|
| `xml` | 87.7% | 135/154 |
|
||||||
| `toon` | 83.8% | 129/154 |
|
| `toon` | 86.4% | 133/154 |
|
||||||
| `json` | 78.6% | 121/154 |
|
| `yaml` | 79.9% | 123/154 |
|
||||||
| `yaml` | 76.6% | 118/154 |
|
| `json-compact` | 79.9% | 123/154 |
|
||||||
|
| `json-pretty` | 76.6% | 118/154 |
|
||||||
|
|
||||||
##### grok-4-fast-non-reasoning
|
##### grok-4-fast-non-reasoning
|
||||||
|
|
||||||
| Format | Accuracy | Correct/Total |
|
| Format | Accuracy | Correct/Total |
|
||||||
| ------ | -------- | ------------- |
|
| ------ | -------- | ------------- |
|
||||||
| `toon` | 48.7% | 75/154 |
|
| `toon` | 49.4% | 76/154 |
|
||||||
| `json` | 48.1% | 74/154 |
|
| `json-pretty` | 48.7% | 75/154 |
|
||||||
| `xml` | 47.4% | 73/154 |
|
| `xml` | 46.1% | 71/154 |
|
||||||
| `yaml` | 46.8% | 72/154 |
|
| `yaml` | 46.1% | 71/154 |
|
||||||
| `csv` | 45.5% | 70/154 |
|
| `json-compact` | 45.5% | 70/154 |
|
||||||
|
| `csv` | 44.2% | 68/154 |
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
@@ -159,7 +171,7 @@ Four datasets designed to test different structural patterns (all contain arrays
|
|||||||
|
|
||||||
#### Evaluation Process
|
#### Evaluation Process
|
||||||
|
|
||||||
1. **Format conversion**: Each dataset is converted to all 5 formats (TOON, CSV, XML, JSON, YAML).
|
1. **Format conversion**: Each dataset is converted to all 6 formats (TOON, CSV, XML, YAML, JSON, JSON compact).
|
||||||
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
||||||
3. **Validate with LLM-as-judge**: `gpt-5-nano` validates if the answer is semantically correct (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`).
|
3. **Validate with LLM-as-judge**: `gpt-5-nano` validates if the answer is semantically correct (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`).
|
||||||
|
|
||||||
@@ -168,6 +180,6 @@ Four datasets designed to test different structural patterns (all contain arrays
|
|||||||
- **Models tested**: `gpt-5-nano`, `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `grok-4-fast-non-reasoning`
|
- **Models tested**: `gpt-5-nano`, `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `grok-4-fast-non-reasoning`
|
||||||
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
|
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
|
||||||
- **Temperature**: Not set (models use their defaults)
|
- **Temperature**: Not set (models use their defaults)
|
||||||
- **Total evaluations**: 154 questions × 5 formats × 4 models = 3,080 LLM calls
|
- **Total evaluations**: 154 questions × 6 formats × 4 models = 3,696 LLM calls
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|||||||
@@ -1,26 +1,30 @@
|
|||||||
### Token Efficiency
|
### Token Efficiency
|
||||||
|
|
||||||
```
|
```
|
||||||
⭐ GitHub Repositories ██████████████░░░░░░░░░░░ 8,745 tokens
|
⭐ GitHub Repositories ██████████████░░░░░░░░░░░ 8,745 tokens
|
||||||
vs JSON: 15,145 (-42.3%)
|
vs JSON (-42.3%) 15,145
|
||||||
vs YAML: 13,129 (-33.4%)
|
vs JSON compact (-23.7%) 11,455
|
||||||
vs XML: 17,095 (-48.8%)
|
vs YAML (-33.4%) 13,129
|
||||||
|
vs XML (-48.8%) 17,095
|
||||||
|
|
||||||
📈 Daily Analytics ██████████░░░░░░░░░░░░░░░ 4,507 tokens
|
📈 Daily Analytics ██████████░░░░░░░░░░░░░░░ 4,507 tokens
|
||||||
vs JSON: 10,977 (-58.9%)
|
vs JSON (-58.9%) 10,977
|
||||||
vs YAML: 8,810 (-48.8%)
|
vs JSON compact (-35.7%) 7,013
|
||||||
vs XML: 13,128 (-65.7%)
|
vs YAML (-48.8%) 8,810
|
||||||
|
vs XML (-65.7%) 13,128
|
||||||
|
|
||||||
🛒 E-Commerce Order ████████████████░░░░░░░░░ 166 tokens
|
🛒 E-Commerce Order ████████████████░░░░░░░░░ 166 tokens
|
||||||
vs JSON: 257 (-35.4%)
|
vs JSON (-35.4%) 257
|
||||||
vs YAML: 197 (-15.7%)
|
vs JSON compact (-2.9%) 171
|
||||||
vs XML: 271 (-38.7%)
|
vs YAML (-15.7%) 197
|
||||||
|
vs XML (-38.7%) 271
|
||||||
|
|
||||||
─────────────────────────────────────────────────────────────────────
|
─────────────────────────────────────────────────────────────────────
|
||||||
Total █████████████░░░░░░░░░░░░ 13,418 tokens
|
Total ██████████████░░░░░░░░░░░ 13,418 tokens
|
||||||
vs JSON: 26,379 (-49.1%)
|
vs JSON (-49.1%) 26,379
|
||||||
vs YAML: 22,136 (-39.4%)
|
vs JSON compact (-28.0%) 18,639
|
||||||
vs XML: 30,494 (-56.0%)
|
vs YAML (-39.4%) 22,136
|
||||||
|
vs XML (-56.0%) 30,494
|
||||||
```
|
```
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
|
|||||||
@@ -1,10 +1,11 @@
|
|||||||
|
import * as fsp from 'node:fs/promises'
|
||||||
import * as path from 'node:path'
|
import * as path from 'node:path'
|
||||||
import process from 'node:process'
|
import process from 'node:process'
|
||||||
import * as prompts from '@clack/prompts'
|
import * as prompts from '@clack/prompts'
|
||||||
import { ofetch } from 'ofetch'
|
import { ofetch } from 'ofetch'
|
||||||
import pMap from 'p-map'
|
import pMap from 'p-map'
|
||||||
import { BENCHMARKS_DIR } from '../src/constants'
|
import { BENCHMARKS_DIR } from '../src/constants'
|
||||||
import { ensureDir, saveJsonFile } from '../src/utils'
|
import { ensureDir } from '../src/utils'
|
||||||
|
|
||||||
prompts.intro('GitHub Repositories Fetcher')
|
prompts.intro('GitHub Repositories Fetcher')
|
||||||
|
|
||||||
@@ -79,7 +80,8 @@ async function saveRepos(repos: Record<string, any>[]): Promise<void> {
|
|||||||
const outputFile = path.join(outputDir, 'github-repos.json')
|
const outputFile = path.join(outputDir, 'github-repos.json')
|
||||||
|
|
||||||
await ensureDir(outputDir)
|
await ensureDir(outputDir)
|
||||||
await saveJsonFile(outputFile, repos)
|
const jsonOutput = JSON.stringify(repos, undefined, 2)
|
||||||
|
await fsp.writeFile(outputFile, `${jsonOutput}\n`, 'utf-8')
|
||||||
|
|
||||||
const relativePath = path.relative(BENCHMARKS_DIR, outputFile)
|
const relativePath = path.relative(BENCHMARKS_DIR, outputFile)
|
||||||
prompts.log.info(`Result saved to \`${relativePath}\``)
|
prompts.log.info(`Result saved to \`${relativePath}\``)
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ import * as path from 'node:path'
|
|||||||
import * as prompts from '@clack/prompts'
|
import * as prompts from '@clack/prompts'
|
||||||
import { encode } from '../../src/index'
|
import { encode } from '../../src/index'
|
||||||
import githubRepos from '../data/github-repos.json' with { type: 'json' }
|
import githubRepos from '../data/github-repos.json' with { type: 'json' }
|
||||||
import { BENCHMARKS_DIR, ROOT_DIR } from '../src/constants'
|
import { BENCHMARKS_DIR, FORMATTER_DISPLAY_NAMES, ROOT_DIR } from '../src/constants'
|
||||||
import { generateAnalyticsData, generateOrderData } from '../src/datasets'
|
import { generateAnalyticsData, generateOrderData } from '../src/datasets'
|
||||||
import { formatters } from '../src/formatters'
|
import { formatters } from '../src/formatters'
|
||||||
import { createProgressBar, ensureDir, tokenize } from '../src/utils'
|
import { createProgressBar, ensureDir, tokenize } from '../src/utils'
|
||||||
@@ -50,118 +50,102 @@ const BENCHMARK_EXAMPLES = [
|
|||||||
|
|
||||||
prompts.intro('Token Efficiency Benchmark')
|
prompts.intro('Token Efficiency Benchmark')
|
||||||
|
|
||||||
// Calculate total savings
|
|
||||||
let totalJsonTokens = 0
|
|
||||||
let totalToonTokens = 0
|
|
||||||
let totalXmlTokens = 0
|
|
||||||
let totalYamlTokens = 0
|
|
||||||
|
|
||||||
const results: BenchmarkResult[] = []
|
const results: BenchmarkResult[] = []
|
||||||
|
const totalTokensByFormat: Record<string, number> = {}
|
||||||
|
|
||||||
for (const example of BENCHMARK_EXAMPLES) {
|
for (const example of BENCHMARK_EXAMPLES) {
|
||||||
const data = example.getData()
|
const data = example.getData()
|
||||||
|
|
||||||
const jsonString = JSON.stringify(data, undefined, 2)
|
// Calculate tokens for each format
|
||||||
const toonString = encode(data)
|
const formatMetrics: FormatMetrics[] = []
|
||||||
const xmlString = formatters.xml!(data)
|
const tokensByFormat: Record<string, number> = {}
|
||||||
const yamlString = formatters.yaml!(data)
|
|
||||||
|
|
||||||
const jsonTokens = tokenize(jsonString)
|
for (const [formatName, formatter] of Object.entries(formatters)) {
|
||||||
const toonTokens = tokenize(toonString)
|
const formattedString = formatter(data)
|
||||||
const xmlTokens = tokenize(xmlString)
|
const tokens = tokenize(formattedString)
|
||||||
const yamlTokens = tokenize(yamlString)
|
tokensByFormat[formatName] = tokens
|
||||||
|
totalTokensByFormat[formatName] = (totalTokensByFormat[formatName] || 0) + tokens
|
||||||
|
}
|
||||||
|
|
||||||
const jsonSavings = jsonTokens - toonTokens
|
// Calculate savings vs TOON
|
||||||
const xmlSavings = xmlTokens - toonTokens
|
const toonTokens = tokensByFormat.toon!
|
||||||
const yamlSavings = yamlTokens - toonTokens
|
for (const [formatName, tokens] of Object.entries(tokensByFormat)) {
|
||||||
|
const savings = tokens - toonTokens
|
||||||
totalJsonTokens += jsonTokens
|
formatMetrics.push({
|
||||||
totalToonTokens += toonTokens
|
name: formatName,
|
||||||
totalXmlTokens += xmlTokens
|
tokens,
|
||||||
totalYamlTokens += yamlTokens
|
savings,
|
||||||
|
savingsPercent: formatName === 'toon' ? '0.0' : ((savings / tokens) * 100).toFixed(1),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
results.push({
|
results.push({
|
||||||
name: example.name,
|
name: example.name,
|
||||||
emoji: example.emoji,
|
emoji: example.emoji,
|
||||||
description: example.description,
|
description: example.description,
|
||||||
data,
|
data,
|
||||||
formats: [
|
formats: formatMetrics,
|
||||||
{
|
|
||||||
name: 'toon',
|
|
||||||
tokens: toonTokens,
|
|
||||||
savings: 0,
|
|
||||||
savingsPercent: '0.0',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: 'json',
|
|
||||||
tokens: jsonTokens,
|
|
||||||
savings: jsonSavings,
|
|
||||||
savingsPercent: ((jsonSavings / jsonTokens) * 100).toFixed(1),
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: 'xml',
|
|
||||||
tokens: xmlTokens,
|
|
||||||
savings: xmlSavings,
|
|
||||||
savingsPercent: ((xmlSavings / xmlTokens) * 100).toFixed(1),
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: 'yaml',
|
|
||||||
tokens: yamlTokens,
|
|
||||||
savings: yamlSavings,
|
|
||||||
savingsPercent: ((yamlSavings / yamlTokens) * 100).toFixed(1),
|
|
||||||
},
|
|
||||||
],
|
|
||||||
showDetailed: example.showDetailed,
|
showDetailed: example.showDetailed,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
const totalJsonSavings = totalJsonTokens - totalToonTokens
|
// Calculate total savings percentages
|
||||||
const totalJsonSavingsPercent = ((totalJsonSavings / totalJsonTokens) * 100).toFixed(1)
|
const totalToonTokens = totalTokensByFormat.toon!
|
||||||
|
const totalSavingsPercent: Record<string, string> = {}
|
||||||
const totalXmlSavings = totalXmlTokens - totalToonTokens
|
for (const [formatName, totalTokens] of Object.entries(totalTokensByFormat)) {
|
||||||
const totalXmlSavingsPercent = ((totalXmlSavings / totalXmlTokens) * 100).toFixed(1)
|
if (formatName === 'toon') {
|
||||||
|
totalSavingsPercent[formatName] = '0.0'
|
||||||
const totalYamlSavings = totalYamlTokens - totalToonTokens
|
}
|
||||||
const totalYamlSavingsPercent = ((totalYamlSavings / totalYamlTokens) * 100).toFixed(1)
|
else {
|
||||||
|
const savings = totalTokens - totalToonTokens
|
||||||
|
totalSavingsPercent[formatName] = ((savings / totalTokens) * 100).toFixed(1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Generate ASCII bar chart visualization (stacked compact format)
|
// Generate ASCII bar chart visualization (stacked compact format)
|
||||||
|
const formatOrder = ['json-pretty', 'json-compact', 'yaml', 'xml']
|
||||||
const datasetRows = results
|
const datasetRows = results
|
||||||
.map((result) => {
|
.map((result) => {
|
||||||
const toon = result.formats.find(f => f.name === 'toon')!
|
const toon = result.formats.find(f => f.name === 'toon')!
|
||||||
const json = result.formats.find(f => f.name === 'json')!
|
const percentage = Number.parseFloat(result.formats.find(f => f.name === 'json-pretty')!.savingsPercent)
|
||||||
const xml = result.formats.find(f => f.name === 'xml')!
|
|
||||||
const yaml = result.formats.find(f => f.name === 'yaml')!
|
|
||||||
|
|
||||||
const percentage = Number.parseFloat(json.savingsPercent)
|
|
||||||
const bar = createProgressBar(100 - percentage, 100) // Invert to show TOON tokens
|
const bar = createProgressBar(100 - percentage, 100) // Invert to show TOON tokens
|
||||||
const toonStr = toon.tokens.toLocaleString('en-US')
|
const toonStr = toon.tokens.toLocaleString('en-US')
|
||||||
const jsonStr = json.tokens.toLocaleString('en-US')
|
|
||||||
const xmlStr = xml.tokens.toLocaleString('en-US')
|
|
||||||
const yamlStr = yaml.tokens.toLocaleString('en-US')
|
|
||||||
|
|
||||||
const line1 = `${result.emoji} ${result.name.padEnd(25)} ${bar} ${toonStr.padStart(6)} tokens`
|
const line1 = `${result.emoji} ${result.name.padEnd(25)} ${bar} ${toonStr.padStart(6)} tokens`
|
||||||
const line2 = ` vs JSON: ${jsonStr.padStart(6)} (-${json.savingsPercent}%)`
|
|
||||||
const line3 = ` vs YAML: ${yamlStr.padStart(6)} (-${yaml.savingsPercent}%)`
|
|
||||||
const line4 = ` vs XML: ${xmlStr.padStart(6)} (-${xml.savingsPercent}%)`
|
|
||||||
|
|
||||||
return `${line1}\n${line2}\n${line3}\n${line4}`
|
const comparisonLines = formatOrder.map((formatName) => {
|
||||||
|
const format = result.formats.find(f => f.name === formatName)!
|
||||||
|
const label = FORMATTER_DISPLAY_NAMES[formatName] || formatName.toUpperCase()
|
||||||
|
const labelWithSavings = `vs ${label} (-${format.savingsPercent}%)`.padEnd(28)
|
||||||
|
const tokenStr = format.tokens.toLocaleString('en-US').padStart(6)
|
||||||
|
return ` ${labelWithSavings}${tokenStr}`
|
||||||
|
})
|
||||||
|
|
||||||
|
return [line1, ...comparisonLines].join('\n')
|
||||||
})
|
})
|
||||||
.join('\n\n')
|
.join('\n\n')
|
||||||
|
|
||||||
// Add separator and totals row
|
// Add separator and totals row
|
||||||
const separator = '─────────────────────────────────────────────────────────────────────'
|
const separator = '─────────────────────────────────────────────────────────────────────'
|
||||||
|
|
||||||
// Calculate bar for totals (TOON vs average of JSON+YAML+XML)
|
// Calculate bar for totals (TOON vs average of comparison formats)
|
||||||
const averageComparisonTokens = (totalJsonTokens + totalYamlTokens + totalXmlTokens) / 3
|
const comparisonTokens = formatOrder.map(name => totalTokensByFormat[name]!)
|
||||||
|
const averageComparisonTokens = comparisonTokens.reduce((a, b) => a + b, 0) / comparisonTokens.length
|
||||||
const totalPercentage = (totalToonTokens / averageComparisonTokens) * 100
|
const totalPercentage = (totalToonTokens / averageComparisonTokens) * 100
|
||||||
const totalBar = createProgressBar(totalPercentage, 100)
|
const totalBar = createProgressBar(totalPercentage, 100)
|
||||||
|
|
||||||
const totalLine1 = `Total ${totalBar} ${totalToonTokens.toLocaleString('en-US').padStart(6)} tokens`
|
const totalLine1 = `Total ${totalBar} ${totalToonTokens.toLocaleString('en-US').padStart(6)} tokens`
|
||||||
const totalLine2 = ` vs JSON: ${totalJsonTokens.toLocaleString('en-US').padStart(6)} (-${totalJsonSavingsPercent}%)`
|
|
||||||
const totalLine3 = ` vs YAML: ${totalYamlTokens.toLocaleString('en-US').padStart(6)} (-${totalYamlSavingsPercent}%)`
|
|
||||||
const totalLine4 = ` vs XML: ${totalXmlTokens.toLocaleString('en-US').padStart(6)} (-${totalXmlSavingsPercent}%)`
|
|
||||||
|
|
||||||
const barChartSection = `${datasetRows}\n\n${separator}\n${totalLine1}\n${totalLine2}\n${totalLine3}\n${totalLine4}`
|
const totalComparisonLines = formatOrder.map((formatName) => {
|
||||||
|
const label = FORMATTER_DISPLAY_NAMES[formatName] || formatName.toUpperCase()
|
||||||
|
const tokens = totalTokensByFormat[formatName]!
|
||||||
|
const percent = totalSavingsPercent[formatName]!
|
||||||
|
const labelWithSavings = `vs ${label} (-${percent}%)`.padEnd(28)
|
||||||
|
const tokenStr = tokens.toLocaleString('en-US').padStart(6)
|
||||||
|
return ` ${labelWithSavings}${tokenStr}`
|
||||||
|
})
|
||||||
|
|
||||||
|
const barChartSection = `${datasetRows}\n\n${separator}\n${totalLine1}\n${totalComparisonLines.join('\n')}`
|
||||||
|
|
||||||
// Generate detailed examples (only for selected examples)
|
// Generate detailed examples (only for selected examples)
|
||||||
// Note: Large datasets are truncated for display readability in the report.
|
// Note: Large datasets are truncated for display readability in the report.
|
||||||
@@ -185,7 +169,7 @@ const detailedExamples = results
|
|||||||
|
|
||||||
const separator = i < filtered.length - 1 ? '\n\n---' : ''
|
const separator = i < filtered.length - 1 ? '\n\n---' : ''
|
||||||
|
|
||||||
const json = result.formats.find(f => f.name === 'json')!
|
const json = result.formats.find(f => f.name === 'json-pretty')!
|
||||||
const toon = result.formats.find(f => f.name === 'toon')!
|
const toon = result.formats.find(f => f.name === 'toon')!
|
||||||
|
|
||||||
return `#### ${result.emoji} ${result.name}
|
return `#### ${result.emoji} ${result.name}
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.me
|
|||||||
export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
|
export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
|
||||||
'claude-haiku-4-5-20251001': 50,
|
'claude-haiku-4-5-20251001': 50,
|
||||||
'gemini-2.5-flash': 25,
|
'gemini-2.5-flash': 25,
|
||||||
'gpt-5-nano': undefined,
|
'gpt-5-nano': 50,
|
||||||
'grok-4-fast-non-reasoning': 50,
|
'grok-4-fast-non-reasoning': 50,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -23,6 +23,18 @@ export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
|
|||||||
*/
|
*/
|
||||||
export const DEFAULT_CONCURRENCY = 10
|
export const DEFAULT_CONCURRENCY = 10
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Display names for data format types
|
||||||
|
*/
|
||||||
|
export const FORMATTER_DISPLAY_NAMES: Record<string, string> = {
|
||||||
|
'json-pretty': 'JSON',
|
||||||
|
'json-compact': 'JSON compact',
|
||||||
|
'toon': 'TOON',
|
||||||
|
'csv': 'CSV',
|
||||||
|
'xml': 'XML',
|
||||||
|
'yaml': 'YAML',
|
||||||
|
} as const
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Progress bar configuration
|
* Progress bar configuration
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -12,11 +12,12 @@ import { encode as encodeToon } from '../../src/index'
|
|||||||
* CSV has inherent limitations with nested structures (see `toCSV` docs).
|
* CSV has inherent limitations with nested structures (see `toCSV` docs).
|
||||||
*/
|
*/
|
||||||
export const formatters: Record<string, (data: unknown) => string> = {
|
export const formatters: Record<string, (data: unknown) => string> = {
|
||||||
json: data => JSON.stringify(data, undefined, 2),
|
'json-pretty': data => JSON.stringify(data, undefined, 2),
|
||||||
toon: data => encodeToon(data),
|
'json-compact': data => JSON.stringify(data),
|
||||||
csv: data => toCSV(data),
|
'toon': data => encodeToon(data),
|
||||||
xml: data => toXML(data),
|
'csv': data => toCSV(data),
|
||||||
yaml: data => stringifyYAML(data),
|
'xml': data => toXML(data),
|
||||||
|
'yaml': data => stringifyYAML(data),
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import type { EvaluationResult, FormatResult, Question } from './types'
|
import type { EvaluationResult, FormatResult, Question } from './types'
|
||||||
import * as fsp from 'node:fs/promises'
|
import * as fsp from 'node:fs/promises'
|
||||||
import * as path from 'node:path'
|
import * as path from 'node:path'
|
||||||
import { BENCHMARKS_DIR } from './constants'
|
import { BENCHMARKS_DIR, FORMATTER_DISPLAY_NAMES } from './constants'
|
||||||
import { datasets } from './datasets'
|
import { datasets } from './datasets'
|
||||||
import { models } from './evaluate'
|
import { models } from './evaluate'
|
||||||
import { createProgressBar, ensureDir, tokenize } from './utils'
|
import { createProgressBar, ensureDir, tokenize } from './utils'
|
||||||
@@ -49,7 +49,7 @@ export function generateMarkdownReport(
|
|||||||
tokenCounts: Record<string, number>,
|
tokenCounts: Record<string, number>,
|
||||||
): string {
|
): string {
|
||||||
const toon = formatResults.find(r => r.format === 'toon')
|
const toon = formatResults.find(r => r.format === 'toon')
|
||||||
const json = formatResults.find(r => r.format === 'json')
|
const json = formatResults.find(r => r.format === 'json-pretty')
|
||||||
|
|
||||||
const modelIds = models.map(m => m.modelId)
|
const modelIds = models.map(m => m.modelId)
|
||||||
const modelNames = modelIds.filter(id => results.some(r => r.model === id))
|
const modelNames = modelIds.filter(id => results.some(r => r.model === id))
|
||||||
@@ -71,10 +71,11 @@ export function generateMarkdownReport(
|
|||||||
|
|
||||||
const formatLines = modelResults.map((result) => {
|
const formatLines = modelResults.map((result) => {
|
||||||
const bar = createProgressBar(result.accuracy, 1, 20)
|
const bar = createProgressBar(result.accuracy, 1, 20)
|
||||||
const accuracyStr = `${(result.accuracy * 100).toFixed(1)}%`.padStart(6)
|
const accuracyString = `${(result.accuracy * 100).toFixed(1)}%`.padStart(6)
|
||||||
const countStr = `(${result.correctCount}/${result.totalCount})`
|
const countString = `(${result.correctCount}/${result.totalCount})`
|
||||||
const prefix = result.format === 'toon' ? '→ ' : ' '
|
const prefix = result.format === 'toon' ? '→ ' : ' '
|
||||||
return `${prefix}${result.format.padEnd(12)} ${bar} ${accuracyStr} ${countStr}`
|
const displayName = FORMATTER_DISPLAY_NAMES[result.format] || result.format
|
||||||
|
return `${prefix}${displayName.padEnd(12)} ${bar} ${accuracyString} ${countString}`
|
||||||
}).join('\n')
|
}).join('\n')
|
||||||
|
|
||||||
// Add blank line before model name, except for first model
|
// Add blank line before model name, except for first model
|
||||||
@@ -248,7 +249,7 @@ ${totalQuestions} questions are generated dynamically across three categories:
|
|||||||
|
|
||||||
#### Evaluation Process
|
#### Evaluation Process
|
||||||
|
|
||||||
1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => f.format.toUpperCase()).join(', ')}).
|
1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => FORMATTER_DISPLAY_NAMES[f.format] || f.format).join(', ')}).
|
||||||
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
||||||
3. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).
|
3. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).
|
||||||
|
|
||||||
|
|||||||
@@ -40,19 +40,3 @@ export function tokenize(text: string): number {
|
|||||||
export async function ensureDir(dirPath: string): Promise<void> {
|
export async function ensureDir(dirPath: string): Promise<void> {
|
||||||
await fsp.mkdir(dirPath, { recursive: true })
|
await fsp.mkdir(dirPath, { recursive: true })
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Save data as formatted JSON file
|
|
||||||
*
|
|
||||||
* @param filePath - Path to save the file
|
|
||||||
* @param data - Data to serialize as JSON
|
|
||||||
* @param indent - Indentation spaces (default: 2)
|
|
||||||
*/
|
|
||||||
export async function saveJsonFile(
|
|
||||||
filePath: string,
|
|
||||||
data: unknown,
|
|
||||||
indent = 2,
|
|
||||||
): Promise<void> {
|
|
||||||
const json = JSON.stringify(data, undefined, indent)
|
|
||||||
await fsp.writeFile(filePath, `${json}\n`, 'utf-8')
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -150,8 +150,8 @@ export function encodeArrayOfObjectsAsTabular(
|
|||||||
depth: Depth,
|
depth: Depth,
|
||||||
options: ResolvedEncodeOptions,
|
options: ResolvedEncodeOptions,
|
||||||
): void {
|
): void {
|
||||||
const headerStr = formatHeader(rows.length, { key: prefix, fields: header, delimiter: options.delimiter, lengthMarker: options.lengthMarker })
|
const formattedHeader = formatHeader(rows.length, { key: prefix, fields: header, delimiter: options.delimiter, lengthMarker: options.lengthMarker })
|
||||||
writer.push(depth, `${headerStr}`)
|
writer.push(depth, `${formattedHeader}`)
|
||||||
|
|
||||||
writeTabularRows(rows, header, writer, depth + 1, options)
|
writeTabularRows(rows, header, writer, depth + 1, options)
|
||||||
}
|
}
|
||||||
@@ -255,8 +255,8 @@ export function encodeObjectAsListItem(obj: JsonObject, writer: LineWriter, dept
|
|||||||
const header = extractTabularHeader(firstValue)
|
const header = extractTabularHeader(firstValue)
|
||||||
if (header) {
|
if (header) {
|
||||||
// Tabular format for uniform arrays of objects
|
// Tabular format for uniform arrays of objects
|
||||||
const headerStr = formatHeader(firstValue.length, { key: firstKey, fields: header, delimiter: options.delimiter, lengthMarker: options.lengthMarker })
|
const formattedHeader = formatHeader(firstValue.length, { key: firstKey, fields: header, delimiter: options.delimiter, lengthMarker: options.lengthMarker })
|
||||||
writer.pushListItem(depth, headerStr)
|
writer.pushListItem(depth, formattedHeader)
|
||||||
writeTabularRows(firstValue, header, writer, depth + 1, options)
|
writeTabularRows(firstValue, header, writer, depth + 1, options)
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
|||||||
Reference in New Issue
Block a user