mirror of
https://github.com/voson-wang/toon.git
synced 2026-01-29 15:24:10 +08:00
test: add benchmarks for compact vs. pretty JSON
This commit is contained in:
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -4,35 +4,39 @@ Accuracy across **4 LLMs** on 154 data retrieval questions:
|
||||
|
||||
```
|
||||
gpt-5-nano
|
||||
→ toon ███████████████████░ 96.1% (148/154)
|
||||
csv ██████████████████░░ 90.3% (139/154)
|
||||
yaml ██████████████████░░ 89.0% (137/154)
|
||||
json ██████████████████░░ 87.7% (135/154)
|
||||
xml █████████████████░░░ 83.8% (129/154)
|
||||
→ TOON ███████████████████░ 96.1% (148/154)
|
||||
CSV ██████████████████░░ 91.6% (141/154)
|
||||
YAML ██████████████████░░ 91.6% (141/154)
|
||||
JSON compact ██████████████████░░ 91.6% (141/154)
|
||||
XML █████████████████░░░ 87.0% (134/154)
|
||||
JSON █████████████████░░░ 86.4% (133/154)
|
||||
|
||||
claude-haiku-4-5-20251001
|
||||
yaml ██████████░░░░░░░░░░ 49.4% (76/154)
|
||||
→ toon ██████████░░░░░░░░░░ 48.1% (74/154)
|
||||
csv ██████████░░░░░░░░░░ 48.1% (74/154)
|
||||
json █████████░░░░░░░░░░░ 47.4% (73/154)
|
||||
xml █████████░░░░░░░░░░░ 46.8% (72/154)
|
||||
JSON ██████████░░░░░░░░░░ 50.0% (77/154)
|
||||
YAML ██████████░░░░░░░░░░ 49.4% (76/154)
|
||||
→ TOON ██████████░░░░░░░░░░ 48.7% (75/154)
|
||||
XML ██████████░░░░░░░░░░ 48.1% (74/154)
|
||||
CSV █████████░░░░░░░░░░░ 47.4% (73/154)
|
||||
JSON compact █████████░░░░░░░░░░░ 44.2% (68/154)
|
||||
|
||||
gemini-2.5-flash
|
||||
csv ██████████████████░░ 87.7% (135/154)
|
||||
xml █████████████████░░░ 85.1% (131/154)
|
||||
→ toon █████████████████░░░ 83.8% (129/154)
|
||||
json ████████████████░░░░ 78.6% (121/154)
|
||||
yaml ███████████████░░░░░ 76.6% (118/154)
|
||||
CSV ██████████████████░░ 87.7% (135/154)
|
||||
XML ██████████████████░░ 87.7% (135/154)
|
||||
→ TOON █████████████████░░░ 86.4% (133/154)
|
||||
YAML ████████████████░░░░ 79.9% (123/154)
|
||||
JSON compact ████████████████░░░░ 79.9% (123/154)
|
||||
JSON ███████████████░░░░░ 76.6% (118/154)
|
||||
|
||||
grok-4-fast-non-reasoning
|
||||
→ toon ██████████░░░░░░░░░░ 48.7% (75/154)
|
||||
json ██████████░░░░░░░░░░ 48.1% (74/154)
|
||||
xml █████████░░░░░░░░░░░ 47.4% (73/154)
|
||||
yaml █████████░░░░░░░░░░░ 46.8% (72/154)
|
||||
csv █████████░░░░░░░░░░░ 45.5% (70/154)
|
||||
→ TOON ██████████░░░░░░░░░░ 49.4% (76/154)
|
||||
JSON ██████████░░░░░░░░░░ 48.7% (75/154)
|
||||
XML █████████░░░░░░░░░░░ 46.1% (71/154)
|
||||
YAML █████████░░░░░░░░░░░ 46.1% (71/154)
|
||||
JSON compact █████████░░░░░░░░░░░ 45.5% (70/154)
|
||||
CSV █████████░░░░░░░░░░░ 44.2% (68/154)
|
||||
```
|
||||
|
||||
**Key tradeoff:** TOON achieves **69.2% accuracy** (vs JSON's 65.4%) while using **46.3% fewer tokens** on these datasets.
|
||||
**Key tradeoff:** TOON achieves **70.1% accuracy** (vs JSON's 65.4%) while using **46.3% fewer tokens** on these datasets.
|
||||
|
||||
<details>
|
||||
<summary><strong>Performance by dataset and model</strong></summary>
|
||||
@@ -43,41 +47,45 @@ grok-4-fast-non-reasoning
|
||||
|
||||
| Format | Accuracy | Tokens | Correct/Total |
|
||||
| ------ | -------- | ------ | ------------- |
|
||||
| `csv` | 67.0% | 2,337 | 134/200 |
|
||||
| `toon` | 66.5% | 2,483 | 133/200 |
|
||||
| `yaml` | 65.5% | 4,969 | 131/200 |
|
||||
| `json` | 63.5% | 6,347 | 127/200 |
|
||||
| `xml` | 66.5% | 7,314 | 133/200 |
|
||||
| `csv` | 65.5% | 2,337 | 131/200 |
|
||||
| `toon` | 67.5% | 2,483 | 135/200 |
|
||||
| `json-compact` | 65.5% | 3,943 | 131/200 |
|
||||
| `yaml` | 68.5% | 4,969 | 137/200 |
|
||||
| `xml` | 69.5% | 7,314 | 139/200 |
|
||||
| `json-pretty` | 64.5% | 6,347 | 129/200 |
|
||||
|
||||
##### E-commerce orders with nested structures
|
||||
|
||||
| Format | Accuracy | Tokens | Correct/Total |
|
||||
| ------ | -------- | ------ | ------------- |
|
||||
| `toon` | 78.8% | 5,967 | 126/160 |
|
||||
| `csv` | 71.9% | 6,735 | 115/160 |
|
||||
| `yaml` | 71.9% | 7,328 | 115/160 |
|
||||
| `json` | 73.1% | 9,694 | 117/160 |
|
||||
| `xml` | 73.8% | 10,992 | 118/160 |
|
||||
| `csv` | 76.3% | 6,735 | 122/160 |
|
||||
| `json-compact` | 70.6% | 5,962 | 113/160 |
|
||||
| `yaml` | 72.5% | 7,328 | 116/160 |
|
||||
| `json-pretty` | 76.9% | 9,694 | 123/160 |
|
||||
| `xml` | 73.1% | 10,992 | 117/160 |
|
||||
|
||||
##### Time-series analytics data
|
||||
|
||||
| Format | Accuracy | Tokens | Correct/Total |
|
||||
| ------ | -------- | ------ | ------------- |
|
||||
| `csv` | 67.6% | 1,393 | 92/136 |
|
||||
| `toon` | 67.6% | 1,515 | 92/136 |
|
||||
| `yaml` | 64.7% | 2,938 | 88/136 |
|
||||
| `json` | 68.4% | 3,665 | 93/136 |
|
||||
| `xml` | 66.2% | 4,376 | 90/136 |
|
||||
| `toon` | 68.4% | 1,515 | 93/136 |
|
||||
| `csv` | 65.4% | 1,393 | 89/136 |
|
||||
| `json-compact` | 64.7% | 2,341 | 88/136 |
|
||||
| `yaml` | 66.2% | 2,938 | 90/136 |
|
||||
| `json-pretty` | 64.7% | 3,665 | 88/136 |
|
||||
| `xml` | 66.9% | 4,376 | 91/136 |
|
||||
|
||||
##### Top 100 GitHub repositories
|
||||
|
||||
| Format | Accuracy | Tokens | Correct/Total |
|
||||
| ------ | -------- | ------ | ------------- |
|
||||
| `csv` | 64.2% | 8,513 | 77/120 |
|
||||
| `toon` | 62.5% | 8,745 | 75/120 |
|
||||
| `yaml` | 57.5% | 13,129 | 69/120 |
|
||||
| `json` | 55.0% | 15,145 | 66/120 |
|
||||
| `xml` | 53.3% | 17,095 | 64/120 |
|
||||
| `toon` | 65.0% | 8,745 | 78/120 |
|
||||
| `csv` | 62.5% | 8,513 | 75/120 |
|
||||
| `json-compact` | 58.3% | 11,455 | 70/120 |
|
||||
| `yaml` | 56.7% | 13,129 | 68/120 |
|
||||
| `xml` | 55.8% | 17,095 | 67/120 |
|
||||
| `json-pretty` | 52.5% | 15,145 | 63/120 |
|
||||
|
||||
#### Performance by Model
|
||||
|
||||
@@ -86,40 +94,44 @@ grok-4-fast-non-reasoning
|
||||
| Format | Accuracy | Correct/Total |
|
||||
| ------ | -------- | ------------- |
|
||||
| `toon` | 96.1% | 148/154 |
|
||||
| `csv` | 90.3% | 139/154 |
|
||||
| `yaml` | 89.0% | 137/154 |
|
||||
| `json` | 87.7% | 135/154 |
|
||||
| `xml` | 83.8% | 129/154 |
|
||||
| `csv` | 91.6% | 141/154 |
|
||||
| `yaml` | 91.6% | 141/154 |
|
||||
| `json-compact` | 91.6% | 141/154 |
|
||||
| `xml` | 87.0% | 134/154 |
|
||||
| `json-pretty` | 86.4% | 133/154 |
|
||||
|
||||
##### claude-haiku-4-5-20251001
|
||||
|
||||
| Format | Accuracy | Correct/Total |
|
||||
| ------ | -------- | ------------- |
|
||||
| `json-pretty` | 50.0% | 77/154 |
|
||||
| `yaml` | 49.4% | 76/154 |
|
||||
| `toon` | 48.1% | 74/154 |
|
||||
| `csv` | 48.1% | 74/154 |
|
||||
| `json` | 47.4% | 73/154 |
|
||||
| `xml` | 46.8% | 72/154 |
|
||||
| `toon` | 48.7% | 75/154 |
|
||||
| `xml` | 48.1% | 74/154 |
|
||||
| `csv` | 47.4% | 73/154 |
|
||||
| `json-compact` | 44.2% | 68/154 |
|
||||
|
||||
##### gemini-2.5-flash
|
||||
|
||||
| Format | Accuracy | Correct/Total |
|
||||
| ------ | -------- | ------------- |
|
||||
| `csv` | 87.7% | 135/154 |
|
||||
| `xml` | 85.1% | 131/154 |
|
||||
| `toon` | 83.8% | 129/154 |
|
||||
| `json` | 78.6% | 121/154 |
|
||||
| `yaml` | 76.6% | 118/154 |
|
||||
| `xml` | 87.7% | 135/154 |
|
||||
| `toon` | 86.4% | 133/154 |
|
||||
| `yaml` | 79.9% | 123/154 |
|
||||
| `json-compact` | 79.9% | 123/154 |
|
||||
| `json-pretty` | 76.6% | 118/154 |
|
||||
|
||||
##### grok-4-fast-non-reasoning
|
||||
|
||||
| Format | Accuracy | Correct/Total |
|
||||
| ------ | -------- | ------------- |
|
||||
| `toon` | 48.7% | 75/154 |
|
||||
| `json` | 48.1% | 74/154 |
|
||||
| `xml` | 47.4% | 73/154 |
|
||||
| `yaml` | 46.8% | 72/154 |
|
||||
| `csv` | 45.5% | 70/154 |
|
||||
| `toon` | 49.4% | 76/154 |
|
||||
| `json-pretty` | 48.7% | 75/154 |
|
||||
| `xml` | 46.1% | 71/154 |
|
||||
| `yaml` | 46.1% | 71/154 |
|
||||
| `json-compact` | 45.5% | 70/154 |
|
||||
| `csv` | 44.2% | 68/154 |
|
||||
|
||||
</details>
|
||||
|
||||
@@ -159,7 +171,7 @@ Four datasets designed to test different structural patterns (all contain arrays
|
||||
|
||||
#### Evaluation Process
|
||||
|
||||
1. **Format conversion**: Each dataset is converted to all 5 formats (TOON, CSV, XML, JSON, YAML).
|
||||
1. **Format conversion**: Each dataset is converted to all 6 formats (TOON, CSV, XML, YAML, JSON, JSON compact).
|
||||
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
||||
3. **Validate with LLM-as-judge**: `gpt-5-nano` validates if the answer is semantically correct (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`).
|
||||
|
||||
@@ -168,6 +180,6 @@ Four datasets designed to test different structural patterns (all contain arrays
|
||||
- **Models tested**: `gpt-5-nano`, `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `grok-4-fast-non-reasoning`
|
||||
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
|
||||
- **Temperature**: Not set (models use their defaults)
|
||||
- **Total evaluations**: 154 questions × 5 formats × 4 models = 3,080 LLM calls
|
||||
- **Total evaluations**: 154 questions × 6 formats × 4 models = 3,696 LLM calls
|
||||
|
||||
</details>
|
||||
|
||||
@@ -1,26 +1,30 @@
|
||||
### Token Efficiency
|
||||
|
||||
```
|
||||
⭐ GitHub Repositories ██████████████░░░░░░░░░░░ 8,745 tokens
|
||||
vs JSON: 15,145 (-42.3%)
|
||||
vs YAML: 13,129 (-33.4%)
|
||||
vs XML: 17,095 (-48.8%)
|
||||
⭐ GitHub Repositories ██████████████░░░░░░░░░░░ 8,745 tokens
|
||||
vs JSON (-42.3%) 15,145
|
||||
vs JSON compact (-23.7%) 11,455
|
||||
vs YAML (-33.4%) 13,129
|
||||
vs XML (-48.8%) 17,095
|
||||
|
||||
📈 Daily Analytics ██████████░░░░░░░░░░░░░░░ 4,507 tokens
|
||||
vs JSON: 10,977 (-58.9%)
|
||||
vs YAML: 8,810 (-48.8%)
|
||||
vs XML: 13,128 (-65.7%)
|
||||
📈 Daily Analytics ██████████░░░░░░░░░░░░░░░ 4,507 tokens
|
||||
vs JSON (-58.9%) 10,977
|
||||
vs JSON compact (-35.7%) 7,013
|
||||
vs YAML (-48.8%) 8,810
|
||||
vs XML (-65.7%) 13,128
|
||||
|
||||
🛒 E-Commerce Order ████████████████░░░░░░░░░ 166 tokens
|
||||
vs JSON: 257 (-35.4%)
|
||||
vs YAML: 197 (-15.7%)
|
||||
vs XML: 271 (-38.7%)
|
||||
🛒 E-Commerce Order ████████████████░░░░░░░░░ 166 tokens
|
||||
vs JSON (-35.4%) 257
|
||||
vs JSON compact (-2.9%) 171
|
||||
vs YAML (-15.7%) 197
|
||||
vs XML (-38.7%) 271
|
||||
|
||||
─────────────────────────────────────────────────────────────────────
|
||||
Total █████████████░░░░░░░░░░░░ 13,418 tokens
|
||||
vs JSON: 26,379 (-49.1%)
|
||||
vs YAML: 22,136 (-39.4%)
|
||||
vs XML: 30,494 (-56.0%)
|
||||
Total ██████████████░░░░░░░░░░░ 13,418 tokens
|
||||
vs JSON (-49.1%) 26,379
|
||||
vs JSON compact (-28.0%) 18,639
|
||||
vs YAML (-39.4%) 22,136
|
||||
vs XML (-56.0%) 30,494
|
||||
```
|
||||
|
||||
<details>
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
import * as fsp from 'node:fs/promises'
|
||||
import * as path from 'node:path'
|
||||
import process from 'node:process'
|
||||
import * as prompts from '@clack/prompts'
|
||||
import { ofetch } from 'ofetch'
|
||||
import pMap from 'p-map'
|
||||
import { BENCHMARKS_DIR } from '../src/constants'
|
||||
import { ensureDir, saveJsonFile } from '../src/utils'
|
||||
import { ensureDir } from '../src/utils'
|
||||
|
||||
prompts.intro('GitHub Repositories Fetcher')
|
||||
|
||||
@@ -79,7 +80,8 @@ async function saveRepos(repos: Record<string, any>[]): Promise<void> {
|
||||
const outputFile = path.join(outputDir, 'github-repos.json')
|
||||
|
||||
await ensureDir(outputDir)
|
||||
await saveJsonFile(outputFile, repos)
|
||||
const jsonOutput = JSON.stringify(repos, undefined, 2)
|
||||
await fsp.writeFile(outputFile, `${jsonOutput}\n`, 'utf-8')
|
||||
|
||||
const relativePath = path.relative(BENCHMARKS_DIR, outputFile)
|
||||
prompts.log.info(`Result saved to \`${relativePath}\``)
|
||||
|
||||
@@ -3,7 +3,7 @@ import * as path from 'node:path'
|
||||
import * as prompts from '@clack/prompts'
|
||||
import { encode } from '../../src/index'
|
||||
import githubRepos from '../data/github-repos.json' with { type: 'json' }
|
||||
import { BENCHMARKS_DIR, ROOT_DIR } from '../src/constants'
|
||||
import { BENCHMARKS_DIR, FORMATTER_DISPLAY_NAMES, ROOT_DIR } from '../src/constants'
|
||||
import { generateAnalyticsData, generateOrderData } from '../src/datasets'
|
||||
import { formatters } from '../src/formatters'
|
||||
import { createProgressBar, ensureDir, tokenize } from '../src/utils'
|
||||
@@ -50,118 +50,102 @@ const BENCHMARK_EXAMPLES = [
|
||||
|
||||
prompts.intro('Token Efficiency Benchmark')
|
||||
|
||||
// Calculate total savings
|
||||
let totalJsonTokens = 0
|
||||
let totalToonTokens = 0
|
||||
let totalXmlTokens = 0
|
||||
let totalYamlTokens = 0
|
||||
|
||||
const results: BenchmarkResult[] = []
|
||||
const totalTokensByFormat: Record<string, number> = {}
|
||||
|
||||
for (const example of BENCHMARK_EXAMPLES) {
|
||||
const data = example.getData()
|
||||
|
||||
const jsonString = JSON.stringify(data, undefined, 2)
|
||||
const toonString = encode(data)
|
||||
const xmlString = formatters.xml!(data)
|
||||
const yamlString = formatters.yaml!(data)
|
||||
// Calculate tokens for each format
|
||||
const formatMetrics: FormatMetrics[] = []
|
||||
const tokensByFormat: Record<string, number> = {}
|
||||
|
||||
const jsonTokens = tokenize(jsonString)
|
||||
const toonTokens = tokenize(toonString)
|
||||
const xmlTokens = tokenize(xmlString)
|
||||
const yamlTokens = tokenize(yamlString)
|
||||
for (const [formatName, formatter] of Object.entries(formatters)) {
|
||||
const formattedString = formatter(data)
|
||||
const tokens = tokenize(formattedString)
|
||||
tokensByFormat[formatName] = tokens
|
||||
totalTokensByFormat[formatName] = (totalTokensByFormat[formatName] || 0) + tokens
|
||||
}
|
||||
|
||||
const jsonSavings = jsonTokens - toonTokens
|
||||
const xmlSavings = xmlTokens - toonTokens
|
||||
const yamlSavings = yamlTokens - toonTokens
|
||||
|
||||
totalJsonTokens += jsonTokens
|
||||
totalToonTokens += toonTokens
|
||||
totalXmlTokens += xmlTokens
|
||||
totalYamlTokens += yamlTokens
|
||||
// Calculate savings vs TOON
|
||||
const toonTokens = tokensByFormat.toon!
|
||||
for (const [formatName, tokens] of Object.entries(tokensByFormat)) {
|
||||
const savings = tokens - toonTokens
|
||||
formatMetrics.push({
|
||||
name: formatName,
|
||||
tokens,
|
||||
savings,
|
||||
savingsPercent: formatName === 'toon' ? '0.0' : ((savings / tokens) * 100).toFixed(1),
|
||||
})
|
||||
}
|
||||
|
||||
results.push({
|
||||
name: example.name,
|
||||
emoji: example.emoji,
|
||||
description: example.description,
|
||||
data,
|
||||
formats: [
|
||||
{
|
||||
name: 'toon',
|
||||
tokens: toonTokens,
|
||||
savings: 0,
|
||||
savingsPercent: '0.0',
|
||||
},
|
||||
{
|
||||
name: 'json',
|
||||
tokens: jsonTokens,
|
||||
savings: jsonSavings,
|
||||
savingsPercent: ((jsonSavings / jsonTokens) * 100).toFixed(1),
|
||||
},
|
||||
{
|
||||
name: 'xml',
|
||||
tokens: xmlTokens,
|
||||
savings: xmlSavings,
|
||||
savingsPercent: ((xmlSavings / xmlTokens) * 100).toFixed(1),
|
||||
},
|
||||
{
|
||||
name: 'yaml',
|
||||
tokens: yamlTokens,
|
||||
savings: yamlSavings,
|
||||
savingsPercent: ((yamlSavings / yamlTokens) * 100).toFixed(1),
|
||||
},
|
||||
],
|
||||
formats: formatMetrics,
|
||||
showDetailed: example.showDetailed,
|
||||
})
|
||||
}
|
||||
|
||||
const totalJsonSavings = totalJsonTokens - totalToonTokens
|
||||
const totalJsonSavingsPercent = ((totalJsonSavings / totalJsonTokens) * 100).toFixed(1)
|
||||
|
||||
const totalXmlSavings = totalXmlTokens - totalToonTokens
|
||||
const totalXmlSavingsPercent = ((totalXmlSavings / totalXmlTokens) * 100).toFixed(1)
|
||||
|
||||
const totalYamlSavings = totalYamlTokens - totalToonTokens
|
||||
const totalYamlSavingsPercent = ((totalYamlSavings / totalYamlTokens) * 100).toFixed(1)
|
||||
// Calculate total savings percentages
|
||||
const totalToonTokens = totalTokensByFormat.toon!
|
||||
const totalSavingsPercent: Record<string, string> = {}
|
||||
for (const [formatName, totalTokens] of Object.entries(totalTokensByFormat)) {
|
||||
if (formatName === 'toon') {
|
||||
totalSavingsPercent[formatName] = '0.0'
|
||||
}
|
||||
else {
|
||||
const savings = totalTokens - totalToonTokens
|
||||
totalSavingsPercent[formatName] = ((savings / totalTokens) * 100).toFixed(1)
|
||||
}
|
||||
}
|
||||
|
||||
// Generate ASCII bar chart visualization (stacked compact format)
|
||||
const formatOrder = ['json-pretty', 'json-compact', 'yaml', 'xml']
|
||||
const datasetRows = results
|
||||
.map((result) => {
|
||||
const toon = result.formats.find(f => f.name === 'toon')!
|
||||
const json = result.formats.find(f => f.name === 'json')!
|
||||
const xml = result.formats.find(f => f.name === 'xml')!
|
||||
const yaml = result.formats.find(f => f.name === 'yaml')!
|
||||
|
||||
const percentage = Number.parseFloat(json.savingsPercent)
|
||||
const percentage = Number.parseFloat(result.formats.find(f => f.name === 'json-pretty')!.savingsPercent)
|
||||
const bar = createProgressBar(100 - percentage, 100) // Invert to show TOON tokens
|
||||
const toonStr = toon.tokens.toLocaleString('en-US')
|
||||
const jsonStr = json.tokens.toLocaleString('en-US')
|
||||
const xmlStr = xml.tokens.toLocaleString('en-US')
|
||||
const yamlStr = yaml.tokens.toLocaleString('en-US')
|
||||
|
||||
const line1 = `${result.emoji} ${result.name.padEnd(25)} ${bar} ${toonStr.padStart(6)} tokens`
|
||||
const line2 = ` vs JSON: ${jsonStr.padStart(6)} (-${json.savingsPercent}%)`
|
||||
const line3 = ` vs YAML: ${yamlStr.padStart(6)} (-${yaml.savingsPercent}%)`
|
||||
const line4 = ` vs XML: ${xmlStr.padStart(6)} (-${xml.savingsPercent}%)`
|
||||
const line1 = `${result.emoji} ${result.name.padEnd(25)} ${bar} ${toonStr.padStart(6)} tokens`
|
||||
|
||||
return `${line1}\n${line2}\n${line3}\n${line4}`
|
||||
const comparisonLines = formatOrder.map((formatName) => {
|
||||
const format = result.formats.find(f => f.name === formatName)!
|
||||
const label = FORMATTER_DISPLAY_NAMES[formatName] || formatName.toUpperCase()
|
||||
const labelWithSavings = `vs ${label} (-${format.savingsPercent}%)`.padEnd(28)
|
||||
const tokenStr = format.tokens.toLocaleString('en-US').padStart(6)
|
||||
return ` ${labelWithSavings}${tokenStr}`
|
||||
})
|
||||
|
||||
return [line1, ...comparisonLines].join('\n')
|
||||
})
|
||||
.join('\n\n')
|
||||
|
||||
// Add separator and totals row
|
||||
const separator = '─────────────────────────────────────────────────────────────────────'
|
||||
|
||||
// Calculate bar for totals (TOON vs average of JSON+YAML+XML)
|
||||
const averageComparisonTokens = (totalJsonTokens + totalYamlTokens + totalXmlTokens) / 3
|
||||
// Calculate bar for totals (TOON vs average of comparison formats)
|
||||
const comparisonTokens = formatOrder.map(name => totalTokensByFormat[name]!)
|
||||
const averageComparisonTokens = comparisonTokens.reduce((a, b) => a + b, 0) / comparisonTokens.length
|
||||
const totalPercentage = (totalToonTokens / averageComparisonTokens) * 100
|
||||
const totalBar = createProgressBar(totalPercentage, 100)
|
||||
|
||||
const totalLine1 = `Total ${totalBar} ${totalToonTokens.toLocaleString('en-US').padStart(6)} tokens`
|
||||
const totalLine2 = ` vs JSON: ${totalJsonTokens.toLocaleString('en-US').padStart(6)} (-${totalJsonSavingsPercent}%)`
|
||||
const totalLine3 = ` vs YAML: ${totalYamlTokens.toLocaleString('en-US').padStart(6)} (-${totalYamlSavingsPercent}%)`
|
||||
const totalLine4 = ` vs XML: ${totalXmlTokens.toLocaleString('en-US').padStart(6)} (-${totalXmlSavingsPercent}%)`
|
||||
const totalLine1 = `Total ${totalBar} ${totalToonTokens.toLocaleString('en-US').padStart(6)} tokens`
|
||||
|
||||
const barChartSection = `${datasetRows}\n\n${separator}\n${totalLine1}\n${totalLine2}\n${totalLine3}\n${totalLine4}`
|
||||
const totalComparisonLines = formatOrder.map((formatName) => {
|
||||
const label = FORMATTER_DISPLAY_NAMES[formatName] || formatName.toUpperCase()
|
||||
const tokens = totalTokensByFormat[formatName]!
|
||||
const percent = totalSavingsPercent[formatName]!
|
||||
const labelWithSavings = `vs ${label} (-${percent}%)`.padEnd(28)
|
||||
const tokenStr = tokens.toLocaleString('en-US').padStart(6)
|
||||
return ` ${labelWithSavings}${tokenStr}`
|
||||
})
|
||||
|
||||
const barChartSection = `${datasetRows}\n\n${separator}\n${totalLine1}\n${totalComparisonLines.join('\n')}`
|
||||
|
||||
// Generate detailed examples (only for selected examples)
|
||||
// Note: Large datasets are truncated for display readability in the report.
|
||||
@@ -185,7 +169,7 @@ const detailedExamples = results
|
||||
|
||||
const separator = i < filtered.length - 1 ? '\n\n---' : ''
|
||||
|
||||
const json = result.formats.find(f => f.name === 'json')!
|
||||
const json = result.formats.find(f => f.name === 'json-pretty')!
|
||||
const toon = result.formats.find(f => f.name === 'toon')!
|
||||
|
||||
return `#### ${result.emoji} ${result.name}
|
||||
|
||||
@@ -14,7 +14,7 @@ export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.me
|
||||
export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
|
||||
'claude-haiku-4-5-20251001': 50,
|
||||
'gemini-2.5-flash': 25,
|
||||
'gpt-5-nano': undefined,
|
||||
'gpt-5-nano': 50,
|
||||
'grok-4-fast-non-reasoning': 50,
|
||||
}
|
||||
|
||||
@@ -23,6 +23,18 @@ export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
|
||||
*/
|
||||
export const DEFAULT_CONCURRENCY = 10
|
||||
|
||||
/**
 * Human-readable display names for the benchmark data formats.
 *
 * Keys are the format identifiers used as formatter keys (`json-pretty`,
 * `json-compact`, `toon`, `csv`, `xml`, `yaml`); values are the labels shown
 * in generated reports and charts.
 *
 * The map is typed as `Record<string, string>` so it can be indexed with an
 * arbitrary format name. A lookup for an unlisted format yields `undefined`
 * at runtime, so callers are expected to supply their own fallback label.
 */
export const FORMATTER_DISPLAY_NAMES: Record<string, string> = {
  'json-pretty': 'JSON',
  'json-compact': 'JSON compact',
  'toon': 'TOON',
  'csv': 'CSV',
  'xml': 'XML',
  'yaml': 'YAML',
}
|
||||
|
||||
/**
|
||||
* Progress bar configuration
|
||||
*/
|
||||
|
||||
@@ -12,11 +12,12 @@ import { encode as encodeToon } from '../../src/index'
|
||||
* CSV has inherent limitations with nested structures (see `toCSV` docs).
|
||||
*/
|
||||
export const formatters: Record<string, (data: unknown) => string> = {
|
||||
json: data => JSON.stringify(data, undefined, 2),
|
||||
toon: data => encodeToon(data),
|
||||
csv: data => toCSV(data),
|
||||
xml: data => toXML(data),
|
||||
yaml: data => stringifyYAML(data),
|
||||
'json-pretty': data => JSON.stringify(data, undefined, 2),
|
||||
'json-compact': data => JSON.stringify(data),
|
||||
'toon': data => encodeToon(data),
|
||||
'csv': data => toCSV(data),
|
||||
'xml': data => toXML(data),
|
||||
'yaml': data => stringifyYAML(data),
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import type { EvaluationResult, FormatResult, Question } from './types'
|
||||
import * as fsp from 'node:fs/promises'
|
||||
import * as path from 'node:path'
|
||||
import { BENCHMARKS_DIR } from './constants'
|
||||
import { BENCHMARKS_DIR, FORMATTER_DISPLAY_NAMES } from './constants'
|
||||
import { datasets } from './datasets'
|
||||
import { models } from './evaluate'
|
||||
import { createProgressBar, ensureDir, tokenize } from './utils'
|
||||
@@ -49,7 +49,7 @@ export function generateMarkdownReport(
|
||||
tokenCounts: Record<string, number>,
|
||||
): string {
|
||||
const toon = formatResults.find(r => r.format === 'toon')
|
||||
const json = formatResults.find(r => r.format === 'json')
|
||||
const json = formatResults.find(r => r.format === 'json-pretty')
|
||||
|
||||
const modelIds = models.map(m => m.modelId)
|
||||
const modelNames = modelIds.filter(id => results.some(r => r.model === id))
|
||||
@@ -71,10 +71,11 @@ export function generateMarkdownReport(
|
||||
|
||||
const formatLines = modelResults.map((result) => {
|
||||
const bar = createProgressBar(result.accuracy, 1, 20)
|
||||
const accuracyStr = `${(result.accuracy * 100).toFixed(1)}%`.padStart(6)
|
||||
const countStr = `(${result.correctCount}/${result.totalCount})`
|
||||
const accuracyString = `${(result.accuracy * 100).toFixed(1)}%`.padStart(6)
|
||||
const countString = `(${result.correctCount}/${result.totalCount})`
|
||||
const prefix = result.format === 'toon' ? '→ ' : ' '
|
||||
return `${prefix}${result.format.padEnd(12)} ${bar} ${accuracyStr} ${countStr}`
|
||||
const displayName = FORMATTER_DISPLAY_NAMES[result.format] || result.format
|
||||
return `${prefix}${displayName.padEnd(12)} ${bar} ${accuracyString} ${countString}`
|
||||
}).join('\n')
|
||||
|
||||
// Add blank line before model name, except for first model
|
||||
@@ -248,7 +249,7 @@ ${totalQuestions} questions are generated dynamically across three categories:
|
||||
|
||||
#### Evaluation Process
|
||||
|
||||
1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => f.format.toUpperCase()).join(', ')}).
|
||||
1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => FORMATTER_DISPLAY_NAMES[f.format] || f.format).join(', ')}).
|
||||
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
||||
3. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).
|
||||
|
||||
|
||||
@@ -40,19 +40,3 @@ export function tokenize(text: string): number {
|
||||
export async function ensureDir(dirPath: string): Promise<void> {
|
||||
await fsp.mkdir(dirPath, { recursive: true })
|
||||
}
|
||||
|
||||
/**
|
||||
* Save data as formatted JSON file
|
||||
*
|
||||
* @param filePath - Path to save the file
|
||||
* @param data - Data to serialize as JSON
|
||||
* @param indent - Indentation spaces (default: 2)
|
||||
*/
|
||||
export async function saveJsonFile(
|
||||
filePath: string,
|
||||
data: unknown,
|
||||
indent = 2,
|
||||
): Promise<void> {
|
||||
const json = JSON.stringify(data, undefined, indent)
|
||||
await fsp.writeFile(filePath, `${json}\n`, 'utf-8')
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user