diff --git a/README.md b/README.md index 07d4cf7..2dda5c4 100644 --- a/README.md +++ b/README.md @@ -195,17 +195,20 @@ Benchmarks test LLM comprehension across different input formats using 209 data #### Efficiency Ranking (Accuracy per 1K Tokens) -Each format's overall performance, balancing accuracy against token cost: +Each format ranked by efficiency (accuracy percentage per 1,000 tokens): ``` -TOON ████████████████████ 26.9 │ 73.9% acc │ 2,744 tokens -JSON compact █████████████████░░░ 22.9 │ 70.7% acc │ 3,081 tokens -YAML ██████████████░░░░░░ 18.6 │ 69.0% acc │ 3,719 tokens -JSON ███████████░░░░░░░░░ 15.3 │ 69.7% acc │ 4,545 tokens -XML ██████████░░░░░░░░░░ 13.0 │ 67.1% acc │ 5,167 tokens +TOON ████████████████████ 26.9 acc%/1K tok │ 73.9% acc │ 2,744 tokens +JSON compact █████████████████░░░ 22.9 acc%/1K tok │ 70.7% acc │ 3,081 tokens +YAML ██████████████░░░░░░ 18.6 acc%/1K tok │ 69.0% acc │ 3,719 tokens +JSON ███████████░░░░░░░░░ 15.3 acc%/1K tok │ 69.7% acc │ 4,545 tokens +XML ██████████░░░░░░░░░░ 13.0 acc%/1K tok │ 67.1% acc │ 5,167 tokens ``` -TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**. +*Efficiency score = (Accuracy % ÷ Tokens) × 1,000. Higher is better.* + +> [!TIP] +> TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**. **Note on CSV:** Excluded from ranking as it only supports 109 of 209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle. @@ -247,7 +250,7 @@ grok-4-fast-non-reasoning CSV ██████████░░░░░░░░░░ 52.3% (57/109) ``` -> [!TIP] Results Summary +> [!TIP] > TOON achieves **73.9% accuracy** (vs JSON's 69.7%) while using **39.6% fewer tokens** on these datasets.
diff --git a/benchmarks/results/retrieval-accuracy.md b/benchmarks/results/retrieval-accuracy.md index 868103b..7657299 100644 --- a/benchmarks/results/retrieval-accuracy.md +++ b/benchmarks/results/retrieval-accuracy.md @@ -33,17 +33,20 @@ Benchmarks test LLM comprehension across different input formats using 209 data #### Efficiency Ranking (Accuracy per 1K Tokens) -Each format's overall performance, balancing accuracy against token cost: +Each format ranked by efficiency (accuracy percentage per 1,000 tokens): ``` -TOON ████████████████████ 26.9 │ 73.9% acc │ 2,744 tokens -JSON compact █████████████████░░░ 22.9 │ 70.7% acc │ 3,081 tokens -YAML ██████████████░░░░░░ 18.6 │ 69.0% acc │ 3,719 tokens -JSON ███████████░░░░░░░░░ 15.3 │ 69.7% acc │ 4,545 tokens -XML ██████████░░░░░░░░░░ 13.0 │ 67.1% acc │ 5,167 tokens +TOON ████████████████████ 26.9 acc%/1K tok │ 73.9% acc │ 2,744 tokens +JSON compact █████████████████░░░ 22.9 acc%/1K tok │ 70.7% acc │ 3,081 tokens +YAML ██████████████░░░░░░ 18.6 acc%/1K tok │ 69.0% acc │ 3,719 tokens +JSON ███████████░░░░░░░░░ 15.3 acc%/1K tok │ 69.7% acc │ 4,545 tokens +XML ██████████░░░░░░░░░░ 13.0 acc%/1K tok │ 67.1% acc │ 5,167 tokens ``` -TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**. +*Efficiency score = (Accuracy % ÷ Tokens) × 1,000. Higher is better.* + +> [!TIP] +> TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**. **Note on CSV:** Excluded from ranking as it only supports 109 of 209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle. @@ -85,7 +88,7 @@ grok-4-fast-non-reasoning CSV ██████████░░░░░░░░░░ 52.3% (57/109) ``` -> [!TIP] Results Summary +> [!TIP] > TOON achieves **73.9% accuracy** (vs JSON's 69.7%) while using **39.6% fewer tokens** on these datasets.
diff --git a/benchmarks/src/report.ts b/benchmarks/src/report.ts index 4803115..42112e8 100644 --- a/benchmarks/src/report.ts +++ b/benchmarks/src/report.ts @@ -179,17 +179,22 @@ function generateEfficiencyRankingReport( if (csv) { // CSV totalCount is evaluations (questions × models), so divide by number of models to get question count const csvQuestionCount = csv.totalCount / modelCount - csvNote = `\n\n**Note on CSV:** Excluded from ranking as it only supports ${csvQuestionCount} of ${totalQuestions} questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.` + csvNote = `**Note on CSV:** Excluded from ranking as it only supports ${csvQuestionCount} of ${totalQuestions} questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.` } return ` -Each format's overall performance, balancing accuracy against token cost: +Each format ranked by efficiency (accuracy percentage per 1,000 tokens): \`\`\` ${efficiencyChart} \`\`\` -${summary}${csvNote} +*Efficiency score = (Accuracy % ÷ Tokens) × 1,000. Higher is better.* + +> [!TIP] +> ${summary} + +${csvNote} `.trim() } @@ -396,7 +401,7 @@ function generateSummaryComparison( return '' return ` -> [!TIP] Results Summary +> [!TIP] > TOON achieves **${(toon.accuracy * 100).toFixed(1)}% accuracy** (vs JSON's ${(json.accuracy * 100).toFixed(1)}%) while using **${((1 - toon.totalTokens / json.totalTokens) * 100).toFixed(1)}% fewer tokens** on these datasets. `.trim() } @@ -566,7 +571,7 @@ function generateHorizontalEfficiencyChart( const accuracy = `${(r.accuracy * 100).toFixed(1)}%`.padStart(5) const tokens = r.tokens.toLocaleString('en-US').padStart(5) - return `${formatName} ${bar} ${efficiency} │ ${accuracy} acc │ ${tokens} tokens` + return `${formatName} ${bar} ${efficiency} acc%/1K tok │ ${accuracy} acc │ ${tokens} tokens` }) .join('\n') } diff --git a/docs/guide/benchmarks.md b/docs/guide/benchmarks.md index 24a8e16..5042f4d 100644 --- a/docs/guide/benchmarks.md +++ b/docs/guide/benchmarks.md @@ -49,17 +49,20 @@ Benchmarks test LLM comprehension across different input formats using 209 data #### Efficiency Ranking (Accuracy per 1K Tokens) -Each format's overall performance, balancing accuracy against token cost: +Each format ranked by efficiency (accuracy percentage per 1,000 tokens): ``` -TOON ████████████████████ 26.9 │ 73.9% acc │ 2,744 tokens -JSON compact █████████████████░░░ 22.9 │ 70.7% acc │ 3,081 tokens -YAML ██████████████░░░░░░ 18.6 │ 69.0% acc │ 3,719 tokens -JSON ███████████░░░░░░░░░ 15.3 │ 69.7% acc │ 4,545 tokens -XML ██████████░░░░░░░░░░ 13.0 │ 67.1% acc │ 5,167 tokens +TOON ████████████████████ 26.9 acc%/1K tok │ 73.9% acc │ 2,744 tokens +JSON compact █████████████████░░░ 22.9 acc%/1K tok │ 70.7% acc │ 3,081 tokens +YAML ██████████████░░░░░░ 18.6 acc%/1K tok │ 69.0% acc │ 3,719 tokens +JSON ███████████░░░░░░░░░ 15.3 acc%/1K tok │ 69.7% acc │ 4,545 tokens +XML ██████████░░░░░░░░░░ 13.0 acc%/1K tok │ 67.1% acc │ 5,167 tokens ``` -TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**. +*Efficiency score = (Accuracy % ÷ Tokens) × 1,000. Higher is better.* + +> [!TIP] +> TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**. **Note on CSV:** Excluded from ranking as it only supports 109 of 209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle. @@ -101,7 +104,7 @@ grok-4-fast-non-reasoning CSV ██████████░░░░░░░░░░ 52.3% (57/109) ``` -> [!TIP] Results Summary +> [!TIP] > TOON achieves **73.9% accuracy** (vs JSON's 69.7%) while using **39.6% fewer tokens** on these datasets.