From b4655b01af03b4ac983e978bf824249c2ba6226a Mon Sep 17 00:00:00 2001 From: Johann Schopplich Date: Fri, 7 Nov 2025 21:31:15 +0100 Subject: [PATCH] chore(benchmarks): fix CSV question count in accuracy reports --- README.md | 2 +- benchmarks/results/retrieval-accuracy.md | 2 +- benchmarks/src/report.ts | 8 ++++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 3c0ce90..2e8b4c8 100644 --- a/README.md +++ b/README.md @@ -364,7 +364,7 @@ XML ▓▓▓▓▓▓▓▓▓▓░░░░░░░░░░ 13 TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**. -**Note on CSV:** Excluded from ranking as it only supports 436/209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle. +**Note on CSV:** Excluded from ranking as it only supports 109 of 209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle. #### Per-Model Accuracy diff --git a/benchmarks/results/retrieval-accuracy.md b/benchmarks/results/retrieval-accuracy.md index b5dee75..7b9b287 100644 --- a/benchmarks/results/retrieval-accuracy.md +++ b/benchmarks/results/retrieval-accuracy.md @@ -45,7 +45,7 @@ XML ▓▓▓▓▓▓▓▓▓▓░░░░░░░░░░ 13 TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**. -**Note on CSV:** Excluded from ranking as it only supports 436/209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle. +**Note on CSV:** Excluded from ranking as it only supports 109 of 209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle. #### Per-Model Accuracy diff --git a/benchmarks/src/report.ts b/benchmarks/src/report.ts index cb8f0e1..94b53e9 100644 --- a/benchmarks/src/report.ts +++ b/benchmarks/src/report.ts @@ -95,7 +95,7 @@ ${generateDatasetCatalog(ACCURACY_DATASETS)} #### Efficiency Ranking (Accuracy per 1K Tokens) -${generateEfficiencyRankingReport(formatResults)} +${generateEfficiencyRankingReport(formatResults, totalQuestions, modelNames.length)} #### Per-Model Accuracy @@ -140,6 +140,8 @@ ${rows} */ function generateEfficiencyRankingReport( formatResults: FormatResult[], + totalQuestions: number, + modelCount: number, ): string { const toon = formatResults.find(r => r.format === 'toon') const json = formatResults.find(r => r.format === 'json-pretty') @@ -175,7 +177,9 @@ function generateEfficiencyRankingReport( // Add CSV note if available let csvNote = '' if (csv) { - csvNote = `\n\n**Note on CSV:** Excluded from ranking as it only supports ${csv.totalCount}/209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.` + // CSV totalCount is evaluations (questions × models), so divide by number of models to get question count + const csvQuestionCount = csv.totalCount / modelCount + csvNote = `\n\n**Note on CSV:** Excluded from ranking as it only supports ${csvQuestionCount} of ${totalQuestions} questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.` } return `