From b4655b01af03b4ac983e978bf824249c2ba6226a Mon Sep 17 00:00:00 2001
From: Johann Schopplich <mail@johannschopplich.com>
Date: Fri, 7 Nov 2025 21:31:15 +0100
Subject: [PATCH] chore(benchmarks): fix CSV question count in accuracy reports

---
 README.md                                | 2 +-
 benchmarks/results/retrieval-accuracy.md | 2 +-
 benchmarks/src/report.ts                 | 8 ++++++--
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 3c0ce90..2e8b4c8 100644
--- a/README.md
+++ b/README.md
@@ -364,7 +364,7 @@ XML            ▓▓▓▓▓▓▓▓▓▓░░░░░░░░░░   13
 
 TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**.
 
-**Note on CSV:** Excluded from ranking as it only supports 436/209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.
+**Note on CSV:** Excluded from ranking as it only supports 109 of 209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.
 
 #### Per-Model Accuracy
 
diff --git a/benchmarks/results/retrieval-accuracy.md b/benchmarks/results/retrieval-accuracy.md
index b5dee75..7b9b287 100644
--- a/benchmarks/results/retrieval-accuracy.md
+++ b/benchmarks/results/retrieval-accuracy.md
@@ -45,7 +45,7 @@ XML            ▓▓▓▓▓▓▓▓▓▓░░░░░░░░░░   13
 
 TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**.
 
-**Note on CSV:** Excluded from ranking as it only supports 436/209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.
+**Note on CSV:** Excluded from ranking as it only supports 109 of 209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.
 
 #### Per-Model Accuracy
 
diff --git a/benchmarks/src/report.ts b/benchmarks/src/report.ts
index cb8f0e1..94b53e9 100644
--- a/benchmarks/src/report.ts
+++ b/benchmarks/src/report.ts
@@ -95,7 +95,7 @@ ${generateDatasetCatalog(ACCURACY_DATASETS)}
 
 #### Efficiency Ranking (Accuracy per 1K Tokens)
 
-${generateEfficiencyRankingReport(formatResults)}
+${generateEfficiencyRankingReport(formatResults, totalQuestions, modelNames.length)}
 
 #### Per-Model Accuracy
 
@@ -140,6 +140,8 @@ ${rows}
  */
 function generateEfficiencyRankingReport(
   formatResults: FormatResult[],
+  totalQuestions: number,
+  modelCount: number,
 ): string {
   const toon = formatResults.find(r => r.format === 'toon')
   const json = formatResults.find(r => r.format === 'json-pretty')
@@ -175,7 +177,9 @@ function generateEfficiencyRankingReport(
   // Add CSV note if available
   let csvNote = ''
   if (csv) {
-    csvNote = `\n\n**Note on CSV:** Excluded from ranking as it only supports ${csv.totalCount}/209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.`
+    // csv.totalCount counts evaluations (questions × models), so divide by the number of models to recover the question count
+    const csvQuestionCount = csv.totalCount / modelCount
+    csvNote = `\n\n**Note on CSV:** Excluded from ranking as it only supports ${csvQuestionCount} of ${totalQuestions} questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.`
   }
 
   return `