chore(benchmarks): fix CSV question count in accuracy reports

This commit is contained in:
Johann Schopplich
2025-11-07 21:31:15 +01:00
parent acca69c64a
commit b4655b01af
3 changed files with 8 additions and 4 deletions

View File

@@ -45,7 +45,7 @@ XML ▓▓▓▓▓▓▓▓▓▓░░░░░░░░░░ 13
TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**.
**Note on CSV:** Excluded from ranking as it only supports 436/209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.
**Note on CSV:** Excluded from ranking as it only supports 109 of 209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.
#### Per-Model Accuracy

View File

@@ -95,7 +95,7 @@ ${generateDatasetCatalog(ACCURACY_DATASETS)}
#### Efficiency Ranking (Accuracy per 1K Tokens)
${generateEfficiencyRankingReport(formatResults)}
${generateEfficiencyRankingReport(formatResults, totalQuestions, modelNames.length)}
#### Per-Model Accuracy
@@ -140,6 +140,8 @@ ${rows}
*/
function generateEfficiencyRankingReport(
formatResults: FormatResult[],
totalQuestions: number,
modelCount: number,
): string {
const toon = formatResults.find(r => r.format === 'toon')
const json = formatResults.find(r => r.format === 'json-pretty')
@@ -175,7 +177,9 @@ function generateEfficiencyRankingReport(
// Add CSV note if available
let csvNote = ''
if (csv) {
csvNote = `\n\n**Note on CSV:** Excluded from ranking as it only supports ${csv.totalCount}/209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.`
// csv.totalCount counts evaluations (questions × models), so divide by the number of models to recover the number of distinct questions
const csvQuestionCount = csv.totalCount / modelCount
csvNote = `\n\n**Note on CSV:** Excluded from ranking as it only supports ${csvQuestionCount} of ${totalQuestions} questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.`
}
return `