mirror of
https://github.com/voson-wang/toon.git
synced 2026-01-29 23:34:10 +08:00
chore(benchmarks): fix CSV question count in accuracy reports
This commit is contained in:
@@ -364,7 +364,7 @@ XML ▓▓▓▓▓▓▓▓▓▓░░░░░░░░░░ 13
|
|||||||
|
|
||||||
TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**.
|
TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**.
|
||||||
|
|
||||||
**Note on CSV:** Excluded from ranking as it only supports 436/209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.
|
**Note on CSV:** Excluded from ranking as it only supports 109 of 209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.
|
||||||
|
|
||||||
#### Per-Model Accuracy
|
#### Per-Model Accuracy
|
||||||
|
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ XML ▓▓▓▓▓▓▓▓▓▓░░░░░░░░░░ 13
|
|||||||
|
|
||||||
TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**.
|
TOON achieves **73.9%** accuracy (vs JSON's 69.7%) while using **39.6% fewer tokens**.
|
||||||
|
|
||||||
**Note on CSV:** Excluded from ranking as it only supports 436/209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.
|
**Note on CSV:** Excluded from ranking as it only supports 109 of 209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.
|
||||||
|
|
||||||
#### Per-Model Accuracy
|
#### Per-Model Accuracy
|
||||||
|
|
||||||
|
|||||||
@@ -95,7 +95,7 @@ ${generateDatasetCatalog(ACCURACY_DATASETS)}
|
|||||||
|
|
||||||
#### Efficiency Ranking (Accuracy per 1K Tokens)
|
#### Efficiency Ranking (Accuracy per 1K Tokens)
|
||||||
|
|
||||||
${generateEfficiencyRankingReport(formatResults)}
|
${generateEfficiencyRankingReport(formatResults, totalQuestions, modelNames.length)}
|
||||||
|
|
||||||
#### Per-Model Accuracy
|
#### Per-Model Accuracy
|
||||||
|
|
||||||
@@ -140,6 +140,8 @@ ${rows}
|
|||||||
*/
|
*/
|
||||||
function generateEfficiencyRankingReport(
|
function generateEfficiencyRankingReport(
|
||||||
formatResults: FormatResult[],
|
formatResults: FormatResult[],
|
||||||
|
totalQuestions: number,
|
||||||
|
modelCount: number,
|
||||||
): string {
|
): string {
|
||||||
const toon = formatResults.find(r => r.format === 'toon')
|
const toon = formatResults.find(r => r.format === 'toon')
|
||||||
const json = formatResults.find(r => r.format === 'json-pretty')
|
const json = formatResults.find(r => r.format === 'json-pretty')
|
||||||
@@ -175,7 +177,9 @@ function generateEfficiencyRankingReport(
|
|||||||
// Add CSV note if available
|
// Add CSV note if available
|
||||||
let csvNote = ''
|
let csvNote = ''
|
||||||
if (csv) {
|
if (csv) {
|
||||||
csvNote = `\n\n**Note on CSV:** Excluded from ranking as it only supports ${csv.totalCount}/209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.`
|
// CSV totalCount is evaluations (questions × models), so divide by number of models to get question count
|
||||||
|
const csvQuestionCount = csv.totalCount / modelCount
|
||||||
|
csvNote = `\n\n**Note on CSV:** Excluded from ranking as it only supports ${csvQuestionCount} of ${totalQuestions} questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.`
|
||||||
}
|
}
|
||||||
|
|
||||||
return `
|
return `
|
||||||
|
|||||||
Reference in New Issue
Block a user