mirror of
https://github.com/voson-wang/toon.git
synced 2026-01-29 23:34:10 +08:00
chore(benchmarks): add structure-awareness questions
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
import type { Dataset, EfficiencyRanking, EvaluationResult, FormatResult, Question } from './types'
|
||||
import { FORMATTER_DISPLAY_NAMES } from './constants'
|
||||
import { FORMATTER_DISPLAY_NAMES, QUESTION_TYPE_LABELS, QUESTION_TYPES } from './constants'
|
||||
import { ACCURACY_DATASETS } from './datasets'
|
||||
import { models } from './evaluate'
|
||||
import { supportsCSV } from './formatters'
|
||||
@@ -22,9 +22,9 @@ export function calculateTokenCounts(
|
||||
if (formatName === 'csv' && !supportsCSV(dataset))
|
||||
continue
|
||||
|
||||
const formatted = formatter(dataset.data)
|
||||
const formattedData = formatter(dataset.data)
|
||||
const key = `${formatName}-${dataset.name}`
|
||||
tokenCounts[key] = tokenize(formatted)
|
||||
tokenCounts[key] = tokenize(formattedData)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -200,16 +200,21 @@ function generateDetailedAccuracyReport(
|
||||
|
||||
// Generate performance by model
|
||||
const modelPerformance = generateModelPerformanceTable(formatResults, results, modelNames)
|
||||
|
||||
// Generate question type breakdown
|
||||
const questionTypeBreakdown = generateQuestionTypeBreakdown(formatResults, results, questions)
|
||||
const totalQuestions = [...new Set(results.map(r => r.questionId))].length
|
||||
|
||||
// Calculate question type distribution
|
||||
const fieldRetrievalCount = questions.filter(q => q.type === 'field-retrieval').length
|
||||
const aggregationCount = questions.filter(q => q.type === 'aggregation').length
|
||||
const filteringCount = questions.filter(q => q.type === 'filtering').length
|
||||
const structureAwarenessCount = questions.filter(q => q.type === 'structure-awareness').length
|
||||
|
||||
const fieldRetrievalPercent = ((fieldRetrievalCount / totalQuestions) * 100).toFixed(0)
|
||||
const aggregationPercent = ((aggregationCount / totalQuestions) * 100).toFixed(0)
|
||||
const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)
|
||||
const structureAwarenessPercent = ((structureAwarenessCount / totalQuestions) * 100).toFixed(0)
|
||||
|
||||
// Calculate dataset sizes
|
||||
const tabularSize = ACCURACY_DATASETS.find(d => d.name === 'tabular')?.data.employees?.length || 0
|
||||
@@ -233,7 +238,11 @@ ${modelBreakdown}
|
||||
${summaryComparison}
|
||||
|
||||
<details>
|
||||
<summary><strong>Performance by dataset and model</strong></summary>
|
||||
<summary><strong>Performance by dataset, model, and question type</strong></summary>
|
||||
|
||||
#### Performance by Question Type
|
||||
|
||||
${questionTypeBreakdown}
|
||||
|
||||
#### Performance by Dataset
|
||||
|
||||
@@ -265,9 +274,9 @@ Six datasets designed to test different structural patterns:
|
||||
|
||||
#### Question Types
|
||||
|
||||
${totalQuestions} questions are generated dynamically across three categories:
|
||||
${totalQuestions} questions are generated dynamically across four categories:
|
||||
|
||||
\- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
||||
- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
||||
- Example: "What is Alice's salary?" → \`75000\`
|
||||
- Example: "How many items are in order ORD-0042?" → \`3\`
|
||||
- Example: "What is the customer name for order ORD-0042?" → \`John Doe\`
|
||||
@@ -281,6 +290,11 @@ ${totalQuestions} questions are generated dynamically across three categories:
|
||||
- Example: "How many employees in Sales have salary > 80000?" → \`5\`
|
||||
- Example: "How many active employees have more than 10 years of experience?" → \`8\`
|
||||
|
||||
- **Structure awareness (${structureAwarenessPercent}%)**: Tests format-native structural affordances (TOON's [N] count and {fields}, CSV's header row)
|
||||
- Example: "How many employees are in the dataset?" → \`100\`
|
||||
- Example: "List the field names for employees" → \`id, name, email, department, salary, yearsExperience, active\`
|
||||
- Example: "What is the department of the last employee?" → \`Sales\`
|
||||
|
||||
#### Evaluation Process
|
||||
|
||||
1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => FORMATTER_DISPLAY_NAMES[f.format] || f.format).join(', ')}).
|
||||
@@ -413,6 +427,48 @@ ${tableRows}
|
||||
}).filter(Boolean).join('\n').trim()
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate question type breakdown table
|
||||
*/
|
||||
function generateQuestionTypeBreakdown(
|
||||
formatResults: FormatResult[],
|
||||
results: EvaluationResult[],
|
||||
questions: Question[],
|
||||
): string {
|
||||
// Build header
|
||||
const formatNames = formatResults.map(fr => FORMATTER_DISPLAY_NAMES[fr.format] || fr.format)
|
||||
const header = `| Question Type | ${formatNames.join(' | ')} |`
|
||||
const separator = `| ------------- | ${formatNames.map(() => '----').join(' | ')} |`
|
||||
|
||||
// Build rows
|
||||
const rows = QUESTION_TYPES.map((type) => {
|
||||
const questionIds = questions.filter(q => q.type === type).map(q => q.id)
|
||||
const typeResults = results.filter(r => questionIds.includes(r.questionId))
|
||||
|
||||
if (typeResults.length === 0)
|
||||
return undefined
|
||||
|
||||
const accuracies = formatResults.map((fr) => {
|
||||
const formatTypeResults = typeResults.filter(r => r.format === fr.format)
|
||||
if (formatTypeResults.length === 0)
|
||||
return 'N/A'
|
||||
|
||||
const correctCount = formatTypeResults.filter(r => r.isCorrect).length
|
||||
const totalCount = formatTypeResults.length
|
||||
const accuracy = totalCount > 0 ? correctCount / totalCount : 0
|
||||
return `${(accuracy * 100).toFixed(1)}%`
|
||||
})
|
||||
|
||||
return `| ${QUESTION_TYPE_LABELS[type]} | ${accuracies.join(' | ')} |`
|
||||
}).filter(Boolean)
|
||||
|
||||
return `
|
||||
${header}
|
||||
${separator}
|
||||
${rows.join('\n')}
|
||||
`.trim()
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate per-model performance comparison tables
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user