mirror of
https://github.com/voson-wang/toon.git
synced 2026-01-29 23:34:10 +08:00
test(benchmark): overhaul generation
This commit is contained in:
@@ -1,7 +1,8 @@
|
||||
import type { EfficiencyRanking, EvaluationResult, FormatResult, Question } from './types'
|
||||
import type { Dataset, EfficiencyRanking, EvaluationResult, FormatResult, Question } from './types'
|
||||
import { FORMATTER_DISPLAY_NAMES } from './constants'
|
||||
import { datasets } from './datasets'
|
||||
import { ACCURACY_DATASETS } from './datasets'
|
||||
import { models } from './evaluate'
|
||||
import { supportsCSV } from './formatters'
|
||||
import { generateQuestions } from './questions'
|
||||
import { createProgressBar, tokenize } from './utils'
|
||||
|
||||
@@ -16,7 +17,11 @@ export function calculateTokenCounts(
|
||||
const tokenCounts: Record<string, number> = {}
|
||||
|
||||
for (const [formatName, formatter] of Object.entries(formatters)) {
|
||||
for (const dataset of datasets) {
|
||||
for (const dataset of ACCURACY_DATASETS) {
|
||||
// Skip CSV for datasets that don't support it
|
||||
if (formatName === 'csv' && !supportsCSV(dataset))
|
||||
continue
|
||||
|
||||
const formatted = formatter(dataset.data)
|
||||
const key = `${formatName}-${dataset.name}`
|
||||
tokenCounts[key] = tokenize(formatted)
|
||||
@@ -42,9 +47,9 @@ export function calculateFormatResults(
|
||||
const accuracy = correctCount / totalCount
|
||||
|
||||
// Calculate average tokens across all datasets for this format
|
||||
const avgTokens = Object.entries(tokenCounts)
|
||||
const formatTokenEntries = Object.entries(tokenCounts)
|
||||
.filter(([key]) => key.startsWith(`${formatName}-`))
|
||||
.reduce((sum, [, tokens]) => sum + tokens, 0) / datasets.length
|
||||
const avgTokens = formatTokenEntries.reduce((sum, [, tokens]) => sum + tokens, 0) / formatTokenEntries.length
|
||||
|
||||
const averageLatency = formatResults.reduce((sum, r) => sum + r.latencyMs, 0) / totalCount
|
||||
|
||||
@@ -75,6 +80,8 @@ export function generateAccuracyReport(
|
||||
return `
|
||||
Benchmarks test LLM comprehension across different input formats using ${totalQuestions} data retrieval questions on ${modelNames.length} ${modelNames.length === 1 ? 'model' : 'models'}.
|
||||
|
||||
${generateDatasetCatalog(ACCURACY_DATASETS)}
|
||||
|
||||
#### Efficiency Ranking (Accuracy per 1K Tokens)
|
||||
|
||||
${generateEfficiencyRankingReport(formatResults)}
|
||||
@@ -85,6 +92,38 @@ ${generateDetailedAccuracyReport(formatResults, results, questions, tokenCounts)
|
||||
`.trimStart()
|
||||
}
|
||||
|
||||
/**
 * Generate dataset catalog section
 *
 * Renders a Markdown table with one row per dataset (description, row count,
 * structure class, CSV support, tabular eligibility), followed by a legend
 * explaining the columns.
 *
 * @param datasets - Datasets to list, in the order they should appear in the table
 * @returns Markdown for the "Dataset Catalog" section, trimmed of surrounding whitespace
 */
function generateDatasetCatalog(datasets: Dataset[]): string {
  const rows = datasets.map((dataset) => {
    const csvSupport = supportsCSV(dataset) ? '✓' : '✗'

    // The row count is the length of the first top-level collection. Only trust
    // `.length` when that value is really an array: a leading string field would
    // otherwise report its character count as the number of rows. Anything that
    // is not an array (single object, scalar, empty data) counts as one record.
    const firstValue: unknown = Object.values(dataset.data)[0]
    const rowCount = Array.isArray(firstValue) ? firstValue.length : 1

    const structure = dataset.metadata.structureClass
    const eligibility = `${dataset.metadata.tabularEligibility}%`

    return `| ${dataset.description} | ${rowCount} | ${structure} | ${csvSupport} | ${eligibility} |`
  }).join('\n')

  return `
#### Dataset Catalog

| Dataset | Rows | Structure | CSV Support | Eligibility |
| ------- | ---- | --------- | ----------- | ----------- |
${rows}

**Structure classes:**
- **uniform**: All objects have identical fields with primitive values
- **semi-uniform**: Mix of uniform and non-uniform structures
- **nested**: Objects with nested structures (nested objects or arrays)
- **deep**: Highly nested with minimal tabular eligibility

**CSV Support:** ✓ (supported), ✗ (not supported - would require lossy flattening)

**Eligibility:** Percentage of arrays that qualify for TOON's tabular format (uniform objects with primitive values)
`.trim()
}
|
||||
|
||||
/**
|
||||
* Generate efficiency ranking report
|
||||
*/
|
||||
@@ -168,10 +207,12 @@ function generateDetailedAccuracyReport(
|
||||
const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)
|
||||
|
||||
// Calculate dataset sizes
|
||||
const tabularSize = datasets.find(d => d.name === 'tabular')?.data.employees?.length || 0
|
||||
const nestedSize = datasets.find(d => d.name === 'nested')?.data.orders?.length || 0
|
||||
const analyticsSize = datasets.find(d => d.name === 'analytics')?.data.metrics?.length || 0
|
||||
const githubSize = datasets.find(d => d.name === 'github')?.data.repositories?.length || 0
|
||||
const tabularSize = ACCURACY_DATASETS.find(d => d.name === 'tabular')?.data.employees?.length || 0
|
||||
const nestedSize = ACCURACY_DATASETS.find(d => d.name === 'nested')?.data.orders?.length || 0
|
||||
const analyticsSize = ACCURACY_DATASETS.find(d => d.name === 'analytics')?.data.metrics?.length || 0
|
||||
const githubSize = ACCURACY_DATASETS.find(d => d.name === 'github')?.data.repositories?.length || 0
|
||||
const eventLogsSize = ACCURACY_DATASETS.find(d => d.name === 'event-logs')?.data.logs?.length || 0
|
||||
const nestedConfigSize = 1 // Single config object
|
||||
|
||||
// Calculate number of formats and evaluations
|
||||
const formatCount = formatResults.length
|
||||
@@ -208,12 +249,14 @@ This benchmark tests **LLM comprehension and data retrieval accuracy** across di
|
||||
|
||||
#### Datasets Tested
|
||||
|
||||
Four datasets designed to test different structural patterns (all contain arrays of uniform objects, TOON's optimal format):
|
||||
Six datasets designed to test different structural patterns:
|
||||
|
||||
1. **Tabular** (${tabularSize} employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
|
||||
2. **Nested** (${nestedSize} e-commerce orders): Complex structures with nested customer objects and item arrays.
|
||||
3. **Analytics** (${analyticsSize} days of metrics): Time-series data with dates and numeric values.
|
||||
4. **GitHub** (${githubSize} repositories): Real-world data from top GitHub repos by stars.
|
||||
5. **Event Logs** (${eventLogsSize} logs): Semi-uniform data with ~50% flat logs and ~50% with nested error objects.
|
||||
6. **Nested Config** (${nestedConfigSize} configuration): Deeply nested configuration with minimal tabular eligibility.
|
||||
|
||||
#### Question Types
|
||||
|
||||
@@ -314,7 +357,7 @@ function generateDatasetBreakdown(
|
||||
questions: Question[],
|
||||
tokenCounts: Record<string, number>,
|
||||
): string {
|
||||
return datasets.map((dataset) => {
|
||||
return ACCURACY_DATASETS.map((dataset) => {
|
||||
const datasetResults = formatResults.map((fr) => {
|
||||
const datasetFormatResults = results.filter(r => r.questionId.includes(dataset.name) || questions.find(q => q.id === r.questionId)?.dataset === dataset.name)
|
||||
if (datasetFormatResults.length === 0)
|
||||
|
||||
Reference in New Issue
Block a user