test(benchmark): overhaul generation

2026-01-29 23:34:10 +08:00 · 2025-11-06 14:45:44 +01:00
parent 9863875706
commit bc711ccecf
19 changed files with 2254 additions and 997 deletions
--- a/benchmarks/src/report.ts
+++ b/benchmarks/src/report.ts
@@ -1,7 +1,8 @@
-import type { EfficiencyRanking, EvaluationResult, FormatResult, Question } from './types'
+import type { Dataset, EfficiencyRanking, EvaluationResult, FormatResult, Question } from './types'
 import { FORMATTER_DISPLAY_NAMES } from './constants'
-import { datasets } from './datasets'
+import { ACCURACY_DATASETS } from './datasets'
 import { models } from './evaluate'
+import { supportsCSV } from './formatters'
 import { generateQuestions } from './questions'
 import { createProgressBar, tokenize } from './utils'

@@ -16,7 +17,11 @@ export function calculateTokenCounts(
  const tokenCounts: Record<string, number> = {}

  for (const [formatName, formatter] of Object.entries(formatters)) {
-    for (const dataset of datasets) {
+    for (const dataset of ACCURACY_DATASETS) {
+      // Skip CSV for datasets that don't support it
+      if (formatName === 'csv' && !supportsCSV(dataset))
+        continue
+
      const formatted = formatter(dataset.data)
      const key = `${formatName}-${dataset.name}`
      tokenCounts[key] = tokenize(formatted)
@@ -42,9 +47,9 @@ export function calculateFormatResults(
    const accuracy = correctCount / totalCount

    // Calculate average tokens across all datasets for this format
-    const avgTokens = Object.entries(tokenCounts)
+    const formatTokenEntries = Object.entries(tokenCounts)
      .filter(([key]) => key.startsWith(`${formatName}-`))
-      .reduce((sum, [, tokens]) => sum + tokens, 0) / datasets.length
+    const avgTokens = formatTokenEntries.reduce((sum, [, tokens]) => sum + tokens, 0) / formatTokenEntries.length

    const averageLatency = formatResults.reduce((sum, r) => sum + r.latencyMs, 0) / totalCount

@@ -75,6 +80,8 @@ export function generateAccuracyReport(
  return `
 Benchmarks test LLM comprehension across different input formats using ${totalQuestions} data retrieval questions on ${modelNames.length} ${modelNames.length === 1 ? 'model' : 'models'}.

+${generateDatasetCatalog(ACCURACY_DATASETS)}
+
 #### Efficiency Ranking (Accuracy per 1K Tokens)

 ${generateEfficiencyRankingReport(formatResults)}
@@ -85,6 +92,38 @@ ${generateDetailedAccuracyReport(formatResults, results, questions, tokenCounts)
 `.trimStart()
 }

+/**
+ * Generate the "Dataset Catalog" Markdown section of the accuracy report.
+ *
+ * Emits one table row per dataset (description, row count, structure class,
+ * CSV support, tabular eligibility), followed by a legend that explains the
+ * structure classes and the CSV / eligibility columns.
+ *
+ * @param datasets - Datasets to list, in the order the rows should appear.
+ * @returns The Markdown section with leading and trailing whitespace trimmed.
+ */
+function generateDatasetCatalog(datasets: Dataset[]): string {
+  const rows = datasets.map((dataset) => {
+    const csvSupport = supportsCSV(dataset) ? '✓' : '✗'
+    // The first top-level value is treated as the dataset's primary collection.
+    // Only an array has a meaningful row count; anything else (a single config
+    // object, or a string whose `.length` would be its character count) is
+    // reported as one row.
+    const primary: unknown = Object.values(dataset.data)[0]
+    const rowCount = Array.isArray(primary) ? primary.length : 1
+    const structure = dataset.metadata.structureClass
+    const eligibility = `${dataset.metadata.tabularEligibility}%`
+
+    return `| ${dataset.description} | ${rowCount} | ${structure} | ${csvSupport} | ${eligibility} |`
+  }).join('\n')
+
+  return `
+#### Dataset Catalog
+
+| Dataset | Rows | Structure | CSV Support | Eligibility |
+| ------- | ---- | --------- | ----------- | ----------- |
+${rows}
+
+**Structure classes:**
+- **uniform**: All objects have identical fields with primitive values
+- **semi-uniform**: Mix of uniform and non-uniform structures
+- **nested**: Objects with nested structures (nested objects or arrays)
+- **deep**: Highly nested with minimal tabular eligibility
+
+**CSV Support:** ✓ (supported), ✗ (not supported - would require lossy flattening)
+
+**Eligibility:** Percentage of arrays that qualify for TOON's tabular format (uniform objects with primitive values)
+`.trim()
+}
+
 /**
 * Generate efficiency ranking report
 */
@@ -168,10 +207,12 @@ function generateDetailedAccuracyReport(
  const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)

  // Calculate dataset sizes
-  const tabularSize = datasets.find(d => d.name === 'tabular')?.data.employees?.length || 0
-  const nestedSize = datasets.find(d => d.name === 'nested')?.data.orders?.length || 0
-  const analyticsSize = datasets.find(d => d.name === 'analytics')?.data.metrics?.length || 0
-  const githubSize = datasets.find(d => d.name === 'github')?.data.repositories?.length || 0
+  const tabularSize = ACCURACY_DATASETS.find(d => d.name === 'tabular')?.data.employees?.length || 0
+  const nestedSize = ACCURACY_DATASETS.find(d => d.name === 'nested')?.data.orders?.length || 0
+  const analyticsSize = ACCURACY_DATASETS.find(d => d.name === 'analytics')?.data.metrics?.length || 0
+  const githubSize = ACCURACY_DATASETS.find(d => d.name === 'github')?.data.repositories?.length || 0
+  const eventLogsSize = ACCURACY_DATASETS.find(d => d.name === 'event-logs')?.data.logs?.length || 0
+  const nestedConfigSize = 1 // Single config object

  // Calculate number of formats and evaluations
  const formatCount = formatResults.length
@@ -208,12 +249,14 @@ This benchmark tests **LLM comprehension and data retrieval accuracy** across di

 #### Datasets Tested

-Four datasets designed to test different structural patterns (all contain arrays of uniform objects, TOON's optimal format):
+Six datasets designed to test different structural patterns:

 1. **Tabular** (${tabularSize} employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
 2. **Nested** (${nestedSize} e-commerce orders): Complex structures with nested customer objects and item arrays.
 3. **Analytics** (${analyticsSize} days of metrics): Time-series data with dates and numeric values.
 4. **GitHub** (${githubSize} repositories): Real-world data from top GitHub repos by stars.
+5. **Event Logs** (${eventLogsSize} logs): Semi-uniform data with ~50% flat logs and ~50% with nested error objects.
+6. **Nested Config** (${nestedConfigSize} configuration): Deeply nested configuration with minimal tabular eligibility.

 #### Question Types

@@ -314,7 +357,7 @@ function generateDatasetBreakdown(
  questions: Question[],
  tokenCounts: Record<string, number>,
 ): string {
-  return datasets.map((dataset) => {
+  return ACCURACY_DATASETS.map((dataset) => {
    const datasetResults = formatResults.map((fr) => {
      const datasetFormatResults = results.filter(r => r.questionId.includes(dataset.name) || questions.find(q => q.id === r.questionId)?.dataset === dataset.name)
      if (datasetFormatResults.length === 0)