docs: overhaul retrieval accuracy benchmark

This commit is contained in:
Johann Schopplich
2025-10-28 20:22:43 +01:00
parent efbe4ded88
commit 67c0df8cb0
22 changed files with 1553 additions and 27288 deletions

File diff suppressed because it is too large

View File

@@ -5,7 +5,7 @@
"scripts": {
"benchmark:token-efficiency": "tsx scripts/token-efficiency-benchmark.ts",
"benchmark:accuracy": "tsx --env-file=.env scripts/accuracy-benchmark.ts",
"fetch-github-data": "tsx scripts/fetch-github-data.ts",
"fetch:github-repos": "tsx scripts/fetch-github-repos.ts",
"test": "vitest"
},
"devDependencies": {
@@ -14,14 +14,16 @@
"@ai-sdk/openai": "^2.0.53",
"@ai-sdk/provider": "^2.0.0",
"@antfu/eslint-config": "^6.1.0",
"@clack/prompts": "^0.11.0",
"@faker-js/faker": "^10.1.0",
"ai": "^5.0.80",
"consola": "^3.4.2",
"csv-stringify": "^6.6.0",
"fast-xml-parser": "^5.3.0",
"gpt-tokenizer": "^3.2.0",
"ofetch": "^1.4.1",
"p-map": "^7.0.3",
"p-queue": "^9.0.0",
"unstorage": "^1.17.1",
"yaml": "^2.8.1"
}
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@@ -1,91 +0,0 @@
{
"formatResults": [
{
"format": "toon",
"accuracy": 0.8658280922431866,
"totalTokens": 4678,
"averageLatency": 5321,
"correctCount": 413,
"totalCount": 477
},
{
"format": "xml",
"accuracy": 0.8616352201257862,
"totalTokens": 9944,
"averageLatency": 6035,
"correctCount": 411,
"totalCount": 477
},
{
"format": "csv",
"accuracy": 0.8469601677148847,
"totalTokens": 4745,
"averageLatency": 6551,
"correctCount": 404,
"totalCount": 477
},
{
"format": "json",
"accuracy": 0.8322851153039832,
"totalTokens": 8713,
"averageLatency": 7981,
"correctCount": 397,
"totalCount": 477
},
{
"format": "yaml",
"accuracy": 0.8259958071278826,
"totalTokens": 7091,
"averageLatency": 5561,
"correctCount": 394,
"totalCount": 477
}
],
"questions": 159,
"models": [
"gpt-5-nano",
"claude-haiku-4-5",
"gemini-2.5-flash"
],
"datasets": [
{
"name": "tabular",
"description": "Uniform employee records (TOON optimal format)"
},
{
"name": "nested",
"description": "E-commerce orders with nested structures"
},
{
"name": "analytics",
"description": "Time-series analytics data"
},
{
"name": "github",
"description": "Top 100 GitHub repositories"
}
],
"tokenCounts": {
"json-tabular": 6347,
"json-nested": 9694,
"json-analytics": 3665,
"json-github": 15145,
"toon-tabular": 2483,
"toon-nested": 5967,
"toon-analytics": 1515,
"toon-github": 8745,
"csv-tabular": 2337,
"csv-nested": 6735,
"csv-analytics": 1393,
"csv-github": 8513,
"xml-tabular": 7314,
"xml-nested": 10992,
"xml-analytics": 4376,
"xml-github": 17095,
"yaml-tabular": 4969,
"yaml-nested": 7328,
"yaml-analytics": 2938,
"yaml-github": 13129
},
"timestamp": "2025-10-28T07:39:09.360Z"
}

View File

@@ -1,31 +1,31 @@
### Retrieval Accuracy
Accuracy across **3 LLMs** on **159 data retrieval questions**:
Accuracy across **3 LLMs** on **154 data retrieval questions**:
```
gpt-5-nano
toon ███████████████████ 99.4% (158/159)
yaml ██████████████████░ 95.0% (151/159)
csv ██████████████████░░ 92.5% (147/159)
json ██████████████████░░ 92.5% (147/159)
xml █████████████████░░ 91.2% (145/159)
claude-haiku-4-5
toon ███████████████░░░░░ 75.5% (120/159)
xml ███████████████░░░░░ 75.5% (120/159)
csv ███████████████░░░░░ 75.5% (120/159)
json ███████████████░░░░░ 75.5% (120/159)
yaml ███████████████░░░░░ 74.2% (118/159)
toon ███████████████████ 96.1% (148/154)
csv ██████████████████░ 90.3% (139/154)
yaml ██████████████████░░ 89.0% (137/154)
json ██████████████████░░ 87.7% (135/154)
xml █████████████████░░ 83.8% (129/154)
gemini-2.5-flash
xml ██████████████████░░ 91.8% (146/159)
csv █████████████████░░ 86.2% (137/159)
toon █████████████████░░░ 84.9% (135/159)
json ████████████████░░░░ 81.8% (130/159)
yaml ███████████████░░░░ 78.6% (125/159)
xml ██████████████████░░ 90.3% (139/154)
csv █████████████████░░ 89.0% (137/154)
toon █████████████████░░░ 87.0% (134/154)
json ████████████████░░░░ 79.2% (122/154)
yaml ███████████████░░░░ 76.0% (117/154)
claude-haiku-4-5-20251001
json ██████████░░░░░░░░░░ 48.7% (75/154)
toon ██████████░░░░░░░░░░ 48.1% (74/154)
xml █████████░░░░░░░░░░░ 47.4% (73/154)
yaml █████████░░░░░░░░░░░ 47.4% (73/154)
csv █████████░░░░░░░░░░░ 45.5% (70/154)
```
**Advantage:** TOON achieves **86.6% accuracy** (vs JSON's 83.2%) while using **46.3% fewer tokens**.
**Advantage:** TOON achieves **77.1% accuracy** (vs JSON's 71.9%) while using **46.3% fewer tokens**.
<details>
<summary><strong>Performance by dataset and model</strong></summary>
@@ -36,41 +36,41 @@ gemini-2.5-flash
| Format | Accuracy | Tokens | Correct/Total |
| ------ | -------- | ------ | ------------- |
| `toon` | 87.4% | 2.483 | 152/174 |
| `csv` | 82.8% | 2.337 | 144/174 |
| `yaml` | 83.9% | 4.969 | 146/174 |
| `json` | 83.9% | 6.347 | 146/174 |
| `xml` | 88.5% | 7.314 | 154/174 |
| `csv` | 74.7% | 2,337 | 112/150 |
| `toon` | 76.7% | 2,483 | 115/150 |
| `yaml` | 70.7% | 4,969 | 106/150 |
| `xml` | 77.3% | 7,314 | 116/150 |
| `json` | 69.3% | 6,347 | 104/150 |
##### E-commerce orders with nested structures
| Format | Accuracy | Tokens | Correct/Total |
| ------ | -------- | ------ | ------------- |
| `toon` | 90.9% | 5.967 | 120/132 |
| `csv` | 93.9% | 6.735 | 124/132 |
| `yaml` | 87.1% | 7.328 | 115/132 |
| `json` | 87.9% | 9.694 | 116/132 |
| `xml` | 93.2% | 10.992 | 123/132 |
| `toon` | 80.0% | 5,967 | 96/120 |
| `csv` | 75.8% | 6,735 | 91/120 |
| `yaml` | 74.2% | 7,328 | 89/120 |
| `json` | 79.2% | 9,694 | 95/120 |
| `xml` | 78.3% | 10,992 | 94/120 |
##### Time-series analytics data
| Format | Accuracy | Tokens | Correct/Total |
| ------ | -------- | ------ | ------------- |
| `csv` | 89.7% | 1.393 | 78/87 |
| `toon` | 88.5% | 1.515 | 77/87 |
| `yaml` | 83.9% | 2.938 | 73/87 |
| `json` | 88.5% | 3.665 | 77/87 |
| `xml` | 85.1% | 4.376 | 74/87 |
| `csv` | 75.5% | 1,393 | 77/102 |
| `toon` | 76.5% | 1,515 | 78/102 |
| `yaml` | 74.5% | 2,938 | 76/102 |
| `json` | 76.5% | 3,665 | 78/102 |
| `xml` | 74.5% | 4,376 | 76/102 |
##### Top 100 GitHub repositories
| Format | Accuracy | Tokens | Correct/Total |
| ------ | -------- | ------ | ------------- |
| `toon` | 76.2% | 8.745 | 64/84 |
| `csv` | 69.0% | 8.513 | 58/84 |
| `yaml` | 71.4% | 13.129 | 60/84 |
| `json` | 69.0% | 15.145 | 58/84 |
| `xml` | 71.4% | 17.095 | 60/84 |
| `toon` | 74.4% | 8,745 | 67/90 |
| `csv` | 73.3% | 8,513 | 66/90 |
| `yaml` | 62.2% | 13,129 | 56/90 |
| `json` | 61.1% | 15,145 | 55/90 |
| `xml` | 61.1% | 17,095 | 55/90 |
#### Performance by Model
@@ -78,31 +78,31 @@ gemini-2.5-flash
| Format | Accuracy | Correct/Total |
| ------ | -------- | ------------- |
| `toon` | 99.4% | 158/159 |
| `yaml` | 95.0% | 151/159 |
| `csv` | 92.5% | 147/159 |
| `json` | 92.5% | 147/159 |
| `xml` | 91.2% | 145/159 |
##### claude-haiku-4-5
| Format | Accuracy | Correct/Total |
| ------ | -------- | ------------- |
| `toon` | 75.5% | 120/159 |
| `xml` | 75.5% | 120/159 |
| `csv` | 75.5% | 120/159 |
| `json` | 75.5% | 120/159 |
| `yaml` | 74.2% | 118/159 |
| `toon` | 96.1% | 148/154 |
| `csv` | 90.3% | 139/154 |
| `yaml` | 89.0% | 137/154 |
| `json` | 87.7% | 135/154 |
| `xml` | 83.8% | 129/154 |
##### gemini-2.5-flash
| Format | Accuracy | Correct/Total |
| ------ | -------- | ------------- |
| `xml` | 91.8% | 146/159 |
| `csv` | 86.2% | 137/159 |
| `toon` | 84.9% | 135/159 |
| `json` | 81.8% | 130/159 |
| `yaml` | 78.6% | 125/159 |
| `xml` | 90.3% | 139/154 |
| `csv` | 89.0% | 137/154 |
| `toon` | 87.0% | 134/154 |
| `json` | 79.2% | 122/154 |
| `yaml` | 76.0% | 117/154 |
##### claude-haiku-4-5-20251001
| Format | Accuracy | Correct/Total |
| ------ | -------- | ------------- |
| `json` | 48.7% | 75/154 |
| `toon` | 48.1% | 74/154 |
| `xml` | 47.4% | 73/154 |
| `yaml` | 47.4% | 73/154 |
| `csv` | 45.5% | 70/154 |
</details>
@@ -124,31 +124,33 @@ Four datasets designed to test different structural patterns:
#### Question Types
159 questions are generated dynamically across three categories:
154 questions are generated dynamically across three categories:
- **Field retrieval (50%)**: Direct value lookups
- **Field retrieval (40%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
- Example: "What is Alice's salary?" → `75000`
- Example: "How many items are in order ORD-0042?" → `3`
- Example: "What is the customer name for order ORD-0042?" → `John Doe`
- **Aggregation (25%)**: Counting and summation tasks
- **Aggregation (32%)**: Dataset-level totals and averages plus single-condition filters (counts, sums, min/max comparisons)
- Example: "How many employees work in Engineering?" → `17`
- Example: "What is the total revenue across all orders?" → `45123.50`
- Example: "How many employees have salary > 80000?" → `23`
- **Filtering (25%)**: Conditional queries
- **Filtering (28%)**: Multi-condition queries requiring compound logic (AND constraints across fields)
- Example: "How many employees in Sales have salary > 80000?" → `5`
- Example: "How many orders have total > 400?" → `12`
- Example: "How many active employees have more than 10 years of experience?" → `8`
#### Evaluation Process
1. **Format conversion:** Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML).
1. **Format conversion:** Each dataset is converted to all 5 formats (TOON, CSV, XML, JSON, YAML).
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
4. **Validate with LLM-as-judge**: `gpt-5-nano` validates if the answer is semantically correct (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`).
3. **Validate with LLM-as-judge**: `gpt-5-nano` validates if the answer is semantically correct (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`).
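Step 3 maps onto a single `generateText` call with a YES/NO prompt, using the same AI SDK setup that appears elsewhere in this commit. A rough sketch (the exact judge prompt wording is an assumption):
```ts
import { openai } from '@ai-sdk/openai'
import { generateText } from 'ai'

async function isSemanticallyCorrect(actual: string, expected: string): Promise<boolean> {
  const { text } = await generateText({
    model: openai('gpt-5-nano'),
    // Hypothetical judge prompt; the benchmark's exact wording may differ
    prompt: `Expected answer: ${expected}\nActual answer: ${actual}\nAre these semantically equivalent? Respond with only "YES" or "NO".`,
  })
  return text.trim().toUpperCase() === 'YES'
}
```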
#### Models & Configuration
- **Models tested**: `gpt-5-nano`, `claude-haiku-4-5`, `gemini-2.5-flash`
- **Models tested**: `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `gpt-5-nano`
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
- **Temperature**: 0 (for non-reasoning models)
- **Total evaluations**: 159 questions × 5 formats × 3 models = 2,385 LLM calls
- **Total evaluations**: 154 questions × 5 formats × 3 models = 2,310 LLM calls
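The token figures are produced by encoding each formatted payload and counting the resulting tokens. A minimal sketch of that step, assuming `gpt-tokenizer`'s per-encoding subpath import:
```ts
import { encode } from 'gpt-tokenizer/encoding/o200k_base'

// Count how many o200k_base (GPT-5) tokens a formatted payload costs
const payload = '{"employees":[{"id":1,"name":"Alice","salary":75000}]}'
console.log(encode(payload).length)
```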
</details>

View File

@@ -39,11 +39,11 @@ Total ████████████░░░░░
"repo": "freeCodeCamp/freeCodeCamp",
"description": "freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,…",
"createdAt": "2014-12-24T17:49:19Z",
"updatedAt": "2025-10-27T07:40:58Z",
"pushedAt": "2025-10-26T11:31:08Z",
"stars": 430828,
"watchers": 8582,
"forks": 42136,
"updatedAt": "2025-10-28T11:58:08Z",
"pushedAt": "2025-10-28T10:17:16Z",
"stars": 430886,
"watchers": 8583,
"forks": 42146,
"defaultBranch": "main"
},
{
@@ -52,11 +52,11 @@ Total ████████████░░░░░
"repo": "codecrafters-io/build-your-own-x",
"description": "Master programming by recreating your favorite technologies from scratch.",
"createdAt": "2018-05-09T12:03:18Z",
"updatedAt": "2025-10-27T07:43:25Z",
"updatedAt": "2025-10-28T12:37:11Z",
"pushedAt": "2025-10-10T18:45:01Z",
"stars": 430102,
"watchers": 6322,
"forks": 40388,
"stars": 430877,
"watchers": 6332,
"forks": 40453,
"defaultBranch": "master"
},
{
@@ -65,11 +65,11 @@ Total ████████████░░░░░
"repo": "sindresorhus/awesome",
"description": "😎 Awesome lists about all kinds of interesting topics",
"createdAt": "2014-07-11T13:42:37Z",
"updatedAt": "2025-10-27T07:44:27Z",
"pushedAt": "2025-10-23T17:26:53Z",
"stars": 409760,
"watchers": 8016,
"forks": 32015,
"updatedAt": "2025-10-28T12:40:21Z",
"pushedAt": "2025-10-27T17:57:31Z",
"stars": 410052,
"watchers": 8017,
"forks": 32029,
"defaultBranch": "main"
}
]
@@ -80,9 +80,9 @@ Total ████████████░░░░░
```
repositories[3]{id,name,repo,description,createdAt,updatedAt,pushedAt,stars,watchers,forks,defaultBranch}:
28457823,freeCodeCamp,freeCodeCamp/freeCodeCamp,"freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,…","2014-12-24T17:49:19Z","2025-10-27T07:40:58Z","2025-10-26T11:31:08Z",430828,8582,42136,main
132750724,build-your-own-x,codecrafters-io/build-your-own-x,Master programming by recreating your favorite technologies from scratch.,"2018-05-09T12:03:18Z","2025-10-27T07:43:25Z","2025-10-10T18:45:01Z",430102,6322,40388,master
21737465,awesome,sindresorhus/awesome,😎 Awesome lists about all kinds of interesting topics,"2014-07-11T13:42:37Z","2025-10-27T07:44:27Z","2025-10-23T17:26:53Z",409760,8016,32015,main
28457823,freeCodeCamp,freeCodeCamp/freeCodeCamp,"freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,…","2014-12-24T17:49:19Z","2025-10-28T11:58:08Z","2025-10-28T10:17:16Z",430886,8583,42146,main
132750724,build-your-own-x,codecrafters-io/build-your-own-x,Master programming by recreating your favorite technologies from scratch.,"2018-05-09T12:03:18Z","2025-10-28T12:37:11Z","2025-10-10T18:45:01Z",430877,6332,40453,master
21737465,awesome,sindresorhus/awesome,😎 Awesome lists about all kinds of interesting topics,"2014-07-11T13:42:37Z","2025-10-28T12:40:21Z","2025-10-27T17:57:31Z",410052,8017,32029,main
```
---

View File

@@ -1,51 +1,53 @@
/**
* LLM Retrieval Accuracy Benchmark
*
* Main entry point that orchestrates the full benchmark:
* 1. Generate questions from datasets
* 2. Format data in all formats (TOON, JSON, YAML, CSV, XML)
* 3. Evaluate each question with each format using LLMs
* 4. Generate reports
*/
import type { EvaluationResult, Question } from '../src/types'
import * as fsp from 'node:fs/promises'
import type { Question } from '../src/types'
import * as path from 'node:path'
import { consola } from 'consola'
import pMap from 'p-map'
import { BENCHMARKS_DIR, DEFAULT_CONCURRENCY, DRY_RUN, DRY_RUN_LIMITS, ROOT_DIR } from '../src/constants'
import process from 'node:process'
import * as prompts from '@clack/prompts'
import PQueue from 'p-queue'
import { DEFAULT_CONCURRENCY, DRY_RUN, DRY_RUN_LIMITS, MODEL_RPM_LIMITS, ROOT_DIR } from '../src/constants'
import { datasets } from '../src/datasets'
import { evaluateQuestion, models } from '../src/evaluate'
import { formatters } from '../src/formatters'
import { generateQuestions } from '../src/questions'
import { calculateFormatResults, calculateTokenCounts, saveResults } from '../src/report'
import { getAllModelResults, hasModelResults, saveModelResults } from '../src/storage'
consola.start('Retrieval Accuracy Benchmark for TOON')
prompts.intro('Retrieval Accuracy Benchmark')
// Check if results already exist
const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy')
const rawResultsPath = path.join(resultsDir, 'raw-results.json')
const summaryPath = path.join(resultsDir, 'summary.json')
// Prompt user to select which models to benchmark
const modelChoices = models.map(({ modelId }) => ({
value: modelId,
label: modelId,
}))
let existingResults: EvaluationResult[] | undefined
let existingTokenCounts: Record<string, number> | undefined
const selectedModels = await prompts.multiselect({
message: 'Select models to benchmark (Space to select, Enter to confirm)',
options: modelChoices,
required: true,
})
try {
const [rawData, summaryData] = await Promise.all([
fsp.readFile(rawResultsPath, 'utf-8'),
fsp.readFile(summaryPath, 'utf-8'),
])
existingResults = JSON.parse(rawData)
const summary = JSON.parse(summaryData)
existingTokenCounts = summary.tokenCounts
consola.info('Found existing results, regenerating report only')
if (prompts.isCancel(selectedModels)) {
prompts.cancel('Benchmark cancelled')
process.exit(0)
}
catch {
// Results don't exist, will run full evaluation
const activeModels = models.filter(m => selectedModels.includes(m.modelId))
prompts.log.info(`Selected ${activeModels.length} model(s): ${activeModels.map(m => m.modelId).join(', ')}`)
// Check which models already have results
const existingModelResults: Record<string, boolean> = {}
for (const model of activeModels) {
const existingResult = await hasModelResults(model.modelId)
if (existingResult)
existingModelResults[model.modelId] = existingResult
}
if (Object.keys(existingModelResults).length > 0) {
prompts.log.info(`Found existing results for ${Object.values(existingModelResults).length} model(s)`)
}
if (DRY_RUN) {
consola.info('Limiting questions and models for dry run')
prompts.log.info('Limiting questions and models for dry run')
}
let questions = generateQuestions()
@@ -55,79 +57,98 @@ if (DRY_RUN && DRY_RUN_LIMITS.maxQuestions) {
questions = questions.slice(0, DRY_RUN_LIMITS.maxQuestions)
}
// Filter models for dry run
const activeModels = DRY_RUN && DRY_RUN_LIMITS.allowedModels.length > 0
? Object.fromEntries(
Object.entries(models).filter(([name]) => DRY_RUN_LIMITS.allowedModels.includes(name)),
)
: models
prompts.log.info(`Evaluating ${questions.length} questions`)
prompts.log.info(`Testing ${Object.keys(formatters).length} formats`)
let results: EvaluationResult[]
let tokenCounts: Record<string, number>
// Evaluate each model separately and save results incrementally
for (const model of activeModels) {
const modelId = model.modelId
if (existingResults && existingTokenCounts) {
// Reuse existing results
results = existingResults
tokenCounts = existingTokenCounts
}
else {
// Run full evaluation
consola.info(`Evaluating ${questions.length} questions`)
consola.info(`Testing ${Object.keys(formatters).length} formats`)
consola.info(`Using ${Object.keys(activeModels).length} models: ${Object.keys(activeModels).join(', ')}`)
// Skip if results already exist
if (existingModelResults[modelId]) {
prompts.log.info(`Skipping ${modelId} (results already exist)`)
continue
}
// Calculate token counts for all format+dataset combinations
tokenCounts = calculateTokenCounts(formatters)
// Generate evaluation tasks
const tasks: { question: Question, formatName: string, modelName: string }[] = []
prompts.log.step(`Running benchmark for ${modelId}`)
// Generate evaluation tasks for this model
const tasks: { question: Question, formatName: string }[] = []
for (const question of questions) {
for (const [formatName] of Object.entries(formatters)) {
for (const [modelName] of Object.entries(activeModels)) {
tasks.push({ question, formatName, modelName })
}
tasks.push({ question, formatName })
}
}
const total = tasks.length
consola.start(`Running ${total} evaluations with concurrency: ${DEFAULT_CONCURRENCY}`)
const rpmLimit = MODEL_RPM_LIMITS[modelId]
const queue = new PQueue({
concurrency: DEFAULT_CONCURRENCY,
intervalCap: rpmLimit,
interval: rpmLimit ? 60_000 : undefined,
})
results = await pMap(
tasks,
async (task, index) => {
const evalSpinner = prompts.spinner()
evalSpinner.start(`Running ${total} evaluations (concurrency: ${DEFAULT_CONCURRENCY}, RPM limit: ${rpmLimit ?? 'unlimited'})`)
let completed = 0
// Queue all tasks
const modelResultPromises = tasks.map(task =>
queue.add(async () => {
// Format data on-demand
const dataset = datasets.find(d => d.name === task.question.dataset)!
const formatter = formatters[task.formatName]!
const formattedData = formatter(dataset.data)
const model = activeModels[task.modelName as keyof typeof activeModels]!
const result = await evaluateQuestion({
question: task.question,
formatName: task.formatName,
formattedData,
model,
modelName: task.modelName,
})
// Progress update after task completes
if ((index + 1) % 10 === 0 || (index + 1) === total) {
const percent = (((index + 1) / total) * 100).toFixed(1)
consola.start(`Progress: ${index + 1}/${total} (${percent}%)`)
completed++
if (completed % 10 === 0 || completed === total) {
const percent = ((completed / total) * 100).toFixed(1)
evalSpinner.message(`Progress: ${completed}/${total} (${percent}%)`)
}
return result
},
{ concurrency: DEFAULT_CONCURRENCY },
}),
)
consola.success('Evaluation complete!')
// Wait for all tasks to complete
const modelResults = await Promise.all(modelResultPromises)
evalSpinner.stop(`Evaluation complete for ${modelId}`)
// Save results immediately for this model
await saveModelResults(modelId, modelResults)
prompts.log.success(`Saved results for ${modelId}`)
}
// Generate/regenerate markdown report
consola.start('Generating report and saving results…')
const formatResults = calculateFormatResults(results, tokenCounts)
await saveResults(results, formatResults, questions, tokenCounts)
// Generate/regenerate markdown report from all available model results
const reportSpinner = prompts.spinner()
reportSpinner.start('Generating report from all model results')
consola.info(`Results saved to: \`${path.relative(ROOT_DIR, resultsDir)}\``)
consola.success(existingResults ? 'Markdown report regenerated!' : 'Evaluation complete!')
// Load all available model results (including any that were skipped)
const allModelResults = await getAllModelResults()
const allResults = Object.values(allModelResults).flat()
if (allResults.length === 0) {
prompts.log.warn('No results available to generate report')
process.exit(0)
}
// Calculate token counts freshly (deterministic, no need to persist)
const tokenCounts = calculateTokenCounts(formatters)
// Calculate format statistics and save report
const formatResults = calculateFormatResults(allResults, tokenCounts)
const resultsDir = await saveResults(allResults, formatResults, questions, tokenCounts)
const reportPath = path.join(resultsDir, 'retrieval-accuracy.md')
prompts.log.info(`Report saved to: \`${path.relative(ROOT_DIR, reportPath)}\``)
reportSpinner.stop('Report generation complete!')

View File

@@ -1,18 +1,20 @@
import * as path from 'node:path'
import process from 'node:process'
import { consola } from 'consola'
import * as prompts from '@clack/prompts'
import { ofetch } from 'ofetch'
import pMap from 'p-map'
import { BENCHMARKS_DIR } from '../src/constants'
import { ensureDir, saveJsonFile } from '../src/utils'
prompts.intro('GitHub Repositories Fetcher')
try {
// Fetch top 100 repos from GitHub
const repoList = await searchTop100Repos()
const repos = await fetchRepoDetails(repoList)
if (repos.length === 0) {
consola.error('No repositories fetched. Exiting.')
prompts.log.error('No repositories fetched. Exiting.')
process.exit(1)
}
@@ -21,15 +23,16 @@ try {
await saveRepos(repos)
consola.success('Done!')
prompts.log.success('Done!')
}
catch (error) {
consola.error(error)
prompts.log.error(String(error))
process.exit(1)
}
async function searchTop100Repos(): Promise<string[]> {
consola.start('Fetching top 100 starred repositories from GitHub API…')
const s = prompts.spinner()
s.start('Fetching top 100 starred repositories')
const response = await ofetch<{ items: { full_name: string }[] }>(
'https://api.github.com/search/repositories',
@@ -47,23 +50,26 @@ async function searchTop100Repos(): Promise<string[]> {
},
)
s.stop('Fetched top 100 repositories')
return response.items.map(item => item.full_name)
}
async function fetchRepoDetails(repoList: string[]): Promise<Record<string, any>[]> {
consola.start(`Fetching ${repoList.length} GitHub repositories…`)
const s = prompts.spinner()
s.start(`Fetching ${repoList.length} GitHub repositories`)
const repos = await pMap(
repoList,
async (repoPath, index) => {
consola.info(`[${index + 1}/${repoList.length}] Fetching ${repoPath}`)
s.message(`[${index + 1}/${repoList.length}] Fetching ${repoPath}`)
const { repo } = await ofetch(`https://ungh.cc/repos/${repoPath}`)
return repo
},
{ concurrency: 5 },
)
consola.success(`Successfully fetched ${repos.length}/${repoList.length} repositories`)
s.stop(`Successfully fetched ${repos.length}/${repoList.length} repositories`)
return repos
}
@@ -76,5 +82,5 @@ async function saveRepos(repos: Record<string, any>[]): Promise<void> {
await saveJsonFile(outputFile, repos)
const relativePath = path.relative(BENCHMARKS_DIR, outputFile)
consola.info(`Saved to \`${relativePath}\``)
prompts.log.info(`Result saved to \`${relativePath}\``)
}

View File

@@ -1,6 +1,6 @@
import * as fsp from 'node:fs/promises'
import * as path from 'node:path'
import { consola } from 'consola'
import * as prompts from '@clack/prompts'
import { encode } from '../../src/index'
import githubRepos from '../data/github-repos.json' with { type: 'json' }
import { BENCHMARKS_DIR, ROOT_DIR } from '../src/constants'
@@ -24,8 +24,6 @@ interface BenchmarkResult {
showDetailed: boolean
}
const outputFilePath = path.join(BENCHMARKS_DIR, 'results', 'token-efficiency.md')
const BENCHMARK_EXAMPLES = [
{
name: 'GitHub Repositories',
@@ -50,6 +48,8 @@ const BENCHMARK_EXAMPLES = [
},
] as const
prompts.intro('Token Efficiency Benchmark')
// Calculate total savings
let totalJsonTokens = 0
let totalToonTokens = 0
@@ -204,9 +204,12 @@ ${detailedExamples}
</details>
`.trimStart()
console.log(`${barChartSection}\n`)
prompts.log.message(`${barChartSection}\n`)
await ensureDir(path.join(BENCHMARKS_DIR, 'results'))
const resultsDir = path.join(BENCHMARKS_DIR, 'results')
await ensureDir(resultsDir)
const outputFilePath = path.join(resultsDir, 'token-efficiency.md')
await fsp.writeFile(outputFilePath, markdown, 'utf-8')
consola.success(`Benchmark written to \`${path.relative(ROOT_DIR, outputFilePath)}\``)
prompts.log.success(`Result saved to \`${path.relative(ROOT_DIR, outputFilePath)}\``)

View File

@@ -5,9 +5,22 @@ export const ROOT_DIR: string = url.fileURLToPath(new URL('../../', import.meta.
export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.meta.url))
/**
* Default concurrency for parallel evaluations
* Model-specific RPM (requests per minute) limits to handle API quotas
*
* @remarks
* Set to `undefined` for models without specific limits
*/
export const DEFAULT_CONCURRENCY = 20
/// keep-sorted
export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
'claude-haiku-4-5-20251001': 50,
'gemini-2.5-flash': 25,
'gpt-5-nano': undefined,
}
/**
* Default concurrency for parallel evaluations to prevent bursting
*/
export const DEFAULT_CONCURRENCY = 10
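Taken together, these two constants are meant to drive a rate-limited queue, as wired up in the benchmark script earlier in this commit. A hedged sketch of a variant that only passes interval options when a model actually has an RPM cap:
```ts
import PQueue from 'p-queue'

// Hypothetical helper: throttle requests for one model based on its RPM cap,
// using the MODEL_RPM_LIMITS and DEFAULT_CONCURRENCY constants defined above
function createModelQueue(modelId: string) {
  const rpmLimit = MODEL_RPM_LIMITS[modelId]
  return new PQueue({
    concurrency: DEFAULT_CONCURRENCY,
    // Spread interval options only for capped models; uncapped models rely on concurrency alone
    ...(rpmLimit ? { intervalCap: rpmLimit, interval: 60_000 } : {}),
  })
}
```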
/**
* Progress bar configuration
@@ -28,13 +41,83 @@ export const PROGRESS_BAR = {
export const DRY_RUN: boolean = process.env.DRY_RUN === 'true'
/**
* Limits applied when DRY_RUN is enabled
* Limits applied during dry run mode
*/
export const DRY_RUN_LIMITS = {
/** Maximum number of questions to evaluate */
maxQuestions: 10,
/** Maximum number of formats to test */
maxFormats: undefined as number | undefined,
/** Models to use in dry run */
allowedModels: [] as string[],
}
/**
* Threshold values for filtering and aggregation questions
*/
export const QUESTION_THRESHOLDS = {
tabular: {
salaryRanges: [60000, 80000, 100000, 120000],
experienceYears: [5, 10, 15, 20],
departmentSalaryThreshold: 80000,
departmentExperienceThreshold: 10,
},
nested: {
highValueOrders: [200, 400, 600],
statusValueThreshold: 300,
itemCountThreshold: 3,
totalThresholdsForItems: [300, 500],
},
analytics: {
views: [5000, 7000],
conversions: [10, 30],
viewsForFiltering: [6000, 7000],
conversionsForFiltering: 15,
revenueThresholds: [500, 1000, 1500, 2000, 2500],
viewsThresholdForRevenue: 6000,
clicksForFiltering: [250, 400],
conversionsForClickFiltering: 15,
revenueForBounceRate: [1000, 1500],
bounceRateThreshold: 0.5,
},
github: {
stars: [100000, 150000, 200000],
forks: [20000, 35000, 50000],
watchers: [5000, 8000],
starForkCombinations: [
{ stars: 75000, forks: 15000 },
{ stars: 100000, forks: 20000 },
{ stars: 150000, forks: 30000 },
{ stars: 200000, forks: 45000 },
],
starWatcherCombinations: [
{ stars: 100000, watchers: 7000 },
{ stars: 150000, watchers: 9000 },
],
},
} as const
/**
* Question generation configuration
*/
export const QUESTION_LIMITS = {
tabular: {
fieldRetrieval: 20,
aggregationDepartments: 6,
filteringMultiConditionDepartments: 6,
filteringExperience: 4,
filteringDepartmentExp: 3,
filteringDepartmentActive: 3,
},
nested: {
fieldRetrievalOrders: 8,
fieldRetrievalCustomers: 10,
aggregationStatuses: 5,
filteringStatusAndValue: 5,
filteringStatusAndItems: 3,
},
analytics: {
fieldRetrievalDates: 13,
},
github: {
fieldRetrievalRepos: 11,
aggregationBranches: 2,
filteringStarsAndForks: 8,
},
} as const

View File

@@ -1,12 +1,3 @@
/**
* Datasets for TOON benchmarks
*
* These datasets are designed to test TOON's strengths and weaknesses:
* - Tabular: Uniform records (TOON optimal)
* - Nested: Complex structures with nested objects
* - Analytics: Time-series data
*/
import type { Dataset } from './types'
import { faker } from '@faker-js/faker'
import githubRepos from '../data/github-repos.json' with { type: 'json' }
@@ -128,7 +119,7 @@ const tabularDataset: Dataset = {
description: 'Uniform employee records (TOON optimal format)',
data: {
employees: Array.from({ length: 100 }, (_, i): Employee => {
const yearsExp = faker.number.int({ min: 1, max: 20 })
const yearsExp = faker.number.int({ min: 1, max: 25 })
return {
id: i + 1,
name: faker.person.fullName(),

View File

@@ -1,28 +1,19 @@
/**
* LLM evaluation logic for TOON benchmarks
*
* Handles:
* - Model configuration
* - Question evaluation with LLMs
* - Answer validation using LLM-as-judge
*/
import type { LanguageModelV2 } from '@ai-sdk/provider'
import type { EvaluationResult, Question } from './types'
import { anthropic } from '@ai-sdk/anthropic'
import { google } from '@ai-sdk/google'
import { openai } from '@ai-sdk/openai'
import * as prompts from '@clack/prompts'
import { generateText } from 'ai'
import { consola } from 'consola'
/**
* Models used for evaluation
*/
export const models: Record<string, LanguageModelV2> = {
'gpt-5-nano': openai('gpt-5-nano'),
'claude-haiku-4-5': anthropic('claude-haiku-4-5-20251001'),
'gemini-2.5-flash': google('gemini-2.5-flash'),
}
export const models: LanguageModelV2[] = [
openai('gpt-5-nano'),
google('gemini-2.5-flash'),
anthropic('claude-haiku-4-5-20251001'),
]
/**
* Evaluate a single question with a specific format and model
@@ -33,14 +24,12 @@ export async function evaluateQuestion(
formatName,
formattedData,
model,
modelName,
}:
{
question: Question
formatName: string
formattedData: string
model: LanguageModelV2
modelName: string
},
): Promise<EvaluationResult> {
const prompt = `
@@ -59,10 +48,11 @@ Provide only the direct answer, without any additional explanation or formatting
const { text, usage } = await generateText({
model,
prompt,
temperature: !model.modelId.startsWith('gpt-') ? 0 : undefined,
temperature: !model.modelId.startsWith('gpt-5') ? 0 : undefined,
})
const latencyMs = performance.now() - startTime
const isCorrect = await validateAnswer({
actual: text.trim(),
expected: question.groundTruth,
@@ -72,7 +62,7 @@ Provide only the direct answer, without any additional explanation or formatting
return {
questionId: question.id,
format: formatName,
model: modelName,
model: model.modelId,
expected: question.groundTruth,
actual: text.trim(),
isCorrect,
@@ -115,14 +105,14 @@ Respond with only "YES" or "NO".
try {
const { text } = await generateText({
model: models['gpt-5-nano']!,
model: models.find(m => m.modelId === 'gpt-5-nano')!,
prompt,
})
return text.trim().toUpperCase() === 'YES'
}
catch (error) {
consola.error('Validation error:', error)
prompts.log.error(`Validation error: ${error}`)
// Fallback to simple string comparison
return actual.toLowerCase().trim() === expected.toLowerCase().trim()
}

View File

@@ -1,20 +1,3 @@
/**
* Format converters for TOON benchmarks
*
* Converts data to different formats for comparison:
* - JSON
* - TOON
* - CSV
* - XML
* - YAML
*
* ## Semantic Equivalence
*
* All formatters attempt to preserve semantic equivalence with the source data,
* meaning the converted data should represent the same information. However,
* CSV has inherent limitations with nested structures (see `toCSV` docs).
*/
import { stringify as stringifyCSV } from 'csv-stringify/sync'
import { XMLBuilder } from 'fast-xml-parser'
import { stringify as stringifyYAML } from 'yaml'
@@ -23,7 +6,10 @@ import { encode as encodeToon } from '../../src/index'
/**
* Format converters registry
*
* Each formatter takes unknown data and returns a string representation
* @remarks
* All formatters attempt to preserve semantic equivalence with the source data,
* meaning the converted data should represent the same information. However,
* CSV has inherent limitations with nested structures (see `toCSV` docs).
*/
export const formatters: Record<string, (data: unknown) => string> = {
json: data => JSON.stringify(data, undefined, 2),
@@ -37,11 +23,13 @@ export const formatters: Record<string, (data: unknown) => string> = {
* Convert data to CSV format
*
* @remarks
* **Limitations**: CSV is designed for flat tabular data only. This formatter:
* - Only handles top-level objects with arrays of flat objects
* - Cannot properly represent deeply nested structures (nested arrays/objects within rows)
* - Loses nested structure information during conversion
* - May produce misleading results for datasets with complex nesting (e.g., e-commerce orders with nested items)
* Limitations: CSV is designed for flat tabular data only.
*
* This formatter:
* - Only handles top-level objects with arrays of flat objects
* - Cannot properly represent deeply nested structures (nested arrays/objects within rows)
* - Loses nested structure information during conversion
* - May produce misleading results for datasets with complex nesting (e.g., e-commerce orders with nested items)
*
* For datasets with nested structures, CSV comparisons may not be fair or representative
* of how CSV would typically be used in practice.
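To make the limitation concrete, here is a minimal, hypothetical flattening of one nested order: the `items` array only survives as an opaque serialized string in a single cell, which is why CSV results on the nested dataset should be read with care.
```ts
import { stringify } from 'csv-stringify/sync'

const orders = [
  { orderId: 'ORD-0001', total: 129.99, items: [{ sku: 'A1', qty: 2 }, { sku: 'B2', qty: 1 }] },
]

// Pre-serialize the nested array so each order fits on one flat CSV row
const rows = orders.map(o => ({
  orderId: o.orderId,
  total: o.total,
  items: JSON.stringify(o.items),
}))

console.log(stringify(rows, { header: true }))
// → orderId,total,items
//   ORD-0001,129.99,"[{""sku"":""A1"",""qty"":2},…"
```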

View File

@@ -1,24 +1,18 @@
/**
* Question generation for TOON benchmarks
*
* Generates ~160 questions across different types:
* - Field retrieval (50%): "What is X's Y?"
* - Aggregation (25%): "How many X have Y?"
* - Filtering (25%): "List/count X where Y"
*
* Questions are generated dynamically based on actual data values
*
* TODO: Balance question distribution across datasets to ensure fair representation.
* Current distribution:
* - Tabular: 70 questions (43%)
* - Nested: 50 questions (31%)
* - Analytics: 40 questions (25%)
* - GitHub: 40 questions (25%)
* Generates ~150-160 questions across different question types and datasets:
* - Field Retrieval: Direct field access with no computation
* Examples: "What is X's salary?", "What is the status of order Y?"
* - Aggregation: Counts, sums, averages, min/max operations (including single-condition filters)
* Examples: "How many X?", "What is the total/average?", "How many X > threshold?"
* - Filtering: Multi-condition queries requiring complex logical operations
* Examples: "How many X WHERE condition1 AND condition2?"
*/
import type { AnalyticsMetric, Employee, Order, Repository } from './datasets'
import type { Question } from './types'
import { consola } from 'consola'
import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from './constants'
import { datasets } from './datasets'
/**
@@ -34,19 +28,15 @@ export function generateQuestions(): Question[] {
const analytics = (datasets.find(d => d.name === 'analytics')?.data.metrics as AnalyticsMetric[]) ?? []
const github = (datasets.find(d => d.name === 'github')?.data.repositories as Repository[]) ?? []
// ========================================
// TABULAR DATASET QUESTIONS (70 questions)
// ========================================
if (tabular.length > 0) {
// Field retrieval: specific employees (40 questions)
for (let i = 0; i < Math.min(40, tabular.length); i++) {
// Field retrieval: specific employees
for (let i = 0; i < Math.min(QUESTION_LIMITS.tabular.fieldRetrieval, tabular.length); i++) {
const emp = tabular[i * 2] || tabular[i]
if (!emp)
continue
// Alternate between different field types
if (i % 3 === 0) {
// Rotate through all field types
if (i % 5 === 0) {
questions.push({
id: `q${idCounter++}`,
prompt: `What is the salary of ${emp.name}?`,
@@ -55,7 +45,7 @@ export function generateQuestions(): Question[] {
dataset: 'tabular',
})
}
else if (i % 3 === 1) {
else if (i % 5 === 1) {
questions.push({
id: `q${idCounter++}`,
prompt: `What department does ${emp.name} work in?`,
@@ -64,7 +54,7 @@ export function generateQuestions(): Question[] {
dataset: 'tabular',
})
}
else {
else if (i % 5 === 2) {
questions.push({
id: `q${idCounter++}`,
prompt: `What is the email address of ${emp.name}?`,
@@ -73,11 +63,29 @@ export function generateQuestions(): Question[] {
dataset: 'tabular',
})
}
else if (i % 5 === 3) {
questions.push({
id: `q${idCounter++}`,
prompt: `How many years of experience does ${emp.name} have?`,
groundTruth: String(emp.yearsExperience),
type: 'field-retrieval',
dataset: 'tabular',
})
}
else {
questions.push({
id: `q${idCounter++}`,
prompt: `Is ${emp.name} an active employee?`,
groundTruth: emp.active ? 'yes' : 'no',
type: 'field-retrieval',
dataset: 'tabular',
})
}
}
// Aggregation: count by department
const departments = [...new Set(tabular.map(e => e.department))]
for (const dept of departments.slice(0, 6)) {
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.aggregationDepartments)) {
const count = tabular.filter(e => e.department === dept).length
questions.push({
id: `q${idCounter++}`,
@@ -88,9 +96,8 @@ export function generateQuestions(): Question[] {
})
}
// Aggregation: salary ranges (4 questions)
const salaryThresholds = [60000, 80000, 100000, 120000]
for (const threshold of salaryThresholds) {
// Aggregation: salary ranges (single-condition filters)
for (const threshold of QUESTION_THRESHOLDS.tabular.salaryRanges) {
const count = tabular.filter(e => e.salary > threshold).length
questions.push({
id: `q${idCounter++}`,
@@ -101,39 +108,57 @@ export function generateQuestions(): Question[] {
})
}
// Filtering: active status
// Aggregation: totals and averages
const totalEmployees = tabular.length
const avgSalary = Math.round(tabular.reduce((sum, e) => sum + e.salary, 0) / totalEmployees)
const activeCount = tabular.filter(e => e.active).length
const inactiveCount = tabular.filter(e => !e.active).length
questions.push(
{
id: `q${idCounter++}`,
prompt: 'How many employees are in the dataset?',
groundTruth: String(totalEmployees),
type: 'aggregation',
dataset: 'tabular',
},
{
id: `q${idCounter++}`,
prompt: 'What is the average salary across all employees?',
groundTruth: String(avgSalary),
type: 'aggregation',
dataset: 'tabular',
},
{
id: `q${idCounter++}`,
prompt: 'How many employees are active?',
groundTruth: String(activeCount),
type: 'filtering',
type: 'aggregation',
dataset: 'tabular',
},
{
id: `q${idCounter++}`,
prompt: 'How many employees are inactive?',
groundTruth: String(inactiveCount),
type: 'filtering',
type: 'aggregation',
dataset: 'tabular',
},
)
// Complex filtering: multi-condition (8 questions)
for (const dept of departments.slice(0, 4)) {
const count = tabular.filter(e => e.department === dept && e.salary > 80000).length
// Filtering: count by department with salary filter (multi-condition)
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringMultiConditionDepartments)) {
const count = tabular.filter(e => e.department === dept && e.salary > QUESTION_THRESHOLDS.tabular.departmentSalaryThreshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many employees in ${dept} have a salary greater than 80000?`,
prompt: `How many employees in ${dept} have a salary greater than ${QUESTION_THRESHOLDS.tabular.departmentSalaryThreshold}?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'tabular',
})
}
for (const exp of [5, 10]) {
// Filtering: active employees by experience (multi-condition)
for (const exp of QUESTION_THRESHOLDS.tabular.experienceYears.slice(0, QUESTION_LIMITS.tabular.filteringExperience)) {
const count = tabular.filter(e => e.yearsExperience > exp && e.active).length
questions.push({
id: `q${idCounter++}`,
@@ -143,15 +168,35 @@ export function generateQuestions(): Question[] {
dataset: 'tabular',
})
}
// Filtering: department by experience (multi-condition)
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentExp)) {
const count = tabular.filter(e => e.department === dept && e.yearsExperience > QUESTION_THRESHOLDS.tabular.departmentExperienceThreshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many employees in ${dept} have more than ${QUESTION_THRESHOLDS.tabular.departmentExperienceThreshold} years of experience?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'tabular',
})
}
// Filtering: department by active status (multi-condition)
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentActive)) {
const count = tabular.filter(e => e.department === dept && e.active).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many active employees work in ${dept}?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'tabular',
})
}
}
// ========================================
// NESTED DATASET QUESTIONS (50 questions)
// ========================================
if (nested.length > 0) {
// Field retrieval: order totals (20 questions)
for (let i = 0; i < Math.min(20, nested.length); i++) {
// Field retrieval: order totals and statuses
for (let i = 0; i < Math.min(QUESTION_LIMITS.nested.fieldRetrievalOrders, nested.length); i++) {
const order = nested[i * 2] || nested[i]
if (!order)
continue
@@ -159,7 +204,7 @@ export function generateQuestions(): Question[] {
if (i % 2 === 0) {
questions.push({
id: `q${idCounter++}`,
prompt: `What is the total amount for order ${order.orderId}?`,
prompt: `What is the total for order ${order.orderId}?`,
groundTruth: String(order.total),
type: 'field-retrieval',
dataset: 'nested',
@@ -176,51 +221,143 @@ export function generateQuestions(): Question[] {
}
}
// Field retrieval: customer info (15 questions)
for (let i = 0; i < Math.min(15, nested.length); i++) {
const order = nested[i * 3] || nested[i]
// Field retrieval: customer info and order dates (expanded)
for (let i = 0; i < Math.min(QUESTION_LIMITS.nested.fieldRetrievalCustomers, nested.length); i++) {
const order = nested[i * 2 + 1] || nested[i]
if (!order)
continue
questions.push({
id: `q${idCounter++}`,
prompt: `What is the customer name for order ${order.orderId}?`,
groundTruth: order.customer.name,
type: 'field-retrieval',
dataset: 'nested',
})
if (i % 4 === 0) {
questions.push({
id: `q${idCounter++}`,
prompt: `What is the customer name for order ${order.orderId}?`,
groundTruth: order.customer.name,
type: 'field-retrieval',
dataset: 'nested',
})
}
else if (i % 4 === 1) {
questions.push({
id: `q${idCounter++}`,
prompt: `What is the customer email for order ${order.orderId}?`,
groundTruth: order.customer.email,
type: 'field-retrieval',
dataset: 'nested',
})
}
else if (i % 4 === 2) {
questions.push({
id: `q${idCounter++}`,
prompt: `What is the order date for order ${order.orderId}?`,
groundTruth: order.orderDate || '',
type: 'field-retrieval',
dataset: 'nested',
})
}
else {
questions.push({
id: `q${idCounter++}`,
prompt: `How many items are in order ${order.orderId}?`,
groundTruth: String(order.items.length),
type: 'field-retrieval',
dataset: 'nested',
})
}
}
// Aggregation: count by status
// Aggregation: totals and averages
const totalRevenue = nested.reduce((sum, o) => sum + o.total, 0)
const avgOrderValue = totalRevenue / nested.length
const totalOrders = nested.length
const maxOrderValue = Math.max(...nested.map(o => o.total))
// Count by status
const statuses = [...new Set(nested.map(o => o.status))]
for (const status of statuses) {
for (const status of statuses.slice(0, QUESTION_LIMITS.nested.aggregationStatuses)) {
const count = nested.filter(o => o.status === status).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many orders have status "${status}"?`,
groundTruth: String(count),
type: 'aggregation',
dataset: 'nested',
})
}
questions.push(
{
id: `q${idCounter++}`,
prompt: 'What is the total revenue across all orders?',
groundTruth: String(totalRevenue.toFixed(2)),
type: 'aggregation',
dataset: 'nested',
},
{
id: `q${idCounter++}`,
prompt: 'What is the average order value?',
groundTruth: String(avgOrderValue.toFixed(2)),
type: 'aggregation',
dataset: 'nested',
},
{
id: `q${idCounter++}`,
prompt: 'How many orders are in the dataset?',
groundTruth: String(totalOrders),
type: 'aggregation',
dataset: 'nested',
},
{
id: `q${idCounter++}`,
prompt: 'What is the highest order total?',
groundTruth: String(maxOrderValue.toFixed(2)),
type: 'aggregation',
dataset: 'nested',
},
)
// Aggregation: high-value orders (single-condition filter)
for (const threshold of QUESTION_THRESHOLDS.nested.highValueOrders) {
const count = nested.filter(o => o.total > threshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many orders have a total greater than ${threshold}?`,
groundTruth: String(count),
type: 'aggregation',
dataset: 'nested',
})
}
// Filtering: multi-condition queries (status AND value)
const orderStatuses = [...new Set(nested.map(o => o.status))]
for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndValue)) {
const count = nested.filter(o => o.status === status && o.total > QUESTION_THRESHOLDS.nested.statusValueThreshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many orders have status "${status}" and total greater than ${QUESTION_THRESHOLDS.nested.statusValueThreshold}?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'nested',
})
}
// Aggregation: total revenue
const totalRevenue = nested.reduce((sum, o) => sum + o.total, 0)
questions.push({
id: `q${idCounter++}`,
prompt: 'What is the total revenue across all orders?',
groundTruth: String(totalRevenue.toFixed(2)),
type: 'aggregation',
dataset: 'nested',
})
// Filtering: high-value orders (3 questions)
const highValueThresholds = [200, 400, 600]
for (const threshold of highValueThresholds) {
const count = nested.filter(o => o.total > threshold).length
// Filtering: status AND items count (multi-condition)
for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndItems)) {
const count = nested.filter(o => o.status === status && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many orders have a total greater than ${threshold}?`,
prompt: `How many orders have status "${status}" and at least ${QUESTION_THRESHOLDS.nested.itemCountThreshold} items?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'nested',
})
}
// Filtering: total AND items count (multi-condition)
for (const threshold of QUESTION_THRESHOLDS.nested.totalThresholdsForItems) {
const count = nested.filter(o => o.total > threshold && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many orders have a total greater than ${threshold} and at least ${QUESTION_THRESHOLDS.nested.itemCountThreshold} items?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'nested',
@@ -228,18 +365,14 @@ export function generateQuestions(): Question[] {
}
}
// ========================================
// ANALYTICS DATASET QUESTIONS (40 questions)
// ========================================
if (analytics.length > 0) {
// Field retrieval: specific dates (20 questions)
for (let i = 0; i < Math.min(20, analytics.length); i++) {
// Field retrieval: specific dates (expanded with all metrics)
for (let i = 0; i < Math.min(QUESTION_LIMITS.analytics.fieldRetrievalDates, analytics.length); i++) {
const metric = analytics[i * 3] || analytics[i]
if (!metric)
continue
if (i % 2 === 0) {
if (i % 5 === 0) {
questions.push({
id: `q${idCounter++}`,
prompt: `How many views were recorded on ${metric.date}?`,
@@ -248,7 +381,7 @@ export function generateQuestions(): Question[] {
dataset: 'analytics',
})
}
else {
else if (i % 5 === 1) {
questions.push({
id: `q${idCounter++}`,
prompt: `What was the revenue on ${metric.date}?`,
@@ -257,12 +390,42 @@ export function generateQuestions(): Question[] {
dataset: 'analytics',
})
}
else if (i % 5 === 2) {
questions.push({
id: `q${idCounter++}`,
prompt: `What was the conversion count on ${metric.date}?`,
groundTruth: String(metric.conversions),
type: 'field-retrieval',
dataset: 'analytics',
})
}
else if (i % 5 === 3) {
questions.push({
id: `q${idCounter++}`,
prompt: `How many clicks were recorded on ${metric.date}?`,
groundTruth: String(metric.clicks),
type: 'field-retrieval',
dataset: 'analytics',
})
}
else {
questions.push({
id: `q${idCounter++}`,
prompt: `What was the bounce rate on ${metric.date}?`,
groundTruth: String(metric.bounceRate),
type: 'field-retrieval',
dataset: 'analytics',
})
}
}
// Aggregation: totals (4 questions)
// Aggregation: totals and averages
const totalViews = analytics.reduce((sum, m) => sum + m.views, 0)
const totalRevenue = analytics.reduce((sum, m) => sum + m.revenue, 0)
const totalConversions = analytics.reduce((sum, m) => sum + m.conversions, 0)
const avgViews = Math.round(totalViews / analytics.length)
const avgRevenue = totalRevenue / analytics.length
const avgConversions = Math.round(totalConversions / analytics.length)
questions.push(
{
@@ -286,27 +449,97 @@ export function generateQuestions(): Question[] {
type: 'aggregation',
dataset: 'analytics',
},
{
id: `q${idCounter++}`,
prompt: 'What is the average number of views per day?',
groundTruth: String(avgViews),
type: 'aggregation',
dataset: 'analytics',
},
{
id: `q${idCounter++}`,
prompt: 'What is the average revenue per day?',
groundTruth: String(avgRevenue.toFixed(2)),
type: 'aggregation',
dataset: 'analytics',
},
{
id: `q${idCounter++}`,
prompt: 'What is the average number of conversions per day?',
groundTruth: String(avgConversions),
type: 'aggregation',
dataset: 'analytics',
},
{
id: `q${idCounter++}`,
prompt: 'How many days are included in the analytics data?',
groundTruth: String(analytics.length),
type: 'aggregation',
dataset: 'analytics',
},
{
id: `q${idCounter++}`,
prompt: 'What is the highest number of views recorded in a single day?',
groundTruth: String(Math.max(...analytics.map(m => m.views))),
type: 'aggregation',
dataset: 'analytics',
},
)
// Filtering: high-performing days (10 questions)
const viewThresholds = [5000, 6000, 7000]
for (const threshold of viewThresholds) {
// Aggregation: high-performing days (single-condition filters)
for (const threshold of QUESTION_THRESHOLDS.analytics.views) {
const count = analytics.filter(m => m.views > threshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many days had more than ${threshold} views?`,
groundTruth: String(count),
type: 'aggregation',
dataset: 'analytics',
})
}
// Filtering: multi-condition queries (views AND conversions)
for (const viewThreshold of QUESTION_THRESHOLDS.analytics.viewsForFiltering) {
const count = analytics.filter(m => m.views > viewThreshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForFiltering).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many days had more than ${viewThreshold} views and more than ${QUESTION_THRESHOLDS.analytics.conversionsForFiltering} conversions?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'analytics',
})
}
const conversionThresholds = [10, 20, 30]
for (const threshold of conversionThresholds) {
const count = analytics.filter(m => m.conversions > threshold).length
// Filtering: views AND revenue (expanded)
for (const revenueThreshold of QUESTION_THRESHOLDS.analytics.revenueThresholds.slice(0, 5)) {
const count = analytics.filter(m => m.views > QUESTION_THRESHOLDS.analytics.viewsThresholdForRevenue && m.revenue > revenueThreshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many days had more than ${threshold} conversions?`,
prompt: `How many days had more than ${QUESTION_THRESHOLDS.analytics.viewsThresholdForRevenue} views and revenue greater than ${revenueThreshold}?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'analytics',
})
}
// Filtering: clicks AND conversions (multi-condition)
for (const clickThreshold of QUESTION_THRESHOLDS.analytics.clicksForFiltering) {
const count = analytics.filter(m => m.clicks > clickThreshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForClickFiltering).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many days had more than ${clickThreshold} clicks and more than ${QUESTION_THRESHOLDS.analytics.conversionsForClickFiltering} conversions?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'analytics',
})
}
// Filtering: revenue AND bounce rate (multi-condition)
for (const revenueThreshold of QUESTION_THRESHOLDS.analytics.revenueForBounceRate) {
const count = analytics.filter(m => m.revenue > revenueThreshold && m.bounceRate < QUESTION_THRESHOLDS.analytics.bounceRateThreshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many days had revenue greater than ${revenueThreshold} and bounce rate less than ${QUESTION_THRESHOLDS.analytics.bounceRateThreshold}?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'analytics',
@@ -314,79 +547,159 @@ export function generateQuestions(): Question[] {
}
}
// ========================================
// GITHUB DATASET QUESTIONS (40 questions)
// ========================================
if (github.length > 0) {
// Field retrieval: specific repos (20 questions)
for (let i = 0; i < Math.min(20, github.length); i++) {
const repo = github[i * 10] || github[i]
// Helper to extract owner from repo field
const getOwner = (repoFullName: string) => repoFullName.split('/')[0]!
// Field retrieval: specific repos (diverse fields)
for (let i = 0; i < Math.min(QUESTION_LIMITS.github.fieldRetrievalRepos, github.length); i++) {
const repo = github[i * 7]
if (!repo)
continue
if (i % 2 === 0) {
if (i % 5 === 0) {
questions.push({
id: `q${idCounter++}`,
prompt: `How many stars does ${repo.owner}/${repo.name} have?`,
prompt: `How many stars does ${repo.repo} have?`,
groundTruth: String(repo.stars),
type: 'field-retrieval',
dataset: 'github',
})
}
else if (i % 5 === 1) {
questions.push({
id: `q${idCounter++}`,
prompt: `How many forks does ${repo.repo} have?`,
groundTruth: String(repo.forks),
type: 'field-retrieval',
dataset: 'github',
})
}
else if (i % 5 === 2) {
questions.push({
id: `q${idCounter++}`,
prompt: `Who is the owner of ${repo.repo}?`,
groundTruth: getOwner(repo.repo),
type: 'field-retrieval',
dataset: 'github',
})
}
else if (i % 5 === 3) {
questions.push({
id: `q${idCounter++}`,
prompt: `What is the default branch of ${repo.repo}?`,
groundTruth: repo.defaultBranch,
type: 'field-retrieval',
dataset: 'github',
})
}
else {
questions.push({
id: `q${idCounter++}`,
prompt: `How many forks does ${repo.owner}/${repo.name} have?`,
groundTruth: String(repo.forks),
prompt: `How many watchers does ${repo.repo} have?`,
groundTruth: String(repo.watchers),
type: 'field-retrieval',
dataset: 'github',
})
}
}
// Aggregation: count by owner (5 questions)
const owners = [...new Set(github.map(r => r.owner))]
for (const owner of owners.slice(0, 5)) {
const count = github.filter(r => r.owner === owner).length
// Aggregation: popular repositories
const totalStars = github.reduce((sum, r) => sum + r.stars, 0)
const totalRepos = github.length
const avgStars = Math.round(totalStars / totalRepos)
questions.push(
{
id: `q${idCounter++}`,
prompt: 'What is the total number of stars across all repositories?',
groundTruth: String(totalStars),
type: 'aggregation',
dataset: 'github',
},
{
id: `q${idCounter++}`,
prompt: 'How many repositories are in the dataset?',
groundTruth: String(totalRepos),
type: 'aggregation',
dataset: 'github',
},
{
id: `q${idCounter++}`,
prompt: 'What is the average number of stars per repository?',
groundTruth: String(avgStars),
type: 'aggregation',
dataset: 'github',
},
)
// Aggregation: star thresholds (single-condition filters)
for (const threshold of QUESTION_THRESHOLDS.github.stars) {
const count = github.filter(r => r.stars > threshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many repositories does ${owner} have in the dataset?`,
prompt: `How many repositories have more than ${threshold} stars?`,
groundTruth: String(count),
type: 'aggregation',
dataset: 'github',
})
}
// Aggregation: total stars
const totalStars = github.reduce((sum, r) => sum + r.stars, 0)
questions.push({
id: `q${idCounter++}`,
prompt: 'What is the total number of stars across all repositories?',
groundTruth: String(totalStars),
type: 'aggregation',
dataset: 'github',
})
// Filtering: popular repos (8 questions)
const starThresholds = [10000, 50000, 100000]
for (const threshold of starThresholds) {
const count = github.filter(r => r.stars > threshold).length
// Aggregation: fork thresholds (single-condition filters)
for (const threshold of QUESTION_THRESHOLDS.github.forks) {
const count = github.filter(r => r.forks > threshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many repositories have more than ${threshold} stars?`,
prompt: `How many repositories have more than ${threshold} forks?`,
groundTruth: String(count),
type: 'aggregation',
dataset: 'github',
})
}
// Aggregation: watcher thresholds (single-condition filters)
for (const threshold of QUESTION_THRESHOLDS.github.watchers) {
const count = github.filter(r => r.watchers > threshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many repositories have more than ${threshold} watchers?`,
groundTruth: String(count),
type: 'aggregation',
dataset: 'github',
})
}
// Aggregation: default branch counts
const branches = [...new Set(github.map(r => r.defaultBranch))]
for (const branch of branches.slice(0, QUESTION_LIMITS.github.aggregationBranches)) {
const count = github.filter(r => r.defaultBranch === branch).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many repositories use "${branch}" as their default branch?`,
groundTruth: String(count),
type: 'aggregation',
dataset: 'github',
})
}
// Filtering: multi-condition queries (stars AND forks)
for (const combo of QUESTION_THRESHOLDS.github.starForkCombinations.slice(0, QUESTION_LIMITS.github.filteringStarsAndForks)) {
const count = github.filter(r => r.stars > combo.stars && r.forks > combo.forks).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many repositories have more than ${combo.stars} stars and more than ${combo.forks} forks?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'github',
})
}
const forkThresholds = [1000, 5000, 10000]
for (const threshold of forkThresholds) {
const count = github.filter(r => r.forks > threshold).length
// Filtering: stars AND watchers (multi-condition)
for (const combo of QUESTION_THRESHOLDS.github.starWatcherCombinations) {
const count = github.filter(r => r.stars > combo.stars && r.watchers > combo.watchers).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many repositories have more than ${threshold} forks?`,
prompt: `How many repositories have more than ${combo.stars} stars and more than ${combo.watchers} watchers?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'github',
@@ -394,14 +707,5 @@ export function generateQuestions(): Question[] {
}
}
consola.info(`Question breakdown:`)
consola.box(`
Tabular: ${questions.filter(q => q.dataset === 'tabular').length}
Nested: ${questions.filter(q => q.dataset === 'nested').length}
Analytics: ${questions.filter(q => q.dataset === 'analytics').length}
GitHub: ${questions.filter(q => q.dataset === 'github').length}
Total: ${questions.length}
`.trim())
return questions
}
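
For reference, the `Question` shape that the generator above pushes can be inferred from the fields used here; the sketch below is only what the interface in `./types` presumably looks like, not its actual definition:

```ts
// Sketch only: inferred from the objects pushed in generateQuestions().
// The real definition lives in benchmarks/src/types.ts.
interface Question {
  id: string // sequential identifier, e.g. `q42`
  prompt: string // natural-language question sent to the model
  groundTruth: string // expected answer, stringified for comparison
  type: 'field-retrieval' | 'aggregation' | 'filtering'
  dataset: 'tabular' | 'nested' | 'analytics' | 'github'
}
```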

View File

@@ -1,21 +1,9 @@
/**
* Report generation for TOON benchmarks
*
* Handles:
* - Statistical analysis
* - Markdown report generation with visual elements
* - Per-dataset breakdowns
* - Cost analysis
* - Result file saving
*/
import type { EvaluationResult, FormatResult, Question } from './types'
import * as fsp from 'node:fs/promises'
import * as path from 'node:path'
import { BENCHMARKS_DIR } from './constants'
import { datasets } from './datasets'
import { models } from './evaluate'
import { createProgressBar, ensureDir, saveJsonFile, tokenize } from './utils'
import { createProgressBar, ensureDir, tokenize } from './utils'
/**
* Calculate per-format statistics from evaluation results
@@ -63,8 +51,8 @@ export function generateMarkdownReport(
const json = formatResults.find(r => r.format === 'json')
// Build model-by-model breakdown with ASCII bars
const modelCount = Object.keys(models).length
const modelNames = Object.keys(models)
const modelNames = [...new Set(results.map(r => r.model))].reverse()
const modelCount = modelNames.length
const modelBreakdown = modelNames.map((modelName, i) => {
const modelResults = formatResults.map((fr) => {
@@ -136,7 +124,7 @@ export function generateMarkdownReport(
})
const tableRows = datasetResults.slice(0, 6).map(result =>
`| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.tokens.toLocaleString()} | ${result.correctCount}/${result.totalCount} |`,
`| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.tokens.toLocaleString('en-US')} | ${result.correctCount}/${result.totalCount} |`,
).join('\n')
return `
@@ -180,6 +168,27 @@ ${tableRows}
// Calculate total unique questions
const totalQuestions = [...new Set(results.map(r => r.questionId))].length
// Calculate question type distribution
const fieldRetrievalCount = questions.filter(q => q.type === 'field-retrieval').length
const aggregationCount = questions.filter(q => q.type === 'aggregation').length
const filteringCount = questions.filter(q => q.type === 'filtering').length
const fieldRetrievalPercent = ((fieldRetrievalCount / totalQuestions) * 100).toFixed(0)
const aggregationPercent = ((aggregationCount / totalQuestions) * 100).toFixed(0)
const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)
// Calculate dataset sizes
const tabularSize = datasets.find(d => d.name === 'tabular')?.data.employees?.length || 0
const nestedSize = datasets.find(d => d.name === 'nested')?.data.orders?.length || 0
const analyticsSize = datasets.find(d => d.name === 'analytics')?.data.metrics?.length || 0
const githubSize = datasets.find(d => d.name === 'github')?.data.repositories?.length || 0
// Calculate number of formats and models
const formatCount = formatResults.length
const modelsUsed = [...new Set(results.map(r => r.model))]
const modelsListStr = modelsUsed.map(m => `\`${m}\``).join(', ')
const totalEvaluations = totalQuestions * formatCount * modelsUsed.length
return `
### Retrieval Accuracy
@@ -213,39 +222,41 @@ This benchmark tests **LLM comprehension and data retrieval accuracy** across di
Four datasets designed to test different structural patterns:
1. **Tabular** (100 employee records): Uniform objects with identical fields optimal for TOON's tabular format.
2. **Nested** (50 e-commerce orders): Complex structures with nested customer objects and item arrays.
3. **Analytics** (60 days of metrics): Time-series data with dates and numeric values.
4. **GitHub** (100 repositories): Real-world data from top GitHub repos by stars.
1. **Tabular** (${tabularSize} employee records): Uniform objects with identical fields, optimal for TOON's tabular format.
2. **Nested** (${nestedSize} e-commerce orders): Complex structures with nested customer objects and item arrays.
3. **Analytics** (${analyticsSize} days of metrics): Time-series data with dates and numeric values.
4. **GitHub** (${githubSize} repositories): Real-world data from top GitHub repos by stars.
#### Question Types
${totalQuestions} questions are generated dynamically across three categories:
- **Field retrieval (50%)**: Direct value lookups
- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
- Example: "What is Alice's salary?" → \`75000\`
- Example: "How many items are in order ORD-0042?" → \`3\`
- Example: "What is the customer name for order ORD-0042?" → \`John Doe\`
- **Aggregation (25%)**: Counting and summation tasks
- **Aggregation (${aggregationPercent}%)**: Dataset-level totals and averages plus single-condition filters (counts, sums, min/max comparisons)
- Example: "How many employees work in Engineering?" → \`17\`
- Example: "What is the total revenue across all orders?" → \`45123.50\`
- Example: "How many employees have salary > 80000?" → \`23\`
- **Filtering (25%)**: Conditional queries
- **Filtering (${filteringPercent}%)**: Multi-condition queries requiring compound logic (AND constraints across fields)
- Example: "How many employees in Sales have salary > 80000?" → \`5\`
- Example: "How many orders have total > 400?" → \`12\`
- Example: "How many active employees have more than 10 years of experience?" → \`8\`
#### Evaluation Process
1. **Format conversion:** Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML).
1. **Format conversion:** Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => f.format.toUpperCase()).join(', ')}).
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
4. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).
3. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).
#### Models & Configuration
- **Models tested**: \`gpt-5-nano\`, \`claude-haiku-4-5\`, \`gemini-2.5-flash\`
- **Models tested**: ${modelsListStr}
- **Token counting**: Using \`gpt-tokenizer\` with \`o200k_base\` encoding (GPT-5 tokenizer)
- **Temperature**: 0 (for non-reasoning models)
- **Total evaluations**: 159 questions × 5 formats × 3 models = 2,385 LLM calls
- **Total evaluations**: ${totalQuestions} questions × ${formatCount} formats × ${modelsUsed.length} models = ${totalEvaluations.toLocaleString('en-US')} LLM calls
</details>
`.trimStart()
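
The LLM-as-judge step described in the report template above might look roughly like this, assuming the Vercel AI SDK (`ai` + `@ai-sdk/openai`); the helper name `judgeAnswer` and the prompt wording are illustrative, not the benchmark's actual implementation:

```ts
import { openai } from '@ai-sdk/openai'
import { generateText } from 'ai'

// Hypothetical sketch: gpt-5-nano decides whether a model's answer is
// semantically equivalent to the ground truth (e.g. `50000` vs `$50,000`).
async function judgeAnswer(question: string, groundTruth: string, answer: string): Promise<boolean> {
  const { text } = await generateText({
    model: openai('gpt-5-nano'),
    prompt: `Question: ${question}\nExpected: ${groundTruth}\nActual: ${answer}\n`
      + 'Are these semantically equivalent? Reply with only "yes" or "no".',
  })
  return text.trim().toLowerCase().startsWith('yes')
}
```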
@@ -272,6 +283,10 @@ export function calculateTokenCounts(
/**
* Save results to disk
*
* @remarks
* Per-model results are managed separately via storage.ts
* This function only generates the aggregated markdown report
*/
export async function saveResults(
results: EvaluationResult[],
@@ -279,31 +294,12 @@ export async function saveResults(
questions: Question[],
tokenCounts: Record<string, number>,
): Promise<string> {
const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy')
const resultsDir = path.join(BENCHMARKS_DIR, 'results')
await ensureDir(resultsDir)
// Save raw results
await saveJsonFile(path.join(resultsDir, 'raw-results.json'), results)
// Save summary
await saveJsonFile(
path.join(resultsDir, 'summary.json'),
{
formatResults,
questions: questions.length,
models: Object.keys(models),
datasets: datasets.map(d => ({ name: d.name, description: d.description })),
tokenCounts,
timestamp: new Date().toISOString(),
},
)
// Generate markdown report
// Generate markdown report from all available model results
const report = generateMarkdownReport(formatResults, results, questions, tokenCounts)
await fsp.writeFile(
path.join(resultsDir, 'report.md'),
report,
)
await fsp.writeFile(path.join(resultsDir, 'retrieval-accuracy.md'), report)
return resultsDir
}

46
benchmarks/src/storage.ts Normal file
View File

@@ -0,0 +1,46 @@
import type { Storage, StorageValue } from 'unstorage'
import type { EvaluationResult } from './types'
import * as path from 'node:path'
import { createStorage } from 'unstorage'
import fsDriver from 'unstorage/drivers/fs'
import { BENCHMARKS_DIR } from './constants'
/**
* Storage instance for model results
*
* @remarks
* Stores results in: `benchmarks/results/accuracy/models/`
*/
export const resultsStorage: Storage<StorageValue> = createStorage({
driver: fsDriver({
base: path.join(BENCHMARKS_DIR, 'results', 'accuracy', 'models'),
}),
})
export async function loadModelResults(modelId: string): Promise<EvaluationResult[] | undefined> {
const data = await resultsStorage.getItem<EvaluationResult[]>(modelId)
return data ?? undefined
}
export async function saveModelResults(modelId: string, results: EvaluationResult[]): Promise<void> {
await resultsStorage.setItem(modelId, results)
}
export async function getAllModelResults(): Promise<Record<string, EvaluationResult[]>> {
const keys = await resultsStorage.getKeys()
const results: Record<string, EvaluationResult[]> = {}
await Promise.all(
keys.map(async (modelId) => {
const data = await resultsStorage.getItem<EvaluationResult[]>(modelId)
if (data)
results[modelId] = data
}),
)
return results
}
export async function hasModelResults(modelId: string): Promise<boolean> {
return await resultsStorage.hasItem(modelId)
}
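
A hypothetical consumer of these storage helpers, resuming a benchmark run by skipping models that already have cached per-model results (`modelIds` and `evaluateModel` stand in for the actual evaluation module):

```ts
import type { EvaluationResult } from './types'
import { getAllModelResults, hasModelResults, saveModelResults } from './storage'

// Sketch only: wiring the storage helpers above into a resumable run.
async function runAllModels(
  modelIds: string[],
  evaluateModel: (id: string) => Promise<EvaluationResult[]>,
): Promise<Record<string, EvaluationResult[]>> {
  for (const modelId of modelIds) {
    if (await hasModelResults(modelId))
      continue // reuse cached results instead of re-querying the API
    const results = await evaluateModel(modelId)
    await saveModelResults(modelId, results)
  }
  // Aggregate everything now on disk for report generation
  return getAllModelResults()
}
```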

View File

@@ -1,13 +1,3 @@
/**
* Shared utility functions for TOON benchmarks
*
* Provides common functionality used across multiple benchmark scripts:
* - Progress bar visualization
* - Token counting
* - File I/O operations
* - Retry logic for API calls
*/
import * as fsp from 'node:fs/promises'
import { encode } from 'gpt-tokenizer'