docs: overhaul retrieval accuracy benchmark

This commit is contained in:
Johann Schopplich
2025-10-28 20:22:43 +01:00
parent efbe4ded88
commit 67c0df8cb0
22 changed files with 1553 additions and 27288 deletions

File diff suppressed because it is too large

View File

@@ -5,7 +5,7 @@
"scripts": {
"benchmark:token-efficiency": "tsx scripts/token-efficiency-benchmark.ts",
"benchmark:accuracy": "tsx --env-file=.env scripts/accuracy-benchmark.ts",
"fetch-github-data": "tsx scripts/fetch-github-data.ts",
"fetch:github-repos": "tsx scripts/fetch-github-repos.ts",
"test": "vitest"
},
"devDependencies": {
@@ -14,14 +14,16 @@
"@ai-sdk/openai": "^2.0.53",
"@ai-sdk/provider": "^2.0.0",
"@antfu/eslint-config": "^6.1.0",
"@clack/prompts": "^0.11.0",
"@faker-js/faker": "^10.1.0",
"ai": "^5.0.80",
"consola": "^3.4.2",
"csv-stringify": "^6.6.0",
"fast-xml-parser": "^5.3.0",
"gpt-tokenizer": "^3.2.0",
"ofetch": "^1.4.1",
"p-map": "^7.0.3",
"p-queue": "^9.0.0",
"unstorage": "^1.17.1",
"yaml": "^2.8.1"
}
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@@ -1,91 +0,0 @@
{
"formatResults": [
{
"format": "toon",
"accuracy": 0.8658280922431866,
"totalTokens": 4678,
"averageLatency": 5321,
"correctCount": 413,
"totalCount": 477
},
{
"format": "xml",
"accuracy": 0.8616352201257862,
"totalTokens": 9944,
"averageLatency": 6035,
"correctCount": 411,
"totalCount": 477
},
{
"format": "csv",
"accuracy": 0.8469601677148847,
"totalTokens": 4745,
"averageLatency": 6551,
"correctCount": 404,
"totalCount": 477
},
{
"format": "json",
"accuracy": 0.8322851153039832,
"totalTokens": 8713,
"averageLatency": 7981,
"correctCount": 397,
"totalCount": 477
},
{
"format": "yaml",
"accuracy": 0.8259958071278826,
"totalTokens": 7091,
"averageLatency": 5561,
"correctCount": 394,
"totalCount": 477
}
],
"questions": 159,
"models": [
"gpt-5-nano",
"claude-haiku-4-5",
"gemini-2.5-flash"
],
"datasets": [
{
"name": "tabular",
"description": "Uniform employee records (TOON optimal format)"
},
{
"name": "nested",
"description": "E-commerce orders with nested structures"
},
{
"name": "analytics",
"description": "Time-series analytics data"
},
{
"name": "github",
"description": "Top 100 GitHub repositories"
}
],
"tokenCounts": {
"json-tabular": 6347,
"json-nested": 9694,
"json-analytics": 3665,
"json-github": 15145,
"toon-tabular": 2483,
"toon-nested": 5967,
"toon-analytics": 1515,
"toon-github": 8745,
"csv-tabular": 2337,
"csv-nested": 6735,
"csv-analytics": 1393,
"csv-github": 8513,
"xml-tabular": 7314,
"xml-nested": 10992,
"xml-analytics": 4376,
"xml-github": 17095,
"yaml-tabular": 4969,
"yaml-nested": 7328,
"yaml-analytics": 2938,
"yaml-github": 13129
},
"timestamp": "2025-10-28T07:39:09.360Z"
}

View File

@@ -1,31 +1,31 @@
### Retrieval Accuracy
Accuracy across **3 LLMs** on **159 data retrieval questions**:
Accuracy across **3 LLMs** on **154 data retrieval questions**:
```
gpt-5-nano
toon ███████████████████ 99.4% (158/159)
yaml ██████████████████░ 95.0% (151/159)
csv ██████████████████░░ 92.5% (147/159)
json ██████████████████░░ 92.5% (147/159)
xml █████████████████░░ 91.2% (145/159)
claude-haiku-4-5
toon ███████████████░░░░░ 75.5% (120/159)
xml ███████████████░░░░░ 75.5% (120/159)
csv ███████████████░░░░░ 75.5% (120/159)
json ███████████████░░░░░ 75.5% (120/159)
yaml ███████████████░░░░░ 74.2% (118/159)
toon ███████████████████ 96.1% (148/154)
csv ██████████████████░ 90.3% (139/154)
yaml ██████████████████░░ 89.0% (137/154)
json ██████████████████░░ 87.7% (135/154)
xml █████████████████░░ 83.8% (129/154)
gemini-2.5-flash
xml ██████████████████░░ 91.8% (146/159)
csv █████████████████░░ 86.2% (137/159)
toon █████████████████░░░ 84.9% (135/159)
json ████████████████░░░░ 81.8% (130/159)
yaml ███████████████░░░░ 78.6% (125/159)
xml ██████████████████░░ 90.3% (139/154)
csv █████████████████░░ 89.0% (137/154)
toon █████████████████░░░ 87.0% (134/154)
json ████████████████░░░░ 79.2% (122/154)
yaml ███████████████░░░░ 76.0% (117/154)
claude-haiku-4-5-20251001
json ██████████░░░░░░░░░░ 48.7% (75/154)
toon ██████████░░░░░░░░░░ 48.1% (74/154)
xml █████████░░░░░░░░░░░ 47.4% (73/154)
yaml █████████░░░░░░░░░░░ 47.4% (73/154)
csv █████████░░░░░░░░░░░ 45.5% (70/154)
```
**Advantage:** TOON achieves **86.6% accuracy** (vs JSON's 83.2%) while using **46.3% fewer tokens**.
**Advantage:** TOON achieves **77.1% accuracy** (vs JSON's 71.9%) while using **46.3% fewer tokens**.
<details>
<summary><strong>Performance by dataset and model</strong></summary>
@@ -36,41 +36,41 @@ gemini-2.5-flash
| Format | Accuracy | Tokens | Correct/Total |
| ------ | -------- | ------ | ------------- |
| `toon` | 87.4% | 2.483 | 152/174 |
| `csv` | 82.8% | 2.337 | 144/174 |
| `yaml` | 83.9% | 4.969 | 146/174 |
| `json` | 83.9% | 6.347 | 146/174 |
| `xml` | 88.5% | 7.314 | 154/174 |
| `csv` | 74.7% | 2,337 | 112/150 |
| `toon` | 76.7% | 2,483 | 115/150 |
| `yaml` | 70.7% | 4,969 | 106/150 |
| `xml` | 77.3% | 7,314 | 116/150 |
| `json` | 69.3% | 6,347 | 104/150 |
##### E-commerce orders with nested structures
| Format | Accuracy | Tokens | Correct/Total |
| ------ | -------- | ------ | ------------- |
| `toon` | 90.9% | 5.967 | 120/132 |
| `csv` | 93.9% | 6.735 | 124/132 |
| `yaml` | 87.1% | 7.328 | 115/132 |
| `json` | 87.9% | 9.694 | 116/132 |
| `xml` | 93.2% | 10.992 | 123/132 |
| `toon` | 80.0% | 5,967 | 96/120 |
| `csv` | 75.8% | 6,735 | 91/120 |
| `yaml` | 74.2% | 7,328 | 89/120 |
| `json` | 79.2% | 9,694 | 95/120 |
| `xml` | 78.3% | 10,992 | 94/120 |
##### Time-series analytics data
| Format | Accuracy | Tokens | Correct/Total |
| ------ | -------- | ------ | ------------- |
| `csv` | 89.7% | 1.393 | 78/87 |
| `toon` | 88.5% | 1.515 | 77/87 |
| `yaml` | 83.9% | 2.938 | 73/87 |
| `json` | 88.5% | 3.665 | 77/87 |
| `xml` | 85.1% | 4.376 | 74/87 |
| `csv` | 75.5% | 1,393 | 77/102 |
| `toon` | 76.5% | 1,515 | 78/102 |
| `yaml` | 74.5% | 2,938 | 76/102 |
| `json` | 76.5% | 3,665 | 78/102 |
| `xml` | 74.5% | 4,376 | 76/102 |
##### Top 100 GitHub repositories
| Format | Accuracy | Tokens | Correct/Total |
| ------ | -------- | ------ | ------------- |
| `toon` | 76.2% | 8.745 | 64/84 |
| `csv` | 69.0% | 8.513 | 58/84 |
| `yaml` | 71.4% | 13.129 | 60/84 |
| `json` | 69.0% | 15.145 | 58/84 |
| `xml` | 71.4% | 17.095 | 60/84 |
| `toon` | 74.4% | 8,745 | 67/90 |
| `csv` | 73.3% | 8,513 | 66/90 |
| `yaml` | 62.2% | 13,129 | 56/90 |
| `json` | 61.1% | 15,145 | 55/90 |
| `xml` | 61.1% | 17,095 | 55/90 |
#### Performance by Model
@@ -78,31 +78,31 @@ gemini-2.5-flash
| Format | Accuracy | Correct/Total |
| ------ | -------- | ------------- |
| `toon` | 99.4% | 158/159 |
| `yaml` | 95.0% | 151/159 |
| `csv` | 92.5% | 147/159 |
| `json` | 92.5% | 147/159 |
| `xml` | 91.2% | 145/159 |
##### claude-haiku-4-5
| Format | Accuracy | Correct/Total |
| ------ | -------- | ------------- |
| `toon` | 75.5% | 120/159 |
| `xml` | 75.5% | 120/159 |
| `csv` | 75.5% | 120/159 |
| `json` | 75.5% | 120/159 |
| `yaml` | 74.2% | 118/159 |
| `toon` | 96.1% | 148/154 |
| `csv` | 90.3% | 139/154 |
| `yaml` | 89.0% | 137/154 |
| `json` | 87.7% | 135/154 |
| `xml` | 83.8% | 129/154 |
##### gemini-2.5-flash
| Format | Accuracy | Correct/Total |
| ------ | -------- | ------------- |
| `xml` | 91.8% | 146/159 |
| `csv` | 86.2% | 137/159 |
| `toon` | 84.9% | 135/159 |
| `json` | 81.8% | 130/159 |
| `yaml` | 78.6% | 125/159 |
| `xml` | 90.3% | 139/154 |
| `csv` | 89.0% | 137/154 |
| `toon` | 87.0% | 134/154 |
| `json` | 79.2% | 122/154 |
| `yaml` | 76.0% | 117/154 |
##### claude-haiku-4-5-20251001
| Format | Accuracy | Correct/Total |
| ------ | -------- | ------------- |
| `json` | 48.7% | 75/154 |
| `toon` | 48.1% | 74/154 |
| `xml` | 47.4% | 73/154 |
| `yaml` | 47.4% | 73/154 |
| `csv` | 45.5% | 70/154 |
</details>
@@ -124,31 +124,33 @@ Four datasets designed to test different structural patterns:
#### Question Types
159 questions are generated dynamically across three categories:
154 questions are generated dynamically across three categories:
- **Field retrieval (50%)**: Direct value lookups
- **Field retrieval (40%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
- Example: "What is Alice's salary?" → `75000`
- Example: "How many items are in order ORD-0042?" → `3`
- Example: "What is the customer name for order ORD-0042?" → `John Doe`
- **Aggregation (25%)**: Counting and summation tasks
- **Aggregation (32%)**: Dataset-level totals and averages plus single-condition filters (counts, sums, min/max comparisons)
- Example: "How many employees work in Engineering?" → `17`
- Example: "What is the total revenue across all orders?" → `45123.50`
- Example: "How many employees have salary > 80000?" → `23`
- **Filtering (25%)**: Conditional queries
- **Filtering (28%)**: Multi-condition queries requiring compound logic (AND constraints across fields)
- Example: "How many employees in Sales have salary > 80000?" → `5`
- Example: "How many orders have total > 400?" → `12`
- Example: "How many active employees have more than 10 years of experience?" → `8`
#### Evaluation Process
1. **Format conversion:** Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML).
1. **Format conversion:** Each dataset is converted to all 5 formats (TOON, CSV, XML, JSON, YAML).
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
4. **Validate with LLM-as-judge**: `gpt-5-nano` validates if the answer is semantically correct (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`).
3. **Validate with LLM-as-judge**: `gpt-5-nano` validates if the answer is semantically correct (e.g., `50000` = `$50,000`, `Engineering` = `engineering`, `2025-01-01` = `January 1, 2025`).
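Step 3 maps onto a single `generateText` call with a YES/NO prompt, using the same AI SDK setup that appears elsewhere in this commit. A rough sketch (the exact judge prompt wording is an assumption):
```ts
import { openai } from '@ai-sdk/openai'
import { generateText } from 'ai'

async function isSemanticallyCorrect(actual: string, expected: string): Promise<boolean> {
  const { text } = await generateText({
    model: openai('gpt-5-nano'),
    // Hypothetical judge prompt; the benchmark's exact wording may differ
    prompt: `Expected answer: ${expected}\nActual answer: ${actual}\nAre these semantically equivalent? Respond with only "YES" or "NO".`,
  })
  return text.trim().toUpperCase() === 'YES'
}
```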
#### Models & Configuration
- **Models tested**: `gpt-5-nano`, `claude-haiku-4-5`, `gemini-2.5-flash`
- **Models tested**: `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `gpt-5-nano`
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
- **Temperature**: 0 (for non-reasoning models)
- **Total evaluations**: 159 questions × 5 formats × 3 models = 2,385 LLM calls
- **Total evaluations**: 154 questions × 5 formats × 3 models = 2,310 LLM calls
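The token figures are produced by encoding each formatted payload and counting the resulting tokens. A minimal sketch of that step, assuming `gpt-tokenizer`'s per-encoding subpath import:
```ts
import { encode } from 'gpt-tokenizer/encoding/o200k_base'

// Count how many o200k_base (GPT-5) tokens a formatted payload costs
const payload = '{"employees":[{"id":1,"name":"Alice","salary":75000}]}'
console.log(encode(payload).length)
```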
</details>

View File

@@ -39,11 +39,11 @@ Total ████████████░░░░░
"repo": "freeCodeCamp/freeCodeCamp",
"description": "freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,…",
"createdAt": "2014-12-24T17:49:19Z",
"updatedAt": "2025-10-27T07:40:58Z",
"pushedAt": "2025-10-26T11:31:08Z",
"stars": 430828,
"watchers": 8582,
"forks": 42136,
"updatedAt": "2025-10-28T11:58:08Z",
"pushedAt": "2025-10-28T10:17:16Z",
"stars": 430886,
"watchers": 8583,
"forks": 42146,
"defaultBranch": "main"
},
{
@@ -52,11 +52,11 @@ Total ████████████░░░░░
"repo": "codecrafters-io/build-your-own-x",
"description": "Master programming by recreating your favorite technologies from scratch.",
"createdAt": "2018-05-09T12:03:18Z",
"updatedAt": "2025-10-27T07:43:25Z",
"updatedAt": "2025-10-28T12:37:11Z",
"pushedAt": "2025-10-10T18:45:01Z",
"stars": 430102,
"watchers": 6322,
"forks": 40388,
"stars": 430877,
"watchers": 6332,
"forks": 40453,
"defaultBranch": "master"
},
{
@@ -65,11 +65,11 @@ Total ████████████░░░░░
"repo": "sindresorhus/awesome",
"description": "😎 Awesome lists about all kinds of interesting topics",
"createdAt": "2014-07-11T13:42:37Z",
"updatedAt": "2025-10-27T07:44:27Z",
"pushedAt": "2025-10-23T17:26:53Z",
"stars": 409760,
"watchers": 8016,
"forks": 32015,
"updatedAt": "2025-10-28T12:40:21Z",
"pushedAt": "2025-10-27T17:57:31Z",
"stars": 410052,
"watchers": 8017,
"forks": 32029,
"defaultBranch": "main"
}
]
@@ -80,9 +80,9 @@ Total ████████████░░░░░
```
repositories[3]{id,name,repo,description,createdAt,updatedAt,pushedAt,stars,watchers,forks,defaultBranch}:
28457823,freeCodeCamp,freeCodeCamp/freeCodeCamp,"freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,…","2014-12-24T17:49:19Z","2025-10-27T07:40:58Z","2025-10-26T11:31:08Z",430828,8582,42136,main
132750724,build-your-own-x,codecrafters-io/build-your-own-x,Master programming by recreating your favorite technologies from scratch.,"2018-05-09T12:03:18Z","2025-10-27T07:43:25Z","2025-10-10T18:45:01Z",430102,6322,40388,master
21737465,awesome,sindresorhus/awesome,😎 Awesome lists about all kinds of interesting topics,"2014-07-11T13:42:37Z","2025-10-27T07:44:27Z","2025-10-23T17:26:53Z",409760,8016,32015,main
28457823,freeCodeCamp,freeCodeCamp/freeCodeCamp,"freeCodeCamp.org's open-source codebase and curriculum. Learn math, programming,…","2014-12-24T17:49:19Z","2025-10-28T11:58:08Z","2025-10-28T10:17:16Z",430886,8583,42146,main
132750724,build-your-own-x,codecrafters-io/build-your-own-x,Master programming by recreating your favorite technologies from scratch.,"2018-05-09T12:03:18Z","2025-10-28T12:37:11Z","2025-10-10T18:45:01Z",430877,6332,40453,master
21737465,awesome,sindresorhus/awesome,😎 Awesome lists about all kinds of interesting topics,"2014-07-11T13:42:37Z","2025-10-28T12:40:21Z","2025-10-27T17:57:31Z",410052,8017,32029,main
```
---

View File

@@ -1,51 +1,53 @@
/**
* LLM Retrieval Accuracy Benchmark
*
* Main entry point that orchestrates the full benchmark:
* 1. Generate questions from datasets
* 2. Format data in all formats (TOON, JSON, YAML, CSV, XML)
* 3. Evaluate each question with each format using LLMs
* 4. Generate reports
*/
import type { EvaluationResult, Question } from '../src/types'
import * as fsp from 'node:fs/promises'
import type { Question } from '../src/types'
import * as path from 'node:path'
import { consola } from 'consola'
import pMap from 'p-map'
import { BENCHMARKS_DIR, DEFAULT_CONCURRENCY, DRY_RUN, DRY_RUN_LIMITS, ROOT_DIR } from '../src/constants'
import process from 'node:process'
import * as prompts from '@clack/prompts'
import PQueue from 'p-queue'
import { DEFAULT_CONCURRENCY, DRY_RUN, DRY_RUN_LIMITS, MODEL_RPM_LIMITS, ROOT_DIR } from '../src/constants'
import { datasets } from '../src/datasets'
import { evaluateQuestion, models } from '../src/evaluate'
import { formatters } from '../src/formatters'
import { generateQuestions } from '../src/questions'
import { calculateFormatResults, calculateTokenCounts, saveResults } from '../src/report'
import { getAllModelResults, hasModelResults, saveModelResults } from '../src/storage'
consola.start('Retrieval Accuracy Benchmark for TOON')
prompts.intro('Retrieval Accuracy Benchmark')
// Check if results already exist
const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy')
const rawResultsPath = path.join(resultsDir, 'raw-results.json')
const summaryPath = path.join(resultsDir, 'summary.json')
// Prompt user to select which models to benchmark
const modelChoices = models.map(({ modelId }) => ({
value: modelId,
label: modelId,
}))
let existingResults: EvaluationResult[] | undefined
let existingTokenCounts: Record<string, number> | undefined
const selectedModels = await prompts.multiselect({
message: 'Select models to benchmark (Space to select, Enter to confirm)',
options: modelChoices,
required: true,
})
try {
const [rawData, summaryData] = await Promise.all([
fsp.readFile(rawResultsPath, 'utf-8'),
fsp.readFile(summaryPath, 'utf-8'),
])
existingResults = JSON.parse(rawData)
const summary = JSON.parse(summaryData)
existingTokenCounts = summary.tokenCounts
consola.info('Found existing results, regenerating report only')
if (prompts.isCancel(selectedModels)) {
prompts.cancel('Benchmark cancelled')
process.exit(0)
}
catch {
// Results don't exist, will run full evaluation
const activeModels = models.filter(m => selectedModels.includes(m.modelId))
prompts.log.info(`Selected ${activeModels.length} model(s): ${activeModels.map(m => m.modelId).join(', ')}`)
// Check which models already have results
const existingModelResults: Record<string, boolean> = {}
for (const model of activeModels) {
const existingResult = await hasModelResults(model.modelId)
if (existingResult)
existingModelResults[model.modelId] = existingResult
}
if (Object.keys(existingModelResults).length > 0) {
prompts.log.info(`Found existing results for ${Object.values(existingModelResults).length} model(s)`)
}
if (DRY_RUN) {
consola.info('Limiting questions and models for dry run')
prompts.log.info('Limiting questions and models for dry run')
}
let questions = generateQuestions()
@@ -55,79 +57,98 @@ if (DRY_RUN && DRY_RUN_LIMITS.maxQuestions) {
questions = questions.slice(0, DRY_RUN_LIMITS.maxQuestions)
}
// Filter models for dry run
const activeModels = DRY_RUN && DRY_RUN_LIMITS.allowedModels.length > 0
? Object.fromEntries(
Object.entries(models).filter(([name]) => DRY_RUN_LIMITS.allowedModels.includes(name)),
)
: models
prompts.log.info(`Evaluating ${questions.length} questions`)
prompts.log.info(`Testing ${Object.keys(formatters).length} formats`)
let results: EvaluationResult[]
let tokenCounts: Record<string, number>
// Evaluate each model separately and save results incrementally
for (const model of activeModels) {
const modelId = model.modelId
if (existingResults && existingTokenCounts) {
// Reuse existing results
results = existingResults
tokenCounts = existingTokenCounts
}
else {
// Run full evaluation
consola.info(`Evaluating ${questions.length} questions`)
consola.info(`Testing ${Object.keys(formatters).length} formats`)
consola.info(`Using ${Object.keys(activeModels).length} models: ${Object.keys(activeModels).join(', ')}`)
// Skip if results already exist
if (existingModelResults[modelId]) {
prompts.log.info(`Skipping ${modelId} (results already exist)`)
continue
}
// Calculate token counts for all format+dataset combinations
tokenCounts = calculateTokenCounts(formatters)
// Generate evaluation tasks
const tasks: { question: Question, formatName: string, modelName: string }[] = []
prompts.log.step(`Running benchmark for ${modelId}`)
// Generate evaluation tasks for this model
const tasks: { question: Question, formatName: string }[] = []
for (const question of questions) {
for (const [formatName] of Object.entries(formatters)) {
for (const [modelName] of Object.entries(activeModels)) {
tasks.push({ question, formatName, modelName })
}
tasks.push({ question, formatName })
}
}
const total = tasks.length
consola.start(`Running ${total} evaluations with concurrency: ${DEFAULT_CONCURRENCY}`)
const rpmLimit = MODEL_RPM_LIMITS[modelId]
const queue = new PQueue({
concurrency: DEFAULT_CONCURRENCY,
intervalCap: rpmLimit,
interval: rpmLimit ? 60_000 : undefined,
})
results = await pMap(
tasks,
async (task, index) => {
const evalSpinner = prompts.spinner()
evalSpinner.start(`Running ${total} evaluations (concurrency: ${DEFAULT_CONCURRENCY}, RPM limit: ${rpmLimit ?? 'unlimited'})`)
let completed = 0
// Queue all tasks
const modelResultPromises = tasks.map(task =>
queue.add(async () => {
// Format data on-demand
const dataset = datasets.find(d => d.name === task.question.dataset)!
const formatter = formatters[task.formatName]!
const formattedData = formatter(dataset.data)
const model = activeModels[task.modelName as keyof typeof activeModels]!
const result = await evaluateQuestion({
question: task.question,
formatName: task.formatName,
formattedData,
model,
modelName: task.modelName,
})
// Progress update after task completes
if ((index + 1) % 10 === 0 || (index + 1) === total) {
const percent = (((index + 1) / total) * 100).toFixed(1)
consola.start(`Progress: ${index + 1}/${total} (${percent}%)`)
completed++
if (completed % 10 === 0 || completed === total) {
const percent = ((completed / total) * 100).toFixed(1)
evalSpinner.message(`Progress: ${completed}/${total} (${percent}%)`)
}
return result
},
{ concurrency: DEFAULT_CONCURRENCY },
}),
)
consola.success('Evaluation complete!')
// Wait for all tasks to complete
const modelResults = await Promise.all(modelResultPromises)
evalSpinner.stop(`Evaluation complete for ${modelId}`)
// Save results immediately for this model
await saveModelResults(modelId, modelResults)
prompts.log.success(`Saved results for ${modelId}`)
}
// Generate/regenerate markdown report
consola.start('Generating report and saving results…')
const formatResults = calculateFormatResults(results, tokenCounts)
await saveResults(results, formatResults, questions, tokenCounts)
// Generate/regenerate markdown report from all available model results
const reportSpinner = prompts.spinner()
reportSpinner.start('Generating report from all model results')
consola.info(`Results saved to: \`${path.relative(ROOT_DIR, resultsDir)}\``)
consola.success(existingResults ? 'Markdown report regenerated!' : 'Evaluation complete!')
// Load all available model results (including any that were skipped)
const allModelResults = await getAllModelResults()
const allResults = Object.values(allModelResults).flat()
if (allResults.length === 0) {
prompts.log.warn('No results available to generate report')
process.exit(0)
}
// Calculate token counts freshly (deterministic, no need to persist)
const tokenCounts = calculateTokenCounts(formatters)
// Calculate format statistics and save report
const formatResults = calculateFormatResults(allResults, tokenCounts)
const resultsDir = await saveResults(allResults, formatResults, questions, tokenCounts)
const reportPath = path.join(resultsDir, 'retrieval-accuracy.md')
prompts.log.info(`Report saved to: \`${path.relative(ROOT_DIR, reportPath)}\``)
reportSpinner.stop('Report generation complete!')

View File

@@ -1,18 +1,20 @@
import * as path from 'node:path'
import process from 'node:process'
import { consola } from 'consola'
import * as prompts from '@clack/prompts'
import { ofetch } from 'ofetch'
import pMap from 'p-map'
import { BENCHMARKS_DIR } from '../src/constants'
import { ensureDir, saveJsonFile } from '../src/utils'
prompts.intro('GitHub Repositories Fetcher')
try {
// Fetch top 100 repos from GitHub
const repoList = await searchTop100Repos()
const repos = await fetchRepoDetails(repoList)
if (repos.length === 0) {
consola.error('No repositories fetched. Exiting.')
prompts.log.error('No repositories fetched. Exiting.')
process.exit(1)
}
@@ -21,15 +23,16 @@ try {
await saveRepos(repos)
consola.success('Done!')
prompts.log.success('Done!')
}
catch (error) {
consola.error(error)
prompts.log.error(String(error))
process.exit(1)
}
async function searchTop100Repos(): Promise<string[]> {
consola.start('Fetching top 100 starred repositories from GitHub API…')
const s = prompts.spinner()
s.start('Fetching top 100 starred repositories')
const response = await ofetch<{ items: { full_name: string }[] }>(
'https://api.github.com/search/repositories',
@@ -47,23 +50,26 @@ async function searchTop100Repos(): Promise<string[]> {
},
)
s.stop('Fetched top 100 repositories')
return response.items.map(item => item.full_name)
}
async function fetchRepoDetails(repoList: string[]): Promise<Record<string, any>[]> {
consola.start(`Fetching ${repoList.length} GitHub repositories…`)
const s = prompts.spinner()
s.start(`Fetching ${repoList.length} GitHub repositories`)
const repos = await pMap(
repoList,
async (repoPath, index) => {
consola.info(`[${index + 1}/${repoList.length}] Fetching ${repoPath}`)
s.message(`[${index + 1}/${repoList.length}] Fetching ${repoPath}`)
const { repo } = await ofetch(`https://ungh.cc/repos/${repoPath}`)
return repo
},
{ concurrency: 5 },
)
consola.success(`Successfully fetched ${repos.length}/${repoList.length} repositories`)
s.stop(`Successfully fetched ${repos.length}/${repoList.length} repositories`)
return repos
}
@@ -76,5 +82,5 @@ async function saveRepos(repos: Record<string, any>[]): Promise<void> {
await saveJsonFile(outputFile, repos)
const relativePath = path.relative(BENCHMARKS_DIR, outputFile)
consola.info(`Saved to \`${relativePath}\``)
prompts.log.info(`Result saved to \`${relativePath}\``)
}

View File

@@ -1,6 +1,6 @@
import * as fsp from 'node:fs/promises'
import * as path from 'node:path'
import { consola } from 'consola'
import * as prompts from '@clack/prompts'
import { encode } from '../../src/index'
import githubRepos from '../data/github-repos.json' with { type: 'json' }
import { BENCHMARKS_DIR, ROOT_DIR } from '../src/constants'
@@ -24,8 +24,6 @@ interface BenchmarkResult {
showDetailed: boolean
}
const outputFilePath = path.join(BENCHMARKS_DIR, 'results', 'token-efficiency.md')
const BENCHMARK_EXAMPLES = [
{
name: 'GitHub Repositories',
@@ -50,6 +48,8 @@ const BENCHMARK_EXAMPLES = [
},
] as const
prompts.intro('Token Efficiency Benchmark')
// Calculate total savings
let totalJsonTokens = 0
let totalToonTokens = 0
@@ -204,9 +204,12 @@ ${detailedExamples}
</details>
`.trimStart()
console.log(`${barChartSection}\n`)
prompts.log.message(`${barChartSection}\n`)
await ensureDir(path.join(BENCHMARKS_DIR, 'results'))
const resultsDir = path.join(BENCHMARKS_DIR, 'results')
await ensureDir(resultsDir)
const outputFilePath = path.join(resultsDir, 'token-efficiency.md')
await fsp.writeFile(outputFilePath, markdown, 'utf-8')
consola.success(`Benchmark written to \`${path.relative(ROOT_DIR, outputFilePath)}\``)
prompts.log.success(`Result saved to \`${path.relative(ROOT_DIR, outputFilePath)}\``)

View File

@@ -5,9 +5,22 @@ export const ROOT_DIR: string = url.fileURLToPath(new URL('../../', import.meta.
export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.meta.url))
/**
* Default concurrency for parallel evaluations
* Model-specific RPM (requests per minute) limits to handle API quotas
*
* @remarks
* Set to `undefined` for models without specific limits
*/
export const DEFAULT_CONCURRENCY = 20
/// keep-sorted
export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
'claude-haiku-4-5-20251001': 50,
'gemini-2.5-flash': 25,
'gpt-5-nano': undefined,
}
/**
* Default concurrency for parallel evaluations to prevent bursting
*/
export const DEFAULT_CONCURRENCY = 10
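Taken together, these two constants are meant to drive a rate-limited queue, as wired up in the benchmark script earlier in this commit. A hedged sketch of a variant that only passes interval options when a model actually has an RPM cap:
```ts
import PQueue from 'p-queue'

// Hypothetical helper: throttle requests for one model based on its RPM cap,
// using the MODEL_RPM_LIMITS and DEFAULT_CONCURRENCY constants defined above
function createModelQueue(modelId: string) {
  const rpmLimit = MODEL_RPM_LIMITS[modelId]
  return new PQueue({
    concurrency: DEFAULT_CONCURRENCY,
    // Spread interval options only for capped models; uncapped models rely on concurrency alone
    ...(rpmLimit ? { intervalCap: rpmLimit, interval: 60_000 } : {}),
  })
}
```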
/**
* Progress bar configuration
@@ -28,13 +41,83 @@ export const PROGRESS_BAR = {
export const DRY_RUN: boolean = process.env.DRY_RUN === 'true'
/**
* Limits applied when DRY_RUN is enabled
* Limits applied during dry run mode
*/
export const DRY_RUN_LIMITS = {
/** Maximum number of questions to evaluate */
maxQuestions: 10,
/** Maximum number of formats to test */
maxFormats: undefined as number | undefined,
/** Models to use in dry run */
allowedModels: [] as string[],
}
/**
* Threshold values for filtering and aggregation questions
*/
export const QUESTION_THRESHOLDS = {
tabular: {
salaryRanges: [60000, 80000, 100000, 120000],
experienceYears: [5, 10, 15, 20],
departmentSalaryThreshold: 80000,
departmentExperienceThreshold: 10,
},
nested: {
highValueOrders: [200, 400, 600],
statusValueThreshold: 300,
itemCountThreshold: 3,
totalThresholdsForItems: [300, 500],
},
analytics: {
views: [5000, 7000],
conversions: [10, 30],
viewsForFiltering: [6000, 7000],
conversionsForFiltering: 15,
revenueThresholds: [500, 1000, 1500, 2000, 2500],
viewsThresholdForRevenue: 6000,
clicksForFiltering: [250, 400],
conversionsForClickFiltering: 15,
revenueForBounceRate: [1000, 1500],
bounceRateThreshold: 0.5,
},
github: {
stars: [100000, 150000, 200000],
forks: [20000, 35000, 50000],
watchers: [5000, 8000],
starForkCombinations: [
{ stars: 75000, forks: 15000 },
{ stars: 100000, forks: 20000 },
{ stars: 150000, forks: 30000 },
{ stars: 200000, forks: 45000 },
],
starWatcherCombinations: [
{ stars: 100000, watchers: 7000 },
{ stars: 150000, watchers: 9000 },
],
},
} as const
/**
* Question generation configuration
*/
export const QUESTION_LIMITS = {
tabular: {
fieldRetrieval: 20,
aggregationDepartments: 6,
filteringMultiConditionDepartments: 6,
filteringExperience: 4,
filteringDepartmentExp: 3,
filteringDepartmentActive: 3,
},
nested: {
fieldRetrievalOrders: 8,
fieldRetrievalCustomers: 10,
aggregationStatuses: 5,
filteringStatusAndValue: 5,
filteringStatusAndItems: 3,
},
analytics: {
fieldRetrievalDates: 13,
},
github: {
fieldRetrievalRepos: 11,
aggregationBranches: 2,
filteringStarsAndForks: 8,
},
} as const

View File

@@ -1,12 +1,3 @@
/**
* Datasets for TOON benchmarks
*
* These datasets are designed to test TOON's strengths and weaknesses:
* - Tabular: Uniform records (TOON optimal)
* - Nested: Complex structures with nested objects
* - Analytics: Time-series data
*/
import type { Dataset } from './types'
import { faker } from '@faker-js/faker'
import githubRepos from '../data/github-repos.json' with { type: 'json' }
@@ -128,7 +119,7 @@ const tabularDataset: Dataset = {
description: 'Uniform employee records (TOON optimal format)',
data: {
employees: Array.from({ length: 100 }, (_, i): Employee => {
const yearsExp = faker.number.int({ min: 1, max: 20 })
const yearsExp = faker.number.int({ min: 1, max: 25 })
return {
id: i + 1,
name: faker.person.fullName(),

View File

@@ -1,28 +1,19 @@
/**
* LLM evaluation logic for TOON benchmarks
*
* Handles:
* - Model configuration
* - Question evaluation with LLMs
* - Answer validation using LLM-as-judge
*/
import type { LanguageModelV2 } from '@ai-sdk/provider'
import type { EvaluationResult, Question } from './types'
import { anthropic } from '@ai-sdk/anthropic'
import { google } from '@ai-sdk/google'
import { openai } from '@ai-sdk/openai'
import * as prompts from '@clack/prompts'
import { generateText } from 'ai'
import { consola } from 'consola'
/**
* Models used for evaluation
*/
export const models: Record<string, LanguageModelV2> = {
'gpt-5-nano': openai('gpt-5-nano'),
'claude-haiku-4-5': anthropic('claude-haiku-4-5-20251001'),
'gemini-2.5-flash': google('gemini-2.5-flash'),
}
export const models: LanguageModelV2[] = [
openai('gpt-5-nano'),
google('gemini-2.5-flash'),
anthropic('claude-haiku-4-5-20251001'),
]
/**
* Evaluate a single question with a specific format and model
@@ -33,14 +24,12 @@ export async function evaluateQuestion(
formatName,
formattedData,
model,
modelName,
}:
{
question: Question
formatName: string
formattedData: string
model: LanguageModelV2
modelName: string
},
): Promise<EvaluationResult> {
const prompt = `
@@ -59,10 +48,11 @@ Provide only the direct answer, without any additional explanation or formatting
const { text, usage } = await generateText({
model,
prompt,
temperature: !model.modelId.startsWith('gpt-') ? 0 : undefined,
temperature: !model.modelId.startsWith('gpt-5') ? 0 : undefined,
})
const latencyMs = performance.now() - startTime
const isCorrect = await validateAnswer({
actual: text.trim(),
expected: question.groundTruth,
@@ -72,7 +62,7 @@ Provide only the direct answer, without any additional explanation or formatting
return {
questionId: question.id,
format: formatName,
model: modelName,
model: model.modelId,
expected: question.groundTruth,
actual: text.trim(),
isCorrect,
@@ -115,14 +105,14 @@ Respond with only "YES" or "NO".
try {
const { text } = await generateText({
model: models['gpt-5-nano']!,
model: models.find(m => m.modelId === 'gpt-5-nano')!,
prompt,
})
return text.trim().toUpperCase() === 'YES'
}
catch (error) {
consola.error('Validation error:', error)
prompts.log.error(`Validation error: ${error}`)
// Fallback to simple string comparison
return actual.toLowerCase().trim() === expected.toLowerCase().trim()
}

View File

@@ -1,20 +1,3 @@
/**
* Format converters for TOON benchmarks
*
* Converts data to different formats for comparison:
* - JSON
* - TOON
* - CSV
* - XML
* - YAML
*
* ## Semantic Equivalence
*
* All formatters attempt to preserve semantic equivalence with the source data,
* meaning the converted data should represent the same information. However,
* CSV has inherent limitations with nested structures (see `toCSV` docs).
*/
import { stringify as stringifyCSV } from 'csv-stringify/sync'
import { XMLBuilder } from 'fast-xml-parser'
import { stringify as stringifyYAML } from 'yaml'
@@ -23,7 +6,10 @@ import { encode as encodeToon } from '../../src/index'
/**
* Format converters registry
*
* Each formatter takes unknown data and returns a string representation
* @remarks
* All formatters attempt to preserve semantic equivalence with the source data,
* meaning the converted data should represent the same information. However,
* CSV has inherent limitations with nested structures (see `toCSV` docs).
*/
export const formatters: Record<string, (data: unknown) => string> = {
json: data => JSON.stringify(data, undefined, 2),
@@ -37,11 +23,13 @@ export const formatters: Record<string, (data: unknown) => string> = {
* Convert data to CSV format
*
* @remarks
* **Limitations**: CSV is designed for flat tabular data only. This formatter:
* - Only handles top-level objects with arrays of flat objects
* - Cannot properly represent deeply nested structures (nested arrays/objects within rows)
* - Loses nested structure information during conversion
* - May produce misleading results for datasets with complex nesting (e.g., e-commerce orders with nested items)
* Limitations: CSV is designed for flat tabular data only.
*
* This formatter:
* - Only handles top-level objects with arrays of flat objects
* - Cannot properly represent deeply nested structures (nested arrays/objects within rows)
* - Loses nested structure information during conversion
* - May produce misleading results for datasets with complex nesting (e.g., e-commerce orders with nested items)
*
* For datasets with nested structures, CSV comparisons may not be fair or representative
* of how CSV would typically be used in practice.
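To make the limitation concrete, here is a minimal, hypothetical flattening of one nested order: the `items` array only survives as an opaque serialized string in a single cell, which is why CSV results on the nested dataset should be read with care.
```ts
import { stringify } from 'csv-stringify/sync'

const orders = [
  { orderId: 'ORD-0001', total: 129.99, items: [{ sku: 'A1', qty: 2 }, { sku: 'B2', qty: 1 }] },
]

// Pre-serialize the nested array so each order fits on one flat CSV row
const rows = orders.map(o => ({
  orderId: o.orderId,
  total: o.total,
  items: JSON.stringify(o.items),
}))

console.log(stringify(rows, { header: true }))
// → orderId,total,items
//   ORD-0001,129.99,"[{""sku"":""A1"",""qty"":2},…"
```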

View File

@@ -1,24 +1,18 @@
/**
* Question generation for TOON benchmarks
*
* Generates ~160 questions across different types:
* - Field retrieval (50%): "What is X's Y?"
* - Aggregation (25%): "How many X have Y?"
* - Filtering (25%): "List/count X where Y"
*
* Questions are generated dynamically based on actual data values
*
* TODO: Balance question distribution across datasets to ensure fair representation.
* Current distribution:
* - Tabular: 70 questions (43%)
* - Nested: 50 questions (31%)
* - Analytics: 40 questions (25%)
* - GitHub: 40 questions (25%)
* Generates ~150-160 questions across different question types and datasets:
* - Field Retrieval: Direct field access with no computation
* Examples: "What is X's salary?", "What is the status of order Y?"
* - Aggregation: Counts, sums, averages, min/max operations (including single-condition filters)
* Examples: "How many X?", "What is the total/average?", "How many X > threshold?"
* - Filtering: Multi-condition queries requiring complex logical operations
* Examples: "How many X WHERE condition1 AND condition2?"
*/
import type { AnalyticsMetric, Employee, Order, Repository } from './datasets'
import type { Question } from './types'
import { consola } from 'consola'
import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from './constants'
import { datasets } from './datasets'
/**
@@ -34,19 +28,15 @@ export function generateQuestions(): Question[] {
const analytics = (datasets.find(d => d.name === 'analytics')?.data.metrics as AnalyticsMetric[]) ?? []
const github = (datasets.find(d => d.name === 'github')?.data.repositories as Repository[]) ?? []
// ========================================
// TABULAR DATASET QUESTIONS (70 questions)
// ========================================
if (tabular.length > 0) {
// Field retrieval: specific employees (40 questions)
for (let i = 0; i < Math.min(40, tabular.length); i++) {
// Field retrieval: specific employees
for (let i = 0; i < Math.min(QUESTION_LIMITS.tabular.fieldRetrieval, tabular.length); i++) {
const emp = tabular[i * 2] || tabular[i]
if (!emp)
continue
// Alternate between different field types
if (i % 3 === 0) {
// Rotate through all field types
if (i % 5 === 0) {
questions.push({
id: `q${idCounter++}`,
prompt: `What is the salary of ${emp.name}?`,
@@ -55,7 +45,7 @@ export function generateQuestions(): Question[] {
dataset: 'tabular',
})
}
else if (i % 3 === 1) {
else if (i % 5 === 1) {
questions.push({
id: `q${idCounter++}`,
prompt: `What department does ${emp.name} work in?`,
@@ -64,7 +54,7 @@ export function generateQuestions(): Question[] {
dataset: 'tabular',
})
}
else {
else if (i % 5 === 2) {
questions.push({
id: `q${idCounter++}`,
prompt: `What is the email address of ${emp.name}?`,
@@ -73,11 +63,29 @@ export function generateQuestions(): Question[] {
dataset: 'tabular',
})
}
else if (i % 5 === 3) {
questions.push({
id: `q${idCounter++}`,
prompt: `How many years of experience does ${emp.name} have?`,
groundTruth: String(emp.yearsExperience),
type: 'field-retrieval',
dataset: 'tabular',
})
}
else {
questions.push({
id: `q${idCounter++}`,
prompt: `Is ${emp.name} an active employee?`,
groundTruth: emp.active ? 'yes' : 'no',
type: 'field-retrieval',
dataset: 'tabular',
})
}
}
// Aggregation: count by department
const departments = [...new Set(tabular.map(e => e.department))]
for (const dept of departments.slice(0, 6)) {
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.aggregationDepartments)) {
const count = tabular.filter(e => e.department === dept).length
questions.push({
id: `q${idCounter++}`,
@@ -88,9 +96,8 @@ export function generateQuestions(): Question[] {
})
}
// Aggregation: salary ranges (4 questions)
const salaryThresholds = [60000, 80000, 100000, 120000]
for (const threshold of salaryThresholds) {
// Aggregation: salary ranges (single-condition filters)
for (const threshold of QUESTION_THRESHOLDS.tabular.salaryRanges) {
const count = tabular.filter(e => e.salary > threshold).length
questions.push({
id: `q${idCounter++}`,
@@ -101,39 +108,57 @@ export function generateQuestions(): Question[] {
})
}
// Filtering: active status
// Aggregation: totals and averages
const totalEmployees = tabular.length
const avgSalary = Math.round(tabular.reduce((sum, e) => sum + e.salary, 0) / totalEmployees)
const activeCount = tabular.filter(e => e.active).length
const inactiveCount = tabular.filter(e => !e.active).length
questions.push(
{
id: `q${idCounter++}`,
prompt: 'How many employees are in the dataset?',
groundTruth: String(totalEmployees),
type: 'aggregation',
dataset: 'tabular',
},
{
id: `q${idCounter++}`,
prompt: 'What is the average salary across all employees?',
groundTruth: String(avgSalary),
type: 'aggregation',
dataset: 'tabular',
},
{
id: `q${idCounter++}`,
prompt: 'How many employees are active?',
groundTruth: String(activeCount),
type: 'filtering',
type: 'aggregation',
dataset: 'tabular',
},
{
id: `q${idCounter++}`,
prompt: 'How many employees are inactive?',
groundTruth: String(inactiveCount),
type: 'filtering',
type: 'aggregation',
dataset: 'tabular',
},
)
// Complex filtering: multi-condition (8 questions)
for (const dept of departments.slice(0, 4)) {
const count = tabular.filter(e => e.department === dept && e.salary > 80000).length
// Filtering: count by department with salary filter (multi-condition)
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringMultiConditionDepartments)) {
const count = tabular.filter(e => e.department === dept && e.salary > QUESTION_THRESHOLDS.tabular.departmentSalaryThreshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many employees in ${dept} have a salary greater than 80000?`,
prompt: `How many employees in ${dept} have a salary greater than ${QUESTION_THRESHOLDS.tabular.departmentSalaryThreshold}?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'tabular',
})
}
for (const exp of [5, 10]) {
// Filtering: active employees by experience (multi-condition)
for (const exp of QUESTION_THRESHOLDS.tabular.experienceYears.slice(0, QUESTION_LIMITS.tabular.filteringExperience)) {
const count = tabular.filter(e => e.yearsExperience > exp && e.active).length
questions.push({
id: `q${idCounter++}`,
@@ -143,15 +168,35 @@ export function generateQuestions(): Question[] {
dataset: 'tabular',
})
}
// Filtering: department by experience (multi-condition)
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentExp)) {
const count = tabular.filter(e => e.department === dept && e.yearsExperience > QUESTION_THRESHOLDS.tabular.departmentExperienceThreshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many employees in ${dept} have more than ${QUESTION_THRESHOLDS.tabular.departmentExperienceThreshold} years of experience?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'tabular',
})
}
// Filtering: department by active status (multi-condition)
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentActive)) {
const count = tabular.filter(e => e.department === dept && e.active).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many active employees work in ${dept}?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'tabular',
})
}
}
// ========================================
// NESTED DATASET QUESTIONS (50 questions)
// ========================================
if (nested.length > 0) {
// Field retrieval: order totals (20 questions)
for (let i = 0; i < Math.min(20, nested.length); i++) {
// Field retrieval: order totals and statuses
for (let i = 0; i < Math.min(QUESTION_LIMITS.nested.fieldRetrievalOrders, nested.length); i++) {
const order = nested[i * 2] || nested[i]
if (!order)
continue
@@ -159,7 +204,7 @@ export function generateQuestions(): Question[] {
if (i % 2 === 0) {
questions.push({
id: `q${idCounter++}`,
prompt: `What is the total amount for order ${order.orderId}?`,
prompt: `What is the total for order ${order.orderId}?`,
groundTruth: String(order.total),
type: 'field-retrieval',
dataset: 'nested',
@@ -176,51 +221,143 @@ export function generateQuestions(): Question[] {
}
}
// Field retrieval: customer info (15 questions)
for (let i = 0; i < Math.min(15, nested.length); i++) {
const order = nested[i * 3] || nested[i]
// Field retrieval: customer info and order dates (expanded)
for (let i = 0; i < Math.min(QUESTION_LIMITS.nested.fieldRetrievalCustomers, nested.length); i++) {
const order = nested[i * 2 + 1] || nested[i]
if (!order)
continue
questions.push({
id: `q${idCounter++}`,
prompt: `What is the customer name for order ${order.orderId}?`,
groundTruth: order.customer.name,
type: 'field-retrieval',
dataset: 'nested',
})
if (i % 4 === 0) {
questions.push({
id: `q${idCounter++}`,
prompt: `What is the customer name for order ${order.orderId}?`,
groundTruth: order.customer.name,
type: 'field-retrieval',
dataset: 'nested',
})
}
else if (i % 4 === 1) {
questions.push({
id: `q${idCounter++}`,
prompt: `What is the customer email for order ${order.orderId}?`,
groundTruth: order.customer.email,
type: 'field-retrieval',
dataset: 'nested',
})
}
else if (i % 4 === 2) {
questions.push({
id: `q${idCounter++}`,
prompt: `What is the order date for order ${order.orderId}?`,
groundTruth: order.orderDate || '',
type: 'field-retrieval',
dataset: 'nested',
})
}
else {
questions.push({
id: `q${idCounter++}`,
prompt: `How many items are in order ${order.orderId}?`,
groundTruth: String(order.items.length),
type: 'field-retrieval',
dataset: 'nested',
})
}
}
// Aggregation: count by status
// Aggregation: totals and averages
const totalRevenue = nested.reduce((sum, o) => sum + o.total, 0)
const avgOrderValue = totalRevenue / nested.length
const totalOrders = nested.length
const maxOrderValue = Math.max(...nested.map(o => o.total))
// Count by status
const statuses = [...new Set(nested.map(o => o.status))]
for (const status of statuses) {
for (const status of statuses.slice(0, QUESTION_LIMITS.nested.aggregationStatuses)) {
const count = nested.filter(o => o.status === status).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many orders have status "${status}"?`,
groundTruth: String(count),
type: 'aggregation',
dataset: 'nested',
})
}
questions.push(
{
id: `q${idCounter++}`,
prompt: 'What is the total revenue across all orders?',
groundTruth: String(totalRevenue.toFixed(2)),
type: 'aggregation',
dataset: 'nested',
},
{
id: `q${idCounter++}`,
prompt: 'What is the average order value?',
groundTruth: String(avgOrderValue.toFixed(2)),
type: 'aggregation',
dataset: 'nested',
},
{
id: `q${idCounter++}`,
prompt: 'How many orders are in the dataset?',
groundTruth: String(totalOrders),
type: 'aggregation',
dataset: 'nested',
},
{
id: `q${idCounter++}`,
prompt: 'What is the highest order total?',
groundTruth: String(maxOrderValue.toFixed(2)),
type: 'aggregation',
dataset: 'nested',
},
)
// Aggregation: high-value orders (single-condition filter)
for (const threshold of QUESTION_THRESHOLDS.nested.highValueOrders) {
const count = nested.filter(o => o.total > threshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many orders have a total greater than ${threshold}?`,
groundTruth: String(count),
type: 'aggregation',
dataset: 'nested',
})
}
// Filtering: multi-condition queries (status AND value)
const orderStatuses = [...new Set(nested.map(o => o.status))]
for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndValue)) {
const count = nested.filter(o => o.status === status && o.total > QUESTION_THRESHOLDS.nested.statusValueThreshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many orders have status "${status}" and total greater than ${QUESTION_THRESHOLDS.nested.statusValueThreshold}?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'nested',
})
}
// Aggregation: total revenue
const totalRevenue = nested.reduce((sum, o) => sum + o.total, 0)
questions.push({
id: `q${idCounter++}`,
prompt: 'What is the total revenue across all orders?',
groundTruth: String(totalRevenue.toFixed(2)),
type: 'aggregation',
dataset: 'nested',
})
// Filtering: high-value orders (3 questions)
const highValueThresholds = [200, 400, 600]
for (const threshold of highValueThresholds) {
const count = nested.filter(o => o.total > threshold).length
// Filtering: status AND items count (multi-condition)
for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndItems)) {
const count = nested.filter(o => o.status === status && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many orders have a total greater than ${threshold}?`,
prompt: `How many orders have status "${status}" and at least ${QUESTION_THRESHOLDS.nested.itemCountThreshold} items?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'nested',
})
}
// Filtering: total AND items count (multi-condition)
for (const threshold of QUESTION_THRESHOLDS.nested.totalThresholdsForItems) {
const count = nested.filter(o => o.total > threshold && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many orders have a total greater than ${threshold} and at least ${QUESTION_THRESHOLDS.nested.itemCountThreshold} items?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'nested',
@@ -228,18 +365,14 @@ export function generateQuestions(): Question[] {
}
}
// ========================================
// ANALYTICS DATASET QUESTIONS (40 questions)
// ========================================
if (analytics.length > 0) {
// Field retrieval: specific dates (20 questions)
for (let i = 0; i < Math.min(20, analytics.length); i++) {
// Field retrieval: specific dates (expanded with all metrics)
for (let i = 0; i < Math.min(QUESTION_LIMITS.analytics.fieldRetrievalDates, analytics.length); i++) {
const metric = analytics[i * 3] || analytics[i]
if (!metric)
continue
if (i % 2 === 0) {
if (i % 5 === 0) {
questions.push({
id: `q${idCounter++}`,
prompt: `How many views were recorded on ${metric.date}?`,
@@ -248,7 +381,7 @@ export function generateQuestions(): Question[] {
dataset: 'analytics',
})
}
else {
else if (i % 5 === 1) {
questions.push({
id: `q${idCounter++}`,
prompt: `What was the revenue on ${metric.date}?`,
@@ -257,12 +390,42 @@ export function generateQuestions(): Question[] {
dataset: 'analytics',
})
}
else if (i % 5 === 2) {
questions.push({
id: `q${idCounter++}`,
prompt: `What was the conversion count on ${metric.date}?`,
groundTruth: String(metric.conversions),
type: 'field-retrieval',
dataset: 'analytics',
})
}
else if (i % 5 === 3) {
questions.push({
id: `q${idCounter++}`,
prompt: `How many clicks were recorded on ${metric.date}?`,
groundTruth: String(metric.clicks),
type: 'field-retrieval',
dataset: 'analytics',
})
}
else {
questions.push({
id: `q${idCounter++}`,
prompt: `What was the bounce rate on ${metric.date}?`,
groundTruth: String(metric.bounceRate),
type: 'field-retrieval',
dataset: 'analytics',
})
}
}
// Aggregation: totals (4 questions)
// Aggregation: totals and averages
const totalViews = analytics.reduce((sum, m) => sum + m.views, 0)
const totalRevenue = analytics.reduce((sum, m) => sum + m.revenue, 0)
const totalConversions = analytics.reduce((sum, m) => sum + m.conversions, 0)
const avgViews = Math.round(totalViews / analytics.length)
const avgRevenue = totalRevenue / analytics.length
const avgConversions = Math.round(totalConversions / analytics.length)
questions.push(
{
@@ -286,27 +449,97 @@ export function generateQuestions(): Question[] {
type: 'aggregation',
dataset: 'analytics',
},
{
id: `q${idCounter++}`,
prompt: 'What is the average number of views per day?',
groundTruth: String(avgViews),
type: 'aggregation',
dataset: 'analytics',
},
{
id: `q${idCounter++}`,
prompt: 'What is the average revenue per day?',
groundTruth: String(avgRevenue.toFixed(2)),
type: 'aggregation',
dataset: 'analytics',
},
{
id: `q${idCounter++}`,
prompt: 'What is the average number of conversions per day?',
groundTruth: String(avgConversions),
type: 'aggregation',
dataset: 'analytics',
},
{
id: `q${idCounter++}`,
prompt: 'How many days are included in the analytics data?',
groundTruth: String(analytics.length),
type: 'aggregation',
dataset: 'analytics',
},
{
id: `q${idCounter++}`,
prompt: 'What is the highest number of views recorded in a single day?',
groundTruth: String(Math.max(...analytics.map(m => m.views))),
type: 'aggregation',
dataset: 'analytics',
},
)
// Filtering: high-performing days (10 questions)
const viewThresholds = [5000, 6000, 7000]
for (const threshold of viewThresholds) {
// Aggregation: high-performing days (single-condition filters)
for (const threshold of QUESTION_THRESHOLDS.analytics.views) {
const count = analytics.filter(m => m.views > threshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many days had more than ${threshold} views?`,
groundTruth: String(count),
type: 'aggregation',
dataset: 'analytics',
})
}
// Filtering: multi-condition queries (views AND conversions)
for (const viewThreshold of QUESTION_THRESHOLDS.analytics.viewsForFiltering) {
const count = analytics.filter(m => m.views > viewThreshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForFiltering).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many days had more than ${viewThreshold} views and more than ${QUESTION_THRESHOLDS.analytics.conversionsForFiltering} conversions?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'analytics',
})
}
const conversionThresholds = [10, 20, 30]
for (const threshold of conversionThresholds) {
const count = analytics.filter(m => m.conversions > threshold).length
// Filtering: views AND revenue (expanded)
for (const revenueThreshold of QUESTION_THRESHOLDS.analytics.revenueThresholds.slice(0, 5)) {
const count = analytics.filter(m => m.views > QUESTION_THRESHOLDS.analytics.viewsThresholdForRevenue && m.revenue > revenueThreshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many days had more than ${threshold} conversions?`,
prompt: `How many days had more than ${QUESTION_THRESHOLDS.analytics.viewsThresholdForRevenue} views and revenue greater than ${revenueThreshold}?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'analytics',
})
}
// Filtering: clicks AND conversions (multi-condition)
for (const clickThreshold of QUESTION_THRESHOLDS.analytics.clicksForFiltering) {
const count = analytics.filter(m => m.clicks > clickThreshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForClickFiltering).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many days had more than ${clickThreshold} clicks and more than ${QUESTION_THRESHOLDS.analytics.conversionsForClickFiltering} conversions?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'analytics',
})
}
// Filtering: revenue AND bounce rate (multi-condition)
for (const revenueThreshold of QUESTION_THRESHOLDS.analytics.revenueForBounceRate) {
const count = analytics.filter(m => m.revenue > revenueThreshold && m.bounceRate < QUESTION_THRESHOLDS.analytics.bounceRateThreshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many days had revenue greater than ${revenueThreshold} and bounce rate less than ${QUESTION_THRESHOLDS.analytics.bounceRateThreshold}?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'analytics',
@@ -314,79 +547,159 @@ export function generateQuestions(): Question[] {
}
}
// ========================================
// GITHUB DATASET QUESTIONS (40 questions)
// ========================================
if (github.length > 0) {
// Field retrieval: specific repos (20 questions)
for (let i = 0; i < Math.min(20, github.length); i++) {
const repo = github[i * 10] || github[i]
// Helper to extract owner from repo field
const getOwner = (repoFullName: string) => repoFullName.split('/')[0]!
// Field retrieval: specific repos (diverse fields)
for (let i = 0; i < Math.min(QUESTION_LIMITS.github.fieldRetrievalRepos, github.length); i++) {
const repo = github[i * 7]
if (!repo)
continue
if (i % 2 === 0) {
if (i % 5 === 0) {
questions.push({
id: `q${idCounter++}`,
prompt: `How many stars does ${repo.owner}/${repo.name} have?`,
prompt: `How many stars does ${repo.repo} have?`,
groundTruth: String(repo.stars),
type: 'field-retrieval',
dataset: 'github',
})
}
else if (i % 5 === 1) {
questions.push({
id: `q${idCounter++}`,
prompt: `How many forks does ${repo.repo} have?`,
groundTruth: String(repo.forks),
type: 'field-retrieval',
dataset: 'github',
})
}
else if (i % 5 === 2) {
questions.push({
id: `q${idCounter++}`,
prompt: `Who is the owner of ${repo.repo}?`,
groundTruth: getOwner(repo.repo),
type: 'field-retrieval',
dataset: 'github',
})
}
else if (i % 5 === 3) {
questions.push({
id: `q${idCounter++}`,
prompt: `What is the default branch of ${repo.repo}?`,
groundTruth: repo.defaultBranch,
type: 'field-retrieval',
dataset: 'github',
})
}
else {
questions.push({
id: `q${idCounter++}`,
prompt: `How many forks does ${repo.owner}/${repo.name} have?`,
groundTruth: String(repo.forks),
prompt: `How many watchers does ${repo.repo} have?`,
groundTruth: String(repo.watchers),
type: 'field-retrieval',
dataset: 'github',
})
}
}
// Aggregation: count by owner (5 questions)
const owners = [...new Set(github.map(r => r.owner))]
for (const owner of owners.slice(0, 5)) {
const count = github.filter(r => r.owner === owner).length
// Aggregation: popular repositories
const totalStars = github.reduce((sum, r) => sum + r.stars, 0)
const totalRepos = github.length
const avgStars = Math.round(totalStars / totalRepos)
questions.push(
{
id: `q${idCounter++}`,
prompt: 'What is the total number of stars across all repositories?',
groundTruth: String(totalStars),
type: 'aggregation',
dataset: 'github',
},
{
id: `q${idCounter++}`,
prompt: 'How many repositories are in the dataset?',
groundTruth: String(totalRepos),
type: 'aggregation',
dataset: 'github',
},
{
id: `q${idCounter++}`,
prompt: 'What is the average number of stars per repository?',
groundTruth: String(avgStars),
type: 'aggregation',
dataset: 'github',
},
)
// Aggregation: star thresholds (single-condition filters)
for (const threshold of QUESTION_THRESHOLDS.github.stars) {
const count = github.filter(r => r.stars > threshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many repositories does ${owner} have in the dataset?`,
prompt: `How many repositories have more than ${threshold} stars?`,
groundTruth: String(count),
type: 'aggregation',
dataset: 'github',
})
}
// Aggregation: total stars
const totalStars = github.reduce((sum, r) => sum + r.stars, 0)
questions.push({
id: `q${idCounter++}`,
prompt: 'What is the total number of stars across all repositories?',
groundTruth: String(totalStars),
type: 'aggregation',
dataset: 'github',
})
// Filtering: popular repos (8 questions)
const starThresholds = [10000, 50000, 100000]
for (const threshold of starThresholds) {
const count = github.filter(r => r.stars > threshold).length
// Aggregation: fork thresholds (single-condition filters)
for (const threshold of QUESTION_THRESHOLDS.github.forks) {
const count = github.filter(r => r.forks > threshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many repositories have more than ${threshold} stars?`,
prompt: `How many repositories have more than ${threshold} forks?`,
groundTruth: String(count),
type: 'aggregation',
dataset: 'github',
})
}
// Aggregation: watcher thresholds (single-condition filters)
for (const threshold of QUESTION_THRESHOLDS.github.watchers) {
const count = github.filter(r => r.watchers > threshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many repositories have more than ${threshold} watchers?`,
groundTruth: String(count),
type: 'aggregation',
dataset: 'github',
})
}
// Aggregation: default branch counts
const branches = [...new Set(github.map(r => r.defaultBranch))]
for (const branch of branches.slice(0, QUESTION_LIMITS.github.aggregationBranches)) {
const count = github.filter(r => r.defaultBranch === branch).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many repositories use "${branch}" as their default branch?`,
groundTruth: String(count),
type: 'aggregation',
dataset: 'github',
})
}
// Filtering: multi-condition queries (stars AND forks)
for (const combo of QUESTION_THRESHOLDS.github.starForkCombinations.slice(0, QUESTION_LIMITS.github.filteringStarsAndForks)) {
const count = github.filter(r => r.stars > combo.stars && r.forks > combo.forks).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many repositories have more than ${combo.stars} stars and more than ${combo.forks} forks?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'github',
})
}
const forkThresholds = [1000, 5000, 10000]
for (const threshold of forkThresholds) {
const count = github.filter(r => r.forks > threshold).length
// Filtering: stars AND watchers (multi-condition)
for (const combo of QUESTION_THRESHOLDS.github.starWatcherCombinations) {
const count = github.filter(r => r.stars > combo.stars && r.watchers > combo.watchers).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many repositories have more than ${threshold} forks?`,
prompt: `How many repositories have more than ${combo.stars} stars and more than ${combo.watchers} watchers?`,
groundTruth: String(count),
type: 'filtering',
dataset: 'github',
@@ -394,14 +707,5 @@ export function generateQuestions(): Question[] {
}
}
consola.info(`Question breakdown:`)
consola.box(`
Tabular: ${questions.filter(q => q.dataset === 'tabular').length}
Nested: ${questions.filter(q => q.dataset === 'nested').length}
Analytics: ${questions.filter(q => q.dataset === 'analytics').length}
GitHub: ${questions.filter(q => q.dataset === 'github').length}
Total: ${questions.length}
`.trim())
return questions
}
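
For reference, the `Question` shape that the generator above pushes can be inferred from the fields used here; the sketch below is only what the interface in `./types` presumably looks like, not its actual definition:

```ts
// Sketch only: inferred from the objects pushed in generateQuestions().
// The real definition lives in benchmarks/src/types.ts.
interface Question {
  id: string // sequential identifier, e.g. `q42`
  prompt: string // natural-language question sent to the model
  groundTruth: string // expected answer, stringified for comparison
  type: 'field-retrieval' | 'aggregation' | 'filtering'
  dataset: 'tabular' | 'nested' | 'analytics' | 'github'
}
```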

View File

@@ -1,21 +1,9 @@
/**
* Report generation for TOON benchmarks
*
* Handles:
* - Statistical analysis
* - Markdown report generation with visual elements
* - Per-dataset breakdowns
* - Cost analysis
* - Result file saving
*/
import type { EvaluationResult, FormatResult, Question } from './types'
import * as fsp from 'node:fs/promises'
import * as path from 'node:path'
import { BENCHMARKS_DIR } from './constants'
import { datasets } from './datasets'
import { models } from './evaluate'
import { createProgressBar, ensureDir, saveJsonFile, tokenize } from './utils'
import { createProgressBar, ensureDir, tokenize } from './utils'
/**
* Calculate per-format statistics from evaluation results
@@ -63,8 +51,8 @@ export function generateMarkdownReport(
const json = formatResults.find(r => r.format === 'json')
// Build model-by-model breakdown with ASCII bars
const modelCount = Object.keys(models).length
const modelNames = Object.keys(models)
const modelNames = [...new Set(results.map(r => r.model))].reverse()
const modelCount = modelNames.length
const modelBreakdown = modelNames.map((modelName, i) => {
const modelResults = formatResults.map((fr) => {
@@ -136,7 +124,7 @@ export function generateMarkdownReport(
})
const tableRows = datasetResults.slice(0, 6).map(result =>
`| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.tokens.toLocaleString()} | ${result.correctCount}/${result.totalCount} |`,
`| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.tokens.toLocaleString('en-US')} | ${result.correctCount}/${result.totalCount} |`,
).join('\n')
return `
@@ -180,6 +168,27 @@ ${tableRows}
// Calculate total unique questions
const totalQuestions = [...new Set(results.map(r => r.questionId))].length
// Calculate question type distribution
const fieldRetrievalCount = questions.filter(q => q.type === 'field-retrieval').length
const aggregationCount = questions.filter(q => q.type === 'aggregation').length
const filteringCount = questions.filter(q => q.type === 'filtering').length
const fieldRetrievalPercent = ((fieldRetrievalCount / totalQuestions) * 100).toFixed(0)
const aggregationPercent = ((aggregationCount / totalQuestions) * 100).toFixed(0)
const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)
// Calculate dataset sizes
const tabularSize = datasets.find(d => d.name === 'tabular')?.data.employees?.length || 0
const nestedSize = datasets.find(d => d.name === 'nested')?.data.orders?.length || 0
const analyticsSize = datasets.find(d => d.name === 'analytics')?.data.metrics?.length || 0
const githubSize = datasets.find(d => d.name === 'github')?.data.repositories?.length || 0
// Calculate number of formats and models
const formatCount = formatResults.length
const modelsUsed = [...new Set(results.map(r => r.model))]
const modelsListStr = modelsUsed.map(m => `\`${m}\``).join(', ')
const totalEvaluations = totalQuestions * formatCount * modelsUsed.length
return `
### Retrieval Accuracy
@@ -213,39 +222,41 @@ This benchmark tests **LLM comprehension and data retrieval accuracy** across di
Four datasets designed to test different structural patterns:
1. **Tabular** (100 employee records): Uniform objects with identical fields optimal for TOON's tabular format.
2. **Nested** (50 e-commerce orders): Complex structures with nested customer objects and item arrays.
3. **Analytics** (60 days of metrics): Time-series data with dates and numeric values.
4. **GitHub** (100 repositories): Real-world data from top GitHub repos by stars.
1. **Tabular** (${tabularSize} employee records): Uniform objects with identical fields, optimal for TOON's tabular format.
2. **Nested** (${nestedSize} e-commerce orders): Complex structures with nested customer objects and item arrays.
3. **Analytics** (${analyticsSize} days of metrics): Time-series data with dates and numeric values.
4. **GitHub** (${githubSize} repositories): Real-world data from top GitHub repos by stars.
#### Question Types
${totalQuestions} questions are generated dynamically across three categories:
- **Field retrieval (50%)**: Direct value lookups
- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
- Example: "What is Alice's salary?" → \`75000\`
- Example: "How many items are in order ORD-0042?" → \`3\`
- Example: "What is the customer name for order ORD-0042?" → \`John Doe\`
- **Aggregation (25%)**: Counting and summation tasks
- **Aggregation (${aggregationPercent}%)**: Dataset-level totals and averages plus single-condition filters (counts, sums, min/max comparisons)
- Example: "How many employees work in Engineering?" → \`17\`
- Example: "What is the total revenue across all orders?" → \`45123.50\`
- Example: "How many employees have salary > 80000?" → \`23\`
- **Filtering (25%)**: Conditional queries
- **Filtering (${filteringPercent}%)**: Multi-condition queries requiring compound logic (AND constraints across fields)
- Example: "How many employees in Sales have salary > 80000?" → \`5\`
- Example: "How many orders have total > 400?" → \`12\`
- Example: "How many active employees have more than 10 years of experience?" → \`8\`
#### Evaluation Process
1. **Format conversion:** Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML).
1. **Format conversion:** Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => f.format.toUpperCase()).join(', ')}).
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
4. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).
3. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).
#### Models & Configuration
- **Models tested**: \`gpt-5-nano\`, \`claude-haiku-4-5\`, \`gemini-2.5-flash\`
- **Models tested**: ${modelsListStr}
- **Token counting**: Using \`gpt-tokenizer\` with \`o200k_base\` encoding (GPT-5 tokenizer)
- **Temperature**: 0 (for non-reasoning models)
- **Total evaluations**: 159 questions × 5 formats × 3 models = 2,385 LLM calls
- **Total evaluations**: ${totalQuestions} questions × ${formatCount} formats × ${modelsUsed.length} models = ${totalEvaluations.toLocaleString('en-US')} LLM calls
</details>
`.trimStart()
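
The LLM-as-judge step described in the report template above might look roughly like this, assuming the Vercel AI SDK (`ai` + `@ai-sdk/openai`); the helper name `judgeAnswer` and the prompt wording are illustrative, not the benchmark's actual implementation:

```ts
import { openai } from '@ai-sdk/openai'
import { generateText } from 'ai'

// Hypothetical sketch: gpt-5-nano decides whether a model's answer is
// semantically equivalent to the ground truth (e.g. `50000` vs `$50,000`).
async function judgeAnswer(question: string, groundTruth: string, answer: string): Promise<boolean> {
  const { text } = await generateText({
    model: openai('gpt-5-nano'),
    prompt: `Question: ${question}\nExpected: ${groundTruth}\nActual: ${answer}\n`
      + 'Are these semantically equivalent? Reply with only "yes" or "no".',
  })
  return text.trim().toLowerCase().startsWith('yes')
}
```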
@@ -272,6 +283,10 @@ export function calculateTokenCounts(
/**
* Save results to disk
*
* @remarks
* Per-model results are managed separately via storage.ts
* This function only generates the aggregated markdown report
*/
export async function saveResults(
results: EvaluationResult[],
@@ -279,31 +294,12 @@ export async function saveResults(
questions: Question[],
tokenCounts: Record<string, number>,
): Promise<string> {
const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy')
const resultsDir = path.join(BENCHMARKS_DIR, 'results')
await ensureDir(resultsDir)
// Save raw results
await saveJsonFile(path.join(resultsDir, 'raw-results.json'), results)
// Save summary
await saveJsonFile(
path.join(resultsDir, 'summary.json'),
{
formatResults,
questions: questions.length,
models: Object.keys(models),
datasets: datasets.map(d => ({ name: d.name, description: d.description })),
tokenCounts,
timestamp: new Date().toISOString(),
},
)
// Generate markdown report
// Generate markdown report from all available model results
const report = generateMarkdownReport(formatResults, results, questions, tokenCounts)
await fsp.writeFile(
path.join(resultsDir, 'report.md'),
report,
)
await fsp.writeFile(path.join(resultsDir, 'retrieval-accuracy.md'), report)
return resultsDir
}

46
benchmarks/src/storage.ts Normal file
View File

@@ -0,0 +1,46 @@
import type { Storage, StorageValue } from 'unstorage'
import type { EvaluationResult } from './types'
import * as path from 'node:path'
import { createStorage } from 'unstorage'
import fsDriver from 'unstorage/drivers/fs'
import { BENCHMARKS_DIR } from './constants'
/**
* Storage instance for model results
*
* @remarks
* Stores results in: `benchmarks/results/accuracy/models/`
*/
export const resultsStorage: Storage<StorageValue> = createStorage({
driver: fsDriver({
base: path.join(BENCHMARKS_DIR, 'results', 'accuracy', 'models'),
}),
})
export async function loadModelResults(modelId: string): Promise<EvaluationResult[] | undefined> {
const data = await resultsStorage.getItem<EvaluationResult[]>(modelId)
return data ?? undefined
}
export async function saveModelResults(modelId: string, results: EvaluationResult[]): Promise<void> {
await resultsStorage.setItem(modelId, results)
}
export async function getAllModelResults(): Promise<Record<string, EvaluationResult[]>> {
const keys = await resultsStorage.getKeys()
const results: Record<string, EvaluationResult[]> = {}
await Promise.all(
keys.map(async (modelId) => {
const data = await resultsStorage.getItem<EvaluationResult[]>(modelId)
if (data)
results[modelId] = data
}),
)
return results
}
export async function hasModelResults(modelId: string): Promise<boolean> {
return await resultsStorage.hasItem(modelId)
}
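
A hypothetical consumer of these storage helpers, resuming a benchmark run by skipping models that already have cached per-model results (`modelIds` and `evaluateModel` stand in for the actual evaluation module):

```ts
import type { EvaluationResult } from './types'
import { getAllModelResults, hasModelResults, saveModelResults } from './storage'

// Sketch only: wiring the storage helpers above into a resumable run.
async function runAllModels(
  modelIds: string[],
  evaluateModel: (id: string) => Promise<EvaluationResult[]>,
): Promise<Record<string, EvaluationResult[]>> {
  for (const modelId of modelIds) {
    if (await hasModelResults(modelId))
      continue // reuse cached results instead of re-querying the API
    const results = await evaluateModel(modelId)
    await saveModelResults(modelId, results)
  }
  // Aggregate everything now on disk for report generation
  return getAllModelResults()
}
```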

View File

@@ -1,13 +1,3 @@
/**
* Shared utility functions for TOON benchmarks
*
* Provides common functionality used across multiple benchmark scripts:
* - Progress bar visualization
* - Token counting
* - File I/O operations
* - Retry logic for API calls
*/
import * as fsp from 'node:fs/promises'
import { encode } from 'gpt-tokenizer'