docs: overhaul retrieval accuracy benchmark

2026-01-29 23:34:10 +08:00 · 2025-10-28 20:22:43 +01:00
parent efbe4ded88
commit 67c0df8cb0
22 changed files with 1553 additions and 27288 deletions
--- a/benchmarks/scripts/accuracy-benchmark.ts
+++ b/benchmarks/scripts/accuracy-benchmark.ts
@@ -1,51 +1,53 @@
-/**
- * LLM Retrieval Accuracy Benchmark
- *
- * Main entry point that orchestrates the full benchmark:
- * 1. Generate questions from datasets
- * 2. Format data in all formats (JSON, TOON, YAML, Markdown-kv)
- * 3. Evaluate each question with each format using LLMs
- * 4. Generate reports
- */
-
-import type { EvaluationResult, Question } from '../src/types'
-import * as fsp from 'node:fs/promises'
+import type { Question } from '../src/types'
 import * as path from 'node:path'
-import { consola } from 'consola'
-import pMap from 'p-map'
-import { BENCHMARKS_DIR, DEFAULT_CONCURRENCY, DRY_RUN, DRY_RUN_LIMITS, ROOT_DIR } from '../src/constants'
+import process from 'node:process'
+import * as prompts from '@clack/prompts'
+import PQueue from 'p-queue'
+import { DEFAULT_CONCURRENCY, DRY_RUN, DRY_RUN_LIMITS, MODEL_RPM_LIMITS, ROOT_DIR } from '../src/constants'
 import { datasets } from '../src/datasets'
 import { evaluateQuestion, models } from '../src/evaluate'
 import { formatters } from '../src/formatters'
 import { generateQuestions } from '../src/questions'
 import { calculateFormatResults, calculateTokenCounts, saveResults } from '../src/report'
+import { getAllModelResults, hasModelResults, saveModelResults } from '../src/storage'

-consola.start('Retrieval Accuracy Benchmark for TOON')
+prompts.intro('Retrieval Accuracy Benchmark')

-// Check if results already exist
-const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy')
-const rawResultsPath = path.join(resultsDir, 'raw-results.json')
-const summaryPath = path.join(resultsDir, 'summary.json')
+// Prompt user to select which models to benchmark
+const modelChoices = models.map(({ modelId }) => ({
+  value: modelId,
+  label: modelId,
+}))

-let existingResults: EvaluationResult[] | undefined
-let existingTokenCounts: Record<string, number> | undefined
+const selectedModels = await prompts.multiselect({
+  message: 'Select models to benchmark (Space to select, Enter to confirm)',
+  options: modelChoices,
+  required: true,
+})

-try {
-  const [rawData, summaryData] = await Promise.all([
-    fsp.readFile(rawResultsPath, 'utf-8'),
-    fsp.readFile(summaryPath, 'utf-8'),
-  ])
-  existingResults = JSON.parse(rawData)
-  const summary = JSON.parse(summaryData)
-  existingTokenCounts = summary.tokenCounts
-  consola.info('Found existing results – regenerating report only')
+if (prompts.isCancel(selectedModels)) {
+  prompts.cancel('Benchmark cancelled')
+  process.exit(0)
 }
-catch {
-  // Results don't exist, will run full evaluation
+
+const activeModels = models.filter(m => selectedModels.includes(m.modelId))
+
+prompts.log.info(`Selected ${activeModels.length} model(s): ${activeModels.map(m => m.modelId).join(', ')}`)
+
+// Check which models already have results
+const existingModelResults: Record<string, boolean> = {}
+for (const model of activeModels) {
+  const existingResult = await hasModelResults(model.modelId)
+  if (existingResult)
+    existingModelResults[model.modelId] = existingResult
+}
+
+if (Object.keys(existingModelResults).length > 0) {
+  prompts.log.info(`Found existing results for ${Object.values(existingModelResults).length} model(s)`)
 }

 if (DRY_RUN) {
-  consola.info('Limiting questions and models for dry run')
+  prompts.log.info('Limiting questions and models for dry run')
 }

 let questions = generateQuestions()
@@ -55,79 +57,98 @@ if (DRY_RUN && DRY_RUN_LIMITS.maxQuestions) {
  questions = questions.slice(0, DRY_RUN_LIMITS.maxQuestions)
 }

-// Filter models for dry run
-const activeModels = DRY_RUN && DRY_RUN_LIMITS.allowedModels.length > 0
-  ? Object.fromEntries(
-      Object.entries(models).filter(([name]) => DRY_RUN_LIMITS.allowedModels.includes(name)),
-    )
-  : models
+prompts.log.info(`Evaluating ${questions.length} questions`)
+prompts.log.info(`Testing ${Object.keys(formatters).length} formats`)

-let results: EvaluationResult[]
-let tokenCounts: Record<string, number>
+// Evaluate each model separately and save results incrementally
+for (const model of activeModels) {
+  const modelId = model.modelId

-if (existingResults && existingTokenCounts) {
-  // Reuse existing results
-  results = existingResults
-  tokenCounts = existingTokenCounts
-}
-else {
-  // Run full evaluation
-  consola.info(`Evaluating ${questions.length} questions`)
-  consola.info(`Testing ${Object.keys(formatters).length} formats`)
-  consola.info(`Using ${Object.keys(activeModels).length} models: ${Object.keys(activeModels).join(', ')}`)
+  // Skip if results already exist
+  if (existingModelResults[modelId]) {
+    prompts.log.info(`Skipping ${modelId} (results already exist)`)
+    continue
+  }

-  // Calculate token counts for all format+dataset combinations
-  tokenCounts = calculateTokenCounts(formatters)
-
-  // Generate evaluation tasks
-  const tasks: { question: Question, formatName: string, modelName: string }[] = []
+  prompts.log.step(`Running benchmark for ${modelId}`)

+  // Generate evaluation tasks for this model
+  const tasks: { question: Question, formatName: string }[] = []
  for (const question of questions) {
    for (const [formatName] of Object.entries(formatters)) {
-      for (const [modelName] of Object.entries(activeModels)) {
-        tasks.push({ question, formatName, modelName })
-      }
+      tasks.push({ question, formatName })
    }
  }

  const total = tasks.length
-  consola.start(`Running ${total} evaluations with concurrency: ${DEFAULT_CONCURRENCY}`)
+  const rpmLimit = MODEL_RPM_LIMITS[modelId]
+  const queue = new PQueue({
+    concurrency: DEFAULT_CONCURRENCY,
+    // Only rate-limit when an RPM cap is configured: p-queue rejects an explicit `undefined` intervalCap/interval
+    ...(rpmLimit ? { intervalCap: rpmLimit, interval: 60_000 } : {}),
+  })

-  results = await pMap(
-    tasks,
-    async (task, index) => {
+  const evalSpinner = prompts.spinner()
+  evalSpinner.start(`Running ${total} evaluations (concurrency: ${DEFAULT_CONCURRENCY}, RPM limit: ${rpmLimit ?? 'unlimited'})`)
+
+  let completed = 0
+
+  // Queue all tasks
+  const modelResultPromises = tasks.map(task =>
+    queue.add(async () => {
      // Format data on-demand
      const dataset = datasets.find(d => d.name === task.question.dataset)!
      const formatter = formatters[task.formatName]!
      const formattedData = formatter(dataset.data)
-      const model = activeModels[task.modelName as keyof typeof activeModels]!

      const result = await evaluateQuestion({
        question: task.question,
        formatName: task.formatName,
        formattedData,
        model,
-        modelName: task.modelName,
      })

      // Progress update after task completes
-      if ((index + 1) % 10 === 0 || (index + 1) === total) {
-        const percent = (((index + 1) / total) * 100).toFixed(1)
-        consola.start(`Progress: ${index + 1}/${total} (${percent}%)`)
+      completed++
+      if (completed % 10 === 0 || completed === total) {
+        const percent = ((completed / total) * 100).toFixed(1)
+        evalSpinner.message(`Progress: ${completed}/${total} (${percent}%)`)
      }

      return result
-    },
-    { concurrency: DEFAULT_CONCURRENCY },
+    }),
  )

-  consola.success('Evaluation complete!')
+  // Wait for all tasks to complete
+  const modelResults = await Promise.all(modelResultPromises)
+
+  evalSpinner.stop(`Evaluation complete for ${modelId}`)
+
+  // Save results immediately for this model
+  await saveModelResults(modelId, modelResults)
+  prompts.log.success(`Saved results for ${modelId}`)
 }

-// Generate/regenerate markdown report
-consola.start('Generating report and saving results…')
-const formatResults = calculateFormatResults(results, tokenCounts)
-await saveResults(results, formatResults, questions, tokenCounts)
+// Generate/regenerate markdown report from all available model results
+const reportSpinner = prompts.spinner()
+reportSpinner.start('Generating report from all model results')

-consola.info(`Results saved to: \`${path.relative(ROOT_DIR, resultsDir)}\``)
-consola.success(existingResults ? 'Markdown report regenerated!' : 'Evaluation complete!')
+// Load all available model results (including any that were skipped)
+const allModelResults = await getAllModelResults()
+const allResults = Object.values(allModelResults).flat()
+
+if (allResults.length === 0) {
+  prompts.log.warn('No results available to generate report')
+  process.exit(0)
+}
+
+// Calculate token counts freshly (deterministic, no need to persist)
+const tokenCounts = calculateTokenCounts(formatters)
+
+// Calculate format statistics and save report
+const formatResults = calculateFormatResults(allResults, tokenCounts)
+const resultsDir = await saveResults(allResults, formatResults, questions, tokenCounts)
+
+const reportPath = path.join(resultsDir, 'retrieval-accuracy.md')
+reportSpinner.stop('Report generation complete!') // stop first: logging while the spinner is active garbles both lines
+prompts.log.info(`Report saved to: \`${path.relative(ROOT_DIR, reportPath)}\``)