test: refactor accuracy benchmark generation

2026-01-29 23:34:10 +08:00 · 2025-10-27 14:07:20 +01:00
parent 1a5e6199ac
commit 05b3d43023
11 changed files with 1708 additions and 1721 deletions
--- a/benchmarks/src/evaluate.ts
+++ b/benchmarks/src/evaluate.ts
@@ -9,12 +9,10 @@

 import type { LanguageModelV2 } from '@ai-sdk/provider'
 import type { EvaluationResult, Question } from './types'
-import { setTimeout } from 'node:timers/promises'
 import { anthropic } from '@ai-sdk/anthropic'
 import { openai } from '@ai-sdk/openai'
 import { generateText } from 'ai'
 import { consola } from 'consola'
-import { RATE_LIMIT_DELAY_MS } from './constants'

 /**
 * Models used for evaluation
@@ -28,11 +26,8 @@ export const models: Record<string, LanguageModelV2> = {
 * Evaluate a single question with a specific format and model
 */
 export async function evaluateQuestion(
-  question: Question,
-  formatName: string,
-  formattedData: string,
-  model: LanguageModelV2,
-  modelName: string,
+  { question, formatName, formattedData, model}:
+  { question: Question, formatName: string, formattedData: string, model: LanguageModelV2 },
 ): Promise<EvaluationResult> {
  const prompt = `Given the following data in ${formatName} format:

@@ -51,10 +46,8 @@ Provide only the direct answer, without any additional explanation or formatting
    temperature: model.modelId.startsWith('gpt-') ? undefined : 0,
  })

-  await setTimeout(RATE_LIMIT_DELAY_MS)
-
  const latencyMs = performance.now() - startTime
-  const correct = await validateAnswer({
+  const isCorrect = await validateAnswer({
    actual: text.trim(),
    expected: question.groundTruth,
    question: question.prompt,
@@ -63,10 +56,10 @@ Provide only the direct answer, without any additional explanation or formatting
  return {
    questionId: question.id,
    format: formatName,
-    model: modelName,
+    model: model.modelId,
    expected: question.groundTruth,
    actual: text.trim(),
-    correct,
+    isCorrect,
    inputTokens: usage.inputTokens,
    outputTokens: usage.outputTokens,
    latencyMs,
@@ -105,8 +98,6 @@ Respond with only "YES" or "NO".`
      temperature: 0,
    })

-    await setTimeout(RATE_LIMIT_DELAY_MS)
-
    return text.trim().toUpperCase() === 'YES'
  }
  catch (error) {