chore(benchmarks): replace LLM-as-judge, add structural validation

2026-01-29 23:34:10 +08:00 · 2025-11-07 21:28:21 +01:00
parent 9a519dd114
commit acca69c64a
25 changed files with 1311 additions and 396 deletions
--- a/benchmarks/src/evaluate.ts
+++ b/benchmarks/src/evaluate.ts
@@ -5,6 +5,7 @@ import { google } from '@ai-sdk/google'
 import { openai } from '@ai-sdk/openai'
 import { xai } from '@ai-sdk/xai'
 import { generateText } from 'ai'
+import { compareAnswers } from './normalize'

 /**
 * Models used for evaluation
@@ -74,7 +75,13 @@ ${formattedData}

 Question: ${question.prompt}

-Provide only the direct answer, without any additional explanation or formatting.
+Answer format requirements:
+- Provide only the value itself, no explanation
+- For numbers: output digits only (no commas, currency symbols, or units)
+- For dates/field names: use the exact string from the data
+- For lists: output comma-separated values with no spaces
+
+Answer:
 `.trim()

  const startTime = performance.now()
@@ -83,11 +90,13 @@ Provide only the direct answer, without any additional explanation or formatting
  const actual = text.trim()
  const latencyMs = performance.now() - startTime

-  const isCorrect = await validateAnswer({
+  const comparisonResult = compareAnswers(
    actual,
-    expected: question.groundTruth,
-    question: question.prompt,
-  })
+    question.groundTruth,
+    question.answerType ?? 'string',
+    question.normalizationOptions,
+  )
+  const isCorrect = comparisonResult.match

  return {
    questionId: question.id,
@@ -101,42 +110,3 @@ Provide only the direct answer, without any additional explanation or formatting
    latencyMs,
  }
 }
-
-/**
- * Validate an answer using LLM-as-judge approach
- */
-async function validateAnswer(
-  {
-    actual,
-    expected,
-    question,
-  }:
-  {
-    actual: string
-    expected: string
-    question: string
-  },
-): Promise<boolean> {
-  const prompt = `
-You are validating answers to questions about structured data.
-
-Question: ${question}
-Expected answer: ${expected}
-Actual answer: ${actual}
-
-Is the actual answer correct? Consider:
- Exact matches are correct
- Semantically equivalent answers are correct (e.g., "50000" vs "$50,000" vs "50000 dollars")
- Minor formatting differences are acceptable
- Case-insensitive comparison for text
-
-Respond with only "YES" or "NO".
-`.trim()
-
-  const { text } = await generateText({
-    model: models.find(m => m.modelId === 'gpt-5-nano')!,
-    prompt,
-  })
-
-  return text.trim().toUpperCase() === 'YES'
-}