/*
 * File: toon/benchmarks/src/evaluate.ts
 * Last modified: 2025-10-28 22:54:00 +01:00
 * 119 lines · 2.7 KiB · TypeScript
 */
import type { LanguageModelV2 } from '@ai-sdk/provider'
import type { EvaluationResult, Question } from './types'
import { anthropic } from '@ai-sdk/anthropic'
import { google } from '@ai-sdk/google'
import { openai } from '@ai-sdk/openai'
import { xai } from '@ai-sdk/xai'
import * as prompts from '@clack/prompts'
import { generateText } from 'ai'
/**
 * Models used for evaluation.
 *
 * One chat model per provider (OpenAI, Anthropic, Google, xAI); each
 * question/format pair is evaluated against every entry in this list.
 */
export const models: LanguageModelV2[] = [
openai('gpt-5-nano'),
anthropic('claude-haiku-4-5-20251001'),
google('gemini-2.5-flash'),
xai('grok-4-fast-non-reasoning'),
]
/**
 * Evaluate a single question with a specific format and model.
 *
 * Sends the formatted data plus the question prompt to the given model,
 * times the generation call, then scores the model's reply against the
 * ground truth via the LLM-as-judge in `validateAnswer`.
 *
 * @param question - Question with prompt, id, and ground-truth answer
 * @param formatName - Human-readable name of the serialization format
 * @param formattedData - The dataset serialized in that format
 * @param model - Model under evaluation
 * @returns Per-question result including correctness, token usage, and latency
 */
export async function evaluateQuestion(
  {
    question,
    formatName,
    formattedData,
    model,
  }:
  {
    question: Question
    formatName: string
    formattedData: string
    model: LanguageModelV2
  },
): Promise<EvaluationResult> {
  // Runtime prompt text — intentionally unindented inside the template literal.
  const prompt = `
Given the following data in ${formatName} format:
\`\`\`
${formattedData}
\`\`\`
Question: ${question.prompt}
Provide only the direct answer, without any additional explanation or formatting.
`.trim()

  // Latency window brackets only the generation call, not validation.
  const t0 = performance.now()
  const { text, usage } = await generateText({ model, prompt })
  const answer = text.trim()
  const elapsed = performance.now() - t0

  const correct = await validateAnswer({
    actual: answer,
    expected: question.groundTruth,
    question: question.prompt,
  })

  return {
    questionId: question.id,
    format: formatName,
    model: model.modelId,
    expected: question.groundTruth,
    actual: answer,
    isCorrect: correct,
    inputTokens: usage.inputTokens,
    outputTokens: usage.outputTokens,
    latencyMs: elapsed,
  }
}
/**
* Validate an answer using LLM-as-judge approach
*/
async function validateAnswer(
{
actual,
expected,
question,
}:
{
actual: string
expected: string
question: string
},
): Promise<boolean> {
const prompt = `
You are validating answers to questions about structured data.
Question: ${question}
Expected answer: ${expected}
Actual answer: ${actual}
Is the actual answer correct? Consider:
- Exact matches are correct
- Semantically equivalent answers are correct (e.g., "50000" vs "$50,000" vs "50000 dollars")
- Minor formatting differences are acceptable
- Case-insensitive comparison for text
Respond with only "YES" or "NO".
`.trim()
try {
const { text } = await generateText({
model: models.find(m => m.modelId === 'gpt-5-nano')!,
prompt,
})
return text.trim().toUpperCase() === 'YES'
}
catch (error) {
prompts.log.error(`Validation error: ${error}`)
// Fallback to simple string comparison
return actual.toLowerCase().trim() === expected.toLowerCase().trim()
}
}