import type { LanguageModelV2 } from '@ai-sdk/provider'
import type { EvaluationResult, Question } from './types'
import { anthropic } from '@ai-sdk/anthropic'
import { google } from '@ai-sdk/google'
import { openai } from '@ai-sdk/openai'
import { xai } from '@ai-sdk/xai'
import * as prompts from '@clack/prompts'
import { generateText } from 'ai'

/**
 * Models used for evaluation
 */
export const models: LanguageModelV2[] = [
  openai('gpt-5-nano'),
  anthropic('claude-haiku-4-5-20251001'),
  google('gemini-2.5-flash'),
  xai('grok-4-fast-non-reasoning'),
]
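// Note: validateAnswer() below looks up 'gpt-5-nano' from this list as the
// judge model (via a non-null assertion), so keep that entry in place.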

/**
 * Evaluate a single question with a specific format and model
 */
export async function evaluateQuestion(
  {
    question,
    formatName,
    formattedData,
    model,
  }:
  {
    question: Question
    formatName: string
    formattedData: string
    model: LanguageModelV2
  },
): Promise<EvaluationResult> {
  const prompt = `
Given the following data in ${formatName} format:

\`\`\`
${formattedData}
\`\`\`

Question: ${question.prompt}

Provide only the direct answer, without any additional explanation or formatting.
`.trim()

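  // Latency covers the full generateText() round trip (network + generation).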
  const startTime = performance.now()
  const { text, usage } = await generateText({ model, prompt })

  const actual = text.trim()
  const latencyMs = performance.now() - startTime

  const isCorrect = await validateAnswer({
    actual,
    expected: question.groundTruth,
    question: question.prompt,
  })

  return {
    questionId: question.id,
    format: formatName,
    model: model.modelId,
    expected: question.groundTruth,
    actual,
    isCorrect,
    inputTokens: usage.inputTokens,
    outputTokens: usage.outputTokens,
    latencyMs,
  }
}
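
// Example usage (an illustrative sketch; the question values below are
// hypothetical, not fixtures from this repo):
//
//   const result = await evaluateQuestion({
//     question: { id: 'q1', prompt: 'What is the highest salary?', groundTruth: '50000' },
//     formatName: 'TOON',
//     formattedData: encodedData, // data pre-encoded in the format under test
//     model: models[0],
//   })
//   console.log(result.isCorrect, `${result.latencyMs.toFixed(0)}ms`)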

/**
 * Validate an answer using an LLM-as-judge approach
 */
async function validateAnswer(
  {
    actual,
    expected,
    question,
  }:
  {
    actual: string
    expected: string
    question: string
  },
): Promise<boolean> {
  const prompt = `
You are validating answers to questions about structured data.

Question: ${question}
Expected answer: ${expected}
Actual answer: ${actual}

Is the actual answer correct? Consider:
- Exact matches are correct
- Semantically equivalent answers are correct (e.g., "50000" vs "$50,000" vs "50000 dollars")
- Minor formatting differences are acceptable
- Case-insensitive comparison for text

Respond with only "YES" or "NO".
`.trim()

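  // Ask the judge model for a verdict; any error falls through to the strict
  // string comparison in the catch block.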
  try {
    const { text } = await generateText({
      model: models.find(m => m.modelId === 'gpt-5-nano')!,
      prompt,
    })

    return text.trim().toUpperCase() === 'YES'
  }
  catch (error) {
    prompts.log.error(`Validation error: ${error}`)
    // Fallback to simple string comparison
    return actual.toLowerCase().trim() === expected.toLowerCase().trim()
  }
}