chore: more work on benchmarks

author Johann Schopplich
date 2025-11-06 15:51:31 +01:00
parent bc711ccecf
commit a9d52fc69b
15 changed files with 1647 additions and 213 deletions


@@ -4,7 +4,6 @@ import { anthropic } from '@ai-sdk/anthropic'
 import { google } from '@ai-sdk/google'
 import { openai } from '@ai-sdk/openai'
 import { xai } from '@ai-sdk/xai'
-import * as prompts from '@clack/prompts'
 import { generateText } from 'ai'
 /**
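
For context: the `models` array that the validation hunk below searches with `models.find(m => m.modelId === 'gpt-5-nano')` is presumably assembled from the four providers imported above. A minimal sketch, assuming the array construction; every model id except `gpt-5-nano` is a placeholder, not taken from this commit:

```ts
import { anthropic } from '@ai-sdk/anthropic'
import { google } from '@ai-sdk/google'
import { openai } from '@ai-sdk/openai'
import { xai } from '@ai-sdk/xai'

// Hypothetical construction of the benchmark's model list. Each provider
// factory returns a language model object whose `modelId` property is what
// `models.find(m => m.modelId === 'gpt-5-nano')` matches against.
const models = [
  anthropic('claude-sonnet-4-5'), // placeholder id
  google('gemini-2.5-flash'), // placeholder id
  openai('gpt-5-nano'), // referenced in the diff below
  xai('grok-4'), // placeholder id
]
```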
@@ -102,17 +101,10 @@ Is the actual answer correct? Consider:
 Respond with only "YES" or "NO".
 `.trim()
-  try {
-    const { text } = await generateText({
-      model: models.find(m => m.modelId === 'gpt-5-nano')!,
-      prompt,
-    })
+  const { text } = await generateText({
+    model: models.find(m => m.modelId === 'gpt-5-nano')!,
+    prompt,
+  })
-    return text.trim().toUpperCase() === 'YES'
-  }
-  catch (error) {
-    prompts.log.error(`Validation error: ${error}`)
-    // Fallback to simple string comparison
-    return actual.toLowerCase().trim() === expected.toLowerCase().trim()
-  }
+  return text.trim().toUpperCase() === 'YES'
 }
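
The net effect of the second hunk: the judge call is no longer wrapped in try/catch, so a failed `generateText` call now rejects instead of silently falling back to exact string comparison, which is also why the `@clack/prompts` import used for `prompts.log.error` could be dropped. A sketch of the validator as it reads after this commit; the function name, signature, and exact prompt wording are assumptions, while the `gpt-5-nano` judge, the `actual`/`expected` comparison, and the YES/NO check come from the diff:

```ts
import { openai } from '@ai-sdk/openai'
import { generateText } from 'ai'

// Hypothetical reconstruction of the post-commit validator. `validateAnswer`
// and the prompt text are assumed; the diff only shows the call and return.
async function validateAnswer(actual: string, expected: string): Promise<boolean> {
  const prompt = `
Expected answer: ${expected}
Actual answer: ${actual}

Is the actual answer correct?
Respond with only "YES" or "NO".
`.trim()

  // No try/catch anymore: a provider or network error propagates to the
  // caller instead of degrading to a lowercase string comparison.
  const { text } = await generateText({
    model: openai('gpt-5-nano'),
    prompt,
  })
  return text.trim().toUpperCase() === 'YES'
}
```

Dropping the fallback is arguably the safer default for a benchmark: the old `actual.toLowerCase().trim() === expected.toLowerCase().trim()` path could quietly record a judge outage as an ordinary pass or fail, whereas an unhandled rejection surfaces the outage to whoever runs the suite.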