mirror of https://github.com/voson-wang/toon.git
docs: add benchmarks for gemini-2.5-flash
@@ -10,6 +10,7 @@
 import type { LanguageModelV2 } from '@ai-sdk/provider'
 import type { EvaluationResult, Question } from './types'
 import { anthropic } from '@ai-sdk/anthropic'
+import { google } from '@ai-sdk/google'
 import { openai } from '@ai-sdk/openai'
 import { generateText } from 'ai'
 import { consola } from 'consola'
@@ -20,16 +21,18 @@ import { consola } from 'consola'
 export const models: Record<string, LanguageModelV2> = {
   'gpt-5-nano': openai('gpt-5-nano'),
   'claude-haiku-4-5': anthropic('claude-haiku-4-5-20251001'),
+  'gemini-2.5-flash': google('gemini-2.5-flash'),
 }
 
 /**
  * Evaluate a single question with a specific format and model
  */
 export async function evaluateQuestion(
-  { question, formatName, formattedData, model }:
-  { question: Question, formatName: string, formattedData: string, model: LanguageModelV2 },
+  { question, formatName, formattedData, model, modelName }:
+  { question: Question, formatName: string, formattedData: string, model: LanguageModelV2, modelName: string },
 ): Promise<EvaluationResult> {
-  const prompt = `Given the following data in ${formatName} format:
+  const prompt = `
+Given the following data in ${formatName} format:
 
 \`\`\`
 ${formattedData}
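Note on the registry hunk above (editorial sketch, not part of the diff): each key is a short display name mapped to an AI SDK provider instance. The new Google entry picks up `GOOGLE_GENERATIVE_AI_API_KEY` from the environment, just as the existing entries read `OPENAI_API_KEY` and `ANTHROPIC_API_KEY`. A minimal sketch of adding another entry; the model id below is hypothetical:

```ts
import type { LanguageModelV2 } from '@ai-sdk/provider'
import { google } from '@ai-sdk/google'

// Hypothetical extra entry, shown only to illustrate the registry pattern.
// Requires GOOGLE_GENERATIVE_AI_API_KEY to be set when the model is invoked.
export const extraModels: Record<string, LanguageModelV2> = {
  'gemini-2.5-flash-lite': google('gemini-2.5-flash-lite'),
}
```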
@@ -37,13 +40,14 @@ ${formattedData}
 
 Question: ${question.prompt}
 
-Provide only the direct answer, without any additional explanation or formatting.`
+Provide only the direct answer, without any additional explanation or formatting.
+`.trim()
 
   const startTime = performance.now()
   const { text, usage } = await generateText({
     model,
     prompt,
-    temperature: model.modelId.startsWith('gpt-') ? undefined : 0,
+    temperature: !model.modelId.startsWith('gpt-') ? 0 : undefined,
   })
 
   const latencyMs = performance.now() - startTime
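The temperature ternary is a provider quirk, not a style choice: OpenAI's gpt-5-family models accept only their default temperature, so the benchmark leaves it unset for `gpt-*` ids and pins 0 everywhere else for reproducibility. A standalone restatement (the helper name is invented for illustration):

```ts
// Restates the ternary from the hunk above as a pure function.
function samplingTemperature(modelId: string): number | undefined {
  // undefined lets the provider apply its default (required for gpt-5 models)
  return modelId.startsWith('gpt-') ? undefined : 0
}

console.log(samplingTemperature('gpt-5-nano')) // undefined
console.log(samplingTemperature('gemini-2.5-flash')) // 0
```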
@@ -56,7 +60,7 @@ Provide only the direct answer, without any additional explanation or formatting
   return {
     questionId: question.id,
     format: formatName,
-    model: model.modelId,
+    model: modelName,
     expected: question.groundTruth,
     actual: text.trim(),
     isCorrect,
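Switching `model: model.modelId` to `model: modelName` makes results report the registry key (e.g. `claude-haiku-4-5`) rather than the provider's dated id (`claude-haiku-4-5-20251001`). A hypothetical caller sketch showing where both values come from:

```ts
import type { LanguageModelV2 } from '@ai-sdk/provider'

declare const models: Record<string, LanguageModelV2> // the registry from this file

for (const [modelName, model] of Object.entries(models)) {
  // modelName is the short registry key; model.modelId is the provider id.
  console.log(modelName, model.modelId)
}
```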
@@ -93,9 +97,8 @@ Respond with only "YES" or "NO".`
 
   try {
     const { text } = await generateText({
-      model: models['claude-haiku-4-5']!,
+      model: models['gpt-5-nano']!,
       prompt,
-      temperature: 0,
     })
 
     return text.trim().toUpperCase() === 'YES'
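This hunk swaps the LLM-as-judge from Claude Haiku to gpt-5-nano and, consistent with the temperature rule above, drops the pinned `temperature: 0`. A minimal sketch of the judge pattern, assuming the AI SDK's `generateText`; the prompt wording is illustrative, only the YES/NO parsing mirrors the diff:

```ts
import type { LanguageModelV2 } from '@ai-sdk/provider'
import { generateText } from 'ai'

declare const models: Record<string, LanguageModelV2> // the registry from this file

async function judgeAnswer(expected: string, actual: string): Promise<boolean> {
  // Illustrative prompt; the repo's exact wording is not shown in this hunk.
  const prompt = `Expected: ${expected}\nActual: ${actual}\n\nDo these mean the same thing? Respond with only "YES" or "NO".`
  const { text } = await generateText({ model: models['gpt-5-nano']!, prompt })
  return text.trim().toUpperCase() === 'YES'
}
```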
@@ -201,8 +201,8 @@ ${modelPerformance}
 
 - **Semantic validation**: LLM-as-judge validates responses semantically (not exact string matching).
 - **Token counting**: Using \`gpt-tokenizer\` with \`o200k_base\` encoding.
-- **Question types**: Field retrieval, aggregation, and filtering tasks.
-- **Real data**: Faker.js-generated datasets + GitHub repositories.
+- **Question types**: ~160 questions across field retrieval, aggregation, and filtering tasks.
+- **Datasets**: Faker.js-generated datasets (seeded) + GitHub repositories.
 
 </details>
 `.trimStart()
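The token-counting bullet corresponds to a one-liner. A sketch assuming `gpt-tokenizer`'s per-encoding subpath export; `o200k_base` is the encoding used by OpenAI's recent models:

```ts
// Count how many o200k_base tokens a formatted payload costs.
import { encode } from 'gpt-tokenizer/encoding/o200k_base'

export function countTokens(formattedData: string): number {
  return encode(formattedData).length
}

console.log(countTokens('id,name\n1,Ada\n2,Grace'))
```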
@@ -1,7 +1,7 @@
 export interface Dataset {
   name: string
   description: string
-  data: any
+  data: Record<string, any>
 }
 
 export interface Question {
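Narrowing `data` from `any` to `Record<string, any>` keeps any object-shaped payload compiling while rejecting bare primitives at the top level. An illustrative value (field contents invented):

```ts
interface Dataset {
  name: string
  description: string
  data: Record<string, any>
}

// Invented sample showing the object-shaped payload the narrowed type expects.
const sample: Dataset = {
  name: 'employees',
  description: 'Seeded Faker.js employee records',
  data: { employees: [{ id: 1, name: 'Ada' }, { id: 2, name: 'Grace' }] },
}

// const bad: Dataset = { name: 'x', description: 'y', data: 42 } // no longer compiles
```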