mirror of https://github.com/voson-wang/toon.git
docs: add benchmarks for gemini-2.5-flash
@@ -10,6 +10,7 @@
 import type { LanguageModelV2 } from '@ai-sdk/provider'
 import type { EvaluationResult, Question } from './types'
 import { anthropic } from '@ai-sdk/anthropic'
+import { google } from '@ai-sdk/google'
 import { openai } from '@ai-sdk/openai'
 import { generateText } from 'ai'
 import { consola } from 'consola'
@@ -20,16 +21,18 @@ import { consola } from 'consola'
 export const models: Record<string, LanguageModelV2> = {
   'gpt-5-nano': openai('gpt-5-nano'),
   'claude-haiku-4-5': anthropic('claude-haiku-4-5-20251001'),
+  'gemini-2.5-flash': google('gemini-2.5-flash'),
 }
 
 /**
  * Evaluate a single question with a specific format and model
  */
 export async function evaluateQuestion(
-  { question, formatName, formattedData, model }:
-  { question: Question, formatName: string, formattedData: string, model: LanguageModelV2 },
+  { question, formatName, formattedData, model, modelName }:
+  { question: Question, formatName: string, formattedData: string, model: LanguageModelV2, modelName: string },
 ): Promise<EvaluationResult> {
-  const prompt = `Given the following data in ${formatName} format:
+  const prompt = `
+Given the following data in ${formatName} format:
 
 \`\`\`
 ${formattedData}
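Note on the registry hunk above (editorial sketch, not part of the diff): each key is a short display name mapped to an AI SDK provider instance. The new Google entry picks up `GOOGLE_GENERATIVE_AI_API_KEY` from the environment, just as the existing entries read `OPENAI_API_KEY` and `ANTHROPIC_API_KEY`. A minimal sketch of adding another entry; the model id below is hypothetical:

```ts
import type { LanguageModelV2 } from '@ai-sdk/provider'
import { google } from '@ai-sdk/google'

// Hypothetical extra entry, shown only to illustrate the registry pattern.
// Requires GOOGLE_GENERATIVE_AI_API_KEY to be set when the model is invoked.
export const extraModels: Record<string, LanguageModelV2> = {
  'gemini-2.5-flash-lite': google('gemini-2.5-flash-lite'),
}
```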
@@ -37,13 +40,14 @@ ${formattedData}
 
 Question: ${question.prompt}
 
-Provide only the direct answer, without any additional explanation or formatting.`
+Provide only the direct answer, without any additional explanation or formatting.
+`.trim()
 
   const startTime = performance.now()
   const { text, usage } = await generateText({
     model,
     prompt,
-    temperature: model.modelId.startsWith('gpt-') ? undefined : 0,
+    temperature: !model.modelId.startsWith('gpt-') ? 0 : undefined,
   })
 
   const latencyMs = performance.now() - startTime
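The temperature ternary is a provider quirk, not a style choice: OpenAI's gpt-5-family models accept only their default temperature, so the benchmark leaves it unset for `gpt-*` ids and pins 0 everywhere else for reproducibility. A standalone restatement (the helper name is invented for illustration):

```ts
// Restates the ternary from the hunk above as a pure function.
function samplingTemperature(modelId: string): number | undefined {
  // undefined lets the provider apply its default (required for gpt-5 models)
  return modelId.startsWith('gpt-') ? undefined : 0
}

console.log(samplingTemperature('gpt-5-nano')) // undefined
console.log(samplingTemperature('gemini-2.5-flash')) // 0
```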
@@ -56,7 +60,7 @@ Provide only the direct answer, without any additional explanation or formatting
   return {
     questionId: question.id,
     format: formatName,
-    model: model.modelId,
+    model: modelName,
     expected: question.groundTruth,
     actual: text.trim(),
     isCorrect,
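Switching `model: model.modelId` to `model: modelName` makes results report the registry key (e.g. `claude-haiku-4-5`) rather than the provider's dated id (`claude-haiku-4-5-20251001`). A hypothetical caller sketch showing where both values come from:

```ts
import type { LanguageModelV2 } from '@ai-sdk/provider'

declare const models: Record<string, LanguageModelV2> // the registry from this file

for (const [modelName, model] of Object.entries(models)) {
  // modelName is the short registry key; model.modelId is the provider id.
  console.log(modelName, model.modelId)
}
```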
@@ -93,9 +97,8 @@ Respond with only "YES" or "NO".`
 
   try {
     const { text } = await generateText({
-      model: models['claude-haiku-4-5']!,
+      model: models['gpt-5-nano']!,
       prompt,
-      temperature: 0,
     })
 
     return text.trim().toUpperCase() === 'YES'
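This hunk swaps the LLM-as-judge from Claude Haiku to gpt-5-nano and, consistent with the temperature rule above, drops the pinned `temperature: 0`. A minimal sketch of the judge pattern, assuming the AI SDK's `generateText`; the prompt wording is illustrative, only the YES/NO parsing mirrors the diff:

```ts
import type { LanguageModelV2 } from '@ai-sdk/provider'
import { generateText } from 'ai'

declare const models: Record<string, LanguageModelV2> // the registry from this file

async function judgeAnswer(expected: string, actual: string): Promise<boolean> {
  // Illustrative prompt; the repo's exact wording is not shown in this hunk.
  const prompt = `Expected: ${expected}\nActual: ${actual}\n\nDo these mean the same thing? Respond with only "YES" or "NO".`
  const { text } = await generateText({ model: models['gpt-5-nano']!, prompt })
  return text.trim().toUpperCase() === 'YES'
}
```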
@@ -201,8 +201,8 @@ ${modelPerformance}
 
 - **Semantic validation**: LLM-as-judge validates responses semantically (not exact string matching).
 - **Token counting**: Using \`gpt-tokenizer\` with \`o200k_base\` encoding.
-- **Question types**: Field retrieval, aggregation, and filtering tasks.
-- **Real data**: Faker.js-generated datasets + GitHub repositories.
+- **Question types**: ~160 questions across field retrieval, aggregation, and filtering tasks.
+- **Datasets**: Faker.js-generated datasets (seeded) + GitHub repositories.
 
 </details>
 `.trimStart()
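The token-counting bullet corresponds to a one-liner. A sketch assuming `gpt-tokenizer`'s per-encoding subpath export; `o200k_base` is the encoding used by OpenAI's recent models:

```ts
// Count how many o200k_base tokens a formatted payload costs.
import { encode } from 'gpt-tokenizer/encoding/o200k_base'

export function countTokens(formattedData: string): number {
  return encode(formattedData).length
}

console.log(countTokens('id,name\n1,Ada\n2,Grace'))
```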
@@ -1,7 +1,7 @@
 export interface Dataset {
   name: string
   description: string
-  data: any
+  data: Record<string, any>
 }
 
 export interface Question {
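Narrowing `data` from `any` to `Record<string, any>` keeps any object-shaped payload compiling while rejecting bare primitives at the top level. An illustrative value (field contents invented):

```ts
interface Dataset {
  name: string
  description: string
  data: Record<string, any>
}

// Invented sample showing the object-shaped payload the narrowed type expects.
const sample: Dataset = {
  name: 'employees',
  description: 'Seeded Faker.js employee records',
  data: { employees: [{ id: 1, name: 'Ada' }, { id: 2, name: 'Grace' }] },
}

// const bad: Dataset = { name: 'x', description: 'y', data: 42 } // no longer compiles
```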