text(accuracy): add Grok-4-fast, remove default temperature

Johann Schopplich
2025-10-28 22:54:00 +01:00
parent e400e68ad6
commit ecf578a7dc
13 changed files with 301 additions and 117 deletions

View File

@@ -15,6 +15,7 @@ export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
'claude-haiku-4-5-20251001': 50,
'gemini-2.5-flash': 25,
'gpt-5-nano': undefined,
+'grok-4-fast-non-reasoning': 50,
}
/**
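
For orientation, this rate-limit map is the kind of structure a benchmark runner consumes to space out requests per model. A minimal sketch, assuming an illustrative helper name and import path that are not part of this commit:

```ts
import { MODEL_RPM_LIMITS } from './constants' // path is illustrative

// Hypothetical helper: minimum milliseconds to wait between calls to a model.
function delayForModel(modelId: string): number {
  const rpm = MODEL_RPM_LIMITS[modelId]
  // `undefined` (e.g. 'gpt-5-nano') means no explicit limit, so no artificial delay.
  if (rpm === undefined)
    return 0
  return Math.ceil(60_000 / rpm)
}

// 'grok-4-fast-non-reasoning' at 50 RPM → roughly 1200 ms between requests.
```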

View File

@@ -3,6 +3,7 @@ import type { EvaluationResult, Question } from './types'
import { anthropic } from '@ai-sdk/anthropic'
import { google } from '@ai-sdk/google'
import { openai } from '@ai-sdk/openai'
+import { xai } from '@ai-sdk/xai'
import * as prompts from '@clack/prompts'
import { generateText } from 'ai'
@@ -11,8 +12,9 @@ import { generateText } from 'ai'
*/
export const models: LanguageModelV2[] = [
openai('gpt-5-nano'),
-google('gemini-2.5-flash'),
anthropic('claude-haiku-4-5-20251001'),
+google('gemini-2.5-flash'),
+xai('grok-4-fast-non-reasoning'),
]
/**
@@ -45,16 +47,13 @@ Provide only the direct answer, without any additional explanation or formatting
`.trim()
const startTime = performance.now()
-const { text, usage } = await generateText({
-model,
-prompt,
-temperature: !model.modelId.startsWith('gpt-5') ? 0 : undefined,
-})
+const { text, usage } = await generateText({ model, prompt })
+const actual = text.trim()
const latencyMs = performance.now() - startTime
const isCorrect = await validateAnswer({
-actual: text.trim(),
+actual,
expected: question.groundTruth,
question: question.prompt,
})
@@ -64,7 +63,7 @@ Provide only the direct answer, without any additional explanation or formatting
format: formatName,
model: model.modelId,
expected: question.groundTruth,
-actual: text.trim(),
+actual,
isCorrect,
inputTokens: usage.inputTokens,
outputTokens: usage.outputTokens,
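
With the explicit `temperature: 0` removed, every model now runs with its provider defaults. A minimal standalone call using the newly added model could look like the sketch below, assuming `XAI_API_KEY` is set in the environment (the variable the `@ai-sdk/xai` provider reads by default):

```ts
import { xai } from '@ai-sdk/xai'
import { generateText } from 'ai'

// No temperature is passed, so the model's default sampling settings apply.
const { text, usage } = await generateText({
  model: xai('grok-4-fast-non-reasoning'),
  prompt: 'Answer with a single word: what is the capital of France?',
})

console.log(text.trim(), usage.inputTokens, usage.outputTokens)
```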

View File

@@ -3,6 +3,7 @@ import * as fsp from 'node:fs/promises'
import * as path from 'node:path'
import { BENCHMARKS_DIR } from './constants'
import { datasets } from './datasets'
+import { models } from './evaluate'
import { createProgressBar, ensureDir, tokenize } from './utils'
/**
@@ -50,9 +51,8 @@ export function generateMarkdownReport(
const toon = formatResults.find(r => r.format === 'toon')
const json = formatResults.find(r => r.format === 'json')
// Build model-by-model breakdown with ASCII bars
-const modelNames = [...new Set(results.map(r => r.model))].reverse()
-const modelCount = modelNames.length
+const modelIds = models.map(m => m.modelId)
+const modelNames = modelIds.filter(id => results.some(r => r.model === id))
const modelBreakdown = modelNames.map((modelName, i) => {
const modelResults = formatResults.map((fr) => {
@@ -183,16 +183,14 @@ ${tableRows}
const analyticsSize = datasets.find(d => d.name === 'analytics')?.data.metrics?.length || 0
const githubSize = datasets.find(d => d.name === 'github')?.data.repositories?.length || 0
-// Calculate number of formats and models
+// Calculate number of formats and evaluations
const formatCount = formatResults.length
-const modelsUsed = [...new Set(results.map(r => r.model))]
-const modelsListStr = modelsUsed.map(m => `\`${m}\``).join(', ')
-const totalEvaluations = totalQuestions * formatCount * modelsUsed.length
+const totalEvaluations = totalQuestions * formatCount * modelNames.length
return `
### Retrieval Accuracy
-Accuracy across **${modelCount} ${modelCount === 1 ? 'LLM' : 'LLMs'}** on **${totalQuestions} data retrieval questions**:
+Accuracy across **${modelNames.length} ${modelNames.length === 1 ? 'LLM' : 'LLMs'}** on ${totalQuestions} data retrieval questions:
\`\`\`
${modelBreakdown}
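
The breakdown now follows the declared order of the `models` array instead of the reversed first-seen order in the results, which ties the report ordering to the evaluation setup rather than to result order. A small illustration with hypothetical result rows:

```ts
// Hypothetical result rows, in whatever order evaluations happened to finish.
const results = [
  { model: 'gemini-2.5-flash' },
  { model: 'gpt-5-nano' },
  { model: 'claude-haiku-4-5-20251001' },
]

// Previous approach: first-seen order in `results`, reversed.
const previous = [...new Set(results.map(r => r.model))].reverse()
// → ['claude-haiku-4-5-20251001', 'gpt-5-nano', 'gemini-2.5-flash']

// New approach: the declared `models` order, keeping only models that produced results.
const modelIds = ['gpt-5-nano', 'claude-haiku-4-5-20251001', 'gemini-2.5-flash', 'grok-4-fast-non-reasoning']
const modelNames = modelIds.filter(id => results.some(r => r.model === id))
// → ['gpt-5-nano', 'claude-haiku-4-5-20251001', 'gemini-2.5-flash']
```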
@@ -253,10 +251,10 @@ ${totalQuestions} questions are generated dynamically across three categories:
#### Models & Configuration
-- **Models tested**: ${modelsListStr}
+- **Models tested**: ${modelNames.map(m => `\`${m}\``).join(', ')}
- **Token counting**: Using \`gpt-tokenizer\` with \`o200k_base\` encoding (GPT-5 tokenizer)
-- **Temperature**: 0 (for non-reasoning models)
-- **Total evaluations**: ${totalQuestions} questions × ${formatCount} formats × ${modelsUsed.length} models = ${totalEvaluations.toLocaleString('en-US')} LLM calls
+- **Temperature**: Not set (models use their defaults)
+- **Total evaluations**: ${totalQuestions} questions × ${formatCount} formats × ${modelNames.length} models = ${totalEvaluations.toLocaleString('en-US')} LLM calls
</details>
`.trimStart()
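
For scale, the evaluation count reported here is a straight product of questions, formats, and models. With hypothetical figures (the real values depend on the generated datasets and the configured formats):

```ts
// Illustrative numbers only.
const totalQuestions = 100 // generated across the three question categories
const formatCount = 4      // number of serialization formats compared (hypothetical)
const modelCount = 4       // the four models configured above
const totalEvaluations = totalQuestions * formatCount * modelCount // → 1,600 LLM calls
```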