chore(benchmarks): finalize structure-awareness run

This commit is contained in:
Johann Schopplich
2025-11-07 10:33:46 +01:00
parent 89df613059
commit c6ba6446f5
10 changed files with 259 additions and 223 deletions

View File

@@ -10,9 +10,9 @@ import { generateText } from 'ai'
* Models used for evaluation
*/
export const models: LanguageModelV2[] = [
// Each entry is built by calling a provider function with a model-id string.
// NOTE(review): `openai('gpt-5-nano')` is listed twice (first and fourth entries).
// This text looks like a diff hunk whose +/- markers were stripped, so one of the
// two is presumably the removed line and the other the added line — confirm
// against the real file before deduplicating.
openai('gpt-5-nano'),
anthropic('claude-haiku-4-5-20251001'),
google('gemini-2.5-flash'),
openai('gpt-5-nano'),
xai('grok-4-fast-non-reasoning'),
]

View File

@@ -81,7 +81,7 @@ export function generateAccuracyReport(
Benchmarks test LLM comprehension across different input formats using ${totalQuestions} data retrieval questions on ${modelNames.length} ${modelNames.length === 1 ? 'model' : 'models'}.
<details>
<summary><strong>View Dataset Catalog</strong></summary>
<summary><strong>Show Dataset Catalog</strong></summary>
${generateDatasetCatalog(ACCURACY_DATASETS)}