test: refactor accuracy benchmark generation

This commit is contained in:
Johann Schopplich
2025-10-27 14:07:20 +01:00
parent 1a5e6199ac
commit 05b3d43023
11 changed files with 1708 additions and 1721 deletions

View File

@@ -18,7 +18,7 @@ export interface EvaluationResult {
model: string
expected: string
actual: string
-correct: boolean
+isCorrect: boolean
inputTokens?: number
outputTokens?: number
latencyMs: number