chore(benchmarks): replace LLM-as-judge, add new structural validation

This commit is contained in:
Johann Schopplich
2025-11-07 21:28:21 +01:00
parent 9a519dd114
commit acca69c64a
25 changed files with 1311 additions and 396 deletions

View File

@@ -1,4 +1,5 @@
import type { DATASET_NAMES, QUESTION_TYPES, STRUCTURE_CLASSES } from './constants'
import type { AnswerType, NormalizationOptions } from './normalize'
/**
 * Element type of the `QUESTION_TYPES` constant, obtained by indexing its
 * type with `number`. Deriving it this way keeps the type in sync with the
 * constant instead of redeclaring the members by hand.
 */
export type QuestionType = (typeof QUESTION_TYPES)[number]
/**
 * Element type of the `DATASET_NAMES` constant, obtained by indexing its
 * type with `number`. Deriving it this way keeps the type in sync with the
 * constant instead of redeclaring the members by hand.
 */
export type DatasetName = (typeof DATASET_NAMES)[number]
@@ -23,6 +24,15 @@ export interface Question {
groundTruth: string
type: QuestionType
dataset: DatasetName
/**
* Expected answer kind for deterministic comparison.
* @default 'string'
*/
answerType?: AnswerType
/**
* Options for answer normalization and comparison.
*/
normalizationOptions?: Partial<NormalizationOptions>
}
export interface EvaluationResult {