chore(benchmarks): replace LLM-as-judge, add new structural validation

2026-01-29 23:34:10 +08:00 · 2025-11-07 21:28:21 +01:00
parent 9a519dd114
commit acca69c64a
25 changed files with 1311 additions and 396 deletions
--- a/benchmarks/src/types.ts
+++ b/benchmarks/src/types.ts
@@ -1,4 +1,5 @@
 import type { DATASET_NAMES, QUESTION_TYPES, STRUCTURE_CLASSES } from './constants'
+import type { AnswerType, NormalizationOptions } from './normalize'

 export type QuestionType = typeof QUESTION_TYPES[number]
 export type DatasetName = typeof DATASET_NAMES[number]
@@ -23,6 +24,15 @@ export interface Question {
  groundTruth: string
  type: QuestionType
  dataset: DatasetName
+  /**
+   * Expected answer kind for deterministic comparison.
+   * @default 'string'
+   */
+  answerType?: AnswerType
+  /**
+   * Options for answer normalization and comparison.
+   */
+  normalizationOptions?: Partial<NormalizationOptions>
 }

 export interface EvaluationResult {