chore(benchmarks): replace LLM-as-judge, new structural validation

2026-01-29 23:34:10 +08:00 · 2025-11-07 21:28:21 +01:00
parent 9a519dd114
commit acca69c64a
25 changed files with 1311 additions and 396 deletions
--- a/benchmarks/src/questions/utils.ts
+++ b/benchmarks/src/questions/utils.ts
@@ -1,3 +1,4 @@
+import type { AnswerType, NormalizationOptions } from '../normalize'
 import type { Question } from '../types'

 // Constants for sampling strides
@@ -52,10 +53,21 @@ export class QuestionBuilder {
    return this
  }

+  answerType(kind: AnswerType): this { // Records the given answer type on the question being built.
+    this.question.answerType = kind
+    return this // Returns the same builder instance so calls can be chained.
+  }
+
+  normalize(options: Partial<NormalizationOptions>): this { // Attaches a (possibly partial) set of normalization options to the question being built.
+    this.question.normalizationOptions = options // Stored by reference; the caller's object is not copied.
+    return this // Returns the same builder instance so calls can be chained.
+  }
+
  build(): Question {
    if (!this.question.id || !this.question.prompt || !this.question.groundTruth || !this.question.type || !this.question.dataset) {
      throw new Error('Incomplete question')
    }
+    // Past this point id, prompt, groundTruth, type and dataset are all truthy; the cast hands back the same draft object, not a copy.
    return this.question as Question
  }
 }
@@ -65,7 +77,7 @@ export class QuestionBuilder {
 */
 export function rotateQuestions<T>(
  items: T[],
-  generators: Array<(item: T, getId: () => string) => Question>,
+  generators: ((item: T, getId: () => string) => Question)[],
  limit: number,
  stride: number,
  getId: () => string,