mirror of
https://github.com/voson-wang/toon.git
synced 2026-01-29 23:34:10 +08:00
chore(benchmarks): replace LLM-as-judge, new structural validation
This commit is contained in:
@@ -56,9 +56,11 @@ export const FORMATTER_DISPLAY_NAMES: Record<string, string> = {
|
||||
*/
|
||||
export const QUESTION_TYPES = [
|
||||
'field-retrieval',
|
||||
'retrieval',
|
||||
'aggregation',
|
||||
'filtering',
|
||||
'structure-awareness',
|
||||
'structural-validation',
|
||||
] as const
|
||||
|
||||
/**
|
||||
@@ -66,9 +68,11 @@ export const QUESTION_TYPES = [
|
||||
*/
|
||||
export const QUESTION_TYPE_LABELS = {
|
||||
'field-retrieval': 'Field Retrieval',
|
||||
'retrieval': 'Retrieval',
|
||||
'aggregation': 'Aggregation',
|
||||
'filtering': 'Filtering',
|
||||
'structure-awareness': 'Structure Awareness',
|
||||
'structural-validation': 'Structural Validation',
|
||||
} as const
|
||||
|
||||
/**
|
||||
@@ -81,6 +85,12 @@ export const DATASET_NAMES = [
|
||||
'github',
|
||||
'event-logs',
|
||||
'nested-config',
|
||||
'large-uniform',
|
||||
'structural-validation-control',
|
||||
'structural-validation-truncated',
|
||||
'structural-validation-extra-rows',
|
||||
'structural-validation-width-mismatch',
|
||||
'structural-validation-missing-fields',
|
||||
] as const
|
||||
|
||||
/**
|
||||
|
||||
@@ -144,6 +144,30 @@ export interface NestedConfig {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * One row of the large uniform product catalog.
 *
 * @remarks
 * Every row carries the same six primitive fields, which is what makes the
 * dataset uniform. Field contents below describe what `generateProducts` emits.
 */
export interface Product {
  /** Sequential identifier of the form `SKU-000001` */
  sku: string
  /** Display name of the product */
  name: string
  /** Category label, cycled from a fixed list */
  category: string
  /** Unit price as a plain number */
  price: number
  /** Quantity on hand */
  qty: number
  /** Date portion (`YYYY-MM-DD`) of the last update timestamp */
  lastUpdated: string
}
|
||||
|
||||
/**
 * Kinds of structural corruption a validation fixture can represent.
 *
 * @remarks
 * Internal to structural validation dataset generation.
 */
type StructuralValidationType =
  | 'truncated'
  | 'extra-rows'
  | 'width-mismatch'
  | 'missing-fields'

/**
 * A single structural validation scenario: a payload plus its expected verdict.
 */
interface StructuralValidationFixture {
  /** Corruption category this fixture belongs to */
  type: StructuralValidationType
  /** Human-readable summary of what was done to the data */
  description: string
  /** Payload handed to the formatters */
  data: Record<string, unknown>
  /** `true` when the payload is intended to be structurally sound */
  isValid: boolean
}
|
||||
|
||||
/**
|
||||
* Generate analytics time-series data
|
||||
*/
|
||||
@@ -505,6 +529,100 @@ export function generateNestedConfig(): NestedConfig {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Build a uniform product catalog containing `count` rows.
 *
 * @remarks
 * Intended for large inputs (thousands of rows) to exercise token efficiency
 * and structural reliability at scale. SKUs are sequential and categories
 * cycle through a fixed list; the remaining fields come from faker.
 *
 * @param count - Number of product rows to generate
 * @returns An object whose `products` array holds the generated rows
 */
export function generateProducts(count: number): { products: Product[] } {
  const categories = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books', 'Toys'] as const

  // Faker calls are issued in a fixed order (name, price, qty, date) per row
  const buildProduct = (index: number): Product => {
    const name = faker.commerce.productName()
    const price = Number(faker.commerce.price({ min: 5, max: 500 }))
    const qty = faker.number.int({ min: 0, max: 1000 })
    // Keep only the calendar date of the ISO timestamp
    const lastUpdated = faker.date.recent({ days: 30 }).toISOString().split('T')[0]!

    return {
      sku: `SKU-${String(index + 1).padStart(6, '0')}`,
      name,
      category: categories[index % categories.length]!,
      price,
      qty,
      lastUpdated,
    }
  }

  const products = Array.from({ length: count }, (_, index) => buildProduct(index))
  return { products }
}
|
||||
|
||||
/**
 * Build the structural validation fixtures from a 20-employee base dataset.
 *
 * @remarks
 * Returns one untouched control dataset followed by four deliberately altered
 * variants (truncated, extra rows, width mismatch, missing fields). They are
 * meant to probe structural validation through TOON's [N] length declarations
 * and {fields} headers. Internal helper for the structural validation datasets.
 *
 * NOTE(review): the control fixture is tagged `type: 'truncated'` because the
 * type union has no dedicated control member; consumers must rely on `isValid`.
 */
function generateStructuralValidationFixtures(): StructuralValidationFixture[] {
  const { employees } = generateEmployees(20)
  const surplusEmployees = generateEmployees(3).employees

  // Row 10 (index 9) loses its salary field
  const rowsWithWidthMismatch = employees.map((employee, rowIndex) => {
    if (rowIndex !== 9)
      return employee

    const { salary, ...withoutSalary } = employee
    return withoutSalary
  })

  // Every 5th row (indices 0, 5, 10, ...) loses its email field
  const rowsWithMissingEmail = employees.map((employee, rowIndex) => {
    if (rowIndex % 5 !== 0)
      return employee

    const { email, ...withoutEmail } = employee
    return withoutEmail
  })

  const control: StructuralValidationFixture = {
    type: 'truncated',
    description: 'Valid complete dataset (control)',
    data: { employees },
    isValid: true,
  }

  // Last 3 rows dropped; [N] won't match actual row count in TOON
  const truncated: StructuralValidationFixture = {
    type: 'truncated',
    description: 'Array truncated: 3 rows removed from end',
    data: { employees: employees.slice(0, -3) },
    isValid: false,
  }

  // 3 rows appended; [N] won't match actual row count in TOON
  const extraRows: StructuralValidationFixture = {
    type: 'extra-rows',
    description: 'Extra rows added beyond declared length',
    data: { employees: [...employees, ...surplusEmployees] },
    isValid: false,
  }

  // Not all objects have the same fields (tabular requirement)
  const widthMismatch: StructuralValidationFixture = {
    type: 'width-mismatch',
    description: 'Inconsistent field count (missing salary in row 10)',
    data: { employees: rowsWithWidthMismatch },
    isValid: false,
  }

  // Not all objects have the same fields (tabular requirement)
  const missingFields: StructuralValidationFixture = {
    type: 'missing-fields',
    description: 'Missing required fields (no email in multiple rows)',
    data: { employees: rowsWithMissingEmail },
    isValid: false,
  }

  return [control, truncated, extraRows, widthMismatch, missingFields]
}
|
||||
|
||||
/**
|
||||
* Event logs dataset: Semi-uniform structure
|
||||
*
|
||||
@@ -539,6 +657,34 @@ const nestedConfigDataset: Dataset = {
|
||||
},
|
||||
}
|
||||
|
||||
/**
 * Dataset name for each corrupted fixture type.
 *
 * @remarks
 * Indexing by fixture type (instead of by array position) keeps names correct
 * if fixtures are reordered, and makes a missing entry a compile-time error.
 */
const STRUCTURAL_VALIDATION_DATASET_NAMES = {
  'truncated': 'structural-validation-truncated',
  'extra-rows': 'structural-validation-extra-rows',
  'width-mismatch': 'structural-validation-width-mismatch',
  'missing-fields': 'structural-validation-missing-fields',
} as const

/**
 * Structural validation datasets: Tests ability to detect incomplete, truncated, or corrupted data
 *
 * @remarks
 * These datasets test TOON's structural validation advantages via [N] length declarations
 * and {fields} headers. CSV is included to demonstrate its lack of structural metadata.
 *
 * The valid fixture becomes the `structural-validation-control` dataset; every
 * other fixture is named after its corruption type.
 */
const structuralValidationDatasets: Dataset[] = generateStructuralValidationFixtures().map((fixture) => {
  return {
    // The control fixture is identified by validity, not by its (shared) type tag
    name: fixture.isValid
      ? 'structural-validation-control'
      : STRUCTURAL_VALIDATION_DATASET_NAMES[fixture.type],
    description: fixture.description,
    data: fixture.data,
    metadata: {
      supportsCSV: true, // Include CSV to show it can't validate structure
      structureClass: 'uniform',
      tabularEligibility: 100,
    },
  }
})
|
||||
|
||||
/**
|
||||
* Datasets for accuracy benchmarks (smaller sizes for faster evaluation)
|
||||
*/
|
||||
@@ -549,6 +695,7 @@ export const ACCURACY_DATASETS: Dataset[] = [
|
||||
githubDataset, // 100 repos
|
||||
eventLogsDataset, // 75 logs
|
||||
nestedConfigDataset, // 1 config
|
||||
...structuralValidationDatasets, // 5 validation fixtures
|
||||
]
|
||||
|
||||
/**
|
||||
|
||||
@@ -5,6 +5,7 @@ import { google } from '@ai-sdk/google'
|
||||
import { openai } from '@ai-sdk/openai'
|
||||
import { xai } from '@ai-sdk/xai'
|
||||
import { generateText } from 'ai'
|
||||
import { compareAnswers } from './normalize'
|
||||
|
||||
/**
|
||||
* Models used for evaluation
|
||||
@@ -74,7 +75,13 @@ ${formattedData}
|
||||
|
||||
Question: ${question.prompt}
|
||||
|
||||
Provide only the direct answer, without any additional explanation or formatting.
|
||||
Answer format requirements:
|
||||
- Provide only the value itself, no explanation
|
||||
- For numbers: output digits only (no commas, currency symbols, or units)
|
||||
- For dates/field names: use the exact string from the data
|
||||
- For lists: output comma-separated values with no spaces
|
||||
|
||||
Answer:
|
||||
`.trim()
|
||||
|
||||
const startTime = performance.now()
|
||||
@@ -83,11 +90,13 @@ Provide only the direct answer, without any additional explanation or formatting
|
||||
const actual = text.trim()
|
||||
const latencyMs = performance.now() - startTime
|
||||
|
||||
const isCorrect = await validateAnswer({
|
||||
const comparisonResult = compareAnswers(
|
||||
actual,
|
||||
expected: question.groundTruth,
|
||||
question: question.prompt,
|
||||
})
|
||||
question.groundTruth,
|
||||
question.answerType ?? 'string',
|
||||
question.normalizationOptions,
|
||||
)
|
||||
const isCorrect = comparisonResult.match
|
||||
|
||||
return {
|
||||
questionId: question.id,
|
||||
@@ -101,42 +110,3 @@ Provide only the direct answer, without any additional explanation or formatting
|
||||
latencyMs,
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Validate an answer using LLM-as-judge approach
 *
 * @remarks
 * Sends the question, the expected answer and the actual answer to the
 * `gpt-5-nano` entry of `models` and treats a reply of exactly "YES"
 * (after trimming, case-insensitive) as correct. Any other reply counts as
 * incorrect.
 *
 * @param params - The answer pair and the question it belongs to
 * @param params.actual - Answer produced by the evaluated model
 * @param params.expected - Ground-truth answer
 * @param params.question - Question text shown to the evaluated model
 * @returns `true` when the judge replies "YES"
 * @throws Error when the judge model is not present in `models`
 */
async function validateAnswer(
  {
    actual,
    expected,
    question,
  }:
  {
    actual: string
    expected: string
    question: string
  },
): Promise<boolean> {
  const prompt = `
You are validating answers to questions about structured data.

Question: ${question}
Expected answer: ${expected}
Actual answer: ${actual}

Is the actual answer correct? Consider:
- Exact matches are correct
- Semantically equivalent answers are correct (e.g., "50000" vs "$50,000" vs "50000 dollars")
- Minor formatting differences are acceptable
- Case-insensitive comparison for text

Respond with only "YES" or "NO".
`.trim()

  // Fail loudly instead of handing `undefined` to generateText
  const judgeModel = models.find(m => m.modelId === 'gpt-5-nano')
  if (!judgeModel)
    throw new Error('Judge model "gpt-5-nano" is not configured in models')

  const { text } = await generateText({
    model: judgeModel,
    prompt,
  })

  return text.trim().toUpperCase() === 'YES'
}
|
||||
|
||||
386
benchmarks/src/normalize.ts
Normal file
386
benchmarks/src/normalize.ts
Normal file
@@ -0,0 +1,386 @@
|
||||
/**
 * Kind of value a question expects, used to pick the deterministic
 * normalization and comparison rules for an answer.
 */
export type AnswerType =
  | 'integer'
  | 'number'
  | 'boolean'
  | 'date'
  | 'string'
  | 'csv-list-ordered'
  | 'csv-list-unordered'
|
||||
|
||||
/**
 * Knobs that control how answers are normalized and compared.
 *
 * @remarks
 * All fields are optional; unset fields fall back to the module defaults.
 */
export interface NormalizationOptions {
  /**
   * Maximum absolute difference at which two numbers still count as equal.
   * Only used when `decimalPlaces` is not set.
   * @default 1e-6
   */
  tolerance?: number

  /**
   * Compare strings and list items with their original casing.
   * When `false`, values are lowercased before comparison.
   * @default false
   */
  caseSensitive?: boolean

  /**
   * Accept a leading currency symbol ($, €, £, ¥) when extracting numbers.
   * @default true
   */
  allowCurrency?: boolean

  /**
   * Treat a trailing percent sign as a percentage and divide the value by 100.
   * @default true
   */
  allowPercent?: boolean

  /**
   * Round numbers to this many decimal places before comparing.
   * When set, comparison is exact on the rounded values and `tolerance` is ignored.
   */
  decimalPlaces?: number
}
|
||||
|
||||
/**
 * Outcome of normalizing a single answer.
 */
interface NormalizedResult {
  /** Whether the text could be normalized for the requested kind */
  success: boolean
  /** Normalized value; set when `success` is `true` */
  value?: unknown
  /** Reason for the failure; set when `success` is `false` */
  error?: string
}
|
||||
|
||||
/**
 * Baseline settings merged underneath caller-supplied normalization options.
 *
 * @remarks
 * `decimalPlaces` is deliberately left `undefined` (forced through the type
 * with `!`), so numbers are compared by `tolerance` unless a caller opts in
 * to rounding.
 */
const DEFAULT_OPTIONS: Required<NormalizationOptions> = {
  // Extraction behavior
  allowCurrency: true,
  allowPercent: true,
  // Comparison behavior
  caseSensitive: false,
  tolerance: 1e-6,
  decimalPlaces: undefined!,
}
|
||||
|
||||
// --- Extraction patterns -------------------------------------------------

// First integer-like token, optionally preceded by a currency symbol
const INTEGER_PATTERN_WITH_CURRENCY = /[$€£¥]?\s*-?\d[\d,]*/
// First integer-like token, no currency symbol allowed
const INTEGER_PATTERN = /-?\d[\d,]*/
// First number-like token (decimals, exponent, trailing %), currency allowed
const NUMBER_PATTERN_WITH_CURRENCY = /[$€£¥]?\s*-?\d[\d,]*(?:\.\d+)?(?:e[+-]?\d+)?%?/i
// First number-like token (decimals, exponent, trailing %), no currency
const NUMBER_PATTERN = /-?\d[\d,]*(?:\.\d+)?(?:e[+-]?\d+)?%?/i

// --- Cleanup patterns ----------------------------------------------------

// A single or double quote at the very start or very end of the text
const WRAPPING_QUOTES_PATTERN = /^["']|["']$/g
// Text that is entirely one ``` fenced block
const CODE_FENCE_PATTERN = /^```[\s\S]*?```$/g
// Leading word followed by a newline (e.g. the "json" of ```json)
const LANGUAGE_IDENTIFIER_PATTERN = /^\w+\n/
// Characters dropped from an integer token before parsing
const CURRENCY_AND_FORMATTING_CHARS = /[$€£¥,\s]/g
// Characters dropped from a number token before parsing
const NUMBER_CLEANUP_CHARS = /[$€£¥,%\s]/g

// --- Boolean vocabularies (compared after lowercasing) -------------------

const TRUE_VALUES = new Set(['true', 'yes', 'y', '1'])
const FALSE_VALUES = new Set(['false', 'no', 'n', '0'])

// --- Numeric constants ---------------------------------------------------

const PERCENTAGE_DIVISOR = 100
const DECIMAL_BASE = 10
const MONTH_OFFSET = 1 // JavaScript months are 0-indexed
const DATE_COMPONENT_WIDTH = 2
const DATE_PAD_CHAR = '0'

// --- String constants ----------------------------------------------------

const CSV_DELIMITER = ','
|
||||
|
||||
/**
 * Trim the text, then drop a single or double quote found at its very start
 * and/or very end.
 *
 * @param text - Raw answer text
 * @returns The trimmed text without its leading/trailing quote characters
 */
function stripWrappingQuotes(text: string): string {
  const trimmed = text.trim()
  return trimmed.replace(WRAPPING_QUOTES_PATTERN, '')
}
|
||||
|
||||
// Decimal fraction that immediately follows an extracted integer token (".75" in "42.75")
const FRACTION_SUFFIX_PATTERN = /^\.(\d+)/
// Any non-zero digit; a fraction made only of zeros ("42.00") is still integral
const NON_ZERO_DIGIT_PATTERN = /[1-9]/

/**
 * Extract and normalize an integer from a string
 *
 * @remarks
 * Handles: "42", "1,234", "$5,678", " -99 ", "The answer is 42.", "42.0"
 *
 * Rejects tokens followed by a non-zero decimal fraction (e.g. "42.9"), so a
 * fractional answer is never silently truncated into a matching integer.
 *
 * @param text - Raw answer text
 * @param options - Resolved normalization options; `allowCurrency` selects the extraction pattern
 * @returns The parsed integer on success, otherwise a failure with an explanatory error
 */
function normalizeInteger(text: string, options: Required<NormalizationOptions>): NormalizedResult {
  // Extract the first integer-like token
  const pattern = options.allowCurrency
    ? INTEGER_PATTERN_WITH_CURRENCY
    : INTEGER_PATTERN

  const match = pattern.exec(text)
  if (!match)
    return { success: false, error: `No integer found in: "${text}"` }

  const token = match[0]

  // "42.9" is not the integer 42; only an all-zero fraction ("42.0") is tolerated
  const trailingText = text.slice(match.index + token.length)
  const fraction = FRACTION_SUFFIX_PATTERN.exec(trailingText)
  if (fraction && NON_ZERO_DIGIT_PATTERN.test(fraction[1]!))
    return { success: false, error: `Expected an integer but found a fractional value in: "${text}"` }

  // Remove currency symbols, spaces, and thousand separators
  const normalizedValue = token.replace(CURRENCY_AND_FORMATTING_CHARS, '')
  const parsedNumber = Number.parseInt(normalizedValue, DECIMAL_BASE)

  if (Number.isNaN(parsedNumber))
    return { success: false, error: `Failed to parse integer: "${token}"` }

  return { success: true, value: parsedNumber }
}
|
||||
|
||||
/**
 * Pull the first number-like token out of a string and parse it as a float.
 *
 * @remarks
 * Accepts inputs such as "3.14", "1,234.56", "$5,678.90", "42%", "1.5e-3"
 * and "Price: $99.99". A trailing percent sign divides the value by 100 when
 * `allowPercent` is on. When `decimalPlaces` is set the result is rounded to
 * that many places.
 *
 * @param text - Raw answer text
 * @param options - Resolved normalization options
 * @returns The parsed (and possibly scaled/rounded) number, or a failure
 */
function normalizeNumber(text: string, options: Required<NormalizationOptions>): NormalizedResult {
  // Scientific notation is part of both patterns
  const match = text.match(options.allowCurrency ? NUMBER_PATTERN_WITH_CURRENCY : NUMBER_PATTERN)
  if (match === null)
    return { success: false, error: `No number found in: "${text}"` }

  const token = match[0]

  // Drop currency, commas, spaces, and the percent sign before parsing
  const parsed = Number.parseFloat(token.replace(NUMBER_CLEANUP_CHARS, ''))
  if (Number.isNaN(parsed))
    return { success: false, error: `Failed to parse number: "${token}"` }

  // Percentages become fractions
  const isPercentage = options.allowPercent && token.endsWith('%')
  const scaled = isPercentage ? parsed / PERCENTAGE_DIVISOR : parsed

  if (options.decimalPlaces === undefined)
    return { success: true, value: scaled }

  // Round to the requested precision
  const factor = DECIMAL_BASE ** options.decimalPlaces
  return { success: true, value: Math.round(scaled * factor) / factor }
}
|
||||
|
||||
// Sentence punctuation a model may append to a one-word answer ("Yes.")
const TRAILING_SENTENCE_PUNCTUATION = /[.!]+$/

/**
 * Normalize a boolean/yes-no answer
 *
 * @remarks
 * Handles: "true", "false", "yes", "no", "y", "n", "1", "0" (case-insensitive).
 * Wrapping quotes and trailing sentence punctuation are ignored, so `"Yes."`
 * and `'true'` are accepted, consistent with the other normalizers that strip
 * wrapping quotes.
 *
 * @param text - Raw answer text
 * @returns `true`/`false` on success, otherwise a failure naming the original text
 */
function normalizeBoolean(text: string): NormalizedResult {
  // Punctuation may sit outside or inside the quotes, so strip it on both sides
  const withoutOuterPunctuation = text.trim().replace(TRAILING_SENTENCE_PUNCTUATION, '')
  const normalizedValue = stripWrappingQuotes(withoutOuterPunctuation)
    .replace(TRAILING_SENTENCE_PUNCTUATION, '')
    .trim()
    .toLowerCase()

  if (TRUE_VALUES.has(normalizedValue))
    return { success: true, value: true }

  if (FALSE_VALUES.has(normalizedValue))
    return { success: true, value: false }

  return { success: false, error: `Not a boolean: "${text}"` }
}
|
||||
|
||||
// Date-only ISO form ("2025-11-01"), which the Date constructor interprets as UTC
const ISO_DATE_ONLY_PATTERN = /^\d{4}-\d{2}-\d{2}$/
// Input that ends in an explicit zone: "Z", "GMT"/"UTC"/"UT", or a numeric offset after a time
const EXPLICIT_ZONE_PATTERN = /(?:\d\s*z|\b(?:gmt|utc?)|:\d{2}(?:\.\d+)?\s*[+-]\d{2}:?\d{2})$/i

/**
 * Normalize a date string to YYYY-MM-DD format
 *
 * @remarks
 * Handles: ISO dates, "Nov 1, 2025", "2025-11-01", RFC 2822, etc.
 *
 * The Date constructor reads date-only ISO strings as UTC but reads most other
 * zone-less strings in local time. The calendar date is therefore taken with
 * the getters that match how the input was interpreted: UTC getters for
 * date-only ISO strings and inputs with an explicit zone, local getters
 * otherwise. This keeps "Nov 1, 2025" and "2025-11-01" equal regardless of the
 * machine's time zone.
 *
 * NOTE(review): named zones other than GMT/UTC (e.g. "EST") are not detected
 * and fall back to local getters.
 *
 * @param text - Raw answer text
 * @returns The date as `YYYY-MM-DD`, or a failure when the text is not a parseable date
 */
function normalizeDate(text: string): NormalizedResult {
  const cleaned = stripWrappingQuotes(text)

  // Try parsing as date
  const parsedDate = new Date(cleaned)
  if (Number.isNaN(parsedDate.getTime()))
    return { success: false, error: `Invalid date: "${text}"` }

  // Read the calendar date back in the same frame the parser used
  const interpretedAsUtc = ISO_DATE_ONLY_PATTERN.test(cleaned) || EXPLICIT_ZONE_PATTERN.test(cleaned)
  const year = interpretedAsUtc ? parsedDate.getUTCFullYear() : parsedDate.getFullYear()
  const monthIndex = interpretedAsUtc ? parsedDate.getUTCMonth() : parsedDate.getMonth()
  const day = interpretedAsUtc ? parsedDate.getUTCDate() : parsedDate.getDate()

  // Normalize to YYYY-MM-DD
  const monthPadded = String(monthIndex + MONTH_OFFSET).padStart(DATE_COMPONENT_WIDTH, DATE_PAD_CHAR)
  const dayPadded = String(day).padStart(DATE_COMPONENT_WIDTH, DATE_PAD_CHAR)
  const normalized = `${year}-${monthPadded}-${dayPadded}`

  return { success: true, value: normalized }
}
|
||||
|
||||
/**
 * Normalize a string (trim, optionally case-insensitive)
 *
 * @remarks
 * Handles wrapping quotes and code fences.
 *
 * A language tag is only removed when it directly follows the opening fence
 * (```json) and content remains after it, so the first line of untagged
 * multi-line fenced content is preserved. Quotes wrapped inside a fence are
 * stripped as well.
 *
 * @param text - Raw answer text
 * @param options - Resolved normalization options; `caseSensitive` controls lowercasing
 * @returns Always succeeds with the cleaned (and possibly lowercased) string
 */
function normalizeString(text: string, options: Required<NormalizationOptions>): NormalizedResult {
  let trimmedText = text.trim()

  // Strip wrapping quotes
  trimmedText = trimmedText.replace(WRAPPING_QUOTES_PATTERN, '')

  // Strip code fences (```...```)
  let wasFenced = false
  trimmedText = trimmedText.replace(CODE_FENCE_PATTERN, (match) => {
    wasFenced = true
    const inner = match.slice(3, -3)
    // Test for a language identifier BEFORE trimming: after trimming, the
    // first content line of an untagged fence would look like one
    const withoutLanguage = inner.replace(LANGUAGE_IDENTIFIER_PATTERN, '')
    // If nothing is left, the "identifier" was the answer itself (```main\n```)
    return (withoutLanguage.trim().length > 0 ? withoutLanguage : inner).trim()
  })

  // Quotes inside a fence only become the outermost characters after unwrapping
  if (wasFenced)
    trimmedText = trimmedText.replace(WRAPPING_QUOTES_PATTERN, '')

  trimmedText = trimmedText.trim()

  const value = options.caseSensitive ? trimmedText : trimmedText.toLowerCase()
  return { success: true, value }
}
|
||||
|
||||
/**
 * Normalize a comma-separated list (ordered)
 *
 * @remarks
 * Handles: "a,b,c", "a, b, c", " a , b , c ", and individually quoted items
 * such as `"a","b"`. Empty items are dropped.
 *
 * @param text - Raw answer text
 * @param options - Resolved normalization options; `caseSensitive` controls lowercasing
 * @returns Always succeeds with the list items in their original order
 */
function normalizeCsvListOrdered(text: string, options: Required<NormalizationOptions>): NormalizedResult {
  const strippedText = stripWrappingQuotes(text)
  const items = strippedText
    .split(CSV_DELIMITER)
    // Unquote every item: stripping only the outer pair would leave `a"` and `"b`
    .map(item => stripWrappingQuotes(item).trim())
    .filter(item => item.length > 0)

  const normalizedItems = items.map(item =>
    options.caseSensitive ? item : item.toLowerCase(),
  )

  return { success: true, value: normalizedItems }
}
|
||||
|
||||
/**
 * Normalize a comma-separated list whose order does not matter.
 *
 * @remarks
 * Items are normalized like an ordered list and then sorted, so "c,a,b" and
 * "a,b,c" produce the same value.
 *
 * @param text - Raw answer text
 * @param options - Resolved normalization options
 * @returns The sorted list of items, or the failure from ordered normalization
 */
function normalizeCsvListUnordered(text: string, options: Required<NormalizationOptions>): NormalizedResult {
  const ordered = normalizeCsvListOrdered(text, options)
  if (!ordered.success)
    return ordered

  const { value } = ordered

  // Guard: the ordered normalizer is expected to hand back an array
  if (!Array.isArray(value))
    return { success: false, error: 'Expected array result from normalizeCsvListOrdered' }

  // Sort a copy so comparison is independent of the original order
  return { success: true, value: Array.from(value).sort() }
}
|
||||
|
||||
/**
 * Normalize raw answer text according to the kind of value it should hold.
 *
 * @param text - Raw answer text
 * @param kind - Expected answer kind; selects the normalizer
 * @param options - Overrides applied on top of the default options
 * @returns The normalizer's result, or a failure for an unrecognized kind
 */
export function normalizeAnswer(
  text: string,
  kind: AnswerType,
  options: Partial<NormalizationOptions> = {},
): NormalizedResult {
  const resolvedOptions: Required<NormalizationOptions> = { ...DEFAULT_OPTIONS, ...options }

  if (kind === 'integer')
    return normalizeInteger(text, resolvedOptions)
  if (kind === 'number')
    return normalizeNumber(text, resolvedOptions)
  if (kind === 'boolean')
    return normalizeBoolean(text)
  if (kind === 'date')
    return normalizeDate(text)
  if (kind === 'string')
    return normalizeString(text, resolvedOptions)
  if (kind === 'csv-list-ordered')
    return normalizeCsvListOrdered(text, resolvedOptions)
  if (kind === 'csv-list-unordered')
    return normalizeCsvListUnordered(text, resolvedOptions)

  // Only reachable when an untyped caller passes an unsupported kind
  return { success: false, error: `Unknown answer kind: ${kind}` }
}
|
||||
|
||||
/**
 * Decide whether two already-normalized values are equal for the given kind.
 *
 * @param actual - Normalized value of the answer under test
 * @param expected - Normalized value of the ground truth
 * @param kind - Answer kind that produced both values
 * @param options - Resolved options; `decimalPlaces` and `tolerance` affect numbers
 * @returns `true` when the values match under the kind's rules
 */
function compareValues(
  actual: unknown,
  expected: unknown,
  kind: AnswerType,
  options: Required<NormalizationOptions>,
): boolean {
  if (kind === 'number') {
    if (typeof actual !== 'number' || typeof expected !== 'number')
      return false

    // With decimalPlaces set, both sides were rounded during normalization
    return options.decimalPlaces !== undefined
      ? actual === expected
      : Math.abs(actual - expected) <= options.tolerance
  }

  if (kind === 'csv-list-ordered' || kind === 'csv-list-unordered') {
    if (!Array.isArray(actual) || !Array.isArray(expected))
      return false

    // Unordered lists were sorted during normalization, so a positional
    // comparison is valid for both list kinds
    return actual.length === expected.length
      && actual.every((item, position) => item === expected[position])
  }

  if (kind === 'integer' || kind === 'boolean' || kind === 'date' || kind === 'string')
    return actual === expected

  return false
}
|
||||
|
||||
/**
 * Compare an actual answer against the expected one using deterministic,
 * type-aware normalization.
 *
 * @remarks
 * Both answers are normalized with the same kind and options and then compared.
 * A normalization failure on either side counts as a mismatch.
 *
 * @param actual - Raw answer text under test
 * @param expected - Raw ground-truth text
 * @param kind - Expected answer kind
 * @param options - Overrides applied on top of the default options
 * @returns `match` plus, for mismatches and failures, a `details` explanation
 */
export function compareAnswers(
  actual: string,
  expected: string,
  kind: AnswerType,
  options: Partial<NormalizationOptions> = {},
): { match: boolean, details?: string } {
  const resolvedOptions: Required<NormalizationOptions> = { ...DEFAULT_OPTIONS, ...options }

  const normalizedActual = normalizeAnswer(actual, kind, resolvedOptions)
  const normalizedExpected = normalizeAnswer(expected, kind, resolvedOptions)

  // Report the actual side first when both sides fail
  if (!normalizedActual.success)
    return { match: false, details: `Failed to normalize actual answer: ${normalizedActual.error}` }

  if (!normalizedExpected.success)
    return { match: false, details: `Failed to normalize expected answer: ${normalizedExpected.error}` }

  const isMatch = compareValues(normalizedActual.value, normalizedExpected.value, kind, resolvedOptions)
  if (isMatch)
    return { match: true, details: undefined }

  return {
    match: false,
    details: `Mismatch: actual="${normalizedActual.value}" vs expected="${normalizedExpected.value}"`,
  }
}
|
||||
@@ -17,6 +17,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
||||
.groundTruth(String(metric.views))
|
||||
.type('field-retrieval')
|
||||
.dataset('analytics')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
(metric, getId) => new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -24,6 +25,8 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
||||
.groundTruth(String(metric.revenue))
|
||||
.type('field-retrieval')
|
||||
.dataset('analytics')
|
||||
.answerType('number')
|
||||
.normalize({ decimalPlaces: 2 })
|
||||
.build(),
|
||||
(metric, getId) => new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -31,6 +34,8 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
||||
.groundTruth(String(metric.bounceRate))
|
||||
.type('field-retrieval')
|
||||
.dataset('analytics')
|
||||
.answerType('number')
|
||||
.normalize({ decimalPlaces: 2 })
|
||||
.build(),
|
||||
(metric, getId) => new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -38,6 +43,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
||||
.groundTruth(String(metric.conversions))
|
||||
.type('field-retrieval')
|
||||
.dataset('analytics')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
]
|
||||
|
||||
@@ -63,6 +69,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
||||
.groundTruth(String(totalDays))
|
||||
.type('aggregation')
|
||||
.dataset('analytics')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -70,6 +77,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
||||
.groundTruth(String(totalViews))
|
||||
.type('aggregation')
|
||||
.dataset('analytics')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -77,6 +85,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
||||
.groundTruth(String(totalConversions))
|
||||
.type('aggregation')
|
||||
.dataset('analytics')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -84,6 +93,8 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
||||
.groundTruth(String(totalRevenue.toFixed(2)))
|
||||
.type('aggregation')
|
||||
.dataset('analytics')
|
||||
.answerType('number')
|
||||
.normalize({ decimalPlaces: 2 })
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -91,6 +102,8 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
||||
.groundTruth(String(avgBounceRate.toFixed(2)))
|
||||
.type('aggregation')
|
||||
.dataset('analytics')
|
||||
.answerType('number')
|
||||
.normalize({ decimalPlaces: 2 })
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -104,6 +117,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
||||
.groundTruth(String(count))
|
||||
.type('aggregation')
|
||||
.dataset('analytics')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -117,6 +131,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
||||
.groundTruth(String(count))
|
||||
.type('aggregation')
|
||||
.dataset('analytics')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -133,6 +148,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
||||
.groundTruth(String(count))
|
||||
.type('filtering')
|
||||
.dataset('analytics')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -149,6 +165,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
||||
.groundTruth(String(count))
|
||||
.type('filtering')
|
||||
.dataset('analytics')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -165,6 +182,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
||||
.groundTruth(String(count))
|
||||
.type('filtering')
|
||||
.dataset('analytics')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -181,6 +199,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
||||
.groundTruth(String(count))
|
||||
.type('filtering')
|
||||
.dataset('analytics')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -17,6 +17,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
||||
.groundTruth(log.level)
|
||||
.type('field-retrieval')
|
||||
.dataset('event-logs')
|
||||
.answerType('string')
|
||||
.build(),
|
||||
(log, getId) => new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -24,6 +25,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
||||
.groundTruth(log.endpoint)
|
||||
.type('field-retrieval')
|
||||
.dataset('event-logs')
|
||||
.answerType('string')
|
||||
.build(),
|
||||
(log, getId) => new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -31,6 +33,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
||||
.groundTruth(String(log.statusCode))
|
||||
.type('field-retrieval')
|
||||
.dataset('event-logs')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
(log, getId) => new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -38,6 +41,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
||||
.groundTruth(String(log.responseTime))
|
||||
.type('field-retrieval')
|
||||
.dataset('event-logs')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
]
|
||||
|
||||
@@ -60,6 +64,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
||||
.groundTruth(String(totalLogs))
|
||||
.type('aggregation')
|
||||
.dataset('event-logs')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -67,6 +72,8 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
||||
.groundTruth(String(avgResponseTime.toFixed(2)))
|
||||
.type('aggregation')
|
||||
.dataset('event-logs')
|
||||
.answerType('number')
|
||||
.normalize({ decimalPlaces: 2 })
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -81,6 +88,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
||||
.groundTruth(String(count))
|
||||
.type('aggregation')
|
||||
.dataset('event-logs')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -96,6 +104,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
||||
.groundTruth(String(count))
|
||||
.type('aggregation')
|
||||
.dataset('event-logs')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -111,6 +120,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
||||
.groundTruth(String(errorCount))
|
||||
.type('aggregation')
|
||||
.dataset('event-logs')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -118,6 +128,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
||||
.groundTruth(String(successCount))
|
||||
.type('aggregation')
|
||||
.dataset('event-logs')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -130,6 +141,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
||||
.groundTruth(String(retryableErrorCount))
|
||||
.type('aggregation')
|
||||
.dataset('event-logs')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -147,6 +159,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
||||
.groundTruth(String(count))
|
||||
.type('filtering')
|
||||
.dataset('event-logs')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -161,6 +174,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
||||
.groundTruth(String(count))
|
||||
.type('filtering')
|
||||
.dataset('event-logs')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -175,6 +189,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
||||
.groundTruth(String(count))
|
||||
.type('filtering')
|
||||
.dataset('event-logs')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -17,6 +17,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
||||
.groundTruth(String(repo.stars))
|
||||
.type('field-retrieval')
|
||||
.dataset('github')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
(repo, getId) => new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -24,6 +25,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
||||
.groundTruth(String(repo.forks))
|
||||
.type('field-retrieval')
|
||||
.dataset('github')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
(repo, getId) => new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -31,6 +33,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
||||
.groundTruth(String(repo.watchers))
|
||||
.type('field-retrieval')
|
||||
.dataset('github')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
(repo, getId) => new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -38,6 +41,8 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
||||
.groundTruth(repo.defaultBranch)
|
||||
.type('field-retrieval')
|
||||
.dataset('github')
|
||||
.answerType('string')
|
||||
.normalize({ caseSensitive: true })
|
||||
.build(),
|
||||
]
|
||||
|
||||
@@ -62,6 +67,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
||||
.groundTruth(String(totalRepos))
|
||||
.type('aggregation')
|
||||
.dataset('github')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -69,6 +75,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
||||
.groundTruth(String(totalStars))
|
||||
.type('aggregation')
|
||||
.dataset('github')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -76,6 +83,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
||||
.groundTruth(String(totalForks))
|
||||
.type('aggregation')
|
||||
.dataset('github')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -83,6 +91,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
||||
.groundTruth(String(Math.round(avgStars)))
|
||||
.type('aggregation')
|
||||
.dataset('github')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -97,6 +106,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
||||
.groundTruth(String(count))
|
||||
.type('aggregation')
|
||||
.dataset('github')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -111,6 +121,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
||||
.groundTruth(String(count))
|
||||
.type('aggregation')
|
||||
.dataset('github')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -125,6 +136,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
||||
.groundTruth(String(count))
|
||||
.type('aggregation')
|
||||
.dataset('github')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -139,6 +151,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
||||
.groundTruth(String(count))
|
||||
.type('aggregation')
|
||||
.dataset('github')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -155,6 +168,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
||||
.groundTruth(String(count))
|
||||
.type('filtering')
|
||||
.dataset('github')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -171,6 +185,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
||||
.groundTruth(String(count))
|
||||
.type('filtering')
|
||||
.dataset('github')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ import { generateEventLogsQuestions } from './event-logs'
|
||||
import { generateGithubQuestions } from './github'
|
||||
import { generateNestedQuestions } from './nested'
|
||||
import { generateNestedConfigQuestions } from './nested-config'
|
||||
import { generateStructuralValidationQuestions } from './structural-validation'
|
||||
import { generateStructureQuestions } from './structure'
|
||||
import { generateTabularQuestions } from './tabular'
|
||||
import { createIdGenerator } from './utils'
|
||||
@@ -47,5 +48,8 @@ export function generateQuestions(): Question[] {
|
||||
// Generate structure-awareness questions (tests format-native affordances)
|
||||
questions.push(...generateStructureQuestions(tabular, nested, analytics, github, eventLogs, getId))
|
||||
|
||||
// Generate structural-validation questions (tests ability to detect corrupted data)
|
||||
questions.push(...generateStructuralValidationQuestions(getId))
|
||||
|
||||
return questions
|
||||
}
|
||||
|
||||
@@ -17,42 +17,52 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
{
|
||||
prompt: 'What is the environment in the configuration?',
|
||||
groundTruth: config.environment,
|
||||
answerType: 'string' as const,
|
||||
},
|
||||
{
|
||||
prompt: 'What is the database host?',
|
||||
groundTruth: config.database.host,
|
||||
answerType: 'string' as const,
|
||||
},
|
||||
{
|
||||
prompt: 'What is the database port?',
|
||||
groundTruth: String(config.database.port),
|
||||
answerType: 'integer' as const,
|
||||
},
|
||||
{
|
||||
prompt: 'What is the maximum connection pool size?',
|
||||
groundTruth: String(config.database.pool.max),
|
||||
answerType: 'integer' as const,
|
||||
},
|
||||
{
|
||||
prompt: 'What is the session duration?',
|
||||
groundTruth: String(config.authentication.session.duration),
|
||||
answerType: 'integer' as const,
|
||||
},
|
||||
{
|
||||
prompt: 'What is the minimum connection pool size?',
|
||||
groundTruth: String(config.database.pool.min),
|
||||
answerType: 'integer' as const,
|
||||
},
|
||||
{
|
||||
prompt: 'What is the connection pool idle timeout?',
|
||||
groundTruth: String(config.database.pool.idleTimeout),
|
||||
answerType: 'integer' as const,
|
||||
},
|
||||
{
|
||||
prompt: 'What is the database name?',
|
||||
groundTruth: config.database.name,
|
||||
answerType: 'string' as const,
|
||||
},
|
||||
{
|
||||
prompt: 'What is the session refresh threshold?',
|
||||
groundTruth: String(config.authentication.session.refreshThreshold),
|
||||
answerType: 'integer' as const,
|
||||
},
|
||||
{
|
||||
prompt: 'What is the version in the configuration?',
|
||||
groundTruth: config.version,
|
||||
answerType: 'string' as const,
|
||||
},
|
||||
]
|
||||
|
||||
@@ -64,6 +74,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
.groundTruth(q.groundTruth)
|
||||
.type('field-retrieval')
|
||||
.dataset('nested-config')
|
||||
.answerType(q.answerType)
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -82,6 +93,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
.groundTruth(String(roleCount))
|
||||
.type('aggregation')
|
||||
.dataset('nested-config')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -89,6 +101,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
.groundTruth(String(groupCount))
|
||||
.type('aggregation')
|
||||
.dataset('nested-config')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -96,6 +109,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
.groundTruth(String(providerCount))
|
||||
.type('aggregation')
|
||||
.dataset('nested-config')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -103,6 +117,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
.groundTruth(String(featureCount))
|
||||
.type('aggregation')
|
||||
.dataset('nested-config')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -110,6 +125,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
.groundTruth(String(replicaCount))
|
||||
.type('aggregation')
|
||||
.dataset('nested-config')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -122,6 +138,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
.groundTruth(String(adminScopeProviderCount))
|
||||
.type('aggregation')
|
||||
.dataset('nested-config')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -134,6 +151,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
.groundTruth(String(enabledFeatures))
|
||||
.type('aggregation')
|
||||
.dataset('nested-config')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -146,6 +164,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
.groundTruth(String(adminPermissions))
|
||||
.type('aggregation')
|
||||
.dataset('nested-config')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -164,6 +183,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
.groundTruth(String(totalPermissions))
|
||||
.type('aggregation')
|
||||
.dataset('nested-config')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -171,6 +191,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
.groundTruth(String(distinctPermissions))
|
||||
.type('aggregation')
|
||||
.dataset('nested-config')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -178,6 +199,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
.groundTruth(String(totalVariants))
|
||||
.type('aggregation')
|
||||
.dataset('nested-config')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -185,6 +207,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
.groundTruth(String(highPriorityReplicas))
|
||||
.type('aggregation')
|
||||
.dataset('nested-config')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -192,6 +215,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
.groundTruth(String(featuresWithHighRollout))
|
||||
.type('aggregation')
|
||||
.dataset('nested-config')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -199,6 +223,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
.groundTruth(String(groupsWithMultipleRoles))
|
||||
.type('aggregation')
|
||||
.dataset('nested-config')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -249,6 +274,7 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
.groundTruth(q.groundTruth)
|
||||
.type('filtering')
|
||||
.dataset('nested-config')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -17,6 +17,8 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
||||
.groundTruth(String(order.total))
|
||||
.type('field-retrieval')
|
||||
.dataset('nested')
|
||||
.answerType('number')
|
||||
.normalize({ decimalPlaces: 2 })
|
||||
.build(),
|
||||
(order, getId) => new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -24,6 +26,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
||||
.groundTruth(order.status)
|
||||
.type('field-retrieval')
|
||||
.dataset('nested')
|
||||
.answerType('string')
|
||||
.build(),
|
||||
]
|
||||
|
||||
@@ -43,6 +46,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
||||
.groundTruth(order.customer.name)
|
||||
.type('field-retrieval')
|
||||
.dataset('nested')
|
||||
.answerType('string')
|
||||
.build(),
|
||||
(order, getId) => new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -50,6 +54,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
||||
.groundTruth(order.customer.email)
|
||||
.type('field-retrieval')
|
||||
.dataset('nested')
|
||||
.answerType('string')
|
||||
.build(),
|
||||
(order, getId) => new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -57,6 +62,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
||||
.groundTruth(order.orderDate || '')
|
||||
.type('field-retrieval')
|
||||
.dataset('nested')
|
||||
.answerType('string')
|
||||
.build(),
|
||||
(order, getId) => new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -64,6 +70,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
||||
.groundTruth(String(order.items.length))
|
||||
.type('field-retrieval')
|
||||
.dataset('nested')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
]
|
||||
|
||||
@@ -94,6 +101,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
||||
.groundTruth(String(count))
|
||||
.type('aggregation')
|
||||
.dataset('nested')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -105,6 +113,8 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
||||
.groundTruth(String(totalRevenue.toFixed(2)))
|
||||
.type('aggregation')
|
||||
.dataset('nested')
|
||||
.answerType('number')
|
||||
.normalize({ decimalPlaces: 2 })
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -112,6 +122,8 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
||||
.groundTruth(String(avgOrderValue.toFixed(2)))
|
||||
.type('aggregation')
|
||||
.dataset('nested')
|
||||
.answerType('number')
|
||||
.normalize({ decimalPlaces: 2 })
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -119,6 +131,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
||||
.groundTruth(String(totalOrders))
|
||||
.type('aggregation')
|
||||
.dataset('nested')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -126,6 +139,8 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
||||
.groundTruth(String(maxOrderValue.toFixed(2)))
|
||||
.type('aggregation')
|
||||
.dataset('nested')
|
||||
.answerType('number')
|
||||
.normalize({ decimalPlaces: 2 })
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -139,6 +154,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
||||
.groundTruth(String(count))
|
||||
.type('aggregation')
|
||||
.dataset('nested')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -156,6 +172,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
||||
.groundTruth(String(count))
|
||||
.type('filtering')
|
||||
.dataset('nested')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -172,6 +189,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
||||
.groundTruth(String(count))
|
||||
.type('filtering')
|
||||
.dataset('nested')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -188,6 +206,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
||||
.groundTruth(String(count))
|
||||
.type('filtering')
|
||||
.dataset('nested')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
|
||||
44
benchmarks/src/questions/structural-validation.ts
Normal file
44
benchmarks/src/questions/structural-validation.ts
Normal file
@@ -0,0 +1,44 @@
|
||||
import type { Question } from '../types'
|
||||
import { QuestionBuilder } from './utils'
|
||||
|
||||
/**
 * Build the structural-validation question set: one YES/NO question per validation fixture.
 *
 * Every question uses the same prompt, asking whether the data is complete and valid.
 * Only the control fixture expects `YES`; the truncated, extra-rows, width-mismatch
 * and missing-fields fixtures all expect `NO`.
 *
 * @remarks
 * These questions probe whether incomplete, truncated, or corrupted data can be detected
 * from structural metadata (TOON's [N] length declarations and {fields} headers):
 * - TOON: explicit [N] and {fields} give something to validate against
 * - CSV: carries no structural metadata to validate against
 * - JSON/YAML: require manual counting and schema inference
 *
 * @param getId - Supplies the id for each generated question; called once per fixture, in fixture order.
 * @returns The generated questions, in fixture order.
 */
export function generateStructuralValidationQuestions(
  getId: () => string,
): Question[] {
  // Fixture datasets paired with the verdict a correct answer must give.
  // `description` is informational only; it is not copied onto the question.
  const fixtures = [
    { dataset: 'structural-validation-control', isValid: true, description: 'Valid complete dataset (control)' },
    { dataset: 'structural-validation-truncated', isValid: false, description: 'Array truncated: 3 rows removed from end' },
    { dataset: 'structural-validation-extra-rows', isValid: false, description: 'Extra rows added beyond declared length' },
    { dataset: 'structural-validation-width-mismatch', isValid: false, description: 'Inconsistent field count (missing salary in row 10)' },
    { dataset: 'structural-validation-missing-fields', isValid: false, description: 'Missing required fields (no email in multiple rows)' },
  ] as const

  // One question per fixture; the prompt is identical, only dataset and expected answer vary
  return fixtures.map((fixture) => {
    const expectedAnswer = fixture.isValid ? 'YES' : 'NO'

    return new QuestionBuilder()
      .id(getId())
      .prompt('Is this data complete and valid? Answer only YES or NO.')
      .groundTruth(expectedAnswer)
      .type('structural-validation')
      .dataset(fixture.dataset)
      .answerType('boolean')
      .build()
  })
}
|
||||
@@ -30,6 +30,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(String(employees.length))
|
||||
.type('structure-awareness')
|
||||
.dataset('tabular')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -42,6 +43,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(employeeFields)
|
||||
.type('structure-awareness')
|
||||
.dataset('tabular')
|
||||
.answerType('csv-list-ordered')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -53,6 +55,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth('email')
|
||||
.type('structure-awareness')
|
||||
.dataset('tabular')
|
||||
.answerType('string')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -65,6 +68,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(lastEmployee.department)
|
||||
.type('structure-awareness')
|
||||
.dataset('tabular')
|
||||
.answerType('string')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -76,6 +80,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(lastEmployee.name)
|
||||
.type('structure-awareness')
|
||||
.dataset('tabular')
|
||||
.answerType('string')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -87,6 +92,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth('7')
|
||||
.type('structure-awareness')
|
||||
.dataset('tabular')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -100,6 +106,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(String(orders.length))
|
||||
.type('structure-awareness')
|
||||
.dataset('nested')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -112,6 +119,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(orderFields)
|
||||
.type('structure-awareness')
|
||||
.dataset('nested')
|
||||
.answerType('csv-list-ordered')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -126,6 +134,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(String(orderWithManyItems.items.length))
|
||||
.type('structure-awareness')
|
||||
.dataset('nested')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -138,6 +147,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(itemFields)
|
||||
.type('structure-awareness')
|
||||
.dataset('nested')
|
||||
.answerType('csv-list-ordered')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -150,6 +160,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(lastOrder.status)
|
||||
.type('structure-awareness')
|
||||
.dataset('nested')
|
||||
.answerType('string')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -162,6 +173,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(customerFields)
|
||||
.type('structure-awareness')
|
||||
.dataset('nested')
|
||||
.answerType('csv-list-ordered')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -175,6 +187,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(String(metrics.length))
|
||||
.type('structure-awareness')
|
||||
.dataset('analytics')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -187,6 +200,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(metricFields)
|
||||
.type('structure-awareness')
|
||||
.dataset('analytics')
|
||||
.answerType('csv-list-ordered')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -198,6 +212,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth('revenue')
|
||||
.type('structure-awareness')
|
||||
.dataset('analytics')
|
||||
.answerType('string')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -210,6 +225,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(lastMetric.date)
|
||||
.type('structure-awareness')
|
||||
.dataset('analytics')
|
||||
.answerType('string')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -221,6 +237,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth('6')
|
||||
.type('structure-awareness')
|
||||
.dataset('analytics')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -234,6 +251,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(String(repos.length))
|
||||
.type('structure-awareness')
|
||||
.dataset('github')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -246,6 +264,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(repoFields)
|
||||
.type('structure-awareness')
|
||||
.dataset('github')
|
||||
.answerType('csv-list-ordered')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -257,6 +276,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth('forks')
|
||||
.type('structure-awareness')
|
||||
.dataset('github')
|
||||
.answerType('string')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -269,6 +289,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(lastRepo.name)
|
||||
.type('structure-awareness')
|
||||
.dataset('github')
|
||||
.answerType('string')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -280,6 +301,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth('11')
|
||||
.type('structure-awareness')
|
||||
.dataset('github')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -293,6 +315,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(String(logs.length))
|
||||
.type('structure-awareness')
|
||||
.dataset('event-logs')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -305,6 +328,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(logFields)
|
||||
.type('structure-awareness')
|
||||
.dataset('event-logs')
|
||||
.answerType('csv-list-unordered')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -317,6 +341,7 @@ export function generateStructureQuestions(
|
||||
.groundTruth(lastLog.level)
|
||||
.type('structure-awareness')
|
||||
.dataset('event-logs')
|
||||
.answerType('string')
|
||||
.build(),
|
||||
)
|
||||
|
||||
|
||||
@@ -17,6 +17,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
||||
.groundTruth(String(emp.salary))
|
||||
.type('field-retrieval')
|
||||
.dataset('tabular')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
(emp, getId) => new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -24,6 +25,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
||||
.groundTruth(emp.department)
|
||||
.type('field-retrieval')
|
||||
.dataset('tabular')
|
||||
.answerType('string')
|
||||
.build(),
|
||||
(emp, getId) => new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -31,6 +33,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
||||
.groundTruth(emp.email)
|
||||
.type('field-retrieval')
|
||||
.dataset('tabular')
|
||||
.answerType('string')
|
||||
.build(),
|
||||
(emp, getId) => new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -38,6 +41,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
||||
.groundTruth(String(emp.yearsExperience))
|
||||
.type('field-retrieval')
|
||||
.dataset('tabular')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
(emp, getId) => new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -45,6 +49,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
||||
.groundTruth(emp.active ? 'yes' : 'no')
|
||||
.type('field-retrieval')
|
||||
.dataset('tabular')
|
||||
.answerType('boolean')
|
||||
.build(),
|
||||
]
|
||||
|
||||
@@ -67,6 +72,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
||||
.groundTruth(String(count))
|
||||
.type('aggregation')
|
||||
.dataset('tabular')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -81,6 +87,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
||||
.groundTruth(String(count))
|
||||
.type('aggregation')
|
||||
.dataset('tabular')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -98,6 +105,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
||||
.groundTruth(String(totalEmployees))
|
||||
.type('aggregation')
|
||||
.dataset('tabular')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -105,6 +113,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
||||
.groundTruth(String(avgSalary))
|
||||
.type('aggregation')
|
||||
.dataset('tabular')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -112,6 +121,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
||||
.groundTruth(String(activeCount))
|
||||
.type('aggregation')
|
||||
.dataset('tabular')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
@@ -119,6 +129,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
||||
.groundTruth(String(inactiveCount))
|
||||
.type('aggregation')
|
||||
.dataset('tabular')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
|
||||
@@ -134,6 +145,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
||||
.groundTruth(String(count))
|
||||
.type('filtering')
|
||||
.dataset('tabular')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -148,6 +160,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
||||
.groundTruth(String(count))
|
||||
.type('filtering')
|
||||
.dataset('tabular')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -164,6 +177,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
||||
.groundTruth(String(count))
|
||||
.type('filtering')
|
||||
.dataset('tabular')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
@@ -178,6 +192,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
||||
.groundTruth(String(count))
|
||||
.type('filtering')
|
||||
.dataset('tabular')
|
||||
.answerType('integer')
|
||||
.build(),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import type { AnswerType, NormalizationOptions } from '../normalize'
|
||||
import type { Question } from '../types'
|
||||
|
||||
// Constants for sampling strides
|
||||
@@ -52,10 +53,21 @@ export class QuestionBuilder {
|
||||
return this
|
||||
}
|
||||
|
||||
/**
 * Record the answer type on the question under construction.
 *
 * @param kind - Answer type to store on the question.
 * @returns This builder, so calls can be chained.
 */
answerType(kind: AnswerType): this {
  const draft = this.question
  draft.answerType = kind

  return this
}
|
||||
|
||||
/**
 * Store normalization options on the question under construction.
 *
 * @param options - Subset of normalization options to store; replaces any previously stored options.
 * @returns This builder, so calls can be chained.
 */
normalize(options: Partial<NormalizationOptions>): this {
  const draft = this.question
  draft.normalizationOptions = options

  return this
}
|
||||
|
||||
/**
 * Finish building and return the question.
 *
 * @returns The accumulated question.
 * @throws Error with message `Incomplete question` when `id`, `prompt`, `groundTruth`,
 * `type` or `dataset` is missing or otherwise falsy.
 */
build(): Question {
  // Fields that must be set (and truthy) before a question may be emitted.
  // NOTE(review): the check is truthiness-based, so an empty-string value
  // (e.g. an empty ground truth) is rejected as well — confirm this is intended.
  const requiredFields = ['id', 'prompt', 'groundTruth', 'type', 'dataset'] as const

  const hasMissingField = requiredFields.some(field => !this.question[field])
  if (hasMissingField) {
    throw new Error('Incomplete question')
  }

  return this.question as Question
}
|
||||
}
|
||||
@@ -65,7 +77,7 @@ export class QuestionBuilder {
|
||||
*/
|
||||
export function rotateQuestions<T>(
|
||||
items: T[],
|
||||
generators: Array<(item: T, getId: () => string) => Question>,
|
||||
generators: ((item: T, getId: () => string) => Question)[],
|
||||
limit: number,
|
||||
stride: number,
|
||||
getId: () => string,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import type { Dataset, EfficiencyRanking, EvaluationResult, FormatResult, Question } from './types'
|
||||
import { FORMATTER_DISPLAY_NAMES, QUESTION_TYPE_LABELS, QUESTION_TYPES } from './constants'
|
||||
import { ACCURACY_DATASETS } from './datasets'
|
||||
import { models } from './evaluate'
|
||||
import { models, PRIMERS } from './evaluate'
|
||||
import { supportsCSV } from './formatters'
|
||||
import { generateQuestions } from './questions'
|
||||
import { createProgressBar, tokenize } from './utils'
|
||||
@@ -10,6 +10,9 @@ const EFFICIENCY_CHART_STYLE: 'vertical' | 'horizontal' = 'horizontal'
|
||||
|
||||
/**
|
||||
* Calculate token counts for all format+dataset combinations
|
||||
*
|
||||
* @remarks
|
||||
* Includes primer tokens for fairer comparison across formats
|
||||
*/
|
||||
export function calculateTokenCounts(
|
||||
formatters: Record<string, (data: unknown) => string>,
|
||||
@@ -23,8 +26,11 @@ export function calculateTokenCounts(
|
||||
continue
|
||||
|
||||
const formattedData = formatter(dataset.data)
|
||||
const primer = PRIMERS[formatName] ?? ''
|
||||
// Include primer in token count for fair comparison
|
||||
const fullPrompt = primer ? `${primer}\n\n${formattedData}` : formattedData
|
||||
const key = `${formatName}-${dataset.name}`
|
||||
tokenCounts[key] = tokenize(formattedData)
|
||||
tokenCounts[key] = tokenize(fullPrompt)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -137,9 +143,12 @@ function generateEfficiencyRankingReport(
|
||||
): string {
|
||||
const toon = formatResults.find(r => r.format === 'toon')
|
||||
const json = formatResults.find(r => r.format === 'json-pretty')
|
||||
const csv = formatResults.find(r => r.format === 'csv')
|
||||
|
||||
// Build efficiency ranking (accuracy per 1k tokens)
|
||||
const efficiencyRanking = formatResults
|
||||
// Exclude CSV since it only supports a subset of datasets (~half the questions)
|
||||
.filter(fr => fr.format !== 'csv')
|
||||
.map((fr) => {
|
||||
const efficiency = (fr.accuracy * 100) / (fr.totalTokens / 1000)
|
||||
return {
|
||||
@@ -163,6 +172,12 @@ function generateEfficiencyRankingReport(
|
||||
summary = `TOON achieves ${toonVsJson} while using ${tokenSavings}.`
|
||||
}
|
||||
|
||||
// Add CSV note if available
|
||||
let csvNote = ''
|
||||
if (csv) {
|
||||
csvNote = `\n\n**Note on CSV:** Excluded from ranking as it only supports ${csv.totalCount}/209 questions (flat tabular data only). While CSV is highly token-efficient for simple tabular data, it cannot represent nested structures that other formats handle.`
|
||||
}
|
||||
|
||||
return `
|
||||
Each format's overall performance, balancing accuracy against token cost:
|
||||
|
||||
@@ -170,7 +185,7 @@ Each format's overall performance, balancing accuracy against token cost:
|
||||
${efficiencyChart}
|
||||
\`\`\`
|
||||
|
||||
${summary}
|
||||
${summary}${csvNote}
|
||||
`.trim()
|
||||
}
|
||||
|
||||
@@ -210,11 +225,13 @@ function generateDetailedAccuracyReport(
|
||||
const aggregationCount = questions.filter(q => q.type === 'aggregation').length
|
||||
const filteringCount = questions.filter(q => q.type === 'filtering').length
|
||||
const structureAwarenessCount = questions.filter(q => q.type === 'structure-awareness').length
|
||||
const structuralValidationCount = questions.filter(q => q.type === 'structural-validation').length
|
||||
|
||||
const fieldRetrievalPercent = ((fieldRetrievalCount / totalQuestions) * 100).toFixed(0)
|
||||
const aggregationPercent = ((aggregationCount / totalQuestions) * 100).toFixed(0)
|
||||
const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)
|
||||
const structureAwarenessPercent = ((structureAwarenessCount / totalQuestions) * 100).toFixed(0)
|
||||
const structuralValidationPercent = ((structuralValidationCount / totalQuestions) * 100).toFixed(0)
|
||||
|
||||
// Calculate dataset sizes
|
||||
const tabularSize = ACCURACY_DATASETS.find(d => d.name === 'tabular')?.data.employees?.length || 0
|
||||
@@ -263,8 +280,9 @@ This benchmark tests **LLM comprehension and data retrieval accuracy** across di
|
||||
|
||||
#### Datasets Tested
|
||||
|
||||
Six datasets designed to test different structural patterns:
|
||||
Eleven datasets designed to test different structural patterns and validation capabilities:
|
||||
|
||||
**Primary datasets:**
|
||||
1. **Tabular** (${tabularSize} employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
|
||||
2. **Nested** (${nestedSize} e-commerce orders): Complex structures with nested customer objects and item arrays.
|
||||
3. **Analytics** (${analyticsSize} days of metrics): Time-series data with dates and numeric values.
|
||||
@@ -272,9 +290,16 @@ Six datasets designed to test different structural patterns:
|
||||
5. **Event Logs** (${eventLogsSize} logs): Semi-uniform data with ~50% flat logs and ~50% with nested error objects.
|
||||
6. **Nested Config** (${nestedConfigSize} configuration): Deeply nested configuration with minimal tabular eligibility.
|
||||
|
||||
**Structural validation datasets:**
|
||||
7. **Control**: Valid complete dataset (baseline for validation)
|
||||
8. **Truncated**: Array with 3 rows removed from end (tests [N] length detection)
|
||||
9. **Extra rows**: Array with 3 additional rows beyond declared length
|
||||
10. **Width mismatch**: Inconsistent field count (missing salary in row 10)
|
||||
11. **Missing fields**: Systematic field omissions (no email in multiple rows)
|
||||
|
||||
#### Question Types
|
||||
|
||||
${totalQuestions} questions are generated dynamically across four categories:
|
||||
${totalQuestions} questions are generated dynamically across five categories:
|
||||
|
||||
- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
||||
- Example: "What is Alice's salary?" → \`75000\`
|
||||
@@ -295,11 +320,16 @@ ${totalQuestions} questions are generated dynamically across four categories:
|
||||
- Example: "List the field names for employees" → \`id, name, email, department, salary, yearsExperience, active\`
|
||||
- Example: "What is the department of the last employee?" → \`Sales\`
|
||||
|
||||
- **Structural validation (${structuralValidationPercent}%)**: Tests ability to detect incomplete, truncated, or corrupted data using structural metadata
|
||||
- Example: "Is this data complete and valid?" → \`YES\` (control dataset) or \`NO\` (corrupted datasets)
|
||||
- Tests TOON's [N] length validation and {fields} consistency checking
|
||||
- Demonstrates CSV's lack of structural validation capabilities
|
||||
|
||||
#### Evaluation Process
|
||||
|
||||
1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => FORMATTER_DISPLAY_NAMES[f.format] || f.format).join(', ')}).
|
||||
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
||||
3. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).
|
||||
3. **Validate deterministically**: Answers are validated using type-aware comparison (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`) without requiring an LLM judge.
|
||||
|
||||
#### Models & Configuration
|
||||
|
||||
@@ -376,9 +406,12 @@ function generateDatasetBreakdown(
|
||||
questions: Question[],
|
||||
tokenCounts: Record<string, number>,
|
||||
): string {
|
||||
// Build question ID to dataset mapping for O(1) lookups
|
||||
const questionDatasetMap = new Map(questions.map(q => [q.id, q.dataset]))
|
||||
|
||||
return ACCURACY_DATASETS.map((dataset) => {
|
||||
const datasetResults = formatResults.map((fr) => {
|
||||
const datasetFormatResults = results.filter(r => r.questionId.includes(dataset.name) || questions.find(q => q.id === r.questionId)?.dataset === dataset.name)
|
||||
const datasetFormatResults = results.filter(r => questionDatasetMap.get(r.questionId) === dataset.name)
|
||||
if (datasetFormatResults.length === 0)
|
||||
return undefined
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import type { DATASET_NAMES, QUESTION_TYPES, STRUCTURE_CLASSES } from './constants'
|
||||
import type { AnswerType, NormalizationOptions } from './normalize'
|
||||
|
||||
/** Union of the question-type string literals listed in `QUESTION_TYPES`. */
export type QuestionType = typeof QUESTION_TYPES[number]
|
||||
/** Union of the dataset-name string literals listed in `DATASET_NAMES`. */
export type DatasetName = typeof DATASET_NAMES[number]
|
||||
@@ -23,6 +24,15 @@ export interface Question {
|
||||
groundTruth: string
|
||||
type: QuestionType
|
||||
dataset: DatasetName
|
||||
/**
|
||||
* Expected answer kind for deterministic comparison.
|
||||
* @default 'string'
|
||||
*/
|
||||
answerType?: AnswerType
|
||||
/**
|
||||
* Options for answer normalization and comparison.
|
||||
*/
|
||||
normalizationOptions?: Partial<NormalizationOptions>
|
||||
}
|
||||
|
||||
export interface EvaluationResult {
|
||||
|
||||
Reference in New Issue
Block a user