chore(benchmarks): replace LLM-as-judge, new structural validation

This commit is contained in:
Johann Schopplich
2025-11-07 21:28:21 +01:00
parent 9a519dd114
commit acca69c64a
25 changed files with 1311 additions and 396 deletions

View File

@@ -144,6 +144,30 @@ export interface NestedConfig {
}
}
/**
 * Product structure for large uniform arrays.
 *
 * Row shape returned (inside `{ products: Product[] }`) by `generateProducts`.
 */
export interface Product {
/** Identifier; `generateProducts` emits `SKU-` followed by a 1-based sequence number zero-padded to 6 digits. */
sku: string
/** Product name. */
name: string
/** Category label; `generateProducts` cycles through a fixed list of six categories by row index. */
category: string
/** Price; `generateProducts` requests values between 5 and 500 and converts them to a number. */
price: number
/** Quantity; `generateProducts` draws an integer between 0 and 1000. */
qty: number
/** Date portion (the text before `T`) of an ISO 8601 timestamp, as emitted by `generateProducts`. */
lastUpdated: string
}
/**
 * Internal types for structural validation pattern generation.
 *
 * `StructuralValidationType` names the kind of structural corruption a fixture carries.
 */
type StructuralValidationType = 'truncated' | 'extra-rows' | 'width-mismatch' | 'missing-fields'
/**
 * A single fixture produced by `generateStructuralValidationFixtures`.
 */
interface StructuralValidationFixture {
/**
 * Corruption category.
 * NOTE(review): the valid control fixture is also tagged `'truncated'` because the union
 * has no dedicated control member — confirm whether a separate member is wanted.
 */
type: StructuralValidationType
/** Human-readable summary of what the fixture contains. */
description: string
/** Fixture payload; the generator stores the rows under an `employees` key. */
data: Record<string, unknown>
/** `true` for the uncorrupted control fixture, `false` for the deliberately corrupted ones. */
isValid: boolean
}
/**
* Generate analytics time-series data
*/
@@ -505,6 +529,100 @@ export function generateNestedConfig(): NestedConfig {
}
}
/**
 * Build a uniform array of synthetic products.
 *
 * @remarks
 * Intended for large inputs (5000+ rows) to exercise TOON's token efficiency
 * and structural reliability at scale.
 *
 * @param count - Number of rows to generate
 * @returns Object holding the generated rows under `products`
 */
export function generateProducts(count: number): { products: Product[] } {
  const categories = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books', 'Toys'] as const

  // Builds one row; faker is consulted in a fixed order (name, price, qty, date)
  // so a seeded generator yields the same sequence of values.
  const buildRow = (_unused: unknown, rowIndex: number): Product => {
    const sequence = String(rowIndex + 1).padStart(6, '0')
    const name = faker.commerce.productName()
    const price = Number(faker.commerce.price({ min: 5, max: 500 }))
    const qty = faker.number.int({ min: 0, max: 1000 })
    const updatedIso = faker.date.recent({ days: 30 }).toISOString()

    return {
      sku: `SKU-${sequence}`,
      name,
      // Round-robin over the category list
      category: categories[rowIndex % categories.length]!,
      price,
      qty,
      // Keep only the date part in front of the `T` separator
      lastUpdated: updatedIso.slice(0, updatedIso.indexOf('T')),
    }
  }

  const products = Array.from({ length: count }, buildRow)
  return { products }
}
/**
 * Build the structural validation fixtures on top of a 20-employee dataset.
 *
 * @remarks
 * Returns one untouched control dataset followed by four deliberately corrupted
 * variants (truncated, extra rows, width mismatch, missing fields). They are meant
 * to exercise TOON's structural validation via [N] length declarations and
 * {fields} headers. Internal helper; not exported.
 *
 * NOTE(review): the control fixture is tagged `'truncated'` because
 * `StructuralValidationType` has no dedicated control member.
 *
 * @returns Fixtures in the order: control, truncated, extra rows, width mismatch, missing fields
 */
function generateStructuralValidationFixtures(): StructuralValidationFixture[] {
  const { employees } = generateEmployees(20)

  // Control: the untouched rows
  const control: StructuralValidationFixture = {
    type: 'truncated',
    description: 'Valid complete dataset (control)',
    data: { employees },
    isValid: true,
  }

  // Last 3 rows dropped; [N] won't match actual row count in TOON
  const truncated: StructuralValidationFixture = {
    type: 'truncated',
    description: 'Array truncated: 3 rows removed from end',
    data: { employees: employees.slice(0, -3) },
    isValid: false,
  }

  // 3 freshly generated rows appended; [N] won't match actual row count in TOON
  const surplusRows = generateEmployees(3).employees
  const extraRows: StructuralValidationFixture = {
    type: 'extra-rows',
    description: 'Extra rows added beyond declared length',
    data: { employees: [...employees, ...surplusRows] },
    isValid: false,
  }

  // Row 10 (index 9) loses its salary field, so not all objects share the
  // same fields (tabular requirement)
  const rowsWithSalaryGap = employees.map((employee, rowIndex) => {
    if (rowIndex !== 9)
      return employee
    const { salary, ...withoutSalary } = employee
    return withoutSalary
  })
  const widthMismatch: StructuralValidationFixture = {
    type: 'width-mismatch',
    description: 'Inconsistent field count (missing salary in row 10)',
    data: { employees: rowsWithSalaryGap },
    isValid: false,
  }

  // Every 5th row (indices 0, 5, 10, ...) loses its email field, so not all
  // objects share the same fields (tabular requirement)
  const rowsWithEmailGaps = employees.map((employee, rowIndex) => {
    if (rowIndex % 5 !== 0)
      return employee
    const { email, ...withoutEmail } = employee
    return withoutEmail
  })
  const missingFields: StructuralValidationFixture = {
    type: 'missing-fields',
    description: 'Missing required fields (no email in multiple rows)',
    data: { employees: rowsWithEmailGaps },
    isValid: false,
  }

  return [control, truncated, extraRows, widthMismatch, missingFields]
}
/**
* Event logs dataset: Semi-uniform structure
*
@@ -539,6 +657,34 @@ const nestedConfigDataset: Dataset = {
},
}
/**
 * Dataset names for the structural validation fixtures, in the same order as the
 * fixtures returned by `generateStructuralValidationFixtures`.
 */
const structuralValidationDatasetNames = [
  'structural-validation-control',
  'structural-validation-truncated',
  'structural-validation-extra-rows',
  'structural-validation-width-mismatch',
  'structural-validation-missing-fields',
] as const

/**
 * Structural validation datasets: Tests ability to detect incomplete, truncated, or corrupted data
 *
 * @remarks
 * These datasets test TOON's structural validation advantages via [N] length declarations
 * and {fields} headers. CSV is included to demonstrate its lack of structural metadata.
 *
 * Names are assigned by position, so the name table and the fixture list must stay
 * the same length and order. A fixture without a matching name fails fast at module
 * load instead of producing a dataset whose `name` is `undefined`.
 *
 * NOTE(review): every fixture gets the same metadata, including the width-mismatch and
 * missing-fields fixtures whose rows are not uniform — confirm that is intended.
 */
const structuralValidationDatasets: Dataset[] = generateStructuralValidationFixtures().map((fixture, index) => {
  const name = structuralValidationDatasetNames[index]
  if (name === undefined) {
    throw new Error(
      `No dataset name registered for structural validation fixture at index ${index} ("${fixture.description}")`,
    )
  }

  return {
    name,
    description: fixture.description,
    data: fixture.data,
    metadata: {
      supportsCSV: true, // Include CSV to show it can't validate structure
      structureClass: 'uniform',
      tabularEligibility: 100,
    },
  }
})
/**
* Datasets for accuracy benchmarks (smaller sizes for faster evaluation)
*/
@@ -549,6 +695,7 @@ export const ACCURACY_DATASETS: Dataset[] = [
githubDataset, // 100 repos
eventLogsDataset, // 75 logs
nestedConfigDataset, // 1 config
...structuralValidationDatasets, // 5 validation fixtures
]
/**