chore(benchmarks): add structure-awareness questions

This commit is contained in:
Johann Schopplich
2025-11-07 09:03:51 +01:00
parent 853c3babea
commit 89df613059
13 changed files with 522 additions and 67 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -4,37 +4,11 @@ import * as url from 'node:url'
export const ROOT_DIR: string = url.fileURLToPath(new URL('../../', import.meta.url)) export const ROOT_DIR: string = url.fileURLToPath(new URL('../../', import.meta.url))
export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.meta.url)) export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.meta.url))
/**
* Model-specific RPM (requests per minute) limits to handle API quotas
*
* @remarks
* Set `undefined` for models without specific limits.
*/
/// keep-sorted
export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
'claude-haiku-4-5-20251001': 50,
'gemini-2.5-flash': 25,
'gpt-5-nano': 50,
'grok-4-fast-non-reasoning': 50,
}
/** /**
* Default concurrency for parallel evaluations to prevent bursting * Default concurrency for parallel evaluations to prevent bursting
*/ */
export const DEFAULT_CONCURRENCY = 10 export const DEFAULT_CONCURRENCY = 10
/**
* Display names for data format types
*/
export const FORMATTER_DISPLAY_NAMES: Record<string, string> = {
'json-pretty': 'JSON',
'json-compact': 'JSON compact',
'toon': 'TOON',
'csv': 'CSV',
'xml': 'XML',
'yaml': 'YAML',
} as const
/** /**
* Enable dry run mode for quick testing with limited AI requests * Enable dry run mode for quick testing with limited AI requests
* *
@@ -51,12 +25,80 @@ export const DRY_RUN_LIMITS = {
maxQuestions: 10, maxQuestions: 10,
} }
/**
 * Model-specific RPM (requests per minute) limits to handle API quotas
 *
 * @remarks
 * Keys are model identifier strings; values are the request-per-minute cap for that model.
 * Set `undefined` for models without specific limits (the value type allows it, although
 * every entry currently listed carries a numeric limit).
 * Entries are kept in alphabetical order; the `keep-sorted` directive below marks this.
 */
/// keep-sorted
export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
'claude-haiku-4-5-20251001': 50,
'gemini-2.5-flash': 25,
'gpt-5-nano': 50,
'grok-4-fast-non-reasoning': 50,
}
/**
 * Display names for data format types
 *
 * @remarks
 * Maps a formatter key (e.g. `json-pretty`) to the label shown in generated reports.
 * The map is intentionally typed as `Record<string, string>` so it can be indexed with an
 * arbitrary format name; callers fall back to the raw key when a name is not listed
 * (`FORMATTER_DISPLAY_NAMES[format] || format`).
 *
 * No `as const` here: the explicit annotation already widens the type, so the assertion
 * would have no effect and only suggest literal/readonly typing that does not exist.
 */
export const FORMATTER_DISPLAY_NAMES: Record<string, string> = {
  'json-pretty': 'JSON',
  'json-compact': 'JSON compact',
  'toon': 'TOON',
  'csv': 'CSV',
  'xml': 'XML',
  'yaml': 'YAML',
}
/**
 * Question type identifiers
 *
 * @remarks
 * Declared `as const` so the element union can be derived as a type
 * (`typeof QUESTION_TYPES[number]`). The array order is also the row order used when
 * iterating over question types.
 */
export const QUESTION_TYPES = [
  'field-retrieval',
  'aggregation',
  'filtering',
  'structure-awareness',
] as const

/**
 * Display names for question types
 *
 * @remarks
 * `satisfies` ties the keys to {@link QUESTION_TYPES}: adding a question type without a
 * label (or misspelling a key) is a compile-time error, while the literal value types
 * from `as const` are preserved.
 */
export const QUESTION_TYPE_LABELS = {
  'field-retrieval': 'Field Retrieval',
  'aggregation': 'Aggregation',
  'filtering': 'Filtering',
  'structure-awareness': 'Structure Awareness',
} as const satisfies Record<(typeof QUESTION_TYPES)[number], string>
/**
 * Dataset identifiers
 *
 * @remarks
 * Declared `as const` so the element union can be derived as a type; `DatasetName` in
 * `./types` is defined as `typeof DATASET_NAMES[number]` and is used for `Dataset.name`
 * and `Question.dataset`.
 */
export const DATASET_NAMES = [
'tabular',
'nested',
'analytics',
'github',
'event-logs',
'nested-config',
] as const
/**
 * Structure class identifiers
 *
 * @remarks
 * Declared `as const` so the element union can be derived as a type; `StructureClass` in
 * `./types` is defined as `typeof STRUCTURE_CLASSES[number]` and is used for
 * `DatasetMetadata.structureClass`.
 */
export const STRUCTURE_CLASSES = [
'uniform',
'semi-uniform',
'nested',
'deep',
] as const
/** /**
* Threshold values for filtering and aggregation questions * Threshold values for filtering and aggregation questions
*/ */
export const QUESTION_THRESHOLDS = { export const QUESTION_THRESHOLDS = {
tabular: { tabular: {
salaryRanges: [60000, 80000, 100000, 120000], salaryRanges: [60000, 80000, 100000],
experienceYears: [5, 10, 15, 20], experienceYears: [5, 10, 15, 20],
departmentSalaryThreshold: 80000, departmentSalaryThreshold: 80000,
departmentExperienceThreshold: 10, departmentExperienceThreshold: 10,
@@ -68,11 +110,11 @@ export const QUESTION_THRESHOLDS = {
totalThresholdsForItems: [300, 500], totalThresholdsForItems: [300, 500],
}, },
analytics: { analytics: {
views: [5000, 7000], views: [6000],
conversions: [10, 30], conversions: [20],
viewsForFiltering: [6000, 7000], viewsForFiltering: [6000, 7000],
conversionsForFiltering: 15, conversionsForFiltering: 15,
revenueThresholds: [500, 1000, 1500, 2000, 2500], revenueThresholds: [1000, 1500, 2000],
viewsThresholdForRevenue: 6000, viewsThresholdForRevenue: 6000,
clicksForFiltering: [250, 400], clicksForFiltering: [250, 400],
conversionsForClickFiltering: 15, conversionsForClickFiltering: 15,
@@ -81,8 +123,8 @@ export const QUESTION_THRESHOLDS = {
}, },
github: { github: {
stars: [100000, 150000, 200000], stars: [100000, 150000, 200000],
forks: [20000, 35000, 50000], forks: [20000, 35000],
watchers: [5000, 8000], watchers: [8000],
starForkCombinations: [ starForkCombinations: [
{ stars: 75000, forks: 15000 }, { stars: 75000, forks: 15000 },
{ stars: 100000, forks: 20000 }, { stars: 100000, forks: 20000 },
@@ -101,18 +143,18 @@ export const QUESTION_THRESHOLDS = {
*/ */
export const QUESTION_LIMITS = { export const QUESTION_LIMITS = {
tabular: { tabular: {
fieldRetrieval: 14, fieldRetrieval: 12,
aggregationDepartments: 4, aggregationDepartments: 3,
filteringMultiConditionDepartments: 5, filteringMultiConditionDepartments: 5,
filteringExperience: 3, filteringExperience: 3,
filteringDepartmentExp: 3, filteringDepartmentExp: 3,
filteringDepartmentActive: 3, filteringDepartmentActive: 2,
}, },
nested: { nested: {
fieldRetrievalOrders: 8, fieldRetrievalOrders: 8,
fieldRetrievalCustomers: 10, fieldRetrievalCustomers: 8,
aggregationStatuses: 5, aggregationStatuses: 3,
filteringStatusAndValue: 5, filteringStatusAndValue: 4,
filteringStatusAndItems: 3, filteringStatusAndItems: 3,
}, },
analytics: { analytics: {
@@ -121,16 +163,17 @@ export const QUESTION_LIMITS = {
github: { github: {
fieldRetrievalRepos: 11, fieldRetrievalRepos: 11,
aggregationBranches: 2, aggregationBranches: 2,
filteringStarsAndForks: 8, filteringStarsAndForks: 3,
}, },
eventLogs: { eventLogs: {
fieldRetrieval: 10, fieldRetrieval: 10,
aggregationEndpoints: 4, aggregationEndpoints: 2,
filteringLevelAndStatus: 3, filteringLevelAndStatus: 3,
filteringEndpointAndStatus: 3, filteringEndpointAndStatus: 3,
filteringEndpointRetryable: 2,
}, },
nestedConfig: { nestedConfig: {
fieldRetrieval: 10, fieldRetrieval: 10,
filteringComplex: 6, filteringComplex: 5,
}, },
} as const } as const

View File

@@ -181,7 +181,7 @@ export function generateAnalyticsData(days: number, startDate = '2025-01-01'): {
/** /**
* Generate employee data (uniform tabular structure) * Generate employee data (uniform tabular structure)
*/ */
const departments: readonly string[] = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] as const const departments = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] as const
function generateEmployees(count: number): { employees: Employee[] } { function generateEmployees(count: number): { employees: Employee[] } {
return { return {

View File

@@ -16,6 +16,33 @@ export const models: LanguageModelV2[] = [
xai('grok-4-fast-non-reasoning'), xai('grok-4-fast-non-reasoning'),
] ]
/**
 * Format primers
 *
 * @remarks
 * Neutral descriptions to help models parse each format.
 * Keyed by format name. `evaluateQuestion` looks the primer up by `formatName`, falls back
 * to an empty string for unknown formats, and places it at the top of the prompt, before
 * the data block. The strings are sent to the model verbatim, so editing them changes the
 * prompt.
 */
export const PRIMERS: Record<string, string> = {
'toon': 'TOON: Indentation-based. Arrays declare length and fields (e.g., items[N]{f1,f2}:). Rows use single delimiter. Values may be quoted.',
'json-pretty': 'JSON: Strict JSON objects/arrays with repeated keys per row.',
'json-compact': 'JSON (compact): Strict JSON without extra whitespace.',
'yaml': 'YAML: Indentation-based key/value and lists (- items).',
'xml': 'XML: Tag-based tree structure with nested elements.',
'csv': 'CSV: Header row, comma-separated values. First row contains field names.',
}
/**
 * Code fence language tags for proper syntax highlighting
 *
 * @remarks
 * Keyed by format name. `evaluateQuestion` appends the tag directly after the opening
 * triple backticks of the data block and uses an empty string (an untagged fence) for
 * formats that are not listed. Both JSON variants share the `json` tag.
 */
export const FENCE: Record<string, string> = {
'toon': 'toon',
'json-pretty': 'json',
'json-compact': 'json',
'yaml': 'yaml',
'xml': 'xml',
'csv': 'csv',
}
/** /**
* Evaluate a single question with a specific format and model * Evaluate a single question with a specific format and model
*/ */
@@ -33,10 +60,15 @@ export async function evaluateQuestion(
model: LanguageModelV2 model: LanguageModelV2
}, },
): Promise<EvaluationResult> { ): Promise<EvaluationResult> {
const primer = PRIMERS[formatName] ?? ''
const fence = FENCE[formatName] ?? ''
const prompt = ` const prompt = `
${primer}
Given the following data in ${formatName} format: Given the following data in ${formatName} format:
\`\`\` \`\`\`${fence}
${formattedData} ${formattedData}
\`\`\` \`\`\`

View File

@@ -166,7 +166,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
} }
// Filtering: endpoint AND retryable error // Filtering: endpoint AND retryable error
for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointAndStatus)) { for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointRetryable)) {
const count = logs.filter(l => l.endpoint === endpoint && l.error?.retryable === true).length const count = logs.filter(l => l.endpoint === endpoint && l.error?.retryable === true).length
questions.push( questions.push(
new QuestionBuilder() new QuestionBuilder()

View File

@@ -6,11 +6,12 @@ import { generateEventLogsQuestions } from './event-logs'
import { generateGithubQuestions } from './github' import { generateGithubQuestions } from './github'
import { generateNestedQuestions } from './nested' import { generateNestedQuestions } from './nested'
import { generateNestedConfigQuestions } from './nested-config' import { generateNestedConfigQuestions } from './nested-config'
import { generateStructureQuestions } from './structure'
import { generateTabularQuestions } from './tabular' import { generateTabularQuestions } from './tabular'
import { createIdGenerator } from './utils' import { createIdGenerator } from './utils'
/** /**
* Generate ~200 questions from all datasets * Generate questions from all datasets
* *
* @remarks * @remarks
* - Field Retrieval: Direct field access with no computation * - Field Retrieval: Direct field access with no computation
@@ -19,6 +20,8 @@ import { createIdGenerator } from './utils'
* Examples: "How many X?", "What is the total/average?", "How many X > threshold?" * Examples: "How many X?", "What is the total/average?", "How many X > threshold?"
* - Filtering: Multi-condition queries requiring complex logical operations * - Filtering: Multi-condition queries requiring complex logical operations
* Examples: "How many X WHERE condition1 AND condition2?" * Examples: "How many X WHERE condition1 AND condition2?"
* - Structure Awareness: Tests format-native structural affordances (TOON's [N] and {fields}, CSV's header)
* Examples: "How many records?", "List the field names", "What is the last record's field?"
*/ */
export function generateQuestions(): Question[] { export function generateQuestions(): Question[] {
const questions: Question[] = [] const questions: Question[] = []
@@ -41,5 +44,8 @@ export function generateQuestions(): Question[] {
questions.push(...generateEventLogsQuestions(eventLogs, getId)) questions.push(...generateEventLogsQuestions(eventLogs, getId))
questions.push(...generateNestedConfigQuestions(nestedConfig, getId)) questions.push(...generateNestedConfigQuestions(nestedConfig, getId))
// Generate structure-awareness questions (tests format-native affordances)
questions.push(...generateStructureQuestions(tabular, nested, analytics, github, eventLogs, getId))
return questions return questions
} }

View File

@@ -152,7 +152,6 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
// Aggregation: additional nested counts // Aggregation: additional nested counts
const totalPermissions = Object.values(config.permissions.roles).reduce((sum, role) => sum + role.permissions.length, 0) const totalPermissions = Object.values(config.permissions.roles).reduce((sum, role) => sum + role.permissions.length, 0)
const distinctPermissions = new Set(Object.values(config.permissions.roles).flatMap(r => r.permissions)).size const distinctPermissions = new Set(Object.values(config.permissions.roles).flatMap(r => r.permissions)).size
const distinctScopes = new Set(config.authentication.providers.flatMap(p => p.scopes)).size
const totalVariants = Object.values(config.features).reduce((sum, f) => sum + f.variants.length, 0) const totalVariants = Object.values(config.features).reduce((sum, f) => sum + f.variants.length, 0)
const highPriorityReplicas = config.database.replicas.filter(r => r.priority > 2).length const highPriorityReplicas = config.database.replicas.filter(r => r.priority > 2).length
const featuresWithHighRollout = Object.values(config.features).filter(f => f.rollout > 50).length const featuresWithHighRollout = Object.values(config.features).filter(f => f.rollout > 50).length
@@ -173,13 +172,6 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
.type('aggregation') .type('aggregation')
.dataset('nested-config') .dataset('nested-config')
.build(), .build(),
new QuestionBuilder()
.id(getId())
.prompt('How many distinct scopes are defined across all authentication providers?')
.groundTruth(String(distinctScopes))
.type('aggregation')
.dataset('nested-config')
.build(),
new QuestionBuilder() new QuestionBuilder()
.id(getId()) .id(getId())
.prompt('What is the total number of variants across all feature flags?') .prompt('What is the total number of variants across all feature flags?')

View File

@@ -0,0 +1,324 @@
import type { AnalyticsMetric, Employee, EventLog, Order, Repository } from '../datasets'
import type { Question } from '../types'
import { QuestionBuilder } from './utils'
/**
 * Generate structure-awareness questions across all datasets
 *
 * These questions test format-native structural affordances:
 * - TOON's explicit array length [N] and field declarations {fields}
 * - CSV's header row (but no explicit length)
 * - JSON/YAML have neither unless the model counts manually
 *
 * @remarks
 * - Every question is tagged with type `structure-awareness` and the name of the dataset
 *   it refers to.
 * - Questions are emitted in a fixed order (tabular, nested, analytics, github,
 *   event-logs) and `getId` is called exactly once per question, in that order.
 * - A dataset that is empty contributes no questions: there is no "last record" to ask
 *   about, and reading a field of the last element would otherwise throw.
 * - Field lists, positional field names and field counts are hard-coded ground truths.
 *   NOTE(review): they must be kept in sync with the record shapes produced by the
 *   dataset generators — nothing here verifies them against the data.
 *
 * @param employees - Records of the `tabular` dataset
 * @param orders - Records of the `nested` dataset
 * @param metrics - Records of the `analytics` dataset
 * @param repos - Records of the `github` dataset
 * @param logs - Records of the `event-logs` dataset
 * @param getId - Produces the id for the next question
 * @returns The generated questions, in emission order
 */
export function generateStructureQuestions(
  employees: Employee[],
  orders: Order[],
  metrics: AnalyticsMetric[],
  repos: Repository[],
  logs: EventLog[],
  getId: () => string,
): Question[] {
  const questions: Question[] = []

  // Single place that builds a structure-awareness question; keeps the per-dataset
  // sections below down to "prompt → ground truth" pairs.
  const ask = (dataset: Question['dataset'], prompt: string, groundTruth: string): void => {
    questions.push(
      new QuestionBuilder()
        .id(getId())
        .prompt(prompt)
        .groundTruth(groundTruth)
        .type('structure-awareness')
        .dataset(dataset)
        .build(),
    )
  }

  // ========== TABULAR DATASET (Employees) ==========
  const lastEmployee = employees.at(-1)
  if (lastEmployee !== undefined) {
    // Count: Total employees (tests array length awareness)
    ask('tabular', 'How many employees are in the dataset?', String(employees.length))

    // Field list: Employee fields (tests field name awareness)
    ask(
      'tabular',
      'List the field names for employees (comma-separated, in order).',
      'id,name,email,department,salary,yearsExperience,active',
    )

    // Positional: Third field name for employees (tests TOON {fields} syntax)
    ask('tabular', 'What is the 3rd field name for employees?', 'email')

    // Last row: Last employee's department (tests ability to find last row using length)
    ask('tabular', 'What is the department of the last employee in the dataset?', lastEmployee.department)

    // Last row: Last employee's name
    ask('tabular', 'What is the name of the last employee in the dataset?', lastEmployee.name)

    // Field count: How many fields per employee (tests schema awareness)
    ask('tabular', 'How many fields does each employee record have?', '7')
  }

  // ========== NESTED DATASET (Orders) ==========
  const lastOrder = orders.at(-1)
  if (lastOrder !== undefined) {
    // Count: Total orders
    ask('nested', 'How many orders are in the dataset?', String(orders.length))

    // Field list: Order fields
    ask(
      'nested',
      'List the top-level field names for orders (comma-separated, in order).',
      'orderId,customer,items,subtotal,tax,total,status,orderDate',
    )

    // Nested count: Items in the order with the most items (first one wins on ties).
    // `reduce` without an initial value is safe here because `orders` is non-empty.
    const orderWithManyItems = orders.reduce((max, order) =>
      order.items.length > max.items.length ? order : max,
    )
    ask(
      'nested',
      `How many items are in order ${orderWithManyItems.orderId}?`,
      String(orderWithManyItems.items.length),
    )

    // Nested field list: Item fields
    ask(
      'nested',
      'What are the field names for items within orders (comma-separated, in order)?',
      'sku,name,quantity,price',
    )

    // Last row: Last order's status
    ask('nested', 'What is the status of the last order in the dataset?', lastOrder.status)

    // Customer field list
    ask(
      'nested',
      'What are the field names for customer objects within orders (comma-separated, in order)?',
      'id,name,email,phone',
    )
  }

  // ========== ANALYTICS DATASET (Metrics) ==========
  const lastMetric = metrics.at(-1)
  if (lastMetric !== undefined) {
    // Count: Total metrics
    ask('analytics', 'How many metric records are in the dataset?', String(metrics.length))

    // Field list: Metric fields
    ask(
      'analytics',
      'List the field names for metrics (comma-separated, in order).',
      'date,views,clicks,conversions,revenue,bounceRate',
    )

    // Positional: Fifth field name for metrics (tests TOON {fields} syntax)
    ask('analytics', 'What is the 5th field name for analytics metrics?', 'revenue')

    // Last row: Last metric's date
    ask('analytics', 'What is the date of the last metric record in the dataset?', lastMetric.date)

    // Field count: How many fields per metric
    ask('analytics', 'How many fields does each metric record have?', '6')
  }

  // ========== GITHUB DATASET (Repositories) ==========
  const lastRepo = repos.at(-1)
  if (lastRepo !== undefined) {
    // Count: Total repositories
    ask('github', 'How many repositories are in the dataset?', String(repos.length))

    // Field list: Repository fields
    ask(
      'github',
      'List the field names for repositories (comma-separated, in order).',
      'id,name,repo,description,stars,watchers,forks,defaultBranch,createdAt,updatedAt,pushedAt',
    )

    // Positional: Seventh field name for repos (tests TOON {fields} syntax)
    ask('github', 'What is the 7th field name for GitHub repositories?', 'forks')

    // Last row: Last repo's name
    ask('github', 'What is the name of the last repository in the dataset?', lastRepo.name)

    // Field count: How many fields per repository
    ask('github', 'How many fields does each repository record have?', '11')
  }

  // ========== EVENT LOGS DATASET ==========
  const lastLog = logs.at(-1)
  if (lastLog !== undefined) {
    // Count: Total logs
    ask('event-logs', 'How many log entries are in the dataset?', String(logs.length))

    // Field list: Base log fields (including optional error)
    // NOTE(review): the prompt allows "any order" but the ground truth is one fixed
    // ordering — confirm the answer comparison is order-insensitive for this question.
    ask(
      'event-logs',
      'List the field names for log entries (comma-separated, any order, including optional fields).',
      'timestamp,level,endpoint,statusCode,responseTime,userId,error',
    )

    // Last row: Last log's level
    ask('event-logs', 'What is the level of the last log entry in the dataset?', lastLog.level)
  }

  return questions
}

View File

@@ -1,5 +1,5 @@
import type { Dataset, EfficiencyRanking, EvaluationResult, FormatResult, Question } from './types' import type { Dataset, EfficiencyRanking, EvaluationResult, FormatResult, Question } from './types'
import { FORMATTER_DISPLAY_NAMES } from './constants' import { FORMATTER_DISPLAY_NAMES, QUESTION_TYPE_LABELS, QUESTION_TYPES } from './constants'
import { ACCURACY_DATASETS } from './datasets' import { ACCURACY_DATASETS } from './datasets'
import { models } from './evaluate' import { models } from './evaluate'
import { supportsCSV } from './formatters' import { supportsCSV } from './formatters'
@@ -22,9 +22,9 @@ export function calculateTokenCounts(
if (formatName === 'csv' && !supportsCSV(dataset)) if (formatName === 'csv' && !supportsCSV(dataset))
continue continue
const formatted = formatter(dataset.data) const formattedData = formatter(dataset.data)
const key = `${formatName}-${dataset.name}` const key = `${formatName}-${dataset.name}`
tokenCounts[key] = tokenize(formatted) tokenCounts[key] = tokenize(formattedData)
} }
} }
@@ -200,16 +200,21 @@ function generateDetailedAccuracyReport(
// Generate performance by model // Generate performance by model
const modelPerformance = generateModelPerformanceTable(formatResults, results, modelNames) const modelPerformance = generateModelPerformanceTable(formatResults, results, modelNames)
// Generate question type breakdown
const questionTypeBreakdown = generateQuestionTypeBreakdown(formatResults, results, questions)
const totalQuestions = [...new Set(results.map(r => r.questionId))].length const totalQuestions = [...new Set(results.map(r => r.questionId))].length
// Calculate question type distribution // Calculate question type distribution
const fieldRetrievalCount = questions.filter(q => q.type === 'field-retrieval').length const fieldRetrievalCount = questions.filter(q => q.type === 'field-retrieval').length
const aggregationCount = questions.filter(q => q.type === 'aggregation').length const aggregationCount = questions.filter(q => q.type === 'aggregation').length
const filteringCount = questions.filter(q => q.type === 'filtering').length const filteringCount = questions.filter(q => q.type === 'filtering').length
const structureAwarenessCount = questions.filter(q => q.type === 'structure-awareness').length
const fieldRetrievalPercent = ((fieldRetrievalCount / totalQuestions) * 100).toFixed(0) const fieldRetrievalPercent = ((fieldRetrievalCount / totalQuestions) * 100).toFixed(0)
const aggregationPercent = ((aggregationCount / totalQuestions) * 100).toFixed(0) const aggregationPercent = ((aggregationCount / totalQuestions) * 100).toFixed(0)
const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0) const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)
const structureAwarenessPercent = ((structureAwarenessCount / totalQuestions) * 100).toFixed(0)
// Calculate dataset sizes // Calculate dataset sizes
const tabularSize = ACCURACY_DATASETS.find(d => d.name === 'tabular')?.data.employees?.length || 0 const tabularSize = ACCURACY_DATASETS.find(d => d.name === 'tabular')?.data.employees?.length || 0
@@ -233,7 +238,11 @@ ${modelBreakdown}
${summaryComparison} ${summaryComparison}
<details> <details>
<summary><strong>Performance by dataset and model</strong></summary> <summary><strong>Performance by dataset, model, and question type</strong></summary>
#### Performance by Question Type
${questionTypeBreakdown}
#### Performance by Dataset #### Performance by Dataset
@@ -265,9 +274,9 @@ Six datasets designed to test different structural patterns:
#### Question Types #### Question Types
${totalQuestions} questions are generated dynamically across three categories: ${totalQuestions} questions are generated dynamically across four categories:
\- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths) - **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
- Example: "What is Alice's salary?" → \`75000\` - Example: "What is Alice's salary?" → \`75000\`
- Example: "How many items are in order ORD-0042?" → \`3\` - Example: "How many items are in order ORD-0042?" → \`3\`
- Example: "What is the customer name for order ORD-0042?" → \`John Doe\` - Example: "What is the customer name for order ORD-0042?" → \`John Doe\`
@@ -281,6 +290,11 @@ ${totalQuestions} questions are generated dynamically across three categories:
- Example: "How many employees in Sales have salary > 80000?" → \`5\` - Example: "How many employees in Sales have salary > 80000?" → \`5\`
- Example: "How many active employees have more than 10 years of experience?" → \`8\` - Example: "How many active employees have more than 10 years of experience?" → \`8\`
- **Structure awareness (${structureAwarenessPercent}%)**: Tests format-native structural affordances (TOON's [N] count and {fields}, CSV's header row)
- Example: "How many employees are in the dataset?" → \`100\`
- Example: "List the field names for employees" → \`id, name, email, department, salary, yearsExperience, active\`
- Example: "What is the department of the last employee?" → \`Sales\`
#### Evaluation Process #### Evaluation Process
1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => FORMATTER_DISPLAY_NAMES[f.format] || f.format).join(', ')}). 1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => FORMATTER_DISPLAY_NAMES[f.format] || f.format).join(', ')}).
@@ -413,6 +427,48 @@ ${tableRows}
}).filter(Boolean).join('\n').trim() }).filter(Boolean).join('\n').trim()
} }
/**
 * Generate question type breakdown table
 *
 * @remarks
 * Produces a Markdown table with one column per format (in `formatResults` order) and one
 * row per question type (in `QUESTION_TYPES` order). A question type with no evaluation
 * results at all is omitted; a format with no results for a given type is shown as `N/A`.
 * Accuracies are pooled over every result of that type and format and printed with one
 * decimal place.
 *
 * @param formatResults - Formats to show as columns
 * @param results - Individual evaluation results to aggregate
 * @param questions - Question definitions, used to map question ids to their type
 * @returns The Markdown table (header, separator and rows) without surrounding whitespace
 */
function generateQuestionTypeBreakdown(
  formatResults: FormatResult[],
  results: EvaluationResult[],
  questions: Question[],
): string {
  // Build header
  const formatNames = formatResults.map(fr => FORMATTER_DISPLAY_NAMES[fr.format] || fr.format)
  const header = `| Question Type | ${formatNames.join(' | ')} |`
  const separator = `| ------------- | ${formatNames.map(() => '----').join(' | ')} |`

  // Build rows
  const rows: string[] = []
  for (const type of QUESTION_TYPES) {
    // Set lookup keeps the result scan linear instead of re-scanning the id list per result
    const questionIds = new Set(questions.filter(q => q.type === type).map(q => q.id))
    const typeResults = results.filter(r => questionIds.has(r.questionId))

    if (typeResults.length === 0)
      continue

    const accuracies = formatResults.map((fr) => {
      const formatTypeResults = typeResults.filter(r => r.format === fr.format)

      if (formatTypeResults.length === 0)
        return 'N/A'

      // Non-empty from here on, so the division is always well-defined
      const correctCount = formatTypeResults.filter(r => r.isCorrect).length
      const accuracy = correctCount / formatTypeResults.length

      return `${(accuracy * 100).toFixed(1)}%`
    })

    rows.push(`| ${QUESTION_TYPE_LABELS[type]} | ${accuracies.join(' | ')} |`)
  }

  return `
${header}
${separator}
${rows.join('\n')}
`.trim()
}
/** /**
* Generate per-model performance comparison tables * Generate per-model performance comparison tables
*/ */

View File

@@ -1,11 +1,17 @@
import type { DATASET_NAMES, QUESTION_TYPES, STRUCTURE_CLASSES } from './constants'
/** Union of the string literals listed in `QUESTION_TYPES`. */
export type QuestionType = typeof QUESTION_TYPES[number]
/** Union of the string literals listed in `DATASET_NAMES`. */
export type DatasetName = typeof DATASET_NAMES[number]
/** Union of the string literals listed in `STRUCTURE_CLASSES`. */
export type StructureClass = typeof STRUCTURE_CLASSES[number]
export interface DatasetMetadata { export interface DatasetMetadata {
supportsCSV: boolean supportsCSV: boolean
structureClass: 'uniform' | 'semi-uniform' | 'nested' | 'deep' structureClass: StructureClass
tabularEligibility: number tabularEligibility: number
} }
export interface Dataset { export interface Dataset {
name: string name: DatasetName
description: string description: string
data: Record<string, any> data: Record<string, any>
metadata: DatasetMetadata metadata: DatasetMetadata
@@ -15,8 +21,8 @@ export interface Question {
id: string id: string
prompt: string prompt: string
groundTruth: string groundTruth: string
type: 'field-retrieval' | 'aggregation' | 'filtering' type: QuestionType
dataset: string dataset: DatasetName
} }
export interface EvaluationResult { export interface EvaluationResult {