chore(benchmarks): add structure-awareness questions

2026-01-29 23:34:10 +08:00 · 2025-11-07 09:03:51 +01:00
parent 853c3babea
commit 89df613059
13 changed files with 522 additions and 67 deletions
--- a/benchmarks/results/accuracy/models/claude-haiku-4-5-20251001
+++ b/benchmarks/results/accuracy/models/claude-haiku-4-5-20251001
--- a/benchmarks/results/accuracy/models/gemini-2.5-flash
+++ b/benchmarks/results/accuracy/models/gemini-2.5-flash
--- a/benchmarks/results/accuracy/models/gpt-5-nano
+++ b/benchmarks/results/accuracy/models/gpt-5-nano
--- a/benchmarks/results/accuracy/models/grok-4-fast-non-reasoning
+++ b/benchmarks/results/accuracy/models/grok-4-fast-non-reasoning
--- a/benchmarks/src/constants.ts
+++ b/benchmarks/src/constants.ts
@@ -4,37 +4,11 @@ import * as url from 'node:url'
 export const ROOT_DIR: string = url.fileURLToPath(new URL('../../', import.meta.url))
 export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.meta.url))
 /**
 * Model-specific RPM (requests per minute) limits to handle API quotas
 *
 * @remarks
 * Set `undefined` for models without specific limits.
 */
 /// keep-sorted
 export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
  'claude-haiku-4-5-20251001': 50,
  'gemini-2.5-flash': 25,
  'gpt-5-nano': 50,
  'grok-4-fast-non-reasoning': 50,
 }
 /**
 * Default concurrency for parallel evaluations to prevent bursting
 */
 export const DEFAULT_CONCURRENCY = 10
 /**
 * Display names for data format types
 */
 export const FORMATTER_DISPLAY_NAMES: Record<string, string> = {
  'json-pretty': 'JSON',
  'json-compact': 'JSON compact',
  'toon': 'TOON',
  'csv': 'CSV',
  'xml': 'XML',
  'yaml': 'YAML',
 } as const
 /**
 * Enable dry run mode for quick testing with limited AI requests
 *
@@ -51,12 +25,80 @@ export const DRY_RUN_LIMITS = {
  maxQuestions: 10,
 }
 /**
 * Model-specific RPM (requests per minute) limits to handle API quotas
 *
 * @remarks
 * Set `undefined` for models without specific limits.
 * Keys are model identifier strings; values are the maximum number of requests per minute.
 */
 // NOTE(review): the `keep-sorted` marker presumably drives a lint rule - keep the keys in alphabetical order.
 /// keep-sorted
 export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
  'claude-haiku-4-5-20251001': 50,
  'gemini-2.5-flash': 25,
  'gpt-5-nano': 50,
  'grok-4-fast-non-reasoning': 50,
 }
 /**
 * Display names for data format types
 *
 * @remarks
 * Maps a formatter key (e.g. `json-pretty`) to the label shown in generated reports.
 * The type is deliberately the open `Record<string, string>` so the map can be indexed
 * with arbitrary format names; report code falls back to the raw key when a lookup
 * yields nothing (`FORMATTER_DISPLAY_NAMES[name] || name`).
 */
 export const FORMATTER_DISPLAY_NAMES: Record<string, string> = {
  'json-pretty': 'JSON',
  'json-compact': 'JSON compact',
  'toon': 'TOON',
  'csv': 'CSV',
  'xml': 'XML',
  'yaml': 'YAML',
 }
 /**
 * Question type identifiers
 *
 * @remarks
 * Declared `as const` so that `typeof QUESTION_TYPES[number]` yields the union of these
 * literals (see `QuestionType` in `types.ts`). The report's question-type table iterates
 * this array, so the element order is also the row order.
 */
 export const QUESTION_TYPES = [
  'field-retrieval',
  'aggregation',
  'filtering',
  'structure-awareness',
 ] as const
 /**
 * Display names for question types
 *
 * @remarks
 * Indexed by the identifiers in `QUESTION_TYPES`; the report uses the value as the
 * first cell of each row in the question-type breakdown table.
 */
 export const QUESTION_TYPE_LABELS = {
  'field-retrieval': 'Field Retrieval',
  'aggregation': 'Aggregation',
  'filtering': 'Filtering',
  'structure-awareness': 'Structure Awareness',
 } as const
 /**
 * Dataset identifiers
 *
 * @remarks
 * Declared `as const` so that `typeof DATASET_NAMES[number]` yields the union of these
 * literals (see `DatasetName` in `types.ts`).
 */
 export const DATASET_NAMES = [
  'tabular',
  'nested',
  'analytics',
  'github',
  'event-logs',
  'nested-config',
 ] as const
 /**
 * Structure class identifiers
 *
 * @remarks
 * Declared `as const` so that `typeof STRUCTURE_CLASSES[number]` yields the union of these
 * literals (see `StructureClass` in `types.ts`).
 */
 export const STRUCTURE_CLASSES = [
  'uniform',
  'semi-uniform',
  'nested',
  'deep',
 ] as const
 /**
 * Threshold values for filtering and aggregation questions
 */
 export const QUESTION_THRESHOLDS = {
  tabular: {
-    salaryRanges: [60000, 80000, 100000, 120000],
+    salaryRanges: [60000, 80000, 100000],
    experienceYears: [5, 10, 15, 20],
    departmentSalaryThreshold: 80000,
    departmentExperienceThreshold: 10,
@@ -68,11 +110,11 @@ export const QUESTION_THRESHOLDS = {
    totalThresholdsForItems: [300, 500],
  },
  analytics: {
-    views: [5000, 7000],
+    views: [6000],
-    conversions: [10, 30],
+    conversions: [20],
    viewsForFiltering: [6000, 7000],
    conversionsForFiltering: 15,
-    revenueThresholds: [500, 1000, 1500, 2000, 2500],
+    revenueThresholds: [1000, 1500, 2000],
    viewsThresholdForRevenue: 6000,
    clicksForFiltering: [250, 400],
    conversionsForClickFiltering: 15,
@@ -81,8 +123,8 @@ export const QUESTION_THRESHOLDS = {
  },
  github: {
    stars: [100000, 150000, 200000],
-    forks: [20000, 35000, 50000],
+    forks: [20000, 35000],
-    watchers: [5000, 8000],
+    watchers: [8000],
    starForkCombinations: [
      { stars: 75000, forks: 15000 },
      { stars: 100000, forks: 20000 },
@@ -101,18 +143,18 @@ export const QUESTION_THRESHOLDS = {
 */
 export const QUESTION_LIMITS = {
  tabular: {
-    fieldRetrieval: 14,
+    fieldRetrieval: 12,
-    aggregationDepartments: 4,
+    aggregationDepartments: 3,
    filteringMultiConditionDepartments: 5,
    filteringExperience: 3,
    filteringDepartmentExp: 3,
-    filteringDepartmentActive: 3,
+    filteringDepartmentActive: 2,
  },
  nested: {
    fieldRetrievalOrders: 8,
-    fieldRetrievalCustomers: 10,
+    fieldRetrievalCustomers: 8,
-    aggregationStatuses: 5,
+    aggregationStatuses: 3,
-    filteringStatusAndValue: 5,
+    filteringStatusAndValue: 4,
    filteringStatusAndItems: 3,
  },
  analytics: {
@@ -121,16 +163,17 @@ export const QUESTION_LIMITS = {
  github: {
    fieldRetrievalRepos: 11,
    aggregationBranches: 2,
-    filteringStarsAndForks: 8,
+    filteringStarsAndForks: 3,
  },
  eventLogs: {
    fieldRetrieval: 10,
-    aggregationEndpoints: 4,
+    aggregationEndpoints: 2,
    filteringLevelAndStatus: 3,
    filteringEndpointAndStatus: 3,
    filteringEndpointRetryable: 2,
  },
  nestedConfig: {
    fieldRetrieval: 10,
-    filteringComplex: 6,
+    filteringComplex: 5,
  },
 } as const
--- a/benchmarks/src/datasets.ts
+++ b/benchmarks/src/datasets.ts
@@ -181,7 +181,7 @@ export function generateAnalyticsData(days: number, startDate = '2025-01-01'): {
 /**
 * Generate employee data (uniform tabular structure)
 */
-const departments: readonly string[] = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] as const
+const departments = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] as const
 function generateEmployees(count: number): { employees: Employee[] } {
  return {
--- a/benchmarks/src/evaluate.ts
+++ b/benchmarks/src/evaluate.ts
@@ -16,6 +16,33 @@ export const models: LanguageModelV2[] = [
  xai('grok-4-fast-non-reasoning'),
 ]
 /**
 * Format primers
 *
 * @remarks
 * Neutral descriptions to help models parse each format.
 * Keyed by format name. `evaluateQuestion` looks the entry up with `PRIMERS[formatName] ?? ''`
 * and places it at the top of the prompt, so a format without an entry simply gets no primer.
 */
 export const PRIMERS: Record<string, string> = {
  'toon': 'TOON: Indentation-based. Arrays declare length and fields (e.g., items[N]{f1,f2}:). Rows use single delimiter. Values may be quoted.',
  'json-pretty': 'JSON: Strict JSON objects/arrays with repeated keys per row.',
  'json-compact': 'JSON (compact): Strict JSON without extra whitespace.',
  'yaml': 'YAML: Indentation-based key/value and lists (- items).',
  'xml': 'XML: Tag-based tree structure with nested elements.',
  'csv': 'CSV: Header row, comma-separated values. First row contains field names.',
 }
 /**
 * Code fence language tags for proper syntax highlighting
 *
 * @remarks
 * Keyed by format name. `evaluateQuestion` appends the tag to the opening code fence that
 * wraps the formatted data; a format without an entry falls back to an untagged fence.
 * Both JSON variants share the `json` tag.
 */
 export const FENCE: Record<string, string> = {
  'toon': 'toon',
  'json-pretty': 'json',
  'json-compact': 'json',
  'yaml': 'yaml',
  'xml': 'xml',
  'csv': 'csv',
 }
 /**
 * Evaluate a single question with a specific format and model
 */
@@ -33,10 +60,15 @@ export async function evaluateQuestion(
    model: LanguageModelV2
  },
 ): Promise<EvaluationResult> {
  const primer = PRIMERS[formatName] ?? ''
  const fence = FENCE[formatName] ?? ''
  const prompt = `
 ${primer}
 Given the following data in ${formatName} format:
-\`\`\`
+\`\`\`${fence}
 ${formattedData}
 \`\`\`
--- a/benchmarks/src/questions/event-logs.ts
+++ b/benchmarks/src/questions/event-logs.ts
@@ -166,7 +166,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
  }
  // Filtering: endpoint AND retryable error
-  for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointAndStatus)) {
+  for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointRetryable)) {
    const count = logs.filter(l => l.endpoint === endpoint && l.error?.retryable === true).length
    questions.push(
      new QuestionBuilder()
--- a/benchmarks/src/questions/index.ts
+++ b/benchmarks/src/questions/index.ts
@@ -6,11 +6,12 @@ import { generateEventLogsQuestions } from './event-logs'
 import { generateGithubQuestions } from './github'
 import { generateNestedQuestions } from './nested'
 import { generateNestedConfigQuestions } from './nested-config'
 import { generateStructureQuestions } from './structure'
 import { generateTabularQuestions } from './tabular'
 import { createIdGenerator } from './utils'
 /**
- * Generate ~200 questions from all datasets
+ * Generate questions from all datasets
 *
 * @remarks
 * - Field Retrieval: Direct field access with no computation
@@ -19,6 +20,8 @@ import { createIdGenerator } from './utils'
 *   Examples: "How many X?", "What is the total/average?", "How many X > threshold?"
 * - Filtering: Multi-condition queries requiring complex logical operations
 *   Examples: "How many X WHERE condition1 AND condition2?"
 * - Structure Awareness: Tests format-native structural affordances (TOON's [N] and {fields}, CSV's header)
 *   Examples: "How many records?", "List the field names", "What is the last record's field?"
 */
 export function generateQuestions(): Question[] {
  const questions: Question[] = []
@@ -41,5 +44,8 @@ export function generateQuestions(): Question[] {
  questions.push(...generateEventLogsQuestions(eventLogs, getId))
  questions.push(...generateNestedConfigQuestions(nestedConfig, getId))
  // Generate structure-awareness questions (tests format-native affordances)
  questions.push(...generateStructureQuestions(tabular, nested, analytics, github, eventLogs, getId))
  return questions
 }
--- a/benchmarks/src/questions/nested-config.ts
+++ b/benchmarks/src/questions/nested-config.ts
@@ -152,7 +152,6 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
  // Aggregation: additional nested counts
  const totalPermissions = Object.values(config.permissions.roles).reduce((sum, role) => sum + role.permissions.length, 0)
  const distinctPermissions = new Set(Object.values(config.permissions.roles).flatMap(r => r.permissions)).size
  const distinctScopes = new Set(config.authentication.providers.flatMap(p => p.scopes)).size
  const totalVariants = Object.values(config.features).reduce((sum, f) => sum + f.variants.length, 0)
  const highPriorityReplicas = config.database.replicas.filter(r => r.priority > 2).length
  const featuresWithHighRollout = Object.values(config.features).filter(f => f.rollout > 50).length
@@ -173,13 +172,6 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
      .type('aggregation')
      .dataset('nested-config')
      .build(),
    new QuestionBuilder()
      .id(getId())
      .prompt('How many distinct scopes are defined across all authentication providers?')
      .groundTruth(String(distinctScopes))
      .type('aggregation')
      .dataset('nested-config')
      .build(),
    new QuestionBuilder()
      .id(getId())
      .prompt('What is the total number of variants across all feature flags?')
--- a/benchmarks/src/questions/structure.ts
+++ b/benchmarks/src/questions/structure.ts
@@ -0,0 +1,324 @@
 import type { AnalyticsMetric, Employee, EventLog, Order, Repository } from '../datasets'
 import type { Question } from '../types'
 import { QuestionBuilder } from './utils'
 /**
 * Generate structure-awareness questions across all datasets
 *
 * These questions test format-native structural affordances:
 * - TOON's explicit array length [N] and field declarations {fields}
 * - CSV's header row (but no explicit length)
 * - JSON/YAML have neither unless the model counts manually
 *
 * @remarks
 * - Field names are declared once per dataset; the comma-separated list, the positional
 *   answers and the field counts are all derived from that single declaration.
 * - The declared field names are hard-coded here and are assumed to mirror the record
 *   shapes produced in `../datasets` (not verified at runtime).
 * - Questions about the last record or the largest order are skipped for an empty
 *   dataset instead of failing on a missing record.
 *
 * @param employees - Records of the `tabular` dataset
 * @param orders - Records of the `nested` dataset
 * @param metrics - Records of the `analytics` dataset
 * @param repos - Records of the `github` dataset
 * @param logs - Records of the `event-logs` dataset
 * @param getId - Called once per emitted question, in emission order, to obtain its id
 * @returns Questions of type `structure-awareness`, ordered tabular, nested, analytics, github, event-logs
 */
 export function generateStructureQuestions(
  employees: Employee[],
  orders: Order[],
  metrics: AnalyticsMetric[],
  repos: Repository[],
  logs: EventLog[],
  getId: () => string,
 ): Question[] {
  const questions: Question[] = []
  // Every question in this module has the same type; only dataset, prompt and answer vary
  const add = (dataset: Question['dataset'], prompt: string, groundTruth: string): void => {
    questions.push(
      new QuestionBuilder()
        .id(getId())
        .prompt(prompt)
        .groundTruth(groundTruth)
        .type('structure-awareness')
        .dataset(dataset)
        .build(),
    )
  }
  // ========== TABULAR DATASET (Employees) ==========
  const employeeFields = ['id', 'name', 'email', 'department', 'salary', 'yearsExperience', 'active'] as const
  // Count: Total employees (tests array length awareness)
  add('tabular', 'How many employees are in the dataset?', String(employees.length))
  // Field list: Employee fields (tests field name awareness)
  add('tabular', 'List the field names for employees (comma-separated, in order).', employeeFields.join(','))
  // Positional: Third field name for employees (tests TOON {fields} syntax)
  add('tabular', 'What is the 3rd field name for employees?', employeeFields[2])
  // Last row: Last employee's department and name (tests ability to find last row using length)
  const lastEmployee = employees.at(-1)
  if (lastEmployee !== undefined) {
    add('tabular', 'What is the department of the last employee in the dataset?', lastEmployee.department)
    add('tabular', 'What is the name of the last employee in the dataset?', lastEmployee.name)
  }
  // Field count: How many fields per employee (tests schema awareness)
  add('tabular', 'How many fields does each employee record have?', String(employeeFields.length))
  // ========== NESTED DATASET (Orders) ==========
  const orderFields = ['orderId', 'customer', 'items', 'subtotal', 'tax', 'total', 'status', 'orderDate'] as const
  const itemFields = ['sku', 'name', 'quantity', 'price'] as const
  const customerFields = ['id', 'name', 'email', 'phone'] as const
  // Count: Total orders
  add('nested', 'How many orders are in the dataset?', String(orders.length))
  // Field list: Order fields
  add('nested', 'List the top-level field names for orders (comma-separated, in order).', orderFields.join(','))
  // Nested count: Items in the order with the most items (ties resolve to the earliest such order)
  const orderWithManyItems = orders.reduce<Order | undefined>(
    (max, order) => (max === undefined || order.items.length > max.items.length ? order : max),
    undefined,
  )
  if (orderWithManyItems !== undefined)
    add('nested', `How many items are in order ${orderWithManyItems.orderId}?`, String(orderWithManyItems.items.length))
  // Nested field list: Item fields
  add('nested', 'What are the field names for items within orders (comma-separated, in order)?', itemFields.join(','))
  // Last row: Last order's status
  const lastOrder = orders.at(-1)
  if (lastOrder !== undefined)
    add('nested', 'What is the status of the last order in the dataset?', lastOrder.status)
  // Customer field list
  add('nested', 'What are the field names for customer objects within orders (comma-separated, in order)?', customerFields.join(','))
  // ========== ANALYTICS DATASET (Metrics) ==========
  const metricFields = ['date', 'views', 'clicks', 'conversions', 'revenue', 'bounceRate'] as const
  // Count: Total metrics
  add('analytics', 'How many metric records are in the dataset?', String(metrics.length))
  // Field list: Metric fields
  add('analytics', 'List the field names for metrics (comma-separated, in order).', metricFields.join(','))
  // Positional: Fifth field name for metrics (tests TOON {fields} syntax)
  add('analytics', 'What is the 5th field name for analytics metrics?', metricFields[4])
  // Last row: Last metric's date
  const lastMetric = metrics.at(-1)
  if (lastMetric !== undefined)
    add('analytics', 'What is the date of the last metric record in the dataset?', lastMetric.date)
  // Field count: How many fields per metric
  add('analytics', 'How many fields does each metric record have?', String(metricFields.length))
  // ========== GITHUB DATASET (Repositories) ==========
  const repoFields = ['id', 'name', 'repo', 'description', 'stars', 'watchers', 'forks', 'defaultBranch', 'createdAt', 'updatedAt', 'pushedAt'] as const
  // Count: Total repositories
  add('github', 'How many repositories are in the dataset?', String(repos.length))
  // Field list: Repository fields
  add('github', 'List the field names for repositories (comma-separated, in order).', repoFields.join(','))
  // Positional: Seventh field name for repos (tests TOON {fields} syntax)
  add('github', 'What is the 7th field name for GitHub repositories?', repoFields[6])
  // Last row: Last repo's name
  const lastRepo = repos.at(-1)
  if (lastRepo !== undefined)
    add('github', 'What is the name of the last repository in the dataset?', lastRepo.name)
  // Field count: How many fields per repository
  add('github', 'How many fields does each repository record have?', String(repoFields.length))
  // ========== EVENT LOGS DATASET ==========
  // Base log fields (including optional error)
  const logFields = ['timestamp', 'level', 'endpoint', 'statusCode', 'responseTime', 'userId', 'error'] as const
  // Count: Total logs
  add('event-logs', 'How many log entries are in the dataset?', String(logs.length))
  // Field list: Log fields
  add('event-logs', 'List the field names for log entries (comma-separated, any order, including optional fields).', logFields.join(','))
  // Last row: Last log's level
  const lastLog = logs.at(-1)
  if (lastLog !== undefined)
    add('event-logs', 'What is the level of the last log entry in the dataset?', lastLog.level)
  return questions
 }
--- a/benchmarks/src/report.ts
+++ b/benchmarks/src/report.ts
@@ -1,5 +1,5 @@
 import type { Dataset, EfficiencyRanking, EvaluationResult, FormatResult, Question } from './types'
-import { FORMATTER_DISPLAY_NAMES } from './constants'
+import { FORMATTER_DISPLAY_NAMES, QUESTION_TYPE_LABELS, QUESTION_TYPES } from './constants'
 import { ACCURACY_DATASETS } from './datasets'
 import { models } from './evaluate'
 import { supportsCSV } from './formatters'
@@ -22,9 +22,9 @@ export function calculateTokenCounts(
      if (formatName === 'csv' && !supportsCSV(dataset))
        continue
-      const formatted = formatter(dataset.data)
+      const formattedData = formatter(dataset.data)
      const key = `${formatName}-${dataset.name}`
-      tokenCounts[key] = tokenize(formatted)
+      tokenCounts[key] = tokenize(formattedData)
    }
  }
@@ -200,16 +200,21 @@ function generateDetailedAccuracyReport(
  // Generate performance by model
  const modelPerformance = generateModelPerformanceTable(formatResults, results, modelNames)
  // Generate question type breakdown
  const questionTypeBreakdown = generateQuestionTypeBreakdown(formatResults, results, questions)
  const totalQuestions = [...new Set(results.map(r => r.questionId))].length
  // Calculate question type distribution
  const fieldRetrievalCount = questions.filter(q => q.type === 'field-retrieval').length
  const aggregationCount = questions.filter(q => q.type === 'aggregation').length
  const filteringCount = questions.filter(q => q.type === 'filtering').length
  const structureAwarenessCount = questions.filter(q => q.type === 'structure-awareness').length
  const fieldRetrievalPercent = ((fieldRetrievalCount / totalQuestions) * 100).toFixed(0)
  const aggregationPercent = ((aggregationCount / totalQuestions) * 100).toFixed(0)
  const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)
  const structureAwarenessPercent = ((structureAwarenessCount / totalQuestions) * 100).toFixed(0)
  // Calculate dataset sizes
  const tabularSize = ACCURACY_DATASETS.find(d => d.name === 'tabular')?.data.employees?.length || 0
@@ -233,7 +238,11 @@ ${modelBreakdown}
 ${summaryComparison}
 <details>
-<summary><strong>Performance by dataset and model</strong></summary>
+<summary><strong>Performance by dataset, model, and question type</strong></summary>
 #### Performance by Question Type
 ${questionTypeBreakdown}
 #### Performance by Dataset
@@ -265,9 +274,9 @@ Six datasets designed to test different structural patterns:
 #### Question Types
-${totalQuestions} questions are generated dynamically across three categories:
+${totalQuestions} questions are generated dynamically across four categories:
-\- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
+- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
  - Example: "What is Alice's salary?" → \`75000\`
  - Example: "How many items are in order ORD-0042?" → \`3\`
  - Example: "What is the customer name for order ORD-0042?" → \`John Doe\`
@@ -281,6 +290,11 @@ ${totalQuestions} questions are generated dynamically across three categories:
  - Example: "How many employees in Sales have salary > 80000?" → \`5\`
  - Example: "How many active employees have more than 10 years of experience?" → \`8\`
 - **Structure awareness (${structureAwarenessPercent}%)**: Tests format-native structural affordances (TOON's [N] count and {fields}, CSV's header row)
  - Example: "How many employees are in the dataset?" → \`100\`
  - Example: "List the field names for employees" → \`id, name, email, department, salary, yearsExperience, active\`
  - Example: "What is the department of the last employee?" → \`Sales\`
 #### Evaluation Process
 1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => FORMATTER_DISPLAY_NAMES[f.format] || f.format).join(', ')}).
@@ -413,6 +427,48 @@ ${tableRows}
  }).filter(Boolean).join('\n').trim()
 }
 /**
 * Generate question type breakdown table
 *
 * @remarks
 * Produces a Markdown table with one column per format (in `formatResults` order) and one
 * row per question type (in `QUESTION_TYPES` order). A question type with no results at
 * all is left out; a format with no results for a listed type shows `N/A`.
 *
 * @param formatResults - Formats to render as columns; only `format` is read
 * @param results - Individual evaluation results, matched to questions by `questionId`
 * @param questions - Question definitions supplying each question's `id` and `type`
 * @returns The Markdown table (header, separator and rows joined by newlines)
 */
 function generateQuestionTypeBreakdown(
  formatResults: FormatResult[],
  results: EvaluationResult[],
  questions: Question[],
 ): string {
  // Build header
  const formatNames = formatResults.map(fr => FORMATTER_DISPLAY_NAMES[fr.format] || fr.format)
  const header = `| Question Type | ${formatNames.join(' | ')} |`
  const separator = `| ------------- | ${formatNames.map(() => '----').join(' | ')} |`
  // Build rows
  const rows: string[] = []
  for (const type of QUESTION_TYPES) {
    // Set membership keeps the result scan linear instead of rescanning an id array per result
    const questionIds = new Set(questions.filter(q => q.type === type).map(q => q.id))
    const typeResults = results.filter(r => questionIds.has(r.questionId))
    if (typeResults.length === 0)
      continue
    const accuracies = formatResults.map((fr) => {
      const formatTypeResults = typeResults.filter(r => r.format === fr.format)
      if (formatTypeResults.length === 0)
        return 'N/A'
      // Non-empty here, so the division is always defined
      const correctCount = formatTypeResults.filter(r => r.isCorrect).length
      return `${((correctCount / formatTypeResults.length) * 100).toFixed(1)}%`
    })
    rows.push(`| ${QUESTION_TYPE_LABELS[type]} | ${accuracies.join(' | ')} |`)
  }
  return [header, separator, ...rows].join('\n')
 }
 /**
 * Generate per-model performance comparison tables
 */
--- a/benchmarks/src/types.ts
+++ b/benchmarks/src/types.ts
@@ -1,11 +1,17 @@
 import type { DATASET_NAMES, QUESTION_TYPES, STRUCTURE_CLASSES } from './constants'
 /** Union of the literal identifiers listed in `QUESTION_TYPES` */
 export type QuestionType = typeof QUESTION_TYPES[number]
 /** Union of the literal identifiers listed in `DATASET_NAMES` */
 export type DatasetName = typeof DATASET_NAMES[number]
 /** Union of the literal identifiers listed in `STRUCTURE_CLASSES` */
 export type StructureClass = typeof STRUCTURE_CLASSES[number]
 /**
 * Descriptive metadata attached to a dataset (see `Dataset.metadata`)
 *
 * @remarks
 * `structureClass` is restricted to the identifiers in `STRUCTURE_CLASSES`.
 * NOTE(review): `supportsCSV` presumably marks datasets that can be rendered as CSV, and
 * `tabularEligibility` presumably scores how table-like the data is - confirm the meaning
 * and value range against the dataset definitions.
 */
 export interface DatasetMetadata {
  supportsCSV: boolean
-  structureClass: 'uniform' | 'semi-uniform' | 'nested' | 'deep'
+  structureClass: StructureClass
  tabularEligibility: number
 }
 export interface Dataset {
-  name: string
+  name: DatasetName
  description: string
  data: Record<string, any>
  metadata: DatasetMetadata
@@ -15,8 +21,8 @@ export interface Question {
  id: string
  prompt: string
  groundTruth: string
-  type: 'field-retrieval' | 'aggregation' | 'filtering'
+  type: QuestionType
-  dataset: string
+  dataset: DatasetName
 }
 export interface EvaluationResult {