chore(benchmarks): add structure-awareness questions

2026-01-29 23:34:10 +08:00 · 2025-11-07 09:03:51 +01:00
parent 853c3babea
commit 89df613059
13 changed files with 522 additions and 67 deletions
--- a/benchmarks/src/constants.ts
+++ b/benchmarks/src/constants.ts
@@ -4,37 +4,11 @@ import * as url from 'node:url'
 export const ROOT_DIR: string = url.fileURLToPath(new URL('../../', import.meta.url))
 export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.meta.url))

-/**
- * Model-specific RPM (requests per minute) limits to handle API quotas
- *
- * @remarks
- * Set `undefined` for models without specific limits.
- */
-/// keep-sorted
-export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
-  'claude-haiku-4-5-20251001': 50,
-  'gemini-2.5-flash': 25,
-  'gpt-5-nano': 50,
-  'grok-4-fast-non-reasoning': 50,
-}
-
 /**
 * Default concurrency for parallel evaluations to prevent bursting
 */
 export const DEFAULT_CONCURRENCY = 10

-/**
- * Display names for data format types
- */
-export const FORMATTER_DISPLAY_NAMES: Record<string, string> = {
-  'json-pretty': 'JSON',
-  'json-compact': 'JSON compact',
-  'toon': 'TOON',
-  'csv': 'CSV',
-  'xml': 'XML',
-  'yaml': 'YAML',
-} as const
-
 /**
 * Enable dry run mode for quick testing with limited AI requests
 *
@@ -51,12 +25,80 @@ export const DRY_RUN_LIMITS = {
  maxQuestions: 10,
 }

+/**
+ * Model-specific RPM (requests per minute) limits to handle API quotas
+ *
+ * @remarks
+ * Set `undefined` for models without specific limits.
+ */
+/// keep-sorted
+export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
+  'claude-haiku-4-5-20251001': 50,
+  'gemini-2.5-flash': 25,
+  'gpt-5-nano': 50,
+  'grok-4-fast-non-reasoning': 50,
+}
+
+/**
+ * Display names for data format types
+ */
+export const FORMATTER_DISPLAY_NAMES: Record<string, string> = {
+  'json-pretty': 'JSON',
+  'json-compact': 'JSON compact',
+  'toon': 'TOON',
+  'csv': 'CSV',
+  'xml': 'XML',
+  'yaml': 'YAML',
+} as const
+
+/**
+ * Question type identifiers
+ */
+export const QUESTION_TYPES = [
+  'field-retrieval',
+  'aggregation',
+  'filtering',
+  'structure-awareness',
+] as const
+
+/**
+ * Display names for question types
+ */
+export const QUESTION_TYPE_LABELS = {
+  'field-retrieval': 'Field Retrieval',
+  'aggregation': 'Aggregation',
+  'filtering': 'Filtering',
+  'structure-awareness': 'Structure Awareness',
+} as const
+
+/**
+ * Dataset identifiers
+ */
+export const DATASET_NAMES = [
+  'tabular',
+  'nested',
+  'analytics',
+  'github',
+  'event-logs',
+  'nested-config',
+] as const
+
+/**
+ * Structure class identifiers
+ */
+export const STRUCTURE_CLASSES = [
+  'uniform',
+  'semi-uniform',
+  'nested',
+  'deep',
+] as const
+
 /**
 * Threshold values for filtering and aggregation questions
 */
 export const QUESTION_THRESHOLDS = {
  tabular: {
-    salaryRanges: [60000, 80000, 100000, 120000],
+    salaryRanges: [60000, 80000, 100000],
    experienceYears: [5, 10, 15, 20],
    departmentSalaryThreshold: 80000,
    departmentExperienceThreshold: 10,
@@ -68,11 +110,11 @@ export const QUESTION_THRESHOLDS = {
    totalThresholdsForItems: [300, 500],
  },
  analytics: {
-    views: [5000, 7000],
-    conversions: [10, 30],
+    views: [6000],
+    conversions: [20],
    viewsForFiltering: [6000, 7000],
    conversionsForFiltering: 15,
-    revenueThresholds: [500, 1000, 1500, 2000, 2500],
+    revenueThresholds: [1000, 1500, 2000],
    viewsThresholdForRevenue: 6000,
    clicksForFiltering: [250, 400],
    conversionsForClickFiltering: 15,
@@ -81,8 +123,8 @@ export const QUESTION_THRESHOLDS = {
  },
  github: {
    stars: [100000, 150000, 200000],
-    forks: [20000, 35000, 50000],
-    watchers: [5000, 8000],
+    forks: [20000, 35000],
+    watchers: [8000],
    starForkCombinations: [
      { stars: 75000, forks: 15000 },
      { stars: 100000, forks: 20000 },
@@ -101,18 +143,18 @@ export const QUESTION_THRESHOLDS = {
 */
 export const QUESTION_LIMITS = {
  tabular: {
-    fieldRetrieval: 14,
-    aggregationDepartments: 4,
+    fieldRetrieval: 12,
+    aggregationDepartments: 3,
    filteringMultiConditionDepartments: 5,
    filteringExperience: 3,
    filteringDepartmentExp: 3,
-    filteringDepartmentActive: 3,
+    filteringDepartmentActive: 2,
  },
  nested: {
    fieldRetrievalOrders: 8,
-    fieldRetrievalCustomers: 10,
-    aggregationStatuses: 5,
-    filteringStatusAndValue: 5,
+    fieldRetrievalCustomers: 8,
+    aggregationStatuses: 3,
+    filteringStatusAndValue: 4,
    filteringStatusAndItems: 3,
  },
  analytics: {
@@ -121,16 +163,17 @@ export const QUESTION_LIMITS = {
  github: {
    fieldRetrievalRepos: 11,
    aggregationBranches: 2,
-    filteringStarsAndForks: 8,
+    filteringStarsAndForks: 3,
  },
  eventLogs: {
    fieldRetrieval: 10,
-    aggregationEndpoints: 4,
+    aggregationEndpoints: 2,
    filteringLevelAndStatus: 3,
    filteringEndpointAndStatus: 3,
+    filteringEndpointRetryable: 2,
  },
  nestedConfig: {
    fieldRetrieval: 10,
-    filteringComplex: 6,
+    filteringComplex: 5,
  },
 } as const
--- a/benchmarks/src/datasets.ts
+++ b/benchmarks/src/datasets.ts
@@ -181,7 +181,7 @@ export function generateAnalyticsData(days: number, startDate = '2025-01-01'): {
 /**
 * Generate employee data (uniform tabular structure)
 */
-const departments: readonly string[] = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] as const
+const departments = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] as const

 function generateEmployees(count: number): { employees: Employee[] } {
  return {
--- a/benchmarks/src/evaluate.ts
+++ b/benchmarks/src/evaluate.ts
@@ -16,6 +16,33 @@ export const models: LanguageModelV2[] = [
  xai('grok-4-fast-non-reasoning'),
 ]

+/**
+ * Format primers placed at the top of each evaluation prompt
+ *
+ * @remarks
+ * Neutral descriptions to help models parse each format, keyed by format name.
+ */
+export const PRIMERS: Record<string, string> = {
+  'toon': 'TOON: Indentation-based. Arrays declare length and fields (e.g., items[N]{f1,f2}:). Rows use single delimiter. Values may be quoted.',
+  'json-pretty': 'JSON: Strict JSON objects/arrays with repeated keys per row.',
+  'json-compact': 'JSON (compact): Strict JSON without extra whitespace.',
+  'yaml': 'YAML: Indentation-based key/value and lists (- items).',
+  'xml': 'XML: Tag-based tree structure with nested elements.',
+  'csv': 'CSV: Header row, comma-separated values. First row contains field names.',
+}
+
+/**
+ * Code fence language tags, keyed by format name, that label the data block in the evaluation prompt
+ */
+export const FENCE: Record<string, string> = {
+  'toon': 'toon',
+  'json-pretty': 'json',
+  'json-compact': 'json',
+  'yaml': 'yaml',
+  'xml': 'xml',
+  'csv': 'csv',
+}
+
 /**
 * Evaluate a single question with a specific format and model
 */
@@ -33,10 +60,15 @@ export async function evaluateQuestion(
    model: LanguageModelV2
  },
 ): Promise<EvaluationResult> {
+  const primer = PRIMERS[formatName] ?? ''
+  const fence = FENCE[formatName] ?? ''
+
  const prompt = `
+${primer}
+
 Given the following data in ${formatName} format:

-\`\`\`
+\`\`\`${fence}
 ${formattedData}
 \`\`\`

--- a/benchmarks/src/questions/event-logs.ts
+++ b/benchmarks/src/questions/event-logs.ts
@@ -166,7 +166,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
  }

  // Filtering: endpoint AND retryable error
-  for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointAndStatus)) {
+  for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointRetryable)) {
    const count = logs.filter(l => l.endpoint === endpoint && l.error?.retryable === true).length
    questions.push(
      new QuestionBuilder()
--- a/benchmarks/src/questions/index.ts
+++ b/benchmarks/src/questions/index.ts
@@ -6,11 +6,12 @@ import { generateEventLogsQuestions } from './event-logs'
 import { generateGithubQuestions } from './github'
 import { generateNestedQuestions } from './nested'
 import { generateNestedConfigQuestions } from './nested-config'
+import { generateStructureQuestions } from './structure'
 import { generateTabularQuestions } from './tabular'
 import { createIdGenerator } from './utils'

 /**
- * Generate ~200 questions from all datasets
+ * Generate questions from all datasets
 *
 * @remarks
 * - Field Retrieval: Direct field access with no computation
@@ -19,6 +20,8 @@ import { createIdGenerator } from './utils'
 *   Examples: "How many X?", "What is the total/average?", "How many X > threshold?"
 * - Filtering: Multi-condition queries requiring complex logical operations
 *   Examples: "How many X WHERE condition1 AND condition2?"
+ * - Structure Awareness: Tests format-native structural affordances (TOON's [N] and {fields}, CSV's header)
+ *   Examples: "How many records?", "List the field names", "What is the last record's field?"
 */
 export function generateQuestions(): Question[] {
  const questions: Question[] = []
@@ -41,5 +44,8 @@ export function generateQuestions(): Question[] {
  questions.push(...generateEventLogsQuestions(eventLogs, getId))
  questions.push(...generateNestedConfigQuestions(nestedConfig, getId))

+  // Generate structure-awareness questions (tests format-native affordances)
+  questions.push(...generateStructureQuestions(tabular, nested, analytics, github, eventLogs, getId))
+
  return questions
 }
--- a/benchmarks/src/questions/nested-config.ts
+++ b/benchmarks/src/questions/nested-config.ts
@@ -152,7 +152,6 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
  // Aggregation: additional nested counts
  const totalPermissions = Object.values(config.permissions.roles).reduce((sum, role) => sum + role.permissions.length, 0)
  const distinctPermissions = new Set(Object.values(config.permissions.roles).flatMap(r => r.permissions)).size
-  const distinctScopes = new Set(config.authentication.providers.flatMap(p => p.scopes)).size
  const totalVariants = Object.values(config.features).reduce((sum, f) => sum + f.variants.length, 0)
  const highPriorityReplicas = config.database.replicas.filter(r => r.priority > 2).length
  const featuresWithHighRollout = Object.values(config.features).filter(f => f.rollout > 50).length
@@ -173,13 +172,6 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
      .type('aggregation')
      .dataset('nested-config')
      .build(),
-    new QuestionBuilder()
-      .id(getId())
-      .prompt('How many distinct scopes are defined across all authentication providers?')
-      .groundTruth(String(distinctScopes))
-      .type('aggregation')
-      .dataset('nested-config')
-      .build(),
    new QuestionBuilder()
      .id(getId())
      .prompt('What is the total number of variants across all feature flags?')
--- a/benchmarks/src/questions/structure.ts
+++ b/benchmarks/src/questions/structure.ts
@@ -0,0 +1,324 @@
+import type { AnalyticsMetric, Employee, EventLog, Order, Repository } from '../datasets'
+import type { Question } from '../types'
+import { QuestionBuilder } from './utils'
+
+/**
+ * Generate structure-awareness questions across all datasets
+ *
+ * These questions test format-native structural affordances:
+ * - TOON's explicit array length [N] and field declarations {fields}
+ * - CSV's header row (but no explicit length)
+ * - JSON/YAML have neither unless the model counts manually
+ *
+ * @remarks
+ * Questions are emitted per dataset in the order tabular, nested, analytics, github, event-logs,
+ * and `getId` is called exactly once per emitted question, in that order.
+ * A dataset with no records contributes no questions (its "last record" does not exist).
+ *
+ * @param employees - Records of the tabular dataset
+ * @param orders - Records of the nested dataset
+ * @param metrics - Records of the analytics dataset
+ * @param repos - Records of the github dataset
+ * @param logs - Records of the event-logs dataset
+ * @param getId - Supplies the id for each generated question
+ * @returns The generated questions, all of type `structure-awareness`
+ */
+export function generateStructureQuestions(
+  employees: Employee[],
+  orders: Order[],
+  metrics: AnalyticsMetric[],
+  repos: Repository[],
+  logs: EventLog[],
+  getId: () => string,
+): Question[] {
+  const questions: Question[] = []
+
+  // Every question in this module shares the same type; only dataset, prompt and answer vary
+  const add = (dataset: Question['dataset'], prompt: string, groundTruth: string): void => {
+    questions.push(
+      new QuestionBuilder()
+        .id(getId())
+        .prompt(prompt)
+        .groundTruth(groundTruth)
+        .type('structure-awareness')
+        .dataset(dataset)
+        .build(),
+    )
+  }
+
+  // ========== TABULAR DATASET (Employees) ==========
+
+  // Expected field names in order; the positional and field-count answers are derived from this list
+  const employeeFields = ['id', 'name', 'email', 'department', 'salary', 'yearsExperience', 'active'] as const
+  const lastEmployee = employees.at(-1)
+  if (lastEmployee) {
+    // Count: tests array length awareness
+    add('tabular', 'How many employees are in the dataset?', String(employees.length))
+    // Field list: tests field name awareness
+    add('tabular', 'List the field names for employees (comma-separated, in order).', employeeFields.join(','))
+    // Positional: tests TOON {fields} syntax
+    add('tabular', 'What is the 3rd field name for employees?', employeeFields[2])
+    // Last row: tests ability to find the last row using length
+    add('tabular', 'What is the department of the last employee in the dataset?', lastEmployee.department)
+    add('tabular', 'What is the name of the last employee in the dataset?', lastEmployee.name)
+    // Field count: tests schema awareness
+    add('tabular', 'How many fields does each employee record have?', String(employeeFields.length))
+  }
+
+  // ========== NESTED DATASET (Orders) ==========
+
+  const lastOrder = orders.at(-1)
+  if (lastOrder) {
+    add('nested', 'How many orders are in the dataset?', String(orders.length))
+    add('nested', 'List the top-level field names for orders (comma-separated, in order).', 'orderId,customer,items,subtotal,tax,total,status,orderDate')
+
+    // Nested count: the first order holding the most items (reduce is safe here, orders is non-empty)
+    const orderWithManyItems = orders.reduce((max, order) =>
+      order.items.length > max.items.length ? order : max,
+    )
+    add('nested', `How many items are in order ${orderWithManyItems.orderId}?`, String(orderWithManyItems.items.length))
+
+    add('nested', 'What are the field names for items within orders (comma-separated, in order)?', 'sku,name,quantity,price')
+    add('nested', 'What is the status of the last order in the dataset?', lastOrder.status)
+    add('nested', 'What are the field names for customer objects within orders (comma-separated, in order)?', 'id,name,email,phone')
+  }
+
+  // ========== ANALYTICS DATASET (Metrics) ==========
+
+  const metricFields = ['date', 'views', 'clicks', 'conversions', 'revenue', 'bounceRate'] as const
+  const lastMetric = metrics.at(-1)
+  if (lastMetric) {
+    add('analytics', 'How many metric records are in the dataset?', String(metrics.length))
+    add('analytics', 'List the field names for metrics (comma-separated, in order).', metricFields.join(','))
+    // Positional: tests TOON {fields} syntax
+    add('analytics', 'What is the 5th field name for analytics metrics?', metricFields[4])
+    add('analytics', 'What is the date of the last metric record in the dataset?', lastMetric.date)
+    add('analytics', 'How many fields does each metric record have?', String(metricFields.length))
+  }
+
+  // ========== GITHUB DATASET (Repositories) ==========
+
+  const repoFields = ['id', 'name', 'repo', 'description', 'stars', 'watchers', 'forks', 'defaultBranch', 'createdAt', 'updatedAt', 'pushedAt'] as const
+  const lastRepo = repos.at(-1)
+  if (lastRepo) {
+    add('github', 'How many repositories are in the dataset?', String(repos.length))
+    add('github', 'List the field names for repositories (comma-separated, in order).', repoFields.join(','))
+    // Positional: tests TOON {fields} syntax
+    add('github', 'What is the 7th field name for GitHub repositories?', repoFields[6])
+    add('github', 'What is the name of the last repository in the dataset?', lastRepo.name)
+    add('github', 'How many fields does each repository record have?', String(repoFields.length))
+  }
+
+  // ========== EVENT LOGS DATASET ==========
+
+  const lastLog = logs.at(-1)
+  if (lastLog) {
+    add('event-logs', 'How many log entries are in the dataset?', String(logs.length))
+    // Field list: base log fields (including optional error)
+    add('event-logs', 'List the field names for log entries (comma-separated, any order, including optional fields).', 'timestamp,level,endpoint,statusCode,responseTime,userId,error')
+    add('event-logs', 'What is the level of the last log entry in the dataset?', lastLog.level)
+  }
+
+  return questions
+}
--- a/benchmarks/src/report.ts
+++ b/benchmarks/src/report.ts
@@ -1,5 +1,5 @@
 import type { Dataset, EfficiencyRanking, EvaluationResult, FormatResult, Question } from './types'
-import { FORMATTER_DISPLAY_NAMES } from './constants'
+import { FORMATTER_DISPLAY_NAMES, QUESTION_TYPE_LABELS, QUESTION_TYPES } from './constants'
 import { ACCURACY_DATASETS } from './datasets'
 import { models } from './evaluate'
 import { supportsCSV } from './formatters'
@@ -22,9 +22,9 @@ export function calculateTokenCounts(
      if (formatName === 'csv' && !supportsCSV(dataset))
        continue

-      const formatted = formatter(dataset.data)
+      const formattedData = formatter(dataset.data)
      const key = `${formatName}-${dataset.name}`
-      tokenCounts[key] = tokenize(formatted)
+      tokenCounts[key] = tokenize(formattedData)
    }
  }

@@ -200,16 +200,21 @@ function generateDetailedAccuracyReport(

  // Generate performance by model
  const modelPerformance = generateModelPerformanceTable(formatResults, results, modelNames)
+
+  // Generate question type breakdown
+  const questionTypeBreakdown = generateQuestionTypeBreakdown(formatResults, results, questions)
  const totalQuestions = [...new Set(results.map(r => r.questionId))].length

  // Calculate question type distribution
  const fieldRetrievalCount = questions.filter(q => q.type === 'field-retrieval').length
  const aggregationCount = questions.filter(q => q.type === 'aggregation').length
  const filteringCount = questions.filter(q => q.type === 'filtering').length
+  const structureAwarenessCount = questions.filter(q => q.type === 'structure-awareness').length

  const fieldRetrievalPercent = ((fieldRetrievalCount / totalQuestions) * 100).toFixed(0)
  const aggregationPercent = ((aggregationCount / totalQuestions) * 100).toFixed(0)
  const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)
+  const structureAwarenessPercent = ((structureAwarenessCount / totalQuestions) * 100).toFixed(0)

  // Calculate dataset sizes
  const tabularSize = ACCURACY_DATASETS.find(d => d.name === 'tabular')?.data.employees?.length || 0
@@ -233,7 +238,11 @@ ${modelBreakdown}
 ${summaryComparison}

 <details>
-<summary><strong>Performance by dataset and model</strong></summary>
+<summary><strong>Performance by dataset, model, and question type</strong></summary>
+
+#### Performance by Question Type
+
+${questionTypeBreakdown}

 #### Performance by Dataset

@@ -265,9 +274,9 @@ Six datasets designed to test different structural patterns:

 #### Question Types

-${totalQuestions} questions are generated dynamically across three categories:
+${totalQuestions} questions are generated dynamically across four categories:

-\- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
+- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
  - Example: "What is Alice's salary?" → \`75000\`
  - Example: "How many items are in order ORD-0042?" → \`3\`
  - Example: "What is the customer name for order ORD-0042?" → \`John Doe\`
@@ -281,6 +290,11 @@ ${totalQuestions} questions are generated dynamically across three categories:
  - Example: "How many employees in Sales have salary > 80000?" → \`5\`
  - Example: "How many active employees have more than 10 years of experience?" → \`8\`

+- **Structure awareness (${structureAwarenessPercent}%)**: Tests format-native structural affordances (TOON's [N] count and {fields}, CSV's header row)
+  - Example: "How many employees are in the dataset?" → \`100\`
+  - Example: "List the field names for employees" → \`id, name, email, department, salary, yearsExperience, active\`
+  - Example: "What is the department of the last employee?" → \`Sales\`
+
 #### Evaluation Process

 1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => FORMATTER_DISPLAY_NAMES[f.format] || f.format).join(', ')}).
@@ -413,6 +427,48 @@ ${tableRows}
  }).filter(Boolean).join('\n').trim()
 }

+/**
+ * Generate a Markdown table of accuracy per question type (rows) and format (columns)
+ */
+function generateQuestionTypeBreakdown(
+  formatResults: FormatResult[],
+  results: EvaluationResult[],
+  questions: Question[],
+): string {
+  // Build header
+  const formatNames = formatResults.map(fr => FORMATTER_DISPLAY_NAMES[fr.format] || fr.format)
+  const header = `| Question Type | ${formatNames.join(' | ')} |`
+  const separator = `| ------------- | ${formatNames.map(() => '----').join(' | ')} |`
+
+  // Build rows; question types without any results are omitted
+  const rows = QUESTION_TYPES.map((type) => {
+    // Set lookup keeps the scan over results linear instead of O(results × questions)
+    const questionIds = new Set(questions.filter(q => q.type === type).map(q => q.id))
+    const typeResults = results.filter(r => questionIds.has(r.questionId))
+
+    if (typeResults.length === 0)
+      return undefined
+
+    const accuracies = formatResults.map((fr) => {
+      const formatTypeResults = typeResults.filter(r => r.format === fr.format)
+      if (formatTypeResults.length === 0)
+        return 'N/A'
+
+      // Non-empty past the guard above, so the ratio is always well-defined
+      const correctCount = formatTypeResults.filter(r => r.isCorrect).length
+      return `${((correctCount / formatTypeResults.length) * 100).toFixed(1)}%`
+    })
+
+    return `| ${QUESTION_TYPE_LABELS[type]} | ${accuracies.join(' | ')} |`
+  }).filter(Boolean)
+
+  return `
+${header}
+${separator}
+${rows.join('\n')}
+`.trim()
+}
+
 /**
 * Generate per-model performance comparison tables
 */
--- a/benchmarks/src/types.ts
+++ b/benchmarks/src/types.ts
@@ -1,11 +1,17 @@
+import type { DATASET_NAMES, QUESTION_TYPES, STRUCTURE_CLASSES } from './constants'
+
+export type QuestionType = typeof QUESTION_TYPES[number]
+export type DatasetName = typeof DATASET_NAMES[number]
+export type StructureClass = typeof STRUCTURE_CLASSES[number]
+
 export interface DatasetMetadata {
  supportsCSV: boolean
-  structureClass: 'uniform' | 'semi-uniform' | 'nested' | 'deep'
+  structureClass: StructureClass
  tabularEligibility: number
 }

 export interface Dataset {
-  name: string
+  name: DatasetName
  description: string
  data: Record<string, any>
  metadata: DatasetMetadata
@@ -15,8 +21,8 @@ export interface Question {
  id: string
  prompt: string
  groundTruth: string
-  type: 'field-retrieval' | 'aggregation' | 'filtering'
-  dataset: string
+  type: QuestionType
+  dataset: DatasetName
 }

 export interface EvaluationResult {