chore(benchmarks): rebalance question limits, add event-log and nested-config questions, inline helpers

2026-01-29 23:34:10 +08:00 · 2025-11-06 15:51:31 +01:00
parent bc711ccecf
commit a9d52fc69b
15 changed files with 1647 additions and 213 deletions
--- a/benchmarks/src/constants.ts
+++ b/benchmarks/src/constants.ts
@@ -101,10 +101,10 @@ export const QUESTION_THRESHOLDS = {
 */
 export const QUESTION_LIMITS = {
  tabular: {
-    fieldRetrieval: 20,
-    aggregationDepartments: 6,
-    filteringMultiConditionDepartments: 6,
-    filteringExperience: 4,
+    fieldRetrieval: 14,
+    aggregationDepartments: 4,
+    filteringMultiConditionDepartments: 5,
+    filteringExperience: 3,
    filteringDepartmentExp: 3,
    filteringDepartmentActive: 3,
  },
@@ -116,7 +116,7 @@ export const QUESTION_LIMITS = {
    filteringStatusAndItems: 3,
  },
  analytics: {
-    fieldRetrievalDates: 13,
+    fieldRetrievalDates: 9,
  },
  github: {
    fieldRetrievalRepos: 11,
@@ -125,12 +125,12 @@ export const QUESTION_LIMITS = {
  },
  eventLogs: {
    fieldRetrieval: 10,
-    aggregationEndpoints: 3,
-    filteringLevelAndStatus: 2,
-    filteringEndpointAndStatus: 2,
+    aggregationEndpoints: 4,
+    filteringLevelAndStatus: 3,
+    filteringEndpointAndStatus: 3,
  },
  nestedConfig: {
-    fieldRetrieval: 5,
-    filteringComplex: 2,
+    fieldRetrieval: 10,
+    filteringComplex: 6,
  },
 } as const
--- a/benchmarks/src/datasets.ts
+++ b/benchmarks/src/datasets.ts
@@ -5,67 +5,6 @@ import githubRepos from '../data/github-repos.json' with { type: 'json' }
 // Seed for reproducibility
 faker.seed(12345)

-/**
- * Calculate the tabular eligibility percentage of a data structure
- *
- * @remarks
- * Recursively analyzes data to determine what percentage of arrays qualify
- * for TOON's tabular format (uniform objects with primitive values only).
- */
-export function calculateTabularEligibility(data: unknown): number {
-  let totalArrays = 0
-  let tabularArrays = 0
-
-  function isTabularArray(arr: unknown[]): boolean {
-    if (arr.length === 0)
-      return false
-
-    // Check if all elements are objects
-    if (!arr.every(item => typeof item === 'object' && item !== null && !Array.isArray(item)))
-      return false
-
-    // Get keys from first object
-    const firstKeys = Object.keys(arr[0] as Record<string, unknown>)
-    if (firstKeys.length === 0)
-      return false
-
-    // Check if all objects have the same keys and only primitive values
-    return arr.every((item) => {
-      const itemObj = item as Record<string, unknown>
-      const itemKeys = Object.keys(itemObj)
-      if (itemKeys.length !== firstKeys.length)
-        return false
-      if (!firstKeys.every(key => itemKeys.includes(key)))
-        return false
-
-      // Check if all values are primitives (no nested objects or arrays)
-      return firstKeys.every((key) => {
-        const value = itemObj[key]
-        return value === null || ['string', 'number', 'boolean'].includes(typeof value)
-      })
-    })
-  }
-
-  function traverse(obj: unknown): void {
-    if (Array.isArray(obj)) {
-      totalArrays++
-      if (isTabularArray(obj))
-        tabularArrays++
-
-      // Continue traversing array elements
-      obj.forEach(item => traverse(item))
-    }
-    else if (typeof obj === 'object' && obj !== null) {
-      // Traverse object properties
-      Object.values(obj).forEach(value => traverse(value))
-    }
-  }
-
-  traverse(data)
-
-  return totalArrays === 0 ? 0 : Math.round((tabularArrays / totalArrays) * 100)
-}
-
 /**
 * Employee record structure for tabular dataset
 */
@@ -275,7 +214,7 @@ const tabularDataset: Dataset = {
  metadata: {
    supportsCSV: true,
    structureClass: 'uniform',
-    tabularEligibility: 100,
+    tabularEligibility: 100, // All arrays contain uniform objects with primitive values only
  },
 }

@@ -285,38 +224,21 @@ const tabularDataset: Dataset = {
 const PRODUCT_NAMES = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp'] as const
 const ORDER_STATUSES = ['pending', 'processing', 'shipped', 'delivered', 'cancelled'] as const

-const ORDER_CONSTANTS = {
-  CUSTOMER_ID_MOD: 20,
-  MIN_ITEMS: 1,
-  MAX_ITEMS: 4,
-  MIN_ITEM_PRICE: 9.99,
-  MAX_ITEM_PRICE: 199.99,
-  MIN_ITEM_QUANTITY: 1,
-  MAX_ITEM_QUANTITY: 5,
-  SKU_LENGTH: 6,
-  ORDER_ID_PADDING: 4,
-  RECENT_DAYS: 90,
-  TAX_RATE: 0.08,
-} as const
-
 function generateOrders(count: number): { orders: Order[] } {
  return {
    orders: Array.from({ length: count }, (_, i) => {
-      const customerId = (i % ORDER_CONSTANTS.CUSTOMER_ID_MOD) + 1
-      const itemCount = faker.number.int({ min: ORDER_CONSTANTS.MIN_ITEMS, max: ORDER_CONSTANTS.MAX_ITEMS })
+      const customerId = (i % 20) + 1 // Rotate through 20 customers
+      const itemCount = faker.number.int({ min: 1, max: 4 }) // 1-4 items per order

      const items = Array.from({ length: itemCount }, (_, j) => {
        const price = faker.number.float({
-          min: ORDER_CONSTANTS.MIN_ITEM_PRICE,
-          max: ORDER_CONSTANTS.MAX_ITEM_PRICE,
+          min: 9.99,
+          max: 199.99,
          fractionDigits: 2,
        })
-        const quantity = faker.number.int({
-          min: ORDER_CONSTANTS.MIN_ITEM_QUANTITY,
-          max: ORDER_CONSTANTS.MAX_ITEM_QUANTITY,
-        })
+        const quantity = faker.number.int({ min: 1, max: 5 })
        return {
-          sku: `SKU-${faker.string.alphanumeric({ length: ORDER_CONSTANTS.SKU_LENGTH }).toUpperCase()}`,
+          sku: `SKU-${faker.string.alphanumeric({ length: 6 }).toUpperCase()}`,
          name: PRODUCT_NAMES[j % PRODUCT_NAMES.length]!,
          quantity,
          price,
@@ -324,11 +246,11 @@ function generateOrders(count: number): { orders: Order[] } {
      })

      const subtotal = Number(items.reduce((sum, item) => sum + (item.price * item.quantity), 0).toFixed(2))
-      const tax = Number((subtotal * ORDER_CONSTANTS.TAX_RATE).toFixed(2))
+      const tax = Number((subtotal * 0.08).toFixed(2)) // 8% tax rate
      const total = Number((subtotal + tax).toFixed(2))

      return {
-        orderId: `ORD-${String(i + 1).padStart(ORDER_CONSTANTS.ORDER_ID_PADDING, '0')}`,
+        orderId: `ORD-${String(i + 1).padStart(4, '0')}`,
        customer: {
          id: customerId,
          name: faker.person.fullName(),
@@ -340,7 +262,7 @@ function generateOrders(count: number): { orders: Order[] } {
        tax,
        total,
        status: ORDER_STATUSES[i % ORDER_STATUSES.length]!,
-        orderDate: faker.date.recent({ days: ORDER_CONSTANTS.RECENT_DAYS }).toISOString().split('T')[0],
+        orderDate: faker.date.recent({ days: 90 }).toISOString().split('T')[0],
      }
    }),
  }
@@ -359,7 +281,7 @@ const nestedDataset: Dataset = {
  metadata: {
    supportsCSV: false,
    structureClass: 'nested',
-    tabularEligibility: 33, // orders array is not tabular, but items arrays within are
+    tabularEligibility: 33, // Top-level orders array has nested objects (not tabular), but nested items arrays are tabular
  },
 }

@@ -376,7 +298,7 @@ const analyticsDataset: Dataset = {
  metadata: {
    supportsCSV: true,
    structureClass: 'uniform',
-    tabularEligibility: 100,
+    tabularEligibility: 100, // Uniform time-series records with consistent primitive fields
  },
 }

@@ -395,7 +317,7 @@ const githubDataset: Dataset = {
  metadata: {
    supportsCSV: true,
    structureClass: 'uniform',
-    tabularEligibility: 100,
+    tabularEligibility: 100, // Repository array contains uniform objects with primitive values
  },
 }

@@ -597,7 +519,7 @@ const eventLogsDataset: Dataset = {
  metadata: {
    supportsCSV: false,
    structureClass: 'semi-uniform',
-    tabularEligibility: 50, // ~50% of logs have nested error objects
+    tabularEligibility: 50, // ~50% of logs carry an optional nested error object, so the logs array is not fully tabular
  },
 }

@@ -614,7 +536,7 @@ const nestedConfigDataset: Dataset = {
  metadata: {
    supportsCSV: false,
    structureClass: 'deep',
-    tabularEligibility: 0, // Highly nested, minimal tabular arrays
+    tabularEligibility: 0, // Deeply nested configuration with no tabular arrays
  },
 }

@@ -642,7 +564,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
    metadata: {
      supportsCSV: true,
      structureClass: 'uniform',
-      tabularEligibility: 100,
+      tabularEligibility: 100, // All arrays contain uniform objects with primitive values only
    },
  },
  // Nested: 500 orders
@@ -653,7 +575,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
    metadata: {
      supportsCSV: false,
      structureClass: 'nested',
-      tabularEligibility: 33,
+      tabularEligibility: 33, // Top-level orders array has nested objects (not tabular), but nested items arrays are tabular
    },
  },
  // Analytics: 365 days
@@ -664,7 +586,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
    metadata: {
      supportsCSV: true,
      structureClass: 'uniform',
-      tabularEligibility: 100,
+      tabularEligibility: 100, // Uniform time-series records with consistent primitive fields
    },
  },
  // GitHub: 100 repos (same as accuracy)
@@ -677,7 +599,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
    metadata: {
      supportsCSV: false,
      structureClass: 'semi-uniform',
-      tabularEligibility: 50,
+      tabularEligibility: 50, // ~50% of logs carry an optional nested error object, so the logs array is not fully tabular
    },
  },
  // Nested config: 1 config (same as accuracy)
--- a/benchmarks/src/evaluate.ts
+++ b/benchmarks/src/evaluate.ts
@@ -4,7 +4,6 @@ import { anthropic } from '@ai-sdk/anthropic'
 import { google } from '@ai-sdk/google'
 import { openai } from '@ai-sdk/openai'
 import { xai } from '@ai-sdk/xai'
-import * as prompts from '@clack/prompts'
 import { generateText } from 'ai'

 /**
@@ -102,17 +101,10 @@ Is the actual answer correct? Consider:
 Respond with only "YES" or "NO".
 `.trim()

-  try {
-    const { text } = await generateText({
-      model: models.find(m => m.modelId === 'gpt-5-nano')!,
-      prompt,
-    })
+  const { text } = await generateText({
+    model: models.find(m => m.modelId === 'gpt-5-nano')!,
+    prompt,
+  })

-    return text.trim().toUpperCase() === 'YES'
-  }
-  catch (error) {
-    prompts.log.error(`Validation error: ${error}`)
-    // Fallback to simple string comparison
-    return actual.toLowerCase().trim() === expected.toLowerCase().trim()
-  }
+  return text.trim().toUpperCase() === 'YES'
 }
--- a/benchmarks/src/questions/analytics.ts
+++ b/benchmarks/src/questions/analytics.ts
@@ -1,7 +1,7 @@
 import type { AnalyticsMetric } from '../datasets'
 import type { Question } from '../types'
 import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
-import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
+import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'

 /**
 * Generate analytics (website metrics) questions
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
 export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: () => string): Question[] {
  const questions: Question[] = []

-  if (metrics.length === 0)
-    return questions
-
  // Field retrieval: date-based metrics
  const metricFieldGenerators: Array<(metric: AnalyticsMetric, getId: () => string) => Question> = [
    (metric, getId) => new QuestionBuilder()
@@ -99,7 +96,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()

  // Aggregation: high views/conversions
  for (const threshold of QUESTION_THRESHOLDS.analytics.views) {
-    const count = countByPredicate(metrics, m => m.views > threshold)
+    const count = metrics.filter(m => m.views > threshold).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -112,7 +109,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
  }

  for (const threshold of QUESTION_THRESHOLDS.analytics.conversions) {
-    const count = countByPredicate(metrics, m => m.conversions > threshold)
+    const count = metrics.filter(m => m.conversions > threshold).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -126,10 +123,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()

  // Filtering: multi-condition (views AND revenue)
  for (const threshold of QUESTION_THRESHOLDS.analytics.viewsForFiltering) {
-    const count = countByPredicate(
-      metrics,
+    const count = metrics.filter(
      m => m.views > threshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForFiltering,
-    )
+    ).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -143,10 +139,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()

  // Filtering: revenue thresholds
  for (const threshold of QUESTION_THRESHOLDS.analytics.revenueThresholds) {
-    const count = countByPredicate(
-      metrics,
+    const count = metrics.filter(
      m => m.revenue > threshold && m.views > QUESTION_THRESHOLDS.analytics.viewsThresholdForRevenue,
-    )
+    ).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -160,10 +155,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()

  // Filtering: clicks and conversions
  for (const threshold of QUESTION_THRESHOLDS.analytics.clicksForFiltering) {
-    const count = countByPredicate(
-      metrics,
+    const count = metrics.filter(
      m => m.clicks > threshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForClickFiltering,
-    )
+    ).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -177,10 +171,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()

  // Filtering: revenue and bounce rate
  for (const threshold of QUESTION_THRESHOLDS.analytics.revenueForBounceRate) {
-    const count = countByPredicate(
-      metrics,
+    const count = metrics.filter(
      m => m.revenue > threshold && m.bounceRate < QUESTION_THRESHOLDS.analytics.bounceRateThreshold,
-    )
+    ).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
--- a/benchmarks/src/questions/event-logs.ts
+++ b/benchmarks/src/questions/event-logs.ts
@@ -1,7 +1,7 @@
 import type { EventLog } from '../datasets'
 import type { Question } from '../types'
 import { QUESTION_LIMITS } from '../constants'
-import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
+import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'

 /**
 * Generate event log questions
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
 export function generateEventLogsQuestions(logs: EventLog[], getId: () => string): Question[] {
  const questions: Question[] = []

-  if (logs.length === 0)
-    return questions
-
  // Field retrieval: log metadata
  const logFieldGenerators: Array<(log: EventLog, getId: () => string) => Question> = [
    (log, getId) => new QuestionBuilder()
@@ -76,7 +73,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
  // Aggregation: by level
  const levels = [...new Set(logs.map(l => l.level))]
  for (const level of levels) {
-    const count = countByPredicate(logs, l => l.level === level)
+    const count = logs.filter(l => l.level === level).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -91,7 +88,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
  // Aggregation: by endpoint
  const endpoints = [...new Set(logs.map(l => l.endpoint))]
  for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.aggregationEndpoints)) {
-    const count = countByPredicate(logs, l => l.endpoint === endpoint)
+    const count = logs.filter(l => l.endpoint === endpoint).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -104,8 +101,8 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
  }

  // Aggregation: by status code range
-  const errorCount = countByPredicate(logs, l => l.statusCode >= 400)
-  const successCount = countByPredicate(logs, l => l.statusCode >= 200 && l.statusCode < 300)
+  const errorCount = logs.filter(l => l.statusCode >= 400).length
+  const successCount = logs.filter(l => l.statusCode >= 200 && l.statusCode < 300).length

  questions.push(
    new QuestionBuilder()
@@ -124,12 +121,21 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
      .build(),
  )

+  // Aggregation: retryable errors
+  const retryableErrorCount = logs.filter(l => l.error?.retryable === true).length
+  questions.push(
+    new QuestionBuilder()
+      .id(getId())
+      .prompt('How many log entries have a retryable error?')
+      .groundTruth(String(retryableErrorCount))
+      .type('aggregation')
+      .dataset('event-logs')
+      .build(),
+  )
+
  // Filtering: multi-condition (level AND status)
  for (const level of levels.slice(0, QUESTION_LIMITS.eventLogs.filteringLevelAndStatus)) {
-    const count = countByPredicate(
-      logs,
-      l => l.level === level && l.statusCode >= 400,
-    )
+    const count = logs.filter(l => l.level === level && l.statusCode >= 400).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -143,10 +149,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string

  // Filtering: endpoint AND status
  for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointAndStatus)) {
-    const count = countByPredicate(
-      logs,
-      l => l.endpoint === endpoint && l.statusCode >= 500,
-    )
+    const count = logs.filter(l => l.endpoint === endpoint && l.statusCode >= 500).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -158,5 +161,19 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
    )
  }

+  // Filtering: endpoint AND retryable error
+  for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointAndStatus)) {
+    const count = logs.filter(l => l.endpoint === endpoint && l.error?.retryable === true).length
+    questions.push(
+      new QuestionBuilder()
+        .id(getId())
+        .prompt(`How many log entries for endpoint "${endpoint}" have a retryable error?`)
+        .groundTruth(String(count))
+        .type('filtering')
+        .dataset('event-logs')
+        .build(),
+    )
+  }
+
  return questions
 }
--- a/benchmarks/src/questions/github.ts
+++ b/benchmarks/src/questions/github.ts
@@ -1,7 +1,7 @@
 import type { Repository } from '../datasets'
 import type { Question } from '../types'
 import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
-import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
+import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'

 /**
 * Generate GitHub repository questions
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
 export function generateGithubQuestions(repos: Repository[], getId: () => string): Question[] {
  const questions: Question[] = []

-  if (repos.length === 0)
-    return questions
-
  // Field retrieval: repository metadata
  const repoFieldGenerators: Array<(repo: Repository, getId: () => string) => Question> = [
    (repo, getId) => new QuestionBuilder()
@@ -92,7 +89,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
  // Aggregation: by default branch
  const branches = [...new Set(repos.map(r => r.defaultBranch))]
  for (const branch of branches.slice(0, QUESTION_LIMITS.github.aggregationBranches)) {
-    const count = countByPredicate(repos, r => r.defaultBranch === branch)
+    const count = repos.filter(r => r.defaultBranch === branch).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -106,7 +103,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string

  // Aggregation: high star counts
  for (const threshold of QUESTION_THRESHOLDS.github.stars) {
-    const count = countByPredicate(repos, r => r.stars > threshold)
+    const count = repos.filter(r => r.stars > threshold).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -120,7 +117,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string

  // Aggregation: high fork counts
  for (const threshold of QUESTION_THRESHOLDS.github.forks) {
-    const count = countByPredicate(repos, r => r.forks > threshold)
+    const count = repos.filter(r => r.forks > threshold).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -134,7 +131,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string

  // Aggregation: high watcher counts
  for (const threshold of QUESTION_THRESHOLDS.github.watchers) {
-    const count = countByPredicate(repos, r => r.watchers > threshold)
+    const count = repos.filter(r => r.watchers > threshold).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -148,10 +145,9 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string

  // Filtering: multi-condition (stars AND forks)
  for (const combo of QUESTION_THRESHOLDS.github.starForkCombinations.slice(0, QUESTION_LIMITS.github.filteringStarsAndForks)) {
-    const count = countByPredicate(
-      repos,
+    const count = repos.filter(
      r => r.stars > combo.stars && r.forks > combo.forks,
-    )
+    ).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -165,10 +161,9 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string

  // Filtering: stars AND watchers
  for (const combo of QUESTION_THRESHOLDS.github.starWatcherCombinations) {
-    const count = countByPredicate(
-      repos,
+    const count = repos.filter(
      r => r.stars > combo.stars && r.watchers > combo.watchers,
-    )
+    ).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
--- a/benchmarks/src/questions/index.ts
+++ b/benchmarks/src/questions/index.ts
@@ -10,10 +10,9 @@ import { generateTabularQuestions } from './tabular'
 import { createIdGenerator } from './utils'

 /**
- * Generate all questions from datasets
+ * Generate ~200 questions from all datasets
 *
 * @remarks
- * Generates ~150-160 questions across different question types and datasets:
 * - Field Retrieval: Direct field access with no computation
 *   Examples: "What is X's salary?", "What is the status of order Y?"
 * - Aggregation: Counts, sums, averages, min/max operations (including single-condition filters)
--- a/benchmarks/src/questions/nested-config.ts
+++ b/benchmarks/src/questions/nested-config.ts
@@ -34,6 +34,26 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
      prompt: 'What is the session duration?',
      groundTruth: String(config.authentication.session.duration),
    },
+    {
+      prompt: 'What is the minimum connection pool size?',
+      groundTruth: String(config.database.pool.min),
+    },
+    {
+      prompt: 'What is the connection pool idle timeout?',
+      groundTruth: String(config.database.pool.idleTimeout),
+    },
+    {
+      prompt: 'What is the database name?',
+      groundTruth: config.database.name,
+    },
+    {
+      prompt: 'What is the session refresh threshold?',
+      groundTruth: String(config.authentication.session.refreshThreshold),
+    },
+    {
+      prompt: 'What is the version in the configuration?',
+      groundTruth: config.version,
+    },
  ]

  for (const q of fieldRetrievalQuestions.slice(0, QUESTION_LIMITS.nestedConfig.fieldRetrieval)) {
@@ -93,6 +113,18 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
      .build(),
  )

+  // Aggregation: providers with admin scope
+  const adminScopeProviderCount = config.authentication.providers.filter(p => p.scopes.includes('admin')).length
+  questions.push(
+    new QuestionBuilder()
+      .id(getId())
+      .prompt('How many authentication providers include the "admin" scope?')
+      .groundTruth(String(adminScopeProviderCount))
+      .type('aggregation')
+      .dataset('nested-config')
+      .build(),
+  )
+
  // Aggregation: feature flag details
  const enabledFeatures = Object.entries(config.features).filter(([_, f]) => f.enabled).length
  questions.push(
@@ -117,6 +149,67 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
      .build(),
  )

+  // Aggregation: additional nested counts
+  const totalPermissions = Object.values(config.permissions.roles).reduce((sum, role) => sum + role.permissions.length, 0)
+  const distinctPermissions = new Set(Object.values(config.permissions.roles).flatMap(r => r.permissions)).size
+  const distinctScopes = new Set(config.authentication.providers.flatMap(p => p.scopes)).size
+  const totalVariants = Object.values(config.features).reduce((sum, f) => sum + f.variants.length, 0)
+  const highPriorityReplicas = config.database.replicas.filter(r => r.priority > 2).length
+  const featuresWithHighRollout = Object.values(config.features).filter(f => f.rollout > 50).length
+  const groupsWithMultipleRoles = Object.values(config.permissions.groups).filter(g => g.roles.length > 1).length
+
+  questions.push(
+    new QuestionBuilder()
+      .id(getId())
+      .prompt('What is the total number of permissions across all roles?')
+      .groundTruth(String(totalPermissions))
+      .type('aggregation')
+      .dataset('nested-config')
+      .build(),
+    new QuestionBuilder()
+      .id(getId())
+      .prompt('How many distinct permissions are defined across all roles?')
+      .groundTruth(String(distinctPermissions))
+      .type('aggregation')
+      .dataset('nested-config')
+      .build(),
+    new QuestionBuilder()
+      .id(getId())
+      .prompt('How many distinct scopes are defined across all authentication providers?')
+      .groundTruth(String(distinctScopes))
+      .type('aggregation')
+      .dataset('nested-config')
+      .build(),
+    new QuestionBuilder()
+      .id(getId())
+      .prompt('What is the total number of variants across all feature flags?')
+      .groundTruth(String(totalVariants))
+      .type('aggregation')
+      .dataset('nested-config')
+      .build(),
+    new QuestionBuilder()
+      .id(getId())
+      .prompt('How many database replicas have a priority greater than 2?')
+      .groundTruth(String(highPriorityReplicas))
+      .type('aggregation')
+      .dataset('nested-config')
+      .build(),
+    new QuestionBuilder()
+      .id(getId())
+      .prompt('How many feature flags have a rollout percentage greater than 50?')
+      .groundTruth(String(featuresWithHighRollout))
+      .type('aggregation')
+      .dataset('nested-config')
+      .build(),
+    new QuestionBuilder()
+      .id(getId())
+      .prompt('How many groups have more than one role assigned?')
+      .groundTruth(String(groupsWithMultipleRoles))
+      .type('aggregation')
+      .dataset('nested-config')
+      .build(),
+  )
+
  // Filtering: complex multi-condition queries
  const filteringQuestions = [
    {
@@ -129,6 +222,31 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
      groundTruth: String(Object.entries(config.permissions.groups)
        .filter(([_, g]) => g.roles.includes('admin')).length),
    },
+    {
+      prompt: 'How many database replicas have priority greater than 2 and port 5432?',
+      groundTruth: String(config.database.replicas
+        .filter(r => r.priority > 2 && r.port === 5432).length),
+    },
+    {
+      prompt: 'How many authentication providers have more than 2 scopes?',
+      groundTruth: String(config.authentication.providers
+        .filter(p => p.scopes.length > 2).length),
+    },
+    {
+      prompt: 'How many roles have at least 5 permissions?',
+      groundTruth: String(Object.values(config.permissions.roles)
+        .filter(r => r.permissions.length >= 5).length),
+    },
+    {
+      prompt: 'How many feature flags are disabled with rollout less than 25%?',
+      groundTruth: String(Object.values(config.features)
+        .filter(f => !f.enabled && f.rollout < 25).length),
+    },
+    {
+      prompt: 'How many enabled features have at least 2 variants?',
+      groundTruth: String(Object.values(config.features)
+        .filter(f => f.enabled && f.variants.length >= 2).length),
+    },
  ]

  for (const q of filteringQuestions.slice(0, QUESTION_LIMITS.nestedConfig.filteringComplex)) {
--- a/benchmarks/src/questions/nested.ts
+++ b/benchmarks/src/questions/nested.ts
@@ -1,7 +1,7 @@
 import type { Order } from '../datasets'
 import type { Question } from '../types'
 import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
-import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
+import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'

 /**
 * Generate nested (orders) questions
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
 export function generateNestedQuestions(orders: Order[], getId: () => string): Question[] {
  const questions: Question[] = []

-  if (orders.length === 0)
-    return questions
-
  // Field retrieval: order totals and statuses
  const orderFieldGenerators: Array<(order: Order, getId: () => string) => Question> = [
    (order, getId) => new QuestionBuilder()
@@ -89,7 +86,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
  // Count by status
  const statuses = [...new Set(orders.map(o => o.status))]
  for (const status of statuses.slice(0, QUESTION_LIMITS.nested.aggregationStatuses)) {
-    const count = countByPredicate(orders, o => o.status === status)
+    const count = orders.filter(o => o.status === status).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -134,7 +131,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q

  // Aggregation: high-value orders (single-condition filter)
  for (const threshold of QUESTION_THRESHOLDS.nested.highValueOrders) {
-    const count = countByPredicate(orders, o => o.total > threshold)
+    const count = orders.filter(o => o.total > threshold).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -149,10 +146,9 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
  // Filtering: multi-condition queries (status AND value)
  const orderStatuses = [...new Set(orders.map(o => o.status))]
  for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndValue)) {
-    const count = countByPredicate(
-      orders,
+    const count = orders.filter(
      o => o.status === status && o.total > QUESTION_THRESHOLDS.nested.statusValueThreshold,
-    )
+    ).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -166,10 +162,9 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q

  // Filtering: status AND items count (multi-condition)
  for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndItems)) {
-    const count = countByPredicate(
-      orders,
+    const count = orders.filter(
      o => o.status === status && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold,
-    )
+    ).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -183,10 +178,9 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q

  // Filtering: total AND items count (multi-condition)
  for (const threshold of QUESTION_THRESHOLDS.nested.totalThresholdsForItems) {
-    const count = countByPredicate(
-      orders,
+    const count = orders.filter(
      o => o.total > threshold && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold,
-    )
+    ).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
--- a/benchmarks/src/questions/tabular.ts
+++ b/benchmarks/src/questions/tabular.ts
@@ -1,7 +1,7 @@
 import type { Employee } from '../datasets'
 import type { Question } from '../types'
 import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
-import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
+import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'

 /**
 * Generate tabular (employee) questions
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
 export function generateTabularQuestions(employees: Employee[], getId: () => string): Question[] {
  const questions: Question[] = []

-  if (employees.length === 0)
-    return questions
-
  // Field retrieval: specific employees
  const fieldGenerators: Array<(emp: Employee, getId: () => string) => Question> = [
    (emp, getId) => new QuestionBuilder()
@@ -62,7 +59,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
  // Aggregation: count by department
  const departments = [...new Set(employees.map(e => e.department))]
  for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.aggregationDepartments)) {
-    const count = countByPredicate(employees, e => e.department === dept)
+    const count = employees.filter(e => e.department === dept).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -76,7 +73,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str

  // Aggregation: salary ranges (single-condition filters)
  for (const threshold of QUESTION_THRESHOLDS.tabular.salaryRanges) {
-    const count = countByPredicate(employees, e => e.salary > threshold)
+    const count = employees.filter(e => e.salary > threshold).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -91,8 +88,8 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
  // Aggregation: totals and averages
  const totalEmployees = employees.length
  const avgSalary = Math.round(employees.reduce((sum, e) => sum + e.salary, 0) / totalEmployees)
-  const activeCount = countByPredicate(employees, e => e.active)
-  const inactiveCount = countByPredicate(employees, e => !e.active)
+  const activeCount = employees.filter(e => e.active).length
+  const inactiveCount = employees.filter(e => !e.active).length

  questions.push(
    new QuestionBuilder()
@@ -127,10 +124,9 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str

  // Filtering: count by department with salary filter (multi-condition)
  for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringMultiConditionDepartments)) {
-    const count = countByPredicate(
-      employees,
+    const count = employees.filter(
      e => e.department === dept && e.salary > QUESTION_THRESHOLDS.tabular.departmentSalaryThreshold,
-    )
+    ).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -144,7 +140,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str

  // Filtering: active employees by experience (multi-condition)
  for (const exp of QUESTION_THRESHOLDS.tabular.experienceYears.slice(0, QUESTION_LIMITS.tabular.filteringExperience)) {
-    const count = countByPredicate(employees, e => e.yearsExperience > exp && e.active)
+    const count = employees.filter(e => e.yearsExperience > exp && e.active).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -158,10 +154,9 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str

  // Filtering: department by experience (multi-condition)
  for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentExp)) {
-    const count = countByPredicate(
-      employees,
+    const count = employees.filter(
      e => e.department === dept && e.yearsExperience > QUESTION_THRESHOLDS.tabular.departmentExperienceThreshold,
-    )
+    ).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
@@ -175,7 +170,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str

  // Filtering: department by active status (multi-condition)
  for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentActive)) {
-    const count = countByPredicate(employees, e => e.department === dept && e.active)
+    const count = employees.filter(e => e.department === dept && e.active).length
    questions.push(
      new QuestionBuilder()
        .id(getId())
--- a/benchmarks/src/questions/utils.ts
+++ b/benchmarks/src/questions/utils.ts
@@ -61,14 +61,7 @@ export class QuestionBuilder {
 }

 /**
- * Helper: Count items matching a predicate
- */
-export function countByPredicate<T>(items: T[], predicate: (item: T) => boolean): number {
-  return items.filter(predicate).length
-}
-
-/**
- * Helper: Rotate through question generators
+ * Rotate through question generators
 */
 export function rotateQuestions<T>(
  items: T[],
--- a/benchmarks/src/types.ts
+++ b/benchmarks/src/types.ts
@@ -15,7 +15,7 @@ export interface Question {
  id: string
  prompt: string
  groundTruth: string
-  type: 'field-retrieval' | 'aggregation' | 'filtering' | 'comparison'
+  type: 'field-retrieval' | 'aggregation' | 'filtering'
  dataset: string
 }