chore: more work on benchmarks

This commit is contained in:
Johann Schopplich
2025-11-06 15:51:31 +01:00
parent bc711ccecf
commit a9d52fc69b
15 changed files with 1647 additions and 213 deletions

View File

@@ -101,10 +101,10 @@ export const QUESTION_THRESHOLDS = {
*/
export const QUESTION_LIMITS = {
tabular: {
fieldRetrieval: 20,
aggregationDepartments: 6,
filteringMultiConditionDepartments: 6,
filteringExperience: 4,
fieldRetrieval: 14,
aggregationDepartments: 4,
filteringMultiConditionDepartments: 5,
filteringExperience: 3,
filteringDepartmentExp: 3,
filteringDepartmentActive: 3,
},
@@ -116,7 +116,7 @@ export const QUESTION_LIMITS = {
filteringStatusAndItems: 3,
},
analytics: {
fieldRetrievalDates: 13,
fieldRetrievalDates: 9,
},
github: {
fieldRetrievalRepos: 11,
@@ -125,12 +125,12 @@ export const QUESTION_LIMITS = {
},
eventLogs: {
fieldRetrieval: 10,
aggregationEndpoints: 3,
filteringLevelAndStatus: 2,
filteringEndpointAndStatus: 2,
aggregationEndpoints: 4,
filteringLevelAndStatus: 3,
filteringEndpointAndStatus: 3,
},
nestedConfig: {
fieldRetrieval: 5,
filteringComplex: 2,
fieldRetrieval: 10,
filteringComplex: 6,
},
} as const

View File

@@ -5,67 +5,6 @@ import githubRepos from '../data/github-repos.json' with { type: 'json' }
// Seed for reproducibility
faker.seed(12345)
/**
 * Measure how much of a data structure fits TOON's tabular format.
 *
 * @remarks
 * Every array reachable from `data` (through array elements and object
 * property values) is counted. An array counts as tabular when it is
 * non-empty, each element is a non-null, non-array object, every element has
 * exactly the key set of the first element (which must have at least one
 * key), and every value is `null`, a string, a number, or a boolean.
 *
 * @param data - Value to inspect; values that are neither arrays nor objects contribute nothing
 * @returns Tabular arrays as a percentage of all arrays found, rounded to the
 * nearest integer, or `0` when no arrays are found
 */
export function calculateTabularEligibility(data: unknown): number {
  const primitiveTypeNames = ['string', 'number', 'boolean']

  // True for anything `typeof` reports as an object, excluding null and arrays
  const isPlainRecord = (candidate: unknown): boolean =>
    typeof candidate === 'object' && candidate !== null && !Array.isArray(candidate)

  // Only null, strings, numbers and booleans are allowed as cell values
  const isPrimitiveCell = (cell: unknown): boolean =>
    cell === null || primitiveTypeNames.includes(typeof cell)

  // A row matches when it has exactly the expected keys and only primitive cells
  const rowMatchesColumns = (row: unknown, columns: string[]): boolean => {
    const record = row as Record<string, unknown>
    const rowKeys = Object.keys(record)
    if (rowKeys.length !== columns.length)
      return false

    for (const column of columns) {
      if (!rowKeys.includes(column))
        return false
    }

    for (const column of columns) {
      if (!isPrimitiveCell(record[column]))
        return false
    }

    return true
  }

  const looksTabular = (rows: unknown[]): boolean => {
    if (rows.length === 0)
      return false

    if (!rows.every(row => isPlainRecord(row)))
      return false

    // The first element defines the column set every other row must share
    const columns = Object.keys(rows[0] as Record<string, unknown>)
    if (columns.length === 0)
      return false

    return rows.every(row => rowMatchesColumns(row, columns))
  }

  let arraysSeen = 0
  let tabularSeen = 0

  const walk = (node: unknown): void => {
    if (Array.isArray(node)) {
      arraysSeen += 1
      if (looksTabular(node))
        tabularSeen += 1

      // Nested arrays inside the elements are counted as well
      for (const element of node)
        walk(element)
      return
    }

    if (typeof node === 'object' && node !== null) {
      for (const value of Object.values(node))
        walk(value)
    }
  }

  walk(data)

  if (arraysSeen === 0)
    return 0

  return Math.round((tabularSeen / arraysSeen) * 100)
}
/**
* Employee record structure for tabular dataset
*/
@@ -275,7 +214,7 @@ const tabularDataset: Dataset = {
metadata: {
supportsCSV: true,
structureClass: 'uniform',
tabularEligibility: 100,
tabularEligibility: 100, // All arrays contain uniform objects with primitive values only
},
}
@@ -285,38 +224,21 @@ const tabularDataset: Dataset = {
const PRODUCT_NAMES = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp'] as const
const ORDER_STATUSES = ['pending', 'processing', 'shipped', 'delivered', 'cancelled'] as const
const ORDER_CONSTANTS = {
CUSTOMER_ID_MOD: 20,
MIN_ITEMS: 1,
MAX_ITEMS: 4,
MIN_ITEM_PRICE: 9.99,
MAX_ITEM_PRICE: 199.99,
MIN_ITEM_QUANTITY: 1,
MAX_ITEM_QUANTITY: 5,
SKU_LENGTH: 6,
ORDER_ID_PADDING: 4,
RECENT_DAYS: 90,
TAX_RATE: 0.08,
} as const
function generateOrders(count: number): { orders: Order[] } {
return {
orders: Array.from({ length: count }, (_, i) => {
const customerId = (i % ORDER_CONSTANTS.CUSTOMER_ID_MOD) + 1
const itemCount = faker.number.int({ min: ORDER_CONSTANTS.MIN_ITEMS, max: ORDER_CONSTANTS.MAX_ITEMS })
const customerId = (i % 20) + 1 // Rotate through 20 customers
const itemCount = faker.number.int({ min: 1, max: 4 }) // 1-4 items per order
const items = Array.from({ length: itemCount }, (_, j) => {
const price = faker.number.float({
min: ORDER_CONSTANTS.MIN_ITEM_PRICE,
max: ORDER_CONSTANTS.MAX_ITEM_PRICE,
min: 9.99,
max: 199.99,
fractionDigits: 2,
})
const quantity = faker.number.int({
min: ORDER_CONSTANTS.MIN_ITEM_QUANTITY,
max: ORDER_CONSTANTS.MAX_ITEM_QUANTITY,
})
const quantity = faker.number.int({ min: 1, max: 5 })
return {
sku: `SKU-${faker.string.alphanumeric({ length: ORDER_CONSTANTS.SKU_LENGTH }).toUpperCase()}`,
sku: `SKU-${faker.string.alphanumeric({ length: 6 }).toUpperCase()}`,
name: PRODUCT_NAMES[j % PRODUCT_NAMES.length]!,
quantity,
price,
@@ -324,11 +246,11 @@ function generateOrders(count: number): { orders: Order[] } {
})
const subtotal = Number(items.reduce((sum, item) => sum + (item.price * item.quantity), 0).toFixed(2))
const tax = Number((subtotal * ORDER_CONSTANTS.TAX_RATE).toFixed(2))
const tax = Number((subtotal * 0.08).toFixed(2)) // 8% tax rate
const total = Number((subtotal + tax).toFixed(2))
return {
orderId: `ORD-${String(i + 1).padStart(ORDER_CONSTANTS.ORDER_ID_PADDING, '0')}`,
orderId: `ORD-${String(i + 1).padStart(4, '0')}`,
customer: {
id: customerId,
name: faker.person.fullName(),
@@ -340,7 +262,7 @@ function generateOrders(count: number): { orders: Order[] } {
tax,
total,
status: ORDER_STATUSES[i % ORDER_STATUSES.length]!,
orderDate: faker.date.recent({ days: ORDER_CONSTANTS.RECENT_DAYS }).toISOString().split('T')[0],
orderDate: faker.date.recent({ days: 90 }).toISOString().split('T')[0],
}
}),
}
@@ -359,7 +281,7 @@ const nestedDataset: Dataset = {
metadata: {
supportsCSV: false,
structureClass: 'nested',
tabularEligibility: 33, // orders array is not tabular, but items arrays within are
tabularEligibility: 33, // Top-level orders array has nested objects (not tabular), but nested items arrays are tabular
},
}
@@ -376,7 +298,7 @@ const analyticsDataset: Dataset = {
metadata: {
supportsCSV: true,
structureClass: 'uniform',
tabularEligibility: 100,
tabularEligibility: 100, // Uniform time-series records with consistent primitive fields
},
}
@@ -395,7 +317,7 @@ const githubDataset: Dataset = {
metadata: {
supportsCSV: true,
structureClass: 'uniform',
tabularEligibility: 100,
tabularEligibility: 100, // Repository array contains uniform objects with primitive values
},
}
@@ -597,7 +519,7 @@ const eventLogsDataset: Dataset = {
metadata: {
supportsCSV: false,
structureClass: 'semi-uniform',
tabularEligibility: 50, // ~50% of logs have nested error objects
tabularEligibility: 50, // Top-level logs array is tabular, but ~50% have nested optional error objects
},
}
@@ -614,7 +536,7 @@ const nestedConfigDataset: Dataset = {
metadata: {
supportsCSV: false,
structureClass: 'deep',
tabularEligibility: 0, // Highly nested, minimal tabular arrays
tabularEligibility: 0, // Deeply nested configuration with no tabular arrays
},
}
@@ -642,7 +564,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
metadata: {
supportsCSV: true,
structureClass: 'uniform',
tabularEligibility: 100,
tabularEligibility: 100, // All arrays contain uniform objects with primitive values only
},
},
// Nested: 500 orders
@@ -653,7 +575,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
metadata: {
supportsCSV: false,
structureClass: 'nested',
tabularEligibility: 33,
tabularEligibility: 33, // Top-level orders array has nested objects (not tabular), but nested items arrays are tabular
},
},
// Analytics: 365 days
@@ -664,7 +586,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
metadata: {
supportsCSV: true,
structureClass: 'uniform',
tabularEligibility: 100,
tabularEligibility: 100, // Uniform time-series records with consistent primitive fields
},
},
// GitHub: 100 repos (same as accuracy)
@@ -677,7 +599,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
metadata: {
supportsCSV: false,
structureClass: 'semi-uniform',
tabularEligibility: 50,
tabularEligibility: 50, // Top-level logs array is tabular, but ~50% have nested optional error objects
},
},
// Nested config: 1 config (same as accuracy)

View File

@@ -4,7 +4,6 @@ import { anthropic } from '@ai-sdk/anthropic'
import { google } from '@ai-sdk/google'
import { openai } from '@ai-sdk/openai'
import { xai } from '@ai-sdk/xai'
import * as prompts from '@clack/prompts'
import { generateText } from 'ai'
/**
@@ -102,17 +101,10 @@ Is the actual answer correct? Consider:
Respond with only "YES" or "NO".
`.trim()
try {
const { text } = await generateText({
model: models.find(m => m.modelId === 'gpt-5-nano')!,
prompt,
})
const { text } = await generateText({
model: models.find(m => m.modelId === 'gpt-5-nano')!,
prompt,
})
return text.trim().toUpperCase() === 'YES'
}
catch (error) {
prompts.log.error(`Validation error: ${error}`)
// Fallback to simple string comparison
return actual.toLowerCase().trim() === expected.toLowerCase().trim()
}
return text.trim().toUpperCase() === 'YES'
}

View File

@@ -1,7 +1,7 @@
import type { AnalyticsMetric } from '../datasets'
import type { Question } from '../types'
import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
/**
* Generate analytics (website metrics) questions
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: () => string): Question[] {
const questions: Question[] = []
if (metrics.length === 0)
return questions
// Field retrieval: date-based metrics
const metricFieldGenerators: Array<(metric: AnalyticsMetric, getId: () => string) => Question> = [
(metric, getId) => new QuestionBuilder()
@@ -99,7 +96,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
// Aggregation: high views/conversions
for (const threshold of QUESTION_THRESHOLDS.analytics.views) {
const count = countByPredicate(metrics, m => m.views > threshold)
const count = metrics.filter(m => m.views > threshold).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -112,7 +109,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
}
for (const threshold of QUESTION_THRESHOLDS.analytics.conversions) {
const count = countByPredicate(metrics, m => m.conversions > threshold)
const count = metrics.filter(m => m.conversions > threshold).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -126,10 +123,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
// Filtering: multi-condition (views AND revenue)
for (const threshold of QUESTION_THRESHOLDS.analytics.viewsForFiltering) {
const count = countByPredicate(
metrics,
const count = metrics.filter(
m => m.views > threshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForFiltering,
)
).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -143,10 +139,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
// Filtering: revenue thresholds
for (const threshold of QUESTION_THRESHOLDS.analytics.revenueThresholds) {
const count = countByPredicate(
metrics,
const count = metrics.filter(
m => m.revenue > threshold && m.views > QUESTION_THRESHOLDS.analytics.viewsThresholdForRevenue,
)
).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -160,10 +155,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
// Filtering: clicks and conversions
for (const threshold of QUESTION_THRESHOLDS.analytics.clicksForFiltering) {
const count = countByPredicate(
metrics,
const count = metrics.filter(
m => m.clicks > threshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForClickFiltering,
)
).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -177,10 +171,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
// Filtering: revenue and bounce rate
for (const threshold of QUESTION_THRESHOLDS.analytics.revenueForBounceRate) {
const count = countByPredicate(
metrics,
const count = metrics.filter(
m => m.revenue > threshold && m.bounceRate < QUESTION_THRESHOLDS.analytics.bounceRateThreshold,
)
).length
questions.push(
new QuestionBuilder()
.id(getId())

View File

@@ -1,7 +1,7 @@
import type { EventLog } from '../datasets'
import type { Question } from '../types'
import { QUESTION_LIMITS } from '../constants'
import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
/**
* Generate event log questions
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
export function generateEventLogsQuestions(logs: EventLog[], getId: () => string): Question[] {
const questions: Question[] = []
if (logs.length === 0)
return questions
// Field retrieval: log metadata
const logFieldGenerators: Array<(log: EventLog, getId: () => string) => Question> = [
(log, getId) => new QuestionBuilder()
@@ -76,7 +73,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
// Aggregation: by level
const levels = [...new Set(logs.map(l => l.level))]
for (const level of levels) {
const count = countByPredicate(logs, l => l.level === level)
const count = logs.filter(l => l.level === level).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -91,7 +88,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
// Aggregation: by endpoint
const endpoints = [...new Set(logs.map(l => l.endpoint))]
for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.aggregationEndpoints)) {
const count = countByPredicate(logs, l => l.endpoint === endpoint)
const count = logs.filter(l => l.endpoint === endpoint).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -104,8 +101,8 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
}
// Aggregation: by status code range
const errorCount = countByPredicate(logs, l => l.statusCode >= 400)
const successCount = countByPredicate(logs, l => l.statusCode >= 200 && l.statusCode < 300)
const errorCount = logs.filter(l => l.statusCode >= 400).length
const successCount = logs.filter(l => l.statusCode >= 200 && l.statusCode < 300).length
questions.push(
new QuestionBuilder()
@@ -124,12 +121,21 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
.build(),
)
// Aggregation: retryable errors
const retryableErrorCount = logs.filter(l => l.error?.retryable === true).length
questions.push(
new QuestionBuilder()
.id(getId())
.prompt('How many log entries have a retryable error?')
.groundTruth(String(retryableErrorCount))
.type('aggregation')
.dataset('event-logs')
.build(),
)
// Filtering: multi-condition (level AND status)
for (const level of levels.slice(0, QUESTION_LIMITS.eventLogs.filteringLevelAndStatus)) {
const count = countByPredicate(
logs,
l => l.level === level && l.statusCode >= 400,
)
const count = logs.filter(l => l.level === level && l.statusCode >= 400).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -143,10 +149,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
// Filtering: endpoint AND status
for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointAndStatus)) {
const count = countByPredicate(
logs,
l => l.endpoint === endpoint && l.statusCode >= 500,
)
const count = logs.filter(l => l.endpoint === endpoint && l.statusCode >= 500).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -158,5 +161,19 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
)
}
// Filtering: endpoint AND retryable error
for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointAndStatus)) {
const count = logs.filter(l => l.endpoint === endpoint && l.error?.retryable === true).length
questions.push(
new QuestionBuilder()
.id(getId())
.prompt(`How many log entries for endpoint "${endpoint}" have a retryable error?`)
.groundTruth(String(count))
.type('filtering')
.dataset('event-logs')
.build(),
)
}
return questions
}

View File

@@ -1,7 +1,7 @@
import type { Repository } from '../datasets'
import type { Question } from '../types'
import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
/**
* Generate GitHub repository questions
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
export function generateGithubQuestions(repos: Repository[], getId: () => string): Question[] {
const questions: Question[] = []
if (repos.length === 0)
return questions
// Field retrieval: repository metadata
const repoFieldGenerators: Array<(repo: Repository, getId: () => string) => Question> = [
(repo, getId) => new QuestionBuilder()
@@ -92,7 +89,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
// Aggregation: by default branch
const branches = [...new Set(repos.map(r => r.defaultBranch))]
for (const branch of branches.slice(0, QUESTION_LIMITS.github.aggregationBranches)) {
const count = countByPredicate(repos, r => r.defaultBranch === branch)
const count = repos.filter(r => r.defaultBranch === branch).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -106,7 +103,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
// Aggregation: high star counts
for (const threshold of QUESTION_THRESHOLDS.github.stars) {
const count = countByPredicate(repos, r => r.stars > threshold)
const count = repos.filter(r => r.stars > threshold).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -120,7 +117,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
// Aggregation: high fork counts
for (const threshold of QUESTION_THRESHOLDS.github.forks) {
const count = countByPredicate(repos, r => r.forks > threshold)
const count = repos.filter(r => r.forks > threshold).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -134,7 +131,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
// Aggregation: high watcher counts
for (const threshold of QUESTION_THRESHOLDS.github.watchers) {
const count = countByPredicate(repos, r => r.watchers > threshold)
const count = repos.filter(r => r.watchers > threshold).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -148,10 +145,9 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
// Filtering: multi-condition (stars AND forks)
for (const combo of QUESTION_THRESHOLDS.github.starForkCombinations.slice(0, QUESTION_LIMITS.github.filteringStarsAndForks)) {
const count = countByPredicate(
repos,
const count = repos.filter(
r => r.stars > combo.stars && r.forks > combo.forks,
)
).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -165,10 +161,9 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
// Filtering: stars AND watchers
for (const combo of QUESTION_THRESHOLDS.github.starWatcherCombinations) {
const count = countByPredicate(
repos,
const count = repos.filter(
r => r.stars > combo.stars && r.watchers > combo.watchers,
)
).length
questions.push(
new QuestionBuilder()
.id(getId())

View File

@@ -10,10 +10,9 @@ import { generateTabularQuestions } from './tabular'
import { createIdGenerator } from './utils'
/**
* Generate all questions from datasets
* Generate ~200 questions from all datasets
*
* @remarks
* Generates ~150-160 questions across different question types and datasets:
* - Field Retrieval: Direct field access with no computation
* Examples: "What is X's salary?", "What is the status of order Y?"
* - Aggregation: Counts, sums, averages, min/max operations (including single-condition filters)

View File

@@ -34,6 +34,26 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
prompt: 'What is the session duration?',
groundTruth: String(config.authentication.session.duration),
},
{
prompt: 'What is the minimum connection pool size?',
groundTruth: String(config.database.pool.min),
},
{
prompt: 'What is the connection pool idle timeout?',
groundTruth: String(config.database.pool.idleTimeout),
},
{
prompt: 'What is the database name?',
groundTruth: config.database.name,
},
{
prompt: 'What is the session refresh threshold?',
groundTruth: String(config.authentication.session.refreshThreshold),
},
{
prompt: 'What is the version in the configuration?',
groundTruth: config.version,
},
]
for (const q of fieldRetrievalQuestions.slice(0, QUESTION_LIMITS.nestedConfig.fieldRetrieval)) {
@@ -93,6 +113,18 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
.build(),
)
// Aggregation: providers with admin scope
const adminScopeProviderCount = config.authentication.providers.filter(p => p.scopes.includes('admin')).length
questions.push(
new QuestionBuilder()
.id(getId())
.prompt('How many authentication providers include the "admin" scope?')
.groundTruth(String(adminScopeProviderCount))
.type('aggregation')
.dataset('nested-config')
.build(),
)
// Aggregation: feature flag details
const enabledFeatures = Object.entries(config.features).filter(([_, f]) => f.enabled).length
questions.push(
@@ -117,6 +149,67 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
.build(),
)
// Aggregation: additional nested counts
const totalPermissions = Object.values(config.permissions.roles).reduce((sum, role) => sum + role.permissions.length, 0)
const distinctPermissions = new Set(Object.values(config.permissions.roles).flatMap(r => r.permissions)).size
const distinctScopes = new Set(config.authentication.providers.flatMap(p => p.scopes)).size
const totalVariants = Object.values(config.features).reduce((sum, f) => sum + f.variants.length, 0)
const highPriorityReplicas = config.database.replicas.filter(r => r.priority > 2).length
const featuresWithHighRollout = Object.values(config.features).filter(f => f.rollout > 50).length
const groupsWithMultipleRoles = Object.values(config.permissions.groups).filter(g => g.roles.length > 1).length
questions.push(
new QuestionBuilder()
.id(getId())
.prompt('What is the total number of permissions across all roles?')
.groundTruth(String(totalPermissions))
.type('aggregation')
.dataset('nested-config')
.build(),
new QuestionBuilder()
.id(getId())
.prompt('How many distinct permissions are defined across all roles?')
.groundTruth(String(distinctPermissions))
.type('aggregation')
.dataset('nested-config')
.build(),
new QuestionBuilder()
.id(getId())
.prompt('How many distinct scopes are defined across all authentication providers?')
.groundTruth(String(distinctScopes))
.type('aggregation')
.dataset('nested-config')
.build(),
new QuestionBuilder()
.id(getId())
.prompt('What is the total number of variants across all feature flags?')
.groundTruth(String(totalVariants))
.type('aggregation')
.dataset('nested-config')
.build(),
new QuestionBuilder()
.id(getId())
.prompt('How many database replicas have a priority greater than 2?')
.groundTruth(String(highPriorityReplicas))
.type('aggregation')
.dataset('nested-config')
.build(),
new QuestionBuilder()
.id(getId())
.prompt('How many feature flags have a rollout percentage greater than 50?')
.groundTruth(String(featuresWithHighRollout))
.type('aggregation')
.dataset('nested-config')
.build(),
new QuestionBuilder()
.id(getId())
.prompt('How many groups have more than one role assigned?')
.groundTruth(String(groupsWithMultipleRoles))
.type('aggregation')
.dataset('nested-config')
.build(),
)
// Filtering: complex multi-condition queries
const filteringQuestions = [
{
@@ -129,6 +222,31 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
groundTruth: String(Object.entries(config.permissions.groups)
.filter(([_, g]) => g.roles.includes('admin')).length),
},
{
prompt: 'How many database replicas have priority greater than 2 and port 5432?',
groundTruth: String(config.database.replicas
.filter(r => r.priority > 2 && r.port === 5432).length),
},
{
prompt: 'How many authentication providers have more than 2 scopes?',
groundTruth: String(config.authentication.providers
.filter(p => p.scopes.length > 2).length),
},
{
prompt: 'How many roles have at least 5 permissions?',
groundTruth: String(Object.values(config.permissions.roles)
.filter(r => r.permissions.length >= 5).length),
},
{
prompt: 'How many feature flags are disabled with rollout less than 25%?',
groundTruth: String(Object.values(config.features)
.filter(f => !f.enabled && f.rollout < 25).length),
},
{
prompt: 'How many enabled features have at least 2 variants?',
groundTruth: String(Object.values(config.features)
.filter(f => f.enabled && f.variants.length >= 2).length),
},
]
for (const q of filteringQuestions.slice(0, QUESTION_LIMITS.nestedConfig.filteringComplex)) {

View File

@@ -1,7 +1,7 @@
import type { Order } from '../datasets'
import type { Question } from '../types'
import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
/**
* Generate nested (orders) questions
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
export function generateNestedQuestions(orders: Order[], getId: () => string): Question[] {
const questions: Question[] = []
if (orders.length === 0)
return questions
// Field retrieval: order totals and statuses
const orderFieldGenerators: Array<(order: Order, getId: () => string) => Question> = [
(order, getId) => new QuestionBuilder()
@@ -89,7 +86,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
// Count by status
const statuses = [...new Set(orders.map(o => o.status))]
for (const status of statuses.slice(0, QUESTION_LIMITS.nested.aggregationStatuses)) {
const count = countByPredicate(orders, o => o.status === status)
const count = orders.filter(o => o.status === status).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -134,7 +131,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
// Aggregation: high-value orders (single-condition filter)
for (const threshold of QUESTION_THRESHOLDS.nested.highValueOrders) {
const count = countByPredicate(orders, o => o.total > threshold)
const count = orders.filter(o => o.total > threshold).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -149,10 +146,9 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
// Filtering: multi-condition queries (status AND value)
const orderStatuses = [...new Set(orders.map(o => o.status))]
for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndValue)) {
const count = countByPredicate(
orders,
const count = orders.filter(
o => o.status === status && o.total > QUESTION_THRESHOLDS.nested.statusValueThreshold,
)
).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -166,10 +162,9 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
// Filtering: status AND items count (multi-condition)
for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndItems)) {
const count = countByPredicate(
orders,
const count = orders.filter(
o => o.status === status && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold,
)
).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -183,10 +178,9 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
// Filtering: total AND items count (multi-condition)
for (const threshold of QUESTION_THRESHOLDS.nested.totalThresholdsForItems) {
const count = countByPredicate(
orders,
const count = orders.filter(
o => o.total > threshold && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold,
)
).length
questions.push(
new QuestionBuilder()
.id(getId())

View File

@@ -1,7 +1,7 @@
import type { Employee } from '../datasets'
import type { Question } from '../types'
import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
/**
* Generate tabular (employee) questions
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
export function generateTabularQuestions(employees: Employee[], getId: () => string): Question[] {
const questions: Question[] = []
if (employees.length === 0)
return questions
// Field retrieval: specific employees
const fieldGenerators: Array<(emp: Employee, getId: () => string) => Question> = [
(emp, getId) => new QuestionBuilder()
@@ -62,7 +59,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
// Aggregation: count by department
const departments = [...new Set(employees.map(e => e.department))]
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.aggregationDepartments)) {
const count = countByPredicate(employees, e => e.department === dept)
const count = employees.filter(e => e.department === dept).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -76,7 +73,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
// Aggregation: salary ranges (single-condition filters)
for (const threshold of QUESTION_THRESHOLDS.tabular.salaryRanges) {
const count = countByPredicate(employees, e => e.salary > threshold)
const count = employees.filter(e => e.salary > threshold).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -91,8 +88,8 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
// Aggregation: totals and averages
const totalEmployees = employees.length
const avgSalary = Math.round(employees.reduce((sum, e) => sum + e.salary, 0) / totalEmployees)
const activeCount = countByPredicate(employees, e => e.active)
const inactiveCount = countByPredicate(employees, e => !e.active)
const activeCount = employees.filter(e => e.active).length
const inactiveCount = employees.filter(e => !e.active).length
questions.push(
new QuestionBuilder()
@@ -127,10 +124,9 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
// Filtering: count by department with salary filter (multi-condition)
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringMultiConditionDepartments)) {
const count = countByPredicate(
employees,
const count = employees.filter(
e => e.department === dept && e.salary > QUESTION_THRESHOLDS.tabular.departmentSalaryThreshold,
)
).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -144,7 +140,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
// Filtering: active employees by experience (multi-condition)
for (const exp of QUESTION_THRESHOLDS.tabular.experienceYears.slice(0, QUESTION_LIMITS.tabular.filteringExperience)) {
const count = countByPredicate(employees, e => e.yearsExperience > exp && e.active)
const count = employees.filter(e => e.yearsExperience > exp && e.active).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -158,10 +154,9 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
// Filtering: department by experience (multi-condition)
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentExp)) {
const count = countByPredicate(
employees,
const count = employees.filter(
e => e.department === dept && e.yearsExperience > QUESTION_THRESHOLDS.tabular.departmentExperienceThreshold,
)
).length
questions.push(
new QuestionBuilder()
.id(getId())
@@ -175,7 +170,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
// Filtering: department by active status (multi-condition)
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentActive)) {
const count = countByPredicate(employees, e => e.department === dept && e.active)
const count = employees.filter(e => e.department === dept && e.active).length
questions.push(
new QuestionBuilder()
.id(getId())

View File

@@ -61,14 +61,7 @@ export class QuestionBuilder {
}
/**
 * Count the elements of `items` that `predicate` accepts.
 *
 * @param items - Elements to test
 * @param predicate - Called with each element; a truthy result counts the element
 * @returns Number of elements accepted by `predicate`
 */
export function countByPredicate<T>(items: T[], predicate: (item: T) => boolean): number {
  let matches = 0

  items.forEach((item) => {
    if (predicate(item))
      matches += 1
  })

  return matches
}
/**
* Helper: Rotate through question generators
* Rotate through question generators
*/
export function rotateQuestions<T>(
items: T[],

View File

@@ -15,7 +15,7 @@ export interface Question {
id: string
prompt: string
groundTruth: string
type: 'field-retrieval' | 'aggregation' | 'filtering' | 'comparison'
type: 'field-retrieval' | 'aggregation' | 'filtering'
dataset: string
}