mirror of
https://github.com/voson-wang/toon.git
synced 2026-01-29 15:24:10 +08:00
chore: more work on benchmarks
This commit is contained in:
@@ -34,7 +34,7 @@ Results are saved to `results/token-efficiency.md`.
|
|||||||
|
|
||||||
Tests how well LLMs can answer questions about data in different formats (TOON, JSON, JSON compact, XML, YAML, CSV):
|
Tests how well LLMs can answer questions about data in different formats (TOON, JSON, JSON compact, XML, YAML, CSV):
|
||||||
|
|
||||||
1. Generate ~150-160 questions across 6 datasets (CSV only included for datasets with flat/tabular structure)
|
1. Generate ~200 questions across 6 datasets (CSV only included for datasets with flat/tabular structure)
|
||||||
2. Convert each dataset to all supported formats
|
2. Convert each dataset to all supported formats
|
||||||
3. Query each LLM with formatted data + question
|
3. Query each LLM with formatted data + question
|
||||||
4. Validate answers using `gpt-5-nano` as judge
|
4. Validate answers using `gpt-5-nano` as judge
|
||||||
|
|||||||
1416
benchmarks/questions-generated.json
Normal file
1416
benchmarks/questions-generated.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -100,7 +100,7 @@ function generateTotalLines(
|
|||||||
const csvStr = baselineFormat.tokens.toLocaleString('en-US').padStart(TOKEN_PADDING)
|
const csvStr = baselineFormat.tokens.toLocaleString('en-US').padStart(TOKEN_PADDING)
|
||||||
lines.push(`csv ${csvBar} ${csvStr} tokens`)
|
lines.push(`csv ${csvBar} ${csvStr} tokens`)
|
||||||
|
|
||||||
const overheadPercent = ((totalToonTokens - baselineFormat.tokens) / totalToonTokens) * 100
|
const overheadPercent = ((totalToonTokens - baselineFormat.tokens) / baselineFormat.tokens) * 100
|
||||||
const toonBar = createProgressBar(100, 100, PROGRESS_BAR_WIDTH, PROGRESS_BAR_CONFIG)
|
const toonBar = createProgressBar(100, 100, PROGRESS_BAR_WIDTH, PROGRESS_BAR_CONFIG)
|
||||||
const toonStr = totalToonTokens.toLocaleString('en-US').padStart(TOKEN_PADDING)
|
const toonStr = totalToonTokens.toLocaleString('en-US').padStart(TOKEN_PADDING)
|
||||||
lines.push(`toon ${toonBar} ${toonStr} tokens (+${overheadPercent.toFixed(1)}% vs CSV)`)
|
lines.push(`toon ${toonBar} ${toonStr} tokens (+${overheadPercent.toFixed(1)}% vs CSV)`)
|
||||||
@@ -223,7 +223,7 @@ const flatCharts = flatOnlyDatasets
|
|||||||
|
|
||||||
// TOON line with overhead vs CSV
|
// TOON line with overhead vs CSV
|
||||||
const toonOverhead = toon.tokens - csv.tokens
|
const toonOverhead = toon.tokens - csv.tokens
|
||||||
const toonOverheadPercent = (toonOverhead / toon.tokens) * 100
|
const toonOverheadPercent = (toonOverhead / csv.tokens) * 100
|
||||||
const toonBar = createProgressBar(100, 100, PROGRESS_BAR_WIDTH, PROGRESS_BAR_CONFIG)
|
const toonBar = createProgressBar(100, 100, PROGRESS_BAR_WIDTH, PROGRESS_BAR_CONFIG)
|
||||||
const toonStr = toon.tokens.toLocaleString('en-US')
|
const toonStr = toon.tokens.toLocaleString('en-US')
|
||||||
const toonVsCSV = toonOverheadPercent >= 0
|
const toonVsCSV = toonOverheadPercent >= 0
|
||||||
|
|||||||
@@ -101,10 +101,10 @@ export const QUESTION_THRESHOLDS = {
|
|||||||
*/
|
*/
|
||||||
export const QUESTION_LIMITS = {
|
export const QUESTION_LIMITS = {
|
||||||
tabular: {
|
tabular: {
|
||||||
fieldRetrieval: 20,
|
fieldRetrieval: 14,
|
||||||
aggregationDepartments: 6,
|
aggregationDepartments: 4,
|
||||||
filteringMultiConditionDepartments: 6,
|
filteringMultiConditionDepartments: 5,
|
||||||
filteringExperience: 4,
|
filteringExperience: 3,
|
||||||
filteringDepartmentExp: 3,
|
filteringDepartmentExp: 3,
|
||||||
filteringDepartmentActive: 3,
|
filteringDepartmentActive: 3,
|
||||||
},
|
},
|
||||||
@@ -116,7 +116,7 @@ export const QUESTION_LIMITS = {
|
|||||||
filteringStatusAndItems: 3,
|
filteringStatusAndItems: 3,
|
||||||
},
|
},
|
||||||
analytics: {
|
analytics: {
|
||||||
fieldRetrievalDates: 13,
|
fieldRetrievalDates: 9,
|
||||||
},
|
},
|
||||||
github: {
|
github: {
|
||||||
fieldRetrievalRepos: 11,
|
fieldRetrievalRepos: 11,
|
||||||
@@ -125,12 +125,12 @@ export const QUESTION_LIMITS = {
|
|||||||
},
|
},
|
||||||
eventLogs: {
|
eventLogs: {
|
||||||
fieldRetrieval: 10,
|
fieldRetrieval: 10,
|
||||||
aggregationEndpoints: 3,
|
aggregationEndpoints: 4,
|
||||||
filteringLevelAndStatus: 2,
|
filteringLevelAndStatus: 3,
|
||||||
filteringEndpointAndStatus: 2,
|
filteringEndpointAndStatus: 3,
|
||||||
},
|
},
|
||||||
nestedConfig: {
|
nestedConfig: {
|
||||||
fieldRetrieval: 5,
|
fieldRetrieval: 10,
|
||||||
filteringComplex: 2,
|
filteringComplex: 6,
|
||||||
},
|
},
|
||||||
} as const
|
} as const
|
||||||
|
|||||||
@@ -5,67 +5,6 @@ import githubRepos from '../data/github-repos.json' with { type: 'json' }
|
|||||||
// Seed for reproducibility
|
// Seed for reproducibility
|
||||||
faker.seed(12345)
|
faker.seed(12345)
|
||||||
|
|
||||||
/**
|
|
||||||
* Calculate the tabular eligibility percentage of a data structure
|
|
||||||
*
|
|
||||||
* @remarks
|
|
||||||
* Recursively analyzes data to determine what percentage of arrays qualify
|
|
||||||
* for TOON's tabular format (uniform objects with primitive values only).
|
|
||||||
*/
|
|
||||||
export function calculateTabularEligibility(data: unknown): number {
|
|
||||||
let totalArrays = 0
|
|
||||||
let tabularArrays = 0
|
|
||||||
|
|
||||||
function isTabularArray(arr: unknown[]): boolean {
|
|
||||||
if (arr.length === 0)
|
|
||||||
return false
|
|
||||||
|
|
||||||
// Check if all elements are objects
|
|
||||||
if (!arr.every(item => typeof item === 'object' && item !== null && !Array.isArray(item)))
|
|
||||||
return false
|
|
||||||
|
|
||||||
// Get keys from first object
|
|
||||||
const firstKeys = Object.keys(arr[0] as Record<string, unknown>)
|
|
||||||
if (firstKeys.length === 0)
|
|
||||||
return false
|
|
||||||
|
|
||||||
// Check if all objects have the same keys and only primitive values
|
|
||||||
return arr.every((item) => {
|
|
||||||
const itemObj = item as Record<string, unknown>
|
|
||||||
const itemKeys = Object.keys(itemObj)
|
|
||||||
if (itemKeys.length !== firstKeys.length)
|
|
||||||
return false
|
|
||||||
if (!firstKeys.every(key => itemKeys.includes(key)))
|
|
||||||
return false
|
|
||||||
|
|
||||||
// Check if all values are primitives (no nested objects or arrays)
|
|
||||||
return firstKeys.every((key) => {
|
|
||||||
const value = itemObj[key]
|
|
||||||
return value === null || ['string', 'number', 'boolean'].includes(typeof value)
|
|
||||||
})
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
function traverse(obj: unknown): void {
|
|
||||||
if (Array.isArray(obj)) {
|
|
||||||
totalArrays++
|
|
||||||
if (isTabularArray(obj))
|
|
||||||
tabularArrays++
|
|
||||||
|
|
||||||
// Continue traversing array elements
|
|
||||||
obj.forEach(item => traverse(item))
|
|
||||||
}
|
|
||||||
else if (typeof obj === 'object' && obj !== null) {
|
|
||||||
// Traverse object properties
|
|
||||||
Object.values(obj).forEach(value => traverse(value))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
traverse(data)
|
|
||||||
|
|
||||||
return totalArrays === 0 ? 0 : Math.round((tabularArrays / totalArrays) * 100)
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Employee record structure for tabular dataset
|
* Employee record structure for tabular dataset
|
||||||
*/
|
*/
|
||||||
@@ -275,7 +214,7 @@ const tabularDataset: Dataset = {
|
|||||||
metadata: {
|
metadata: {
|
||||||
supportsCSV: true,
|
supportsCSV: true,
|
||||||
structureClass: 'uniform',
|
structureClass: 'uniform',
|
||||||
tabularEligibility: 100,
|
tabularEligibility: 100, // All arrays contain uniform objects with primitive values only
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -285,38 +224,21 @@ const tabularDataset: Dataset = {
|
|||||||
const PRODUCT_NAMES = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp'] as const
|
const PRODUCT_NAMES = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp'] as const
|
||||||
const ORDER_STATUSES = ['pending', 'processing', 'shipped', 'delivered', 'cancelled'] as const
|
const ORDER_STATUSES = ['pending', 'processing', 'shipped', 'delivered', 'cancelled'] as const
|
||||||
|
|
||||||
const ORDER_CONSTANTS = {
|
|
||||||
CUSTOMER_ID_MOD: 20,
|
|
||||||
MIN_ITEMS: 1,
|
|
||||||
MAX_ITEMS: 4,
|
|
||||||
MIN_ITEM_PRICE: 9.99,
|
|
||||||
MAX_ITEM_PRICE: 199.99,
|
|
||||||
MIN_ITEM_QUANTITY: 1,
|
|
||||||
MAX_ITEM_QUANTITY: 5,
|
|
||||||
SKU_LENGTH: 6,
|
|
||||||
ORDER_ID_PADDING: 4,
|
|
||||||
RECENT_DAYS: 90,
|
|
||||||
TAX_RATE: 0.08,
|
|
||||||
} as const
|
|
||||||
|
|
||||||
function generateOrders(count: number): { orders: Order[] } {
|
function generateOrders(count: number): { orders: Order[] } {
|
||||||
return {
|
return {
|
||||||
orders: Array.from({ length: count }, (_, i) => {
|
orders: Array.from({ length: count }, (_, i) => {
|
||||||
const customerId = (i % ORDER_CONSTANTS.CUSTOMER_ID_MOD) + 1
|
const customerId = (i % 20) + 1 // Rotate through 20 customers
|
||||||
const itemCount = faker.number.int({ min: ORDER_CONSTANTS.MIN_ITEMS, max: ORDER_CONSTANTS.MAX_ITEMS })
|
const itemCount = faker.number.int({ min: 1, max: 4 }) // 1-4 items per order
|
||||||
|
|
||||||
const items = Array.from({ length: itemCount }, (_, j) => {
|
const items = Array.from({ length: itemCount }, (_, j) => {
|
||||||
const price = faker.number.float({
|
const price = faker.number.float({
|
||||||
min: ORDER_CONSTANTS.MIN_ITEM_PRICE,
|
min: 9.99,
|
||||||
max: ORDER_CONSTANTS.MAX_ITEM_PRICE,
|
max: 199.99,
|
||||||
fractionDigits: 2,
|
fractionDigits: 2,
|
||||||
})
|
})
|
||||||
const quantity = faker.number.int({
|
const quantity = faker.number.int({ min: 1, max: 5 })
|
||||||
min: ORDER_CONSTANTS.MIN_ITEM_QUANTITY,
|
|
||||||
max: ORDER_CONSTANTS.MAX_ITEM_QUANTITY,
|
|
||||||
})
|
|
||||||
return {
|
return {
|
||||||
sku: `SKU-${faker.string.alphanumeric({ length: ORDER_CONSTANTS.SKU_LENGTH }).toUpperCase()}`,
|
sku: `SKU-${faker.string.alphanumeric({ length: 6 }).toUpperCase()}`,
|
||||||
name: PRODUCT_NAMES[j % PRODUCT_NAMES.length]!,
|
name: PRODUCT_NAMES[j % PRODUCT_NAMES.length]!,
|
||||||
quantity,
|
quantity,
|
||||||
price,
|
price,
|
||||||
@@ -324,11 +246,11 @@ function generateOrders(count: number): { orders: Order[] } {
|
|||||||
})
|
})
|
||||||
|
|
||||||
const subtotal = Number(items.reduce((sum, item) => sum + (item.price * item.quantity), 0).toFixed(2))
|
const subtotal = Number(items.reduce((sum, item) => sum + (item.price * item.quantity), 0).toFixed(2))
|
||||||
const tax = Number((subtotal * ORDER_CONSTANTS.TAX_RATE).toFixed(2))
|
const tax = Number((subtotal * 0.08).toFixed(2)) // 8% tax rate
|
||||||
const total = Number((subtotal + tax).toFixed(2))
|
const total = Number((subtotal + tax).toFixed(2))
|
||||||
|
|
||||||
return {
|
return {
|
||||||
orderId: `ORD-${String(i + 1).padStart(ORDER_CONSTANTS.ORDER_ID_PADDING, '0')}`,
|
orderId: `ORD-${String(i + 1).padStart(4, '0')}`,
|
||||||
customer: {
|
customer: {
|
||||||
id: customerId,
|
id: customerId,
|
||||||
name: faker.person.fullName(),
|
name: faker.person.fullName(),
|
||||||
@@ -340,7 +262,7 @@ function generateOrders(count: number): { orders: Order[] } {
|
|||||||
tax,
|
tax,
|
||||||
total,
|
total,
|
||||||
status: ORDER_STATUSES[i % ORDER_STATUSES.length]!,
|
status: ORDER_STATUSES[i % ORDER_STATUSES.length]!,
|
||||||
orderDate: faker.date.recent({ days: ORDER_CONSTANTS.RECENT_DAYS }).toISOString().split('T')[0],
|
orderDate: faker.date.recent({ days: 90 }).toISOString().split('T')[0],
|
||||||
}
|
}
|
||||||
}),
|
}),
|
||||||
}
|
}
|
||||||
@@ -359,7 +281,7 @@ const nestedDataset: Dataset = {
|
|||||||
metadata: {
|
metadata: {
|
||||||
supportsCSV: false,
|
supportsCSV: false,
|
||||||
structureClass: 'nested',
|
structureClass: 'nested',
|
||||||
tabularEligibility: 33, // orders array is not tabular, but items arrays within are
|
tabularEligibility: 33, // Top-level orders array has nested objects (not tabular), but nested items arrays are tabular
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -376,7 +298,7 @@ const analyticsDataset: Dataset = {
|
|||||||
metadata: {
|
metadata: {
|
||||||
supportsCSV: true,
|
supportsCSV: true,
|
||||||
structureClass: 'uniform',
|
structureClass: 'uniform',
|
||||||
tabularEligibility: 100,
|
tabularEligibility: 100, // Uniform time-series records with consistent primitive fields
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -395,7 +317,7 @@ const githubDataset: Dataset = {
|
|||||||
metadata: {
|
metadata: {
|
||||||
supportsCSV: true,
|
supportsCSV: true,
|
||||||
structureClass: 'uniform',
|
structureClass: 'uniform',
|
||||||
tabularEligibility: 100,
|
tabularEligibility: 100, // Repository array contains uniform objects with primitive values
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -597,7 +519,7 @@ const eventLogsDataset: Dataset = {
|
|||||||
metadata: {
|
metadata: {
|
||||||
supportsCSV: false,
|
supportsCSV: false,
|
||||||
structureClass: 'semi-uniform',
|
structureClass: 'semi-uniform',
|
||||||
tabularEligibility: 50, // ~50% of logs have nested error objects
|
tabularEligibility: 50, // Top-level logs array is tabular, but ~50% have nested optional error objects
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -614,7 +536,7 @@ const nestedConfigDataset: Dataset = {
|
|||||||
metadata: {
|
metadata: {
|
||||||
supportsCSV: false,
|
supportsCSV: false,
|
||||||
structureClass: 'deep',
|
structureClass: 'deep',
|
||||||
tabularEligibility: 0, // Highly nested, minimal tabular arrays
|
tabularEligibility: 0, // Deeply nested configuration with no tabular arrays
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -642,7 +564,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
|
|||||||
metadata: {
|
metadata: {
|
||||||
supportsCSV: true,
|
supportsCSV: true,
|
||||||
structureClass: 'uniform',
|
structureClass: 'uniform',
|
||||||
tabularEligibility: 100,
|
tabularEligibility: 100, // All arrays contain uniform objects with primitive values only
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
// Nested: 500 orders
|
// Nested: 500 orders
|
||||||
@@ -653,7 +575,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
|
|||||||
metadata: {
|
metadata: {
|
||||||
supportsCSV: false,
|
supportsCSV: false,
|
||||||
structureClass: 'nested',
|
structureClass: 'nested',
|
||||||
tabularEligibility: 33,
|
tabularEligibility: 33, // Top-level orders array has nested objects (not tabular), but nested items arrays are tabular
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
// Analytics: 365 days
|
// Analytics: 365 days
|
||||||
@@ -664,7 +586,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
|
|||||||
metadata: {
|
metadata: {
|
||||||
supportsCSV: true,
|
supportsCSV: true,
|
||||||
structureClass: 'uniform',
|
structureClass: 'uniform',
|
||||||
tabularEligibility: 100,
|
tabularEligibility: 100, // Uniform time-series records with consistent primitive fields
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
// GitHub: 100 repos (same as accuracy)
|
// GitHub: 100 repos (same as accuracy)
|
||||||
@@ -677,7 +599,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
|
|||||||
metadata: {
|
metadata: {
|
||||||
supportsCSV: false,
|
supportsCSV: false,
|
||||||
structureClass: 'semi-uniform',
|
structureClass: 'semi-uniform',
|
||||||
tabularEligibility: 50,
|
tabularEligibility: 50, // Top-level logs array is tabular, but ~50% have nested optional error objects
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
// Nested config: 1 config (same as accuracy)
|
// Nested config: 1 config (same as accuracy)
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ import { anthropic } from '@ai-sdk/anthropic'
|
|||||||
import { google } from '@ai-sdk/google'
|
import { google } from '@ai-sdk/google'
|
||||||
import { openai } from '@ai-sdk/openai'
|
import { openai } from '@ai-sdk/openai'
|
||||||
import { xai } from '@ai-sdk/xai'
|
import { xai } from '@ai-sdk/xai'
|
||||||
import * as prompts from '@clack/prompts'
|
|
||||||
import { generateText } from 'ai'
|
import { generateText } from 'ai'
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -102,17 +101,10 @@ Is the actual answer correct? Consider:
|
|||||||
Respond with only "YES" or "NO".
|
Respond with only "YES" or "NO".
|
||||||
`.trim()
|
`.trim()
|
||||||
|
|
||||||
try {
|
const { text } = await generateText({
|
||||||
const { text } = await generateText({
|
model: models.find(m => m.modelId === 'gpt-5-nano')!,
|
||||||
model: models.find(m => m.modelId === 'gpt-5-nano')!,
|
prompt,
|
||||||
prompt,
|
})
|
||||||
})
|
|
||||||
|
|
||||||
return text.trim().toUpperCase() === 'YES'
|
return text.trim().toUpperCase() === 'YES'
|
||||||
}
|
|
||||||
catch (error) {
|
|
||||||
prompts.log.error(`Validation error: ${error}`)
|
|
||||||
// Fallback to simple string comparison
|
|
||||||
return actual.toLowerCase().trim() === expected.toLowerCase().trim()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import type { AnalyticsMetric } from '../datasets'
|
import type { AnalyticsMetric } from '../datasets'
|
||||||
import type { Question } from '../types'
|
import type { Question } from '../types'
|
||||||
import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
|
import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
|
||||||
import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
|
import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate analytics (website metrics) questions
|
* Generate analytics (website metrics) questions
|
||||||
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
|
|||||||
export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: () => string): Question[] {
|
export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: () => string): Question[] {
|
||||||
const questions: Question[] = []
|
const questions: Question[] = []
|
||||||
|
|
||||||
if (metrics.length === 0)
|
|
||||||
return questions
|
|
||||||
|
|
||||||
// Field retrieval: date-based metrics
|
// Field retrieval: date-based metrics
|
||||||
const metricFieldGenerators: Array<(metric: AnalyticsMetric, getId: () => string) => Question> = [
|
const metricFieldGenerators: Array<(metric: AnalyticsMetric, getId: () => string) => Question> = [
|
||||||
(metric, getId) => new QuestionBuilder()
|
(metric, getId) => new QuestionBuilder()
|
||||||
@@ -99,7 +96,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
|
|
||||||
// Aggregation: high views/conversions
|
// Aggregation: high views/conversions
|
||||||
for (const threshold of QUESTION_THRESHOLDS.analytics.views) {
|
for (const threshold of QUESTION_THRESHOLDS.analytics.views) {
|
||||||
const count = countByPredicate(metrics, m => m.views > threshold)
|
const count = metrics.filter(m => m.views > threshold).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -112,7 +109,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (const threshold of QUESTION_THRESHOLDS.analytics.conversions) {
|
for (const threshold of QUESTION_THRESHOLDS.analytics.conversions) {
|
||||||
const count = countByPredicate(metrics, m => m.conversions > threshold)
|
const count = metrics.filter(m => m.conversions > threshold).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -126,10 +123,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
|
|
||||||
// Filtering: multi-condition (views AND revenue)
|
// Filtering: multi-condition (views AND revenue)
|
||||||
for (const threshold of QUESTION_THRESHOLDS.analytics.viewsForFiltering) {
|
for (const threshold of QUESTION_THRESHOLDS.analytics.viewsForFiltering) {
|
||||||
const count = countByPredicate(
|
const count = metrics.filter(
|
||||||
metrics,
|
|
||||||
m => m.views > threshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForFiltering,
|
m => m.views > threshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForFiltering,
|
||||||
)
|
).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -143,10 +139,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
|
|
||||||
// Filtering: revenue thresholds
|
// Filtering: revenue thresholds
|
||||||
for (const threshold of QUESTION_THRESHOLDS.analytics.revenueThresholds) {
|
for (const threshold of QUESTION_THRESHOLDS.analytics.revenueThresholds) {
|
||||||
const count = countByPredicate(
|
const count = metrics.filter(
|
||||||
metrics,
|
|
||||||
m => m.revenue > threshold && m.views > QUESTION_THRESHOLDS.analytics.viewsThresholdForRevenue,
|
m => m.revenue > threshold && m.views > QUESTION_THRESHOLDS.analytics.viewsThresholdForRevenue,
|
||||||
)
|
).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -160,10 +155,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
|
|
||||||
// Filtering: clicks and conversions
|
// Filtering: clicks and conversions
|
||||||
for (const threshold of QUESTION_THRESHOLDS.analytics.clicksForFiltering) {
|
for (const threshold of QUESTION_THRESHOLDS.analytics.clicksForFiltering) {
|
||||||
const count = countByPredicate(
|
const count = metrics.filter(
|
||||||
metrics,
|
|
||||||
m => m.clicks > threshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForClickFiltering,
|
m => m.clicks > threshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForClickFiltering,
|
||||||
)
|
).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -177,10 +171,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
|
|||||||
|
|
||||||
// Filtering: revenue and bounce rate
|
// Filtering: revenue and bounce rate
|
||||||
for (const threshold of QUESTION_THRESHOLDS.analytics.revenueForBounceRate) {
|
for (const threshold of QUESTION_THRESHOLDS.analytics.revenueForBounceRate) {
|
||||||
const count = countByPredicate(
|
const count = metrics.filter(
|
||||||
metrics,
|
|
||||||
m => m.revenue > threshold && m.bounceRate < QUESTION_THRESHOLDS.analytics.bounceRateThreshold,
|
m => m.revenue > threshold && m.bounceRate < QUESTION_THRESHOLDS.analytics.bounceRateThreshold,
|
||||||
)
|
).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import type { EventLog } from '../datasets'
|
import type { EventLog } from '../datasets'
|
||||||
import type { Question } from '../types'
|
import type { Question } from '../types'
|
||||||
import { QUESTION_LIMITS } from '../constants'
|
import { QUESTION_LIMITS } from '../constants'
|
||||||
import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
|
import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate event log questions
|
* Generate event log questions
|
||||||
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
|
|||||||
export function generateEventLogsQuestions(logs: EventLog[], getId: () => string): Question[] {
|
export function generateEventLogsQuestions(logs: EventLog[], getId: () => string): Question[] {
|
||||||
const questions: Question[] = []
|
const questions: Question[] = []
|
||||||
|
|
||||||
if (logs.length === 0)
|
|
||||||
return questions
|
|
||||||
|
|
||||||
// Field retrieval: log metadata
|
// Field retrieval: log metadata
|
||||||
const logFieldGenerators: Array<(log: EventLog, getId: () => string) => Question> = [
|
const logFieldGenerators: Array<(log: EventLog, getId: () => string) => Question> = [
|
||||||
(log, getId) => new QuestionBuilder()
|
(log, getId) => new QuestionBuilder()
|
||||||
@@ -76,7 +73,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
// Aggregation: by level
|
// Aggregation: by level
|
||||||
const levels = [...new Set(logs.map(l => l.level))]
|
const levels = [...new Set(logs.map(l => l.level))]
|
||||||
for (const level of levels) {
|
for (const level of levels) {
|
||||||
const count = countByPredicate(logs, l => l.level === level)
|
const count = logs.filter(l => l.level === level).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -91,7 +88,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
// Aggregation: by endpoint
|
// Aggregation: by endpoint
|
||||||
const endpoints = [...new Set(logs.map(l => l.endpoint))]
|
const endpoints = [...new Set(logs.map(l => l.endpoint))]
|
||||||
for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.aggregationEndpoints)) {
|
for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.aggregationEndpoints)) {
|
||||||
const count = countByPredicate(logs, l => l.endpoint === endpoint)
|
const count = logs.filter(l => l.endpoint === endpoint).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -104,8 +101,8 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Aggregation: by status code range
|
// Aggregation: by status code range
|
||||||
const errorCount = countByPredicate(logs, l => l.statusCode >= 400)
|
const errorCount = logs.filter(l => l.statusCode >= 400).length
|
||||||
const successCount = countByPredicate(logs, l => l.statusCode >= 200 && l.statusCode < 300)
|
const successCount = logs.filter(l => l.statusCode >= 200 && l.statusCode < 300).length
|
||||||
|
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
@@ -124,12 +121,21 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Aggregation: retryable errors
|
||||||
|
const retryableErrorCount = logs.filter(l => l.error?.retryable === true).length
|
||||||
|
questions.push(
|
||||||
|
new QuestionBuilder()
|
||||||
|
.id(getId())
|
||||||
|
.prompt('How many log entries have a retryable error?')
|
||||||
|
.groundTruth(String(retryableErrorCount))
|
||||||
|
.type('aggregation')
|
||||||
|
.dataset('event-logs')
|
||||||
|
.build(),
|
||||||
|
)
|
||||||
|
|
||||||
// Filtering: multi-condition (level AND status)
|
// Filtering: multi-condition (level AND status)
|
||||||
for (const level of levels.slice(0, QUESTION_LIMITS.eventLogs.filteringLevelAndStatus)) {
|
for (const level of levels.slice(0, QUESTION_LIMITS.eventLogs.filteringLevelAndStatus)) {
|
||||||
const count = countByPredicate(
|
const count = logs.filter(l => l.level === level && l.statusCode >= 400).length
|
||||||
logs,
|
|
||||||
l => l.level === level && l.statusCode >= 400,
|
|
||||||
)
|
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -143,10 +149,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
|
|
||||||
// Filtering: endpoint AND status
|
// Filtering: endpoint AND status
|
||||||
for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointAndStatus)) {
|
for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointAndStatus)) {
|
||||||
const count = countByPredicate(
|
const count = logs.filter(l => l.endpoint === endpoint && l.statusCode >= 500).length
|
||||||
logs,
|
|
||||||
l => l.endpoint === endpoint && l.statusCode >= 500,
|
|
||||||
)
|
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -158,5 +161,19 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Filtering: endpoint AND retryable error
|
||||||
|
for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointAndStatus)) {
|
||||||
|
const count = logs.filter(l => l.endpoint === endpoint && l.error?.retryable === true).length
|
||||||
|
questions.push(
|
||||||
|
new QuestionBuilder()
|
||||||
|
.id(getId())
|
||||||
|
.prompt(`How many log entries for endpoint "${endpoint}" have a retryable error?`)
|
||||||
|
.groundTruth(String(count))
|
||||||
|
.type('filtering')
|
||||||
|
.dataset('event-logs')
|
||||||
|
.build(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
return questions
|
return questions
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import type { Repository } from '../datasets'
|
import type { Repository } from '../datasets'
|
||||||
import type { Question } from '../types'
|
import type { Question } from '../types'
|
||||||
import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
|
import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
|
||||||
import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
|
import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate GitHub repository questions
|
* Generate GitHub repository questions
|
||||||
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
|
|||||||
export function generateGithubQuestions(repos: Repository[], getId: () => string): Question[] {
|
export function generateGithubQuestions(repos: Repository[], getId: () => string): Question[] {
|
||||||
const questions: Question[] = []
|
const questions: Question[] = []
|
||||||
|
|
||||||
if (repos.length === 0)
|
|
||||||
return questions
|
|
||||||
|
|
||||||
// Field retrieval: repository metadata
|
// Field retrieval: repository metadata
|
||||||
const repoFieldGenerators: Array<(repo: Repository, getId: () => string) => Question> = [
|
const repoFieldGenerators: Array<(repo: Repository, getId: () => string) => Question> = [
|
||||||
(repo, getId) => new QuestionBuilder()
|
(repo, getId) => new QuestionBuilder()
|
||||||
@@ -92,7 +89,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
// Aggregation: by default branch
|
// Aggregation: by default branch
|
||||||
const branches = [...new Set(repos.map(r => r.defaultBranch))]
|
const branches = [...new Set(repos.map(r => r.defaultBranch))]
|
||||||
for (const branch of branches.slice(0, QUESTION_LIMITS.github.aggregationBranches)) {
|
for (const branch of branches.slice(0, QUESTION_LIMITS.github.aggregationBranches)) {
|
||||||
const count = countByPredicate(repos, r => r.defaultBranch === branch)
|
const count = repos.filter(r => r.defaultBranch === branch).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -106,7 +103,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
|
|
||||||
// Aggregation: high star counts
|
// Aggregation: high star counts
|
||||||
for (const threshold of QUESTION_THRESHOLDS.github.stars) {
|
for (const threshold of QUESTION_THRESHOLDS.github.stars) {
|
||||||
const count = countByPredicate(repos, r => r.stars > threshold)
|
const count = repos.filter(r => r.stars > threshold).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -120,7 +117,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
|
|
||||||
// Aggregation: high fork counts
|
// Aggregation: high fork counts
|
||||||
for (const threshold of QUESTION_THRESHOLDS.github.forks) {
|
for (const threshold of QUESTION_THRESHOLDS.github.forks) {
|
||||||
const count = countByPredicate(repos, r => r.forks > threshold)
|
const count = repos.filter(r => r.forks > threshold).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -134,7 +131,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
|
|
||||||
// Aggregation: high watcher counts
|
// Aggregation: high watcher counts
|
||||||
for (const threshold of QUESTION_THRESHOLDS.github.watchers) {
|
for (const threshold of QUESTION_THRESHOLDS.github.watchers) {
|
||||||
const count = countByPredicate(repos, r => r.watchers > threshold)
|
const count = repos.filter(r => r.watchers > threshold).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -148,10 +145,9 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
|
|
||||||
// Filtering: multi-condition (stars AND forks)
|
// Filtering: multi-condition (stars AND forks)
|
||||||
for (const combo of QUESTION_THRESHOLDS.github.starForkCombinations.slice(0, QUESTION_LIMITS.github.filteringStarsAndForks)) {
|
for (const combo of QUESTION_THRESHOLDS.github.starForkCombinations.slice(0, QUESTION_LIMITS.github.filteringStarsAndForks)) {
|
||||||
const count = countByPredicate(
|
const count = repos.filter(
|
||||||
repos,
|
|
||||||
r => r.stars > combo.stars && r.forks > combo.forks,
|
r => r.stars > combo.stars && r.forks > combo.forks,
|
||||||
)
|
).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -165,10 +161,9 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
|
|||||||
|
|
||||||
// Filtering: stars AND watchers
|
// Filtering: stars AND watchers
|
||||||
for (const combo of QUESTION_THRESHOLDS.github.starWatcherCombinations) {
|
for (const combo of QUESTION_THRESHOLDS.github.starWatcherCombinations) {
|
||||||
const count = countByPredicate(
|
const count = repos.filter(
|
||||||
repos,
|
|
||||||
r => r.stars > combo.stars && r.watchers > combo.watchers,
|
r => r.stars > combo.stars && r.watchers > combo.watchers,
|
||||||
)
|
).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
|
|||||||
@@ -10,10 +10,9 @@ import { generateTabularQuestions } from './tabular'
|
|||||||
import { createIdGenerator } from './utils'
|
import { createIdGenerator } from './utils'
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate all questions from datasets
|
* Generate ~200 questions from all datasets
|
||||||
*
|
*
|
||||||
* @remarks
|
* @remarks
|
||||||
* Generates ~150-160 questions across different question types and datasets:
|
|
||||||
* - Field Retrieval: Direct field access with no computation
|
* - Field Retrieval: Direct field access with no computation
|
||||||
* Examples: "What is X's salary?", "What is the status of order Y?"
|
* Examples: "What is X's salary?", "What is the status of order Y?"
|
||||||
* - Aggregation: Counts, sums, averages, min/max operations (including single-condition filters)
|
* - Aggregation: Counts, sums, averages, min/max operations (including single-condition filters)
|
||||||
|
|||||||
@@ -34,6 +34,26 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
prompt: 'What is the session duration?',
|
prompt: 'What is the session duration?',
|
||||||
groundTruth: String(config.authentication.session.duration),
|
groundTruth: String(config.authentication.session.duration),
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
prompt: 'What is the minimum connection pool size?',
|
||||||
|
groundTruth: String(config.database.pool.min),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
prompt: 'What is the connection pool idle timeout?',
|
||||||
|
groundTruth: String(config.database.pool.idleTimeout),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
prompt: 'What is the database name?',
|
||||||
|
groundTruth: config.database.name,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
prompt: 'What is the session refresh threshold?',
|
||||||
|
groundTruth: String(config.authentication.session.refreshThreshold),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
prompt: 'What is the version in the configuration?',
|
||||||
|
groundTruth: config.version,
|
||||||
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
for (const q of fieldRetrievalQuestions.slice(0, QUESTION_LIMITS.nestedConfig.fieldRetrieval)) {
|
for (const q of fieldRetrievalQuestions.slice(0, QUESTION_LIMITS.nestedConfig.fieldRetrieval)) {
|
||||||
@@ -93,6 +113,18 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Aggregation: providers with admin scope
|
||||||
|
const adminScopeProviderCount = config.authentication.providers.filter(p => p.scopes.includes('admin')).length
|
||||||
|
questions.push(
|
||||||
|
new QuestionBuilder()
|
||||||
|
.id(getId())
|
||||||
|
.prompt('How many authentication providers include the "admin" scope?')
|
||||||
|
.groundTruth(String(adminScopeProviderCount))
|
||||||
|
.type('aggregation')
|
||||||
|
.dataset('nested-config')
|
||||||
|
.build(),
|
||||||
|
)
|
||||||
|
|
||||||
// Aggregation: feature flag details
|
// Aggregation: feature flag details
|
||||||
const enabledFeatures = Object.entries(config.features).filter(([_, f]) => f.enabled).length
|
const enabledFeatures = Object.entries(config.features).filter(([_, f]) => f.enabled).length
|
||||||
questions.push(
|
questions.push(
|
||||||
@@ -117,6 +149,67 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.build(),
|
.build(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Aggregation: additional nested counts
|
||||||
|
const totalPermissions = Object.values(config.permissions.roles).reduce((sum, role) => sum + role.permissions.length, 0)
|
||||||
|
const distinctPermissions = new Set(Object.values(config.permissions.roles).flatMap(r => r.permissions)).size
|
||||||
|
const distinctScopes = new Set(config.authentication.providers.flatMap(p => p.scopes)).size
|
||||||
|
const totalVariants = Object.values(config.features).reduce((sum, f) => sum + f.variants.length, 0)
|
||||||
|
const highPriorityReplicas = config.database.replicas.filter(r => r.priority > 2).length
|
||||||
|
const featuresWithHighRollout = Object.values(config.features).filter(f => f.rollout > 50).length
|
||||||
|
const groupsWithMultipleRoles = Object.values(config.permissions.groups).filter(g => g.roles.length > 1).length
|
||||||
|
|
||||||
|
questions.push(
|
||||||
|
new QuestionBuilder()
|
||||||
|
.id(getId())
|
||||||
|
.prompt('What is the total number of permissions across all roles?')
|
||||||
|
.groundTruth(String(totalPermissions))
|
||||||
|
.type('aggregation')
|
||||||
|
.dataset('nested-config')
|
||||||
|
.build(),
|
||||||
|
new QuestionBuilder()
|
||||||
|
.id(getId())
|
||||||
|
.prompt('How many distinct permissions are defined across all roles?')
|
||||||
|
.groundTruth(String(distinctPermissions))
|
||||||
|
.type('aggregation')
|
||||||
|
.dataset('nested-config')
|
||||||
|
.build(),
|
||||||
|
new QuestionBuilder()
|
||||||
|
.id(getId())
|
||||||
|
.prompt('How many distinct scopes are defined across all authentication providers?')
|
||||||
|
.groundTruth(String(distinctScopes))
|
||||||
|
.type('aggregation')
|
||||||
|
.dataset('nested-config')
|
||||||
|
.build(),
|
||||||
|
new QuestionBuilder()
|
||||||
|
.id(getId())
|
||||||
|
.prompt('What is the total number of variants across all feature flags?')
|
||||||
|
.groundTruth(String(totalVariants))
|
||||||
|
.type('aggregation')
|
||||||
|
.dataset('nested-config')
|
||||||
|
.build(),
|
||||||
|
new QuestionBuilder()
|
||||||
|
.id(getId())
|
||||||
|
.prompt('How many database replicas have a priority greater than 2?')
|
||||||
|
.groundTruth(String(highPriorityReplicas))
|
||||||
|
.type('aggregation')
|
||||||
|
.dataset('nested-config')
|
||||||
|
.build(),
|
||||||
|
new QuestionBuilder()
|
||||||
|
.id(getId())
|
||||||
|
.prompt('How many feature flags have a rollout percentage greater than 50?')
|
||||||
|
.groundTruth(String(featuresWithHighRollout))
|
||||||
|
.type('aggregation')
|
||||||
|
.dataset('nested-config')
|
||||||
|
.build(),
|
||||||
|
new QuestionBuilder()
|
||||||
|
.id(getId())
|
||||||
|
.prompt('How many groups have more than one role assigned?')
|
||||||
|
.groundTruth(String(groupsWithMultipleRoles))
|
||||||
|
.type('aggregation')
|
||||||
|
.dataset('nested-config')
|
||||||
|
.build(),
|
||||||
|
)
|
||||||
|
|
||||||
// Filtering: complex multi-condition queries
|
// Filtering: complex multi-condition queries
|
||||||
const filteringQuestions = [
|
const filteringQuestions = [
|
||||||
{
|
{
|
||||||
@@ -129,6 +222,31 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
groundTruth: String(Object.entries(config.permissions.groups)
|
groundTruth: String(Object.entries(config.permissions.groups)
|
||||||
.filter(([_, g]) => g.roles.includes('admin')).length),
|
.filter(([_, g]) => g.roles.includes('admin')).length),
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
prompt: 'How many database replicas have priority greater than 2 and port 5432?',
|
||||||
|
groundTruth: String(config.database.replicas
|
||||||
|
.filter(r => r.priority > 2 && r.port === 5432).length),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
prompt: 'How many authentication providers have more than 2 scopes?',
|
||||||
|
groundTruth: String(config.authentication.providers
|
||||||
|
.filter(p => p.scopes.length > 2).length),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
prompt: 'How many roles have at least 5 permissions?',
|
||||||
|
groundTruth: String(Object.values(config.permissions.roles)
|
||||||
|
.filter(r => r.permissions.length >= 5).length),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
prompt: 'How many feature flags are disabled with rollout less than 25%?',
|
||||||
|
groundTruth: String(Object.values(config.features)
|
||||||
|
.filter(f => !f.enabled && f.rollout < 25).length),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
prompt: 'How many enabled features have at least 2 variants?',
|
||||||
|
groundTruth: String(Object.values(config.features)
|
||||||
|
.filter(f => f.enabled && f.variants.length >= 2).length),
|
||||||
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
for (const q of filteringQuestions.slice(0, QUESTION_LIMITS.nestedConfig.filteringComplex)) {
|
for (const q of filteringQuestions.slice(0, QUESTION_LIMITS.nestedConfig.filteringComplex)) {
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import type { Order } from '../datasets'
|
import type { Order } from '../datasets'
|
||||||
import type { Question } from '../types'
|
import type { Question } from '../types'
|
||||||
import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
|
import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
|
||||||
import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
|
import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate nested (orders) questions
|
* Generate nested (orders) questions
|
||||||
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
|
|||||||
export function generateNestedQuestions(orders: Order[], getId: () => string): Question[] {
|
export function generateNestedQuestions(orders: Order[], getId: () => string): Question[] {
|
||||||
const questions: Question[] = []
|
const questions: Question[] = []
|
||||||
|
|
||||||
if (orders.length === 0)
|
|
||||||
return questions
|
|
||||||
|
|
||||||
// Field retrieval: order totals and statuses
|
// Field retrieval: order totals and statuses
|
||||||
const orderFieldGenerators: Array<(order: Order, getId: () => string) => Question> = [
|
const orderFieldGenerators: Array<(order: Order, getId: () => string) => Question> = [
|
||||||
(order, getId) => new QuestionBuilder()
|
(order, getId) => new QuestionBuilder()
|
||||||
@@ -89,7 +86,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
// Count by status
|
// Count by status
|
||||||
const statuses = [...new Set(orders.map(o => o.status))]
|
const statuses = [...new Set(orders.map(o => o.status))]
|
||||||
for (const status of statuses.slice(0, QUESTION_LIMITS.nested.aggregationStatuses)) {
|
for (const status of statuses.slice(0, QUESTION_LIMITS.nested.aggregationStatuses)) {
|
||||||
const count = countByPredicate(orders, o => o.status === status)
|
const count = orders.filter(o => o.status === status).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -134,7 +131,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
|
|
||||||
// Aggregation: high-value orders (single-condition filter)
|
// Aggregation: high-value orders (single-condition filter)
|
||||||
for (const threshold of QUESTION_THRESHOLDS.nested.highValueOrders) {
|
for (const threshold of QUESTION_THRESHOLDS.nested.highValueOrders) {
|
||||||
const count = countByPredicate(orders, o => o.total > threshold)
|
const count = orders.filter(o => o.total > threshold).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -149,10 +146,9 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
// Filtering: multi-condition queries (status AND value)
|
// Filtering: multi-condition queries (status AND value)
|
||||||
const orderStatuses = [...new Set(orders.map(o => o.status))]
|
const orderStatuses = [...new Set(orders.map(o => o.status))]
|
||||||
for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndValue)) {
|
for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndValue)) {
|
||||||
const count = countByPredicate(
|
const count = orders.filter(
|
||||||
orders,
|
|
||||||
o => o.status === status && o.total > QUESTION_THRESHOLDS.nested.statusValueThreshold,
|
o => o.status === status && o.total > QUESTION_THRESHOLDS.nested.statusValueThreshold,
|
||||||
)
|
).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -166,10 +162,9 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
|
|
||||||
// Filtering: status AND items count (multi-condition)
|
// Filtering: status AND items count (multi-condition)
|
||||||
for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndItems)) {
|
for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndItems)) {
|
||||||
const count = countByPredicate(
|
const count = orders.filter(
|
||||||
orders,
|
|
||||||
o => o.status === status && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold,
|
o => o.status === status && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold,
|
||||||
)
|
).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -183,10 +178,9 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
|
|||||||
|
|
||||||
// Filtering: total AND items count (multi-condition)
|
// Filtering: total AND items count (multi-condition)
|
||||||
for (const threshold of QUESTION_THRESHOLDS.nested.totalThresholdsForItems) {
|
for (const threshold of QUESTION_THRESHOLDS.nested.totalThresholdsForItems) {
|
||||||
const count = countByPredicate(
|
const count = orders.filter(
|
||||||
orders,
|
|
||||||
o => o.total > threshold && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold,
|
o => o.total > threshold && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold,
|
||||||
)
|
).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import type { Employee } from '../datasets'
|
import type { Employee } from '../datasets'
|
||||||
import type { Question } from '../types'
|
import type { Question } from '../types'
|
||||||
import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
|
import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
|
||||||
import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
|
import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate tabular (employee) questions
|
* Generate tabular (employee) questions
|
||||||
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
|
|||||||
export function generateTabularQuestions(employees: Employee[], getId: () => string): Question[] {
|
export function generateTabularQuestions(employees: Employee[], getId: () => string): Question[] {
|
||||||
const questions: Question[] = []
|
const questions: Question[] = []
|
||||||
|
|
||||||
if (employees.length === 0)
|
|
||||||
return questions
|
|
||||||
|
|
||||||
// Field retrieval: specific employees
|
// Field retrieval: specific employees
|
||||||
const fieldGenerators: Array<(emp: Employee, getId: () => string) => Question> = [
|
const fieldGenerators: Array<(emp: Employee, getId: () => string) => Question> = [
|
||||||
(emp, getId) => new QuestionBuilder()
|
(emp, getId) => new QuestionBuilder()
|
||||||
@@ -62,7 +59,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
// Aggregation: count by department
|
// Aggregation: count by department
|
||||||
const departments = [...new Set(employees.map(e => e.department))]
|
const departments = [...new Set(employees.map(e => e.department))]
|
||||||
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.aggregationDepartments)) {
|
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.aggregationDepartments)) {
|
||||||
const count = countByPredicate(employees, e => e.department === dept)
|
const count = employees.filter(e => e.department === dept).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -76,7 +73,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
|
|
||||||
// Aggregation: salary ranges (single-condition filters)
|
// Aggregation: salary ranges (single-condition filters)
|
||||||
for (const threshold of QUESTION_THRESHOLDS.tabular.salaryRanges) {
|
for (const threshold of QUESTION_THRESHOLDS.tabular.salaryRanges) {
|
||||||
const count = countByPredicate(employees, e => e.salary > threshold)
|
const count = employees.filter(e => e.salary > threshold).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -91,8 +88,8 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
// Aggregation: totals and averages
|
// Aggregation: totals and averages
|
||||||
const totalEmployees = employees.length
|
const totalEmployees = employees.length
|
||||||
const avgSalary = Math.round(employees.reduce((sum, e) => sum + e.salary, 0) / totalEmployees)
|
const avgSalary = Math.round(employees.reduce((sum, e) => sum + e.salary, 0) / totalEmployees)
|
||||||
const activeCount = countByPredicate(employees, e => e.active)
|
const activeCount = employees.filter(e => e.active).length
|
||||||
const inactiveCount = countByPredicate(employees, e => !e.active)
|
const inactiveCount = employees.filter(e => !e.active).length
|
||||||
|
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
@@ -127,10 +124,9 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
|
|
||||||
// Filtering: count by department with salary filter (multi-condition)
|
// Filtering: count by department with salary filter (multi-condition)
|
||||||
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringMultiConditionDepartments)) {
|
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringMultiConditionDepartments)) {
|
||||||
const count = countByPredicate(
|
const count = employees.filter(
|
||||||
employees,
|
|
||||||
e => e.department === dept && e.salary > QUESTION_THRESHOLDS.tabular.departmentSalaryThreshold,
|
e => e.department === dept && e.salary > QUESTION_THRESHOLDS.tabular.departmentSalaryThreshold,
|
||||||
)
|
).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -144,7 +140,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
|
|
||||||
// Filtering: active employees by experience (multi-condition)
|
// Filtering: active employees by experience (multi-condition)
|
||||||
for (const exp of QUESTION_THRESHOLDS.tabular.experienceYears.slice(0, QUESTION_LIMITS.tabular.filteringExperience)) {
|
for (const exp of QUESTION_THRESHOLDS.tabular.experienceYears.slice(0, QUESTION_LIMITS.tabular.filteringExperience)) {
|
||||||
const count = countByPredicate(employees, e => e.yearsExperience > exp && e.active)
|
const count = employees.filter(e => e.yearsExperience > exp && e.active).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -158,10 +154,9 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
|
|
||||||
// Filtering: department by experience (multi-condition)
|
// Filtering: department by experience (multi-condition)
|
||||||
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentExp)) {
|
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentExp)) {
|
||||||
const count = countByPredicate(
|
const count = employees.filter(
|
||||||
employees,
|
|
||||||
e => e.department === dept && e.yearsExperience > QUESTION_THRESHOLDS.tabular.departmentExperienceThreshold,
|
e => e.department === dept && e.yearsExperience > QUESTION_THRESHOLDS.tabular.departmentExperienceThreshold,
|
||||||
)
|
).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
@@ -175,7 +170,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
|
|||||||
|
|
||||||
// Filtering: department by active status (multi-condition)
|
// Filtering: department by active status (multi-condition)
|
||||||
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentActive)) {
|
for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentActive)) {
|
||||||
const count = countByPredicate(employees, e => e.department === dept && e.active)
|
const count = employees.filter(e => e.department === dept && e.active).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
|
|||||||
@@ -61,14 +61,7 @@ export class QuestionBuilder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper: Count items matching a predicate
|
* Rotate through question generators
|
||||||
*/
|
|
||||||
export function countByPredicate<T>(items: T[], predicate: (item: T) => boolean): number {
|
|
||||||
return items.filter(predicate).length
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Helper: Rotate through question generators
|
|
||||||
*/
|
*/
|
||||||
export function rotateQuestions<T>(
|
export function rotateQuestions<T>(
|
||||||
items: T[],
|
items: T[],
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ export interface Question {
|
|||||||
id: string
|
id: string
|
||||||
prompt: string
|
prompt: string
|
||||||
groundTruth: string
|
groundTruth: string
|
||||||
type: 'field-retrieval' | 'aggregation' | 'filtering' | 'comparison'
|
type: 'field-retrieval' | 'aggregation' | 'filtering'
|
||||||
dataset: string
|
dataset: string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user