mirror of
https://github.com/voson-wang/toon.git
synced 2026-01-29 23:34:10 +08:00
chore(benchmarks): add structure-awareness questions
This commit is contained in:
@@ -4,37 +4,11 @@ import * as url from 'node:url'
|
||||
/**
 * Filesystem path obtained by resolving `../../` against this module's URL
 *
 * @remarks
 * Presumably the repository root; confirm against the directory layout.
 */
export const ROOT_DIR: string = url.fileURLToPath(new URL('../../', import.meta.url))
|
||||
/**
 * Filesystem path obtained by resolving `../` against this module's URL
 *
 * @remarks
 * Presumably the benchmarks package directory; confirm against the directory layout.
 */
export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.meta.url))
|
||||
|
||||
/**
 * Model-specific RPM (requests per minute) limits to handle API quotas
 *
 * @remarks
 * Keys are model identifiers. Set `undefined` for models without specific limits;
 * looking up a model that is not listed also yields `undefined`.
 */
/// keep-sorted
export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
  'claude-haiku-4-5-20251001': 50,
  'gemini-2.5-flash': 25,
  'gpt-5-nano': 50,
  'grok-4-fast-non-reasoning': 50,
}
|
||||
|
||||
/**
 * Default concurrency for parallel evaluations to prevent bursting
 */
export const DEFAULT_CONCURRENCY = 10
|
||||
|
||||
/**
 * Display names for data format types
 *
 * @remarks
 * Typed as `Record<string, string>` so it can be indexed with arbitrary format names;
 * a lookup for an unlisted format yields `undefined` at runtime, so callers should
 * provide their own fallback.
 */
export const FORMATTER_DISPLAY_NAMES: Record<string, string> = {
  'json-pretty': 'JSON',
  'json-compact': 'JSON compact',
  'toon': 'TOON',
  'csv': 'CSV',
  'xml': 'XML',
  'yaml': 'YAML',
}
|
||||
|
||||
/**
|
||||
* Enable dry run mode for quick testing with limited AI requests
|
||||
*
|
||||
@@ -51,12 +25,80 @@ export const DRY_RUN_LIMITS = {
|
||||
maxQuestions: 10,
|
||||
}
|
||||
|
||||
/**
 * Model-specific RPM (requests per minute) limits to handle API quotas
 *
 * @remarks
 * Keys are model identifiers. Set `undefined` for models without specific limits;
 * looking up a model that is not listed also yields `undefined`.
 */
/// keep-sorted
export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
  'claude-haiku-4-5-20251001': 50,
  'gemini-2.5-flash': 25,
  'gpt-5-nano': 50,
  'grok-4-fast-non-reasoning': 50,
}
|
||||
|
||||
/**
 * Display names for data format types
 *
 * @remarks
 * Typed as `Record<string, string>` so it can be indexed with arbitrary format names;
 * a lookup for an unlisted format yields `undefined` at runtime, so callers should
 * provide their own fallback.
 */
export const FORMATTER_DISPLAY_NAMES: Record<string, string> = {
  'json-pretty': 'JSON',
  'json-compact': 'JSON compact',
  'toon': 'TOON',
  'csv': 'CSV',
  'xml': 'XML',
  'yaml': 'YAML',
}
|
||||
|
||||
/**
 * Question type identifiers
 *
 * @remarks
 * Declared `as const` so the element type is the union of these literals.
 */
export const QUESTION_TYPES = [
  'field-retrieval',
  'aggregation',
  'filtering',
  'structure-awareness',
] as const

/**
 * Display names for question types
 *
 * @remarks
 * The `satisfies` clause makes the compiler reject this object when an entry of
 * `QUESTION_TYPES` has no label (or a key is misspelled), while keeping the
 * literal types of the labels.
 */
export const QUESTION_TYPE_LABELS = {
  'field-retrieval': 'Field Retrieval',
  'aggregation': 'Aggregation',
  'filtering': 'Filtering',
  'structure-awareness': 'Structure Awareness',
} as const satisfies Record<typeof QUESTION_TYPES[number], string>
|
||||
|
||||
/**
 * Dataset identifiers
 *
 * @remarks
 * Declared `as const` so the element type is the union of these literals.
 */
export const DATASET_NAMES = [
  'tabular',
  'nested',
  'analytics',
  'github',
  'event-logs',
  'nested-config',
] as const
|
||||
|
||||
/**
 * Structure class identifiers
 *
 * @remarks
 * Declared `as const` so the element type is the union of these literals.
 */
export const STRUCTURE_CLASSES = [
  'uniform',
  'semi-uniform',
  'nested',
  'deep',
] as const
|
||||
|
||||
/**
|
||||
* Threshold values for filtering and aggregation questions
|
||||
*/
|
||||
export const QUESTION_THRESHOLDS = {
|
||||
tabular: {
|
||||
salaryRanges: [60000, 80000, 100000, 120000],
|
||||
salaryRanges: [60000, 80000, 100000],
|
||||
experienceYears: [5, 10, 15, 20],
|
||||
departmentSalaryThreshold: 80000,
|
||||
departmentExperienceThreshold: 10,
|
||||
@@ -68,11 +110,11 @@ export const QUESTION_THRESHOLDS = {
|
||||
totalThresholdsForItems: [300, 500],
|
||||
},
|
||||
analytics: {
|
||||
views: [5000, 7000],
|
||||
conversions: [10, 30],
|
||||
views: [6000],
|
||||
conversions: [20],
|
||||
viewsForFiltering: [6000, 7000],
|
||||
conversionsForFiltering: 15,
|
||||
revenueThresholds: [500, 1000, 1500, 2000, 2500],
|
||||
revenueThresholds: [1000, 1500, 2000],
|
||||
viewsThresholdForRevenue: 6000,
|
||||
clicksForFiltering: [250, 400],
|
||||
conversionsForClickFiltering: 15,
|
||||
@@ -81,8 +123,8 @@ export const QUESTION_THRESHOLDS = {
|
||||
},
|
||||
github: {
|
||||
stars: [100000, 150000, 200000],
|
||||
forks: [20000, 35000, 50000],
|
||||
watchers: [5000, 8000],
|
||||
forks: [20000, 35000],
|
||||
watchers: [8000],
|
||||
starForkCombinations: [
|
||||
{ stars: 75000, forks: 15000 },
|
||||
{ stars: 100000, forks: 20000 },
|
||||
@@ -101,18 +143,18 @@ export const QUESTION_THRESHOLDS = {
|
||||
*/
|
||||
export const QUESTION_LIMITS = {
|
||||
tabular: {
|
||||
fieldRetrieval: 14,
|
||||
aggregationDepartments: 4,
|
||||
fieldRetrieval: 12,
|
||||
aggregationDepartments: 3,
|
||||
filteringMultiConditionDepartments: 5,
|
||||
filteringExperience: 3,
|
||||
filteringDepartmentExp: 3,
|
||||
filteringDepartmentActive: 3,
|
||||
filteringDepartmentActive: 2,
|
||||
},
|
||||
nested: {
|
||||
fieldRetrievalOrders: 8,
|
||||
fieldRetrievalCustomers: 10,
|
||||
aggregationStatuses: 5,
|
||||
filteringStatusAndValue: 5,
|
||||
fieldRetrievalCustomers: 8,
|
||||
aggregationStatuses: 3,
|
||||
filteringStatusAndValue: 4,
|
||||
filteringStatusAndItems: 3,
|
||||
},
|
||||
analytics: {
|
||||
@@ -121,16 +163,17 @@ export const QUESTION_LIMITS = {
|
||||
github: {
|
||||
fieldRetrievalRepos: 11,
|
||||
aggregationBranches: 2,
|
||||
filteringStarsAndForks: 8,
|
||||
filteringStarsAndForks: 3,
|
||||
},
|
||||
eventLogs: {
|
||||
fieldRetrieval: 10,
|
||||
aggregationEndpoints: 4,
|
||||
aggregationEndpoints: 2,
|
||||
filteringLevelAndStatus: 3,
|
||||
filteringEndpointAndStatus: 3,
|
||||
filteringEndpointRetryable: 2,
|
||||
},
|
||||
nestedConfig: {
|
||||
fieldRetrieval: 10,
|
||||
filteringComplex: 6,
|
||||
filteringComplex: 5,
|
||||
},
|
||||
} as const
|
||||
|
||||
@@ -181,7 +181,7 @@ export function generateAnalyticsData(days: number, startDate = '2025-01-01'): {
|
||||
/**
|
||||
* Generate employee data (uniform tabular structure)
|
||||
*/
|
||||
const departments: readonly string[] = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] as const
|
||||
const departments = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] as const
|
||||
|
||||
function generateEmployees(count: number): { employees: Employee[] } {
|
||||
return {
|
||||
|
||||
@@ -16,6 +16,33 @@ export const models: LanguageModelV2[] = [
|
||||
xai('grok-4-fast-non-reasoning'),
|
||||
]
|
||||
|
||||
/**
 * Format primers
 *
 * @remarks
 * Neutral descriptions to help models parse each format, keyed by format name.
 * A lookup for an unlisted format yields `undefined` at runtime.
 */
export const PRIMERS: Record<string, string> = {
  'toon': 'TOON: Indentation-based. Arrays declare length and fields (e.g., items[N]{f1,f2}:). Rows use single delimiter. Values may be quoted.',
  'json-pretty': 'JSON: Strict JSON objects/arrays with repeated keys per row.',
  'json-compact': 'JSON (compact): Strict JSON without extra whitespace.',
  'yaml': 'YAML: Indentation-based key/value and lists (- items).',
  'xml': 'XML: Tag-based tree structure with nested elements.',
  'csv': 'CSV: Header row, comma-separated values. First row contains field names.',
}
|
||||
|
||||
/**
 * Code fence language tags for proper syntax highlighting
 *
 * @remarks
 * Keyed by format name; both JSON variants map to the same `json` tag.
 */
export const FENCE: Record<string, string> = {
  'toon': 'toon',
  'json-pretty': 'json',
  'json-compact': 'json',
  'yaml': 'yaml',
  'xml': 'xml',
  'csv': 'csv',
}
|
||||
|
||||
/**
|
||||
* Evaluate a single question with a specific format and model
|
||||
*/
|
||||
@@ -33,10 +60,15 @@ export async function evaluateQuestion(
|
||||
model: LanguageModelV2
|
||||
},
|
||||
): Promise<EvaluationResult> {
|
||||
const primer = PRIMERS[formatName] ?? ''
|
||||
const fence = FENCE[formatName] ?? ''
|
||||
|
||||
const prompt = `
|
||||
${primer}
|
||||
|
||||
Given the following data in ${formatName} format:
|
||||
|
||||
\`\`\`
|
||||
\`\`\`${fence}
|
||||
${formattedData}
|
||||
\`\`\`
|
||||
|
||||
|
||||
@@ -166,7 +166,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
||||
}
|
||||
|
||||
// Filtering: endpoint AND retryable error
|
||||
for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointAndStatus)) {
|
||||
for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointRetryable)) {
|
||||
const count = logs.filter(l => l.endpoint === endpoint && l.error?.retryable === true).length
|
||||
questions.push(
|
||||
new QuestionBuilder()
|
||||
|
||||
@@ -6,11 +6,12 @@ import { generateEventLogsQuestions } from './event-logs'
|
||||
import { generateGithubQuestions } from './github'
|
||||
import { generateNestedQuestions } from './nested'
|
||||
import { generateNestedConfigQuestions } from './nested-config'
|
||||
import { generateStructureQuestions } from './structure'
|
||||
import { generateTabularQuestions } from './tabular'
|
||||
import { createIdGenerator } from './utils'
|
||||
|
||||
/**
|
||||
* Generate ~200 questions from all datasets
|
||||
* Generate questions from all datasets
|
||||
*
|
||||
* @remarks
|
||||
* - Field Retrieval: Direct field access with no computation
|
||||
@@ -19,6 +20,8 @@ import { createIdGenerator } from './utils'
|
||||
* Examples: "How many X?", "What is the total/average?", "How many X > threshold?"
|
||||
* - Filtering: Multi-condition queries requiring complex logical operations
|
||||
* Examples: "How many X WHERE condition1 AND condition2?"
|
||||
* - Structure Awareness: Tests format-native structural affordances (TOON's [N] and {fields}, CSV's header)
|
||||
* Examples: "How many records?", "List the field names", "What is the last record's field?"
|
||||
*/
|
||||
export function generateQuestions(): Question[] {
|
||||
const questions: Question[] = []
|
||||
@@ -41,5 +44,8 @@ export function generateQuestions(): Question[] {
|
||||
questions.push(...generateEventLogsQuestions(eventLogs, getId))
|
||||
questions.push(...generateNestedConfigQuestions(nestedConfig, getId))
|
||||
|
||||
// Generate structure-awareness questions (tests format-native affordances)
|
||||
questions.push(...generateStructureQuestions(tabular, nested, analytics, github, eventLogs, getId))
|
||||
|
||||
return questions
|
||||
}
|
||||
|
||||
@@ -152,7 +152,6 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
// Aggregation: additional nested counts
|
||||
const totalPermissions = Object.values(config.permissions.roles).reduce((sum, role) => sum + role.permissions.length, 0)
|
||||
const distinctPermissions = new Set(Object.values(config.permissions.roles).flatMap(r => r.permissions)).size
|
||||
const distinctScopes = new Set(config.authentication.providers.flatMap(p => p.scopes)).size
|
||||
const totalVariants = Object.values(config.features).reduce((sum, f) => sum + f.variants.length, 0)
|
||||
const highPriorityReplicas = config.database.replicas.filter(r => r.priority > 2).length
|
||||
const featuresWithHighRollout = Object.values(config.features).filter(f => f.rollout > 50).length
|
||||
@@ -173,13 +172,6 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
||||
.type('aggregation')
|
||||
.dataset('nested-config')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
.prompt('How many distinct scopes are defined across all authentication providers?')
|
||||
.groundTruth(String(distinctScopes))
|
||||
.type('aggregation')
|
||||
.dataset('nested-config')
|
||||
.build(),
|
||||
new QuestionBuilder()
|
||||
.id(getId())
|
||||
.prompt('What is the total number of variants across all feature flags?')
|
||||
|
||||
324
benchmarks/src/questions/structure.ts
Normal file
324
benchmarks/src/questions/structure.ts
Normal file
@@ -0,0 +1,324 @@
|
||||
import type { AnalyticsMetric, Employee, EventLog, Order, Repository } from '../datasets'
|
||||
import type { Question } from '../types'
|
||||
import { QuestionBuilder } from './utils'
|
||||
|
||||
/**
 * Generate structure-awareness questions across all datasets
 *
 * @remarks
 * These questions test format-native structural affordances:
 * - TOON's explicit array length [N] and field declarations {fields}
 * - CSV's header row (but no explicit length)
 * - JSON/YAML have neither unless the model counts manually
 *
 * Questions that need at least one record (last-row lookups, the order with the
 * most items) are skipped for an empty dataset instead of throwing. Positional
 * and field-count answers are derived from the field lists declared in the body,
 * so they cannot drift from the "list the field names" answers.
 *
 * @param employees - Records of the `tabular` dataset
 * @param orders - Records of the `nested` dataset
 * @param metrics - Records of the `analytics` dataset
 * @param repos - Records of the `github` dataset
 * @param logs - Records of the `event-logs` dataset
 * @param getId - Called once per generated question, in output order, to obtain its id
 * @returns The generated questions, all of type `structure-awareness`
 */
export function generateStructureQuestions(
  employees: Employee[],
  orders: Order[],
  metrics: AnalyticsMetric[],
  repos: Repository[],
  logs: EventLog[],
  getId: () => string,
): Question[] {
  const questions: Question[] = []

  // Every question here shares the same type; only dataset, prompt and answer vary
  const add = (dataset: Question['dataset'], prompt: string, groundTruth: string): void => {
    questions.push(
      new QuestionBuilder()
        .id(getId())
        .prompt(prompt)
        .groundTruth(groundTruth)
        .type('structure-awareness')
        .dataset(dataset)
        .build(),
    )
  }

  // NOTE(review): these field lists are maintained by hand and are assumed to mirror
  // the key order produced by the dataset generators — confirm when a schema changes
  const employeeFields = ['id', 'name', 'email', 'department', 'salary', 'yearsExperience', 'active'] as const
  const orderFields = ['orderId', 'customer', 'items', 'subtotal', 'tax', 'total', 'status', 'orderDate'] as const
  const itemFields = ['sku', 'name', 'quantity', 'price'] as const
  const customerFields = ['id', 'name', 'email', 'phone'] as const
  const metricFields = ['date', 'views', 'clicks', 'conversions', 'revenue', 'bounceRate'] as const
  const repoFields = ['id', 'name', 'repo', 'description', 'stars', 'watchers', 'forks', 'defaultBranch', 'createdAt', 'updatedAt', 'pushedAt'] as const
  const logFields = ['timestamp', 'level', 'endpoint', 'statusCode', 'responseTime', 'userId', 'error'] as const

  // ========== TABULAR DATASET (Employees) ==========

  // Count: Total employees (tests array length awareness)
  add('tabular', 'How many employees are in the dataset?', String(employees.length))

  // Field list: Employee fields (tests field name awareness)
  add('tabular', 'List the field names for employees (comma-separated, in order).', employeeFields.join(','))

  // Positional: Third field name for employees (tests TOON {fields} syntax)
  add('tabular', 'What is the 3rd field name for employees?', employeeFields[2])

  // Last row: department and name of the last employee (tests ability to find last row using length)
  const lastEmployee = employees.at(-1)
  if (lastEmployee) {
    add('tabular', 'What is the department of the last employee in the dataset?', lastEmployee.department)
    add('tabular', 'What is the name of the last employee in the dataset?', lastEmployee.name)
  }

  // Field count: How many fields per employee (tests schema awareness)
  add('tabular', 'How many fields does each employee record have?', String(employeeFields.length))

  // ========== NESTED DATASET (Orders) ==========

  // Count: Total orders
  add('nested', 'How many orders are in the dataset?', String(orders.length))

  // Field list: Order fields
  add('nested', 'List the top-level field names for orders (comma-separated, in order).', orderFields.join(','))

  // Nested count: Items in the order with the most items (the first such order wins ties).
  // Seeded with `undefined` so an empty `orders` array does not make `reduce` throw.
  const orderWithManyItems = orders.reduce<Order | undefined>(
    (max, order) => ((max === undefined || order.items.length > max.items.length) ? order : max),
    undefined,
  )
  if (orderWithManyItems) {
    add('nested', `How many items are in order ${orderWithManyItems.orderId}?`, String(orderWithManyItems.items.length))
  }

  // Nested field list: Item fields
  add('nested', 'What are the field names for items within orders (comma-separated, in order)?', itemFields.join(','))

  // Last row: Last order's status
  const lastOrder = orders.at(-1)
  if (lastOrder) {
    add('nested', 'What is the status of the last order in the dataset?', lastOrder.status)
  }

  // Customer field list
  add('nested', 'What are the field names for customer objects within orders (comma-separated, in order)?', customerFields.join(','))

  // ========== ANALYTICS DATASET (Metrics) ==========

  // Count: Total metrics
  add('analytics', 'How many metric records are in the dataset?', String(metrics.length))

  // Field list: Metric fields
  add('analytics', 'List the field names for metrics (comma-separated, in order).', metricFields.join(','))

  // Positional: Fifth field name for metrics (tests TOON {fields} syntax)
  add('analytics', 'What is the 5th field name for analytics metrics?', metricFields[4])

  // Last row: Last metric's date
  const lastMetric = metrics.at(-1)
  if (lastMetric) {
    add('analytics', 'What is the date of the last metric record in the dataset?', lastMetric.date)
  }

  // Field count: How many fields per metric
  add('analytics', 'How many fields does each metric record have?', String(metricFields.length))

  // ========== GITHUB DATASET (Repositories) ==========

  // Count: Total repositories
  add('github', 'How many repositories are in the dataset?', String(repos.length))

  // Field list: Repository fields
  add('github', 'List the field names for repositories (comma-separated, in order).', repoFields.join(','))

  // Positional: Seventh field name for repos (tests TOON {fields} syntax)
  add('github', 'What is the 7th field name for GitHub repositories?', repoFields[6])

  // Last row: Last repo's name
  const lastRepo = repos.at(-1)
  if (lastRepo) {
    add('github', 'What is the name of the last repository in the dataset?', lastRepo.name)
  }

  // Field count: How many fields per repository
  add('github', 'How many fields does each repository record have?', String(repoFields.length))

  // ========== EVENT LOGS DATASET ==========

  // Count: Total logs
  add('event-logs', 'How many log entries are in the dataset?', String(logs.length))

  // Field list: Base log fields (including optional error)
  add('event-logs', 'List the field names for log entries (comma-separated, any order, including optional fields).', logFields.join(','))

  // Last row: Last log's level
  const lastLog = logs.at(-1)
  if (lastLog) {
    add('event-logs', 'What is the level of the last log entry in the dataset?', lastLog.level)
  }

  return questions
}
|
||||
@@ -1,5 +1,5 @@
|
||||
import type { Dataset, EfficiencyRanking, EvaluationResult, FormatResult, Question } from './types'
|
||||
import { FORMATTER_DISPLAY_NAMES } from './constants'
|
||||
import { FORMATTER_DISPLAY_NAMES, QUESTION_TYPE_LABELS, QUESTION_TYPES } from './constants'
|
||||
import { ACCURACY_DATASETS } from './datasets'
|
||||
import { models } from './evaluate'
|
||||
import { supportsCSV } from './formatters'
|
||||
@@ -22,9 +22,9 @@ export function calculateTokenCounts(
|
||||
if (formatName === 'csv' && !supportsCSV(dataset))
|
||||
continue
|
||||
|
||||
const formatted = formatter(dataset.data)
|
||||
const formattedData = formatter(dataset.data)
|
||||
const key = `${formatName}-${dataset.name}`
|
||||
tokenCounts[key] = tokenize(formatted)
|
||||
tokenCounts[key] = tokenize(formattedData)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -200,16 +200,21 @@ function generateDetailedAccuracyReport(
|
||||
|
||||
// Generate performance by model
|
||||
const modelPerformance = generateModelPerformanceTable(formatResults, results, modelNames)
|
||||
|
||||
// Generate question type breakdown
|
||||
const questionTypeBreakdown = generateQuestionTypeBreakdown(formatResults, results, questions)
|
||||
const totalQuestions = [...new Set(results.map(r => r.questionId))].length
|
||||
|
||||
// Calculate question type distribution
|
||||
const fieldRetrievalCount = questions.filter(q => q.type === 'field-retrieval').length
|
||||
const aggregationCount = questions.filter(q => q.type === 'aggregation').length
|
||||
const filteringCount = questions.filter(q => q.type === 'filtering').length
|
||||
const structureAwarenessCount = questions.filter(q => q.type === 'structure-awareness').length
|
||||
|
||||
const fieldRetrievalPercent = ((fieldRetrievalCount / totalQuestions) * 100).toFixed(0)
|
||||
const aggregationPercent = ((aggregationCount / totalQuestions) * 100).toFixed(0)
|
||||
const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)
|
||||
const structureAwarenessPercent = ((structureAwarenessCount / totalQuestions) * 100).toFixed(0)
|
||||
|
||||
// Calculate dataset sizes
|
||||
const tabularSize = ACCURACY_DATASETS.find(d => d.name === 'tabular')?.data.employees?.length || 0
|
||||
@@ -233,7 +238,11 @@ ${modelBreakdown}
|
||||
${summaryComparison}
|
||||
|
||||
<details>
|
||||
<summary><strong>Performance by dataset and model</strong></summary>
|
||||
<summary><strong>Performance by dataset, model, and question type</strong></summary>
|
||||
|
||||
#### Performance by Question Type
|
||||
|
||||
${questionTypeBreakdown}
|
||||
|
||||
#### Performance by Dataset
|
||||
|
||||
@@ -265,9 +274,9 @@ Six datasets designed to test different structural patterns:
|
||||
|
||||
#### Question Types
|
||||
|
||||
${totalQuestions} questions are generated dynamically across three categories:
|
||||
${totalQuestions} questions are generated dynamically across four categories:
|
||||
|
||||
\- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
||||
- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
||||
- Example: "What is Alice's salary?" → \`75000\`
|
||||
- Example: "How many items are in order ORD-0042?" → \`3\`
|
||||
- Example: "What is the customer name for order ORD-0042?" → \`John Doe\`
|
||||
@@ -281,6 +290,11 @@ ${totalQuestions} questions are generated dynamically across three categories:
|
||||
- Example: "How many employees in Sales have salary > 80000?" → \`5\`
|
||||
- Example: "How many active employees have more than 10 years of experience?" → \`8\`
|
||||
|
||||
- **Structure awareness (${structureAwarenessPercent}%)**: Tests format-native structural affordances (TOON's [N] count and {fields}, CSV's header row)
|
||||
- Example: "How many employees are in the dataset?" → \`100\`
|
||||
- Example: "List the field names for employees" → \`id, name, email, department, salary, yearsExperience, active\`
|
||||
- Example: "What is the department of the last employee?" → \`Sales\`
|
||||
|
||||
#### Evaluation Process
|
||||
|
||||
1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => FORMATTER_DISPLAY_NAMES[f.format] || f.format).join(', ')}).
|
||||
@@ -413,6 +427,48 @@ ${tableRows}
|
||||
}).filter(Boolean).join('\n').trim()
|
||||
}
|
||||
|
||||
/**
 * Generate question type breakdown table
 *
 * @remarks
 * Produces a Markdown table with one row per entry of `QUESTION_TYPES` and one
 * column per entry of `formatResults`. Each cell is the percentage of correct
 * results for that type/format pair with one decimal, or `N/A` when there are no
 * results for the pair. Question types without any results are omitted.
 *
 * @param formatResults - Formats to show as columns, in column order
 * @param results - Evaluation results to aggregate
 * @param questions - Questions used to map each result's `questionId` to a question type
 * @returns The table as a trimmed Markdown string
 */
function generateQuestionTypeBreakdown(
  formatResults: FormatResult[],
  results: EvaluationResult[],
  questions: Question[],
): string {
  // Build header
  const formatNames = formatResults.map(fr => FORMATTER_DISPLAY_NAMES[fr.format] || fr.format)
  const header = `| Question Type | ${formatNames.join(' | ')} |`
  const separator = `| ------------- | ${formatNames.map(() => '----').join(' | ')} |`

  // Build rows
  const rows: string[] = []
  for (const type of QUESTION_TYPES) {
    // A Set keeps the membership test O(1) per result instead of rescanning an id array
    const questionIds = new Set(questions.filter(q => q.type === type).map(q => q.id))
    const typeResults = results.filter(r => questionIds.has(r.questionId))

    if (typeResults.length === 0)
      continue

    const accuracies = formatResults.map((fr) => {
      const formatTypeResults = typeResults.filter(r => r.format === fr.format)
      if (formatTypeResults.length === 0)
        return 'N/A'

      // Non-empty at this point, so the division is always defined
      const correctCount = formatTypeResults.filter(r => r.isCorrect).length
      const accuracy = correctCount / formatTypeResults.length
      return `${(accuracy * 100).toFixed(1)}%`
    })

    rows.push(`| ${QUESTION_TYPE_LABELS[type]} | ${accuracies.join(' | ')} |`)
  }

  return `
${header}
${separator}
${rows.join('\n')}
`.trim()
}
|
||||
|
||||
/**
|
||||
* Generate per-model performance comparison tables
|
||||
*/
|
||||
|
||||
@@ -1,11 +1,17 @@
|
||||
import type { DATASET_NAMES, QUESTION_TYPES, STRUCTURE_CLASSES } from './constants'
|
||||
|
||||
/** Union of the literals listed in `QUESTION_TYPES` */
export type QuestionType = typeof QUESTION_TYPES[number]

/** Union of the literals listed in `DATASET_NAMES` */
export type DatasetName = typeof DATASET_NAMES[number]

/** Union of the literals listed in `STRUCTURE_CLASSES` */
export type StructureClass = typeof STRUCTURE_CLASSES[number]
|
||||
|
||||
export interface DatasetMetadata {
|
||||
supportsCSV: boolean
|
||||
structureClass: 'uniform' | 'semi-uniform' | 'nested' | 'deep'
|
||||
structureClass: StructureClass
|
||||
tabularEligibility: number
|
||||
}
|
||||
|
||||
export interface Dataset {
|
||||
name: string
|
||||
name: DatasetName
|
||||
description: string
|
||||
data: Record<string, any>
|
||||
metadata: DatasetMetadata
|
||||
@@ -15,8 +21,8 @@ export interface Question {
|
||||
id: string
|
||||
prompt: string
|
||||
groundTruth: string
|
||||
type: 'field-retrieval' | 'aggregation' | 'filtering'
|
||||
dataset: string
|
||||
type: QuestionType
|
||||
dataset: DatasetName
|
||||
}
|
||||
|
||||
export interface EvaluationResult {
|
||||
|
||||
Reference in New Issue
Block a user