mirror of
https://github.com/voson-wang/toon.git
synced 2026-01-29 23:34:10 +08:00
chore(benchmarks): add structure-awareness questions
This commit is contained in:
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -4,37 +4,11 @@ import * as url from 'node:url'
|
|||||||
export const ROOT_DIR: string = url.fileURLToPath(new URL('../../', import.meta.url))
|
export const ROOT_DIR: string = url.fileURLToPath(new URL('../../', import.meta.url))
|
||||||
export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.meta.url))
|
export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.meta.url))
|
||||||
|
|
||||||
/**
|
|
||||||
* Model-specific RPM (requests per minute) limits to handle API quotas
|
|
||||||
*
|
|
||||||
* @remarks
|
|
||||||
* Set `undefined` for models without specific limits.
|
|
||||||
*/
|
|
||||||
/// keep-sorted
|
|
||||||
export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
|
|
||||||
'claude-haiku-4-5-20251001': 50,
|
|
||||||
'gemini-2.5-flash': 25,
|
|
||||||
'gpt-5-nano': 50,
|
|
||||||
'grok-4-fast-non-reasoning': 50,
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Default concurrency for parallel evaluations to prevent bursting
|
* Default concurrency for parallel evaluations to prevent bursting
|
||||||
*/
|
*/
|
||||||
export const DEFAULT_CONCURRENCY = 10
|
export const DEFAULT_CONCURRENCY = 10
|
||||||
|
|
||||||
/**
|
|
||||||
* Display names for data format types
|
|
||||||
*/
|
|
||||||
export const FORMATTER_DISPLAY_NAMES: Record<string, string> = {
|
|
||||||
'json-pretty': 'JSON',
|
|
||||||
'json-compact': 'JSON compact',
|
|
||||||
'toon': 'TOON',
|
|
||||||
'csv': 'CSV',
|
|
||||||
'xml': 'XML',
|
|
||||||
'yaml': 'YAML',
|
|
||||||
} as const
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Enable dry run mode for quick testing with limited AI requests
|
* Enable dry run mode for quick testing with limited AI requests
|
||||||
*
|
*
|
||||||
@@ -51,12 +25,80 @@ export const DRY_RUN_LIMITS = {
|
|||||||
maxQuestions: 10,
|
maxQuestions: 10,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Model-specific RPM (requests per minute) limits to handle API quotas
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Set `undefined` for models without specific limits.
|
||||||
|
*/
|
||||||
|
/// keep-sorted
|
||||||
|
export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
|
||||||
|
'claude-haiku-4-5-20251001': 50,
|
||||||
|
'gemini-2.5-flash': 25,
|
||||||
|
'gpt-5-nano': 50,
|
||||||
|
'grok-4-fast-non-reasoning': 50,
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Display names for data format types
|
||||||
|
*/
|
||||||
|
export const FORMATTER_DISPLAY_NAMES: Record<string, string> = {
|
||||||
|
'json-pretty': 'JSON',
|
||||||
|
'json-compact': 'JSON compact',
|
||||||
|
'toon': 'TOON',
|
||||||
|
'csv': 'CSV',
|
||||||
|
'xml': 'XML',
|
||||||
|
'yaml': 'YAML',
|
||||||
|
} as const
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Question type identifiers
|
||||||
|
*/
|
||||||
|
export const QUESTION_TYPES = [
|
||||||
|
'field-retrieval',
|
||||||
|
'aggregation',
|
||||||
|
'filtering',
|
||||||
|
'structure-awareness',
|
||||||
|
] as const
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Display names for question types
|
||||||
|
*/
|
||||||
|
export const QUESTION_TYPE_LABELS = {
|
||||||
|
'field-retrieval': 'Field Retrieval',
|
||||||
|
'aggregation': 'Aggregation',
|
||||||
|
'filtering': 'Filtering',
|
||||||
|
'structure-awareness': 'Structure Awareness',
|
||||||
|
} as const
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Dataset identifiers
|
||||||
|
*/
|
||||||
|
export const DATASET_NAMES = [
|
||||||
|
'tabular',
|
||||||
|
'nested',
|
||||||
|
'analytics',
|
||||||
|
'github',
|
||||||
|
'event-logs',
|
||||||
|
'nested-config',
|
||||||
|
] as const
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Structure class identifiers
|
||||||
|
*/
|
||||||
|
export const STRUCTURE_CLASSES = [
|
||||||
|
'uniform',
|
||||||
|
'semi-uniform',
|
||||||
|
'nested',
|
||||||
|
'deep',
|
||||||
|
] as const
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Threshold values for filtering and aggregation questions
|
* Threshold values for filtering and aggregation questions
|
||||||
*/
|
*/
|
||||||
export const QUESTION_THRESHOLDS = {
|
export const QUESTION_THRESHOLDS = {
|
||||||
tabular: {
|
tabular: {
|
||||||
salaryRanges: [60000, 80000, 100000, 120000],
|
salaryRanges: [60000, 80000, 100000],
|
||||||
experienceYears: [5, 10, 15, 20],
|
experienceYears: [5, 10, 15, 20],
|
||||||
departmentSalaryThreshold: 80000,
|
departmentSalaryThreshold: 80000,
|
||||||
departmentExperienceThreshold: 10,
|
departmentExperienceThreshold: 10,
|
||||||
@@ -68,11 +110,11 @@ export const QUESTION_THRESHOLDS = {
|
|||||||
totalThresholdsForItems: [300, 500],
|
totalThresholdsForItems: [300, 500],
|
||||||
},
|
},
|
||||||
analytics: {
|
analytics: {
|
||||||
views: [5000, 7000],
|
views: [6000],
|
||||||
conversions: [10, 30],
|
conversions: [20],
|
||||||
viewsForFiltering: [6000, 7000],
|
viewsForFiltering: [6000, 7000],
|
||||||
conversionsForFiltering: 15,
|
conversionsForFiltering: 15,
|
||||||
revenueThresholds: [500, 1000, 1500, 2000, 2500],
|
revenueThresholds: [1000, 1500, 2000],
|
||||||
viewsThresholdForRevenue: 6000,
|
viewsThresholdForRevenue: 6000,
|
||||||
clicksForFiltering: [250, 400],
|
clicksForFiltering: [250, 400],
|
||||||
conversionsForClickFiltering: 15,
|
conversionsForClickFiltering: 15,
|
||||||
@@ -81,8 +123,8 @@ export const QUESTION_THRESHOLDS = {
|
|||||||
},
|
},
|
||||||
github: {
|
github: {
|
||||||
stars: [100000, 150000, 200000],
|
stars: [100000, 150000, 200000],
|
||||||
forks: [20000, 35000, 50000],
|
forks: [20000, 35000],
|
||||||
watchers: [5000, 8000],
|
watchers: [8000],
|
||||||
starForkCombinations: [
|
starForkCombinations: [
|
||||||
{ stars: 75000, forks: 15000 },
|
{ stars: 75000, forks: 15000 },
|
||||||
{ stars: 100000, forks: 20000 },
|
{ stars: 100000, forks: 20000 },
|
||||||
@@ -101,18 +143,18 @@ export const QUESTION_THRESHOLDS = {
|
|||||||
*/
|
*/
|
||||||
export const QUESTION_LIMITS = {
|
export const QUESTION_LIMITS = {
|
||||||
tabular: {
|
tabular: {
|
||||||
fieldRetrieval: 14,
|
fieldRetrieval: 12,
|
||||||
aggregationDepartments: 4,
|
aggregationDepartments: 3,
|
||||||
filteringMultiConditionDepartments: 5,
|
filteringMultiConditionDepartments: 5,
|
||||||
filteringExperience: 3,
|
filteringExperience: 3,
|
||||||
filteringDepartmentExp: 3,
|
filteringDepartmentExp: 3,
|
||||||
filteringDepartmentActive: 3,
|
filteringDepartmentActive: 2,
|
||||||
},
|
},
|
||||||
nested: {
|
nested: {
|
||||||
fieldRetrievalOrders: 8,
|
fieldRetrievalOrders: 8,
|
||||||
fieldRetrievalCustomers: 10,
|
fieldRetrievalCustomers: 8,
|
||||||
aggregationStatuses: 5,
|
aggregationStatuses: 3,
|
||||||
filteringStatusAndValue: 5,
|
filteringStatusAndValue: 4,
|
||||||
filteringStatusAndItems: 3,
|
filteringStatusAndItems: 3,
|
||||||
},
|
},
|
||||||
analytics: {
|
analytics: {
|
||||||
@@ -121,16 +163,17 @@ export const QUESTION_LIMITS = {
|
|||||||
github: {
|
github: {
|
||||||
fieldRetrievalRepos: 11,
|
fieldRetrievalRepos: 11,
|
||||||
aggregationBranches: 2,
|
aggregationBranches: 2,
|
||||||
filteringStarsAndForks: 8,
|
filteringStarsAndForks: 3,
|
||||||
},
|
},
|
||||||
eventLogs: {
|
eventLogs: {
|
||||||
fieldRetrieval: 10,
|
fieldRetrieval: 10,
|
||||||
aggregationEndpoints: 4,
|
aggregationEndpoints: 2,
|
||||||
filteringLevelAndStatus: 3,
|
filteringLevelAndStatus: 3,
|
||||||
filteringEndpointAndStatus: 3,
|
filteringEndpointAndStatus: 3,
|
||||||
|
filteringEndpointRetryable: 2,
|
||||||
},
|
},
|
||||||
nestedConfig: {
|
nestedConfig: {
|
||||||
fieldRetrieval: 10,
|
fieldRetrieval: 10,
|
||||||
filteringComplex: 6,
|
filteringComplex: 5,
|
||||||
},
|
},
|
||||||
} as const
|
} as const
|
||||||
|
|||||||
@@ -181,7 +181,7 @@ export function generateAnalyticsData(days: number, startDate = '2025-01-01'): {
|
|||||||
/**
|
/**
|
||||||
* Generate employee data (uniform tabular structure)
|
* Generate employee data (uniform tabular structure)
|
||||||
*/
|
*/
|
||||||
const departments: readonly string[] = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] as const
|
const departments = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] as const
|
||||||
|
|
||||||
function generateEmployees(count: number): { employees: Employee[] } {
|
function generateEmployees(count: number): { employees: Employee[] } {
|
||||||
return {
|
return {
|
||||||
|
|||||||
@@ -16,6 +16,33 @@ export const models: LanguageModelV2[] = [
|
|||||||
xai('grok-4-fast-non-reasoning'),
|
xai('grok-4-fast-non-reasoning'),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
/**
 * One-line format primers, keyed by formatter name
 *
 * @remarks
 * Each entry is a short, neutral description of how the format is laid out,
 * meant to help a model parse the data that follows it in a prompt.
 * Looking up a name that is not listed yields `undefined`.
 */
export const PRIMERS: Record<string, string> = {
  'toon': 'TOON: Indentation-based. Arrays declare length and fields (e.g., items[N]{f1,f2}:). Rows use single delimiter. Values may be quoted.',
  'json-pretty': 'JSON: Strict JSON objects/arrays with repeated keys per row.',
  'json-compact': 'JSON (compact): Strict JSON without extra whitespace.',
  'yaml': 'YAML: Indentation-based key/value and lists (- items).',
  'xml': 'XML: Tag-based tree structure with nested elements.',
  'csv': 'CSV: Header row, comma-separated values. First row contains field names.',
}
|
||||||
|
|
||||||
|
/**
 * Markdown code-fence language tag for each formatter name
 *
 * @remarks
 * Both JSON variants share the `json` tag; every other format uses its own name.
 * Looking up a name that is not listed yields `undefined`.
 */
export const FENCE: Record<string, string> = {
  'toon': 'toon',
  'json-pretty': 'json',
  'json-compact': 'json',
  'yaml': 'yaml',
  'xml': 'xml',
  'csv': 'csv',
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Evaluate a single question with a specific format and model
|
* Evaluate a single question with a specific format and model
|
||||||
*/
|
*/
|
||||||
@@ -33,10 +60,15 @@ export async function evaluateQuestion(
|
|||||||
model: LanguageModelV2
|
model: LanguageModelV2
|
||||||
},
|
},
|
||||||
): Promise<EvaluationResult> {
|
): Promise<EvaluationResult> {
|
||||||
|
const primer = PRIMERS[formatName] ?? ''
|
||||||
|
const fence = FENCE[formatName] ?? ''
|
||||||
|
|
||||||
const prompt = `
|
const prompt = `
|
||||||
|
${primer}
|
||||||
|
|
||||||
Given the following data in ${formatName} format:
|
Given the following data in ${formatName} format:
|
||||||
|
|
||||||
\`\`\`
|
\`\`\`${fence}
|
||||||
${formattedData}
|
${formattedData}
|
||||||
\`\`\`
|
\`\`\`
|
||||||
|
|
||||||
|
|||||||
@@ -166,7 +166,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Filtering: endpoint AND retryable error
|
// Filtering: endpoint AND retryable error
|
||||||
for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointAndStatus)) {
|
for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointRetryable)) {
|
||||||
const count = logs.filter(l => l.endpoint === endpoint && l.error?.retryable === true).length
|
const count = logs.filter(l => l.endpoint === endpoint && l.error?.retryable === true).length
|
||||||
questions.push(
|
questions.push(
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
|
|||||||
@@ -6,11 +6,12 @@ import { generateEventLogsQuestions } from './event-logs'
|
|||||||
import { generateGithubQuestions } from './github'
|
import { generateGithubQuestions } from './github'
|
||||||
import { generateNestedQuestions } from './nested'
|
import { generateNestedQuestions } from './nested'
|
||||||
import { generateNestedConfigQuestions } from './nested-config'
|
import { generateNestedConfigQuestions } from './nested-config'
|
||||||
|
import { generateStructureQuestions } from './structure'
|
||||||
import { generateTabularQuestions } from './tabular'
|
import { generateTabularQuestions } from './tabular'
|
||||||
import { createIdGenerator } from './utils'
|
import { createIdGenerator } from './utils'
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate ~200 questions from all datasets
|
* Generate questions from all datasets
|
||||||
*
|
*
|
||||||
* @remarks
|
* @remarks
|
||||||
* - Field Retrieval: Direct field access with no computation
|
* - Field Retrieval: Direct field access with no computation
|
||||||
@@ -19,6 +20,8 @@ import { createIdGenerator } from './utils'
|
|||||||
* Examples: "How many X?", "What is the total/average?", "How many X > threshold?"
|
* Examples: "How many X?", "What is the total/average?", "How many X > threshold?"
|
||||||
* - Filtering: Multi-condition queries requiring complex logical operations
|
* - Filtering: Multi-condition queries requiring complex logical operations
|
||||||
* Examples: "How many X WHERE condition1 AND condition2?"
|
* Examples: "How many X WHERE condition1 AND condition2?"
|
||||||
|
* - Structure Awareness: Tests format-native structural affordances (TOON's [N] and {fields}, CSV's header)
|
||||||
|
* Examples: "How many records?", "List the field names", "What is the last record's field?"
|
||||||
*/
|
*/
|
||||||
export function generateQuestions(): Question[] {
|
export function generateQuestions(): Question[] {
|
||||||
const questions: Question[] = []
|
const questions: Question[] = []
|
||||||
@@ -41,5 +44,8 @@ export function generateQuestions(): Question[] {
|
|||||||
questions.push(...generateEventLogsQuestions(eventLogs, getId))
|
questions.push(...generateEventLogsQuestions(eventLogs, getId))
|
||||||
questions.push(...generateNestedConfigQuestions(nestedConfig, getId))
|
questions.push(...generateNestedConfigQuestions(nestedConfig, getId))
|
||||||
|
|
||||||
|
// Generate structure-awareness questions (tests format-native affordances)
|
||||||
|
questions.push(...generateStructureQuestions(tabular, nested, analytics, github, eventLogs, getId))
|
||||||
|
|
||||||
return questions
|
return questions
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -152,7 +152,6 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
// Aggregation: additional nested counts
|
// Aggregation: additional nested counts
|
||||||
const totalPermissions = Object.values(config.permissions.roles).reduce((sum, role) => sum + role.permissions.length, 0)
|
const totalPermissions = Object.values(config.permissions.roles).reduce((sum, role) => sum + role.permissions.length, 0)
|
||||||
const distinctPermissions = new Set(Object.values(config.permissions.roles).flatMap(r => r.permissions)).size
|
const distinctPermissions = new Set(Object.values(config.permissions.roles).flatMap(r => r.permissions)).size
|
||||||
const distinctScopes = new Set(config.authentication.providers.flatMap(p => p.scopes)).size
|
|
||||||
const totalVariants = Object.values(config.features).reduce((sum, f) => sum + f.variants.length, 0)
|
const totalVariants = Object.values(config.features).reduce((sum, f) => sum + f.variants.length, 0)
|
||||||
const highPriorityReplicas = config.database.replicas.filter(r => r.priority > 2).length
|
const highPriorityReplicas = config.database.replicas.filter(r => r.priority > 2).length
|
||||||
const featuresWithHighRollout = Object.values(config.features).filter(f => f.rollout > 50).length
|
const featuresWithHighRollout = Object.values(config.features).filter(f => f.rollout > 50).length
|
||||||
@@ -173,13 +172,6 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
|
|||||||
.type('aggregation')
|
.type('aggregation')
|
||||||
.dataset('nested-config')
|
.dataset('nested-config')
|
||||||
.build(),
|
.build(),
|
||||||
new QuestionBuilder()
|
|
||||||
.id(getId())
|
|
||||||
.prompt('How many distinct scopes are defined across all authentication providers?')
|
|
||||||
.groundTruth(String(distinctScopes))
|
|
||||||
.type('aggregation')
|
|
||||||
.dataset('nested-config')
|
|
||||||
.build(),
|
|
||||||
new QuestionBuilder()
|
new QuestionBuilder()
|
||||||
.id(getId())
|
.id(getId())
|
||||||
.prompt('What is the total number of variants across all feature flags?')
|
.prompt('What is the total number of variants across all feature flags?')
|
||||||
|
|||||||
324
benchmarks/src/questions/structure.ts
Normal file
324
benchmarks/src/questions/structure.ts
Normal file
@@ -0,0 +1,324 @@
|
|||||||
|
import type { AnalyticsMetric, Employee, EventLog, Order, Repository } from '../datasets'
|
||||||
|
import type { Question } from '../types'
|
||||||
|
import { QuestionBuilder } from './utils'
|
||||||
|
|
||||||
|
/**
 * Generate structure-awareness questions across all datasets
 *
 * @remarks
 * These questions test format-native structural affordances:
 * - TOON's explicit array length [N] and field declarations {fields}
 * - CSV's header row (but no explicit length)
 * - JSON/YAML have neither unless the model counts manually
 *
 * Questions are emitted in a fixed order (tabular, nested, analytics, github, event-logs)
 * and `getId` is called exactly once per emitted question, in that order.
 *
 * Field-list, positional, and field-count answers are all derived from a single
 * field-name tuple per record type, so they cannot drift apart. The tuples themselves
 * are hard-coded here. NOTE(review): confirm they match the record shapes produced in `../datasets`.
 *
 * Questions that need an existing record (last row, order with the most items) are
 * skipped when the corresponding array is empty instead of throwing.
 *
 * @param employees - Records for the `tabular` dataset
 * @param orders - Records for the `nested` dataset
 * @param metrics - Records for the `analytics` dataset
 * @param repos - Records for the `github` dataset
 * @param logs - Records for the `event-logs` dataset
 * @param getId - Produces the id for the next question
 * @returns The generated questions, all of type `structure-awareness`
 */
export function generateStructureQuestions(
  employees: Employee[],
  orders: Order[],
  metrics: AnalyticsMetric[],
  repos: Repository[],
  logs: EventLog[],
  getId: () => string,
): Question[] {
  const questions: Question[] = []

  // Append one structure-awareness question; the id is drawn at call time so ids follow emission order
  const ask = (
    dataset: Parameters<QuestionBuilder['dataset']>[0],
    prompt: string,
    groundTruth: string,
  ): void => {
    questions.push(
      new QuestionBuilder()
        .id(getId())
        .prompt(prompt)
        .groundTruth(groundTruth)
        .type('structure-awareness')
        .dataset(dataset)
        .build(),
    )
  }

  // ========== TABULAR DATASET (Employees) ==========

  const employeeFields = ['id', 'name', 'email', 'department', 'salary', 'yearsExperience', 'active'] as const

  // Count: Total employees (tests array length awareness)
  ask('tabular', 'How many employees are in the dataset?', String(employees.length))

  // Field list: Employee fields (tests field name awareness)
  ask('tabular', 'List the field names for employees (comma-separated, in order).', employeeFields.join(','))

  // Positional: Third field name for employees (tests TOON {fields} syntax)
  ask('tabular', 'What is the 3rd field name for employees?', employeeFields[2])

  // Last row: department and name of the last employee (tests ability to find last row using length)
  const lastEmployee = employees.at(-1)
  if (lastEmployee) {
    ask('tabular', 'What is the department of the last employee in the dataset?', lastEmployee.department)
    ask('tabular', 'What is the name of the last employee in the dataset?', lastEmployee.name)
  }

  // Field count: How many fields per employee (tests schema awareness)
  ask('tabular', 'How many fields does each employee record have?', String(employeeFields.length))

  // ========== NESTED DATASET (Orders) ==========

  const orderFields = ['orderId', 'customer', 'items', 'subtotal', 'tax', 'total', 'status', 'orderDate'] as const
  const itemFields = ['sku', 'name', 'quantity', 'price'] as const
  const customerFields = ['id', 'name', 'email', 'phone'] as const

  // Count: Total orders
  ask('nested', 'How many orders are in the dataset?', String(orders.length))

  // Field list: Order fields
  ask('nested', 'List the top-level field names for orders (comma-separated, in order).', orderFields.join(','))

  // Nested count: Items in the order with the most items (first such order wins ties)
  const orderWithManyItems = orders.reduce<Order | undefined>(
    (max, order) => (max === undefined || order.items.length > max.items.length ? order : max),
    undefined,
  )
  if (orderWithManyItems) {
    ask('nested', `How many items are in order ${orderWithManyItems.orderId}?`, String(orderWithManyItems.items.length))
  }

  // Nested field list: Item fields
  ask('nested', 'What are the field names for items within orders (comma-separated, in order)?', itemFields.join(','))

  // Last row: Last order's status
  const lastOrder = orders.at(-1)
  if (lastOrder) {
    ask('nested', 'What is the status of the last order in the dataset?', lastOrder.status)
  }

  // Customer field list
  ask('nested', 'What are the field names for customer objects within orders (comma-separated, in order)?', customerFields.join(','))

  // ========== ANALYTICS DATASET (Metrics) ==========

  const metricFields = ['date', 'views', 'clicks', 'conversions', 'revenue', 'bounceRate'] as const

  // Count: Total metrics
  ask('analytics', 'How many metric records are in the dataset?', String(metrics.length))

  // Field list: Metric fields
  ask('analytics', 'List the field names for metrics (comma-separated, in order).', metricFields.join(','))

  // Positional: Fifth field name for metrics (tests TOON {fields} syntax)
  ask('analytics', 'What is the 5th field name for analytics metrics?', metricFields[4])

  // Last row: Last metric's date
  const lastMetric = metrics.at(-1)
  if (lastMetric) {
    ask('analytics', 'What is the date of the last metric record in the dataset?', lastMetric.date)
  }

  // Field count: How many fields per metric
  ask('analytics', 'How many fields does each metric record have?', String(metricFields.length))

  // ========== GITHUB DATASET (Repositories) ==========

  const repoFields = ['id', 'name', 'repo', 'description', 'stars', 'watchers', 'forks', 'defaultBranch', 'createdAt', 'updatedAt', 'pushedAt'] as const

  // Count: Total repositories
  ask('github', 'How many repositories are in the dataset?', String(repos.length))

  // Field list: Repository fields
  ask('github', 'List the field names for repositories (comma-separated, in order).', repoFields.join(','))

  // Positional: Seventh field name for repos (tests TOON {fields} syntax)
  ask('github', 'What is the 7th field name for GitHub repositories?', repoFields[6])

  // Last row: Last repo's name
  const lastRepo = repos.at(-1)
  if (lastRepo) {
    ask('github', 'What is the name of the last repository in the dataset?', lastRepo.name)
  }

  // Field count: How many fields per repository
  ask('github', 'How many fields does each repository record have?', String(repoFields.length))

  // ========== EVENT LOGS DATASET ==========

  // Base log fields, including the optional `error` field
  const logFields = ['timestamp', 'level', 'endpoint', 'statusCode', 'responseTime', 'userId', 'error'] as const

  // Count: Total logs
  ask('event-logs', 'How many log entries are in the dataset?', String(logs.length))

  // Field list: Base log fields (including optional error)
  ask('event-logs', 'List the field names for log entries (comma-separated, any order, including optional fields).', logFields.join(','))

  // Last row: Last log's level
  const lastLog = logs.at(-1)
  if (lastLog) {
    ask('event-logs', 'What is the level of the last log entry in the dataset?', lastLog.level)
  }

  return questions
}
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
import type { Dataset, EfficiencyRanking, EvaluationResult, FormatResult, Question } from './types'
|
import type { Dataset, EfficiencyRanking, EvaluationResult, FormatResult, Question } from './types'
|
||||||
import { FORMATTER_DISPLAY_NAMES } from './constants'
|
import { FORMATTER_DISPLAY_NAMES, QUESTION_TYPE_LABELS, QUESTION_TYPES } from './constants'
|
||||||
import { ACCURACY_DATASETS } from './datasets'
|
import { ACCURACY_DATASETS } from './datasets'
|
||||||
import { models } from './evaluate'
|
import { models } from './evaluate'
|
||||||
import { supportsCSV } from './formatters'
|
import { supportsCSV } from './formatters'
|
||||||
@@ -22,9 +22,9 @@ export function calculateTokenCounts(
|
|||||||
if (formatName === 'csv' && !supportsCSV(dataset))
|
if (formatName === 'csv' && !supportsCSV(dataset))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
const formatted = formatter(dataset.data)
|
const formattedData = formatter(dataset.data)
|
||||||
const key = `${formatName}-${dataset.name}`
|
const key = `${formatName}-${dataset.name}`
|
||||||
tokenCounts[key] = tokenize(formatted)
|
tokenCounts[key] = tokenize(formattedData)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -200,16 +200,21 @@ function generateDetailedAccuracyReport(
|
|||||||
|
|
||||||
// Generate performance by model
|
// Generate performance by model
|
||||||
const modelPerformance = generateModelPerformanceTable(formatResults, results, modelNames)
|
const modelPerformance = generateModelPerformanceTable(formatResults, results, modelNames)
|
||||||
|
|
||||||
|
// Generate question type breakdown
|
||||||
|
const questionTypeBreakdown = generateQuestionTypeBreakdown(formatResults, results, questions)
|
||||||
const totalQuestions = [...new Set(results.map(r => r.questionId))].length
|
const totalQuestions = [...new Set(results.map(r => r.questionId))].length
|
||||||
|
|
||||||
// Calculate question type distribution
|
// Calculate question type distribution
|
||||||
const fieldRetrievalCount = questions.filter(q => q.type === 'field-retrieval').length
|
const fieldRetrievalCount = questions.filter(q => q.type === 'field-retrieval').length
|
||||||
const aggregationCount = questions.filter(q => q.type === 'aggregation').length
|
const aggregationCount = questions.filter(q => q.type === 'aggregation').length
|
||||||
const filteringCount = questions.filter(q => q.type === 'filtering').length
|
const filteringCount = questions.filter(q => q.type === 'filtering').length
|
||||||
|
const structureAwarenessCount = questions.filter(q => q.type === 'structure-awareness').length
|
||||||
|
|
||||||
const fieldRetrievalPercent = ((fieldRetrievalCount / totalQuestions) * 100).toFixed(0)
|
const fieldRetrievalPercent = ((fieldRetrievalCount / totalQuestions) * 100).toFixed(0)
|
||||||
const aggregationPercent = ((aggregationCount / totalQuestions) * 100).toFixed(0)
|
const aggregationPercent = ((aggregationCount / totalQuestions) * 100).toFixed(0)
|
||||||
const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)
|
const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)
|
||||||
|
const structureAwarenessPercent = ((structureAwarenessCount / totalQuestions) * 100).toFixed(0)
|
||||||
|
|
||||||
// Calculate dataset sizes
|
// Calculate dataset sizes
|
||||||
const tabularSize = ACCURACY_DATASETS.find(d => d.name === 'tabular')?.data.employees?.length || 0
|
const tabularSize = ACCURACY_DATASETS.find(d => d.name === 'tabular')?.data.employees?.length || 0
|
||||||
@@ -233,7 +238,11 @@ ${modelBreakdown}
|
|||||||
${summaryComparison}
|
${summaryComparison}
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><strong>Performance by dataset and model</strong></summary>
|
<summary><strong>Performance by dataset, model, and question type</strong></summary>
|
||||||
|
|
||||||
|
#### Performance by Question Type
|
||||||
|
|
||||||
|
${questionTypeBreakdown}
|
||||||
|
|
||||||
#### Performance by Dataset
|
#### Performance by Dataset
|
||||||
|
|
||||||
@@ -265,9 +274,9 @@ Six datasets designed to test different structural patterns:
|
|||||||
|
|
||||||
#### Question Types
|
#### Question Types
|
||||||
|
|
||||||
${totalQuestions} questions are generated dynamically across three categories:
|
${totalQuestions} questions are generated dynamically across four categories:
|
||||||
|
|
||||||
\- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
|
||||||
- Example: "What is Alice's salary?" → \`75000\`
|
- Example: "What is Alice's salary?" → \`75000\`
|
||||||
- Example: "How many items are in order ORD-0042?" → \`3\`
|
- Example: "How many items are in order ORD-0042?" → \`3\`
|
||||||
- Example: "What is the customer name for order ORD-0042?" → \`John Doe\`
|
- Example: "What is the customer name for order ORD-0042?" → \`John Doe\`
|
||||||
@@ -281,6 +290,11 @@ ${totalQuestions} questions are generated dynamically across three categories:
|
|||||||
- Example: "How many employees in Sales have salary > 80000?" → \`5\`
|
- Example: "How many employees in Sales have salary > 80000?" → \`5\`
|
||||||
- Example: "How many active employees have more than 10 years of experience?" → \`8\`
|
- Example: "How many active employees have more than 10 years of experience?" → \`8\`
|
||||||
|
|
||||||
|
- **Structure awareness (${structureAwarenessPercent}%)**: Tests format-native structural affordances (TOON's [N] count and {fields}, CSV's header row)
|
||||||
|
- Example: "How many employees are in the dataset?" → \`100\`
|
||||||
|
- Example: "List the field names for employees" → \`id, name, email, department, salary, yearsExperience, active\`
|
||||||
|
- Example: "What is the department of the last employee?" → \`Sales\`
|
||||||
|
|
||||||
#### Evaluation Process
|
#### Evaluation Process
|
||||||
|
|
||||||
1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => FORMATTER_DISPLAY_NAMES[f.format] || f.format).join(', ')}).
|
1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => FORMATTER_DISPLAY_NAMES[f.format] || f.format).join(', ')}).
|
||||||
@@ -413,6 +427,48 @@ ${tableRows}
|
|||||||
}).filter(Boolean).join('\n').trim()
|
}).filter(Boolean).join('\n').trim()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate question type breakdown table
|
||||||
|
*/
|
||||||
|
function generateQuestionTypeBreakdown(
|
||||||
|
formatResults: FormatResult[],
|
||||||
|
results: EvaluationResult[],
|
||||||
|
questions: Question[],
|
||||||
|
): string {
|
||||||
|
// Build header
|
||||||
|
const formatNames = formatResults.map(fr => FORMATTER_DISPLAY_NAMES[fr.format] || fr.format)
|
||||||
|
const header = `| Question Type | ${formatNames.join(' | ')} |`
|
||||||
|
const separator = `| ------------- | ${formatNames.map(() => '----').join(' | ')} |`
|
||||||
|
|
||||||
|
// Build rows
|
||||||
|
const rows = QUESTION_TYPES.map((type) => {
|
||||||
|
const questionIds = questions.filter(q => q.type === type).map(q => q.id)
|
||||||
|
const typeResults = results.filter(r => questionIds.includes(r.questionId))
|
||||||
|
|
||||||
|
if (typeResults.length === 0)
|
||||||
|
return undefined
|
||||||
|
|
||||||
|
const accuracies = formatResults.map((fr) => {
|
||||||
|
const formatTypeResults = typeResults.filter(r => r.format === fr.format)
|
||||||
|
if (formatTypeResults.length === 0)
|
||||||
|
return 'N/A'
|
||||||
|
|
||||||
|
const correctCount = formatTypeResults.filter(r => r.isCorrect).length
|
||||||
|
const totalCount = formatTypeResults.length
|
||||||
|
const accuracy = totalCount > 0 ? correctCount / totalCount : 0
|
||||||
|
return `${(accuracy * 100).toFixed(1)}%`
|
||||||
|
})
|
||||||
|
|
||||||
|
return `| ${QUESTION_TYPE_LABELS[type]} | ${accuracies.join(' | ')} |`
|
||||||
|
}).filter(Boolean)
|
||||||
|
|
||||||
|
return `
|
||||||
|
${header}
|
||||||
|
${separator}
|
||||||
|
${rows.join('\n')}
|
||||||
|
`.trim()
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate per-model performance comparison tables
|
* Generate per-model performance comparison tables
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -1,11 +1,17 @@
|
|||||||
|
import type { DATASET_NAMES, QUESTION_TYPES, STRUCTURE_CLASSES } from './constants'
|
||||||
|
|
||||||
|
export type QuestionType = typeof QUESTION_TYPES[number]
|
||||||
|
export type DatasetName = typeof DATASET_NAMES[number]
|
||||||
|
export type StructureClass = typeof STRUCTURE_CLASSES[number]
|
||||||
|
|
||||||
export interface DatasetMetadata {
|
export interface DatasetMetadata {
|
||||||
supportsCSV: boolean
|
supportsCSV: boolean
|
||||||
structureClass: 'uniform' | 'semi-uniform' | 'nested' | 'deep'
|
structureClass: StructureClass
|
||||||
tabularEligibility: number
|
tabularEligibility: number
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface Dataset {
|
export interface Dataset {
|
||||||
name: string
|
name: DatasetName
|
||||||
description: string
|
description: string
|
||||||
data: Record<string, any>
|
data: Record<string, any>
|
||||||
metadata: DatasetMetadata
|
metadata: DatasetMetadata
|
||||||
@@ -15,8 +21,8 @@ export interface Question {
|
|||||||
id: string
|
id: string
|
||||||
prompt: string
|
prompt: string
|
||||||
groundTruth: string
|
groundTruth: string
|
||||||
type: 'field-retrieval' | 'aggregation' | 'filtering'
|
type: QuestionType
|
||||||
dataset: string
|
dataset: DatasetName
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface EvaluationResult {
|
export interface EvaluationResult {
|
||||||
|
|||||||
Reference in New Issue
Block a user