import type { Dataset } from './types'
import { faker } from '@faker-js/faker'
import githubRepos from '../data/github-repos.json' with { type: 'json' }

// Seed the generator once at module load so every run produces the same datasets
faker.seed(12345)

/**
 * Employee record structure for tabular dataset
 */
export interface Employee {
  id: number
  name: string
  email: string
  department: string
  salary: number
  yearsExperience: number
  active: boolean
}

/**
 * E-commerce order structure for nested dataset
 *
 * @remarks
 * `orderDate` and `createdAt` are both optional because the two order
 * generators in this module each populate only one of them.
 */
export interface Order {
  orderId: string
  customer: {
    id: number
    name: string
    email: string
    phone: string
  }
  items: {
    sku: string
    name: string
    quantity: number
    price: number
  }[]
  subtotal: number
  tax: number
  total: number
  status: string
  orderDate?: string
  createdAt?: string
}

/**
 * Analytics metric structure for time-series dataset
 */
export interface AnalyticsMetric {
  date: string
  views: number
  clicks: number
  conversions: number
  revenue: number
  bounceRate: number
}

/**
 * GitHub repository structure for real-world dataset
 */
export interface Repository {
  id: number
  name: string
  repo: string
  description: string
  stars: number
  watchers: number
  forks: number
  defaultBranch: string
  createdAt: string
  updatedAt: string
  pushedAt: string
}

/**
 * Event log structure for semi-uniform dataset
 *
 * @remarks
 * `error` is only present on entries that represent a failure.
 */
export interface EventLog {
  timestamp: string
  level: 'info' | 'warn' | 'error'
  endpoint: string
  statusCode: number
  responseTime: number
  userId: number
  error?: {
    message: string
    stack: string
    retryable: boolean
  }
}

/**
 * Nested configuration structure for deeply nested dataset
 *
 * @remarks
 * NOTE(review): the type arguments of the `Record` types below were
 * reconstructed from the object literal returned by `generateNestedConfig`;
 * confirm the value types against the intended schema.
 */
export interface NestedConfig {
  environment: string
  version: string
  database: {
    host: string
    port: number
    name: string
    pool: {
      min: number
      max: number
      idleTimeout: number
    }
    replicas: {
      host: string
      port: number
      priority: number
    }[]
  }
  // Feature flags keyed by feature name
  features: Record<string, {
    enabled: boolean
    rollout: number
    variants: {
      name: string
      weight: number
      config: Record<string, string | number | boolean>
    }[]
  }>
  authentication: {
    providers: {
      name: string
      clientId: string
      scopes: string[]
      config: Record<string, string>
    }[]
    session: {
      secret: string
      duration: number
      refreshThreshold: number
    }
  }
  permissions: {
    // Role definitions keyed by role name
    roles: Record<string, {
      permissions: string[]
      inherits: string[]
    }>
    // Group definitions keyed by group name
    groups: Record<string, {
      members: string[]
      roles: string[]
    }>
  }
}
/**
 * Generate analytics time-series data
 *
 * @remarks
 * Produces one record per consecutive calendar day, starting at `startDate`.
 * All date arithmetic uses the UTC accessors so that the weekday used for the
 * weekend dip and the emitted `date` (taken from `toISOString()`, which is
 * always UTC) refer to the same day, independent of the host time zone and of
 * DST transitions.
 *
 * @param days - Number of daily records to generate
 * @param startDate - First day of the series; a date-only ISO string such as `'2025-01-01'` is parsed as UTC midnight
 * @returns An object whose `metrics` array holds one entry per day
 */
export function generateAnalyticsData(days: number, startDate = '2025-01-01'): { metrics: AnalyticsMetric[] } {
  const start = new Date(startDate)

  return {
    metrics: Array.from({ length: days }, (_, i) => {
      const currentDate = new Date(start)
      currentDate.setUTCDate(currentDate.getUTCDate() + i)

      // Simulate realistic web traffic with some variation:
      // a fixed base, a 30% dip on Saturdays/Sundays, plus a random offset
      const baseViews = 5000
      const dayOfWeek = currentDate.getUTCDay()
      const weekendMultiplier = dayOfWeek === 0 || dayOfWeek === 6 ? 0.7 : 1.0
      const views = Math.round(baseViews * weekendMultiplier + faker.number.int({ min: -1000, max: 3000 }))

      // Funnel: clicks are 2-8% of views, conversions are 5-15% of clicks
      const clicks = Math.round(views * faker.number.float({ min: 0.02, max: 0.08 }))
      const conversions = Math.round(clicks * faker.number.float({ min: 0.05, max: 0.15 }))
      const avgOrderValue = faker.number.float({ min: 49.99, max: 299.99 })
      const revenue = Number((conversions * avgOrderValue).toFixed(2))

      return {
        date: currentDate.toISOString().split('T')[0]!,
        views,
        clicks,
        conversions,
        revenue,
        bounceRate: faker.number.float({ min: 0.3, max: 0.7, fractionDigits: 2 }),
      }
    }),
  }
}

// Departments are assigned round-robin by employee index
const departments: readonly string[] = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] as const

/**
 * Generate employee data (uniform tabular structure)
 *
 * @param count - Number of employee records to generate
 * @returns An object whose `employees` array holds `count` records with ids `1..count`
 */
function generateEmployees(count: number): { employees: Employee[] } {
  return {
    employees: Array.from({ length: count }, (_, i): Employee => {
      // Drawn first so the sequence of random draws per record stays stable
      const yearsExp = faker.number.int({ min: 1, max: 25 })
      return {
        id: i + 1,
        name: faker.person.fullName(),
        email: faker.internet.email().toLowerCase(),
        department: departments[i % departments.length]!,
        salary: faker.number.int({ min: 45000, max: 150000 }),
        yearsExperience: yearsExp,
        active: faker.datatype.boolean(0.8), // 80% active
      }
    }),
  }
}

/**
 * Tabular dataset: Uniform employee records
 *
 * @remarks
 * Tests TOON's tabular array format.
 */
const tabularDataset: Dataset = {
  name: 'tabular',
  description: 'Uniform employee records',
  data: generateEmployees(100),
  metadata: {
    supportsCSV: true,
    structureClass: 'uniform',
    tabularEligibility: 100, // All arrays contain uniform objects with primitive values only
  },
}

// Product names are assigned round-robin by item position within an order
const PRODUCT_NAMES = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp'] as const
// Order statuses are assigned round-robin by order index
const ORDER_STATUSES = ['pending', 'processing', 'shipped', 'delivered', 'cancelled'] as const

/**
 * Generate e-commerce orders (nested structure)
 *
 * @remarks
 * `subtotal` is the sum of `price * quantity` over the items, `tax` is 8% of
 * the subtotal, and `total` is their sum; each is rounded to two decimals.
 * Customer ids cycle through 1-20, but name, email and phone are drawn anew
 * for every order, so orders sharing a customer id do not share contact data.
 *
 * @param count - Number of orders to generate
 * @returns An object whose `orders` array holds `count` orders
 */
function generateOrders(count: number): { orders: Order[] } {
  return {
    orders: Array.from({ length: count }, (_, i) => {
      const customerId = (i % 20) + 1 // Rotate through 20 customers
      const itemCount = faker.number.int({ min: 1, max: 4 }) // 1-4 items per order

      const items = Array.from({ length: itemCount }, (_, j) => {
        const price = faker.number.float({
          min: 9.99,
          max: 199.99,
          fractionDigits: 2,
        })
        const quantity = faker.number.int({ min: 1, max: 5 })
        return {
          sku: `SKU-${faker.string.alphanumeric({ length: 6 }).toUpperCase()}`,
          name: PRODUCT_NAMES[j % PRODUCT_NAMES.length]!,
          quantity,
          price,
        }
      })

      const subtotal = Number(items.reduce((sum, item) => sum + (item.price * item.quantity), 0).toFixed(2))
      const tax = Number((subtotal * 0.08).toFixed(2)) // 8% tax rate
      const total = Number((subtotal + tax).toFixed(2))

      return {
        orderId: `ORD-${String(i + 1).padStart(4, '0')}`,
        customer: {
          id: customerId,
          name: faker.person.fullName(),
          email: faker.internet.email().toLowerCase(),
          phone: faker.phone.number(),
        },
        items,
        subtotal,
        tax,
        total,
        status: ORDER_STATUSES[i % ORDER_STATUSES.length]!,
        orderDate: faker.date.recent({ days: 90 }).toISOString().split('T')[0],
      }
    }),
  }
}

/**
 * Nested dataset: E-commerce orders with nested structures
 *
 * @remarks
 * Tests TOON's handling of complex nested objects.
 */
const nestedDataset: Dataset = {
  name: 'nested',
  description: 'E-commerce orders with nested structures',
  data: generateOrders(50),
  metadata: {
    supportsCSV: false,
    structureClass: 'nested',
    tabularEligibility: 33, // Top-level orders array has nested objects (not tabular), but nested items arrays are tabular
  },
}

/**
 * Analytics dataset: Time-series metrics
 *
 * @remarks
 * Tests TOON's handling of numeric data and date fields.
 */
const analyticsDataset: Dataset = {
  name: 'analytics',
  description: 'Time-series analytics data',
  data: generateAnalyticsData(60),
  metadata: {
    supportsCSV: true,
    structureClass: 'uniform',
    tabularEligibility: 100, // Uniform time-series records with consistent primitive fields
  },
}

/**
 * Real-world dataset: Top 100 starred GitHub repositories
 *
 * @remarks
 * Tests TOON's tabular format with real data.
 */
const githubDataset: Dataset = {
  name: 'github',
  description: 'Top 100 GitHub repositories',
  data: {
    repositories: githubRepos,
  },
  metadata: {
    supportsCSV: true,
    structureClass: 'uniform',
    tabularEligibility: 100, // Repository array contains uniform objects with primitive values
  },
}

/**
 * Generate a single e-commerce order with nested structure
 *
 * @remarks
 * Used for token efficiency benchmarks.
 *
 * NOTE(review): `subtotal`, `tax` and `total` are each drawn independently,
 * so they are not derived from `items` and `total` need not equal
 * `subtotal + tax`. Confirm this is acceptable for the benchmark.
 *
 * @returns A randomly populated order with 2-5 items and a `createdAt` timestamp
 */
export function generateOrderData(): Order {
  return {
    orderId: faker.string.alphanumeric({ length: 12, casing: 'upper' }),
    customer: {
      id: faker.number.int({ min: 1000, max: 9999 }),
      name: faker.person.fullName(),
      email: faker.internet.email(),
      phone: faker.phone.number(),
    },
    items: Array.from({ length: faker.number.int({ min: 2, max: 5 }) }, () => ({
      sku: faker.string.alphanumeric({ length: 8, casing: 'upper' }),
      name: faker.commerce.productName(),
      quantity: faker.number.int({ min: 1, max: 5 }),
      price: Number(faker.commerce.price({ min: 10, max: 200 })),
    })),
    subtotal: Number(faker.commerce.price({ min: 100, max: 500 })),
    tax: Number(faker.commerce.price({ min: 10, max: 50 })),
    total: Number(faker.commerce.price({ min: 110, max: 550 })),
    status: faker.helpers.arrayElement(['pending', 'processing', 'shipped', 'delivered']),
    createdAt: faker.date.recent({ days: 7 }).toISOString(),
  }
}

/**
 * Generate event logs (semi-uniform structure)
 *
 * @remarks
 * Every `error`-level log and 30% of `warn`-level logs carry a nested `error`
 * object; all other logs are flat. With the level picked from three options,
 * that is roughly 43% of logs (1/3 + 1/3 * 0.3), assuming
 * `faker.helpers.arrayElement` picks uniformly. Logs with an error get a
 * 4xx/5xx status code, the rest a 2xx one.
 *
 * @param count - Number of log entries to generate
 * @returns An object whose `logs` array holds `count` entries
 */
export function generateEventLogs(count: number): { logs: EventLog[] } {
  const endpoints = ['/api/users', '/api/orders', '/api/products', '/api/auth', '/api/payments']
  const levels = ['info', 'warn', 'error'] as const

  return {
    logs: Array.from({ length: count }, () => {
      const level = faker.helpers.arrayElement(levels)
      // The 30% draw only happens for `warn`; `error` short-circuits to true
      const hasError = level === 'error' || (level === 'warn' && faker.datatype.boolean(0.3))

      const log: EventLog = {
        timestamp: faker.date.recent({ days: 7 }).toISOString(),
        level,
        endpoint: faker.helpers.arrayElement(endpoints),
        statusCode: hasError
          ? faker.number.int({ min: 400, max: 599 })
          : faker.number.int({ min: 200, max: 299 }),
        responseTime: faker.number.int({ min: 10, max: 5000 }),
        userId: faker.number.int({ min: 1000, max: 9999 }),
      }

      if (hasError) {
        log.error = {
          message: faker.helpers.arrayElement([
            'Database connection timeout',
            'Invalid authentication token',
            'Resource not found',
            'Internal server error',
            'Rate limit exceeded',
          ]),
          stack: `Error: ${faker.lorem.sentence()}\n at ${faker.lorem.word()}\n at ${faker.lorem.word()}`,
          retryable: faker.datatype.boolean(0.6),
        }
      }

      return log
    }),
  }
}

/**
 * Generate deeply nested configuration
 *
 * @remarks
 * Creates a complex nested structure with minimal tabular eligibility (~0%).
 * Structural values (ports, pool limits, variant weights, role and group
 * names) are fixed; hosts, ids, secrets, URLs, rollout percentages and member
 * emails are drawn from faker.
 *
 * @returns A single randomly populated configuration object
 */
export function generateNestedConfig(): NestedConfig {
  return {
    environment: faker.helpers.arrayElement(['production', 'staging', 'development']),
    version: faker.system.semver(),
    database: {
      host: faker.internet.domainName(),
      port: 5432,
      name: faker.database.type(),
      pool: {
        min: 2,
        max: faker.number.int({ min: 10, max: 50 }),
        idleTimeout: 30000,
      },
      replicas: Array.from({ length: 3 }, (_, i) => ({
        host: `replica-${i + 1}.${faker.internet.domainName()}`,
        port: 5432,
        priority: i + 1,
      })),
    },
    features: {
      darkMode: {
        enabled: faker.datatype.boolean(),
        rollout: faker.number.int({ min: 0, max: 100 }),
        variants: [
          {
            name: 'default',
            weight: 70,
            config: { theme: 'dark', animations: true },
          },
          {
            name: 'minimal',
            weight: 30,
            config: { theme: 'dark', animations: false },
          },
        ],
      },
      analytics: {
        enabled: faker.datatype.boolean(),
        rollout: faker.number.int({ min: 0, max: 100 }),
        variants: [
          {
            name: 'full',
            weight: 100,
            config: { tracking: 'all', sampling: 1.0 },
          },
        ],
      },
    },
    authentication: {
      providers: [
        {
          name: 'oauth2',
          clientId: faker.string.uuid(),
          scopes: ['read', 'write', 'admin'],
          config: {
            authUrl: faker.internet.url(),
            tokenUrl: faker.internet.url(),
          },
        },
        {
          name: 'saml',
          clientId: faker.string.uuid(),
          scopes: ['read'],
          config: {
            entryPoint: faker.internet.url(),
            cert: faker.string.alphanumeric({ length: 64 }),
          },
        },
      ],
      session: {
        secret: faker.string.alphanumeric({ length: 32 }),
        duration: 86400,
        refreshThreshold: 3600,
      },
    },
    permissions: {
      roles: {
        admin: {
          permissions: ['read', 'write', 'delete', 'manage_users', 'manage_roles'],
          inherits: [],
        },
        editor: {
          permissions: ['read', 'write'],
          inherits: ['viewer'],
        },
        viewer: {
          permissions: ['read'],
          inherits: [],
        },
      },
      groups: {
        engineering: {
          members: Array.from({ length: 5 }, () => faker.internet.email()),
          roles: ['admin', 'editor'],
        },
        support: {
          members: Array.from({ length: 3 }, () => faker.internet.email()),
          roles: ['viewer'],
        },
      },
    },
  }
}

/**
 * Event logs dataset: Semi-uniform structure
 *
 * @remarks
 * Tests TOON with semi-uniform data: a mix of flat logs and logs that carry a
 * nested error object.
 *
 * NOTE(review): `tabularEligibility` is recorded as 50, while the generator
 * attaches errors to roughly 43% of logs; confirm which figure is intended.
 */
const eventLogsDataset: Dataset = {
  name: 'event-logs',
  description: 'Semi-uniform event logs',
  data: generateEventLogs(75),
  metadata: {
    supportsCSV: false,
    structureClass: 'semi-uniform',
    tabularEligibility: 50, // Top-level logs array is tabular, but a large share of logs have nested optional error objects
  },
}

/**
 * Nested config dataset: Deeply nested structure
 *
 * @remarks
 * Tests TOON's worst-case scenario with deeply nested configuration.
 */
const nestedConfigDataset: Dataset = {
  name: 'nested-config',
  description: 'Deeply nested configuration',
  data: generateNestedConfig(),
  metadata: {
    supportsCSV: false,
    structureClass: 'deep',
    tabularEligibility: 0, // Deeply nested configuration with no tabular arrays
  },
}

/**
 * Datasets for accuracy benchmarks (smaller sizes for faster evaluation)
 */
export const ACCURACY_DATASETS: Dataset[] = [
  tabularDataset, // 100 employees
  nestedDataset, // 50 orders
  analyticsDataset, // 60 days
  githubDataset, // 100 repos
  eventLogsDataset, // 75 logs
  nestedConfigDataset, // 1 config
]

/**
 * Datasets for token efficiency benchmarks (larger sizes to amplify token differences)
 *
 * @remarks
 * The generated entries are built after the accuracy datasets, so they
 * continue the same seeded random sequence rather than restarting it.
 */
export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
  // Tabular: 2000 employees
  {
    name: 'tabular',
    description: 'Uniform employee records',
    data: generateEmployees(2000),
    metadata: {
      supportsCSV: true,
      structureClass: 'uniform',
      tabularEligibility: 100, // All arrays contain uniform objects with primitive values only
    },
  },
  // Nested: 500 orders
  {
    name: 'nested',
    description: 'E-commerce orders with nested structures',
    data: generateOrders(500),
    metadata: {
      supportsCSV: false,
      structureClass: 'nested',
      tabularEligibility: 33, // Top-level orders array has nested objects (not tabular), but nested items arrays are tabular
    },
  },
  // Analytics: 365 days
  {
    name: 'analytics',
    description: 'Time-series analytics data',
    data: generateAnalyticsData(365),
    metadata: {
      supportsCSV: true,
      structureClass: 'uniform',
      tabularEligibility: 100, // Uniform time-series records with consistent primitive fields
    },
  },
  // GitHub: 100 repos (same as accuracy)
  githubDataset,
  // Event logs: 2000 logs
  {
    name: 'event-logs',
    description: 'Semi-uniform event logs',
    data: generateEventLogs(2000),
    metadata: {
      supportsCSV: false,
      structureClass: 'semi-uniform',
      tabularEligibility: 50, // Top-level logs array is tabular, but a large share of logs have nested optional error objects
    },
  },
  // Nested config: 1 config (same as accuracy)
  nestedConfigDataset,
]