// Mirror of https://github.com/voson-wang/toon.git
// Synced 2026-01-29 23:34:10 +08:00 · 255 lines · 6.9 KiB · TypeScript
/**
 * Datasets for TOON benchmarks
 *
 * These datasets are designed to test TOON's strengths and weaknesses:
 * - Tabular: Uniform records (TOON optimal)
 * - Nested: Complex structures with nested objects
 * - Analytics: Time-series data
 * - GitHub: Real-world repository records loaded from a JSON file
 */

import type { Dataset } from './types'
import { faker } from '@faker-js/faker'
import githubRepos from '../data/github-repos.json' with { type: 'json' }

// Seed faker once at module load so the generated datasets are reproducible between runs.
// Every faker call below consumes from this seeded sequence, so the order of calls matters.
faker.seed(12345)

/**
 * Employee record structure for tabular dataset
 *
 * @remarks
 * Every field is a primitive, so an array of these records is flat and uniform.
 */
export interface Employee {
  /** Numeric identifier of the employee */
  id: number
  /** Full name */
  name: string
  /** Email address */
  email: string
  /** Department the employee belongs to */
  department: string
  /** Salary as a plain number */
  salary: number
  /** Years of professional experience */
  yearsExperience: number
  /** Whether the employee is currently active */
  active: boolean
}

/**
 * E-commerce order structure for nested dataset
 *
 * @remarks
 * Combines a nested object (`customer`) with an array of objects (`items`).
 * The two date fields are optional because the producers in this file each set
 * only one of them.
 */
export interface Order {
  /** Order identifier */
  orderId: string
  /** Customer who placed the order */
  customer: {
    id: number
    name: string
    email: string
    phone: string
  }
  /** Line items contained in the order */
  items: {
    sku: string
    name: string
    quantity: number
    price: number
  }[]
  /** Order amount before tax */
  subtotal: number
  /** Tax amount */
  tax: number
  /** Order total */
  total: number
  /** Fulfilment status label */
  status: string
  /** Date-only string (`YYYY-MM-DD`), as written by the nested benchmark dataset */
  orderDate?: string
  /** Full ISO 8601 timestamp, as written by `generateOrderData` */
  createdAt?: string
}

/**
 * Analytics metric structure for time-series dataset
 *
 * @remarks
 * One record per calendar day, as produced by `generateAnalyticsData`.
 */
export interface AnalyticsMetric {
  /** Calendar day of the record as a `YYYY-MM-DD` string */
  date: string
  /** Page views recorded for the day */
  views: number
  /** Clicks recorded for the day */
  clicks: number
  /** Conversions recorded for the day */
  conversions: number
  /** Revenue for the day, rounded to two decimals by the generator */
  revenue: number
  /** Bounce rate as a fraction (the generator draws values between 0.3 and 0.7) */
  bounceRate: number
}

/**
 * GitHub repository structure for real-world dataset
 *
 * @remarks
 * NOTE(review): presumably mirrors the records in `../data/github-repos.json`;
 * that file is not checked against this interface here, so verify the field
 * list against the data before relying on it.
 */
export interface Repository {
  /** Numeric repository identifier */
  id: number
  /** Repository name */
  name: string
  /** Owner of the repository */
  owner: string
  /** Repository reference string (presumably `owner/name` - confirm against the data) */
  repo: string
  /** Repository description */
  description: string
  /** Star count */
  stars: number
  /** Watcher count */
  watchers: number
  /** Fork count */
  forks: number
  /** Name of the default branch */
  defaultBranch: string
  /** Creation time as a string (format not visible here - confirm against the data) */
  createdAt: string
  /** Last-update time as a string */
  updatedAt: string
  /** Last-push time as a string */
  pushedAt: string
}

/**
 * Generate analytics time-series data
 *
 * @remarks
 * All calendar arithmetic is done in UTC. A date-only ISO string such as
 * `'2025-01-01'` is parsed as UTC midnight, and the emitted `date` comes from
 * `toISOString()`, which is also UTC. Stepping days and picking the weekday in
 * UTC therefore keeps the output identical in every host time zone; using the
 * local-time accessors would shift the weekend discount onto the wrong dates
 * in zones behind UTC and could repeat a date across a DST change.
 *
 * Faker calls are made in a fixed order per day (views, clicks, conversions,
 * average order value, bounce rate); reordering them changes the seeded output.
 *
 * @param days - Number of consecutive daily records to generate
 * @param startDate - First day of the series, ideally a date-only ISO string (`YYYY-MM-DD`)
 * @returns Object whose `metrics` array holds one record per day in chronological order
 * @throws RangeError if `startDate` cannot be parsed and at least one record is generated
 */
export function generateAnalyticsData(days: number, startDate = '2025-01-01'): {
  metrics: AnalyticsMetric[]
} {
  const start = new Date(startDate)

  return {
    metrics: Array.from({ length: days }, (_, i) => {
      // Copy the start date so each record is offset from the same origin
      const currentDate = new Date(start)
      currentDate.setUTCDate(currentDate.getUTCDate() + i)

      // Simulate realistic web traffic with some variation
      const baseViews = 5000
      const dayOfWeek = currentDate.getUTCDay()
      // 0 = Sunday, 6 = Saturday: weekend traffic is scaled down to 70%
      const weekendMultiplier = dayOfWeek === 0 || dayOfWeek === 6 ? 0.7 : 1.0
      const views = Math.round(baseViews * weekendMultiplier + faker.number.int({ min: -1000, max: 3000 }))
      const clicks = Math.round(views * faker.number.float({ min: 0.02, max: 0.08 }))
      const conversions = Math.round(clicks * faker.number.float({ min: 0.05, max: 0.15 }))
      const avgOrderValue = faker.number.float({ min: 49.99, max: 299.99 })
      // Round to cents; Number() drops trailing zeros left by toFixed
      const revenue = Number((conversions * avgOrderValue).toFixed(2))

      return {
        // Keep only the YYYY-MM-DD part of the ISO timestamp
        date: currentDate.toISOString().split('T')[0]!,
        views,
        clicks,
        conversions,
        revenue,
        bounceRate: faker.number.float({ min: 0.3, max: 0.7, fractionDigits: 2 }),
      }
    }),
  }
}

// Department labels assigned round-robin to the generated employees
const departments: readonly string[] = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance']

/**
 * Tabular dataset: 100 uniform employee records
 *
 * @remarks
 * Tests TOON's tabular array format
 */
const tabularDataset: Dataset = {
  name: 'tabular',
  description: 'Uniform employee records (TOON optimal format)',
  data: {
    employees: Array.from({ length: 100 }, (_, i): Employee => {
      // Drawn before the other fields; reordering faker calls would change the seeded output
      const yearsExp = faker.number.int({ min: 1, max: 20 })
      return {
        id: i + 1,
        name: faker.person.fullName(),
        email: faker.internet.email().toLowerCase(),
        // The modulo keeps the index in range, so the non-null assertion is safe
        department: departments[i % departments.length]!,
        salary: faker.number.int({ min: 45000, max: 150000 }),
        yearsExperience: yearsExp,
        active: faker.datatype.boolean(0.8), // 80% active
      }
    }),
  },
}

// Product names assigned to line items by their position within an order
const productNames: readonly string[] = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp']
// Status labels assigned round-robin by order index
const statuses: readonly string[] = ['pending', 'processing', 'shipped', 'delivered', 'cancelled']

/**
 * Nested dataset: 50 e-commerce orders with nested structures
 *
 * @remarks
 * Tests TOON's handling of complex nested objects
 *
 * The records are a reduced order shape: they carry no `subtotal`, `tax` or
 * customer `phone`, so they are not annotated as `Order`. Customer ids cycle
 * through 1-20 while name and email are drawn fresh for every order, so a
 * repeated id does not imply the same name or email.
 */
const nestedDataset: Dataset = {
  name: 'nested',
  description: 'E-commerce orders with nested structures',
  data: {
    orders: Array.from({ length: 50 }, (_, i) => {
      const customerId = (i % 20) + 1
      const itemCount = faker.number.int({ min: 1, max: 4 })

      // Items are generated before the customer fields; this order fixes the seeded output
      const items = Array.from({ length: itemCount }, (_, j) => {
        const price = faker.number.float({ min: 9.99, max: 199.99, fractionDigits: 2 })
        const quantity = faker.number.int({ min: 1, max: 5 })
        return {
          sku: `SKU-${faker.string.alphanumeric({ length: 6 }).toUpperCase()}`,
          name: productNames[j % productNames.length]!,
          quantity,
          price,
        }
      })

      // Sum of price * quantity over all items, rounded to cents
      const total = Number(items.reduce((sum, item) => sum + (item.price * item.quantity), 0).toFixed(2))

      return {
        orderId: `ORD-${String(i + 1).padStart(4, '0')}`,
        customer: {
          id: customerId,
          name: faker.person.fullName(),
          email: faker.internet.email().toLowerCase(),
        },
        items,
        total,
        status: statuses[i % statuses.length]!,
        // Date-only part (YYYY-MM-DD) of a timestamp from the last 90 days
        orderDate: faker.date.recent({ days: 90 }).toISOString().split('T')[0],
      }
    }),
  },
}

/**
 * Analytics dataset: 60 days of time-series metrics
 *
 * @remarks
 * Tests TOON's handling of numeric data and date fields
 *
 * Built once at module load with the default start date of `generateAnalyticsData`.
 */
const analyticsDataset: Dataset = {
  name: 'analytics',
  description: 'Time-series analytics data',
  data: generateAnalyticsData(60),
}

/**
 * Real-world dataset: Top 100 starred GitHub repositories
 *
 * @remarks
 * Tests TOON's tabular format
 *
 * The records come from the imported `../data/github-repos.json` and are passed
 * through unchanged.
 */
const githubDataset: Dataset = {
  name: 'github',
  description: 'Top 100 GitHub repositories',
  data: {
    repositories: githubRepos,
  },
}

/**
 * Generate a single e-commerce order with nested structure
 *
 * @remarks
 * Used for token efficiency benchmarks
 *
 * `subtotal`, `tax` and `total` are each drawn independently from faker, so
 * they are not arithmetically consistent with `items` or with each other.
 * Properties are evaluated top to bottom, which fixes the order in which the
 * seeded faker sequence is consumed.
 *
 * @returns A freshly generated order with 2-5 line items and a `createdAt`
 * timestamp; `orderDate` is not set
 */
export function generateOrderData(): Order {
  return {
    orderId: faker.string.alphanumeric({ length: 12, casing: 'upper' }),
    customer: {
      id: faker.number.int({ min: 1000, max: 9999 }),
      name: faker.person.fullName(),
      email: faker.internet.email(),
      phone: faker.phone.number(),
    },
    items: Array.from({ length: faker.number.int({ min: 2, max: 5 }) }, () => ({
      sku: faker.string.alphanumeric({ length: 8, casing: 'upper' }),
      name: faker.commerce.productName(),
      quantity: faker.number.int({ min: 1, max: 5 }),
      // faker.commerce.price returns a string, hence the Number() conversion
      price: Number(faker.commerce.price({ min: 10, max: 200 })),
    })),
    subtotal: Number(faker.commerce.price({ min: 100, max: 500 })),
    tax: Number(faker.commerce.price({ min: 10, max: 50 })),
    total: Number(faker.commerce.price({ min: 110, max: 550 })),
    status: faker.helpers.arrayElement(['pending', 'processing', 'shipped', 'delivered']),
    createdAt: faker.date.recent({ days: 7 }).toISOString(),
  }
}

/**
 * All datasets used in the benchmark
 *
 * @remarks
 * Listed in the same order in which they are built at module load:
 * tabular, nested, analytics, GitHub.
 */
export const datasets: Dataset[] = [
  tabularDataset,
  nestedDataset,
  analyticsDataset,
  githubDataset,
]