mirror of
https://github.com/voson-wang/toon.git
synced 2026-01-29 15:24:10 +08:00
754 lines
20 KiB
TypeScript
754 lines
20 KiB
TypeScript
import type { Dataset } from './types'
|
|
import { faker } from '@faker-js/faker'
|
|
import githubRepos from '../data/github-repos.json' with { type: 'json' }
|
|
|
|
// Make generated fixtures reproducible across runs.
// Seeding fixes the random stream, but relative date helpers such as
// `faker.date.recent()` are anchored to the current time unless a default
// reference date is set, so the reference date is pinned as well.
faker.seed(12345)
faker.setDefaultRefDate('2025-01-01T00:00:00.000Z')
|
|
|
|
/**
 * One row of the tabular employee dataset.
 *
 * @remarks
 * Every field is a primitive, so an array of employees is a flat, uniform table.
 */
export interface Employee {
  /** Numeric employee identifier */
  id: number
  /** Full display name */
  name: string
  /** Contact email address */
  email: string
  /** Department the employee belongs to */
  department: string
  /** Salary amount */
  salary: number
  /** Years of professional experience */
  yearsExperience: number
  /** Whether the employee is currently active */
  active: boolean
}
|
|
|
|
/**
 * An e-commerce order: a nested customer object, a list of line items,
 * and monetary totals.
 *
 * @remarks
 * `orderDate` and `createdAt` are both optional; the generators in this
 * module each populate only one of them.
 */
export interface Order {
  /** Order identifier */
  orderId: string
  /** Customer who placed the order */
  customer: {
    id: number
    name: string
    email: string
    phone: string
  }
  /** Line items contained in the order */
  items: Array<{
    sku: string
    name: string
    quantity: number
    price: number
  }>
  /** Amount before tax */
  subtotal: number
  /** Tax amount */
  tax: number
  /** Amount including tax */
  total: number
  /** Fulfilment status label */
  status: string
  /** Order date (optional) */
  orderDate?: string
  /** Creation timestamp (optional) */
  createdAt?: string
}
|
|
|
|
/**
 * One day of web analytics in the time-series dataset.
 */
export interface AnalyticsMetric {
  /** Calendar day the record describes */
  date: string
  /** Page views */
  views: number
  /** Clicks */
  clicks: number
  /** Conversions */
  conversions: number
  /** Revenue amount */
  revenue: number
  /** Bounce rate */
  bounceRate: number
}
|
|
|
|
/**
 * A GitHub repository record as used by the real-world dataset.
 */
export interface Repository {
  /** Numeric repository identifier */
  id: number
  /** Repository name */
  name: string
  /** Repository path string */
  repo: string
  /** Repository description text */
  description: string
  /** Star count */
  stars: number
  /** Watcher count */
  watchers: number
  /** Fork count */
  forks: number
  /** Name of the default branch */
  defaultBranch: string
  /** Creation timestamp */
  createdAt: string
  /** Last-update timestamp */
  updatedAt: string
  /** Last-push timestamp */
  pushedAt: string
}
|
|
|
|
/**
 * A single request log entry in the semi-uniform dataset.
 *
 * @remarks
 * The optional nested `error` object is what makes the dataset only
 * semi-uniform: entries without it are flat, entries with it are not.
 */
export interface EventLog {
  /** When the event occurred */
  timestamp: string
  /** Severity of the entry */
  level: 'info' | 'warn' | 'error'
  /** Endpoint that was hit */
  endpoint: string
  /** HTTP status code of the response */
  statusCode: number
  /** Response time of the request */
  responseTime: number
  /** Identifier of the requesting user */
  userId: number
  /** Failure details; absent on entries that carry no error */
  error?: {
    message: string
    stack: string
    retryable: boolean
  }
}
|
|
|
|
/**
 * Application configuration tree used by the deeply nested dataset.
 *
 * @remarks
 * Combines nested objects, string-keyed maps and arrays of objects at
 * several depths. The `config` bags are intentionally loosely typed.
 */
export interface NestedConfig {
  /** Deployment environment name */
  environment: string
  /** Configuration version string */
  version: string
  /** Database connection settings */
  database: {
    host: string
    port: number
    name: string
    /** Connection pool limits */
    pool: {
      min: number
      max: number
      idleTimeout: number
    }
    /** Replica endpoints */
    replicas: Array<{
      host: string
      port: number
      priority: number
    }>
  }
  /** Feature flags keyed by feature name */
  features: Record<string, {
    enabled: boolean
    rollout: number
    variants: Array<{
      name: string
      weight: number
      config: Record<string, any>
    }>
  }>
  /** Authentication providers and session settings */
  authentication: {
    providers: Array<{
      name: string
      clientId: string
      scopes: string[]
      config: Record<string, any>
    }>
    session: {
      secret: string
      duration: number
      refreshThreshold: number
    }
  }
  /** Role and group definitions */
  permissions: {
    /** Roles keyed by role name */
    roles: Record<string, {
      permissions: string[]
      inherits: string[]
    }>
    /** Groups keyed by group name */
    groups: Record<string, {
      members: string[]
      roles: string[]
    }>
  }
}
|
|
|
|
/**
 * A catalogue product; the row type of the large uniform array dataset.
 */
export interface Product {
  /** Stock-keeping unit code */
  sku: string
  /** Product name */
  name: string
  /** Category label */
  category: string
  /** Unit price */
  price: number
  /** Quantity on hand */
  qty: number
  /** Date of the last update */
  lastUpdated: string
}
|
|
|
|
/**
 * Kinds of structural corruption a validation fixture can represent
 * (module-internal).
 */
type StructuralValidationType = 'truncated' | 'extra-rows' | 'width-mismatch' | 'missing-fields'

/**
 * A dataset variant used for structural validation, together with the
 * expectation of whether it should be considered valid (module-internal).
 */
interface StructuralValidationFixture {
  /** Corruption category of the fixture */
  type: StructuralValidationType
  /** Human-readable summary of what was changed */
  description: string
  /** The (possibly corrupted) payload */
  data: Record<string, unknown>
  /** `true` when the payload is expected to pass validation */
  isValid: boolean
}
|
|
|
|
/**
 * Generate analytics time-series data
 *
 * @param days - Number of consecutive daily records to generate
 * @param startDate - Date of the first record. A date-only ISO string
 *   (`YYYY-MM-DD`) is parsed by `Date` as UTC midnight.
 * @returns An object whose `metrics` array holds one record per day, in
 *   chronological order
 *
 * @remarks
 * All calendar arithmetic uses the UTC accessors. The emitted `date` comes
 * from `toISOString()` (UTC), so stepping days and classifying weekends in
 * local time would make the output depend on the machine's time zone:
 * duplicated or skipped dates around DST changes, and the weekend multiplier
 * applied to the wrong days.
 */
export function generateAnalyticsData(days: number, startDate = '2025-01-01'): {
  metrics: AnalyticsMetric[]
} {
  const start = new Date(startDate)

  return {
    metrics: Array.from({ length: days }, (_, i) => {
      // Step in UTC so that day i always maps to start + i calendar days
      const currentDate = new Date(start)
      currentDate.setUTCDate(currentDate.getUTCDate() + i)

      // Simulate realistic web traffic with some variation
      const baseViews = 5000
      const dayOfWeek = currentDate.getUTCDay() // 0 = Sunday, 6 = Saturday
      const weekendMultiplier = dayOfWeek === 0 || dayOfWeek === 6 ? 0.7 : 1.0
      const views = Math.round(baseViews * weekendMultiplier + faker.number.int({ min: -1000, max: 3000 }))
      const clicks = Math.round(views * faker.number.float({ min: 0.02, max: 0.08 }))
      const conversions = Math.round(clicks * faker.number.float({ min: 0.05, max: 0.15 }))
      const avgOrderValue = faker.number.float({ min: 49.99, max: 299.99 })
      const revenue = Number((conversions * avgOrderValue).toFixed(2))

      return {
        date: currentDate.toISOString().split('T')[0]!,
        views,
        clicks,
        conversions,
        revenue,
        bounceRate: faker.number.float({ min: 0.3, max: 0.7, fractionDigits: 2 }),
      }
    }),
  }
}
|
|
|
|
/**
 * Departments assigned to employees in round-robin order
 */
const departments = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] as const

/**
 * Build `count` employee records (uniform tabular structure)
 *
 * @param count - Number of employees to generate
 * @returns An object whose `employees` array has ids 1..count and
 *   departments cycling through `departments`
 */
function generateEmployees(count: number): { employees: Employee[] } {
  const buildEmployee = (index: number): Employee => {
    // Faker values are drawn in a fixed order so seeded output stays stable
    const experience = faker.number.int({ min: 1, max: 25 })
    const fullName = faker.person.fullName()
    const emailAddress = faker.internet.email().toLowerCase()
    const pay = faker.number.int({ min: 45000, max: 150000 })
    const isActive = faker.datatype.boolean(0.8) // 80% active

    return {
      id: index + 1,
      name: fullName,
      email: emailAddress,
      department: departments[index % departments.length]!,
      salary: pay,
      yearsExperience: experience,
      active: isActive,
    }
  }

  const employees = Array.from({ length: count }, (_, index) => buildEmployee(index))
  return { employees }
}
|
|
|
|
/**
 * Tabular dataset: 100 uniform employee records
 *
 * @remarks
 * Exercises TOON's tabular array format.
 */
const tabularDataset: Dataset = {
  name: 'tabular',
  description: 'Uniform employee records',
  data: generateEmployees(100),
  metadata: {
    supportsCSV: true,
    structureClass: 'uniform',
    // Every array holds uniform objects made of primitive values only
    tabularEligibility: 100,
  },
}
|
|
|
|
/**
 * Product names assigned to line items by position within an order
 */
const PRODUCT_NAMES = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp'] as const

/**
 * Order statuses assigned to orders in round-robin order
 */
const ORDER_STATUSES = ['pending', 'processing', 'shipped', 'delivered', 'cancelled'] as const

/**
 * Build `count` e-commerce orders (nested structure)
 *
 * @param count - Number of orders to generate
 * @returns An object whose `orders` array contains orders with 1-4 line
 *   items each; `subtotal` is the sum of `price * quantity`, `tax` is 8% of
 *   the subtotal and `total` is their sum, each rounded to two decimals
 *
 * @remarks
 * NOTE(review): customer ids cycle through 1..20, but name, email and phone
 * are drawn fresh for every order, so one id can appear with different
 * contact details.
 */
function generateOrders(count: number): { orders: Order[] } {
  const orders = Array.from({ length: count }, (_, orderIndex) => {
    const customerId = (orderIndex % 20) + 1 // ids repeat every 20 orders
    const lineCount = faker.number.int({ min: 1, max: 4 }) // 1-4 items per order

    const items = Array.from({ length: lineCount }, (_, lineIndex) => {
      // Price and quantity are drawn before the SKU
      const price = faker.number.float({ min: 9.99, max: 199.99, fractionDigits: 2 })
      const quantity = faker.number.int({ min: 1, max: 5 })
      const sku = `SKU-${faker.string.alphanumeric({ length: 6 }).toUpperCase()}`
      const name = PRODUCT_NAMES[lineIndex % PRODUCT_NAMES.length]!
      return { sku, name, quantity, price }
    })

    let runningSum = 0
    for (const line of items) {
      runningSum = runningSum + (line.price * line.quantity)
    }
    const subtotal = Number(runningSum.toFixed(2))
    const tax = Number((subtotal * 0.08).toFixed(2)) // 8% tax rate
    const total = Number((subtotal + tax).toFixed(2))

    const orderId = `ORD-${String(orderIndex + 1).padStart(4, '0')}`
    const customer = {
      id: customerId,
      name: faker.person.fullName(),
      email: faker.internet.email().toLowerCase(),
      phone: faker.phone.number(),
    }
    const status = ORDER_STATUSES[orderIndex % ORDER_STATUSES.length]!
    const orderDate = faker.date.recent({ days: 90 }).toISOString().split('T')[0]

    return { orderId, customer, items, subtotal, tax, total, status, orderDate }
  })

  return { orders }
}
|
|
|
|
/**
 * Nested dataset: 50 e-commerce orders with nested structures
 *
 * @remarks
 * Exercises TOON's handling of complex nested objects.
 */
const nestedDataset: Dataset = {
  name: 'nested',
  description: 'E-commerce orders with nested structures',
  data: generateOrders(50),
  metadata: {
    supportsCSV: false,
    structureClass: 'nested',
    // The top-level orders array holds nested objects (not tabular),
    // while the per-order items arrays are tabular
    tabularEligibility: 33,
  },
}
|
|
|
|
/**
 * Analytics dataset: 60 days of time-series metrics
 *
 * @remarks
 * Exercises TOON's handling of numeric data and date fields.
 */
const analyticsDataset: Dataset = {
  name: 'analytics',
  description: 'Time-series analytics data',
  data: generateAnalyticsData(60),
  metadata: {
    supportsCSV: true,
    structureClass: 'uniform',
    // Uniform daily records with consistent primitive fields
    tabularEligibility: 100,
  },
}
|
|
|
|
/**
 * Real-world dataset: top 100 starred GitHub repositories
 *
 * @remarks
 * Exercises TOON's tabular format with real data loaded from the bundled
 * `github-repos.json` file rather than generated values.
 */
const githubDataset: Dataset = {
  name: 'github',
  description: 'Top 100 GitHub repositories',
  data: { repositories: githubRepos },
  metadata: {
    supportsCSV: true,
    structureClass: 'uniform',
    // The repository array holds uniform objects with primitive values
    tabularEligibility: 100,
  },
}
|
|
|
|
/**
 * Generate a single e-commerce order with nested structure
 *
 * @returns An order with 2-5 line items. `subtotal` is the sum of
 *   `price * quantity` over the items, `tax` is 8% of the subtotal and
 *   `total` is `subtotal + tax`, each rounded to two decimals.
 *
 * @remarks
 * Used for token efficiency benchmarks. The monetary totals are derived from
 * the line items (same 8% rate as `generateOrders`) so that the order is
 * internally consistent instead of carrying three unrelated random amounts.
 */
export function generateOrderData(): Order {
  const taxRate = 0.08

  const orderId = faker.string.alphanumeric({ length: 12, casing: 'upper' })
  const customer = {
    id: faker.number.int({ min: 1000, max: 9999 }),
    name: faker.person.fullName(),
    email: faker.internet.email(),
    phone: faker.phone.number(),
  }
  const items = Array.from({ length: faker.number.int({ min: 2, max: 5 }) }, () => ({
    sku: faker.string.alphanumeric({ length: 8, casing: 'upper' }),
    name: faker.commerce.productName(),
    quantity: faker.number.int({ min: 1, max: 5 }),
    price: Number(faker.commerce.price({ min: 10, max: 200 })),
  }))

  // Round at every step so the stored figures add up exactly as displayed
  const subtotal = Number(items.reduce((sum, item) => sum + item.price * item.quantity, 0).toFixed(2))
  const tax = Number((subtotal * taxRate).toFixed(2))
  const total = Number((subtotal + tax).toFixed(2))

  return {
    orderId,
    customer,
    items,
    subtotal,
    tax,
    total,
    status: faker.helpers.arrayElement(['pending', 'processing', 'shipped', 'delivered']),
    createdAt: faker.date.recent({ days: 7 }).toISOString(),
  }
}
|
|
|
|
/**
 * Generate event logs (semi-uniform structure)
 *
 * @param count - Number of log entries to generate
 * @returns An object whose `logs` array mixes flat entries with entries that
 *   carry a nested `error` object
 *
 * @remarks
 * The level is picked from three options. Every 'error' entry and 30% of
 * 'warn' entries receive an `error` object, i.e. roughly 43% of all entries
 * when levels are equally likely. Those entries get a 4xx/5xx status code;
 * all others get a 2xx code.
 */
export function generateEventLogs(count: number): { logs: EventLog[] } {
  const endpoints = ['/api/users', '/api/orders', '/api/products', '/api/auth', '/api/payments']
  const levels = ['info', 'warn', 'error'] as const

  const buildLog = (): EventLog => {
    const level = faker.helpers.arrayElement(levels)
    // The 30% draw only happens for 'warn' entries (short-circuit evaluation)
    const hasError = level === 'error' || (level === 'warn' && faker.datatype.boolean(0.3))

    const statusRange = hasError ? { min: 400, max: 599 } : { min: 200, max: 299 }
    const entry: EventLog = {
      timestamp: faker.date.recent({ days: 7 }).toISOString(),
      level,
      endpoint: faker.helpers.arrayElement(endpoints),
      statusCode: faker.number.int(statusRange),
      responseTime: faker.number.int({ min: 10, max: 5000 }),
      userId: faker.number.int({ min: 1000, max: 9999 }),
    }

    if (!hasError) {
      return entry
    }

    const message = faker.helpers.arrayElement([
      'Database connection timeout',
      'Invalid authentication token',
      'Resource not found',
      'Internal server error',
      'Rate limit exceeded',
    ])
    const stack = `Error: ${faker.lorem.sentence()}\n at ${faker.lorem.word()}\n at ${faker.lorem.word()}`
    const retryable = faker.datatype.boolean(0.6)

    return { ...entry, error: { message, stack, retryable } }
  }

  return { logs: Array.from({ length: count }, () => buildLog()) }
}
|
|
|
|
/**
 * Generate deeply nested configuration
 *
 * @returns A configuration tree with database, feature-flag, authentication
 *   and permission sections
 *
 * @remarks
 * Produces a deliberately deep structure with minimal tabular eligibility.
 * Sections are built top to bottom in the order they appear in the result so
 * that the seeded faker stream is consumed in a stable sequence.
 */
export function generateNestedConfig(): NestedConfig {
  const environment = faker.helpers.arrayElement(['production', 'staging', 'development'])
  const version = faker.system.semver()

  const database: NestedConfig['database'] = {
    host: faker.internet.domainName(),
    port: 5432,
    name: faker.database.type(),
    pool: {
      min: 2,
      max: faker.number.int({ min: 10, max: 50 }),
      idleTimeout: 30000,
    },
    // Three replicas with ascending priority
    replicas: Array.from({ length: 3 }, (_, replicaIndex) => {
      const ordinal = replicaIndex + 1
      return {
        host: `replica-${ordinal}.${faker.internet.domainName()}`,
        port: 5432,
        priority: ordinal,
      }
    }),
  }

  const features: NestedConfig['features'] = {
    darkMode: {
      enabled: faker.datatype.boolean(),
      rollout: faker.number.int({ min: 0, max: 100 }),
      variants: [
        { name: 'default', weight: 70, config: { theme: 'dark', animations: true } },
        { name: 'minimal', weight: 30, config: { theme: 'dark', animations: false } },
      ],
    },
    analytics: {
      enabled: faker.datatype.boolean(),
      rollout: faker.number.int({ min: 0, max: 100 }),
      variants: [
        { name: 'full', weight: 100, config: { tracking: 'all', sampling: 1.0 } },
      ],
    },
  }

  const authentication: NestedConfig['authentication'] = {
    providers: [
      {
        name: 'oauth2',
        clientId: faker.string.uuid(),
        scopes: ['read', 'write', 'admin'],
        config: {
          authUrl: faker.internet.url(),
          tokenUrl: faker.internet.url(),
        },
      },
      {
        name: 'saml',
        clientId: faker.string.uuid(),
        scopes: ['read'],
        config: {
          entryPoint: faker.internet.url(),
          cert: faker.string.alphanumeric({ length: 64 }),
        },
      },
    ],
    session: {
      secret: faker.string.alphanumeric({ length: 32 }),
      duration: 86400,
      refreshThreshold: 3600,
    },
  }

  const randomEmails = (howMany: number): string[] =>
    Array.from({ length: howMany }, () => faker.internet.email())

  const permissions: NestedConfig['permissions'] = {
    roles: {
      admin: {
        permissions: ['read', 'write', 'delete', 'manage_users', 'manage_roles'],
        inherits: [],
      },
      editor: {
        permissions: ['read', 'write'],
        inherits: ['viewer'],
      },
      viewer: {
        permissions: ['read'],
        inherits: [],
      },
    },
    groups: {
      engineering: {
        members: randomEmails(5),
        roles: ['admin', 'editor'],
      },
      support: {
        members: randomEmails(3),
        roles: ['viewer'],
      },
    },
  }

  return { environment, version, database, features, authentication, permissions }
}
|
|
|
|
/**
 * Generate a large uniform product array
 *
 * @param count - Number of products to generate
 * @returns An object whose `products` array has sequential zero-padded SKUs
 *   and categories assigned in round-robin order
 *
 * @remarks
 * Intended for testing TOON's token efficiency and structural reliability at
 * scale (thousands of rows).
 */
export function generateProducts(count: number): { products: Product[] } {
  const categories = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books', 'Toys'] as const

  const buildProduct = (index: number): Product => {
    const sku = `SKU-${String(index + 1).padStart(6, '0')}`
    const name = faker.commerce.productName()
    const category = categories[index % categories.length]!
    const price = Number(faker.commerce.price({ min: 5, max: 500 }))
    const qty = faker.number.int({ min: 0, max: 1000 })
    // Keep only the date part of the ISO timestamp
    const lastUpdated = faker.date.recent({ days: 30 }).toISOString().split('T')[0]!
    return { sku, name, category, price, qty, lastUpdated }
  }

  return {
    products: Array.from({ length: count }, (_, index) => buildProduct(index)),
  }
}
|
|
|
|
/**
 * Generate structural validation fixtures from employee data
 *
 * @returns Five fixtures in a fixed order: control, truncated, extra rows,
 *   width mismatch, missing fields. Consumers that pair fixtures with names
 *   by index rely on this order.
 *
 * @remarks
 * Creates deliberately corrupted datasets to test TOON's structural validation
 * capabilities via [N] length declarations and {fields} headers.
 * Internal function used to generate structural validation datasets.
 *
 * The appended rows of the "extra rows" fixture are renumbered to continue
 * after the base rows, because a fresh `generateEmployees` batch restarts its
 * ids at 1 and would otherwise duplicate the ids of the first base rows.
 */
function generateStructuralValidationFixtures(): StructuralValidationFixture[] {
  const baseData = generateEmployees(20)
  const baseCount = baseData.employees.length

  // Generated right after the base rows; ids continue at baseCount + 1
  const extraEmployees = generateEmployees(3).employees.map((emp, i) => ({
    ...emp,
    id: baseCount + i + 1,
  }))

  return [
    // Valid baseline (labelled 'truncated' because the type union has no
    // dedicated control member; `isValid: true` marks it as the control)
    {
      type: 'truncated' as const,
      description: 'Valid complete dataset (control)',
      data: { employees: baseData.employees },
      isValid: true,
    },
    // Truncated array (missing last 3 rows)
    {
      type: 'truncated' as const,
      description: 'Array truncated: 3 rows removed from end',
      data: { employees: baseData.employees.slice(0, -3) },
      isValid: false, // [N] won't match actual row count in TOON
    },
    // Extra rows (3 more than original)
    {
      type: 'extra-rows' as const,
      description: 'Extra rows added beyond declared length',
      data: {
        employees: [
          ...baseData.employees,
          ...extraEmployees,
        ],
      },
      isValid: false, // [N] won't match actual row count in TOON
    },
    // Width mismatch (inconsistent field count)
    {
      type: 'width-mismatch' as const,
      description: 'Inconsistent field count (missing salary in row 10)',
      data: {
        employees: baseData.employees.map((emp, i) => {
          if (i === 9) {
            // Row 10, missing salary field
            const { salary, ...rest } = emp
            return rest
          }
          return emp
        }),
      },
      isValid: false, // Not all objects have same fields (tabular requirement)
    },
    // Missing required fields
    {
      type: 'missing-fields' as const,
      description: 'Missing required fields (no email in multiple rows)',
      data: {
        employees: baseData.employees.map((emp, i) => {
          if (i % 5 === 0) {
            // Every 5th row (starting with the first), missing email
            const { email, ...rest } = emp
            return rest
          }
          return emp
        }),
      },
      isValid: false, // Not all objects have same fields (tabular requirement)
    },
  ]
}
|
|
|
|
/**
 * Event logs dataset: 75 semi-uniform log entries
 *
 * @remarks
 * Exercises TOON with semi-uniform data: a mix of flat entries and entries
 * that carry a nested `error` object.
 */
const eventLogsDataset: Dataset = {
  name: 'event-logs',
  description: 'Semi-uniform event logs',
  data: generateEventLogs(75),
  metadata: {
    supportsCSV: false,
    structureClass: 'semi-uniform',
    // The top-level logs array is tabular apart from the entries that carry
    // the optional nested error object (nominally about half)
    tabularEligibility: 50,
  },
}
|
|
|
|
/**
 * Nested config dataset: a single deeply nested configuration object
 *
 * @remarks
 * Exercises TOON's worst-case scenario with deeply nested configuration.
 */
const nestedConfigDataset: Dataset = {
  name: 'nested-config',
  description: 'Deeply nested configuration',
  data: generateNestedConfig(),
  metadata: {
    supportsCSV: false,
    structureClass: 'deep',
    // Treated as having no tabular content
    tabularEligibility: 0,
  },
}
|
|
|
|
/**
 * Structural validation datasets: one dataset per validation fixture
 *
 * @remarks
 * These datasets test the ability to detect incomplete, truncated, or
 * corrupted data through TOON's [N] length declarations and {fields} headers.
 * CSV is included to demonstrate its lack of structural metadata.
 * Names are matched to fixtures by position, so the list below must stay in
 * the same order as the fixtures returned by the generator.
 */
const structuralValidationDatasets: Dataset[] = generateStructuralValidationFixtures().map(
  ({ description, data }, position) => {
    const namesByPosition = [
      'structural-validation-control',
      'structural-validation-truncated',
      'structural-validation-extra-rows',
      'structural-validation-width-mismatch',
      'structural-validation-missing-fields',
    ] as const

    return {
      name: namesByPosition[position]!,
      description,
      data,
      metadata: {
        supportsCSV: true, // Include CSV to show it can't validate structure
        structureClass: 'uniform',
        tabularEligibility: 100,
      },
    }
  },
)
|
|
|
|
/**
 * Datasets for accuracy benchmarks
 *
 * @remarks
 * Sizes are kept small so that evaluation stays fast.
 */
export const ACCURACY_DATASETS: Dataset[] = [
  // 100 employees
  tabularDataset,
  // 50 orders
  nestedDataset,
  // 60 days
  analyticsDataset,
  // 100 repos
  githubDataset,
  // 75 logs
  eventLogsDataset,
  // 1 config
  nestedConfigDataset,
  // 5 validation fixtures
  ...structuralValidationDatasets,
]
|
|
|
|
/**
 * Datasets for token efficiency benchmarks
 *
 * @remarks
 * Larger sizes than the accuracy datasets, to amplify token differences.
 * The GitHub and nested-config datasets are reused unchanged.
 */
export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
  // Tabular: 2000 employees
  {
    name: 'tabular',
    description: 'Uniform employee records',
    data: generateEmployees(2000),
    metadata: {
      supportsCSV: true,
      structureClass: 'uniform',
      // Every array holds uniform objects made of primitive values only
      tabularEligibility: 100,
    },
  },

  // Nested: 500 orders
  {
    name: 'nested',
    description: 'E-commerce orders with nested structures',
    data: generateOrders(500),
    metadata: {
      supportsCSV: false,
      structureClass: 'nested',
      // The top-level orders array holds nested objects (not tabular),
      // while the per-order items arrays are tabular
      tabularEligibility: 33,
    },
  },

  // Analytics: 365 days
  {
    name: 'analytics',
    description: 'Time-series analytics data',
    data: generateAnalyticsData(365),
    metadata: {
      supportsCSV: true,
      structureClass: 'uniform',
      // Uniform daily records with consistent primitive fields
      tabularEligibility: 100,
    },
  },

  // GitHub: 100 repos (same as accuracy)
  githubDataset,

  // Event logs: 2000 logs
  {
    name: 'event-logs',
    description: 'Semi-uniform event logs',
    data: generateEventLogs(2000),
    metadata: {
      supportsCSV: false,
      structureClass: 'semi-uniform',
      // The top-level logs array is tabular apart from the entries that
      // carry the optional nested error object (nominally about half)
      tabularEligibility: 50,
    },
  },

  // Nested config: 1 config (same as accuracy)
  nestedConfigDataset,
]
|