diff --git a/benchmarks/package.json b/benchmarks/package.json index b3c7f71..1b071c8 100644 --- a/benchmarks/package.json +++ b/benchmarks/package.json @@ -18,6 +18,7 @@ "ai": "^5.0.80", "consola": "^3.4.2", "csv-stringify": "^6.6.0", + "fast-xml-parser": "^5.3.0", "gpt-tokenizer": "^3.2.0", "ofetch": "^1.4.1", "p-map": "^7.0.3", diff --git a/benchmarks/scripts/token-efficiency-benchmark.ts b/benchmarks/scripts/token-efficiency-benchmark.ts index 3b07473..85ed1f9 100644 --- a/benchmarks/scripts/token-efficiency-benchmark.ts +++ b/benchmarks/scripts/token-efficiency-benchmark.ts @@ -6,6 +6,7 @@ import { encode as encodeTokens } from 'gpt-tokenizer' // o200k_base encoding (d import { encode } from '../../src/index' import githubRepos from '../data/github-repos.json' with { type: 'json' } import { BENCHMARKS_DIR, ROOT_DIR } from '../src/constants' +import { generateAnalyticsData } from '../src/datasets' interface BenchmarkResult { name: string @@ -33,7 +34,7 @@ const BENCHMARK_EXAMPLES = [ name: 'Daily Analytics', emoji: '📈', description: '180 days of web metrics (views, clicks, conversions, revenue)', - getData: () => generateAnalytics(180), + getData: () => generateAnalyticsData(180), showDetailed: true, }, { @@ -169,23 +170,6 @@ function generateBarChart(percentage: number, maxWidth: number = 25): string { return '█'.repeat(filled) + '░'.repeat(empty) } -// Generate analytics time series data -function generateAnalytics(days: number) { - return { - metrics: Array.from({ length: days }, (_, i) => { - const date = new Date(2025, 0, 1) - date.setDate(date.getDate() + i) - return { - date: date.toISOString().split('T')[0], - views: Math.floor(Math.random() * 5000) + 1000, - clicks: Math.floor(Math.random() * 500) + 50, - conversions: Math.floor(Math.random() * 100) + 10, - revenue: Number((Math.random() * 1000 + 100).toFixed(2)), - } - }), - } -} - // Generate user API response function generateUsers(count: number) { return { diff --git a/benchmarks/src/datasets.ts 
b/benchmarks/src/datasets.ts
index 0fbb65c..a9afd77 100644
--- a/benchmarks/src/datasets.ts
+++ b/benchmarks/src/datasets.ts
@@ -14,6 +14,64 @@ import githubRepos from '../data/github-repos.json' with { type: 'json' }
 // Seed for reproducibility
 faker.seed(12345)
 
+/** One day of simulated web metrics */
+interface AnalyticsMetric {
+  date: string
+  views: number
+  clicks: number
+  conversions: number
+  revenue: number
+  bounceRate: number
+}
+
+/**
+ * Generate analytics time-series data with reproducible seeded randomness
+ *
+ * @param days - Number of consecutive daily records to generate
+ * @param startDate - First day of the series; a date-only ISO string is parsed as UTC midnight
+ * @returns An object whose `metrics` array holds one record per day, starting at `startDate`
+ *
+ * @remarks
+ * Day stepping and the weekend check use the UTC accessors because the emitted `date`
+ * comes from `toISOString()`, which is always UTC. With the local-time accessors the
+ * series depends on the machine's time zone: the weekend multiplier can land on the wrong
+ * dates, and a DST change inside the range can repeat or skip a date.
+ *
+ * Random values come from the shared `faker` instance seeded above, so the exact numbers
+ * also depend on how many faker calls ran before this one.
+ */
+export function generateAnalyticsData(days: number, startDate = '2025-01-01'): {
+  metrics: AnalyticsMetric[]
+} {
+  const start = new Date(startDate)
+
+  return {
+    metrics: Array.from({ length: days }, (_, i) => {
+      const currentDate = new Date(start)
+      currentDate.setUTCDate(currentDate.getUTCDate() + i)
+
+      // Simulate realistic web traffic with some variation
+      const baseViews = 5000
+      const dayOfWeek = currentDate.getUTCDay()
+      const weekendMultiplier = dayOfWeek === 0 || dayOfWeek === 6 ? 0.7 : 1.0
+      const views = Math.round(baseViews * weekendMultiplier + faker.number.int({ min: -1000, max: 3000 }))
+      const clicks = Math.round(views * faker.number.float({ min: 0.02, max: 0.08 }))
+      const conversions = Math.round(clicks * faker.number.float({ min: 0.05, max: 0.15 }))
+      const avgOrderValue = faker.number.float({ min: 49.99, max: 299.99 })
+      const revenue = Number((conversions * avgOrderValue).toFixed(2))
+
+      return {
+        date: currentDate.toISOString().split('T')[0]!,
+        views,
+        clicks,
+        conversions,
+        revenue,
+        bounceRate: faker.number.float({ min: 0.3, max: 0.7, fractionDigits: 2 }),
+      }
+    }),
+  }
+}
+
 /**
  * Tabular dataset: 100 uniform employee records
  *
@@ -95,30 +153,7 @@ const nestedDataset: Dataset = {
 const analyticsDataset: Dataset = {
   name: 'analytics',
   description: 'Time-series analytics data',
-  data: {
-    metrics: Array.from({ length: 60 }, (_, i) => {
-      const date = new Date('2025-01-01')
-      date.setDate(date.getDate() + i)
-
-      // Simulate realistic web traffic with some variation
-      const
baseViews = 5000
-      const weekendMultiplier = date.getDay() === 0 || date.getDay() === 6 ? 0.7 : 1.0
-      const views = Math.round(baseViews * weekendMultiplier + faker.number.int({ min: -1000, max: 3000 }))
-      const clicks = Math.round(views * faker.number.float({ min: 0.02, max: 0.08 }))
-      const conversions = Math.round(clicks * faker.number.float({ min: 0.05, max: 0.15 }))
-      const avgOrderValue = faker.number.float({ min: 49.99, max: 299.99 })
-      const revenue = Number((conversions * avgOrderValue).toFixed(2))
-
-      return {
-        date: date.toISOString().split('T')[0]!,
-        views,
-        clicks,
-        conversions,
-        revenue,
-        bounceRate: faker.number.float({ min: 0.3, max: 0.7, fractionDigits: 2 }),
-      }
-    }),
-  },
+  data: generateAnalyticsData(60),
 }
 
 /**
diff --git a/benchmarks/src/formatters.ts b/benchmarks/src/formatters.ts
index e1081e3..64aabef 100644
--- a/benchmarks/src/formatters.ts
+++ b/benchmarks/src/formatters.ts
@@ -5,22 +5,41 @@
  * - JSON
  * - TOON
  * - CSV
- * - Markdown key-value
+ * - XML
  * - YAML
  */
 
 import { stringify as stringifyCSV } from 'csv-stringify/sync'
+import { XMLBuilder } from 'fast-xml-parser'
 import { stringify as stringifyYAML } from 'yaml'
 import { encode as encodeToon } from '../../src/index'
 
+/**
+ * Serializers keyed by format name
+ *
+ * Every entry takes the data to encode and returns it as a string in that format.
+ */
 export const formatters = {
-  'json': (data: unknown): string => JSON.stringify(data, undefined, 2),
-  'toon': (data: unknown): string => encodeToon(data),
-  'csv': (data: unknown): string => toCSV(data),
-  'markdown-kv': (data: unknown): string => toMarkdownKV(data),
-  'yaml': (data: unknown): string => stringifyYAML(data),
+  json: (data: unknown): string => JSON.stringify(data, undefined, 2),
+  toon: (data: unknown): string => encodeToon(data),
+  csv: (data: unknown): string => toCSV(data),
+  xml: (data: unknown): string => toXML(data),
+  yaml: (data: unknown): string => stringifyYAML(data),
 }
 
+/**
+ * Convert data to CSV format
+ *
+ * @remarks
+ * **Limitations**: CSV is designed for flat tabular data only. This formatter:
+ * - Only handles top-level objects with arrays of flat objects
+ * - Cannot properly represent deeply nested structures (nested arrays/objects within rows)
+ * - Loses nested structure information during conversion
+ * - May produce misleading results for datasets with complex nesting (e.g., e-commerce orders with nested items)
+ *
+ * For datasets with nested structures, CSV comparisons may not be fair or representative
+ * of how CSV would typically be used in practice.
+ */
 function toCSV(data: unknown): string {
   const sections: string[] = []
 
@@ -43,48 +62,20 @@ function toCSV(data: unknown): string {
   return ''
 }
 
-function toMarkdownKV(data: unknown, indent = 0): string {
-  const spaces = ' '.repeat(indent)
-  const lines: string[] = []
+/**
+ * Convert data to XML format
+ *
+ * @remarks
+ * Delegates to fast-xml-parser's `XMLBuilder`, constructed fresh on every call.
+ * NOTE(review): option meanings are taken from the library's builder docs - confirm there:
+ * `format` / `indentBy` presumably give indented multi-line output, and
+ * `suppressEmptyNode` presumably renders empty elements as self-closing tags.
+ */
+function toXML(data: unknown): string {
+  const builder = new XMLBuilder({
+    format: true,
+    indentBy: ' ',
+    suppressEmptyNode: true,
+  })
 
-  if (Array.isArray(data)) {
-    data.forEach((item, i) => {
-      if (typeof item === 'object' && item !== null && !Array.isArray(item)) {
-        Object.entries(item).forEach(([key, value]) => {
-          if (typeof value === 'object' && value !== null) {
-            lines.push(`${spaces}**${key}**:`)
-            lines.push(toMarkdownKV(value, indent + 1))
-          }
-          else {
-            lines.push(`${spaces}**${key}**: ${value}`)
-          }
-        })
-        if (i < data.length - 1)
-          lines.push('')
-      }
-      else {
-        lines.push(`${spaces}- ${item}`)
-      }
-    })
-  }
-  else if (typeof data === 'object' && data !== null) {
-    Object.entries(data).forEach(([key, value]) => {
-      if (Array.isArray(value)) {
-        lines.push(`${spaces}**${key}**:`)
-        lines.push(toMarkdownKV(value, indent + 1))
-      }
-      else if (typeof value === 'object' && value !== null) {
-        lines.push(`${spaces}**${key}**:`)
-        lines.push(toMarkdownKV(value, indent + 1))
-      }
-      else {
-        lines.push(`${spaces}**${key}**: ${value}`)
-      }
-    })
-  }
-  else {
-    lines.push(`${spaces}${data}`)
-  }
-
-  return lines.join('\n')
+  return builder.build(data)
 }
diff --git a/benchmarks/src/questions.ts
b/benchmarks/src/questions.ts index 46220dd..4c27c33 100644 --- a/benchmarks/src/questions.ts +++ b/benchmarks/src/questions.ts @@ -1,7 +1,7 @@ /** * Question generation for TOON benchmarks * - * Generates ~200 questions across different types: + * Generates ~160 questions across different types: * - Field retrieval (50%): "What is X's Y?" * - Aggregation (25%): "How many X have Y?" * - Filtering (25%): "List/count X where Y" diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 3894eb4..a645f96 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -68,6 +68,9 @@ importers: csv-stringify: specifier: ^6.6.0 version: 6.6.0 + fast-xml-parser: + specifier: ^5.3.0 + version: 5.3.0 gpt-tokenizer: specifier: ^3.2.0 version: 3.2.0 @@ -1452,6 +1455,10 @@ packages: fast-levenshtein@2.0.6: resolution: {integrity: sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==} + fast-xml-parser@5.3.0: + resolution: {integrity: sha512-gkWGshjYcQCF+6qtlrqBqELqNqnt4CxruY6UVAWWnqb3DQ6qaNFEIKqzYep1XzHLM/QtrHVCxyPOtTk4LTQ7Aw==} + hasBin: true + fastest-levenshtein@1.0.16: resolution: {integrity: sha512-eRnCtTTtGZFpQCwhJiUOuxPQWRXVKYDn0b2PeHfXL6/Zi53SLAzAHfVhVWK2AryC/WH05kGfxhFIPvTF0SXQzg==} engines: {node: '>= 4.9.1'} @@ -2083,6 +2090,9 @@ packages: resolution: {integrity: sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==} engines: {node: '>=8'} + strnum@2.1.1: + resolution: {integrity: sha512-7ZvoFTiCnGxBtDqJ//Cu6fWtZtc7Y3x+QOirG15wztbdngGSkht27o2pyGWrVy0b4WAy3jbKmnoK6g5VlVNUUw==} + supports-color@7.2.0: resolution: {integrity: sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==} engines: {node: '>=8'} @@ -3684,6 +3694,10 @@ snapshots: fast-levenshtein@2.0.6: {} + fast-xml-parser@5.3.0: + dependencies: + strnum: 2.1.1 + fastest-levenshtein@1.0.16: {} fastq@1.19.1: @@ -4461,6 +4475,8 @@ snapshots: strip-json-comments@3.1.1: {} + strnum@2.1.1: {} + 
supports-color@7.2.0: dependencies: has-flag: 4.0.0