docs: benchmarks for XML format

This commit is contained in:
Johann Schopplich
2025-10-27 14:50:26 +01:00
parent b9f54ba585
commit 77696ce932
6 changed files with 91 additions and 92 deletions

View File

@@ -18,6 +18,7 @@
"ai": "^5.0.80",
"consola": "^3.4.2",
"csv-stringify": "^6.6.0",
"fast-xml-parser": "^5.3.0",
"gpt-tokenizer": "^3.2.0",
"ofetch": "^1.4.1",
"p-map": "^7.0.3",

View File

@@ -6,6 +6,7 @@ import { encode as encodeTokens } from 'gpt-tokenizer' // o200k_base encoding (d
import { encode } from '../../src/index'
import githubRepos from '../data/github-repos.json' with { type: 'json' }
import { BENCHMARKS_DIR, ROOT_DIR } from '../src/constants'
import { generateAnalyticsData } from '../src/datasets'
interface BenchmarkResult {
name: string
@@ -33,7 +34,7 @@ const BENCHMARK_EXAMPLES = [
name: 'Daily Analytics',
emoji: '📈',
description: '180 days of web metrics (views, clicks, conversions, revenue)',
getData: () => generateAnalytics(180),
getData: () => generateAnalyticsData(180),
showDetailed: true,
},
{
@@ -169,23 +170,6 @@ function generateBarChart(percentage: number, maxWidth: number = 25): string {
return '█'.repeat(filled) + '░'.repeat(empty)
}
/**
 * Generate an analytics time series of daily metrics starting on 2025-01-01.
 *
 * Metric values are drawn from `Math.random()`, so they differ between runs;
 * only the `date` column is deterministic.
 *
 * @param days - Number of consecutive daily records to generate
 * @returns An object whose `metrics` array holds one record per day, in chronological order
 */
function generateAnalytics(days: number) {
  return {
    metrics: Array.from({ length: days }, (_, i) => {
      // Build each day in UTC. A local-midnight Date formatted with `toISOString()`
      // (which is always UTC) lands on the previous day in time zones east of UTC.
      // `Date.UTC` normalizes day overflow, so `1 + i` rolls into later months.
      const day = new Date(Date.UTC(2025, 0, 1 + i))

      return {
        // First 10 characters of the ISO timestamp are `YYYY-MM-DD`
        date: day.toISOString().slice(0, 10),
        views: Math.floor(Math.random() * 5000) + 1000,
        clicks: Math.floor(Math.random() * 500) + 50,
        conversions: Math.floor(Math.random() * 100) + 10,
        revenue: Number((Math.random() * 1000 + 100).toFixed(2)),
      }
    }),
  }
}
// Generate user API response
function generateUsers(count: number) {
return {

View File

@@ -14,6 +14,49 @@ import githubRepos from '../data/github-repos.json' with { type: 'json' }
// Seed for reproducibility
faker.seed(12345)
/**
 * One day of web analytics, as produced by `generateAnalyticsData`.
 */
interface AnalyticsMetric {
  /** Calendar day formatted as `YYYY-MM-DD` */
  date: string
  /** Page views recorded for the day */
  views: number
  /** Clicks recorded for the day */
  clicks: number
  /** Conversions recorded for the day */
  conversions: number
  /** Revenue for the day, rounded to two decimal places by the generator */
  revenue: number
  /** Bounce rate as a fraction (the bundled generator emits values between 0.3 and 0.7) */
  bounceRate: number
}
/**
 * Generate analytics time-series data with reproducible seeded randomness.
 *
 * Random values come from the module's seeded `faker` instance, so the output
 * depends on how many faker calls have been made before this function runs.
 * All calendar arithmetic is done in UTC so that neither the emitted `date`
 * strings nor the weekend detection depend on the host time zone or on DST
 * transitions.
 *
 * @param days - Number of consecutive daily records to generate
 * @param startDate - First day of the series; pass a date-only ISO string
 *   (`YYYY-MM-DD`), which JavaScript parses as UTC midnight
 * @returns An object whose `metrics` array holds one record per day, in chronological order
 */
export function generateAnalyticsData(days: number, startDate = '2025-01-01'): {
  metrics: AnalyticsMetric[]
} {
  const start = new Date(startDate)

  return {
    metrics: Array.from({ length: days }, (_, i) => {
      const currentDate = new Date(start)
      // Step in UTC: mixing local-time `setDate` with `toISOString()` (always UTC)
      // repeats or skips a calendar day when a DST change falls inside the range.
      currentDate.setUTCDate(currentDate.getUTCDate() + i)

      // Simulate realistic web traffic with some variation
      const baseViews = 5000
      // Use the UTC weekday so it matches the UTC date written to `date`
      const dayOfWeek = currentDate.getUTCDay()
      const weekendMultiplier = dayOfWeek === 0 || dayOfWeek === 6 ? 0.7 : 1.0

      // NOTE: keep the faker calls in this exact order; reordering them changes
      // the values drawn from the seeded stream.
      const views = Math.round(baseViews * weekendMultiplier + faker.number.int({ min: -1000, max: 3000 }))
      const clicks = Math.round(views * faker.number.float({ min: 0.02, max: 0.08 }))
      const conversions = Math.round(clicks * faker.number.float({ min: 0.05, max: 0.15 }))
      const avgOrderValue = faker.number.float({ min: 49.99, max: 299.99 })
      const revenue = Number((conversions * avgOrderValue).toFixed(2))

      return {
        // First 10 characters of the ISO timestamp are `YYYY-MM-DD`
        date: currentDate.toISOString().slice(0, 10),
        views,
        clicks,
        conversions,
        revenue,
        bounceRate: faker.number.float({ min: 0.3, max: 0.7, fractionDigits: 2 }),
      }
    }),
  }
}
/**
* Tabular dataset: 100 uniform employee records
*
@@ -95,30 +138,7 @@ const nestedDataset: Dataset = {
const analyticsDataset: Dataset = {
name: 'analytics',
description: 'Time-series analytics data',
data: {
metrics: Array.from({ length: 60 }, (_, i) => {
const date = new Date('2025-01-01')
date.setDate(date.getDate() + i)
// Simulate realistic web traffic with some variation
const baseViews = 5000
const weekendMultiplier = date.getDay() === 0 || date.getDay() === 6 ? 0.7 : 1.0
const views = Math.round(baseViews * weekendMultiplier + faker.number.int({ min: -1000, max: 3000 }))
const clicks = Math.round(views * faker.number.float({ min: 0.02, max: 0.08 }))
const conversions = Math.round(clicks * faker.number.float({ min: 0.05, max: 0.15 }))
const avgOrderValue = faker.number.float({ min: 49.99, max: 299.99 })
const revenue = Number((conversions * avgOrderValue).toFixed(2))
return {
date: date.toISOString().split('T')[0]!,
views,
clicks,
conversions,
revenue,
bounceRate: faker.number.float({ min: 0.3, max: 0.7, fractionDigits: 2 }),
}
}),
},
data: generateAnalyticsData(60),
}
/**

View File

@@ -5,22 +5,36 @@
* - JSON
* - TOON
* - CSV
* - Markdown key-value
* - XML
* - YAML
*/
import { stringify as stringifyCSV } from 'csv-stringify/sync'
import { XMLBuilder } from 'fast-xml-parser'
import { stringify as stringifyYAML } from 'yaml'
import { encode as encodeToon } from '../../src/index'
export const formatters = {
'json': (data: unknown): string => JSON.stringify(data, undefined, 2),
'toon': (data: unknown): string => encodeToon(data),
'csv': (data: unknown): string => toCSV(data),
'markdown-kv': (data: unknown): string => toMarkdownKV(data),
'yaml': (data: unknown): string => stringifyYAML(data),
json: (data: unknown): string => JSON.stringify(data, undefined, 2),
toon: (data: unknown): string => encodeToon(data),
csv: (data: unknown): string => toCSV(data),
xml: (data: unknown): string => toXML(data),
yaml: (data: unknown): string => stringifyYAML(data),
}
/**
* Convert data to CSV format
*
* @remarks
* **Limitations**: CSV is designed for flat tabular data only. This formatter:
* - Only handles top-level objects with arrays of flat objects
* - Cannot properly represent deeply nested structures (nested arrays/objects within rows)
* - Loses nested structure information during conversion
* - May produce misleading results for datasets with complex nesting (e.g., e-commerce orders with nested items)
*
* For datasets with nested structures, CSV comparisons may not be fair or representative
* of how CSV would typically be used in practice.
*/
function toCSV(data: unknown): string {
const sections: string[] = []
@@ -43,48 +57,12 @@ function toCSV(data: unknown): string {
return ''
}
function toMarkdownKV(data: unknown, indent = 0): string {
const spaces = ' '.repeat(indent)
const lines: string[] = []
function toXML(data: unknown): string {
const builder = new XMLBuilder({
format: true,
indentBy: ' ',
suppressEmptyNode: true,
})
if (Array.isArray(data)) {
data.forEach((item, i) => {
if (typeof item === 'object' && item !== null && !Array.isArray(item)) {
Object.entries(item).forEach(([key, value]) => {
if (typeof value === 'object' && value !== null) {
lines.push(`${spaces}**${key}**:`)
lines.push(toMarkdownKV(value, indent + 1))
}
else {
lines.push(`${spaces}**${key}**: ${value}`)
}
})
if (i < data.length - 1)
lines.push('')
}
else {
lines.push(`${spaces}- ${item}`)
}
})
}
else if (typeof data === 'object' && data !== null) {
Object.entries(data).forEach(([key, value]) => {
if (Array.isArray(value)) {
lines.push(`${spaces}**${key}**:`)
lines.push(toMarkdownKV(value, indent + 1))
}
else if (typeof value === 'object' && value !== null) {
lines.push(`${spaces}**${key}**:`)
lines.push(toMarkdownKV(value, indent + 1))
}
else {
lines.push(`${spaces}**${key}**: ${value}`)
}
})
}
else {
lines.push(`${spaces}${data}`)
}
return lines.join('\n')
return builder.build(data)
}

View File

@@ -1,7 +1,7 @@
/**
* Question generation for TOON benchmarks
*
* Generates ~200 questions across different types:
* Generates ~160 questions across different types:
* - Field retrieval (50%): "What is X's Y?"
* - Aggregation (25%): "How many X have Y?"
* - Filtering (25%): "List/count X where Y"