mirror of
https://github.com/voson-wang/toon.git
synced 2026-01-29 15:24:10 +08:00
docs: benchmarks for XML format
This commit is contained in:
@@ -18,6 +18,7 @@
|
|||||||
"ai": "^5.0.80",
|
"ai": "^5.0.80",
|
||||||
"consola": "^3.4.2",
|
"consola": "^3.4.2",
|
||||||
"csv-stringify": "^6.6.0",
|
"csv-stringify": "^6.6.0",
|
||||||
|
"fast-xml-parser": "^5.3.0",
|
||||||
"gpt-tokenizer": "^3.2.0",
|
"gpt-tokenizer": "^3.2.0",
|
||||||
"ofetch": "^1.4.1",
|
"ofetch": "^1.4.1",
|
||||||
"p-map": "^7.0.3",
|
"p-map": "^7.0.3",
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import { encode as encodeTokens } from 'gpt-tokenizer' // o200k_base encoding (d
|
|||||||
import { encode } from '../../src/index'
|
import { encode } from '../../src/index'
|
||||||
import githubRepos from '../data/github-repos.json' with { type: 'json' }
|
import githubRepos from '../data/github-repos.json' with { type: 'json' }
|
||||||
import { BENCHMARKS_DIR, ROOT_DIR } from '../src/constants'
|
import { BENCHMARKS_DIR, ROOT_DIR } from '../src/constants'
|
||||||
|
import { generateAnalyticsData } from '../src/datasets'
|
||||||
|
|
||||||
interface BenchmarkResult {
|
interface BenchmarkResult {
|
||||||
name: string
|
name: string
|
||||||
@@ -33,7 +34,7 @@ const BENCHMARK_EXAMPLES = [
|
|||||||
name: 'Daily Analytics',
|
name: 'Daily Analytics',
|
||||||
emoji: '📈',
|
emoji: '📈',
|
||||||
description: '180 days of web metrics (views, clicks, conversions, revenue)',
|
description: '180 days of web metrics (views, clicks, conversions, revenue)',
|
||||||
getData: () => generateAnalytics(180),
|
getData: () => generateAnalyticsData(180),
|
||||||
showDetailed: true,
|
showDetailed: true,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -169,23 +170,6 @@ function generateBarChart(percentage: number, maxWidth: number = 25): string {
|
|||||||
return '█'.repeat(filled) + '░'.repeat(empty)
|
return '█'.repeat(filled) + '░'.repeat(empty)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Generate analytics time series data
|
|
||||||
function generateAnalytics(days: number) {
|
|
||||||
return {
|
|
||||||
metrics: Array.from({ length: days }, (_, i) => {
|
|
||||||
const date = new Date(2025, 0, 1)
|
|
||||||
date.setDate(date.getDate() + i)
|
|
||||||
return {
|
|
||||||
date: date.toISOString().split('T')[0],
|
|
||||||
views: Math.floor(Math.random() * 5000) + 1000,
|
|
||||||
clicks: Math.floor(Math.random() * 500) + 50,
|
|
||||||
conversions: Math.floor(Math.random() * 100) + 10,
|
|
||||||
revenue: Number((Math.random() * 1000 + 100).toFixed(2)),
|
|
||||||
}
|
|
||||||
}),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Generate user API response
|
// Generate user API response
|
||||||
function generateUsers(count: number) {
|
function generateUsers(count: number) {
|
||||||
return {
|
return {
|
||||||
|
|||||||
@@ -14,6 +14,49 @@ import githubRepos from '../data/github-repos.json' with { type: 'json' }
|
|||||||
// Seed for reproducibility
|
// Seed for reproducibility
|
||||||
faker.seed(12345)
|
faker.seed(12345)
|
||||||
|
|
||||||
|
interface AnalyticsMetric {
|
||||||
|
date: string
|
||||||
|
views: number
|
||||||
|
clicks: number
|
||||||
|
conversions: number
|
||||||
|
revenue: number
|
||||||
|
bounceRate: number
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate analytics time-series data with reproducible seeded randomness
|
||||||
|
*/
|
||||||
|
export function generateAnalyticsData(days: number, startDate = '2025-01-01'): {
|
||||||
|
metrics: AnalyticsMetric[]
|
||||||
|
} {
|
||||||
|
const date = new Date(startDate)
|
||||||
|
|
||||||
|
return {
|
||||||
|
metrics: Array.from({ length: days }, (_, i) => {
|
||||||
|
const currentDate = new Date(date)
|
||||||
|
currentDate.setDate(currentDate.getDate() + i)
|
||||||
|
|
||||||
|
// Simulate realistic web traffic with some variation
|
||||||
|
const baseViews = 5000
|
||||||
|
const weekendMultiplier = currentDate.getDay() === 0 || currentDate.getDay() === 6 ? 0.7 : 1.0
|
||||||
|
const views = Math.round(baseViews * weekendMultiplier + faker.number.int({ min: -1000, max: 3000 }))
|
||||||
|
const clicks = Math.round(views * faker.number.float({ min: 0.02, max: 0.08 }))
|
||||||
|
const conversions = Math.round(clicks * faker.number.float({ min: 0.05, max: 0.15 }))
|
||||||
|
const avgOrderValue = faker.number.float({ min: 49.99, max: 299.99 })
|
||||||
|
const revenue = Number((conversions * avgOrderValue).toFixed(2))
|
||||||
|
|
||||||
|
return {
|
||||||
|
date: currentDate.toISOString().split('T')[0]!,
|
||||||
|
views,
|
||||||
|
clicks,
|
||||||
|
conversions,
|
||||||
|
revenue,
|
||||||
|
bounceRate: faker.number.float({ min: 0.3, max: 0.7, fractionDigits: 2 }),
|
||||||
|
}
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tabular dataset: 100 uniform employee records
|
* Tabular dataset: 100 uniform employee records
|
||||||
*
|
*
|
||||||
@@ -95,30 +138,7 @@ const nestedDataset: Dataset = {
|
|||||||
const analyticsDataset: Dataset = {
|
const analyticsDataset: Dataset = {
|
||||||
name: 'analytics',
|
name: 'analytics',
|
||||||
description: 'Time-series analytics data',
|
description: 'Time-series analytics data',
|
||||||
data: {
|
data: generateAnalyticsData(60),
|
||||||
metrics: Array.from({ length: 60 }, (_, i) => {
|
|
||||||
const date = new Date('2025-01-01')
|
|
||||||
date.setDate(date.getDate() + i)
|
|
||||||
|
|
||||||
// Simulate realistic web traffic with some variation
|
|
||||||
const baseViews = 5000
|
|
||||||
const weekendMultiplier = date.getDay() === 0 || date.getDay() === 6 ? 0.7 : 1.0
|
|
||||||
const views = Math.round(baseViews * weekendMultiplier + faker.number.int({ min: -1000, max: 3000 }))
|
|
||||||
const clicks = Math.round(views * faker.number.float({ min: 0.02, max: 0.08 }))
|
|
||||||
const conversions = Math.round(clicks * faker.number.float({ min: 0.05, max: 0.15 }))
|
|
||||||
const avgOrderValue = faker.number.float({ min: 49.99, max: 299.99 })
|
|
||||||
const revenue = Number((conversions * avgOrderValue).toFixed(2))
|
|
||||||
|
|
||||||
return {
|
|
||||||
date: date.toISOString().split('T')[0]!,
|
|
||||||
views,
|
|
||||||
clicks,
|
|
||||||
conversions,
|
|
||||||
revenue,
|
|
||||||
bounceRate: faker.number.float({ min: 0.3, max: 0.7, fractionDigits: 2 }),
|
|
||||||
}
|
|
||||||
}),
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -5,22 +5,36 @@
|
|||||||
* - JSON
|
* - JSON
|
||||||
* - TOON
|
* - TOON
|
||||||
* - CSV
|
* - CSV
|
||||||
* - Markdown key-value
|
* - XML
|
||||||
* - YAML
|
* - YAML
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { stringify as stringifyCSV } from 'csv-stringify/sync'
|
import { stringify as stringifyCSV } from 'csv-stringify/sync'
|
||||||
|
import { XMLBuilder } from 'fast-xml-parser'
|
||||||
import { stringify as stringifyYAML } from 'yaml'
|
import { stringify as stringifyYAML } from 'yaml'
|
||||||
import { encode as encodeToon } from '../../src/index'
|
import { encode as encodeToon } from '../../src/index'
|
||||||
|
|
||||||
export const formatters = {
|
export const formatters = {
|
||||||
'json': (data: unknown): string => JSON.stringify(data, undefined, 2),
|
json: (data: unknown): string => JSON.stringify(data, undefined, 2),
|
||||||
'toon': (data: unknown): string => encodeToon(data),
|
toon: (data: unknown): string => encodeToon(data),
|
||||||
'csv': (data: unknown): string => toCSV(data),
|
csv: (data: unknown): string => toCSV(data),
|
||||||
'markdown-kv': (data: unknown): string => toMarkdownKV(data),
|
xml: (data: unknown): string => toXML(data),
|
||||||
'yaml': (data: unknown): string => stringifyYAML(data),
|
yaml: (data: unknown): string => stringifyYAML(data),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert data to CSV format
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* **Limitations**: CSV is designed for flat tabular data only. This formatter:
|
||||||
|
* - Only handles top-level objects with arrays of flat objects
|
||||||
|
* - Cannot properly represent deeply nested structures (nested arrays/objects within rows)
|
||||||
|
* - Loses nested structure information during conversion
|
||||||
|
* - May produce misleading results for datasets with complex nesting (e.g., e-commerce orders with nested items)
|
||||||
|
*
|
||||||
|
* For datasets with nested structures, CSV comparisons may not be fair or representative
|
||||||
|
* of how CSV would typically be used in practice.
|
||||||
|
*/
|
||||||
function toCSV(data: unknown): string {
|
function toCSV(data: unknown): string {
|
||||||
const sections: string[] = []
|
const sections: string[] = []
|
||||||
|
|
||||||
@@ -43,48 +57,12 @@ function toCSV(data: unknown): string {
|
|||||||
return ''
|
return ''
|
||||||
}
|
}
|
||||||
|
|
||||||
function toMarkdownKV(data: unknown, indent = 0): string {
|
function toXML(data: unknown): string {
|
||||||
const spaces = ' '.repeat(indent)
|
const builder = new XMLBuilder({
|
||||||
const lines: string[] = []
|
format: true,
|
||||||
|
indentBy: ' ',
|
||||||
|
suppressEmptyNode: true,
|
||||||
|
})
|
||||||
|
|
||||||
if (Array.isArray(data)) {
|
return builder.build(data)
|
||||||
data.forEach((item, i) => {
|
|
||||||
if (typeof item === 'object' && item !== null && !Array.isArray(item)) {
|
|
||||||
Object.entries(item).forEach(([key, value]) => {
|
|
||||||
if (typeof value === 'object' && value !== null) {
|
|
||||||
lines.push(`${spaces}**${key}**:`)
|
|
||||||
lines.push(toMarkdownKV(value, indent + 1))
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
lines.push(`${spaces}**${key}**: ${value}`)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
if (i < data.length - 1)
|
|
||||||
lines.push('')
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
lines.push(`${spaces}- ${item}`)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
else if (typeof data === 'object' && data !== null) {
|
|
||||||
Object.entries(data).forEach(([key, value]) => {
|
|
||||||
if (Array.isArray(value)) {
|
|
||||||
lines.push(`${spaces}**${key}**:`)
|
|
||||||
lines.push(toMarkdownKV(value, indent + 1))
|
|
||||||
}
|
|
||||||
else if (typeof value === 'object' && value !== null) {
|
|
||||||
lines.push(`${spaces}**${key}**:`)
|
|
||||||
lines.push(toMarkdownKV(value, indent + 1))
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
lines.push(`${spaces}**${key}**: ${value}`)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
lines.push(`${spaces}${data}`)
|
|
||||||
}
|
|
||||||
|
|
||||||
return lines.join('\n')
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
/**
|
/**
|
||||||
* Question generation for TOON benchmarks
|
* Question generation for TOON benchmarks
|
||||||
*
|
*
|
||||||
* Generates ~200 questions across different types:
|
* Generates ~160 questions across different types:
|
||||||
* - Field retrieval (50%): "What is X's Y?"
|
* - Field retrieval (50%): "What is X's Y?"
|
||||||
* - Aggregation (25%): "How many X have Y?"
|
* - Aggregation (25%): "How many X have Y?"
|
||||||
* - Filtering (25%): "List/count X where Y"
|
* - Filtering (25%): "List/count X where Y"
|
||||||
|
|||||||
16
pnpm-lock.yaml
generated
16
pnpm-lock.yaml
generated
@@ -68,6 +68,9 @@ importers:
|
|||||||
csv-stringify:
|
csv-stringify:
|
||||||
specifier: ^6.6.0
|
specifier: ^6.6.0
|
||||||
version: 6.6.0
|
version: 6.6.0
|
||||||
|
fast-xml-parser:
|
||||||
|
specifier: ^5.3.0
|
||||||
|
version: 5.3.0
|
||||||
gpt-tokenizer:
|
gpt-tokenizer:
|
||||||
specifier: ^3.2.0
|
specifier: ^3.2.0
|
||||||
version: 3.2.0
|
version: 3.2.0
|
||||||
@@ -1452,6 +1455,10 @@ packages:
|
|||||||
fast-levenshtein@2.0.6:
|
fast-levenshtein@2.0.6:
|
||||||
resolution: {integrity: sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==}
|
resolution: {integrity: sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==}
|
||||||
|
|
||||||
|
fast-xml-parser@5.3.0:
|
||||||
|
resolution: {integrity: sha512-gkWGshjYcQCF+6qtlrqBqELqNqnt4CxruY6UVAWWnqb3DQ6qaNFEIKqzYep1XzHLM/QtrHVCxyPOtTk4LTQ7Aw==}
|
||||||
|
hasBin: true
|
||||||
|
|
||||||
fastest-levenshtein@1.0.16:
|
fastest-levenshtein@1.0.16:
|
||||||
resolution: {integrity: sha512-eRnCtTTtGZFpQCwhJiUOuxPQWRXVKYDn0b2PeHfXL6/Zi53SLAzAHfVhVWK2AryC/WH05kGfxhFIPvTF0SXQzg==}
|
resolution: {integrity: sha512-eRnCtTTtGZFpQCwhJiUOuxPQWRXVKYDn0b2PeHfXL6/Zi53SLAzAHfVhVWK2AryC/WH05kGfxhFIPvTF0SXQzg==}
|
||||||
engines: {node: '>= 4.9.1'}
|
engines: {node: '>= 4.9.1'}
|
||||||
@@ -2083,6 +2090,9 @@ packages:
|
|||||||
resolution: {integrity: sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==}
|
resolution: {integrity: sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==}
|
||||||
engines: {node: '>=8'}
|
engines: {node: '>=8'}
|
||||||
|
|
||||||
|
strnum@2.1.1:
|
||||||
|
resolution: {integrity: sha512-7ZvoFTiCnGxBtDqJ//Cu6fWtZtc7Y3x+QOirG15wztbdngGSkht27o2pyGWrVy0b4WAy3jbKmnoK6g5VlVNUUw==}
|
||||||
|
|
||||||
supports-color@7.2.0:
|
supports-color@7.2.0:
|
||||||
resolution: {integrity: sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==}
|
resolution: {integrity: sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==}
|
||||||
engines: {node: '>=8'}
|
engines: {node: '>=8'}
|
||||||
@@ -3684,6 +3694,10 @@ snapshots:
|
|||||||
|
|
||||||
fast-levenshtein@2.0.6: {}
|
fast-levenshtein@2.0.6: {}
|
||||||
|
|
||||||
|
fast-xml-parser@5.3.0:
|
||||||
|
dependencies:
|
||||||
|
strnum: 2.1.1
|
||||||
|
|
||||||
fastest-levenshtein@1.0.16: {}
|
fastest-levenshtein@1.0.16: {}
|
||||||
|
|
||||||
fastq@1.19.1:
|
fastq@1.19.1:
|
||||||
@@ -4461,6 +4475,8 @@ snapshots:
|
|||||||
|
|
||||||
strip-json-comments@3.1.1: {}
|
strip-json-comments@3.1.1: {}
|
||||||
|
|
||||||
|
strnum@2.1.1: {}
|
||||||
|
|
||||||
supports-color@7.2.0:
|
supports-color@7.2.0:
|
||||||
dependencies:
|
dependencies:
|
||||||
has-flag: 4.0.0
|
has-flag: 4.0.0
|
||||||
|
|||||||
Reference in New Issue
Block a user