refactor: shared utils for benchmark scripts

This commit is contained in:
Johann Schopplich
2025-10-27 17:37:27 +01:00
parent 7b76acde31
commit 4ec7e84f5f
9 changed files with 269 additions and 124 deletions

View File

@@ -9,6 +9,16 @@ export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.me
*/
export const DEFAULT_CONCURRENCY = 20
/**
* Progress bar configuration
*/
export const PROGRESS_BAR = {
/** Default width for progress bars */
defaultWidth: 25,
/** Compact width for inline displays */
compactWidth: 20,
} as const
/**
* Enable dry run mode for quick testing with limited AI requests
*

View File

@@ -14,7 +14,48 @@ import githubRepos from '../data/github-repos.json' with { type: 'json' }
// Seed for reproducibility
faker.seed(12345)
interface AnalyticsMetric {
/**
 * Employee record structure for tabular dataset
 *
 * A flat, uniform record shape — every field is a primitive — which is the
 * layout TOON's tabular array format is optimized for.
 */
export interface Employee {
  /** Sequential identifier (1-based, assigned at generation time) */
  id: number
  name: string
  email: string
  /** One of the department names in the `departments` list */
  department: string
  /** Annual salary — NOTE(review): currency/unit not specified in source */
  salary: number
  yearsExperience: number
  /** Whether the employee is currently active */
  active: boolean
}
/**
 * E-commerce order structure for nested dataset
 *
 * Deliberately nested (customer object + items array) to exercise TOON's
 * handling of non-tabular structures.
 */
export interface Order {
  /** Alphanumeric order identifier */
  orderId: string
  /** Embedded customer record (nested object) */
  customer: {
    id: number
    name: string
    email: string
    phone: string
  }
  /** Line items (nested array of objects) */
  items: {
    sku: string
    name: string
    quantity: number
    price: number
  }[]
  subtotal: number
  tax: number
  total: number
  /** Order lifecycle status, e.g. 'pending' | 'shipped' */
  status: string
  // Two optional timestamp fields: generateOrderData() sets only createdAt;
  // NOTE(review): confirm which callers populate orderDate, if any
  orderDate?: string
  createdAt?: string
}
/**
* Analytics metric structure for time-series dataset
*/
export interface AnalyticsMetric {
date: string
views: number
clicks: number
@@ -24,7 +65,25 @@ interface AnalyticsMetric {
}
/**
* Generate analytics time-series data with reproducible seeded randomness
* GitHub repository structure for real-world dataset
*/
/**
 * GitHub repository structure for real-world dataset
 *
 * Shape mirrors the fields loaded from `data/github-repos.json`.
 */
export interface Repository {
  id: number
  /** Repository name without owner prefix */
  name: string
  /** Owning user or organization login */
  owner: string
  /** Full repo slug — NOTE(review): confirm whether this is `owner/name` or just the name */
  repo: string
  description: string
  stars: number
  watchers: number
  forks: number
  defaultBranch: string
  /** ISO 8601 timestamps */
  createdAt: string
  updatedAt: string
  pushedAt: string
}
/**
* Generate analytics time-series data
*/
export function generateAnalyticsData(days: number, startDate = '2025-01-01'): {
metrics: AnalyticsMetric[]
@@ -63,12 +122,12 @@ export function generateAnalyticsData(days: number, startDate = '2025-01-01'): {
* @remarks
* Tests TOON's tabular array format
*/
const departments = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance']
const departments: readonly string[] = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] as const
const tabularDataset: Dataset = {
name: 'tabular',
description: 'Uniform employee records (TOON optimal format)',
data: {
employees: Array.from({ length: 100 }, (_, i) => {
employees: Array.from({ length: 100 }, (_, i): Employee => {
const yearsExp = faker.number.int({ min: 1, max: 20 })
return {
id: i + 1,
@@ -89,8 +148,8 @@ const tabularDataset: Dataset = {
* @remarks
* Tests TOON's handling of complex nested objects
*/
const productNames = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp']
const statuses = ['pending', 'processing', 'shipped', 'delivered', 'cancelled']
// `as const` alone keeps the readonly tuples of literal strings; the explicit
// `readonly string[]` annotations would widen the types and defeat the
// `as const` assertions.
const productNames = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp'] as const
const statuses = ['pending', 'processing', 'shipped', 'delivered', 'cancelled'] as const
const nestedDataset: Dataset = {
name: 'nested',
@@ -155,6 +214,35 @@ const githubDataset: Dataset = {
},
}
/**
 * Generate a single e-commerce order with nested structure
 *
 * @remarks
 * Used for token efficiency benchmarks.
 *
 * Monetary fields are derived from the generated line items so the order is
 * internally consistent: `subtotal` is the sum of `price * quantity`, `tax`
 * is a flat 8% of the subtotal, and `total` is `subtotal + tax`. (Previously
 * all three were independent random values that never added up, which makes
 * the data misleading for any aggregation over these fields.)
 */
export function generateOrderData(): Order {
  const items = Array.from({ length: faker.number.int({ min: 2, max: 5 }) }, () => ({
    sku: faker.string.alphanumeric({ length: 8, casing: 'upper' }),
    name: faker.commerce.productName(),
    quantity: faker.number.int({ min: 1, max: 5 }),
    price: Number(faker.commerce.price({ min: 10, max: 200 })),
  }))
  // Round to cents to avoid floating-point artifacts in the serialized data
  const subtotal = Math.round(items.reduce((sum, item) => sum + item.price * item.quantity, 0) * 100) / 100
  const tax = Math.round(subtotal * 8) / 100
  const total = Math.round((subtotal + tax) * 100) / 100
  return {
    orderId: faker.string.alphanumeric({ length: 12, casing: 'upper' }),
    customer: {
      id: faker.number.int({ min: 1000, max: 9999 }),
      name: faker.person.fullName(),
      email: faker.internet.email(),
      phone: faker.phone.number(),
    },
    items,
    subtotal,
    tax,
    total,
    status: faker.helpers.arrayElement(['pending', 'processing', 'shipped', 'delivered']),
    createdAt: faker.date.recent({ days: 7 }).toISOString(),
  }
}
/**
* All datasets used in the benchmark
*/

View File

@@ -1,12 +1,18 @@
/**
* Format converters for TOON benchmarks
*
* Converts data to different formats:
* Converts data to different formats for comparison:
* - JSON
* - TOON
* - CSV
* - XML
* - YAML
*
* ## Semantic Equivalence
*
* All formatters attempt to preserve semantic equivalence with the source data,
* meaning the converted data should represent the same information. However,
* CSV has inherent limitations with nested structures (see `toCSV` docs).
*/
import { stringify as stringifyCSV } from 'csv-stringify/sync'
@@ -14,12 +20,17 @@ import { XMLBuilder } from 'fast-xml-parser'
import { stringify as stringifyYAML } from 'yaml'
import { encode as encodeToon } from '../../src/index'
export const formatters = {
json: (data: unknown): string => JSON.stringify(data, undefined, 2),
toon: (data: unknown): string => encodeToon(data),
csv: (data: unknown): string => toCSV(data),
xml: (data: unknown): string => toXML(data),
yaml: (data: unknown): string => stringifyYAML(data),
/**
 * Format converters registry
 *
 * Each formatter takes unknown data and returns a string representation.
 * `satisfies` validates every entry against the formatter signature while
 * preserving the literal keys (`'json' | 'toon' | …`) — an explicit
 * `Record<string, …>` annotation would widen the keys to `string` and lose
 * that inference for callers indexing into the registry.
 */
export const formatters = {
  json: data => JSON.stringify(data, undefined, 2),
  toon: data => encodeToon(data),
  csv: data => toCSV(data),
  xml: data => toXML(data),
  yaml: data => stringifyYAML(data),
} satisfies Record<string, (data: unknown) => string>
/**
@@ -57,6 +68,15 @@ function toCSV(data: unknown): string {
return ''
}
/**
* Convert data to XML format
*
* @remarks
* Uses fast-xml-parser to generate well-formatted XML with:
* - 2-space indentation for readability
* - Empty nodes suppressed
* - Proper escaping of special characters
*/
function toXML(data: unknown): string {
const builder = new XMLBuilder({
format: true,

View File

@@ -7,8 +7,16 @@
* - Filtering (25%): "List/count X where Y"
*
* Questions are generated dynamically based on actual data values
*
* TODO: Balance question distribution across datasets to ensure fair representation.
* Current distribution:
 * - Tabular: 70 questions (35%)
 * - Nested: 50 questions (25%)
 * - Analytics: 40 questions (20%)
 * - GitHub: 40 questions (20%)
*/
import type { AnalyticsMetric, Employee, Order, Repository } from './datasets'
import type { Question } from './types'
import { consola } from 'consola'
import { datasets } from './datasets'
@@ -20,11 +28,11 @@ export function generateQuestions(): Question[] {
const questions: Question[] = []
let idCounter = 1
// Get datasets
const tabular = datasets.find(d => d.name === 'tabular')?.data.employees as any[] || []
const nested = datasets.find(d => d.name === 'nested')?.data.orders as any[] || []
const analytics = datasets.find(d => d.name === 'analytics')?.data.metrics as any[] || []
const github = datasets.find(d => d.name === 'github')?.data.repositories as any[] || []
// Get datasets with proper typing
const tabular = (datasets.find(d => d.name === 'tabular')?.data.employees as Employee[]) || []
const nested = (datasets.find(d => d.name === 'nested')?.data.orders as Order[]) || []
const analytics = (datasets.find(d => d.name === 'analytics')?.data.metrics as AnalyticsMetric[]) || []
const github = (datasets.find(d => d.name === 'github')?.data.repositories as Repository[]) || []
// ========================================
// TABULAR DATASET QUESTIONS (70 questions)
@@ -68,9 +76,9 @@ export function generateQuestions(): Question[] {
}
// Aggregation: count by department
const departments = [...new Set(tabular.map((e: any) => e.department))]
const departments = [...new Set(tabular.map(e => e.department))]
for (const dept of departments.slice(0, 6)) {
const count = tabular.filter((e: any) => e.department === dept).length
const count = tabular.filter(e => e.department === dept).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many employees work in ${dept}?`,
@@ -83,7 +91,7 @@ export function generateQuestions(): Question[] {
// Aggregation: salary ranges (4 questions)
const salaryThresholds = [60000, 80000, 100000, 120000]
for (const threshold of salaryThresholds) {
const count = tabular.filter((e: any) => e.salary > threshold).length
const count = tabular.filter(e => e.salary > threshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many employees have a salary greater than ${threshold}?`,
@@ -94,8 +102,8 @@ export function generateQuestions(): Question[] {
}
// Filtering: active status
const activeCount = tabular.filter((e: any) => e.active).length
const inactiveCount = tabular.filter((e: any) => !e.active).length
const activeCount = tabular.filter(e => e.active).length
const inactiveCount = tabular.filter(e => !e.active).length
questions.push(
{
id: `q${idCounter++}`,
@@ -115,7 +123,7 @@ export function generateQuestions(): Question[] {
// Complex filtering: multi-condition (8 questions)
for (const dept of departments.slice(0, 4)) {
const count = tabular.filter((e: any) => e.department === dept && e.salary > 80000).length
const count = tabular.filter(e => e.department === dept && e.salary > 80000).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many employees in ${dept} have a salary greater than 80000?`,
@@ -126,7 +134,7 @@ export function generateQuestions(): Question[] {
}
for (const exp of [5, 10]) {
const count = tabular.filter((e: any) => e.yearsExperience > exp && e.active).length
const count = tabular.filter(e => e.yearsExperience > exp && e.active).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many active employees have more than ${exp} years of experience?`,
@@ -184,9 +192,9 @@ export function generateQuestions(): Question[] {
}
// Aggregation: count by status
const statuses = [...new Set(nested.map((o: any) => o.status))]
const statuses = [...new Set(nested.map(o => o.status))]
for (const status of statuses) {
const count = nested.filter((o: any) => o.status === status).length
const count = nested.filter(o => o.status === status).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many orders have status "${status}"?`,
@@ -197,7 +205,7 @@ export function generateQuestions(): Question[] {
}
// Aggregation: total revenue
const totalRevenue = nested.reduce((sum: number, o: any) => sum + o.total, 0)
const totalRevenue = nested.reduce((sum, o) => sum + o.total, 0)
questions.push({
id: `q${idCounter++}`,
prompt: 'What is the total revenue across all orders?',
@@ -209,7 +217,7 @@ export function generateQuestions(): Question[] {
// Filtering: high-value orders (3 questions)
const highValueThresholds = [200, 400, 600]
for (const threshold of highValueThresholds) {
const count = nested.filter((o: any) => o.total > threshold).length
const count = nested.filter(o => o.total > threshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many orders have a total greater than ${threshold}?`,
@@ -252,9 +260,9 @@ export function generateQuestions(): Question[] {
}
// Aggregation: totals (4 questions)
const totalViews = analytics.reduce((sum: number, m: any) => sum + m.views, 0)
const totalRevenue = analytics.reduce((sum: number, m: any) => sum + m.revenue, 0)
const totalConversions = analytics.reduce((sum: number, m: any) => sum + m.conversions, 0)
const totalViews = analytics.reduce((sum, m) => sum + m.views, 0)
const totalRevenue = analytics.reduce((sum, m) => sum + m.revenue, 0)
const totalConversions = analytics.reduce((sum, m) => sum + m.conversions, 0)
questions.push(
{
@@ -283,7 +291,7 @@ export function generateQuestions(): Question[] {
// Filtering: high-performing days (10 questions)
const viewThresholds = [5000, 6000, 7000]
for (const threshold of viewThresholds) {
const count = analytics.filter((m: any) => m.views > threshold).length
const count = analytics.filter(m => m.views > threshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many days had more than ${threshold} views?`,
@@ -295,7 +303,7 @@ export function generateQuestions(): Question[] {
const conversionThresholds = [10, 20, 30]
for (const threshold of conversionThresholds) {
const count = analytics.filter((m: any) => m.conversions > threshold).length
const count = analytics.filter(m => m.conversions > threshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many days had more than ${threshold} conversions?`,
@@ -338,9 +346,9 @@ export function generateQuestions(): Question[] {
}
// Aggregation: count by owner (5 questions)
const owners = [...new Set(github.map((r: any) => r.owner))]
const owners = [...new Set(github.map(r => r.owner))]
for (const owner of owners.slice(0, 5)) {
const count = github.filter((r: any) => r.owner === owner).length
const count = github.filter(r => r.owner === owner).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many repositories does ${owner} have in the dataset?`,
@@ -351,7 +359,7 @@ export function generateQuestions(): Question[] {
}
// Aggregation: total stars
const totalStars = github.reduce((sum: number, r: any) => sum + r.stars, 0)
const totalStars = github.reduce((sum, r) => sum + r.stars, 0)
questions.push({
id: `q${idCounter++}`,
prompt: 'What is the total number of stars across all repositories?',
@@ -363,7 +371,7 @@ export function generateQuestions(): Question[] {
// Filtering: popular repos (8 questions)
const starThresholds = [10000, 50000, 100000]
for (const threshold of starThresholds) {
const count = github.filter((r: any) => r.stars > threshold).length
const count = github.filter(r => r.stars > threshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many repositories have more than ${threshold} stars?`,
@@ -375,7 +383,7 @@ export function generateQuestions(): Question[] {
const forkThresholds = [1000, 5000, 10000]
for (const threshold of forkThresholds) {
const count = github.filter((r: any) => r.forks > threshold).length
const count = github.filter(r => r.forks > threshold).length
questions.push({
id: `q${idCounter++}`,
prompt: `How many repositories have more than ${threshold} forks?`,

View File

@@ -12,10 +12,10 @@
import type { EvaluationResult, FormatResult, Question } from './types'
import * as fsp from 'node:fs/promises'
import * as path from 'node:path'
import { encode } from 'gpt-tokenizer'
import { BENCHMARKS_DIR } from './constants'
import { datasets } from './datasets'
import { models } from './evaluate'
import { createProgressBar, ensureDir, saveJsonFile, tokenize } from './utils'
/**
* Calculate per-format statistics from evaluation results
@@ -220,7 +220,7 @@ export function calculateTokenCounts(
for (const dataset of datasets) {
const formatted = formatter(dataset.data)
const key = `${formatName}-${dataset.name}`
tokenCounts[key] = encode(formatted).length
tokenCounts[key] = tokenize(formatted)
}
}
@@ -237,25 +237,22 @@ export async function saveResults(
tokenCounts: Record<string, number>,
): Promise<string> {
const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy')
await fsp.mkdir(resultsDir, { recursive: true })
await ensureDir(resultsDir)
// Save raw results
await fsp.writeFile(
path.join(resultsDir, 'raw-results.json'),
`${JSON.stringify(results, undefined, 2)}\n`,
)
await saveJsonFile(path.join(resultsDir, 'raw-results.json'), results)
// Save summary
await fsp.writeFile(
await saveJsonFile(
path.join(resultsDir, 'summary.json'),
`${JSON.stringify({
{
formatResults,
questions: questions.length,
models: Object.keys(models),
datasets: datasets.map(d => ({ name: d.name, description: d.description })),
tokenCounts,
timestamp: new Date().toISOString(),
}, undefined, 2)}\n`,
},
)
// Generate markdown report
@@ -267,12 +264,3 @@ export async function saveResults(
return resultsDir
}
/**
* Generate visual progress bar using ASCII characters (`█` for filled, `░` for empty)
*/
function createProgressBar(tokens: number, maxTokens: number, width = 30): string {
const filled = Math.round((tokens / maxTokens) * width)
const empty = width - filled
return '█'.repeat(filled) + '░'.repeat(empty)
}

68
benchmarks/src/utils.ts Normal file
View File

@@ -0,0 +1,68 @@
/**
* Shared utility functions for TOON benchmarks
*
* Provides common functionality used across multiple benchmark scripts:
* - Progress bar visualization
* - Token counting
* - File I/O operations
* - Retry logic for API calls
*/
import * as fsp from 'node:fs/promises'
import { encode } from 'gpt-tokenizer'
/**
 * Generate visual progress bar using ASCII characters
 *
 * @param value - Current value
 * @param max - Maximum value
 * @param width - Width of the bar in characters (default: 25)
 * @returns ASCII progress bar string (`█` for filled, `░` for empty)
 *
 * @remarks
 * The fill ratio is clamped to `[0, 1]`, so out-of-range inputs
 * (`value > max`, negative values) and a degenerate `max <= 0` render a
 * fully-filled or empty bar instead of making `String.prototype.repeat`
 * throw a `RangeError` on a negative count.
 *
 * @example
 * createProgressBar(75, 100, 20) // "███████████████░░░░░"
 * createProgressBar(0.5, 1, 10) // "█████░░░░░"
 */
export function createProgressBar(value: number, max: number, width = 25): string {
  const ratio = max > 0 ? Math.min(Math.max(value / max, 0), 1) : 0
  const filled = Math.round(ratio * width)
  return '█'.repeat(filled) + '░'.repeat(width - filled)
}
/**
 * Count tokens in text using gpt-tokenizer
 *
 * @param text - Text to tokenize
 * @returns Number of tokens
 *
 * @remarks
 * Uses whatever encoding the default `gpt-tokenizer` entry point provides
 * (o200k_base per the original comment) — NOTE(review): confirm against the
 * installed gpt-tokenizer version, since the default encoding differs
 * between that package's entry points.
 *
 * @example
 * tokenize("Hello, world!") // 4
 */
export function tokenize(text: string): number {
  return encode(text).length
}
/**
 * Ensure a directory exists, creating it recursively if needed
 *
 * With `recursive: true`, `mkdir` also creates missing parent directories
 * and is a no-op when the target already exists.
 *
 * @param dirPath - Directory path to ensure exists
 */
export async function ensureDir(dirPath: string): Promise<void> {
  const options = { recursive: true } as const
  await fsp.mkdir(dirPath, options)
}
/**
 * Save data as formatted JSON file
 *
 * Serializes `data` with the given indentation and appends a trailing
 * newline before writing, matching POSIX text-file conventions.
 *
 * @param filePath - Path to save the file
 * @param data - Data to serialize as JSON
 * @param indent - Indentation spaces (default: 2)
 */
export async function saveJsonFile(
  filePath: string,
  data: unknown,
  indent = 2,
): Promise<void> {
  const serialized = `${JSON.stringify(data, undefined, indent)}\n`
  await fsp.writeFile(filePath, serialized, 'utf-8')
}