From 4ec7e84f5f073333f60b0834ff5f74fb8487f9ce Mon Sep 17 00:00:00 2001 From: Johann Schopplich Date: Mon, 27 Oct 2025 17:37:27 +0100 Subject: [PATCH] refactor: shared utils for benchmark scripts --- benchmarks/scripts/accuracy-benchmark.ts | 17 +-- benchmarks/scripts/fetch-github-data.ts | 24 +++-- .../scripts/token-efficiency-benchmark.ts | 54 +++------- benchmarks/src/constants.ts | 10 ++ benchmarks/src/datasets.ts | 100 ++++++++++++++++-- benchmarks/src/formatters.ts | 34 ++++-- benchmarks/src/questions.ts | 60 ++++++----- benchmarks/src/report.ts | 26 ++--- benchmarks/src/utils.ts | 68 ++++++++++++ 9 files changed, 269 insertions(+), 124 deletions(-) create mode 100644 benchmarks/src/utils.ts diff --git a/benchmarks/scripts/accuracy-benchmark.ts b/benchmarks/scripts/accuracy-benchmark.ts index 1f0e3ab..70172a1 100644 --- a/benchmarks/scripts/accuracy-benchmark.ts +++ b/benchmarks/scripts/accuracy-benchmark.ts @@ -79,17 +79,6 @@ else { // Calculate token counts for all format+dataset combinations tokenCounts = calculateTokenCounts(formatters) - // Format datasets once (reuse for all questions) - const formattedDatasets: Record> = {} - - for (const [formatName, formatter] of Object.entries(formatters)) { - formattedDatasets[formatName] ??= {} - - for (const dataset of datasets) { - formattedDatasets[formatName]![dataset.name] = formatter(dataset.data) - } - } - // Generate evaluation tasks const tasks: { question: Question, formatName: string, modelName: string }[] = [] @@ -104,11 +93,13 @@ else { const total = tasks.length consola.start(`Running ${total} evaluations with concurrency: ${DEFAULT_CONCURRENCY}`) - // Evaluate all tasks in parallel results = await pMap( tasks, async (task, index) => { - const formattedData = formattedDatasets[task.formatName]![task.question.dataset]! + // Format data on-demand + const dataset = datasets.find(d => d.name === task.question.dataset)! + const formatter = formatters[task.formatName]! 
+ const formattedData = formatter(dataset.data) const model = activeModels[task.modelName as keyof typeof activeModels]! const result = await evaluateQuestion({ diff --git a/benchmarks/scripts/fetch-github-data.ts b/benchmarks/scripts/fetch-github-data.ts index 335dd77..1ded0ee 100644 --- a/benchmarks/scripts/fetch-github-data.ts +++ b/benchmarks/scripts/fetch-github-data.ts @@ -1,9 +1,10 @@ -import * as fsp from 'node:fs/promises' import * as path from 'node:path' import process from 'node:process' import { consola } from 'consola' import { ofetch } from 'ofetch' +import pMap from 'p-map' import { BENCHMARKS_DIR } from '../src/constants' +import { ensureDir, saveJsonFile } from '../src/utils' try { // Fetch top 100 repos from GitHub @@ -52,14 +53,15 @@ async function searchTop100Repos(): Promise { async function fetchRepoDetails(repoList: string[]): Promise[]> { consola.start(`Fetching ${repoList.length} GitHub repositories…`) - const repos: Record[] = [] - - for (let i = 0; i < repoList.length; i++) { - const repoPath = repoList[i]! 
- console.log(`[${i + 1}/${repoList.length}] Fetching ${repoPath}…`) - const { repo } = await await ofetch(`https://ungh.cc/repos/${repoPath}`) - repos.push(repo) - } + const repos = await pMap( + repoList, + async (repoPath, index) => { + consola.info(`[${index + 1}/${repoList.length}] Fetching ${repoPath}…`) + const { repo } = await ofetch(`https://ungh.cc/repos/${repoPath}`) + return repo + }, + { concurrency: 5 }, + ) consola.success(`Successfully fetched ${repos.length}/${repoList.length} repositories`) @@ -70,8 +72,8 @@ async function saveRepos(repos: Record[]): Promise { const outputDir = path.join(BENCHMARKS_DIR, 'data') const outputFile = path.join(outputDir, 'github-repos.json') - await fsp.mkdir(outputDir, { recursive: true }) - await fsp.writeFile(outputFile, JSON.stringify(repos, undefined, 2)) + await ensureDir(outputDir) + await saveJsonFile(outputFile, repos) const relativePath = path.relative(BENCHMARKS_DIR, outputFile) consola.info(`Saved to \`${relativePath}\``) diff --git a/benchmarks/scripts/token-efficiency-benchmark.ts b/benchmarks/scripts/token-efficiency-benchmark.ts index c110a12..ed1a14c 100644 --- a/benchmarks/scripts/token-efficiency-benchmark.ts +++ b/benchmarks/scripts/token-efficiency-benchmark.ts @@ -1,13 +1,12 @@ import * as fsp from 'node:fs/promises' import * as path from 'node:path' -import { faker } from '@faker-js/faker' import { consola } from 'consola' -import { encode as encodeTokens } from 'gpt-tokenizer' // o200k_base encoding (default) import { encode } from '../../src/index' import githubRepos from '../data/github-repos.json' with { type: 'json' } import { BENCHMARKS_DIR, ROOT_DIR } from '../src/constants' -import { generateAnalyticsData } from '../src/datasets' +import { generateAnalyticsData, generateOrderData } from '../src/datasets' import { formatters } from '../src/formatters' +import { createProgressBar, ensureDir, tokenize } from '../src/utils' interface BenchmarkResult { name: string @@ -45,7 +44,7 @@ const 
BENCHMARK_EXAMPLES = [ name: 'E-Commerce Order', emoji: 'πŸ›’', description: 'Single nested order with customer and items', - getData: generateOrder, + getData: generateOrderData, showDetailed: false, }, ] as const @@ -62,11 +61,11 @@ for (const example of BENCHMARK_EXAMPLES) { const jsonString = JSON.stringify(data, undefined, 2) const toonString = encode(data) - const xmlString = formatters.xml(data) + const xmlString = formatters.xml!(data) - const jsonTokens = encodeTokens(jsonString).length - const toonTokens = encodeTokens(toonString).length - const xmlTokens = encodeTokens(xmlString).length + const jsonTokens = tokenize(jsonString) + const toonTokens = tokenize(toonString) + const xmlTokens = tokenize(xmlString) const jsonSavings = jsonTokens - toonTokens const jsonSavingsPercent = ((jsonSavings / jsonTokens) * 100).toFixed(1) @@ -104,7 +103,7 @@ const totalXmlSavingsPercent = ((totalXmlSavings / totalXmlTokens) * 100).toFixe const datasetRows = results .map((result) => { const percentage = Number.parseFloat(result.jsonSavingsPercent) - const bar = generateBarChart(100 - percentage) // Invert to show TOON tokens + const bar = createProgressBar(100 - percentage, 100) // Invert to show TOON tokens const toonStr = result.toonTokens.toLocaleString('en-US') const jsonStr = result.jsonTokens.toLocaleString('en-US') const xmlStr = result.xmlTokens.toLocaleString('en-US') @@ -123,7 +122,7 @@ const separator = '──────────────────── // Calculate bar for totals (TOON vs average of JSON+XML) const averageComparisonTokens = (totalJsonTokens + totalXmlTokens) / 2 const totalPercentage = (totalToonTokens / averageComparisonTokens) * 100 -const totalBar = generateBarChart(totalPercentage) +const totalBar = createProgressBar(totalPercentage, 100) const totalLine1 = `Total ${totalBar} ${totalToonTokens.toLocaleString('en-US').padStart(6)} tokens` const totalLine2 = ` vs JSON: ${totalJsonTokens.toLocaleString('en-US').padStart(6)} πŸ’° ${totalJsonSavingsPercent}% saved` @@ 
-132,6 +131,8 @@ const totalLine3 = ` vs XML: ${totalXmlTokens.toLoc const barChartSection = `${datasetRows}\n\n${separator}\n${totalLine1}\n${totalLine2}\n${totalLine3}` // Generate detailed examples (only for selected examples) +// Note: Large datasets are truncated for display readability in the report. +// Token counts are calculated from the full datasets, not the truncated versions. const detailedExamples = results .filter(result => result.showDetailed) .map((result, i, filtered) => { @@ -187,38 +188,7 @@ ${detailedExamples} console.log(markdown) -await fsp.mkdir(path.join(BENCHMARKS_DIR, 'results'), { recursive: true }) +await ensureDir(path.join(BENCHMARKS_DIR, 'results')) await fsp.writeFile(outputFilePath, markdown, 'utf-8') consola.success(`Benchmark written to \`${path.relative(ROOT_DIR, outputFilePath)}\``) - -// Generate ASCII bar chart -function generateBarChart(percentage: number, maxWidth: number = 25): string { - const filled = Math.round((percentage / 100) * maxWidth) - const empty = maxWidth - filled - return 'β–ˆ'.repeat(filled) + 'β–‘'.repeat(empty) -} - -// Generate nested e-commerce order -function generateOrder() { - return { - orderId: faker.string.alphanumeric({ length: 12, casing: 'upper' }), - customer: { - id: faker.number.int({ min: 1000, max: 9999 }), - name: faker.person.fullName(), - email: faker.internet.email(), - phone: faker.phone.number(), - }, - items: Array.from({ length: faker.number.int({ min: 2, max: 5 }) }, () => ({ - sku: faker.string.alphanumeric({ length: 8, casing: 'upper' }), - name: faker.commerce.productName(), - quantity: faker.number.int({ min: 1, max: 5 }), - price: Number(faker.commerce.price({ min: 10, max: 200 })), - })), - subtotal: Number(faker.commerce.price({ min: 100, max: 500 })), - tax: Number(faker.commerce.price({ min: 10, max: 50 })), - total: Number(faker.commerce.price({ min: 110, max: 550 })), - status: faker.helpers.arrayElement(['pending', 'processing', 'shipped', 'delivered']), - createdAt: 
faker.date.recent({ days: 7 }).toISOString(), - } -} diff --git a/benchmarks/src/constants.ts b/benchmarks/src/constants.ts index 6434dde..e9301d9 100644 --- a/benchmarks/src/constants.ts +++ b/benchmarks/src/constants.ts @@ -9,6 +9,16 @@ export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.me */ export const DEFAULT_CONCURRENCY = 20 +/** + * Progress bar configuration + */ +export const PROGRESS_BAR = { + /** Default width for progress bars */ + defaultWidth: 25, + /** Compact width for inline displays */ + compactWidth: 20, +} as const + /** * Enable dry run mode for quick testing with limited AI requests * diff --git a/benchmarks/src/datasets.ts b/benchmarks/src/datasets.ts index a9afd77..49de5fb 100644 --- a/benchmarks/src/datasets.ts +++ b/benchmarks/src/datasets.ts @@ -14,7 +14,48 @@ import githubRepos from '../data/github-repos.json' with { type: 'json' } // Seed for reproducibility faker.seed(12345) -interface AnalyticsMetric { +/** + * Employee record structure for tabular dataset + */ +export interface Employee { + id: number + name: string + email: string + department: string + salary: number + yearsExperience: number + active: boolean +} + +/** + * E-commerce order structure for nested dataset + */ +export interface Order { + orderId: string + customer: { + id: number + name: string + email: string + phone: string + } + items: { + sku: string + name: string + quantity: number + price: number + }[] + subtotal: number + tax: number + total: number + status: string + orderDate?: string + createdAt?: string +} + +/** + * Analytics metric structure for time-series dataset + */ +export interface AnalyticsMetric { date: string views: number clicks: number @@ -24,7 +65,25 @@ interface AnalyticsMetric { } /** - * Generate analytics time-series data with reproducible seeded randomness + * GitHub repository structure for real-world dataset + */ +export interface Repository { + id: number + name: string + owner: string + repo: string + 
description: string + stars: number + watchers: number + forks: number + defaultBranch: string + createdAt: string + updatedAt: string + pushedAt: string +} + +/** + * Generate analytics time-series data */ export function generateAnalyticsData(days: number, startDate = '2025-01-01'): { metrics: AnalyticsMetric[] @@ -63,12 +122,12 @@ export function generateAnalyticsData(days: number, startDate = '2025-01-01'): { * @remarks * Tests TOON's tabular array format */ -const departments = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] +const departments: readonly string[] = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] as const const tabularDataset: Dataset = { name: 'tabular', description: 'Uniform employee records (TOON optimal format)', data: { - employees: Array.from({ length: 100 }, (_, i) => { + employees: Array.from({ length: 100 }, (_, i): Employee => { const yearsExp = faker.number.int({ min: 1, max: 20 }) return { id: i + 1, @@ -89,8 +148,8 @@ const tabularDataset: Dataset = { * @remarks * Tests TOON's handling of complex nested objects */ -const productNames = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp'] -const statuses = ['pending', 'processing', 'shipped', 'delivered', 'cancelled'] +const productNames: readonly string[] = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp'] as const +const statuses: readonly string[] = ['pending', 'processing', 'shipped', 'delivered', 'cancelled'] as const const nestedDataset: Dataset = { name: 'nested', @@ -155,6 +214,35 @@ const githubDataset: Dataset = { }, } +/** + * Generate a single e-commerce order with nested structure + * + * @remarks + * Used for token efficiency benchmarks + */ +export function generateOrderData(): Order { + return { + orderId: faker.string.alphanumeric({ length: 12, casing: 'upper' }), + customer: { + id: faker.number.int({ min: 1000, 
max: 9999 }), + name: faker.person.fullName(), + email: faker.internet.email(), + phone: faker.phone.number(), + }, + items: Array.from({ length: faker.number.int({ min: 2, max: 5 }) }, () => ({ + sku: faker.string.alphanumeric({ length: 8, casing: 'upper' }), + name: faker.commerce.productName(), + quantity: faker.number.int({ min: 1, max: 5 }), + price: Number(faker.commerce.price({ min: 10, max: 200 })), + })), + subtotal: Number(faker.commerce.price({ min: 100, max: 500 })), + tax: Number(faker.commerce.price({ min: 10, max: 50 })), + total: Number(faker.commerce.price({ min: 110, max: 550 })), + status: faker.helpers.arrayElement(['pending', 'processing', 'shipped', 'delivered']), + createdAt: faker.date.recent({ days: 7 }).toISOString(), + } +} + /** * All datasets used in the benchmark */ diff --git a/benchmarks/src/formatters.ts b/benchmarks/src/formatters.ts index 64aabef..4412d60 100644 --- a/benchmarks/src/formatters.ts +++ b/benchmarks/src/formatters.ts @@ -1,12 +1,18 @@ /** * Format converters for TOON benchmarks * - * Converts data to different formats: + * Converts data to different formats for comparison: * - JSON * - TOON * - CSV * - XML * - YAML + * + * ## Semantic Equivalence + * + * All formatters attempt to preserve semantic equivalence with the source data, + * meaning the converted data should represent the same information. However, + * CSV has inherent limitations with nested structures (see `toCSV` docs). 
*/ import { stringify as stringifyCSV } from 'csv-stringify/sync' @@ -14,12 +20,17 @@ import { XMLBuilder } from 'fast-xml-parser' import { stringify as stringifyYAML } from 'yaml' import { encode as encodeToon } from '../../src/index' -export const formatters = { - json: (data: unknown): string => JSON.stringify(data, undefined, 2), - toon: (data: unknown): string => encodeToon(data), - csv: (data: unknown): string => toCSV(data), - xml: (data: unknown): string => toXML(data), - yaml: (data: unknown): string => stringifyYAML(data), +/** + * Format converters registry + * + * Each formatter takes unknown data and returns a string representation + */ +export const formatters: Record string> = { + json: data => JSON.stringify(data, undefined, 2), + toon: data => encodeToon(data), + csv: data => toCSV(data), + xml: data => toXML(data), + yaml: data => stringifyYAML(data), } /** @@ -57,6 +68,15 @@ function toCSV(data: unknown): string { return '' } +/** + * Convert data to XML format + * + * @remarks + * Uses fast-xml-parser to generate well-formatted XML with: + * - 2-space indentation for readability + * - Empty nodes suppressed + * - Proper escaping of special characters + */ function toXML(data: unknown): string { const builder = new XMLBuilder({ format: true, diff --git a/benchmarks/src/questions.ts b/benchmarks/src/questions.ts index 4c27c33..41f4481 100644 --- a/benchmarks/src/questions.ts +++ b/benchmarks/src/questions.ts @@ -7,8 +7,16 @@ * - Filtering (25%): "List/count X where Y" * * Questions are generated dynamically based on actual data values + * + * TODO: Balance question distribution across datasets to ensure fair representation. 
+ * Current distribution: + * - Tabular: 70 questions (35%) + * - Nested: 50 questions (25%) + * - Analytics: 40 questions (20%) + * - GitHub: 40 questions (20%) */ +import type { AnalyticsMetric, Employee, Order, Repository } from './datasets' import type { Question } from './types' import { consola } from 'consola' import { datasets } from './datasets' @@ -20,11 +28,11 @@ export function generateQuestions(): Question[] { const questions: Question[] = [] let idCounter = 1 - // Get datasets - const tabular = datasets.find(d => d.name === 'tabular')?.data.employees as any[] || [] - const nested = datasets.find(d => d.name === 'nested')?.data.orders as any[] || [] - const analytics = datasets.find(d => d.name === 'analytics')?.data.metrics as any[] || [] - const github = datasets.find(d => d.name === 'github')?.data.repositories as any[] || [] + // Get datasets with proper typing + const tabular = (datasets.find(d => d.name === 'tabular')?.data.employees as Employee[]) || [] + const nested = (datasets.find(d => d.name === 'nested')?.data.orders as Order[]) || [] + const analytics = (datasets.find(d => d.name === 'analytics')?.data.metrics as AnalyticsMetric[]) || [] + const github = (datasets.find(d => d.name === 'github')?.data.repositories as Repository[]) || [] // ======================================== // TABULAR DATASET QUESTIONS (70 questions) // ======================================== @@ -68,9 +76,9 @@ export function generateQuestions(): Question[] { } // Aggregation: count by department - const departments = [...new Set(tabular.map((e: any) => e.department))] + const departments = [...new Set(tabular.map(e => e.department))] for (const dept of departments.slice(0, 6)) { - const count = tabular.filter((e: any) => e.department === dept).length + const count = tabular.filter(e => e.department === dept).length questions.push({ id: `q${idCounter++}`, prompt: `How many employees work in ${dept}?`, @@ -83,7 +91,7 @@ export function generateQuestions(): Question[] { // Aggregation: salary ranges (4
questions) const salaryThresholds = [60000, 80000, 100000, 120000] for (const threshold of salaryThresholds) { - const count = tabular.filter((e: any) => e.salary > threshold).length + const count = tabular.filter(e => e.salary > threshold).length questions.push({ id: `q${idCounter++}`, prompt: `How many employees have a salary greater than ${threshold}?`, @@ -94,8 +102,8 @@ export function generateQuestions(): Question[] { } // Filtering: active status - const activeCount = tabular.filter((e: any) => e.active).length - const inactiveCount = tabular.filter((e: any) => !e.active).length + const activeCount = tabular.filter(e => e.active).length + const inactiveCount = tabular.filter(e => !e.active).length questions.push( { id: `q${idCounter++}`, @@ -115,7 +123,7 @@ export function generateQuestions(): Question[] { // Complex filtering: multi-condition (8 questions) for (const dept of departments.slice(0, 4)) { - const count = tabular.filter((e: any) => e.department === dept && e.salary > 80000).length + const count = tabular.filter(e => e.department === dept && e.salary > 80000).length questions.push({ id: `q${idCounter++}`, prompt: `How many employees in ${dept} have a salary greater than 80000?`, @@ -126,7 +134,7 @@ export function generateQuestions(): Question[] { } for (const exp of [5, 10]) { - const count = tabular.filter((e: any) => e.yearsExperience > exp && e.active).length + const count = tabular.filter(e => e.yearsExperience > exp && e.active).length questions.push({ id: `q${idCounter++}`, prompt: `How many active employees have more than ${exp} years of experience?`, @@ -184,9 +192,9 @@ export function generateQuestions(): Question[] { } // Aggregation: count by status - const statuses = [...new Set(nested.map((o: any) => o.status))] + const statuses = [...new Set(nested.map(o => o.status))] for (const status of statuses) { - const count = nested.filter((o: any) => o.status === status).length + const count = nested.filter(o => o.status === status).length 
questions.push({ id: `q${idCounter++}`, prompt: `How many orders have status "${status}"?`, @@ -197,7 +205,7 @@ export function generateQuestions(): Question[] { } // Aggregation: total revenue - const totalRevenue = nested.reduce((sum: number, o: any) => sum + o.total, 0) + const totalRevenue = nested.reduce((sum, o) => sum + o.total, 0) questions.push({ id: `q${idCounter++}`, prompt: 'What is the total revenue across all orders?', @@ -209,7 +217,7 @@ export function generateQuestions(): Question[] { // Filtering: high-value orders (3 questions) const highValueThresholds = [200, 400, 600] for (const threshold of highValueThresholds) { - const count = nested.filter((o: any) => o.total > threshold).length + const count = nested.filter(o => o.total > threshold).length questions.push({ id: `q${idCounter++}`, prompt: `How many orders have a total greater than ${threshold}?`, @@ -252,9 +260,9 @@ export function generateQuestions(): Question[] { } // Aggregation: totals (4 questions) - const totalViews = analytics.reduce((sum: number, m: any) => sum + m.views, 0) - const totalRevenue = analytics.reduce((sum: number, m: any) => sum + m.revenue, 0) - const totalConversions = analytics.reduce((sum: number, m: any) => sum + m.conversions, 0) + const totalViews = analytics.reduce((sum, m) => sum + m.views, 0) + const totalRevenue = analytics.reduce((sum, m) => sum + m.revenue, 0) + const totalConversions = analytics.reduce((sum, m) => sum + m.conversions, 0) questions.push( { @@ -283,7 +291,7 @@ export function generateQuestions(): Question[] { // Filtering: high-performing days (10 questions) const viewThresholds = [5000, 6000, 7000] for (const threshold of viewThresholds) { - const count = analytics.filter((m: any) => m.views > threshold).length + const count = analytics.filter(m => m.views > threshold).length questions.push({ id: `q${idCounter++}`, prompt: `How many days had more than ${threshold} views?`, @@ -295,7 +303,7 @@ export function generateQuestions(): Question[] 
{ const conversionThresholds = [10, 20, 30] for (const threshold of conversionThresholds) { - const count = analytics.filter((m: any) => m.conversions > threshold).length + const count = analytics.filter(m => m.conversions > threshold).length questions.push({ id: `q${idCounter++}`, prompt: `How many days had more than ${threshold} conversions?`, @@ -338,9 +346,9 @@ export function generateQuestions(): Question[] { } // Aggregation: count by owner (5 questions) - const owners = [...new Set(github.map((r: any) => r.owner))] + const owners = [...new Set(github.map(r => r.owner))] for (const owner of owners.slice(0, 5)) { - const count = github.filter((r: any) => r.owner === owner).length + const count = github.filter(r => r.owner === owner).length questions.push({ id: `q${idCounter++}`, prompt: `How many repositories does ${owner} have in the dataset?`, @@ -351,7 +359,7 @@ export function generateQuestions(): Question[] { } // Aggregation: total stars - const totalStars = github.reduce((sum: number, r: any) => sum + r.stars, 0) + const totalStars = github.reduce((sum, r) => sum + r.stars, 0) questions.push({ id: `q${idCounter++}`, prompt: 'What is the total number of stars across all repositories?', @@ -363,7 +371,7 @@ export function generateQuestions(): Question[] { // Filtering: popular repos (8 questions) const starThresholds = [10000, 50000, 100000] for (const threshold of starThresholds) { - const count = github.filter((r: any) => r.stars > threshold).length + const count = github.filter(r => r.stars > threshold).length questions.push({ id: `q${idCounter++}`, prompt: `How many repositories have more than ${threshold} stars?`, @@ -375,7 +383,7 @@ export function generateQuestions(): Question[] { const forkThresholds = [1000, 5000, 10000] for (const threshold of forkThresholds) { - const count = github.filter((r: any) => r.forks > threshold).length + const count = github.filter(r => r.forks > threshold).length questions.push({ id: `q${idCounter++}`, prompt: `How 
many repositories have more than ${threshold} forks?`, diff --git a/benchmarks/src/report.ts b/benchmarks/src/report.ts index af41f26..3a8fdda 100644 --- a/benchmarks/src/report.ts +++ b/benchmarks/src/report.ts @@ -12,10 +12,10 @@ import type { EvaluationResult, FormatResult, Question } from './types' import * as fsp from 'node:fs/promises' import * as path from 'node:path' -import { encode } from 'gpt-tokenizer' import { BENCHMARKS_DIR } from './constants' import { datasets } from './datasets' import { models } from './evaluate' +import { createProgressBar, ensureDir, saveJsonFile, tokenize } from './utils' /** * Calculate per-format statistics from evaluation results @@ -220,7 +220,7 @@ export function calculateTokenCounts( for (const dataset of datasets) { const formatted = formatter(dataset.data) const key = `${formatName}-${dataset.name}` - tokenCounts[key] = encode(formatted).length + tokenCounts[key] = tokenize(formatted) } } @@ -237,25 +237,22 @@ export async function saveResults( tokenCounts: Record, ): Promise { const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy') - await fsp.mkdir(resultsDir, { recursive: true }) + await ensureDir(resultsDir) // Save raw results - await fsp.writeFile( - path.join(resultsDir, 'raw-results.json'), - `${JSON.stringify(results, undefined, 2)}\n`, - ) + await saveJsonFile(path.join(resultsDir, 'raw-results.json'), results) // Save summary - await fsp.writeFile( + await saveJsonFile( path.join(resultsDir, 'summary.json'), - `${JSON.stringify({ + { formatResults, questions: questions.length, models: Object.keys(models), datasets: datasets.map(d => ({ name: d.name, description: d.description })), tokenCounts, timestamp: new Date().toISOString(), - }, undefined, 2)}\n`, + }, ) // Generate markdown report @@ -267,12 +264,3 @@ export async function saveResults( return resultsDir } - -/** - * Generate visual progress bar using ASCII characters (`β–ˆ` for filled, `β–‘` for empty) - */ -function 
createProgressBar(tokens: number, maxTokens: number, width = 30): string { - const filled = Math.round((tokens / maxTokens) * width) - const empty = width - filled - return 'β–ˆ'.repeat(filled) + 'β–‘'.repeat(empty) -} diff --git a/benchmarks/src/utils.ts b/benchmarks/src/utils.ts new file mode 100644 index 0000000..3b0a735 --- /dev/null +++ b/benchmarks/src/utils.ts @@ -0,0 +1,68 @@ +/** + * Shared utility functions for TOON benchmarks + * + * Provides common functionality used across multiple benchmark scripts: + * - Progress bar visualization + * - Token counting + * - File I/O operations + * - Retry logic for API calls + */ + +import * as fsp from 'node:fs/promises' +import { encode } from 'gpt-tokenizer' + +/** + * Generate visual progress bar using ASCII characters + * + * @param value - Current value + * @param max - Maximum value + * @param width - Width of the bar in characters (default: 25) + * @returns ASCII progress bar string (`β–ˆ` for filled, `β–‘` for empty) + * + * @example + * createProgressBar(75, 100, 20) // "β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‘β–‘β–‘β–‘β–‘" + * createProgressBar(0.5, 1, 10) // "β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‘β–‘β–‘β–‘β–‘" + */ +export function createProgressBar(value: number, max: number, width = 25): string { + const filled = Math.round((value / max) * width) + const empty = width - filled + return 'β–ˆ'.repeat(filled) + 'β–‘'.repeat(empty) +} + +/** + * Count tokens in text using gpt-tokenizer (o200k_base encoding) + * + * @param text - Text to tokenize + * @returns Number of tokens + * + * @example + * tokenize("Hello, world!") // 4 + */ +export function tokenize(text: string): number { + return encode(text).length +} + +/** + * Ensure a directory exists, creating it recursively if needed + * + * @param dirPath - Directory path to ensure exists + */ +export async function ensureDir(dirPath: string): Promise { + await fsp.mkdir(dirPath, { recursive: true }) +} + +/** + * Save data as formatted JSON file + * + * @param filePath 
- Path to save the file + * @param data - Data to serialize as JSON + * @param indent - Indentation spaces (default: 2) + */ +export async function saveJsonFile( + filePath: string, + data: unknown, + indent = 2, +): Promise { + const json = JSON.stringify(data, undefined, indent) + await fsp.writeFile(filePath, `${json}\n`, 'utf-8') +}