mirror of
https://github.com/voson-wang/toon.git
synced 2026-01-29 15:24:10 +08:00
refactor: shared utils for benchmark scripts
This commit is contained in:
@@ -79,17 +79,6 @@ else {
|
|||||||
// Calculate token counts for all format+dataset combinations
|
// Calculate token counts for all format+dataset combinations
|
||||||
tokenCounts = calculateTokenCounts(formatters)
|
tokenCounts = calculateTokenCounts(formatters)
|
||||||
|
|
||||||
// Format datasets once (reuse for all questions)
|
|
||||||
const formattedDatasets: Record<string, Record<string, string>> = {}
|
|
||||||
|
|
||||||
for (const [formatName, formatter] of Object.entries(formatters)) {
|
|
||||||
formattedDatasets[formatName] ??= {}
|
|
||||||
|
|
||||||
for (const dataset of datasets) {
|
|
||||||
formattedDatasets[formatName]![dataset.name] = formatter(dataset.data)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Generate evaluation tasks
|
// Generate evaluation tasks
|
||||||
const tasks: { question: Question, formatName: string, modelName: string }[] = []
|
const tasks: { question: Question, formatName: string, modelName: string }[] = []
|
||||||
|
|
||||||
@@ -104,11 +93,13 @@ else {
|
|||||||
const total = tasks.length
|
const total = tasks.length
|
||||||
consola.start(`Running ${total} evaluations with concurrency: ${DEFAULT_CONCURRENCY}`)
|
consola.start(`Running ${total} evaluations with concurrency: ${DEFAULT_CONCURRENCY}`)
|
||||||
|
|
||||||
// Evaluate all tasks in parallel
|
|
||||||
results = await pMap(
|
results = await pMap(
|
||||||
tasks,
|
tasks,
|
||||||
async (task, index) => {
|
async (task, index) => {
|
||||||
const formattedData = formattedDatasets[task.formatName]![task.question.dataset]!
|
// Format data on-demand
|
||||||
|
const dataset = datasets.find(d => d.name === task.question.dataset)!
|
||||||
|
const formatter = formatters[task.formatName]!
|
||||||
|
const formattedData = formatter(dataset.data)
|
||||||
const model = activeModels[task.modelName as keyof typeof activeModels]!
|
const model = activeModels[task.modelName as keyof typeof activeModels]!
|
||||||
|
|
||||||
const result = await evaluateQuestion({
|
const result = await evaluateQuestion({
|
||||||
|
|||||||
@@ -1,9 +1,10 @@
|
|||||||
import * as fsp from 'node:fs/promises'
|
|
||||||
import * as path from 'node:path'
|
import * as path from 'node:path'
|
||||||
import process from 'node:process'
|
import process from 'node:process'
|
||||||
import { consola } from 'consola'
|
import { consola } from 'consola'
|
||||||
import { ofetch } from 'ofetch'
|
import { ofetch } from 'ofetch'
|
||||||
|
import pMap from 'p-map'
|
||||||
import { BENCHMARKS_DIR } from '../src/constants'
|
import { BENCHMARKS_DIR } from '../src/constants'
|
||||||
|
import { ensureDir, saveJsonFile } from '../src/utils'
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Fetch top 100 repos from GitHub
|
// Fetch top 100 repos from GitHub
|
||||||
@@ -52,14 +53,15 @@ async function searchTop100Repos(): Promise<string[]> {
|
|||||||
async function fetchRepoDetails(repoList: string[]): Promise<Record<string, any>[]> {
|
async function fetchRepoDetails(repoList: string[]): Promise<Record<string, any>[]> {
|
||||||
consola.start(`Fetching ${repoList.length} GitHub repositories…`)
|
consola.start(`Fetching ${repoList.length} GitHub repositories…`)
|
||||||
|
|
||||||
const repos: Record<string, any>[] = []
|
const repos = await pMap(
|
||||||
|
repoList,
|
||||||
for (let i = 0; i < repoList.length; i++) {
|
async (repoPath, index) => {
|
||||||
const repoPath = repoList[i]!
|
consola.info(`[${index + 1}/${repoList.length}] Fetching ${repoPath}…`)
|
||||||
console.log(`[${i + 1}/${repoList.length}] Fetching ${repoPath}…`)
|
const { repo } = await ofetch(`https://ungh.cc/repos/${repoPath}`)
|
||||||
const { repo } = await await ofetch(`https://ungh.cc/repos/${repoPath}`)
|
return repo
|
||||||
repos.push(repo)
|
},
|
||||||
}
|
{ concurrency: 5 },
|
||||||
|
)
|
||||||
|
|
||||||
consola.success(`Successfully fetched ${repos.length}/${repoList.length} repositories`)
|
consola.success(`Successfully fetched ${repos.length}/${repoList.length} repositories`)
|
||||||
|
|
||||||
@@ -70,8 +72,8 @@ async function saveRepos(repos: Record<string, any>[]): Promise<void> {
|
|||||||
const outputDir = path.join(BENCHMARKS_DIR, 'data')
|
const outputDir = path.join(BENCHMARKS_DIR, 'data')
|
||||||
const outputFile = path.join(outputDir, 'github-repos.json')
|
const outputFile = path.join(outputDir, 'github-repos.json')
|
||||||
|
|
||||||
await fsp.mkdir(outputDir, { recursive: true })
|
await ensureDir(outputDir)
|
||||||
await fsp.writeFile(outputFile, JSON.stringify(repos, undefined, 2))
|
await saveJsonFile(outputFile, repos)
|
||||||
|
|
||||||
const relativePath = path.relative(BENCHMARKS_DIR, outputFile)
|
const relativePath = path.relative(BENCHMARKS_DIR, outputFile)
|
||||||
consola.info(`Saved to \`${relativePath}\``)
|
consola.info(`Saved to \`${relativePath}\``)
|
||||||
|
|||||||
@@ -1,13 +1,12 @@
|
|||||||
import * as fsp from 'node:fs/promises'
|
import * as fsp from 'node:fs/promises'
|
||||||
import * as path from 'node:path'
|
import * as path from 'node:path'
|
||||||
import { faker } from '@faker-js/faker'
|
|
||||||
import { consola } from 'consola'
|
import { consola } from 'consola'
|
||||||
import { encode as encodeTokens } from 'gpt-tokenizer' // o200k_base encoding (default)
|
|
||||||
import { encode } from '../../src/index'
|
import { encode } from '../../src/index'
|
||||||
import githubRepos from '../data/github-repos.json' with { type: 'json' }
|
import githubRepos from '../data/github-repos.json' with { type: 'json' }
|
||||||
import { BENCHMARKS_DIR, ROOT_DIR } from '../src/constants'
|
import { BENCHMARKS_DIR, ROOT_DIR } from '../src/constants'
|
||||||
import { generateAnalyticsData } from '../src/datasets'
|
import { generateAnalyticsData, generateOrderData } from '../src/datasets'
|
||||||
import { formatters } from '../src/formatters'
|
import { formatters } from '../src/formatters'
|
||||||
|
import { createProgressBar, ensureDir, tokenize } from '../src/utils'
|
||||||
|
|
||||||
interface BenchmarkResult {
|
interface BenchmarkResult {
|
||||||
name: string
|
name: string
|
||||||
@@ -45,7 +44,7 @@ const BENCHMARK_EXAMPLES = [
|
|||||||
name: 'E-Commerce Order',
|
name: 'E-Commerce Order',
|
||||||
emoji: '🛒',
|
emoji: '🛒',
|
||||||
description: 'Single nested order with customer and items',
|
description: 'Single nested order with customer and items',
|
||||||
getData: generateOrder,
|
getData: generateOrderData,
|
||||||
showDetailed: false,
|
showDetailed: false,
|
||||||
},
|
},
|
||||||
] as const
|
] as const
|
||||||
@@ -62,11 +61,11 @@ for (const example of BENCHMARK_EXAMPLES) {
|
|||||||
|
|
||||||
const jsonString = JSON.stringify(data, undefined, 2)
|
const jsonString = JSON.stringify(data, undefined, 2)
|
||||||
const toonString = encode(data)
|
const toonString = encode(data)
|
||||||
const xmlString = formatters.xml(data)
|
const xmlString = formatters.xml!(data)
|
||||||
|
|
||||||
const jsonTokens = encodeTokens(jsonString).length
|
const jsonTokens = tokenize(jsonString)
|
||||||
const toonTokens = encodeTokens(toonString).length
|
const toonTokens = tokenize(toonString)
|
||||||
const xmlTokens = encodeTokens(xmlString).length
|
const xmlTokens = tokenize(xmlString)
|
||||||
|
|
||||||
const jsonSavings = jsonTokens - toonTokens
|
const jsonSavings = jsonTokens - toonTokens
|
||||||
const jsonSavingsPercent = ((jsonSavings / jsonTokens) * 100).toFixed(1)
|
const jsonSavingsPercent = ((jsonSavings / jsonTokens) * 100).toFixed(1)
|
||||||
@@ -104,7 +103,7 @@ const totalXmlSavingsPercent = ((totalXmlSavings / totalXmlTokens) * 100).toFixe
|
|||||||
const datasetRows = results
|
const datasetRows = results
|
||||||
.map((result) => {
|
.map((result) => {
|
||||||
const percentage = Number.parseFloat(result.jsonSavingsPercent)
|
const percentage = Number.parseFloat(result.jsonSavingsPercent)
|
||||||
const bar = generateBarChart(100 - percentage) // Invert to show TOON tokens
|
const bar = createProgressBar(100 - percentage, 100) // Invert to show TOON tokens
|
||||||
const toonStr = result.toonTokens.toLocaleString('en-US')
|
const toonStr = result.toonTokens.toLocaleString('en-US')
|
||||||
const jsonStr = result.jsonTokens.toLocaleString('en-US')
|
const jsonStr = result.jsonTokens.toLocaleString('en-US')
|
||||||
const xmlStr = result.xmlTokens.toLocaleString('en-US')
|
const xmlStr = result.xmlTokens.toLocaleString('en-US')
|
||||||
@@ -123,7 +122,7 @@ const separator = '────────────────────
|
|||||||
// Calculate bar for totals (TOON vs average of JSON+XML)
|
// Calculate bar for totals (TOON vs average of JSON+XML)
|
||||||
const averageComparisonTokens = (totalJsonTokens + totalXmlTokens) / 2
|
const averageComparisonTokens = (totalJsonTokens + totalXmlTokens) / 2
|
||||||
const totalPercentage = (totalToonTokens / averageComparisonTokens) * 100
|
const totalPercentage = (totalToonTokens / averageComparisonTokens) * 100
|
||||||
const totalBar = generateBarChart(totalPercentage)
|
const totalBar = createProgressBar(totalPercentage, 100)
|
||||||
|
|
||||||
const totalLine1 = `Total ${totalBar} ${totalToonTokens.toLocaleString('en-US').padStart(6)} tokens`
|
const totalLine1 = `Total ${totalBar} ${totalToonTokens.toLocaleString('en-US').padStart(6)} tokens`
|
||||||
const totalLine2 = ` vs JSON: ${totalJsonTokens.toLocaleString('en-US').padStart(6)} 💰 ${totalJsonSavingsPercent}% saved`
|
const totalLine2 = ` vs JSON: ${totalJsonTokens.toLocaleString('en-US').padStart(6)} 💰 ${totalJsonSavingsPercent}% saved`
|
||||||
@@ -132,6 +131,8 @@ const totalLine3 = ` vs XML: ${totalXmlTokens.toLoc
|
|||||||
const barChartSection = `${datasetRows}\n\n${separator}\n${totalLine1}\n${totalLine2}\n${totalLine3}`
|
const barChartSection = `${datasetRows}\n\n${separator}\n${totalLine1}\n${totalLine2}\n${totalLine3}`
|
||||||
|
|
||||||
// Generate detailed examples (only for selected examples)
|
// Generate detailed examples (only for selected examples)
|
||||||
|
// Note: Large datasets are truncated for display readability in the report.
|
||||||
|
// Token counts are calculated from the full datasets, not the truncated versions.
|
||||||
const detailedExamples = results
|
const detailedExamples = results
|
||||||
.filter(result => result.showDetailed)
|
.filter(result => result.showDetailed)
|
||||||
.map((result, i, filtered) => {
|
.map((result, i, filtered) => {
|
||||||
@@ -187,38 +188,7 @@ ${detailedExamples}
|
|||||||
|
|
||||||
console.log(markdown)
|
console.log(markdown)
|
||||||
|
|
||||||
await fsp.mkdir(path.join(BENCHMARKS_DIR, 'results'), { recursive: true })
|
await ensureDir(path.join(BENCHMARKS_DIR, 'results'))
|
||||||
await fsp.writeFile(outputFilePath, markdown, 'utf-8')
|
await fsp.writeFile(outputFilePath, markdown, 'utf-8')
|
||||||
|
|
||||||
consola.success(`Benchmark written to \`${path.relative(ROOT_DIR, outputFilePath)}\``)
|
consola.success(`Benchmark written to \`${path.relative(ROOT_DIR, outputFilePath)}\``)
|
||||||
|
|
||||||
// Generate ASCII bar chart
|
|
||||||
function generateBarChart(percentage: number, maxWidth: number = 25): string {
|
|
||||||
const filled = Math.round((percentage / 100) * maxWidth)
|
|
||||||
const empty = maxWidth - filled
|
|
||||||
return '█'.repeat(filled) + '░'.repeat(empty)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Generate nested e-commerce order
|
|
||||||
function generateOrder() {
|
|
||||||
return {
|
|
||||||
orderId: faker.string.alphanumeric({ length: 12, casing: 'upper' }),
|
|
||||||
customer: {
|
|
||||||
id: faker.number.int({ min: 1000, max: 9999 }),
|
|
||||||
name: faker.person.fullName(),
|
|
||||||
email: faker.internet.email(),
|
|
||||||
phone: faker.phone.number(),
|
|
||||||
},
|
|
||||||
items: Array.from({ length: faker.number.int({ min: 2, max: 5 }) }, () => ({
|
|
||||||
sku: faker.string.alphanumeric({ length: 8, casing: 'upper' }),
|
|
||||||
name: faker.commerce.productName(),
|
|
||||||
quantity: faker.number.int({ min: 1, max: 5 }),
|
|
||||||
price: Number(faker.commerce.price({ min: 10, max: 200 })),
|
|
||||||
})),
|
|
||||||
subtotal: Number(faker.commerce.price({ min: 100, max: 500 })),
|
|
||||||
tax: Number(faker.commerce.price({ min: 10, max: 50 })),
|
|
||||||
total: Number(faker.commerce.price({ min: 110, max: 550 })),
|
|
||||||
status: faker.helpers.arrayElement(['pending', 'processing', 'shipped', 'delivered']),
|
|
||||||
createdAt: faker.date.recent({ days: 7 }).toISOString(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -9,6 +9,16 @@ export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.me
|
|||||||
*/
|
*/
|
||||||
export const DEFAULT_CONCURRENCY = 20
|
export const DEFAULT_CONCURRENCY = 20
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Progress bar configuration
|
||||||
|
*/
|
||||||
|
export const PROGRESS_BAR = {
|
||||||
|
/** Default width for progress bars */
|
||||||
|
defaultWidth: 25,
|
||||||
|
/** Compact width for inline displays */
|
||||||
|
compactWidth: 20,
|
||||||
|
} as const
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Enable dry run mode for quick testing with limited AI requests
|
* Enable dry run mode for quick testing with limited AI requests
|
||||||
*
|
*
|
||||||
|
|||||||
@@ -14,7 +14,48 @@ import githubRepos from '../data/github-repos.json' with { type: 'json' }
|
|||||||
// Seed for reproducibility
|
// Seed for reproducibility
|
||||||
faker.seed(12345)
|
faker.seed(12345)
|
||||||
|
|
||||||
interface AnalyticsMetric {
|
/**
|
||||||
|
* Employee record structure for tabular dataset
|
||||||
|
*/
|
||||||
|
export interface Employee {
|
||||||
|
id: number
|
||||||
|
name: string
|
||||||
|
email: string
|
||||||
|
department: string
|
||||||
|
salary: number
|
||||||
|
yearsExperience: number
|
||||||
|
active: boolean
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* E-commerce order structure for nested dataset
|
||||||
|
*/
|
||||||
|
export interface Order {
|
||||||
|
orderId: string
|
||||||
|
customer: {
|
||||||
|
id: number
|
||||||
|
name: string
|
||||||
|
email: string
|
||||||
|
phone: string
|
||||||
|
}
|
||||||
|
items: {
|
||||||
|
sku: string
|
||||||
|
name: string
|
||||||
|
quantity: number
|
||||||
|
price: number
|
||||||
|
}[]
|
||||||
|
subtotal: number
|
||||||
|
tax: number
|
||||||
|
total: number
|
||||||
|
status: string
|
||||||
|
orderDate?: string
|
||||||
|
createdAt?: string
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Analytics metric structure for time-series dataset
|
||||||
|
*/
|
||||||
|
export interface AnalyticsMetric {
|
||||||
date: string
|
date: string
|
||||||
views: number
|
views: number
|
||||||
clicks: number
|
clicks: number
|
||||||
@@ -24,7 +65,25 @@ interface AnalyticsMetric {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate analytics time-series data with reproducible seeded randomness
|
* GitHub repository structure for real-world dataset
|
||||||
|
*/
|
||||||
|
export interface Repository {
|
||||||
|
id: number
|
||||||
|
name: string
|
||||||
|
owner: string
|
||||||
|
repo: string
|
||||||
|
description: string
|
||||||
|
stars: number
|
||||||
|
watchers: number
|
||||||
|
forks: number
|
||||||
|
defaultBranch: string
|
||||||
|
createdAt: string
|
||||||
|
updatedAt: string
|
||||||
|
pushedAt: string
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate analytics time-series data
|
||||||
*/
|
*/
|
||||||
export function generateAnalyticsData(days: number, startDate = '2025-01-01'): {
|
export function generateAnalyticsData(days: number, startDate = '2025-01-01'): {
|
||||||
metrics: AnalyticsMetric[]
|
metrics: AnalyticsMetric[]
|
||||||
@@ -63,12 +122,12 @@ export function generateAnalyticsData(days: number, startDate = '2025-01-01'): {
|
|||||||
* @remarks
|
* @remarks
|
||||||
* Tests TOON's tabular array format
|
* Tests TOON's tabular array format
|
||||||
*/
|
*/
|
||||||
const departments = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance']
|
const departments: readonly string[] = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] as const
|
||||||
const tabularDataset: Dataset = {
|
const tabularDataset: Dataset = {
|
||||||
name: 'tabular',
|
name: 'tabular',
|
||||||
description: 'Uniform employee records (TOON optimal format)',
|
description: 'Uniform employee records (TOON optimal format)',
|
||||||
data: {
|
data: {
|
||||||
employees: Array.from({ length: 100 }, (_, i) => {
|
employees: Array.from({ length: 100 }, (_, i): Employee => {
|
||||||
const yearsExp = faker.number.int({ min: 1, max: 20 })
|
const yearsExp = faker.number.int({ min: 1, max: 20 })
|
||||||
return {
|
return {
|
||||||
id: i + 1,
|
id: i + 1,
|
||||||
@@ -89,8 +148,8 @@ const tabularDataset: Dataset = {
|
|||||||
* @remarks
|
* @remarks
|
||||||
* Tests TOON's handling of complex nested objects
|
* Tests TOON's handling of complex nested objects
|
||||||
*/
|
*/
|
||||||
const productNames = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp']
|
const productNames: readonly string[] = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp'] as const
|
||||||
const statuses = ['pending', 'processing', 'shipped', 'delivered', 'cancelled']
|
const statuses: readonly string[] = ['pending', 'processing', 'shipped', 'delivered', 'cancelled'] as const
|
||||||
|
|
||||||
const nestedDataset: Dataset = {
|
const nestedDataset: Dataset = {
|
||||||
name: 'nested',
|
name: 'nested',
|
||||||
@@ -155,6 +214,35 @@ const githubDataset: Dataset = {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate a single e-commerce order with nested structure
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Used for token efficiency benchmarks
|
||||||
|
*/
|
||||||
|
export function generateOrderData(): Order {
|
||||||
|
return {
|
||||||
|
orderId: faker.string.alphanumeric({ length: 12, casing: 'upper' }),
|
||||||
|
customer: {
|
||||||
|
id: faker.number.int({ min: 1000, max: 9999 }),
|
||||||
|
name: faker.person.fullName(),
|
||||||
|
email: faker.internet.email(),
|
||||||
|
phone: faker.phone.number(),
|
||||||
|
},
|
||||||
|
items: Array.from({ length: faker.number.int({ min: 2, max: 5 }) }, () => ({
|
||||||
|
sku: faker.string.alphanumeric({ length: 8, casing: 'upper' }),
|
||||||
|
name: faker.commerce.productName(),
|
||||||
|
quantity: faker.number.int({ min: 1, max: 5 }),
|
||||||
|
price: Number(faker.commerce.price({ min: 10, max: 200 })),
|
||||||
|
})),
|
||||||
|
subtotal: Number(faker.commerce.price({ min: 100, max: 500 })),
|
||||||
|
tax: Number(faker.commerce.price({ min: 10, max: 50 })),
|
||||||
|
total: Number(faker.commerce.price({ min: 110, max: 550 })),
|
||||||
|
status: faker.helpers.arrayElement(['pending', 'processing', 'shipped', 'delivered']),
|
||||||
|
createdAt: faker.date.recent({ days: 7 }).toISOString(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* All datasets used in the benchmark
|
* All datasets used in the benchmark
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -1,12 +1,18 @@
|
|||||||
/**
|
/**
|
||||||
* Format converters for TOON benchmarks
|
* Format converters for TOON benchmarks
|
||||||
*
|
*
|
||||||
* Converts data to different formats:
|
* Converts data to different formats for comparison:
|
||||||
* - JSON
|
* - JSON
|
||||||
* - TOON
|
* - TOON
|
||||||
* - CSV
|
* - CSV
|
||||||
* - XML
|
* - XML
|
||||||
* - YAML
|
* - YAML
|
||||||
|
*
|
||||||
|
* ## Semantic Equivalence
|
||||||
|
*
|
||||||
|
* All formatters attempt to preserve semantic equivalence with the source data,
|
||||||
|
* meaning the converted data should represent the same information. However,
|
||||||
|
* CSV has inherent limitations with nested structures (see `toCSV` docs).
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { stringify as stringifyCSV } from 'csv-stringify/sync'
|
import { stringify as stringifyCSV } from 'csv-stringify/sync'
|
||||||
@@ -14,12 +20,17 @@ import { XMLBuilder } from 'fast-xml-parser'
|
|||||||
import { stringify as stringifyYAML } from 'yaml'
|
import { stringify as stringifyYAML } from 'yaml'
|
||||||
import { encode as encodeToon } from '../../src/index'
|
import { encode as encodeToon } from '../../src/index'
|
||||||
|
|
||||||
export const formatters = {
|
/**
|
||||||
json: (data: unknown): string => JSON.stringify(data, undefined, 2),
|
* Format converters registry
|
||||||
toon: (data: unknown): string => encodeToon(data),
|
*
|
||||||
csv: (data: unknown): string => toCSV(data),
|
* Each formatter takes unknown data and returns a string representation
|
||||||
xml: (data: unknown): string => toXML(data),
|
*/
|
||||||
yaml: (data: unknown): string => stringifyYAML(data),
|
export const formatters: Record<string, (data: unknown) => string> = {
|
||||||
|
json: data => JSON.stringify(data, undefined, 2),
|
||||||
|
toon: data => encodeToon(data),
|
||||||
|
csv: data => toCSV(data),
|
||||||
|
xml: data => toXML(data),
|
||||||
|
yaml: data => stringifyYAML(data),
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -57,6 +68,15 @@ function toCSV(data: unknown): string {
|
|||||||
return ''
|
return ''
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert data to XML format
|
||||||
|
*
|
||||||
|
* @remarks
|
||||||
|
* Uses fast-xml-parser to generate well-formatted XML with:
|
||||||
|
* - 2-space indentation for readability
|
||||||
|
* - Empty nodes suppressed
|
||||||
|
* - Proper escaping of special characters
|
||||||
|
*/
|
||||||
function toXML(data: unknown): string {
|
function toXML(data: unknown): string {
|
||||||
const builder = new XMLBuilder({
|
const builder = new XMLBuilder({
|
||||||
format: true,
|
format: true,
|
||||||
|
|||||||
@@ -7,8 +7,16 @@
|
|||||||
* - Filtering (25%): "List/count X where Y"
|
* - Filtering (25%): "List/count X where Y"
|
||||||
*
|
*
|
||||||
* Questions are generated dynamically based on actual data values
|
* Questions are generated dynamically based on actual data values
|
||||||
|
*
|
||||||
|
* TODO: Balance question distribution across datasets to ensure fair representation.
|
||||||
|
* Current distribution:
|
||||||
|
* - Tabular: 70 questions (35%)
|
||||||
|
* - Nested: 50 questions (25%)
|
||||||
|
* - Analytics: 40 questions (20%)
|
||||||
|
* - GitHub: 40 questions (20%)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import type { AnalyticsMetric, Employee, Order, Repository } from './datasets'
|
||||||
import type { Question } from './types'
|
import type { Question } from './types'
|
||||||
import { consola } from 'consola'
|
import { consola } from 'consola'
|
||||||
import { datasets } from './datasets'
|
import { datasets } from './datasets'
|
||||||
@@ -20,11 +28,11 @@ export function generateQuestions(): Question[] {
|
|||||||
const questions: Question[] = []
|
const questions: Question[] = []
|
||||||
let idCounter = 1
|
let idCounter = 1
|
||||||
|
|
||||||
// Get datasets
|
// Get datasets with proper typing
|
||||||
const tabular = datasets.find(d => d.name === 'tabular')?.data.employees as any[] || []
|
const tabular = (datasets.find(d => d.name === 'tabular')?.data.employees as Employee[]) || []
|
||||||
const nested = datasets.find(d => d.name === 'nested')?.data.orders as any[] || []
|
const nested = (datasets.find(d => d.name === 'nested')?.data.orders as Order[]) || []
|
||||||
const analytics = datasets.find(d => d.name === 'analytics')?.data.metrics as any[] || []
|
const analytics = (datasets.find(d => d.name === 'analytics')?.data.metrics as AnalyticsMetric[]) || []
|
||||||
const github = datasets.find(d => d.name === 'github')?.data.repositories as any[] || []
|
const github = (datasets.find(d => d.name === 'github')?.data.repositories as Repository[]) || []
|
||||||
|
|
||||||
// ========================================
|
// ========================================
|
||||||
// TABULAR DATASET QUESTIONS (70 questions)
|
// TABULAR DATASET QUESTIONS (70 questions)
|
||||||
@@ -68,9 +76,9 @@ export function generateQuestions(): Question[] {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Aggregation: count by department
|
// Aggregation: count by department
|
||||||
const departments = [...new Set(tabular.map((e: any) => e.department))]
|
const departments = [...new Set(tabular.map(e => e.department))]
|
||||||
for (const dept of departments.slice(0, 6)) {
|
for (const dept of departments.slice(0, 6)) {
|
||||||
const count = tabular.filter((e: any) => e.department === dept).length
|
const count = tabular.filter(e => e.department === dept).length
|
||||||
questions.push({
|
questions.push({
|
||||||
id: `q${idCounter++}`,
|
id: `q${idCounter++}`,
|
||||||
prompt: `How many employees work in ${dept}?`,
|
prompt: `How many employees work in ${dept}?`,
|
||||||
@@ -83,7 +91,7 @@ export function generateQuestions(): Question[] {
|
|||||||
// Aggregation: salary ranges (4 questions)
|
// Aggregation: salary ranges (4 questions)
|
||||||
const salaryThresholds = [60000, 80000, 100000, 120000]
|
const salaryThresholds = [60000, 80000, 100000, 120000]
|
||||||
for (const threshold of salaryThresholds) {
|
for (const threshold of salaryThresholds) {
|
||||||
const count = tabular.filter((e: any) => e.salary > threshold).length
|
const count = tabular.filter(e => e.salary > threshold).length
|
||||||
questions.push({
|
questions.push({
|
||||||
id: `q${idCounter++}`,
|
id: `q${idCounter++}`,
|
||||||
prompt: `How many employees have a salary greater than ${threshold}?`,
|
prompt: `How many employees have a salary greater than ${threshold}?`,
|
||||||
@@ -94,8 +102,8 @@ export function generateQuestions(): Question[] {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Filtering: active status
|
// Filtering: active status
|
||||||
const activeCount = tabular.filter((e: any) => e.active).length
|
const activeCount = tabular.filter(e => e.active).length
|
||||||
const inactiveCount = tabular.filter((e: any) => !e.active).length
|
const inactiveCount = tabular.filter(e => !e.active).length
|
||||||
questions.push(
|
questions.push(
|
||||||
{
|
{
|
||||||
id: `q${idCounter++}`,
|
id: `q${idCounter++}`,
|
||||||
@@ -115,7 +123,7 @@ export function generateQuestions(): Question[] {
|
|||||||
|
|
||||||
// Complex filtering: multi-condition (8 questions)
|
// Complex filtering: multi-condition (8 questions)
|
||||||
for (const dept of departments.slice(0, 4)) {
|
for (const dept of departments.slice(0, 4)) {
|
||||||
const count = tabular.filter((e: any) => e.department === dept && e.salary > 80000).length
|
const count = tabular.filter(e => e.department === dept && e.salary > 80000).length
|
||||||
questions.push({
|
questions.push({
|
||||||
id: `q${idCounter++}`,
|
id: `q${idCounter++}`,
|
||||||
prompt: `How many employees in ${dept} have a salary greater than 80000?`,
|
prompt: `How many employees in ${dept} have a salary greater than 80000?`,
|
||||||
@@ -126,7 +134,7 @@ export function generateQuestions(): Question[] {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (const exp of [5, 10]) {
|
for (const exp of [5, 10]) {
|
||||||
const count = tabular.filter((e: any) => e.yearsExperience > exp && e.active).length
|
const count = tabular.filter(e => e.yearsExperience > exp && e.active).length
|
||||||
questions.push({
|
questions.push({
|
||||||
id: `q${idCounter++}`,
|
id: `q${idCounter++}`,
|
||||||
prompt: `How many active employees have more than ${exp} years of experience?`,
|
prompt: `How many active employees have more than ${exp} years of experience?`,
|
||||||
@@ -184,9 +192,9 @@ export function generateQuestions(): Question[] {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Aggregation: count by status
|
// Aggregation: count by status
|
||||||
const statuses = [...new Set(nested.map((o: any) => o.status))]
|
const statuses = [...new Set(nested.map(o => o.status))]
|
||||||
for (const status of statuses) {
|
for (const status of statuses) {
|
||||||
const count = nested.filter((o: any) => o.status === status).length
|
const count = nested.filter(o => o.status === status).length
|
||||||
questions.push({
|
questions.push({
|
||||||
id: `q${idCounter++}`,
|
id: `q${idCounter++}`,
|
||||||
prompt: `How many orders have status "${status}"?`,
|
prompt: `How many orders have status "${status}"?`,
|
||||||
@@ -197,7 +205,7 @@ export function generateQuestions(): Question[] {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Aggregation: total revenue
|
// Aggregation: total revenue
|
||||||
const totalRevenue = nested.reduce((sum: number, o: any) => sum + o.total, 0)
|
const totalRevenue = nested.reduce((sum, o) => sum + o.total, 0)
|
||||||
questions.push({
|
questions.push({
|
||||||
id: `q${idCounter++}`,
|
id: `q${idCounter++}`,
|
||||||
prompt: 'What is the total revenue across all orders?',
|
prompt: 'What is the total revenue across all orders?',
|
||||||
@@ -209,7 +217,7 @@ export function generateQuestions(): Question[] {
|
|||||||
// Filtering: high-value orders (3 questions)
|
// Filtering: high-value orders (3 questions)
|
||||||
const highValueThresholds = [200, 400, 600]
|
const highValueThresholds = [200, 400, 600]
|
||||||
for (const threshold of highValueThresholds) {
|
for (const threshold of highValueThresholds) {
|
||||||
const count = nested.filter((o: any) => o.total > threshold).length
|
const count = nested.filter(o => o.total > threshold).length
|
||||||
questions.push({
|
questions.push({
|
||||||
id: `q${idCounter++}`,
|
id: `q${idCounter++}`,
|
||||||
prompt: `How many orders have a total greater than ${threshold}?`,
|
prompt: `How many orders have a total greater than ${threshold}?`,
|
||||||
@@ -252,9 +260,9 @@ export function generateQuestions(): Question[] {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Aggregation: totals (4 questions)
|
// Aggregation: totals (4 questions)
|
||||||
const totalViews = analytics.reduce((sum: number, m: any) => sum + m.views, 0)
|
const totalViews = analytics.reduce((sum, m) => sum + m.views, 0)
|
||||||
const totalRevenue = analytics.reduce((sum: number, m: any) => sum + m.revenue, 0)
|
const totalRevenue = analytics.reduce((sum, m) => sum + m.revenue, 0)
|
||||||
const totalConversions = analytics.reduce((sum: number, m: any) => sum + m.conversions, 0)
|
const totalConversions = analytics.reduce((sum, m) => sum + m.conversions, 0)
|
||||||
|
|
||||||
questions.push(
|
questions.push(
|
||||||
{
|
{
|
||||||
@@ -283,7 +291,7 @@ export function generateQuestions(): Question[] {
|
|||||||
// Filtering: high-performing days (10 questions)
|
// Filtering: high-performing days (10 questions)
|
||||||
const viewThresholds = [5000, 6000, 7000]
|
const viewThresholds = [5000, 6000, 7000]
|
||||||
for (const threshold of viewThresholds) {
|
for (const threshold of viewThresholds) {
|
||||||
const count = analytics.filter((m: any) => m.views > threshold).length
|
const count = analytics.filter(m => m.views > threshold).length
|
||||||
questions.push({
|
questions.push({
|
||||||
id: `q${idCounter++}`,
|
id: `q${idCounter++}`,
|
||||||
prompt: `How many days had more than ${threshold} views?`,
|
prompt: `How many days had more than ${threshold} views?`,
|
||||||
@@ -295,7 +303,7 @@ export function generateQuestions(): Question[] {
|
|||||||
|
|
||||||
const conversionThresholds = [10, 20, 30]
|
const conversionThresholds = [10, 20, 30]
|
||||||
for (const threshold of conversionThresholds) {
|
for (const threshold of conversionThresholds) {
|
||||||
const count = analytics.filter((m: any) => m.conversions > threshold).length
|
const count = analytics.filter(m => m.conversions > threshold).length
|
||||||
questions.push({
|
questions.push({
|
||||||
id: `q${idCounter++}`,
|
id: `q${idCounter++}`,
|
||||||
prompt: `How many days had more than ${threshold} conversions?`,
|
prompt: `How many days had more than ${threshold} conversions?`,
|
||||||
@@ -338,9 +346,9 @@ export function generateQuestions(): Question[] {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Aggregation: count by owner (5 questions)
|
// Aggregation: count by owner (5 questions)
|
||||||
const owners = [...new Set(github.map((r: any) => r.owner))]
|
const owners = [...new Set(github.map(r => r.owner))]
|
||||||
for (const owner of owners.slice(0, 5)) {
|
for (const owner of owners.slice(0, 5)) {
|
||||||
const count = github.filter((r: any) => r.owner === owner).length
|
const count = github.filter(r => r.owner === owner).length
|
||||||
questions.push({
|
questions.push({
|
||||||
id: `q${idCounter++}`,
|
id: `q${idCounter++}`,
|
||||||
prompt: `How many repositories does ${owner} have in the dataset?`,
|
prompt: `How many repositories does ${owner} have in the dataset?`,
|
||||||
@@ -351,7 +359,7 @@ export function generateQuestions(): Question[] {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Aggregation: total stars
|
// Aggregation: total stars
|
||||||
const totalStars = github.reduce((sum: number, r: any) => sum + r.stars, 0)
|
const totalStars = github.reduce((sum, r) => sum + r.stars, 0)
|
||||||
questions.push({
|
questions.push({
|
||||||
id: `q${idCounter++}`,
|
id: `q${idCounter++}`,
|
||||||
prompt: 'What is the total number of stars across all repositories?',
|
prompt: 'What is the total number of stars across all repositories?',
|
||||||
@@ -363,7 +371,7 @@ export function generateQuestions(): Question[] {
|
|||||||
// Filtering: popular repos (8 questions)
|
// Filtering: popular repos (8 questions)
|
||||||
const starThresholds = [10000, 50000, 100000]
|
const starThresholds = [10000, 50000, 100000]
|
||||||
for (const threshold of starThresholds) {
|
for (const threshold of starThresholds) {
|
||||||
const count = github.filter((r: any) => r.stars > threshold).length
|
const count = github.filter(r => r.stars > threshold).length
|
||||||
questions.push({
|
questions.push({
|
||||||
id: `q${idCounter++}`,
|
id: `q${idCounter++}`,
|
||||||
prompt: `How many repositories have more than ${threshold} stars?`,
|
prompt: `How many repositories have more than ${threshold} stars?`,
|
||||||
@@ -375,7 +383,7 @@ export function generateQuestions(): Question[] {
|
|||||||
|
|
||||||
const forkThresholds = [1000, 5000, 10000]
|
const forkThresholds = [1000, 5000, 10000]
|
||||||
for (const threshold of forkThresholds) {
|
for (const threshold of forkThresholds) {
|
||||||
const count = github.filter((r: any) => r.forks > threshold).length
|
const count = github.filter(r => r.forks > threshold).length
|
||||||
questions.push({
|
questions.push({
|
||||||
id: `q${idCounter++}`,
|
id: `q${idCounter++}`,
|
||||||
prompt: `How many repositories have more than ${threshold} forks?`,
|
prompt: `How many repositories have more than ${threshold} forks?`,
|
||||||
|
|||||||
@@ -12,10 +12,10 @@
|
|||||||
import type { EvaluationResult, FormatResult, Question } from './types'
|
import type { EvaluationResult, FormatResult, Question } from './types'
|
||||||
import * as fsp from 'node:fs/promises'
|
import * as fsp from 'node:fs/promises'
|
||||||
import * as path from 'node:path'
|
import * as path from 'node:path'
|
||||||
import { encode } from 'gpt-tokenizer'
|
|
||||||
import { BENCHMARKS_DIR } from './constants'
|
import { BENCHMARKS_DIR } from './constants'
|
||||||
import { datasets } from './datasets'
|
import { datasets } from './datasets'
|
||||||
import { models } from './evaluate'
|
import { models } from './evaluate'
|
||||||
|
import { createProgressBar, ensureDir, saveJsonFile, tokenize } from './utils'
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calculate per-format statistics from evaluation results
|
* Calculate per-format statistics from evaluation results
|
||||||
@@ -220,7 +220,7 @@ export function calculateTokenCounts(
|
|||||||
for (const dataset of datasets) {
|
for (const dataset of datasets) {
|
||||||
const formatted = formatter(dataset.data)
|
const formatted = formatter(dataset.data)
|
||||||
const key = `${formatName}-${dataset.name}`
|
const key = `${formatName}-${dataset.name}`
|
||||||
tokenCounts[key] = encode(formatted).length
|
tokenCounts[key] = tokenize(formatted)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -237,25 +237,22 @@ export async function saveResults(
|
|||||||
tokenCounts: Record<string, number>,
|
tokenCounts: Record<string, number>,
|
||||||
): Promise<string> {
|
): Promise<string> {
|
||||||
const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy')
|
const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy')
|
||||||
await fsp.mkdir(resultsDir, { recursive: true })
|
await ensureDir(resultsDir)
|
||||||
|
|
||||||
// Save raw results
|
// Save raw results
|
||||||
await fsp.writeFile(
|
await saveJsonFile(path.join(resultsDir, 'raw-results.json'), results)
|
||||||
path.join(resultsDir, 'raw-results.json'),
|
|
||||||
`${JSON.stringify(results, undefined, 2)}\n`,
|
|
||||||
)
|
|
||||||
|
|
||||||
// Save summary
|
// Save summary
|
||||||
await fsp.writeFile(
|
await saveJsonFile(
|
||||||
path.join(resultsDir, 'summary.json'),
|
path.join(resultsDir, 'summary.json'),
|
||||||
`${JSON.stringify({
|
{
|
||||||
formatResults,
|
formatResults,
|
||||||
questions: questions.length,
|
questions: questions.length,
|
||||||
models: Object.keys(models),
|
models: Object.keys(models),
|
||||||
datasets: datasets.map(d => ({ name: d.name, description: d.description })),
|
datasets: datasets.map(d => ({ name: d.name, description: d.description })),
|
||||||
tokenCounts,
|
tokenCounts,
|
||||||
timestamp: new Date().toISOString(),
|
timestamp: new Date().toISOString(),
|
||||||
}, undefined, 2)}\n`,
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
// Generate markdown report
|
// Generate markdown report
|
||||||
@@ -267,12 +264,3 @@ export async function saveResults(
|
|||||||
|
|
||||||
return resultsDir
|
return resultsDir
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Generate visual progress bar using ASCII characters (`█` for filled, `░` for empty)
|
|
||||||
*/
|
|
||||||
function createProgressBar(tokens: number, maxTokens: number, width = 30): string {
|
|
||||||
const filled = Math.round((tokens / maxTokens) * width)
|
|
||||||
const empty = width - filled
|
|
||||||
return '█'.repeat(filled) + '░'.repeat(empty)
|
|
||||||
}
|
|
||||||
|
|||||||
68
benchmarks/src/utils.ts
Normal file
68
benchmarks/src/utils.ts
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
/**
 * Shared utility functions for TOON benchmarks
 *
 * Provides common functionality used across multiple benchmark scripts:
 * - Progress bar visualization
 * - Token counting
 * - File I/O operations (directory creation, JSON output)
 */
|
||||||
|
|
||||||
|
import * as fsp from 'node:fs/promises'
|
||||||
|
import { encode } from 'gpt-tokenizer'
|
||||||
|
|
||||||
|
/**
 * Render a fixed-width text progress bar from Unicode block characters.
 *
 * The fill ratio `value / max` is clamped to `[0, 1]`, so out-of-range inputs
 * produce an empty or a full bar instead of making `String.prototype.repeat`
 * throw a `RangeError` on a negative count.
 *
 * @param value - Current value
 * @param max - Maximum value; when `value / max` is `NaN` (e.g. `0 / 0`) the bar is empty
 * @param width - Width of the bar in characters (default: 25); fractional widths
 *   are floored, negative or non-finite widths yield an empty string
 * @returns Progress bar string of exactly `width` characters (`█` for filled, `░` for empty)
 *
 * @example
 * createProgressBar(75, 100, 20) // "███████████████░░░░░"
 * createProgressBar(0.5, 1, 10) // "█████░░░░░"
 * createProgressBar(150, 100, 10) // "██████████"
 */
export function createProgressBar(value: number, max: number, width = 25): string {
  // `repeat` rejects negative and infinite counts, so normalise the width first.
  const barWidth = Number.isFinite(width) ? Math.max(0, Math.floor(width)) : 0

  // 0 / 0 is NaN: treat it as "no progress". ±Infinity is handled by the clamp.
  const ratio = value / max
  const clampedRatio = Number.isNaN(ratio) ? 0 : Math.min(1, Math.max(0, ratio))

  const filled = Math.round(clampedRatio * barWidth)
  const empty = barWidth - filled
  return '█'.repeat(filled) + '░'.repeat(empty)
}
|
||||||
|
|
||||||
|
/**
 * Measure the length of a string in tokens.
 *
 * Delegates to `encode` from `gpt-tokenizer` and reports the size of the
 * token array it returns.
 * NOTE(review): presumably this is the o200k_base encoding (the package
 * default in recent releases) — confirm against the installed version.
 *
 * @param text - Text to tokenize
 * @returns Number of tokens produced by `encode(text)`
 *
 * @example
 * tokenize("Hello, world!") // 4
 */
export function tokenize(text: string): number {
  const tokenIds = encode(text)
  return tokenIds.length
}
|
||||||
|
|
||||||
|
/**
 * Make sure a directory is present on disk.
 *
 * Calls `fsp.mkdir` with `recursive: true`, so missing parent directories are
 * created as well; the value `mkdir` resolves with is discarded.
 *
 * @param dirPath - Directory path that should exist once the promise resolves
 */
export async function ensureDir(dirPath: string): Promise<void> {
  const mkdirOptions = { recursive: true }
  await fsp.mkdir(dirPath, mkdirOptions)
}
|
||||||
|
|
||||||
|
/**
 * Save data as a formatted JSON file (UTF-8, terminated by a newline).
 *
 * @param filePath - Path to save the file
 * @param data - Data to serialize as JSON
 * @param indent - Indentation spaces (default: 2)
 * @throws {TypeError} If `data` has no JSON representation (e.g. `undefined`,
 *   a function or a symbol); nothing is written in that case
 */
export async function saveJsonFile(
  filePath: string,
  data: unknown,
  indent = 2,
): Promise<void> {
  // `JSON.stringify` is typed as returning `string`, but at runtime it returns
  // `undefined` for values without a JSON representation. Interpolating that
  // would write the literal text "undefined" into a .json file.
  const json: string | undefined = JSON.stringify(data, undefined, indent)
  if (json === undefined) {
    throw new TypeError(`Cannot save ${filePath}: data is not JSON-serializable`)
  }

  await fsp.writeFile(filePath, `${json}\n`, 'utf-8')
}
|
||||||
Reference in New Issue
Block a user