From 4ec7e84f5f073333f60b0834ff5f74fb8487f9ce Mon Sep 17 00:00:00 2001 From: Johann Schopplich Date: Mon, 27 Oct 2025 17:37:27 +0100 Subject: [PATCH] refactor: shared utils for benchmark scripts --- benchmarks/scripts/accuracy-benchmark.ts | 17 +-- benchmarks/scripts/fetch-github-data.ts | 24 +++-- .../scripts/token-efficiency-benchmark.ts | 54 +++------- benchmarks/src/constants.ts | 10 ++ benchmarks/src/datasets.ts | 100 ++++++++++++++++-- benchmarks/src/formatters.ts | 34 ++++-- benchmarks/src/questions.ts | 60 ++++++----- benchmarks/src/report.ts | 26 ++--- benchmarks/src/utils.ts | 68 ++++++++++++ 9 files changed, 269 insertions(+), 124 deletions(-) create mode 100644 benchmarks/src/utils.ts diff --git a/benchmarks/scripts/accuracy-benchmark.ts b/benchmarks/scripts/accuracy-benchmark.ts index 1f0e3ab..70172a1 100644 --- a/benchmarks/scripts/accuracy-benchmark.ts +++ b/benchmarks/scripts/accuracy-benchmark.ts @@ -79,17 +79,6 @@ else { // Calculate token counts for all format+dataset combinations tokenCounts = calculateTokenCounts(formatters) - // Format datasets once (reuse for all questions) - const formattedDatasets: Record> = {} - - for (const [formatName, formatter] of Object.entries(formatters)) { - formattedDatasets[formatName] ??= {} - - for (const dataset of datasets) { - formattedDatasets[formatName]![dataset.name] = formatter(dataset.data) - } - } - // Generate evaluation tasks const tasks: { question: Question, formatName: string, modelName: string }[] = [] @@ -104,11 +93,13 @@ else { const total = tasks.length consola.start(`Running ${total} evaluations with concurrency: ${DEFAULT_CONCURRENCY}`) - // Evaluate all tasks in parallel results = await pMap( tasks, async (task, index) => { - const formattedData = formattedDatasets[task.formatName]![task.question.dataset]! + // Format data on-demand + const dataset = datasets.find(d => d.name === task.question.dataset)! + const formatter = formatters[task.formatName]! 
+ const formattedData = formatter(dataset.data) const model = activeModels[task.modelName as keyof typeof activeModels]! const result = await evaluateQuestion({ diff --git a/benchmarks/scripts/fetch-github-data.ts b/benchmarks/scripts/fetch-github-data.ts index 335dd77..1ded0ee 100644 --- a/benchmarks/scripts/fetch-github-data.ts +++ b/benchmarks/scripts/fetch-github-data.ts @@ -1,9 +1,10 @@ -import * as fsp from 'node:fs/promises' import * as path from 'node:path' import process from 'node:process' import { consola } from 'consola' import { ofetch } from 'ofetch' +import pMap from 'p-map' import { BENCHMARKS_DIR } from '../src/constants' +import { ensureDir, saveJsonFile } from '../src/utils' try { // Fetch top 100 repos from GitHub @@ -52,14 +53,15 @@ async function searchTop100Repos(): Promise { async function fetchRepoDetails(repoList: string[]): Promise[]> { consola.start(`Fetching ${repoList.length} GitHub repositories…`) - const repos: Record[] = [] - - for (let i = 0; i < repoList.length; i++) { - const repoPath = repoList[i]! 
- console.log(`[${i + 1}/${repoList.length}] Fetching ${repoPath}…`) - const { repo } = await await ofetch(`https://ungh.cc/repos/${repoPath}`) - repos.push(repo) - } + const repos = await pMap( + repoList, + async (repoPath, index) => { + consola.info(`[${index + 1}/${repoList.length}] Fetching ${repoPath}…`) + const { repo } = await ofetch(`https://ungh.cc/repos/${repoPath}`) + return repo + }, + { concurrency: 5 }, + ) consola.success(`Successfully fetched ${repos.length}/${repoList.length} repositories`) @@ -70,8 +72,8 @@ async function saveRepos(repos: Record[]): Promise { const outputDir = path.join(BENCHMARKS_DIR, 'data') const outputFile = path.join(outputDir, 'github-repos.json') - await fsp.mkdir(outputDir, { recursive: true }) - await fsp.writeFile(outputFile, JSON.stringify(repos, undefined, 2)) + await ensureDir(outputDir) + await saveJsonFile(outputFile, repos) const relativePath = path.relative(BENCHMARKS_DIR, outputFile) consola.info(`Saved to \`${relativePath}\``) diff --git a/benchmarks/scripts/token-efficiency-benchmark.ts b/benchmarks/scripts/token-efficiency-benchmark.ts index c110a12..ed1a14c 100644 --- a/benchmarks/scripts/token-efficiency-benchmark.ts +++ b/benchmarks/scripts/token-efficiency-benchmark.ts @@ -1,13 +1,12 @@ import * as fsp from 'node:fs/promises' import * as path from 'node:path' -import { faker } from '@faker-js/faker' import { consola } from 'consola' -import { encode as encodeTokens } from 'gpt-tokenizer' // o200k_base encoding (default) import { encode } from '../../src/index' import githubRepos from '../data/github-repos.json' with { type: 'json' } import { BENCHMARKS_DIR, ROOT_DIR } from '../src/constants' -import { generateAnalyticsData } from '../src/datasets' +import { generateAnalyticsData, generateOrderData } from '../src/datasets' import { formatters } from '../src/formatters' +import { createProgressBar, ensureDir, tokenize } from '../src/utils' interface BenchmarkResult { name: string @@ -45,7 +44,7 @@ const 
BENCHMARK_EXAMPLES = [ name: 'E-Commerce Order', emoji: 'πŸ›’', description: 'Single nested order with customer and items', - getData: generateOrder, + getData: generateOrderData, showDetailed: false, }, ] as const @@ -62,11 +61,11 @@ for (const example of BENCHMARK_EXAMPLES) { const jsonString = JSON.stringify(data, undefined, 2) const toonString = encode(data) - const xmlString = formatters.xml(data) + const xmlString = formatters.xml!(data) - const jsonTokens = encodeTokens(jsonString).length - const toonTokens = encodeTokens(toonString).length - const xmlTokens = encodeTokens(xmlString).length + const jsonTokens = tokenize(jsonString) + const toonTokens = tokenize(toonString) + const xmlTokens = tokenize(xmlString) const jsonSavings = jsonTokens - toonTokens const jsonSavingsPercent = ((jsonSavings / jsonTokens) * 100).toFixed(1) @@ -104,7 +103,7 @@ const totalXmlSavingsPercent = ((totalXmlSavings / totalXmlTokens) * 100).toFixe const datasetRows = results .map((result) => { const percentage = Number.parseFloat(result.jsonSavingsPercent) - const bar = generateBarChart(100 - percentage) // Invert to show TOON tokens + const bar = createProgressBar(100 - percentage, 100) // Invert to show TOON tokens const toonStr = result.toonTokens.toLocaleString('en-US') const jsonStr = result.jsonTokens.toLocaleString('en-US') const xmlStr = result.xmlTokens.toLocaleString('en-US') @@ -123,7 +122,7 @@ const separator = '──────────────────── // Calculate bar for totals (TOON vs average of JSON+XML) const averageComparisonTokens = (totalJsonTokens + totalXmlTokens) / 2 const totalPercentage = (totalToonTokens / averageComparisonTokens) * 100 -const totalBar = generateBarChart(totalPercentage) +const totalBar = createProgressBar(totalPercentage, 100) const totalLine1 = `Total ${totalBar} ${totalToonTokens.toLocaleString('en-US').padStart(6)} tokens` const totalLine2 = ` vs JSON: ${totalJsonTokens.toLocaleString('en-US').padStart(6)} πŸ’° ${totalJsonSavingsPercent}% saved` @@ 
-132,6 +131,8 @@ const totalLine3 = ` vs XML: ${totalXmlTokens.toLoc const barChartSection = `${datasetRows}\n\n${separator}\n${totalLine1}\n${totalLine2}\n${totalLine3}` // Generate detailed examples (only for selected examples) +// Note: Large datasets are truncated for display readability in the report. +// Token counts are calculated from the full datasets, not the truncated versions. const detailedExamples = results .filter(result => result.showDetailed) .map((result, i, filtered) => { @@ -187,38 +188,7 @@ ${detailedExamples} console.log(markdown) -await fsp.mkdir(path.join(BENCHMARKS_DIR, 'results'), { recursive: true }) +await ensureDir(path.join(BENCHMARKS_DIR, 'results')) await fsp.writeFile(outputFilePath, markdown, 'utf-8') consola.success(`Benchmark written to \`${path.relative(ROOT_DIR, outputFilePath)}\``) - -// Generate ASCII bar chart -function generateBarChart(percentage: number, maxWidth: number = 25): string { - const filled = Math.round((percentage / 100) * maxWidth) - const empty = maxWidth - filled - return 'β–ˆ'.repeat(filled) + 'β–‘'.repeat(empty) -} - -// Generate nested e-commerce order -function generateOrder() { - return { - orderId: faker.string.alphanumeric({ length: 12, casing: 'upper' }), - customer: { - id: faker.number.int({ min: 1000, max: 9999 }), - name: faker.person.fullName(), - email: faker.internet.email(), - phone: faker.phone.number(), - }, - items: Array.from({ length: faker.number.int({ min: 2, max: 5 }) }, () => ({ - sku: faker.string.alphanumeric({ length: 8, casing: 'upper' }), - name: faker.commerce.productName(), - quantity: faker.number.int({ min: 1, max: 5 }), - price: Number(faker.commerce.price({ min: 10, max: 200 })), - })), - subtotal: Number(faker.commerce.price({ min: 100, max: 500 })), - tax: Number(faker.commerce.price({ min: 10, max: 50 })), - total: Number(faker.commerce.price({ min: 110, max: 550 })), - status: faker.helpers.arrayElement(['pending', 'processing', 'shipped', 'delivered']), - createdAt: 
faker.date.recent({ days: 7 }).toISOString(), - } -} diff --git a/benchmarks/src/constants.ts b/benchmarks/src/constants.ts index 6434dde..e9301d9 100644 --- a/benchmarks/src/constants.ts +++ b/benchmarks/src/constants.ts @@ -9,6 +9,16 @@ export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.me */ export const DEFAULT_CONCURRENCY = 20 +/** + * Progress bar configuration + */ +export const PROGRESS_BAR = { + /** Default width for progress bars */ + defaultWidth: 25, + /** Compact width for inline displays */ + compactWidth: 20, +} as const + /** * Enable dry run mode for quick testing with limited AI requests * diff --git a/benchmarks/src/datasets.ts b/benchmarks/src/datasets.ts index a9afd77..49de5fb 100644 --- a/benchmarks/src/datasets.ts +++ b/benchmarks/src/datasets.ts @@ -14,7 +14,48 @@ import githubRepos from '../data/github-repos.json' with { type: 'json' } // Seed for reproducibility faker.seed(12345) -interface AnalyticsMetric { +/** + * Employee record structure for tabular dataset + */ +export interface Employee { + id: number + name: string + email: string + department: string + salary: number + yearsExperience: number + active: boolean +} + +/** + * E-commerce order structure for nested dataset + */ +export interface Order { + orderId: string + customer: { + id: number + name: string + email: string + phone: string + } + items: { + sku: string + name: string + quantity: number + price: number + }[] + subtotal: number + tax: number + total: number + status: string + orderDate?: string + createdAt?: string +} + +/** + * Analytics metric structure for time-series dataset + */ +export interface AnalyticsMetric { date: string views: number clicks: number @@ -24,7 +65,25 @@ interface AnalyticsMetric { } /** - * Generate analytics time-series data with reproducible seeded randomness + * GitHub repository structure for real-world dataset + */ +export interface Repository { + id: number + name: string + owner: string + repo: string + 
description: string + stars: number + watchers: number + forks: number + defaultBranch: string + createdAt: string + updatedAt: string + pushedAt: string +} + +/** + * Generate analytics time-series data */ export function generateAnalyticsData(days: number, startDate = '2025-01-01'): { metrics: AnalyticsMetric[] @@ -63,12 +122,12 @@ export function generateAnalyticsData(days: number, startDate = '2025-01-01'): { * @remarks * Tests TOON's tabular array format */ -const departments = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] +const departments: readonly string[] = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] as const const tabularDataset: Dataset = { name: 'tabular', description: 'Uniform employee records (TOON optimal format)', data: { - employees: Array.from({ length: 100 }, (_, i) => { + employees: Array.from({ length: 100 }, (_, i): Employee => { const yearsExp = faker.number.int({ min: 1, max: 20 }) return { id: i + 1, @@ -89,8 +148,8 @@ const tabularDataset: Dataset = { * @remarks * Tests TOON's handling of complex nested objects */ -const productNames = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp'] -const statuses = ['pending', 'processing', 'shipped', 'delivered', 'cancelled'] +const productNames: readonly string[] = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp'] as const +const statuses: readonly string[] = ['pending', 'processing', 'shipped', 'delivered', 'cancelled'] as const const nestedDataset: Dataset = { name: 'nested', @@ -155,6 +214,35 @@ const githubDataset: Dataset = { }, } +/** + * Generate a single e-commerce order with nested structure + * + * @remarks + * Used for token efficiency benchmarks + */ +export function generateOrderData(): Order { + return { + orderId: faker.string.alphanumeric({ length: 12, casing: 'upper' }), + customer: { + id: faker.number.int({ min: 1000, 
max: 9999 }), + name: faker.person.fullName(), + email: faker.internet.email(), + phone: faker.phone.number(), + }, + items: Array.from({ length: faker.number.int({ min: 2, max: 5 }) }, () => ({ + sku: faker.string.alphanumeric({ length: 8, casing: 'upper' }), + name: faker.commerce.productName(), + quantity: faker.number.int({ min: 1, max: 5 }), + price: Number(faker.commerce.price({ min: 10, max: 200 })), + })), + subtotal: Number(faker.commerce.price({ min: 100, max: 500 })), + tax: Number(faker.commerce.price({ min: 10, max: 50 })), + total: Number(faker.commerce.price({ min: 110, max: 550 })), + status: faker.helpers.arrayElement(['pending', 'processing', 'shipped', 'delivered']), + createdAt: faker.date.recent({ days: 7 }).toISOString(), + } +} + /** * All datasets used in the benchmark */ diff --git a/benchmarks/src/formatters.ts b/benchmarks/src/formatters.ts index 64aabef..4412d60 100644 --- a/benchmarks/src/formatters.ts +++ b/benchmarks/src/formatters.ts @@ -1,12 +1,18 @@ /** * Format converters for TOON benchmarks * - * Converts data to different formats: + * Converts data to different formats for comparison: * - JSON * - TOON * - CSV * - XML * - YAML + * + * ## Semantic Equivalence + * + * All formatters attempt to preserve semantic equivalence with the source data, + * meaning the converted data should represent the same information. However, + * CSV has inherent limitations with nested structures (see `toCSV` docs). 
*/ import { stringify as stringifyCSV } from 'csv-stringify/sync' @@ -14,12 +20,17 @@ import { XMLBuilder } from 'fast-xml-parser' import { stringify as stringifyYAML } from 'yaml' import { encode as encodeToon } from '../../src/index' -export const formatters = { - json: (data: unknown): string => JSON.stringify(data, undefined, 2), - toon: (data: unknown): string => encodeToon(data), - csv: (data: unknown): string => toCSV(data), - xml: (data: unknown): string => toXML(data), - yaml: (data: unknown): string => stringifyYAML(data), +/** + * Format converters registry + * + * Each formatter takes unknown data and returns a string representation + */ +export const formatters: Record string> = { + json: data => JSON.stringify(data, undefined, 2), + toon: data => encodeToon(data), + csv: data => toCSV(data), + xml: data => toXML(data), + yaml: data => stringifyYAML(data), } /** @@ -57,6 +68,15 @@ function toCSV(data: unknown): string { return '' } +/** + * Convert data to XML format + * + * @remarks + * Uses fast-xml-parser to generate well-formatted XML with: + * - 2-space indentation for readability + * - Empty nodes suppressed + * - Proper escaping of special characters + */ function toXML(data: unknown): string { const builder = new XMLBuilder({ format: true, diff --git a/benchmarks/src/questions.ts b/benchmarks/src/questions.ts index 4c27c33..41f4481 100644 --- a/benchmarks/src/questions.ts +++ b/benchmarks/src/questions.ts @@ -7,8 +7,16 @@ * - Filtering (25%): "List/count X where Y" * * Questions are generated dynamically based on actual data values + * + * TODO: Balance question distribution across datasets to ensure fair representation. 
+ * Current distribution: + * - Tabular: 70 questions (35%) + * - Nested: 50 questions (25%) + * - Analytics: 40 questions (20%) + * - GitHub: 40 questions (20%) */ +import type { AnalyticsMetric, Employee, Order, Repository } from './datasets' import type { Question } from './types' import { consola } from 'consola' import { datasets } from './datasets' @@ -20,11 +28,11 @@ export function generateQuestions(): Question[] { const questions: Question[] = [] let idCounter = 1 - // Get datasets - const tabular = datasets.find(d => d.name === 'tabular')?.data.employees as any[] || [] - const nested = datasets.find(d => d.name === 'nested')?.data.orders as any[] || [] - const analytics = datasets.find(d => d.name === 'analytics')?.data.metrics as any[] || [] - const github = datasets.find(d => d.name === 'github')?.data.repositories as any[] || [] + // Get datasets with proper typing + const tabular = (datasets.find(d => d.name === 'tabular')?.data.employees as Employee[]) || [] + const nested = (datasets.find(d => d.name === 'nested')?.data.orders as Order[]) || [] + const analytics = (datasets.find(d => d.name === 'analytics')?.data.metrics as AnalyticsMetric[]) || [] + const github = (datasets.find(d => d.name === 'github')?.data.repositories as Repository[]) || [] // ======================================== // TABULAR DATASET QUESTIONS (70 questions) // ======================================== @@ -68,9 +76,9 @@ export function generateQuestions(): Question[] { } // Aggregation: count by department - const departments = [...new Set(tabular.map((e: any) => e.department))] + const departments = [...new Set(tabular.map(e => e.department))] for (const dept of departments.slice(0, 6)) { - const count = tabular.filter((e: any) => e.department === dept).length + const count = tabular.filter(e => e.department === dept).length questions.push({ id: `q${idCounter++}`, prompt: `How many employees work in ${dept}?`, @@ -83,7 +91,7 @@ export function generateQuestions(): Question[] { // Aggregation: salary ranges (4
questions) const salaryThresholds = [60000, 80000, 100000, 120000] for (const threshold of salaryThresholds) { - const count = tabular.filter((e: any) => e.salary > threshold).length + const count = tabular.filter(e => e.salary > threshold).length questions.push({ id: `q${idCounter++}`, prompt: `How many employees have a salary greater than ${threshold}?`, @@ -94,8 +102,8 @@ export function generateQuestions(): Question[] { } // Filtering: active status - const activeCount = tabular.filter((e: any) => e.active).length - const inactiveCount = tabular.filter((e: any) => !e.active).length + const activeCount = tabular.filter(e => e.active).length + const inactiveCount = tabular.filter(e => !e.active).length questions.push( { id: `q${idCounter++}`, @@ -115,7 +123,7 @@ export function generateQuestions(): Question[] { // Complex filtering: multi-condition (8 questions) for (const dept of departments.slice(0, 4)) { - const count = tabular.filter((e: any) => e.department === dept && e.salary > 80000).length + const count = tabular.filter(e => e.department === dept && e.salary > 80000).length questions.push({ id: `q${idCounter++}`, prompt: `How many employees in ${dept} have a salary greater than 80000?`, @@ -126,7 +134,7 @@ export function generateQuestions(): Question[] { } for (const exp of [5, 10]) { - const count = tabular.filter((e: any) => e.yearsExperience > exp && e.active).length + const count = tabular.filter(e => e.yearsExperience > exp && e.active).length questions.push({ id: `q${idCounter++}`, prompt: `How many active employees have more than ${exp} years of experience?`, @@ -184,9 +192,9 @@ export function generateQuestions(): Question[] { } // Aggregation: count by status - const statuses = [...new Set(nested.map((o: any) => o.status))] + const statuses = [...new Set(nested.map(o => o.status))] for (const status of statuses) { - const count = nested.filter((o: any) => o.status === status).length + const count = nested.filter(o => o.status === status).length 
questions.push({ id: `q${idCounter++}`, prompt: `How many orders have status "${status}"?`, @@ -197,7 +205,7 @@ export function generateQuestions(): Question[] { } // Aggregation: total revenue - const totalRevenue = nested.reduce((sum: number, o: any) => sum + o.total, 0) + const totalRevenue = nested.reduce((sum, o) => sum + o.total, 0) questions.push({ id: `q${idCounter++}`, prompt: 'What is the total revenue across all orders?', @@ -209,7 +217,7 @@ export function generateQuestions(): Question[] { // Filtering: high-value orders (3 questions) const highValueThresholds = [200, 400, 600] for (const threshold of highValueThresholds) { - const count = nested.filter((o: any) => o.total > threshold).length + const count = nested.filter(o => o.total > threshold).length questions.push({ id: `q${idCounter++}`, prompt: `How many orders have a total greater than ${threshold}?`, @@ -252,9 +260,9 @@ export function generateQuestions(): Question[] { } // Aggregation: totals (4 questions) - const totalViews = analytics.reduce((sum: number, m: any) => sum + m.views, 0) - const totalRevenue = analytics.reduce((sum: number, m: any) => sum + m.revenue, 0) - const totalConversions = analytics.reduce((sum: number, m: any) => sum + m.conversions, 0) + const totalViews = analytics.reduce((sum, m) => sum + m.views, 0) + const totalRevenue = analytics.reduce((sum, m) => sum + m.revenue, 0) + const totalConversions = analytics.reduce((sum, m) => sum + m.conversions, 0) questions.push( { @@ -283,7 +291,7 @@ export function generateQuestions(): Question[] { // Filtering: high-performing days (10 questions) const viewThresholds = [5000, 6000, 7000] for (const threshold of viewThresholds) { - const count = analytics.filter((m: any) => m.views > threshold).length + const count = analytics.filter(m => m.views > threshold).length questions.push({ id: `q${idCounter++}`, prompt: `How many days had more than ${threshold} views?`, @@ -295,7 +303,7 @@ export function generateQuestions(): Question[] 
{ const conversionThresholds = [10, 20, 30] for (const threshold of conversionThresholds) { - const count = analytics.filter((m: any) => m.conversions > threshold).length + const count = analytics.filter(m => m.conversions > threshold).length questions.push({ id: `q${idCounter++}`, prompt: `How many days had more than ${threshold} conversions?`, @@ -338,9 +346,9 @@ export function generateQuestions(): Question[] { } // Aggregation: count by owner (5 questions) - const owners = [...new Set(github.map((r: any) => r.owner))] + const owners = [...new Set(github.map(r => r.owner))] for (const owner of owners.slice(0, 5)) { - const count = github.filter((r: any) => r.owner === owner).length + const count = github.filter(r => r.owner === owner).length questions.push({ id: `q${idCounter++}`, prompt: `How many repositories does ${owner} have in the dataset?`, @@ -351,7 +359,7 @@ export function generateQuestions(): Question[] { } // Aggregation: total stars - const totalStars = github.reduce((sum: number, r: any) => sum + r.stars, 0) + const totalStars = github.reduce((sum, r) => sum + r.stars, 0) questions.push({ id: `q${idCounter++}`, prompt: 'What is the total number of stars across all repositories?', @@ -363,7 +371,7 @@ export function generateQuestions(): Question[] { // Filtering: popular repos (8 questions) const starThresholds = [10000, 50000, 100000] for (const threshold of starThresholds) { - const count = github.filter((r: any) => r.stars > threshold).length + const count = github.filter(r => r.stars > threshold).length questions.push({ id: `q${idCounter++}`, prompt: `How many repositories have more than ${threshold} stars?`, @@ -375,7 +383,7 @@ export function generateQuestions(): Question[] { const forkThresholds = [1000, 5000, 10000] for (const threshold of forkThresholds) { - const count = github.filter((r: any) => r.forks > threshold).length + const count = github.filter(r => r.forks > threshold).length questions.push({ id: `q${idCounter++}`, prompt: `How 
many repositories have more than ${threshold} forks?`, diff --git a/benchmarks/src/report.ts b/benchmarks/src/report.ts index af41f26..3a8fdda 100644 --- a/benchmarks/src/report.ts +++ b/benchmarks/src/report.ts @@ -12,10 +12,10 @@ import type { EvaluationResult, FormatResult, Question } from './types' import * as fsp from 'node:fs/promises' import * as path from 'node:path' -import { encode } from 'gpt-tokenizer' import { BENCHMARKS_DIR } from './constants' import { datasets } from './datasets' import { models } from './evaluate' +import { createProgressBar, ensureDir, saveJsonFile, tokenize } from './utils' /** * Calculate per-format statistics from evaluation results @@ -220,7 +220,7 @@ export function calculateTokenCounts( for (const dataset of datasets) { const formatted = formatter(dataset.data) const key = `${formatName}-${dataset.name}` - tokenCounts[key] = encode(formatted).length + tokenCounts[key] = tokenize(formatted) } } @@ -237,25 +237,22 @@ export async function saveResults( tokenCounts: Record, ): Promise { const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy') - await fsp.mkdir(resultsDir, { recursive: true }) + await ensureDir(resultsDir) // Save raw results - await fsp.writeFile( - path.join(resultsDir, 'raw-results.json'), - `${JSON.stringify(results, undefined, 2)}\n`, - ) + await saveJsonFile(path.join(resultsDir, 'raw-results.json'), results) // Save summary - await fsp.writeFile( + await saveJsonFile( path.join(resultsDir, 'summary.json'), - `${JSON.stringify({ + { formatResults, questions: questions.length, models: Object.keys(models), datasets: datasets.map(d => ({ name: d.name, description: d.description })), tokenCounts, timestamp: new Date().toISOString(), - }, undefined, 2)}\n`, + }, ) // Generate markdown report @@ -267,12 +264,3 @@ export async function saveResults( return resultsDir } - -/** - * Generate visual progress bar using ASCII characters (`β–ˆ` for filled, `β–‘` for empty) - */ -function 
createProgressBar(tokens: number, maxTokens: number, width = 30): string { - const filled = Math.round((tokens / maxTokens) * width) - const empty = width - filled - return 'β–ˆ'.repeat(filled) + 'β–‘'.repeat(empty) -} diff --git a/benchmarks/src/utils.ts b/benchmarks/src/utils.ts new file mode 100644 index 0000000..3b0a735 --- /dev/null +++ b/benchmarks/src/utils.ts @@ -0,0 +1,68 @@ +/** + * Shared utility functions for TOON benchmarks + * + * Provides common functionality used across multiple benchmark scripts: + * - Progress bar visualization + * - Token counting + * - File I/O operations + * - Retry logic for API calls + */ + +import * as fsp from 'node:fs/promises' +import { encode } from 'gpt-tokenizer' + +/** + * Generate visual progress bar using ASCII characters + * + * @param value - Current value + * @param max - Maximum value + * @param width - Width of the bar in characters (default: 25) + * @returns ASCII progress bar string (`β–ˆ` for filled, `β–‘` for empty) + * + * @example + * createProgressBar(75, 100, 20) // "β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‘β–‘β–‘β–‘β–‘" + * createProgressBar(0.5, 1, 10) // "β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‘β–‘β–‘β–‘β–‘" + */ +export function createProgressBar(value: number, max: number, width = 25): string { + const filled = Math.round((value / max) * width) + const empty = width - filled + return 'β–ˆ'.repeat(filled) + 'β–‘'.repeat(empty) +} + +/** + * Count tokens in text using gpt-tokenizer (o200k_base encoding) + * + * @param text - Text to tokenize + * @returns Number of tokens + * + * @example + * tokenize("Hello, world!") // 4 + */ +export function tokenize(text: string): number { + return encode(text).length +} + +/** + * Ensure a directory exists, creating it recursively if needed + * + * @param dirPath - Directory path to ensure exists + */ +export async function ensureDir(dirPath: string): Promise { + await fsp.mkdir(dirPath, { recursive: true }) +} + +/** + * Save data as formatted JSON file + * + * @param filePath 
- Path to save the file + * @param data - Data to serialize as JSON + * @param indent - Indentation spaces (default: 2) + */ +export async function saveJsonFile( + filePath: string, + data: unknown, + indent = 2, +): Promise { + const json = JSON.stringify(data, undefined, indent) + await fsp.writeFile(filePath, `${json}\n`, 'utf-8') +}