test: add benchmarks for compact vs. pretty JSON

This commit is contained in:
Johann Schopplich
2025-10-30 15:02:51 +01:00
parent df68417d8b
commit 2c4f3c4362
14 changed files with 283 additions and 267 deletions

View File

@@ -14,7 +14,7 @@ export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.me
/**
 * Requests-per-minute limits per model ID; `undefined` presumably means
 * no throttling is applied — confirm against the rate-limiting caller.
 *
 * NOTE(review): the marker-stripped diff retained both the old
 * (`undefined`) and new (`50`) entries for `gpt-5-nano`, leaving a
 * duplicate key; the post-commit value `50` is kept here.
 */
export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
  'claude-haiku-4-5-20251001': 50,
  'gemini-2.5-flash': 25,
  'gpt-5-nano': 50,
  'grok-4-fast-non-reasoning': 50,
}
@@ -23,6 +23,18 @@ export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
*/
// Default number of concurrent requests for a benchmark run.
// NOTE(review): the original doc comment is truncated by the diff hunk
// header above — confirm the exact contract against callers.
export const DEFAULT_CONCURRENCY = 10
/**
* Display names for data format types
*/
/**
 * Human-readable display names for each data-format key, used when
 * rendering benchmark reports. Keys mirror the keys of the `formatters`
 * map (e.g. `json-pretty`, `toon`).
 *
 * The previous trailing `as const` was inert: with an explicit
 * `Record<string, string>` annotation the declared type is the
 * annotation, so the assertion added nothing and has been dropped.
 */
export const FORMATTER_DISPLAY_NAMES: Record<string, string> = {
  'json-pretty': 'JSON',
  'json-compact': 'JSON compact',
  'toon': 'TOON',
  'csv': 'CSV',
  'xml': 'XML',
  'yaml': 'YAML',
}
/**
* Progress bar configuration
*/

View File

@@ -12,11 +12,12 @@ import { encode as encodeToon } from '../../src/index'
* CSV has inherent limitations with nested structures (see `toCSV` docs).
*/
/**
 * Serializers producing each benchmark data format from arbitrary input.
 *
 * NOTE(review): the marker-stripped diff retained both the pre-commit
 * entries (`json`, `toon`, …) and the post-commit entries, producing
 * duplicate keys; the post-commit set with the `json-pretty` /
 * `json-compact` split is kept here.
 */
export const formatters: Record<string, (data: unknown) => string> = {
  'json-pretty': data => JSON.stringify(data, undefined, 2),
  'json-compact': data => JSON.stringify(data),
  'toon': data => encodeToon(data),
  'csv': data => toCSV(data),
  'xml': data => toXML(data),
  'yaml': data => stringifyYAML(data),
}
/**

View File

@@ -1,7 +1,7 @@
import type { EvaluationResult, FormatResult, Question } from './types'
import * as fsp from 'node:fs/promises'
import * as path from 'node:path'
import { BENCHMARKS_DIR } from './constants'
import { BENCHMARKS_DIR, FORMATTER_DISPLAY_NAMES } from './constants'
import { datasets } from './datasets'
import { models } from './evaluate'
import { createProgressBar, ensureDir, tokenize } from './utils'
@@ -49,7 +49,7 @@ export function generateMarkdownReport(
tokenCounts: Record<string, number>,
): string {
const toon = formatResults.find(r => r.format === 'toon')
const json = formatResults.find(r => r.format === 'json')
const json = formatResults.find(r => r.format === 'json-pretty')
const modelIds = models.map(m => m.modelId)
const modelNames = modelIds.filter(id => results.some(r => r.model === id))
@@ -71,10 +71,11 @@ export function generateMarkdownReport(
const formatLines = modelResults.map((result) => {
const bar = createProgressBar(result.accuracy, 1, 20)
const accuracyStr = `${(result.accuracy * 100).toFixed(1)}%`.padStart(6)
const countStr = `(${result.correctCount}/${result.totalCount})`
const accuracyString = `${(result.accuracy * 100).toFixed(1)}%`.padStart(6)
const countString = `(${result.correctCount}/${result.totalCount})`
const prefix = result.format === 'toon' ? '→ ' : ' '
return `${prefix}${result.format.padEnd(12)} ${bar} ${accuracyStr} ${countStr}`
const displayName = FORMATTER_DISPLAY_NAMES[result.format] || result.format
return `${prefix}${displayName.padEnd(12)} ${bar} ${accuracyString} ${countString}`
}).join('\n')
// Add blank line before model name, except for first model
@@ -248,7 +249,7 @@ ${totalQuestions} questions are generated dynamically across three categories:
#### Evaluation Process
1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => f.format.toUpperCase()).join(', ')}).
1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => FORMATTER_DISPLAY_NAMES[f.format] || f.format).join(', ')}).
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
3. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).

View File

@@ -40,19 +40,3 @@ export function tokenize(text: string): number {
/**
 * Recursively create `dirPath`, including any missing parent directories.
 * Resolves without error when the directory already exists.
 *
 * @param dirPath - Path of the directory to create
 */
export async function ensureDir(dirPath: string): Promise<void> {
  const options = { recursive: true }
  await fsp.mkdir(dirPath, options)
}
/**
 * Serialize `data` to JSON and write it to `filePath`, terminated by a
 * single trailing newline.
 *
 * @param filePath - Destination file path
 * @param data - Value to serialize as JSON
 * @param indent - Spaces per indentation level (default: 2)
 */
export async function saveJsonFile(
  filePath: string,
  data: unknown,
  indent = 2,
): Promise<void> {
  const payload = `${JSON.stringify(data, undefined, indent)}\n`
  await fsp.writeFile(filePath, payload, 'utf-8')
}