refactor: token efficiency benchmark code

This commit is contained in:
Johann Schopplich
2025-10-28 07:42:49 +01:00
parent 8836831de3
commit 8b9924ff05
3 changed files with 52 additions and 41 deletions

View File

@@ -8,18 +8,19 @@ import { generateAnalyticsData, generateOrderData } from '../src/datasets'
import { formatters } from '../src/formatters'
import { createProgressBar, ensureDir, tokenize } from '../src/utils'
/**
 * Token statistics for one serialization format of a benchmark example.
 * Savings are measured against TOON's token count for the same example;
 * the TOON entry itself carries savings of 0 / '0.0'.
 */
interface FormatMetrics {
  /** Format identifier, e.g. 'toon', 'json', 'xml'. */
  name: string
  /** Token count of the example serialized in this format. */
  tokens: number
  /** Tokens this format uses beyond TOON (format tokens minus TOON tokens). */
  savings: number
  /** `savings / tokens * 100`, pre-formatted with one decimal place. */
  savingsPercent: string
}
/**
 * Aggregated result for a single benchmark example.
 *
 * NOTE(review): this span is rendered from a diff without +/- markers — the
 * flat `jsonTokens`..`xmlSavingsPercent` fields look like the pre-refactor
 * shape being replaced by the `formats` array; confirm against the committed
 * file before relying on both sets of fields coexisting.
 */
interface BenchmarkResult {
  /** Display name of the example dataset. */
  name: string
  /** Emoji shown next to the name in report output. */
  emoji: string
  /** Human-readable description of the dataset configuration. */
  description: string
  // Raw example payload used for token counting and report rendering.
  // NOTE(review): `any` values — `unknown` would be safer if callers permit.
  data: Record<string, any>
  // Pre-refactor per-format counters (superseded by `formats` per this diff).
  jsonTokens: number
  toonTokens: number
  xmlTokens: number
  jsonSavings: number
  jsonSavingsPercent: string
  xmlSavings: number
  xmlSavingsPercent: string
  /** Post-refactor per-format metrics (one entry each for toon/json/xml). */
  formats: FormatMetrics[]
  /** Whether this example gets a detailed JSON-vs-TOON section in the report. */
  showDetailed: boolean
}
@@ -68,10 +69,7 @@ for (const example of BENCHMARK_EXAMPLES) {
const xmlTokens = tokenize(xmlString)
const jsonSavings = jsonTokens - toonTokens
const jsonSavingsPercent = ((jsonSavings / jsonTokens) * 100).toFixed(1)
const xmlSavings = xmlTokens - toonTokens
const xmlSavingsPercent = ((xmlSavings / xmlTokens) * 100).toFixed(1)
totalJsonTokens += jsonTokens
totalToonTokens += toonTokens
@@ -82,13 +80,26 @@ for (const example of BENCHMARK_EXAMPLES) {
emoji: example.emoji,
description: example.description,
data,
jsonTokens,
toonTokens,
xmlTokens,
jsonSavings,
jsonSavingsPercent,
xmlSavings,
xmlSavingsPercent,
formats: [
{
name: 'toon',
tokens: toonTokens,
savings: 0,
savingsPercent: '0.0',
},
{
name: 'json',
tokens: jsonTokens,
savings: jsonSavings,
savingsPercent: ((jsonSavings / jsonTokens) * 100).toFixed(1),
},
{
name: 'xml',
tokens: xmlTokens,
savings: xmlSavings,
savingsPercent: ((xmlSavings / xmlTokens) * 100).toFixed(1),
},
],
showDetailed: example.showDetailed,
})
}
@@ -102,15 +113,19 @@ const totalXmlSavingsPercent = ((totalXmlSavings / totalXmlTokens) * 100).toFixe
// Generate ASCII bar chart visualization (stacked compact format)
const datasetRows = results
.map((result) => {
const percentage = Number.parseFloat(result.jsonSavingsPercent)
const toon = result.formats.find(f => f.name === 'toon')!
const json = result.formats.find(f => f.name === 'json')!
const xml = result.formats.find(f => f.name === 'xml')!
const percentage = Number.parseFloat(json.savingsPercent)
const bar = createProgressBar(100 - percentage, 100) // Invert to show TOON tokens
const toonStr = result.toonTokens.toLocaleString('en-US')
const jsonStr = result.jsonTokens.toLocaleString('en-US')
const xmlStr = result.xmlTokens.toLocaleString('en-US')
const toonStr = toon.tokens.toLocaleString('en-US')
const jsonStr = json.tokens.toLocaleString('en-US')
const xmlStr = xml.tokens.toLocaleString('en-US')
const line1 = `${result.emoji} ${result.name.padEnd(25)} ${bar} ${toonStr.padStart(6)} tokens`
const line2 = ` vs JSON: ${jsonStr.padStart(6)} 💰 ${result.jsonSavingsPercent}% saved`
const line3 = ` vs XML: ${xmlStr.padStart(6)} 💰 ${result.xmlSavingsPercent}% saved`
const line2 = ` vs JSON: ${jsonStr.padStart(6)} 💰 ${json.savingsPercent}% saved`
const line3 = ` vs XML: ${xmlStr.padStart(6)} 💰 ${xml.savingsPercent}% saved`
return `${line1}\n${line2}\n${line3}`
})
@@ -152,19 +167,22 @@ const detailedExamples = results
const separator = i < filtered.length - 1 ? '\n\n---' : ''
const json = result.formats.find(f => f.name === 'json')!
const toon = result.formats.find(f => f.name === 'toon')!
return `#### ${result.emoji} ${result.name}
**Configuration:** ${result.description}
**Savings:** ${result.jsonSavings.toLocaleString('en-US')} tokens (${result.jsonSavingsPercent}% reduction vs JSON)
**Savings:** ${json.savings.toLocaleString('en-US')} tokens (${json.savingsPercent}% reduction vs JSON)
**JSON** (${result.jsonTokens.toLocaleString('en-US')} tokens):
**JSON** (${json.tokens.toLocaleString('en-US')} tokens):
\`\`\`json
${JSON.stringify(displayData, undefined, 2)}
\`\`\`
**TOON** (${result.toonTokens.toLocaleString('en-US')} tokens):
**TOON** (${toon.tokens.toLocaleString('en-US')} tokens):
\`\`\`
${encode(displayData)}

View File

@@ -81,7 +81,8 @@ async function validateAnswer(
}:
{ actual: string, expected: string, question: string },
): Promise<boolean> {
const prompt = `You are validating answers to questions about structured data.
const prompt = `
You are validating answers to questions about structured data.
Question: ${question}
Expected answer: ${expected}
@@ -93,7 +94,8 @@ Is the actual answer correct? Consider:
- Minor formatting differences are acceptable
- Case-insensitive comparison for text
Respond with only "YES" or "NO".`
Respond with only "YES" or "NO".
`.trim()
try {
const { text } = await generateText({

View File

@@ -204,7 +204,7 @@ ${modelPerformance}
#### What's Being Measured
This benchmark tests **LLM comprehension and data retrieval accuracy** when data is presented in different formats. Each LLM receives formatted data and must answer questions about it (this does NOT test LLM's ability to generate TOON output).
This benchmark tests **LLM comprehension and data retrieval accuracy** across different input formats. Each LLM receives formatted data and must answer questions about it (this does **not** test the model's ability to generate TOON output).
#### Datasets Tested
@@ -233,18 +233,9 @@ Four datasets designed to test different structural patterns:
#### Evaluation Process
1. **Format conversion**: Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML).
2. **Query LLM**: Each model receives formatted data + question in a prompt.
3. **LLM responds**: Model extracts the answer from the data.
4. **Validate with LLM-as-judge**: GPT-5-nano validates if the answer is semantically correct.
#### Semantic Validation
Answers are validated by an LLM judge (\`gpt-5-nano\`) using semantic equivalence, not exact string matching:
- **Numeric formats**: \`50000\` = \`$50,000\` = \`50000 dollars\`
- **Case insensitive**: \`Engineering\` = \`engineering\` = \`ENGINEERING\`
- **Minor formatting**: \`2025-01-01\` = \`January 1, 2025\`
1. **Format conversion:** Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML).
2. **Query LLM:** Each model receives formatted data + question in a prompt and extracts the answer.
3. **Validate with LLM-as-judge:** \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).
#### Models & Configuration