refactor: token efficiency benchmark code

2026-01-29 15:24:10 +08:00 · 2025-10-28 07:42:49 +01:00
parent 8836831de3
commit 8b9924ff05
3 changed files with 52 additions and 41 deletions
--- a/benchmarks/scripts/token-efficiency-benchmark.ts
+++ b/benchmarks/scripts/token-efficiency-benchmark.ts
@@ -8,18 +8,19 @@ import { generateAnalyticsData, generateOrderData } from '../src/datasets'
 import { formatters } from '../src/formatters'
 import { createProgressBar, ensureDir, tokenize } from '../src/utils'
 /**
  * Token metrics for a single serialization format within one benchmark example.
  *
  * Each benchmark result carries one entry per format (the visible code builds
  * entries named 'toon', 'json' and 'xml'), so consumers look entries up by `name`.
  */
 interface FormatMetrics {
  /** Format identifier used for lookup, e.g. 'toon', 'json' or 'xml'. */
  name: string
  /** Token count of the dataset when encoded in this format. */
  tokens: number
  /** Tokens saved by TOON relative to this format (this format's tokens minus TOON's); 0 for the TOON entry itself. */
  savings: number
  /** `savings` as a percentage of this format's `tokens`, pre-formatted to one decimal place (e.g. '0.0' for TOON). */
  savingsPercent: string
 }
 interface BenchmarkResult {
  name: string
  emoji: string
  description: string
  data: Record<string, any>
-  jsonTokens: number
+  formats: FormatMetrics[]
  toonTokens: number
  xmlTokens: number
  jsonSavings: number
  jsonSavingsPercent: string
  xmlSavings: number
  xmlSavingsPercent: string
  showDetailed: boolean
 }
@@ -68,10 +69,7 @@ for (const example of BENCHMARK_EXAMPLES) {
  const xmlTokens = tokenize(xmlString)
  const jsonSavings = jsonTokens - toonTokens
  const jsonSavingsPercent = ((jsonSavings / jsonTokens) * 100).toFixed(1)
  const xmlSavings = xmlTokens - toonTokens
  const xmlSavingsPercent = ((xmlSavings / xmlTokens) * 100).toFixed(1)
  totalJsonTokens += jsonTokens
  totalToonTokens += toonTokens
@@ -82,13 +80,26 @@ for (const example of BENCHMARK_EXAMPLES) {
    emoji: example.emoji,
    description: example.description,
    data,
-    jsonTokens,
+    formats: [
-    toonTokens,
+      {
-    xmlTokens,
+        name: 'toon',
-    jsonSavings,
+        tokens: toonTokens,
-    jsonSavingsPercent,
+        savings: 0,
-    xmlSavings,
+        savingsPercent: '0.0',
-    xmlSavingsPercent,
+      },
      {
        name: 'json',
        tokens: jsonTokens,
        savings: jsonSavings,
        savingsPercent: ((jsonSavings / jsonTokens) * 100).toFixed(1),
      },
      {
        name: 'xml',
        tokens: xmlTokens,
        savings: xmlSavings,
        savingsPercent: ((xmlSavings / xmlTokens) * 100).toFixed(1),
      },
    ],
    showDetailed: example.showDetailed,
  })
 }
@@ -102,15 +113,19 @@ const totalXmlSavingsPercent = ((totalXmlSavings / totalXmlTokens) * 100).toFixe
 // Generate ASCII bar chart visualization (stacked compact format)
 const datasetRows = results
  .map((result) => {
-    const percentage = Number.parseFloat(result.jsonSavingsPercent)
+    const toon = result.formats.find(f => f.name === 'toon')!
    const json = result.formats.find(f => f.name === 'json')!
    const xml = result.formats.find(f => f.name === 'xml')!
    const percentage = Number.parseFloat(json.savingsPercent)
    const bar = createProgressBar(100 - percentage, 100) // Invert to show TOON tokens
-    const toonStr = result.toonTokens.toLocaleString('en-US')
+    const toonStr = toon.tokens.toLocaleString('en-US')
-    const jsonStr = result.jsonTokens.toLocaleString('en-US')
+    const jsonStr = json.tokens.toLocaleString('en-US')
-    const xmlStr = result.xmlTokens.toLocaleString('en-US')
+    const xmlStr = xml.tokens.toLocaleString('en-US')
    const line1 = `${result.emoji} ${result.name.padEnd(25)} ${bar}  ${toonStr.padStart(6)} tokens`
-    const line2 = `                             vs JSON: ${jsonStr.padStart(6)}  💰 ${result.jsonSavingsPercent}% saved`
+    const line2 = `                             vs JSON: ${jsonStr.padStart(6)}  💰 ${json.savingsPercent}% saved`
-    const line3 = `                             vs XML:  ${xmlStr.padStart(6)}  💰 ${result.xmlSavingsPercent}% saved`
+    const line3 = `                             vs XML:  ${xmlStr.padStart(6)}  💰 ${xml.savingsPercent}% saved`
    return `${line1}\n${line2}\n${line3}`
  })
@@ -152,19 +167,22 @@ const detailedExamples = results
    const separator = i < filtered.length - 1 ? '\n\n---' : ''
    const json = result.formats.find(f => f.name === 'json')!
    const toon = result.formats.find(f => f.name === 'toon')!
    return `#### ${result.emoji} ${result.name}
 **Configuration:** ${result.description}
-**Savings:** ${result.jsonSavings.toLocaleString('en-US')} tokens (${result.jsonSavingsPercent}% reduction vs JSON)
+**Savings:** ${json.savings.toLocaleString('en-US')} tokens (${json.savingsPercent}% reduction vs JSON)
-**JSON** (${result.jsonTokens.toLocaleString('en-US')} tokens):
+**JSON** (${json.tokens.toLocaleString('en-US')} tokens):
 \`\`\`json
 ${JSON.stringify(displayData, undefined, 2)}
 \`\`\`
-**TOON** (${result.toonTokens.toLocaleString('en-US')} tokens):
+**TOON** (${toon.tokens.toLocaleString('en-US')} tokens):
 \`\`\`
 ${encode(displayData)}
--- a/benchmarks/src/evaluate.ts
+++ b/benchmarks/src/evaluate.ts
@@ -81,7 +81,8 @@ async function validateAnswer(
  }:
  { actual: string, expected: string, question: string },
 ): Promise<boolean> {
-  const prompt = `You are validating answers to questions about structured data.
+  const prompt = `
 You are validating answers to questions about structured data.
 Question: ${question}
 Expected answer: ${expected}
@@ -93,7 +94,8 @@ Is the actual answer correct? Consider:
 - Minor formatting differences are acceptable
 - Case-insensitive comparison for text
-Respond with only "YES" or "NO".`
+Respond with only "YES" or "NO".
 `.trim()
  try {
    const { text } = await generateText({
--- a/benchmarks/src/report.ts
+++ b/benchmarks/src/report.ts
@@ -204,7 +204,7 @@ ${modelPerformance}
 #### What's Being Measured
-This benchmark tests **LLM comprehension and data retrieval accuracy** when data is presented in different formats. Each LLM receives formatted data and must answer questions about it (this does NOT test LLM's ability to generate TOON output).
+This benchmark tests **LLM comprehension and data retrieval accuracy** across different input formats. Each LLM receives formatted data and must answer questions about it (this does **not** test the model's ability to generate TOON output).
 #### Datasets Tested
@@ -233,18 +233,9 @@ Four datasets designed to test different structural patterns:
 #### Evaluation Process
-1. **Format conversion**: Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML).
+1. **Format conversion:** Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML).
-2. **Query LLM**: Each model receives formatted data + question in a prompt.
+2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
-3. **LLM responds**: Model extracts the answer from the data.
+3. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).
 4. **Validate with LLM-as-judge**: GPT-5-nano validates if the answer is semantically correct.
 #### Semantic Validation
 Answers are validated by an LLM judge (\`gpt-5-nano\`) using semantic equivalence, not exact string matching:
 - **Numeric formats**: \`50000\` = \`$50,000\` = \`50000 dollars\` ✓
 - **Case insensitive**: \`Engineering\` = \`engineering\` = \`ENGINEERING\` ✓
 - **Minor formatting**: \`2025-01-01\` = \`January 1, 2025\` ✓
 #### Models & Configuration