diff --git a/benchmarks/scripts/token-efficiency-benchmark.ts b/benchmarks/scripts/token-efficiency-benchmark.ts index ed1a14c..88ddf8d 100644 --- a/benchmarks/scripts/token-efficiency-benchmark.ts +++ b/benchmarks/scripts/token-efficiency-benchmark.ts @@ -8,18 +8,19 @@ import { generateAnalyticsData, generateOrderData } from '../src/datasets' import { formatters } from '../src/formatters' import { createProgressBar, ensureDir, tokenize } from '../src/utils' +interface FormatMetrics { + name: string + tokens: number + savings: number + savingsPercent: string +} + interface BenchmarkResult { name: string emoji: string description: string data: Record - jsonTokens: number - toonTokens: number - xmlTokens: number - jsonSavings: number - jsonSavingsPercent: string - xmlSavings: number - xmlSavingsPercent: string + formats: FormatMetrics[] showDetailed: boolean } @@ -68,10 +69,7 @@ for (const example of BENCHMARK_EXAMPLES) { const xmlTokens = tokenize(xmlString) const jsonSavings = jsonTokens - toonTokens - const jsonSavingsPercent = ((jsonSavings / jsonTokens) * 100).toFixed(1) - const xmlSavings = xmlTokens - toonTokens - const xmlSavingsPercent = ((xmlSavings / xmlTokens) * 100).toFixed(1) totalJsonTokens += jsonTokens totalToonTokens += toonTokens @@ -82,13 +80,26 @@ for (const example of BENCHMARK_EXAMPLES) { emoji: example.emoji, description: example.description, data, - jsonTokens, - toonTokens, - xmlTokens, - jsonSavings, - jsonSavingsPercent, - xmlSavings, - xmlSavingsPercent, + formats: [ + { + name: 'toon', + tokens: toonTokens, + savings: 0, + savingsPercent: '0.0', + }, + { + name: 'json', + tokens: jsonTokens, + savings: jsonSavings, + savingsPercent: ((jsonSavings / jsonTokens) * 100).toFixed(1), + }, + { + name: 'xml', + tokens: xmlTokens, + savings: xmlSavings, + savingsPercent: ((xmlSavings / xmlTokens) * 100).toFixed(1), + }, + ], showDetailed: example.showDetailed, }) } @@ -102,15 +113,19 @@ const totalXmlSavingsPercent = ((totalXmlSavings / 
totalXmlTokens) * 100).toFixe // Generate ASCII bar chart visualization (stacked compact format) const datasetRows = results .map((result) => { - const percentage = Number.parseFloat(result.jsonSavingsPercent) + const toon = result.formats.find(f => f.name === 'toon')! + const json = result.formats.find(f => f.name === 'json')! + const xml = result.formats.find(f => f.name === 'xml')! + + const percentage = Number.parseFloat(json.savingsPercent) const bar = createProgressBar(100 - percentage, 100) // Invert to show TOON tokens - const toonStr = result.toonTokens.toLocaleString('en-US') - const jsonStr = result.jsonTokens.toLocaleString('en-US') - const xmlStr = result.xmlTokens.toLocaleString('en-US') + const toonStr = toon.tokens.toLocaleString('en-US') + const jsonStr = json.tokens.toLocaleString('en-US') + const xmlStr = xml.tokens.toLocaleString('en-US') const line1 = `${result.emoji} ${result.name.padEnd(25)} ${bar} ${toonStr.padStart(6)} tokens` - const line2 = ` vs JSON: ${jsonStr.padStart(6)} 💰 ${result.jsonSavingsPercent}% saved` - const line3 = ` vs XML: ${xmlStr.padStart(6)} 💰 ${result.xmlSavingsPercent}% saved` + const line2 = ` vs JSON: ${jsonStr.padStart(6)} 💰 ${json.savingsPercent}% saved` + const line3 = ` vs XML: ${xmlStr.padStart(6)} 💰 ${xml.savingsPercent}% saved` return `${line1}\n${line2}\n${line3}` }) @@ -152,19 +167,22 @@ const detailedExamples = results const separator = i < filtered.length - 1 ? '\n\n---' : '' + const json = result.formats.find(f => f.name === 'json')! + const toon = result.formats.find(f => f.name === 'toon')! 
+ return `#### ${result.emoji} ${result.name} **Configuration:** ${result.description} -**Savings:** ${result.jsonSavings.toLocaleString('en-US')} tokens (${result.jsonSavingsPercent}% reduction vs JSON) +**Savings:** ${json.savings.toLocaleString('en-US')} tokens (${json.savingsPercent}% reduction vs JSON) -**JSON** (${result.jsonTokens.toLocaleString('en-US')} tokens): +**JSON** (${json.tokens.toLocaleString('en-US')} tokens): \`\`\`json ${JSON.stringify(displayData, undefined, 2)} \`\`\` -**TOON** (${result.toonTokens.toLocaleString('en-US')} tokens): +**TOON** (${toon.tokens.toLocaleString('en-US')} tokens): \`\`\` ${encode(displayData)} diff --git a/benchmarks/src/evaluate.ts b/benchmarks/src/evaluate.ts index f3701d1..13ae87b 100644 --- a/benchmarks/src/evaluate.ts +++ b/benchmarks/src/evaluate.ts @@ -81,7 +81,8 @@ async function validateAnswer( }: { actual: string, expected: string, question: string }, ): Promise { - const prompt = `You are validating answers to questions about structured data. + const prompt = ` +You are validating answers to questions about structured data. Question: ${question} Expected answer: ${expected} @@ -93,7 +94,8 @@ Is the actual answer correct? Consider: - Minor formatting differences are acceptable - Case-insensitive comparison for text -Respond with only "YES" or "NO".` +Respond with only "YES" or "NO". +`.trim() try { const { text } = await generateText({ diff --git a/benchmarks/src/report.ts b/benchmarks/src/report.ts index e1a109a..65859b5 100644 --- a/benchmarks/src/report.ts +++ b/benchmarks/src/report.ts @@ -204,7 +204,7 @@ ${modelPerformance} #### What's Being Measured -This benchmark tests **LLM comprehension and data retrieval accuracy** when data is presented in different formats. Each LLM receives formatted data and must answer questions about it (this does NOT test LLM's ability to generate TOON output). +This benchmark tests **LLM comprehension and data retrieval accuracy** across different input formats. 
Each LLM receives formatted data and must answer questions about it (this does **not** test the model's ability to generate TOON output). #### Datasets Tested @@ -233,18 +233,9 @@ Four datasets designed to test different structural patterns: #### Evaluation Process -1. **Format conversion**: Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML). -2. **Query LLM**: Each model receives formatted data + question in a prompt. -3. **LLM responds**: Model extracts the answer from the data. -4. **Validate with LLM-as-judge**: GPT-5-nano validates if the answer is semantically correct. - -#### Semantic Validation - -Answers are validated by an LLM judge (\`gpt-5-nano\`) using semantic equivalence, not exact string matching: - -- **Numeric formats**: \`50000\` = \`$50,000\` = \`50000 dollars\` ✓ -- **Case insensitive**: \`Engineering\` = \`engineering\` = \`ENGINEERING\` ✓ -- **Minor formatting**: \`2025-01-01\` = \`January 1, 2025\` ✓ +1. **Format conversion**: Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML). +2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer. +3. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`). #### Models & Configuration