diff --git a/benchmarks/scripts/token-efficiency-benchmark.ts b/benchmarks/scripts/token-efficiency-benchmark.ts index ed1a14c..88ddf8d 100644 --- a/benchmarks/scripts/token-efficiency-benchmark.ts +++ b/benchmarks/scripts/token-efficiency-benchmark.ts @@ -8,18 +8,19 @@ import { generateAnalyticsData, generateOrderData } from '../src/datasets' import { formatters } from '../src/formatters' import { createProgressBar, ensureDir, tokenize } from '../src/utils' +interface FormatMetrics { + name: string + tokens: number + savings: number + savingsPercent: string +} + interface BenchmarkResult { name: string emoji: string description: string data: Record - jsonTokens: number - toonTokens: number - xmlTokens: number - jsonSavings: number - jsonSavingsPercent: string - xmlSavings: number - xmlSavingsPercent: string + formats: FormatMetrics[] showDetailed: boolean } @@ -68,10 +69,7 @@ for (const example of BENCHMARK_EXAMPLES) { const xmlTokens = tokenize(xmlString) const jsonSavings = jsonTokens - toonTokens - const jsonSavingsPercent = ((jsonSavings / jsonTokens) * 100).toFixed(1) - const xmlSavings = xmlTokens - toonTokens - const xmlSavingsPercent = ((xmlSavings / xmlTokens) * 100).toFixed(1) totalJsonTokens += jsonTokens totalToonTokens += toonTokens @@ -82,13 +80,26 @@ for (const example of BENCHMARK_EXAMPLES) { emoji: example.emoji, description: example.description, data, - jsonTokens, - toonTokens, - xmlTokens, - jsonSavings, - jsonSavingsPercent, - xmlSavings, - xmlSavingsPercent, + formats: [ + { + name: 'toon', + tokens: toonTokens, + savings: 0, + savingsPercent: '0.0', + }, + { + name: 'json', + tokens: jsonTokens, + savings: jsonSavings, + savingsPercent: ((jsonSavings / jsonTokens) * 100).toFixed(1), + }, + { + name: 'xml', + tokens: xmlTokens, + savings: xmlSavings, + savingsPercent: ((xmlSavings / xmlTokens) * 100).toFixed(1), + }, + ], showDetailed: example.showDetailed, }) } @@ -102,15 +113,19 @@ const totalXmlSavingsPercent = ((totalXmlSavings / 
totalXmlTokens) * 100).toFixe // Generate ASCII bar chart visualization (stacked compact format) const datasetRows = results .map((result) => { - const percentage = Number.parseFloat(result.jsonSavingsPercent) + const toon = result.formats.find(f => f.name === 'toon')! + const json = result.formats.find(f => f.name === 'json')! + const xml = result.formats.find(f => f.name === 'xml')! + + const percentage = Number.parseFloat(json.savingsPercent) const bar = createProgressBar(100 - percentage, 100) // Invert to show TOON tokens - const toonStr = result.toonTokens.toLocaleString('en-US') - const jsonStr = result.jsonTokens.toLocaleString('en-US') - const xmlStr = result.xmlTokens.toLocaleString('en-US') + const toonStr = toon.tokens.toLocaleString('en-US') + const jsonStr = json.tokens.toLocaleString('en-US') + const xmlStr = xml.tokens.toLocaleString('en-US') const line1 = `${result.emoji} ${result.name.padEnd(25)} ${bar} ${toonStr.padStart(6)} tokens` - const line2 = ` vs JSON: ${jsonStr.padStart(6)} 💰 ${result.jsonSavingsPercent}% saved` - const line3 = ` vs XML: ${xmlStr.padStart(6)} 💰 ${result.xmlSavingsPercent}% saved` + const line2 = ` vs JSON: ${jsonStr.padStart(6)} 💰 ${json.savingsPercent}% saved` + const line3 = ` vs XML: ${xmlStr.padStart(6)} 💰 ${xml.savingsPercent}% saved` return `${line1}\n${line2}\n${line3}` }) @@ -152,19 +167,22 @@ const detailedExamples = results const separator = i < filtered.length - 1 ? '\n\n---' : '' + const json = result.formats.find(f => f.name === 'json')! + const toon = result.formats.find(f => f.name === 'toon')! 
+ return `#### ${result.emoji} ${result.name} **Configuration:** ${result.description} -**Savings:** ${result.jsonSavings.toLocaleString('en-US')} tokens (${result.jsonSavingsPercent}% reduction vs JSON) +**Savings:** ${json.savings.toLocaleString('en-US')} tokens (${json.savingsPercent}% reduction vs JSON) -**JSON** (${result.jsonTokens.toLocaleString('en-US')} tokens): +**JSON** (${json.tokens.toLocaleString('en-US')} tokens): \`\`\`json ${JSON.stringify(displayData, undefined, 2)} \`\`\` -**TOON** (${result.toonTokens.toLocaleString('en-US')} tokens): +**TOON** (${toon.tokens.toLocaleString('en-US')} tokens): \`\`\` ${encode(displayData)} diff --git a/benchmarks/src/evaluate.ts b/benchmarks/src/evaluate.ts index f3701d1..13ae87b 100644 --- a/benchmarks/src/evaluate.ts +++ b/benchmarks/src/evaluate.ts @@ -81,7 +81,8 @@ async function validateAnswer( }: { actual: string, expected: string, question: string }, ): Promise { - const prompt = `You are validating answers to questions about structured data. + const prompt = ` +You are validating answers to questions about structured data. Question: ${question} Expected answer: ${expected} @@ -93,7 +94,8 @@ Is the actual answer correct? Consider: - Minor formatting differences are acceptable - Case-insensitive comparison for text -Respond with only "YES" or "NO".` +Respond with only "YES" or "NO". +`.trim() try { const { text } = await generateText({ diff --git a/benchmarks/src/report.ts b/benchmarks/src/report.ts index e1a109a..65859b5 100644 --- a/benchmarks/src/report.ts +++ b/benchmarks/src/report.ts @@ -204,7 +204,7 @@ ${modelPerformance} #### What's Being Measured -This benchmark tests **LLM comprehension and data retrieval accuracy** when data is presented in different formats. Each LLM receives formatted data and must answer questions about it (this does NOT test LLM's ability to generate TOON output). +This benchmark tests **LLM comprehension and data retrieval accuracy** across different input formats. 
Each LLM receives formatted data and must answer questions about it (this does **not** test the model's ability to generate TOON output). #### Datasets Tested @@ -233,18 +233,9 @@ Four datasets designed to test different structural patterns: #### Evaluation Process -1. **Format conversion**: Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML). -2. **Query LLM**: Each model receives formatted data + question in a prompt. -3. **LLM responds**: Model extracts the answer from the data. -4. **Validate with LLM-as-judge**: GPT-5-nano validates if the answer is semantically correct. - -#### Semantic Validation - -Answers are validated by an LLM judge (\`gpt-5-nano\`) using semantic equivalence, not exact string matching: - -- **Numeric formats**: \`50000\` = \`$50,000\` = \`50000 dollars\` ✓ -- **Case insensitive**: \`Engineering\` = \`engineering\` = \`ENGINEERING\` ✓ -- **Minor formatting**: \`2025-01-01\` = \`January 1, 2025\` ✓ +1. **Format conversion**: Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML). +2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer. +3. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`). #### Models & Configuration