refactor: token efficiency benchmark code

This commit is contained in:
Johann Schopplich
2025-10-28 07:42:49 +01:00
parent 8836831de3
commit 8b9924ff05
3 changed files with 52 additions and 41 deletions

View File

@@ -8,18 +8,19 @@ import { generateAnalyticsData, generateOrderData } from '../src/datasets'
import { formatters } from '../src/formatters'
import { createProgressBar, ensureDir, tokenize } from '../src/utils'
/**
 * Token statistics for one serialization format of a benchmark example.
 * Savings are measured against TOON's token count for the same example;
 * the TOON entry itself carries savings of 0 / '0.0'.
 */
interface FormatMetrics {
  /** Format identifier, e.g. 'toon', 'json', 'xml'. */
  name: string
  /** Token count of the example serialized in this format. */
  tokens: number
  /** Tokens this format uses beyond TOON (format tokens minus TOON tokens). */
  savings: number
  /** `savings / tokens * 100`, pre-formatted with one decimal place. */
  savingsPercent: string
}
/**
 * Aggregated result for a single benchmark example.
 *
 * NOTE(review): this span is rendered from a diff without +/- markers — the
 * flat `jsonTokens`..`xmlSavingsPercent` fields look like the pre-refactor
 * shape being replaced by the `formats` array; confirm against the committed
 * file before relying on both sets of fields coexisting.
 */
interface BenchmarkResult {
  /** Display name of the example dataset. */
  name: string
  /** Emoji shown next to the name in report output. */
  emoji: string
  /** Human-readable description of the dataset configuration. */
  description: string
  // Raw example payload used for token counting and report rendering.
  // NOTE(review): `any` values — `unknown` would be safer if callers permit.
  data: Record<string, any>
  // Pre-refactor per-format counters (superseded by `formats` per this diff).
  jsonTokens: number
  toonTokens: number
  xmlTokens: number
  jsonSavings: number
  jsonSavingsPercent: string
  xmlSavings: number
  xmlSavingsPercent: string
  /** Post-refactor per-format metrics (one entry each for toon/json/xml). */
  formats: FormatMetrics[]
  /** Whether this example gets a detailed JSON-vs-TOON section in the report. */
  showDetailed: boolean
}
@@ -68,10 +69,7 @@ for (const example of BENCHMARK_EXAMPLES) {
const xmlTokens = tokenize(xmlString)
const jsonSavings = jsonTokens - toonTokens
const jsonSavingsPercent = ((jsonSavings / jsonTokens) * 100).toFixed(1)
const xmlSavings = xmlTokens - toonTokens
const xmlSavingsPercent = ((xmlSavings / xmlTokens) * 100).toFixed(1)
totalJsonTokens += jsonTokens
totalToonTokens += toonTokens
@@ -82,13 +80,26 @@ for (const example of BENCHMARK_EXAMPLES) {
emoji: example.emoji,
description: example.description,
data,
jsonTokens,
toonTokens,
xmlTokens,
jsonSavings,
jsonSavingsPercent,
xmlSavings,
xmlSavingsPercent,
formats: [
{
name: 'toon',
tokens: toonTokens,
savings: 0,
savingsPercent: '0.0',
},
{
name: 'json',
tokens: jsonTokens,
savings: jsonSavings,
savingsPercent: ((jsonSavings / jsonTokens) * 100).toFixed(1),
},
{
name: 'xml',
tokens: xmlTokens,
savings: xmlSavings,
savingsPercent: ((xmlSavings / xmlTokens) * 100).toFixed(1),
},
],
showDetailed: example.showDetailed,
})
}
@@ -102,15 +113,19 @@ const totalXmlSavingsPercent = ((totalXmlSavings / totalXmlTokens) * 100).toFixe
// Generate ASCII bar chart visualization (stacked compact format)
const datasetRows = results
.map((result) => {
const percentage = Number.parseFloat(result.jsonSavingsPercent)
const toon = result.formats.find(f => f.name === 'toon')!
const json = result.formats.find(f => f.name === 'json')!
const xml = result.formats.find(f => f.name === 'xml')!
const percentage = Number.parseFloat(json.savingsPercent)
const bar = createProgressBar(100 - percentage, 100) // Invert to show TOON tokens
const toonStr = result.toonTokens.toLocaleString('en-US')
const jsonStr = result.jsonTokens.toLocaleString('en-US')
const xmlStr = result.xmlTokens.toLocaleString('en-US')
const toonStr = toon.tokens.toLocaleString('en-US')
const jsonStr = json.tokens.toLocaleString('en-US')
const xmlStr = xml.tokens.toLocaleString('en-US')
const line1 = `${result.emoji} ${result.name.padEnd(25)} ${bar} ${toonStr.padStart(6)} tokens`
const line2 = ` vs JSON: ${jsonStr.padStart(6)} 💰 ${result.jsonSavingsPercent}% saved`
const line3 = ` vs XML: ${xmlStr.padStart(6)} 💰 ${result.xmlSavingsPercent}% saved`
const line2 = ` vs JSON: ${jsonStr.padStart(6)} 💰 ${json.savingsPercent}% saved`
const line3 = ` vs XML: ${xmlStr.padStart(6)} 💰 ${xml.savingsPercent}% saved`
return `${line1}\n${line2}\n${line3}`
})
@@ -152,19 +167,22 @@ const detailedExamples = results
const separator = i < filtered.length - 1 ? '\n\n---' : ''
const json = result.formats.find(f => f.name === 'json')!
const toon = result.formats.find(f => f.name === 'toon')!
return `#### ${result.emoji} ${result.name}
**Configuration:** ${result.description}
**Savings:** ${result.jsonSavings.toLocaleString('en-US')} tokens (${result.jsonSavingsPercent}% reduction vs JSON)
**Savings:** ${json.savings.toLocaleString('en-US')} tokens (${json.savingsPercent}% reduction vs JSON)
**JSON** (${result.jsonTokens.toLocaleString('en-US')} tokens):
**JSON** (${json.tokens.toLocaleString('en-US')} tokens):
\`\`\`json
${JSON.stringify(displayData, undefined, 2)}
\`\`\`
**TOON** (${result.toonTokens.toLocaleString('en-US')} tokens):
**TOON** (${toon.tokens.toLocaleString('en-US')} tokens):
\`\`\`
${encode(displayData)}

View File

@@ -81,7 +81,8 @@ async function validateAnswer(
}:
{ actual: string, expected: string, question: string },
): Promise<boolean> {
const prompt = `You are validating answers to questions about structured data.
const prompt = `
You are validating answers to questions about structured data.
Question: ${question}
Expected answer: ${expected}
@@ -93,7 +94,8 @@ Is the actual answer correct? Consider:
- Minor formatting differences are acceptable
- Case-insensitive comparison for text
Respond with only "YES" or "NO".`
Respond with only "YES" or "NO".
`.trim()
try {
const { text } = await generateText({

View File

@@ -204,7 +204,7 @@ ${modelPerformance}
#### What's Being Measured
This benchmark tests **LLM comprehension and data retrieval accuracy** when data is presented in different formats. Each LLM receives formatted data and must answer questions about it (this does NOT test LLM's ability to generate TOON output).
This benchmark tests **LLM comprehension and data retrieval accuracy** across different input formats. Each LLM receives formatted data and must answer questions about it (this does **not** test the model's ability to generate TOON output).
#### Datasets Tested
@@ -233,18 +233,9 @@ Four datasets designed to test different structural patterns:
#### Evaluation Process
1. **Format conversion**: Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML).
2. **Query LLM**: Each model receives formatted data + question in a prompt.
3. **LLM responds**: Model extracts the answer from the data.
4. **Validate with LLM-as-judge**: GPT-5-nano validates if the answer is semantically correct.
#### Semantic Validation
Answers are validated by an LLM judge (\`gpt-5-nano\`) using semantic equivalence, not exact string matching:
- **Numeric formats**: \`50000\` = \`$50,000\` = \`50000 dollars\`
- **Case insensitive**: \`Engineering\` = \`engineering\` = \`ENGINEERING\`
- **Minor formatting**: \`2025-01-01\` = \`January 1, 2025\`
1. **Format conversion:** Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML).
2. **Query LLM:** Each model receives formatted data + question in a prompt and extracts the answer.
3. **Validate with LLM-as-judge:** \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).
#### Models & Configuration