refactor: token efficiency benchmark code

This commit is contained in:
Johann Schopplich
2025-10-28 07:42:49 +01:00
parent 8836831de3
commit 8b9924ff05
3 changed files with 52 additions and 41 deletions

View File

@@ -8,18 +8,19 @@ import { generateAnalyticsData, generateOrderData } from '../src/datasets'
import { formatters } from '../src/formatters' import { formatters } from '../src/formatters'
import { createProgressBar, ensureDir, tokenize } from '../src/utils' import { createProgressBar, ensureDir, tokenize } from '../src/utils'
/**
 * Token metrics for one serialization format within a single benchmark example.
 */
interface FormatMetrics {
/** Format identifier; the results built in this file use 'toon', 'json', and 'xml'. */
name: string
/** Token count of the example's data serialized in this format. */
tokens: number
/** Tokens TOON saves relative to this format (this format's tokens minus TOON's tokens); 0 for the TOON entry itself. */
savings: number
/** `savings` as a percentage of this format's `tokens`, pre-formatted with one decimal place (e.g. '0.0'). */
savingsPercent: string
}
interface BenchmarkResult { interface BenchmarkResult {
name: string name: string
emoji: string emoji: string
description: string description: string
data: Record<string, any> data: Record<string, any>
jsonTokens: number formats: FormatMetrics[]
toonTokens: number
xmlTokens: number
jsonSavings: number
jsonSavingsPercent: string
xmlSavings: number
xmlSavingsPercent: string
showDetailed: boolean showDetailed: boolean
} }
@@ -68,10 +69,7 @@ for (const example of BENCHMARK_EXAMPLES) {
const xmlTokens = tokenize(xmlString) const xmlTokens = tokenize(xmlString)
const jsonSavings = jsonTokens - toonTokens const jsonSavings = jsonTokens - toonTokens
const jsonSavingsPercent = ((jsonSavings / jsonTokens) * 100).toFixed(1)
const xmlSavings = xmlTokens - toonTokens const xmlSavings = xmlTokens - toonTokens
const xmlSavingsPercent = ((xmlSavings / xmlTokens) * 100).toFixed(1)
totalJsonTokens += jsonTokens totalJsonTokens += jsonTokens
totalToonTokens += toonTokens totalToonTokens += toonTokens
@@ -82,13 +80,26 @@ for (const example of BENCHMARK_EXAMPLES) {
emoji: example.emoji, emoji: example.emoji,
description: example.description, description: example.description,
data, data,
jsonTokens, formats: [
toonTokens, {
xmlTokens, name: 'toon',
jsonSavings, tokens: toonTokens,
jsonSavingsPercent, savings: 0,
xmlSavings, savingsPercent: '0.0',
xmlSavingsPercent, },
{
name: 'json',
tokens: jsonTokens,
savings: jsonSavings,
savingsPercent: ((jsonSavings / jsonTokens) * 100).toFixed(1),
},
{
name: 'xml',
tokens: xmlTokens,
savings: xmlSavings,
savingsPercent: ((xmlSavings / xmlTokens) * 100).toFixed(1),
},
],
showDetailed: example.showDetailed, showDetailed: example.showDetailed,
}) })
} }
@@ -102,15 +113,19 @@ const totalXmlSavingsPercent = ((totalXmlSavings / totalXmlTokens) * 100).toFixe
// Generate ASCII bar chart visualization (stacked compact format) // Generate ASCII bar chart visualization (stacked compact format)
const datasetRows = results const datasetRows = results
.map((result) => { .map((result) => {
const percentage = Number.parseFloat(result.jsonSavingsPercent) const toon = result.formats.find(f => f.name === 'toon')!
const json = result.formats.find(f => f.name === 'json')!
const xml = result.formats.find(f => f.name === 'xml')!
const percentage = Number.parseFloat(json.savingsPercent)
const bar = createProgressBar(100 - percentage, 100) // Invert to show TOON tokens const bar = createProgressBar(100 - percentage, 100) // Invert to show TOON tokens
const toonStr = result.toonTokens.toLocaleString('en-US') const toonStr = toon.tokens.toLocaleString('en-US')
const jsonStr = result.jsonTokens.toLocaleString('en-US') const jsonStr = json.tokens.toLocaleString('en-US')
const xmlStr = result.xmlTokens.toLocaleString('en-US') const xmlStr = xml.tokens.toLocaleString('en-US')
const line1 = `${result.emoji} ${result.name.padEnd(25)} ${bar} ${toonStr.padStart(6)} tokens` const line1 = `${result.emoji} ${result.name.padEnd(25)} ${bar} ${toonStr.padStart(6)} tokens`
const line2 = ` vs JSON: ${jsonStr.padStart(6)} 💰 ${result.jsonSavingsPercent}% saved` const line2 = ` vs JSON: ${jsonStr.padStart(6)} 💰 ${json.savingsPercent}% saved`
const line3 = ` vs XML: ${xmlStr.padStart(6)} 💰 ${result.xmlSavingsPercent}% saved` const line3 = ` vs XML: ${xmlStr.padStart(6)} 💰 ${xml.savingsPercent}% saved`
return `${line1}\n${line2}\n${line3}` return `${line1}\n${line2}\n${line3}`
}) })
@@ -152,19 +167,22 @@ const detailedExamples = results
const separator = i < filtered.length - 1 ? '\n\n---' : '' const separator = i < filtered.length - 1 ? '\n\n---' : ''
const json = result.formats.find(f => f.name === 'json')!
const toon = result.formats.find(f => f.name === 'toon')!
return `#### ${result.emoji} ${result.name} return `#### ${result.emoji} ${result.name}
**Configuration:** ${result.description} **Configuration:** ${result.description}
**Savings:** ${result.jsonSavings.toLocaleString('en-US')} tokens (${result.jsonSavingsPercent}% reduction vs JSON) **Savings:** ${json.savings.toLocaleString('en-US')} tokens (${json.savingsPercent}% reduction vs JSON)
**JSON** (${result.jsonTokens.toLocaleString('en-US')} tokens): **JSON** (${json.tokens.toLocaleString('en-US')} tokens):
\`\`\`json \`\`\`json
${JSON.stringify(displayData, undefined, 2)} ${JSON.stringify(displayData, undefined, 2)}
\`\`\` \`\`\`
**TOON** (${result.toonTokens.toLocaleString('en-US')} tokens): **TOON** (${toon.tokens.toLocaleString('en-US')} tokens):
\`\`\` \`\`\`
${encode(displayData)} ${encode(displayData)}

View File

@@ -81,7 +81,8 @@ async function validateAnswer(
}: }:
{ actual: string, expected: string, question: string }, { actual: string, expected: string, question: string },
): Promise<boolean> { ): Promise<boolean> {
const prompt = `You are validating answers to questions about structured data. const prompt = `
You are validating answers to questions about structured data.
Question: ${question} Question: ${question}
Expected answer: ${expected} Expected answer: ${expected}
@@ -93,7 +94,8 @@ Is the actual answer correct? Consider:
- Minor formatting differences are acceptable - Minor formatting differences are acceptable
- Case-insensitive comparison for text - Case-insensitive comparison for text
Respond with only "YES" or "NO".` Respond with only "YES" or "NO".
`.trim()
try { try {
const { text } = await generateText({ const { text } = await generateText({

View File

@@ -204,7 +204,7 @@ ${modelPerformance}
#### What's Being Measured #### What's Being Measured
This benchmark tests **LLM comprehension and data retrieval accuracy** when data is presented in different formats. Each LLM receives formatted data and must answer questions about it (this does NOT test LLM's ability to generate TOON output). This benchmark tests **LLM comprehension and data retrieval accuracy** across different input formats. Each LLM receives formatted data and must answer questions about it (this does **not** test the model's ability to generate TOON output).
#### Datasets Tested #### Datasets Tested
@@ -233,18 +233,9 @@ Four datasets designed to test different structural patterns:
#### Evaluation Process #### Evaluation Process
1. **Format conversion**: Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML). 1. **Format conversion:** Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML).
2. **Query LLM**: Each model receives formatted data + question in a prompt. 2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
3. **LLM responds**: Model extracts the answer from the data. 3. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).
4. **Validate with LLM-as-judge**: GPT-5-nano validates if the answer is semantically correct.
#### Semantic Validation
Answers are validated by an LLM judge (\`gpt-5-nano\`) using semantic equivalence, not exact string matching:
- **Numeric formats**: \`50000\` = \`$50,000\` = \`50000 dollars\`
- **Case insensitive**: \`Engineering\` = \`engineering\` = \`ENGINEERING\`
- **Minor formatting**: \`2025-01-01\` = \`January 1, 2025\`
#### Models & Configuration #### Models & Configuration