mirror of
https://github.com/voson-wang/toon.git
synced 2026-01-29 15:24:10 +08:00
refactor: token efficiency benchmark code
This commit is contained in:
@@ -8,18 +8,19 @@ import { generateAnalyticsData, generateOrderData } from '../src/datasets'
|
|||||||
import { formatters } from '../src/formatters'
|
import { formatters } from '../src/formatters'
|
||||||
import { createProgressBar, ensureDir, tokenize } from '../src/utils'
|
import { createProgressBar, ensureDir, tokenize } from '../src/utils'
|
||||||
|
|
||||||
|
interface FormatMetrics {
|
||||||
|
name: string
|
||||||
|
tokens: number
|
||||||
|
savings: number
|
||||||
|
savingsPercent: string
|
||||||
|
}
|
||||||
|
|
||||||
interface BenchmarkResult {
|
interface BenchmarkResult {
|
||||||
name: string
|
name: string
|
||||||
emoji: string
|
emoji: string
|
||||||
description: string
|
description: string
|
||||||
data: Record<string, any>
|
data: Record<string, any>
|
||||||
jsonTokens: number
|
formats: FormatMetrics[]
|
||||||
toonTokens: number
|
|
||||||
xmlTokens: number
|
|
||||||
jsonSavings: number
|
|
||||||
jsonSavingsPercent: string
|
|
||||||
xmlSavings: number
|
|
||||||
xmlSavingsPercent: string
|
|
||||||
showDetailed: boolean
|
showDetailed: boolean
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -68,10 +69,7 @@ for (const example of BENCHMARK_EXAMPLES) {
|
|||||||
const xmlTokens = tokenize(xmlString)
|
const xmlTokens = tokenize(xmlString)
|
||||||
|
|
||||||
const jsonSavings = jsonTokens - toonTokens
|
const jsonSavings = jsonTokens - toonTokens
|
||||||
const jsonSavingsPercent = ((jsonSavings / jsonTokens) * 100).toFixed(1)
|
|
||||||
|
|
||||||
const xmlSavings = xmlTokens - toonTokens
|
const xmlSavings = xmlTokens - toonTokens
|
||||||
const xmlSavingsPercent = ((xmlSavings / xmlTokens) * 100).toFixed(1)
|
|
||||||
|
|
||||||
totalJsonTokens += jsonTokens
|
totalJsonTokens += jsonTokens
|
||||||
totalToonTokens += toonTokens
|
totalToonTokens += toonTokens
|
||||||
@@ -82,13 +80,26 @@ for (const example of BENCHMARK_EXAMPLES) {
|
|||||||
emoji: example.emoji,
|
emoji: example.emoji,
|
||||||
description: example.description,
|
description: example.description,
|
||||||
data,
|
data,
|
||||||
jsonTokens,
|
formats: [
|
||||||
toonTokens,
|
{
|
||||||
xmlTokens,
|
name: 'toon',
|
||||||
jsonSavings,
|
tokens: toonTokens,
|
||||||
jsonSavingsPercent,
|
savings: 0,
|
||||||
xmlSavings,
|
savingsPercent: '0.0',
|
||||||
xmlSavingsPercent,
|
},
|
||||||
|
{
|
||||||
|
name: 'json',
|
||||||
|
tokens: jsonTokens,
|
||||||
|
savings: jsonSavings,
|
||||||
|
savingsPercent: ((jsonSavings / jsonTokens) * 100).toFixed(1),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: 'xml',
|
||||||
|
tokens: xmlTokens,
|
||||||
|
savings: xmlSavings,
|
||||||
|
savingsPercent: ((xmlSavings / xmlTokens) * 100).toFixed(1),
|
||||||
|
},
|
||||||
|
],
|
||||||
showDetailed: example.showDetailed,
|
showDetailed: example.showDetailed,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -102,15 +113,19 @@ const totalXmlSavingsPercent = ((totalXmlSavings / totalXmlTokens) * 100).toFixe
|
|||||||
// Generate ASCII bar chart visualization (stacked compact format)
|
// Generate ASCII bar chart visualization (stacked compact format)
|
||||||
const datasetRows = results
|
const datasetRows = results
|
||||||
.map((result) => {
|
.map((result) => {
|
||||||
const percentage = Number.parseFloat(result.jsonSavingsPercent)
|
const toon = result.formats.find(f => f.name === 'toon')!
|
||||||
|
const json = result.formats.find(f => f.name === 'json')!
|
||||||
|
const xml = result.formats.find(f => f.name === 'xml')!
|
||||||
|
|
||||||
|
const percentage = Number.parseFloat(json.savingsPercent)
|
||||||
const bar = createProgressBar(100 - percentage, 100) // Invert to show TOON tokens
|
const bar = createProgressBar(100 - percentage, 100) // Invert to show TOON tokens
|
||||||
const toonStr = result.toonTokens.toLocaleString('en-US')
|
const toonStr = toon.tokens.toLocaleString('en-US')
|
||||||
const jsonStr = result.jsonTokens.toLocaleString('en-US')
|
const jsonStr = json.tokens.toLocaleString('en-US')
|
||||||
const xmlStr = result.xmlTokens.toLocaleString('en-US')
|
const xmlStr = xml.tokens.toLocaleString('en-US')
|
||||||
|
|
||||||
const line1 = `${result.emoji} ${result.name.padEnd(25)} ${bar} ${toonStr.padStart(6)} tokens`
|
const line1 = `${result.emoji} ${result.name.padEnd(25)} ${bar} ${toonStr.padStart(6)} tokens`
|
||||||
const line2 = ` vs JSON: ${jsonStr.padStart(6)} 💰 ${result.jsonSavingsPercent}% saved`
|
const line2 = ` vs JSON: ${jsonStr.padStart(6)} 💰 ${json.savingsPercent}% saved`
|
||||||
const line3 = ` vs XML: ${xmlStr.padStart(6)} 💰 ${result.xmlSavingsPercent}% saved`
|
const line3 = ` vs XML: ${xmlStr.padStart(6)} 💰 ${xml.savingsPercent}% saved`
|
||||||
|
|
||||||
return `${line1}\n${line2}\n${line3}`
|
return `${line1}\n${line2}\n${line3}`
|
||||||
})
|
})
|
||||||
@@ -152,19 +167,22 @@ const detailedExamples = results
|
|||||||
|
|
||||||
const separator = i < filtered.length - 1 ? '\n\n---' : ''
|
const separator = i < filtered.length - 1 ? '\n\n---' : ''
|
||||||
|
|
||||||
|
const json = result.formats.find(f => f.name === 'json')!
|
||||||
|
const toon = result.formats.find(f => f.name === 'toon')!
|
||||||
|
|
||||||
return `#### ${result.emoji} ${result.name}
|
return `#### ${result.emoji} ${result.name}
|
||||||
|
|
||||||
**Configuration:** ${result.description}
|
**Configuration:** ${result.description}
|
||||||
|
|
||||||
**Savings:** ${result.jsonSavings.toLocaleString('en-US')} tokens (${result.jsonSavingsPercent}% reduction vs JSON)
|
**Savings:** ${json.savings.toLocaleString('en-US')} tokens (${json.savingsPercent}% reduction vs JSON)
|
||||||
|
|
||||||
**JSON** (${result.jsonTokens.toLocaleString('en-US')} tokens):
|
**JSON** (${json.tokens.toLocaleString('en-US')} tokens):
|
||||||
|
|
||||||
\`\`\`json
|
\`\`\`json
|
||||||
${JSON.stringify(displayData, undefined, 2)}
|
${JSON.stringify(displayData, undefined, 2)}
|
||||||
\`\`\`
|
\`\`\`
|
||||||
|
|
||||||
**TOON** (${result.toonTokens.toLocaleString('en-US')} tokens):
|
**TOON** (${toon.tokens.toLocaleString('en-US')} tokens):
|
||||||
|
|
||||||
\`\`\`
|
\`\`\`
|
||||||
${encode(displayData)}
|
${encode(displayData)}
|
||||||
|
|||||||
@@ -81,7 +81,8 @@ async function validateAnswer(
|
|||||||
}:
|
}:
|
||||||
{ actual: string, expected: string, question: string },
|
{ actual: string, expected: string, question: string },
|
||||||
): Promise<boolean> {
|
): Promise<boolean> {
|
||||||
const prompt = `You are validating answers to questions about structured data.
|
const prompt = `
|
||||||
|
You are validating answers to questions about structured data.
|
||||||
|
|
||||||
Question: ${question}
|
Question: ${question}
|
||||||
Expected answer: ${expected}
|
Expected answer: ${expected}
|
||||||
@@ -93,7 +94,8 @@ Is the actual answer correct? Consider:
|
|||||||
- Minor formatting differences are acceptable
|
- Minor formatting differences are acceptable
|
||||||
- Case-insensitive comparison for text
|
- Case-insensitive comparison for text
|
||||||
|
|
||||||
Respond with only "YES" or "NO".`
|
Respond with only "YES" or "NO".
|
||||||
|
`.trim()
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const { text } = await generateText({
|
const { text } = await generateText({
|
||||||
|
|||||||
@@ -204,7 +204,7 @@ ${modelPerformance}
|
|||||||
|
|
||||||
#### What's Being Measured
|
#### What's Being Measured
|
||||||
|
|
||||||
This benchmark tests **LLM comprehension and data retrieval accuracy** when data is presented in different formats. Each LLM receives formatted data and must answer questions about it (this does NOT test LLM's ability to generate TOON output).
|
This benchmark tests **LLM comprehension and data retrieval accuracy** across different input formats. Each LLM receives formatted data and must answer questions about it (this does **not** test model's ability to generate TOON output).
|
||||||
|
|
||||||
#### Datasets Tested
|
#### Datasets Tested
|
||||||
|
|
||||||
@@ -233,18 +233,9 @@ Four datasets designed to test different structural patterns:
|
|||||||
|
|
||||||
#### Evaluation Process
|
#### Evaluation Process
|
||||||
|
|
||||||
1. **Format conversion**: Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML).
|
1. **Format conversion:** Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML).
|
||||||
2. **Query LLM**: Each model receives formatted data + question in a prompt.
|
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
|
||||||
3. **LLM responds**: Model extracts the answer from the data.
|
4. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).
|
||||||
4. **Validate with LLM-as-judge**: GPT-5-nano validates if the answer is semantically correct.
|
|
||||||
|
|
||||||
#### Semantic Validation
|
|
||||||
|
|
||||||
Answers are validated by an LLM judge (\`gpt-5-nano\`) using semantic equivalence, not exact string matching:
|
|
||||||
|
|
||||||
- **Numeric formats**: \`50000\` = \`$50,000\` = \`50000 dollars\` ✓
|
|
||||||
- **Case insensitive**: \`Engineering\` = \`engineering\` = \`ENGINEERING\` ✓
|
|
||||||
- **Minor formatting**: \`2025-01-01\` = \`January 1, 2025\` ✓
|
|
||||||
|
|
||||||
#### Models & Configuration
|
#### Models & Configuration
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user