Mirror of https://github.com/voson-wang/toon.git (synced 2026-01-29 23:34:10 +08:00)

docs: overhaul retrieval accuracy benchmark
@@ -5,9 +5,22 @@ export const ROOT_DIR: string = url.fileURLToPath(new URL('../../', import.meta.
export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.meta.url))

/**
 * Default concurrency for parallel evaluations
 * Model-specific RPM (requests per minute) limits to handle API quotas
 *
 * @remarks
 * Set `undefined` for models without specific limits
 */
export const DEFAULT_CONCURRENCY = 20
/// keep-sorted
export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
  'claude-haiku-4-5-20251001': 50,
  'gemini-2.5-flash': 25,
  'gpt-5-nano': undefined,
}

/**
 * Default concurrency for parallel evaluations to prevent bursting
 */
export const DEFAULT_CONCURRENCY = 10

/**
 * Progress bar configuration
@@ -28,13 +41,83 @@ export const PROGRESS_BAR = {
export const DRY_RUN: boolean = process.env.DRY_RUN === 'true'

/**
 * Limits applied when DRY_RUN is enabled
 * Limits applied during dry run mode
 */
export const DRY_RUN_LIMITS = {
  /** Maximum number of questions to evaluate */
  maxQuestions: 10,
  /** Maximum number of formats to test */
  maxFormats: undefined as number | undefined,
  /** Models to use in dry run */
  allowedModels: [] as string[],
}

/**
 * Threshold values for filtering and aggregation questions
 */
export const QUESTION_THRESHOLDS = {
  tabular: {
    salaryRanges: [60000, 80000, 100000, 120000],
    experienceYears: [5, 10, 15, 20],
    departmentSalaryThreshold: 80000,
    departmentExperienceThreshold: 10,
  },
  nested: {
    highValueOrders: [200, 400, 600],
    statusValueThreshold: 300,
    itemCountThreshold: 3,
    totalThresholdsForItems: [300, 500],
  },
  analytics: {
    views: [5000, 7000],
    conversions: [10, 30],
    viewsForFiltering: [6000, 7000],
    conversionsForFiltering: 15,
    revenueThresholds: [500, 1000, 1500, 2000, 2500],
    viewsThresholdForRevenue: 6000,
    clicksForFiltering: [250, 400],
    conversionsForClickFiltering: 15,
    revenueForBounceRate: [1000, 1500],
    bounceRateThreshold: 0.5,
  },
  github: {
    stars: [100000, 150000, 200000],
    forks: [20000, 35000, 50000],
    watchers: [5000, 8000],
    starForkCombinations: [
      { stars: 75000, forks: 15000 },
      { stars: 100000, forks: 20000 },
      { stars: 150000, forks: 30000 },
      { stars: 200000, forks: 45000 },
    ],
    starWatcherCombinations: [
      { stars: 100000, watchers: 7000 },
      { stars: 150000, watchers: 9000 },
    ],
  },
} as const

/**
 * Question generation configuration
 */
export const QUESTION_LIMITS = {
  tabular: {
    fieldRetrieval: 20,
    aggregationDepartments: 6,
    filteringMultiConditionDepartments: 6,
    filteringExperience: 4,
    filteringDepartmentExp: 3,
    filteringDepartmentActive: 3,
  },
  nested: {
    fieldRetrievalOrders: 8,
    fieldRetrievalCustomers: 10,
    aggregationStatuses: 5,
    filteringStatusAndValue: 5,
    filteringStatusAndItems: 3,
  },
  analytics: {
    fieldRetrievalDates: 13,
  },
  github: {
    fieldRetrievalRepos: 11,
    aggregationBranches: 2,
    filteringStarsAndForks: 8,
  },
} as const
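For context, here is one way the new rate-limit constants could be consumed. This is an illustrative sketch only, not part of the commit: the `delayForModel` and `throttled` helpers are hypothetical, and simply space requests for a model evenly according to its RPM entry, with no extra delay when the entry is `undefined`.

```ts
// Hypothetical helpers showing one way MODEL_RPM_LIMITS could be applied.
import { MODEL_RPM_LIMITS } from './constants'

// Convert an RPM limit into a minimum spacing between requests (in ms).
function delayForModel(modelId: string): number {
  const rpm = MODEL_RPM_LIMITS[modelId]
  if (rpm === undefined)
    return 0 // no model-specific limit configured
  return Math.ceil(60_000 / rpm) // e.g. 50 RPM -> one request every 1200 ms
}

// Run a task, then wait long enough to stay under the model's RPM quota.
async function throttled<T>(modelId: string, task: () => Promise<T>): Promise<T> {
  const result = await task()
  await new Promise(resolve => setTimeout(resolve, delayForModel(modelId)))
  return result
}
```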
@@ -1,12 +1,3 @@
/**
 * Datasets for TOON benchmarks
 *
 * These datasets are designed to test TOON's strengths and weaknesses:
 * - Tabular: Uniform records (TOON optimal)
 * - Nested: Complex structures with nested objects
 * - Analytics: Time-series data
 */

import type { Dataset } from './types'
import { faker } from '@faker-js/faker'
import githubRepos from '../data/github-repos.json' with { type: 'json' }
@@ -128,7 +119,7 @@ const tabularDataset: Dataset = {
  description: 'Uniform employee records (TOON optimal format)',
  data: {
    employees: Array.from({ length: 100 }, (_, i): Employee => {
      const yearsExp = faker.number.int({ min: 1, max: 20 })
      const yearsExp = faker.number.int({ min: 1, max: 25 })
      return {
        id: i + 1,
        name: faker.person.fullName(),
@@ -1,28 +1,19 @@
/**
 * LLM evaluation logic for TOON benchmarks
 *
 * Handles:
 * - Model configuration
 * - Question evaluation with LLMs
 * - Answer validation using LLM-as-judge
 */

import type { LanguageModelV2 } from '@ai-sdk/provider'
import type { EvaluationResult, Question } from './types'
import { anthropic } from '@ai-sdk/anthropic'
import { google } from '@ai-sdk/google'
import { openai } from '@ai-sdk/openai'
import * as prompts from '@clack/prompts'
import { generateText } from 'ai'
import { consola } from 'consola'

/**
 * Models used for evaluation
 */
export const models: Record<string, LanguageModelV2> = {
  'gpt-5-nano': openai('gpt-5-nano'),
  'claude-haiku-4-5': anthropic('claude-haiku-4-5-20251001'),
  'gemini-2.5-flash': google('gemini-2.5-flash'),
}
export const models: LanguageModelV2[] = [
  openai('gpt-5-nano'),
  google('gemini-2.5-flash'),
  anthropic('claude-haiku-4-5-20251001'),
]

/**
 * Evaluate a single question with a specific format and model
@@ -33,14 +24,12 @@ export async function evaluateQuestion(
  formatName,
  formattedData,
  model,
  modelName,
}:
{
  question: Question
  formatName: string
  formattedData: string
  model: LanguageModelV2
  modelName: string
},
): Promise<EvaluationResult> {
  const prompt = `
@@ -59,10 +48,11 @@ Provide only the direct answer, without any additional explanation or formatting
  const { text, usage } = await generateText({
    model,
    prompt,
    temperature: !model.modelId.startsWith('gpt-') ? 0 : undefined,
    temperature: !model.modelId.startsWith('gpt-5') ? 0 : undefined,
  })

  const latencyMs = performance.now() - startTime

  const isCorrect = await validateAnswer({
    actual: text.trim(),
    expected: question.groundTruth,
@@ -72,7 +62,7 @@ Provide only the direct answer, without any additional explanation or formatting
  return {
    questionId: question.id,
    format: formatName,
    model: modelName,
    model: model.modelId,
    expected: question.groundTruth,
    actual: text.trim(),
    isCorrect,
@@ -115,14 +105,14 @@ Respond with only "YES" or "NO".

  try {
    const { text } = await generateText({
      model: models['gpt-5-nano']!,
      model: models.find(m => m.modelId === 'gpt-5-nano')!,
      prompt,
    })

    return text.trim().toUpperCase() === 'YES'
  }
  catch (error) {
    consola.error('Validation error:', error)
    prompts.log.error(`Validation error: ${error}`)
    // Fallback to simple string comparison
    return actual.toLowerCase().trim() === expected.toLowerCase().trim()
  }
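To show how the reworked `models` array and the slimmed-down `evaluateQuestion` signature might fit together, here is a hedged driver sketch. The loop, the `./formatters` and `./questions` module paths, and the overall wiring are assumptions, not code from this commit.

```ts
// Assumed wiring: evaluate every question in every format with every model.
import type { EvaluationResult } from './types'
import { evaluateQuestion, models } from './evaluate'
import { formatters } from './formatters' // path assumed
import { generateQuestions } from './questions' // path assumed
import { datasets } from './datasets'

async function runAll(): Promise<EvaluationResult[]> {
  const questions = generateQuestions()
  const results: EvaluationResult[] = []

  for (const model of models) {
    for (const [formatName, toFormat] of Object.entries(formatters)) {
      for (const question of questions) {
        const dataset = datasets.find(d => d.name === question.dataset)
        if (!dataset)
          continue
        results.push(await evaluateQuestion({
          question,
          formatName,
          formattedData: toFormat(dataset.data),
          model,
        }))
      }
    }
  }
  return results
}
```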
@@ -1,20 +1,3 @@
/**
 * Format converters for TOON benchmarks
 *
 * Converts data to different formats for comparison:
 * - JSON
 * - TOON
 * - CSV
 * - XML
 * - YAML
 *
 * ## Semantic Equivalence
 *
 * All formatters attempt to preserve semantic equivalence with the source data,
 * meaning the converted data should represent the same information. However,
 * CSV has inherent limitations with nested structures (see `toCSV` docs).
 */

import { stringify as stringifyCSV } from 'csv-stringify/sync'
import { XMLBuilder } from 'fast-xml-parser'
import { stringify as stringifyYAML } from 'yaml'
@@ -23,7 +6,10 @@ import { encode as encodeToon } from '../../src/index'
/**
 * Format converters registry
 *
 * Each formatter takes unknown data and returns a string representation
 * @remarks
 * All formatters attempt to preserve semantic equivalence with the source data,
 * meaning the converted data should represent the same information. However,
 * CSV has inherent limitations with nested structures (see `toCSV` docs).
 */
export const formatters: Record<string, (data: unknown) => string> = {
  json: data => JSON.stringify(data, undefined, 2),
@@ -37,11 +23,13 @@ export const formatters: Record<string, (data: unknown) => string> = {
 * Convert data to CSV format
 *
 * @remarks
 * **Limitations**: CSV is designed for flat tabular data only. This formatter:
 * - Only handles top-level objects with arrays of flat objects
 * - Cannot properly represent deeply nested structures (nested arrays/objects within rows)
 * - Loses nested structure information during conversion
 * - May produce misleading results for datasets with complex nesting (e.g., e-commerce orders with nested items)
 * Limitations: CSV is designed for flat tabular data only.
 *
 * This formatter:
 * - Only handles top-level objects with arrays of flat objects
 * - Cannot properly represent deeply nested structures (nested arrays/objects within rows)
 * - Loses nested structure information during conversion
 * - May produce misleading results for datasets with complex nesting (e.g., e-commerce orders with nested items)
 *
 * For datasets with nested structures, CSV comparisons may not be fair or representative
 * of how CSV would typically be used in practice.
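As a usage sketch (not part of the diff), the registry can serialize each dataset in every format so token costs can be compared; the `./formatters` module path and the inline `tokenize` helper are assumptions.

```ts
// Sketch: token cost of the same dataset under each serialization format.
import { encode } from 'gpt-tokenizer'
import { datasets } from './datasets'
import { formatters } from './formatters' // path assumed

const tokenize = (text: string): number => encode(text).length

for (const dataset of datasets) {
  for (const [name, toFormat] of Object.entries(formatters)) {
    const serialized = toFormat(dataset.data)
    console.log(`${dataset.name} as ${name}: ${tokenize(serialized)} tokens`)
  }
}
```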
@@ -1,24 +1,18 @@
/**
 * Question generation for TOON benchmarks
 *
 * Generates ~160 questions across different types:
 * - Field retrieval (50%): "What is X's Y?"
 * - Aggregation (25%): "How many X have Y?"
 * - Filtering (25%): "List/count X where Y"
 *
 * Questions are generated dynamically based on actual data values
 *
 * TODO: Balance question distribution across datasets to ensure fair representation.
 * Current distribution:
 * - Tabular: 70 questions (43%)
 * - Nested: 50 questions (31%)
 * - Analytics: 40 questions (25%)
 * - GitHub: 40 questions (25%)
 * Generates ~150-160 questions across different question types and datasets:
 * - Field Retrieval: Direct field access with no computation
 *   Examples: "What is X's salary?", "What is the status of order Y?"
 * - Aggregation: Counts, sums, averages, min/max operations (including single-condition filters)
 *   Examples: "How many X?", "What is the total/average?", "How many X > threshold?"
 * - Filtering: Multi-condition queries requiring complex logical operations
 *   Examples: "How many X WHERE condition1 AND condition2?"
 */

import type { AnalyticsMetric, Employee, Order, Repository } from './datasets'
import type { Question } from './types'
import { consola } from 'consola'
import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from './constants'
import { datasets } from './datasets'

/**
@@ -34,19 +28,15 @@ export function generateQuestions(): Question[] {
  const analytics = (datasets.find(d => d.name === 'analytics')?.data.metrics as AnalyticsMetric[]) ?? []
  const github = (datasets.find(d => d.name === 'github')?.data.repositories as Repository[]) ?? []

  // ========================================
  // TABULAR DATASET QUESTIONS (70 questions)
  // ========================================

  if (tabular.length > 0) {
    // Field retrieval: specific employees (40 questions)
    for (let i = 0; i < Math.min(40, tabular.length); i++) {
    // Field retrieval: specific employees
    for (let i = 0; i < Math.min(QUESTION_LIMITS.tabular.fieldRetrieval, tabular.length); i++) {
      const emp = tabular[i * 2] || tabular[i]
      if (!emp)
        continue

      // Alternate between different field types
      if (i % 3 === 0) {
      // Rotate through all field types
      if (i % 5 === 0) {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `What is the salary of ${emp.name}?`,
@@ -55,7 +45,7 @@ export function generateQuestions(): Question[] {
          dataset: 'tabular',
        })
      }
      else if (i % 3 === 1) {
      else if (i % 5 === 1) {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `What department does ${emp.name} work in?`,
@@ -64,7 +54,7 @@ export function generateQuestions(): Question[] {
          dataset: 'tabular',
        })
      }
      else {
      else if (i % 5 === 2) {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `What is the email address of ${emp.name}?`,
@@ -73,11 +63,29 @@ export function generateQuestions(): Question[] {
          dataset: 'tabular',
        })
      }
      else if (i % 5 === 3) {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `How many years of experience does ${emp.name} have?`,
          groundTruth: String(emp.yearsExperience),
          type: 'field-retrieval',
          dataset: 'tabular',
        })
      }
      else {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `Is ${emp.name} an active employee?`,
          groundTruth: emp.active ? 'yes' : 'no',
          type: 'field-retrieval',
          dataset: 'tabular',
        })
      }
    }

    // Aggregation: count by department
    const departments = [...new Set(tabular.map(e => e.department))]
    for (const dept of departments.slice(0, 6)) {
    for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.aggregationDepartments)) {
      const count = tabular.filter(e => e.department === dept).length
      questions.push({
        id: `q${idCounter++}`,
@@ -88,9 +96,8 @@ export function generateQuestions(): Question[] {
      })
    }

    // Aggregation: salary ranges (4 questions)
    const salaryThresholds = [60000, 80000, 100000, 120000]
    for (const threshold of salaryThresholds) {
    // Aggregation: salary ranges (single-condition filters)
    for (const threshold of QUESTION_THRESHOLDS.tabular.salaryRanges) {
      const count = tabular.filter(e => e.salary > threshold).length
      questions.push({
        id: `q${idCounter++}`,
@@ -101,39 +108,57 @@ export function generateQuestions(): Question[] {
      })
    }

    // Filtering: active status
    // Aggregation: totals and averages
    const totalEmployees = tabular.length
    const avgSalary = Math.round(tabular.reduce((sum, e) => sum + e.salary, 0) / totalEmployees)
    const activeCount = tabular.filter(e => e.active).length
    const inactiveCount = tabular.filter(e => !e.active).length

    questions.push(
      {
        id: `q${idCounter++}`,
        prompt: 'How many employees are in the dataset?',
        groundTruth: String(totalEmployees),
        type: 'aggregation',
        dataset: 'tabular',
      },
      {
        id: `q${idCounter++}`,
        prompt: 'What is the average salary across all employees?',
        groundTruth: String(avgSalary),
        type: 'aggregation',
        dataset: 'tabular',
      },
      {
        id: `q${idCounter++}`,
        prompt: 'How many employees are active?',
        groundTruth: String(activeCount),
        type: 'filtering',
        type: 'aggregation',
        dataset: 'tabular',
      },
      {
        id: `q${idCounter++}`,
        prompt: 'How many employees are inactive?',
        groundTruth: String(inactiveCount),
        type: 'filtering',
        type: 'aggregation',
        dataset: 'tabular',
      },
    )

    // Complex filtering: multi-condition (8 questions)
    for (const dept of departments.slice(0, 4)) {
      const count = tabular.filter(e => e.department === dept && e.salary > 80000).length
    // Filtering: count by department with salary filter (multi-condition)
    for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringMultiConditionDepartments)) {
      const count = tabular.filter(e => e.department === dept && e.salary > QUESTION_THRESHOLDS.tabular.departmentSalaryThreshold).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many employees in ${dept} have a salary greater than 80000?`,
        prompt: `How many employees in ${dept} have a salary greater than ${QUESTION_THRESHOLDS.tabular.departmentSalaryThreshold}?`,
        groundTruth: String(count),
        type: 'filtering',
        dataset: 'tabular',
      })
    }

    for (const exp of [5, 10]) {
    // Filtering: active employees by experience (multi-condition)
    for (const exp of QUESTION_THRESHOLDS.tabular.experienceYears.slice(0, QUESTION_LIMITS.tabular.filteringExperience)) {
      const count = tabular.filter(e => e.yearsExperience > exp && e.active).length
      questions.push({
        id: `q${idCounter++}`,
@@ -143,15 +168,35 @@ export function generateQuestions(): Question[] {
        dataset: 'tabular',
      })
    }

    // Filtering: department by experience (multi-condition)
    for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentExp)) {
      const count = tabular.filter(e => e.department === dept && e.yearsExperience > QUESTION_THRESHOLDS.tabular.departmentExperienceThreshold).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many employees in ${dept} have more than ${QUESTION_THRESHOLDS.tabular.departmentExperienceThreshold} years of experience?`,
        groundTruth: String(count),
        type: 'filtering',
        dataset: 'tabular',
      })
    }

    // Filtering: department by active status (multi-condition)
    for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentActive)) {
      const count = tabular.filter(e => e.department === dept && e.active).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many active employees work in ${dept}?`,
        groundTruth: String(count),
        type: 'filtering',
        dataset: 'tabular',
      })
    }
  }

  // ========================================
  // NESTED DATASET QUESTIONS (50 questions)
  // ========================================

  if (nested.length > 0) {
    // Field retrieval: order totals (20 questions)
    for (let i = 0; i < Math.min(20, nested.length); i++) {
    // Field retrieval: order totals and statuses
    for (let i = 0; i < Math.min(QUESTION_LIMITS.nested.fieldRetrievalOrders, nested.length); i++) {
      const order = nested[i * 2] || nested[i]
      if (!order)
        continue
@@ -159,7 +204,7 @@ export function generateQuestions(): Question[] {
      if (i % 2 === 0) {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `What is the total amount for order ${order.orderId}?`,
          prompt: `What is the total for order ${order.orderId}?`,
          groundTruth: String(order.total),
          type: 'field-retrieval',
          dataset: 'nested',
@@ -176,51 +221,143 @@ export function generateQuestions(): Question[] {
      }
    }

    // Field retrieval: customer info (15 questions)
    for (let i = 0; i < Math.min(15, nested.length); i++) {
      const order = nested[i * 3] || nested[i]
    // Field retrieval: customer info and order dates (expanded)
    for (let i = 0; i < Math.min(QUESTION_LIMITS.nested.fieldRetrievalCustomers, nested.length); i++) {
      const order = nested[i * 2 + 1] || nested[i]
      if (!order)
        continue

      questions.push({
        id: `q${idCounter++}`,
        prompt: `What is the customer name for order ${order.orderId}?`,
        groundTruth: order.customer.name,
        type: 'field-retrieval',
        dataset: 'nested',
      })
      if (i % 4 === 0) {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `What is the customer name for order ${order.orderId}?`,
          groundTruth: order.customer.name,
          type: 'field-retrieval',
          dataset: 'nested',
        })
      }
      else if (i % 4 === 1) {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `What is the customer email for order ${order.orderId}?`,
          groundTruth: order.customer.email,
          type: 'field-retrieval',
          dataset: 'nested',
        })
      }
      else if (i % 4 === 2) {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `What is the order date for order ${order.orderId}?`,
          groundTruth: order.orderDate || '',
          type: 'field-retrieval',
          dataset: 'nested',
        })
      }
      else {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `How many items are in order ${order.orderId}?`,
          groundTruth: String(order.items.length),
          type: 'field-retrieval',
          dataset: 'nested',
        })
      }
    }

    // Aggregation: count by status
    // Aggregation: totals and averages
    const totalRevenue = nested.reduce((sum, o) => sum + o.total, 0)
    const avgOrderValue = totalRevenue / nested.length
    const totalOrders = nested.length
    const maxOrderValue = Math.max(...nested.map(o => o.total))

    // Count by status
    const statuses = [...new Set(nested.map(o => o.status))]
    for (const status of statuses) {
    for (const status of statuses.slice(0, QUESTION_LIMITS.nested.aggregationStatuses)) {
      const count = nested.filter(o => o.status === status).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many orders have status "${status}"?`,
        groundTruth: String(count),
        type: 'aggregation',
        dataset: 'nested',
      })
    }

    questions.push(
      {
        id: `q${idCounter++}`,
        prompt: 'What is the total revenue across all orders?',
        groundTruth: String(totalRevenue.toFixed(2)),
        type: 'aggregation',
        dataset: 'nested',
      },
      {
        id: `q${idCounter++}`,
        prompt: 'What is the average order value?',
        groundTruth: String(avgOrderValue.toFixed(2)),
        type: 'aggregation',
        dataset: 'nested',
      },
      {
        id: `q${idCounter++}`,
        prompt: 'How many orders are in the dataset?',
        groundTruth: String(totalOrders),
        type: 'aggregation',
        dataset: 'nested',
      },
      {
        id: `q${idCounter++}`,
        prompt: 'What is the highest order total?',
        groundTruth: String(maxOrderValue.toFixed(2)),
        type: 'aggregation',
        dataset: 'nested',
      },
    )

    // Aggregation: high-value orders (single-condition filter)
    for (const threshold of QUESTION_THRESHOLDS.nested.highValueOrders) {
      const count = nested.filter(o => o.total > threshold).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many orders have a total greater than ${threshold}?`,
        groundTruth: String(count),
        type: 'aggregation',
        dataset: 'nested',
      })
    }

    // Filtering: multi-condition queries (status AND value)
    const orderStatuses = [...new Set(nested.map(o => o.status))]
    for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndValue)) {
      const count = nested.filter(o => o.status === status && o.total > QUESTION_THRESHOLDS.nested.statusValueThreshold).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many orders have status "${status}" and total greater than ${QUESTION_THRESHOLDS.nested.statusValueThreshold}?`,
        groundTruth: String(count),
        type: 'filtering',
        dataset: 'nested',
      })
    }

    // Aggregation: total revenue
    const totalRevenue = nested.reduce((sum, o) => sum + o.total, 0)
    questions.push({
      id: `q${idCounter++}`,
      prompt: 'What is the total revenue across all orders?',
      groundTruth: String(totalRevenue.toFixed(2)),
      type: 'aggregation',
      dataset: 'nested',
    })

    // Filtering: high-value orders (3 questions)
    const highValueThresholds = [200, 400, 600]
    for (const threshold of highValueThresholds) {
      const count = nested.filter(o => o.total > threshold).length
    // Filtering: status AND items count (multi-condition)
    for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndItems)) {
      const count = nested.filter(o => o.status === status && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many orders have a total greater than ${threshold}?`,
        prompt: `How many orders have status "${status}" and at least ${QUESTION_THRESHOLDS.nested.itemCountThreshold} items?`,
        groundTruth: String(count),
        type: 'filtering',
        dataset: 'nested',
      })
    }

    // Filtering: total AND items count (multi-condition)
    for (const threshold of QUESTION_THRESHOLDS.nested.totalThresholdsForItems) {
      const count = nested.filter(o => o.total > threshold && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many orders have a total greater than ${threshold} and at least ${QUESTION_THRESHOLDS.nested.itemCountThreshold} items?`,
        groundTruth: String(count),
        type: 'filtering',
        dataset: 'nested',
@@ -228,18 +365,14 @@ export function generateQuestions(): Question[] {
    }
  }

  // ========================================
  // ANALYTICS DATASET QUESTIONS (40 questions)
  // ========================================

  if (analytics.length > 0) {
    // Field retrieval: specific dates (20 questions)
    for (let i = 0; i < Math.min(20, analytics.length); i++) {
    // Field retrieval: specific dates (expanded with all metrics)
    for (let i = 0; i < Math.min(QUESTION_LIMITS.analytics.fieldRetrievalDates, analytics.length); i++) {
      const metric = analytics[i * 3] || analytics[i]
      if (!metric)
        continue

      if (i % 2 === 0) {
      if (i % 5 === 0) {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `How many views were recorded on ${metric.date}?`,
@@ -248,7 +381,7 @@ export function generateQuestions(): Question[] {
          dataset: 'analytics',
        })
      }
      else {
      else if (i % 5 === 1) {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `What was the revenue on ${metric.date}?`,
@@ -257,12 +390,42 @@ export function generateQuestions(): Question[] {
          dataset: 'analytics',
        })
      }
      else if (i % 5 === 2) {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `What was the conversion count on ${metric.date}?`,
          groundTruth: String(metric.conversions),
          type: 'field-retrieval',
          dataset: 'analytics',
        })
      }
      else if (i % 5 === 3) {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `How many clicks were recorded on ${metric.date}?`,
          groundTruth: String(metric.clicks),
          type: 'field-retrieval',
          dataset: 'analytics',
        })
      }
      else {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `What was the bounce rate on ${metric.date}?`,
          groundTruth: String(metric.bounceRate),
          type: 'field-retrieval',
          dataset: 'analytics',
        })
      }
    }

    // Aggregation: totals (4 questions)
    // Aggregation: totals and averages
    const totalViews = analytics.reduce((sum, m) => sum + m.views, 0)
    const totalRevenue = analytics.reduce((sum, m) => sum + m.revenue, 0)
    const totalConversions = analytics.reduce((sum, m) => sum + m.conversions, 0)
    const avgViews = Math.round(totalViews / analytics.length)
    const avgRevenue = totalRevenue / analytics.length
    const avgConversions = Math.round(totalConversions / analytics.length)

    questions.push(
      {
@@ -286,27 +449,97 @@ export function generateQuestions(): Question[] {
        type: 'aggregation',
        dataset: 'analytics',
      },
      {
        id: `q${idCounter++}`,
        prompt: 'What is the average number of views per day?',
        groundTruth: String(avgViews),
        type: 'aggregation',
        dataset: 'analytics',
      },
      {
        id: `q${idCounter++}`,
        prompt: 'What is the average revenue per day?',
        groundTruth: String(avgRevenue.toFixed(2)),
        type: 'aggregation',
        dataset: 'analytics',
      },
      {
        id: `q${idCounter++}`,
        prompt: 'What is the average number of conversions per day?',
        groundTruth: String(avgConversions),
        type: 'aggregation',
        dataset: 'analytics',
      },
      {
        id: `q${idCounter++}`,
        prompt: 'How many days are included in the analytics data?',
        groundTruth: String(analytics.length),
        type: 'aggregation',
        dataset: 'analytics',
      },
      {
        id: `q${idCounter++}`,
        prompt: 'What is the highest number of views recorded in a single day?',
        groundTruth: String(Math.max(...analytics.map(m => m.views))),
        type: 'aggregation',
        dataset: 'analytics',
      },
    )

    // Filtering: high-performing days (10 questions)
    const viewThresholds = [5000, 6000, 7000]
    for (const threshold of viewThresholds) {
    // Aggregation: high-performing days (single-condition filters)
    for (const threshold of QUESTION_THRESHOLDS.analytics.views) {
      const count = analytics.filter(m => m.views > threshold).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many days had more than ${threshold} views?`,
        groundTruth: String(count),
        type: 'aggregation',
        dataset: 'analytics',
      })
    }

    // Filtering: multi-condition queries (views AND conversions)
    for (const viewThreshold of QUESTION_THRESHOLDS.analytics.viewsForFiltering) {
      const count = analytics.filter(m => m.views > viewThreshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForFiltering).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many days had more than ${viewThreshold} views and more than ${QUESTION_THRESHOLDS.analytics.conversionsForFiltering} conversions?`,
        groundTruth: String(count),
        type: 'filtering',
        dataset: 'analytics',
      })
    }

    const conversionThresholds = [10, 20, 30]
    for (const threshold of conversionThresholds) {
      const count = analytics.filter(m => m.conversions > threshold).length
    // Filtering: views AND revenue (expanded)
    for (const revenueThreshold of QUESTION_THRESHOLDS.analytics.revenueThresholds.slice(0, 5)) {
      const count = analytics.filter(m => m.views > QUESTION_THRESHOLDS.analytics.viewsThresholdForRevenue && m.revenue > revenueThreshold).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many days had more than ${threshold} conversions?`,
        prompt: `How many days had more than ${QUESTION_THRESHOLDS.analytics.viewsThresholdForRevenue} views and revenue greater than ${revenueThreshold}?`,
        groundTruth: String(count),
        type: 'filtering',
        dataset: 'analytics',
      })
    }

    // Filtering: clicks AND conversions (multi-condition)
    for (const clickThreshold of QUESTION_THRESHOLDS.analytics.clicksForFiltering) {
      const count = analytics.filter(m => m.clicks > clickThreshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForClickFiltering).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many days had more than ${clickThreshold} clicks and more than ${QUESTION_THRESHOLDS.analytics.conversionsForClickFiltering} conversions?`,
        groundTruth: String(count),
        type: 'filtering',
        dataset: 'analytics',
      })
    }

    // Filtering: revenue AND bounce rate (multi-condition)
    for (const revenueThreshold of QUESTION_THRESHOLDS.analytics.revenueForBounceRate) {
      const count = analytics.filter(m => m.revenue > revenueThreshold && m.bounceRate < QUESTION_THRESHOLDS.analytics.bounceRateThreshold).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many days had revenue greater than ${revenueThreshold} and bounce rate less than ${QUESTION_THRESHOLDS.analytics.bounceRateThreshold}?`,
        groundTruth: String(count),
        type: 'filtering',
        dataset: 'analytics',
@@ -314,79 +547,159 @@ export function generateQuestions(): Question[] {
    }
  }

  // ========================================
  // GITHUB DATASET QUESTIONS (40 questions)
  // ========================================

  if (github.length > 0) {
    // Field retrieval: specific repos (20 questions)
    for (let i = 0; i < Math.min(20, github.length); i++) {
      const repo = github[i * 10] || github[i]
    // Helper to extract owner from repo field
    const getOwner = (repoFullName: string) => repoFullName.split('/')[0]!

    // Field retrieval: specific repos (diverse fields)
    for (let i = 0; i < Math.min(QUESTION_LIMITS.github.fieldRetrievalRepos, github.length); i++) {
      const repo = github[i * 7]
      if (!repo)
        continue

      if (i % 2 === 0) {
      if (i % 5 === 0) {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `How many stars does ${repo.owner}/${repo.name} have?`,
          prompt: `How many stars does ${repo.repo} have?`,
          groundTruth: String(repo.stars),
          type: 'field-retrieval',
          dataset: 'github',
        })
      }
      else if (i % 5 === 1) {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `How many forks does ${repo.repo} have?`,
          groundTruth: String(repo.forks),
          type: 'field-retrieval',
          dataset: 'github',
        })
      }
      else if (i % 5 === 2) {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `Who is the owner of ${repo.repo}?`,
          groundTruth: getOwner(repo.repo),
          type: 'field-retrieval',
          dataset: 'github',
        })
      }
      else if (i % 5 === 3) {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `What is the default branch of ${repo.repo}?`,
          groundTruth: repo.defaultBranch,
          type: 'field-retrieval',
          dataset: 'github',
        })
      }
      else {
        questions.push({
          id: `q${idCounter++}`,
          prompt: `How many forks does ${repo.owner}/${repo.name} have?`,
          groundTruth: String(repo.forks),
          prompt: `How many watchers does ${repo.repo} have?`,
          groundTruth: String(repo.watchers),
          type: 'field-retrieval',
          dataset: 'github',
        })
      }
    }

    // Aggregation: count by owner (5 questions)
    const owners = [...new Set(github.map(r => r.owner))]
    for (const owner of owners.slice(0, 5)) {
      const count = github.filter(r => r.owner === owner).length
    // Aggregation: popular repositories
    const totalStars = github.reduce((sum, r) => sum + r.stars, 0)
    const totalRepos = github.length
    const avgStars = Math.round(totalStars / totalRepos)

    questions.push(
      {
        id: `q${idCounter++}`,
        prompt: 'What is the total number of stars across all repositories?',
        groundTruth: String(totalStars),
        type: 'aggregation',
        dataset: 'github',
      },
      {
        id: `q${idCounter++}`,
        prompt: 'How many repositories are in the dataset?',
        groundTruth: String(totalRepos),
        type: 'aggregation',
        dataset: 'github',
      },
      {
        id: `q${idCounter++}`,
        prompt: 'What is the average number of stars per repository?',
        groundTruth: String(avgStars),
        type: 'aggregation',
        dataset: 'github',
      },
    )

    // Aggregation: star thresholds (single-condition filters)
    for (const threshold of QUESTION_THRESHOLDS.github.stars) {
      const count = github.filter(r => r.stars > threshold).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many repositories does ${owner} have in the dataset?`,
        prompt: `How many repositories have more than ${threshold} stars?`,
        groundTruth: String(count),
        type: 'aggregation',
        dataset: 'github',
      })
    }

    // Aggregation: total stars
    const totalStars = github.reduce((sum, r) => sum + r.stars, 0)
    questions.push({
      id: `q${idCounter++}`,
      prompt: 'What is the total number of stars across all repositories?',
      groundTruth: String(totalStars),
      type: 'aggregation',
      dataset: 'github',
    })

    // Filtering: popular repos (8 questions)
    const starThresholds = [10000, 50000, 100000]
    for (const threshold of starThresholds) {
      const count = github.filter(r => r.stars > threshold).length
    // Aggregation: fork thresholds (single-condition filters)
    for (const threshold of QUESTION_THRESHOLDS.github.forks) {
      const count = github.filter(r => r.forks > threshold).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many repositories have more than ${threshold} stars?`,
        prompt: `How many repositories have more than ${threshold} forks?`,
        groundTruth: String(count),
        type: 'aggregation',
        dataset: 'github',
      })
    }

    // Aggregation: watcher thresholds (single-condition filters)
    for (const threshold of QUESTION_THRESHOLDS.github.watchers) {
      const count = github.filter(r => r.watchers > threshold).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many repositories have more than ${threshold} watchers?`,
        groundTruth: String(count),
        type: 'aggregation',
        dataset: 'github',
      })
    }

    // Aggregation: default branch counts
    const branches = [...new Set(github.map(r => r.defaultBranch))]
    for (const branch of branches.slice(0, QUESTION_LIMITS.github.aggregationBranches)) {
      const count = github.filter(r => r.defaultBranch === branch).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many repositories use "${branch}" as their default branch?`,
        groundTruth: String(count),
        type: 'aggregation',
        dataset: 'github',
      })
    }

    // Filtering: multi-condition queries (stars AND forks)
    for (const combo of QUESTION_THRESHOLDS.github.starForkCombinations.slice(0, QUESTION_LIMITS.github.filteringStarsAndForks)) {
      const count = github.filter(r => r.stars > combo.stars && r.forks > combo.forks).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many repositories have more than ${combo.stars} stars and more than ${combo.forks} forks?`,
        groundTruth: String(count),
        type: 'filtering',
        dataset: 'github',
      })
    }

    const forkThresholds = [1000, 5000, 10000]
    for (const threshold of forkThresholds) {
      const count = github.filter(r => r.forks > threshold).length
    // Filtering: stars AND watchers (multi-condition)
    for (const combo of QUESTION_THRESHOLDS.github.starWatcherCombinations) {
      const count = github.filter(r => r.stars > combo.stars && r.watchers > combo.watchers).length
      questions.push({
        id: `q${idCounter++}`,
        prompt: `How many repositories have more than ${threshold} forks?`,
        prompt: `How many repositories have more than ${combo.stars} stars and more than ${combo.watchers} watchers?`,
        groundTruth: String(count),
        type: 'filtering',
        dataset: 'github',
@@ -394,14 +707,5 @@ export function generateQuestions(): Question[] {
    }
  }

  consola.info(`Question breakdown:`)
  consola.box(`
Tabular: ${questions.filter(q => q.dataset === 'tabular').length}
Nested: ${questions.filter(q => q.dataset === 'nested').length}
Analytics: ${questions.filter(q => q.dataset === 'analytics').length}
GitHub: ${questions.filter(q => q.dataset === 'github').length}
Total: ${questions.length}
  `.trim())

  return questions
}
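For reference, the shape of a generated question can be inferred from the `questions.push` calls above. The interface below is an inferred sketch (the canonical `Question` type lives in `./types`), followed by a purely illustrative entry.

```ts
// Inferred from usage in generateQuestions(); not the canonical definition.
interface Question {
  id: string // e.g. 'q17'
  prompt: string // natural-language question given to the model
  groundTruth: string // expected answer, always stringified
  type: 'field-retrieval' | 'aggregation' | 'filtering'
  dataset: 'tabular' | 'nested' | 'analytics' | 'github'
}

// Illustrative example of one generated entry (employee name is hypothetical):
const example: Question = {
  id: 'q1',
  prompt: 'What is the salary of Alice Johnson?',
  groundTruth: '75000',
  type: 'field-retrieval',
  dataset: 'tabular',
}
```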
@@ -1,21 +1,9 @@
/**
 * Report generation for TOON benchmarks
 *
 * Handles:
 * - Statistical analysis
 * - Markdown report generation with visual elements
 * - Per-dataset breakdowns
 * - Cost analysis
 * - Result file saving
 */

import type { EvaluationResult, FormatResult, Question } from './types'
import * as fsp from 'node:fs/promises'
import * as path from 'node:path'
import { BENCHMARKS_DIR } from './constants'
import { datasets } from './datasets'
import { models } from './evaluate'
import { createProgressBar, ensureDir, saveJsonFile, tokenize } from './utils'
import { createProgressBar, ensureDir, tokenize } from './utils'

/**
 * Calculate per-format statistics from evaluation results
@@ -63,8 +51,8 @@ export function generateMarkdownReport(
  const json = formatResults.find(r => r.format === 'json')

  // Build model-by-model breakdown with ASCII bars
  const modelCount = Object.keys(models).length
  const modelNames = Object.keys(models)
  const modelNames = [...new Set(results.map(r => r.model))].reverse()
  const modelCount = modelNames.length

  const modelBreakdown = modelNames.map((modelName, i) => {
    const modelResults = formatResults.map((fr) => {
@@ -136,7 +124,7 @@ export function generateMarkdownReport(
  })

  const tableRows = datasetResults.slice(0, 6).map(result =>
    `| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.tokens.toLocaleString()} | ${result.correctCount}/${result.totalCount} |`,
    `| \`${result.format}\` | ${(result.accuracy * 100).toFixed(1)}% | ${result.tokens.toLocaleString('en-US')} | ${result.correctCount}/${result.totalCount} |`,
  ).join('\n')

  return `
@@ -180,6 +168,27 @@ ${tableRows}
  // Calculate total unique questions
  const totalQuestions = [...new Set(results.map(r => r.questionId))].length

  // Calculate question type distribution
  const fieldRetrievalCount = questions.filter(q => q.type === 'field-retrieval').length
  const aggregationCount = questions.filter(q => q.type === 'aggregation').length
  const filteringCount = questions.filter(q => q.type === 'filtering').length

  const fieldRetrievalPercent = ((fieldRetrievalCount / totalQuestions) * 100).toFixed(0)
  const aggregationPercent = ((aggregationCount / totalQuestions) * 100).toFixed(0)
  const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)

  // Calculate dataset sizes
  const tabularSize = datasets.find(d => d.name === 'tabular')?.data.employees?.length || 0
  const nestedSize = datasets.find(d => d.name === 'nested')?.data.orders?.length || 0
  const analyticsSize = datasets.find(d => d.name === 'analytics')?.data.metrics?.length || 0
  const githubSize = datasets.find(d => d.name === 'github')?.data.repositories?.length || 0

  // Calculate number of formats and models
  const formatCount = formatResults.length
  const modelsUsed = [...new Set(results.map(r => r.model))]
  const modelsListStr = modelsUsed.map(m => `\`${m}\``).join(', ')
  const totalEvaluations = totalQuestions * formatCount * modelsUsed.length

  return `
### Retrieval Accuracy

@@ -213,39 +222,41 @@ This benchmark tests **LLM comprehension and data retrieval accuracy** across di

Four datasets designed to test different structural patterns:

1. **Tabular** (100 employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
2. **Nested** (50 e-commerce orders): Complex structures with nested customer objects and item arrays.
3. **Analytics** (60 days of metrics): Time-series data with dates and numeric values.
4. **GitHub** (100 repositories): Real-world data from top GitHub repos by stars.
1. **Tabular** (${tabularSize} employee records): Uniform objects with identical fields – optimal for TOON's tabular format.
2. **Nested** (${nestedSize} e-commerce orders): Complex structures with nested customer objects and item arrays.
3. **Analytics** (${analyticsSize} days of metrics): Time-series data with dates and numeric values.
4. **GitHub** (${githubSize} repositories): Real-world data from top GitHub repos by stars.

#### Question Types

${totalQuestions} questions are generated dynamically across three categories:

- **Field retrieval (50%)**: Direct value lookups
- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
  - Example: "What is Alice's salary?" → \`75000\`
  - Example: "How many items are in order ORD-0042?" → \`3\`
  - Example: "What is the customer name for order ORD-0042?" → \`John Doe\`

- **Aggregation (25%)**: Counting and summation tasks
- **Aggregation (${aggregationPercent}%)**: Dataset-level totals and averages plus single-condition filters (counts, sums, min/max comparisons)
  - Example: "How many employees work in Engineering?" → \`17\`
  - Example: "What is the total revenue across all orders?" → \`45123.50\`
  - Example: "How many employees have salary > 80000?" → \`23\`

- **Filtering (25%)**: Conditional queries
- **Filtering (${filteringPercent}%)**: Multi-condition queries requiring compound logic (AND constraints across fields)
  - Example: "How many employees in Sales have salary > 80000?" → \`5\`
  - Example: "How many orders have total > 400?" → \`12\`
  - Example: "How many active employees have more than 10 years of experience?" → \`8\`

#### Evaluation Process

1. **Format conversion:** Each dataset is converted to all 5 formats (TOON, JSON, YAML, CSV, XML).
1. **Format conversion:** Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => f.format.toUpperCase()).join(', ')}).
2. **Query LLM**: Each model receives formatted data + question in a prompt and extracts the answer.
4. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).
3. **Validate with LLM-as-judge**: \`gpt-5-nano\` validates if the answer is semantically correct (e.g., \`50000\` = \`$50,000\`, \`Engineering\` = \`engineering\`, \`2025-01-01\` = \`January 1, 2025\`).

#### Models & Configuration

- **Models tested**: \`gpt-5-nano\`, \`claude-haiku-4-5\`, \`gemini-2.5-flash\`
- **Models tested**: ${modelsListStr}
- **Token counting**: Using \`gpt-tokenizer\` with \`o200k_base\` encoding (GPT-5 tokenizer)
- **Temperature**: 0 (for non-reasoning models)
- **Total evaluations**: 159 questions × 5 formats × 3 models = 2,385 LLM calls
- **Total evaluations**: ${totalQuestions} questions × ${formatCount} formats × ${modelsUsed.length} models = ${totalEvaluations.toLocaleString('en-US')} LLM calls

</details>
`.trimStart()
@@ -272,6 +283,10 @@ export function calculateTokenCounts(

/**
 * Save results to disk
 *
 * @remarks
 * Per-model results are managed separately via storage.ts
 * This function only generates the aggregated markdown report
 */
export async function saveResults(
  results: EvaluationResult[],
@@ -279,31 +294,12 @@ export async function saveResults(
  questions: Question[],
  tokenCounts: Record<string, number>,
): Promise<string> {
  const resultsDir = path.join(BENCHMARKS_DIR, 'results', 'accuracy')
  const resultsDir = path.join(BENCHMARKS_DIR, 'results')
  await ensureDir(resultsDir)

  // Save raw results
  await saveJsonFile(path.join(resultsDir, 'raw-results.json'), results)

  // Save summary
  await saveJsonFile(
    path.join(resultsDir, 'summary.json'),
    {
      formatResults,
      questions: questions.length,
      models: Object.keys(models),
      datasets: datasets.map(d => ({ name: d.name, description: d.description })),
      tokenCounts,
      timestamp: new Date().toISOString(),
    },
  )

  // Generate markdown report
  // Generate markdown report from all available model results
  const report = generateMarkdownReport(formatResults, results, questions, tokenCounts)
  await fsp.writeFile(
    path.join(resultsDir, 'report.md'),
    report,
  )
  await fsp.writeFile(path.join(resultsDir, 'retrieval-accuracy.md'), report)

  return resultsDir
}
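The per-format statistics behind the report table are not shown in these hunks; the sketch below is an assumed reconstruction of that aggregation, using only the `EvaluationResult` and `FormatResult` fields visible in the diff (the real logic lives in `calculateFormatStats`).

```ts
// Assumed aggregation: accuracy per format from raw evaluation results.
import type { EvaluationResult } from './types'

interface FormatSummary {
  format: string
  correctCount: number
  totalCount: number
  accuracy: number
}

function summarizeByFormat(results: EvaluationResult[]): FormatSummary[] {
  const byFormat = new Map<string, { correct: number, total: number }>()
  for (const r of results) {
    const entry = byFormat.get(r.format) ?? { correct: 0, total: 0 }
    entry.total += 1
    if (r.isCorrect)
      entry.correct += 1
    byFormat.set(r.format, entry)
  }
  return [...byFormat.entries()].map(([format, { correct, total }]) => ({
    format,
    correctCount: correct,
    totalCount: total,
    accuracy: total === 0 ? 0 : correct / total,
  }))
}
```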
benchmarks/src/storage.ts (new file, 46 lines)
@@ -0,0 +1,46 @@
import type { Storage, StorageValue } from 'unstorage'
import type { EvaluationResult } from './types'
import * as path from 'node:path'
import { createStorage } from 'unstorage'
import fsDriver from 'unstorage/drivers/fs'
import { BENCHMARKS_DIR } from './constants'

/**
 * Storage instance for model results
 *
 * @remarks
 * Stores results in: `benchmarks/results/accuracy/models/`
 */
export const resultsStorage: Storage<StorageValue> = createStorage({
  driver: fsDriver({
    base: path.join(BENCHMARKS_DIR, 'results', 'accuracy', 'models'),
  }),
})

export async function loadModelResults(modelId: string): Promise<EvaluationResult[] | undefined> {
  const data = await resultsStorage.getItem<EvaluationResult[]>(modelId)
  return data ?? undefined
}

export async function saveModelResults(modelId: string, results: EvaluationResult[]): Promise<void> {
  await resultsStorage.setItem(modelId, results)
}

export async function getAllModelResults(): Promise<Record<string, EvaluationResult[]>> {
  const keys = await resultsStorage.getKeys()
  const results: Record<string, EvaluationResult[]> = {}

  await Promise.all(
    keys.map(async (modelId) => {
      const data = await resultsStorage.getItem<EvaluationResult[]>(modelId)
      if (data)
        results[modelId] = data
    }),
  )

  return results
}

export async function hasModelResults(modelId: string): Promise<boolean> {
  return await resultsStorage.hasItem(modelId)
}
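A short usage sketch for the new storage helpers follows; the model id and the empty results array are placeholders, and the surrounding function is hypothetical.

```ts
import { getAllModelResults, hasModelResults, saveModelResults } from './storage'

async function example(): Promise<void> {
  // Skip a model whose results were already persisted by a previous run.
  if (!(await hasModelResults('gpt-5-nano'))) {
    await saveModelResults('gpt-5-nano', []) // real results elided
  }

  // Merge every model's stored results into one flat list for reporting.
  const all = await getAllModelResults()
  const flat = Object.values(all).flat()
  console.log(`${flat.length} stored evaluation results`)
}
```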
@@ -1,13 +1,3 @@
/**
 * Shared utility functions for TOON benchmarks
 *
 * Provides common functionality used across multiple benchmark scripts:
 * - Progress bar visualization
 * - Token counting
 * - File I/O operations
 * - Retry logic for API calls
 */

import * as fsp from 'node:fs/promises'
import { encode } from 'gpt-tokenizer'
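The `tokenize` helper referenced throughout the benchmark presumably wraps this import. A minimal sketch is shown below, assuming the token count is simply the encoded length; the report section states the `o200k_base` (GPT-5) encoding is used, and whether the plain `gpt-tokenizer` entry point resolves to that encoding depends on the package version.

```ts
// Minimal sketch of a token-counting helper built on gpt-tokenizer.
import { encode } from 'gpt-tokenizer'

export function tokenize(text: string): number {
  return encode(text).length
}
```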