chore(benchmarks): add structure-awareness questions

This commit is contained in:
Johann Schopplich
2025-11-07 09:03:51 +01:00
parent 853c3babea
commit 89df613059
13 changed files with 522 additions and 67 deletions

View File

@@ -4,37 +4,11 @@ import * as url from 'node:url'
export const ROOT_DIR: string = url.fileURLToPath(new URL('../../', import.meta.url))
export const BENCHMARKS_DIR: string = url.fileURLToPath(new URL('../', import.meta.url))
/**
* Model-specific RPM (requests per minute) limits to handle API quotas
*
* @remarks
* Set `undefined` for models without specific limits.
*/
/// keep-sorted
export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
'claude-haiku-4-5-20251001': 50,
'gemini-2.5-flash': 25,
'gpt-5-nano': 50,
'grok-4-fast-non-reasoning': 50,
}
/**
* Default concurrency for parallel evaluations to prevent bursting
*/
export const DEFAULT_CONCURRENCY = 10
/**
* Display names for data format types
*/
export const FORMATTER_DISPLAY_NAMES: Record<string, string> = {
'json-pretty': 'JSON',
'json-compact': 'JSON compact',
'toon': 'TOON',
'csv': 'CSV',
'xml': 'XML',
'yaml': 'YAML',
} as const
/**
* Enable dry run mode for quick testing with limited AI requests
*
@@ -51,12 +25,80 @@ export const DRY_RUN_LIMITS = {
maxQuestions: 10,
}
/**
* Model-specific RPM (requests per minute) limits to handle API quotas
*
* @remarks
* Set `undefined` for models without specific limits.
*/
/// keep-sorted
export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
  // Values are requests per minute. Per the contract above, a model that is
  // absent (or explicitly `undefined`) has no specific limit configured.
  'claude-haiku-4-5-20251001': 50,
  'gemini-2.5-flash': 25, // lowest quota of the benchmarked models
  'gpt-5-nano': 50,
  'grok-4-fast-non-reasoning': 50,
}
/**
* Display names for data format types
*/
export const FORMATTER_DISPLAY_NAMES: Record<string, string> = {
  // Keys are formatter identifiers (same key space as PRIMERS/FENCE);
  // values are human-readable labels used in generated reports.
  // NOTE: a trailing `as const` was removed — the explicit `Record<string, string>`
  // annotation widens the literal anyway, so the assertion had no effect and
  // only implied (falsely) that the keys/values stayed literal types.
  'json-pretty': 'JSON',
  'json-compact': 'JSON compact',
  'toon': 'TOON',
  'csv': 'CSV',
  'xml': 'XML',
  'yaml': 'YAML',
}
/**
* Question type identifiers
*/
export const QUESTION_TYPES = [
  'field-retrieval',
  'aggregation',
  'filtering',
  'structure-awareness',
] as const

/**
 * Display names for question types
 *
 * @remarks
 * `satisfies Record<typeof QUESTION_TYPES[number], string>` makes the compiler
 * verify that every entry of {@link QUESTION_TYPES} has a label and that no
 * stray keys exist, while keeping literal key/value types for indexed lookups.
 * (A plain `as const` gave literal types but no completeness check.)
 */
export const QUESTION_TYPE_LABELS = {
  'field-retrieval': 'Field Retrieval',
  'aggregation': 'Aggregation',
  'filtering': 'Filtering',
  'structure-awareness': 'Structure Awareness',
} satisfies Record<typeof QUESTION_TYPES[number], string>
/**
* Dataset identifiers
*/
export const DATASET_NAMES = [
  // Each entry corresponds to one generated accuracy dataset; the
  // `DatasetName` union in types.ts is derived via `typeof DATASET_NAMES[number]`.
  'tabular',
  'nested',
  'analytics',
  'github',
  'event-logs',
  'nested-config',
] as const

/**
 * Structure class identifiers
 *
 * Classifies how uniform/deep a dataset's record shape is; the
 * `StructureClass` union in types.ts is derived from this tuple.
 */
export const STRUCTURE_CLASSES = [
  'uniform',
  'semi-uniform',
  'nested',
  'deep',
] as const
/**
* Threshold values for filtering and aggregation questions
*/
export const QUESTION_THRESHOLDS = {
tabular: {
salaryRanges: [60000, 80000, 100000, 120000],
salaryRanges: [60000, 80000, 100000],
experienceYears: [5, 10, 15, 20],
departmentSalaryThreshold: 80000,
departmentExperienceThreshold: 10,
@@ -68,11 +110,11 @@ export const QUESTION_THRESHOLDS = {
totalThresholdsForItems: [300, 500],
},
analytics: {
views: [5000, 7000],
conversions: [10, 30],
views: [6000],
conversions: [20],
viewsForFiltering: [6000, 7000],
conversionsForFiltering: 15,
revenueThresholds: [500, 1000, 1500, 2000, 2500],
revenueThresholds: [1000, 1500, 2000],
viewsThresholdForRevenue: 6000,
clicksForFiltering: [250, 400],
conversionsForClickFiltering: 15,
@@ -81,8 +123,8 @@ export const QUESTION_THRESHOLDS = {
},
github: {
stars: [100000, 150000, 200000],
forks: [20000, 35000, 50000],
watchers: [5000, 8000],
forks: [20000, 35000],
watchers: [8000],
starForkCombinations: [
{ stars: 75000, forks: 15000 },
{ stars: 100000, forks: 20000 },
@@ -101,18 +143,18 @@ export const QUESTION_THRESHOLDS = {
*/
export const QUESTION_LIMITS = {
tabular: {
fieldRetrieval: 14,
aggregationDepartments: 4,
fieldRetrieval: 12,
aggregationDepartments: 3,
filteringMultiConditionDepartments: 5,
filteringExperience: 3,
filteringDepartmentExp: 3,
filteringDepartmentActive: 3,
filteringDepartmentActive: 2,
},
nested: {
fieldRetrievalOrders: 8,
fieldRetrievalCustomers: 10,
aggregationStatuses: 5,
filteringStatusAndValue: 5,
fieldRetrievalCustomers: 8,
aggregationStatuses: 3,
filteringStatusAndValue: 4,
filteringStatusAndItems: 3,
},
analytics: {
@@ -121,16 +163,17 @@ export const QUESTION_LIMITS = {
github: {
fieldRetrievalRepos: 11,
aggregationBranches: 2,
filteringStarsAndForks: 8,
filteringStarsAndForks: 3,
},
eventLogs: {
fieldRetrieval: 10,
aggregationEndpoints: 4,
aggregationEndpoints: 2,
filteringLevelAndStatus: 3,
filteringEndpointAndStatus: 3,
filteringEndpointRetryable: 2,
},
nestedConfig: {
fieldRetrieval: 10,
filteringComplex: 6,
filteringComplex: 5,
},
} as const

View File

@@ -181,7 +181,7 @@ export function generateAnalyticsData(days: number, startDate = '2025-01-01'): {
/**
* Generate employee data (uniform tabular structure)
*/
const departments: readonly string[] = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] as const
const departments = ['Engineering', 'Sales', 'Marketing', 'HR', 'Operations', 'Finance'] as const
function generateEmployees(count: number): { employees: Employee[] } {
return {

View File

@@ -16,6 +16,33 @@ export const models: LanguageModelV2[] = [
xai('grok-4-fast-non-reasoning'),
]
/**
* Format primers
*
* @remarks
* Neutral descriptions to help models parse each format.
*/
export const PRIMERS: Record<string, string> = {
  // Keys are formatter identifiers (same key space as FENCE below and
  // FORMATTER_DISPLAY_NAMES in constants); values are injected verbatim
  // into the evaluation prompt, so do not reword them casually.
  'toon': 'TOON: Indentation-based. Arrays declare length and fields (e.g., items[N]{f1,f2}:). Rows use single delimiter. Values may be quoted.',
  'json-pretty': 'JSON: Strict JSON objects/arrays with repeated keys per row.',
  'json-compact': 'JSON (compact): Strict JSON without extra whitespace.',
  'yaml': 'YAML: Indentation-based key/value and lists (- items).',
  'xml': 'XML: Tag-based tree structure with nested elements.',
  'csv': 'CSV: Header row, comma-separated values. First row contains field names.',
}

/**
 * Code fence language tags for proper syntax highlighting
 *
 * Maps formatter identifier to the language tag placed after the opening
 * code fence in the prompt. Both JSON variants share the `json` tag.
 */
export const FENCE: Record<string, string> = {
  'toon': 'toon',
  'json-pretty': 'json',
  'json-compact': 'json',
  'yaml': 'yaml',
  'xml': 'xml',
  'csv': 'csv',
}
/**
* Evaluate a single question with a specific format and model
*/
@@ -33,10 +60,15 @@ export async function evaluateQuestion(
model: LanguageModelV2
},
): Promise<EvaluationResult> {
const primer = PRIMERS[formatName] ?? ''
const fence = FENCE[formatName] ?? ''
const prompt = `
${primer}
Given the following data in ${formatName} format:
\`\`\`
\`\`\`${fence}
${formattedData}
\`\`\`

View File

@@ -166,7 +166,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
}
// Filtering: endpoint AND retryable error
for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointAndStatus)) {
for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointRetryable)) {
const count = logs.filter(l => l.endpoint === endpoint && l.error?.retryable === true).length
questions.push(
new QuestionBuilder()

View File

@@ -6,11 +6,12 @@ import { generateEventLogsQuestions } from './event-logs'
import { generateGithubQuestions } from './github'
import { generateNestedQuestions } from './nested'
import { generateNestedConfigQuestions } from './nested-config'
import { generateStructureQuestions } from './structure'
import { generateTabularQuestions } from './tabular'
import { createIdGenerator } from './utils'
/**
* Generate ~200 questions from all datasets
* Generate questions from all datasets
*
* @remarks
* - Field Retrieval: Direct field access with no computation
@@ -19,6 +20,8 @@ import { createIdGenerator } from './utils'
* Examples: "How many X?", "What is the total/average?", "How many X > threshold?"
* - Filtering: Multi-condition queries requiring complex logical operations
* Examples: "How many X WHERE condition1 AND condition2?"
* - Structure Awareness: Tests format-native structural affordances (TOON's [N] and {fields}, CSV's header)
* Examples: "How many records?", "List the field names", "What is the last record's field?"
*/
export function generateQuestions(): Question[] {
const questions: Question[] = []
@@ -41,5 +44,8 @@ export function generateQuestions(): Question[] {
questions.push(...generateEventLogsQuestions(eventLogs, getId))
questions.push(...generateNestedConfigQuestions(nestedConfig, getId))
// Generate structure-awareness questions (tests format-native affordances)
questions.push(...generateStructureQuestions(tabular, nested, analytics, github, eventLogs, getId))
return questions
}

View File

@@ -152,7 +152,6 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
// Aggregation: additional nested counts
const totalPermissions = Object.values(config.permissions.roles).reduce((sum, role) => sum + role.permissions.length, 0)
const distinctPermissions = new Set(Object.values(config.permissions.roles).flatMap(r => r.permissions)).size
const distinctScopes = new Set(config.authentication.providers.flatMap(p => p.scopes)).size
const totalVariants = Object.values(config.features).reduce((sum, f) => sum + f.variants.length, 0)
const highPriorityReplicas = config.database.replicas.filter(r => r.priority > 2).length
const featuresWithHighRollout = Object.values(config.features).filter(f => f.rollout > 50).length
@@ -173,13 +172,6 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
.type('aggregation')
.dataset('nested-config')
.build(),
new QuestionBuilder()
.id(getId())
.prompt('How many distinct scopes are defined across all authentication providers?')
.groundTruth(String(distinctScopes))
.type('aggregation')
.dataset('nested-config')
.build(),
new QuestionBuilder()
.id(getId())
.prompt('What is the total number of variants across all feature flags?')

View File

@@ -0,0 +1,324 @@
import type { AnalyticsMetric, Employee, EventLog, Order, Repository } from '../datasets'
import type { Question } from '../types'
import { QuestionBuilder } from './utils'
/**
 * Generate structure-awareness questions across all datasets
 *
 * These questions test format-native structural affordances:
 * - TOON's explicit array length [N] and field declarations {fields}
 * - CSV's header row (but no explicit length)
 * - JSON/YAML have neither unless the model counts manually
 *
 * @remarks
 * The field names of each record shape are declared once as a readonly tuple
 * and reused to derive the comma-separated field-list, positional-field, and
 * field-count ground truths, so the answers cannot drift apart when a schema
 * changes. Questions about the "last record" (and the order-with-most-items
 * question, whose `reduce` has no initial value) are skipped for empty
 * datasets instead of crashing on a non-null assertion.
 *
 * @param employees - Tabular dataset records
 * @param orders - Nested dataset records
 * @param metrics - Analytics dataset records
 * @param repos - GitHub dataset records
 * @param logs - Event-logs dataset records
 * @param getId - Monotonic question-id generator shared across generators
 * @returns The generated structure-awareness questions, in a stable order
 */
export function generateStructureQuestions(
  employees: Employee[],
  orders: Order[],
  metrics: AnalyticsMetric[],
  repos: Repository[],
  logs: EventLog[],
  getId: () => string,
): Question[] {
  const questions: Question[] = []

  // Build and collect one structure-awareness question.
  const push = (dataset: Question['dataset'], prompt: string, groundTruth: string): void => {
    questions.push(
      new QuestionBuilder()
        .id(getId())
        .prompt(prompt)
        .groundTruth(groundTruth)
        .type('structure-awareness')
        .dataset(dataset)
        .build(),
    )
  }

  // Field names per record shape — the single source of truth for the
  // "list fields", "Nth field", and "field count" ground truths below.
  const employeeFields = ['id', 'name', 'email', 'department', 'salary', 'yearsExperience', 'active'] as const
  const orderFields = ['orderId', 'customer', 'items', 'subtotal', 'tax', 'total', 'status', 'orderDate'] as const
  const itemFields = ['sku', 'name', 'quantity', 'price'] as const
  const customerFields = ['id', 'name', 'email', 'phone'] as const
  const metricFields = ['date', 'views', 'clicks', 'conversions', 'revenue', 'bounceRate'] as const
  const repoFields = ['id', 'name', 'repo', 'description', 'stars', 'watchers', 'forks', 'defaultBranch', 'createdAt', 'updatedAt', 'pushedAt'] as const
  const logFields = ['timestamp', 'level', 'endpoint', 'statusCode', 'responseTime', 'userId', 'error'] as const

  // ========== TABULAR DATASET (Employees) ==========
  // Count: total employees (tests array length awareness)
  push('tabular', 'How many employees are in the dataset?', String(employees.length))
  // Field list (tests field name awareness)
  push('tabular', 'List the field names for employees (comma-separated, in order).', employeeFields.join(','))
  // Positional: third field name (tests TOON {fields} syntax)
  push('tabular', 'What is the 3rd field name for employees?', employeeFields[2])
  // Last row (tests ability to find the last row using the length)
  const lastEmployee = employees.at(-1)
  if (lastEmployee) {
    push('tabular', 'What is the department of the last employee in the dataset?', lastEmployee.department)
    push('tabular', 'What is the name of the last employee in the dataset?', lastEmployee.name)
  }
  // Field count (tests schema awareness)
  push('tabular', 'How many fields does each employee record have?', String(employeeFields.length))

  // ========== NESTED DATASET (Orders) ==========
  push('nested', 'How many orders are in the dataset?', String(orders.length))
  push('nested', 'List the top-level field names for orders (comma-separated, in order).', orderFields.join(','))
  // Nested count: items in the order with the most items.
  // `reduce` without an initial value throws on an empty array, so guard first.
  if (orders.length > 0) {
    const orderWithManyItems = orders.reduce((max, order) =>
      order.items.length > max.items.length ? order : max,
    )
    push('nested', `How many items are in order ${orderWithManyItems.orderId}?`, String(orderWithManyItems.items.length))
  }
  push('nested', 'What are the field names for items within orders (comma-separated, in order)?', itemFields.join(','))
  const lastOrder = orders.at(-1)
  if (lastOrder)
    push('nested', 'What is the status of the last order in the dataset?', lastOrder.status)
  push('nested', 'What are the field names for customer objects within orders (comma-separated, in order)?', customerFields.join(','))

  // ========== ANALYTICS DATASET (Metrics) ==========
  push('analytics', 'How many metric records are in the dataset?', String(metrics.length))
  push('analytics', 'List the field names for metrics (comma-separated, in order).', metricFields.join(','))
  // Positional: fifth field name (tests TOON {fields} syntax)
  push('analytics', 'What is the 5th field name for analytics metrics?', metricFields[4])
  const lastMetric = metrics.at(-1)
  if (lastMetric)
    push('analytics', 'What is the date of the last metric record in the dataset?', lastMetric.date)
  push('analytics', 'How many fields does each metric record have?', String(metricFields.length))

  // ========== GITHUB DATASET (Repositories) ==========
  push('github', 'How many repositories are in the dataset?', String(repos.length))
  push('github', 'List the field names for repositories (comma-separated, in order).', repoFields.join(','))
  // Positional: seventh field name (tests TOON {fields} syntax)
  push('github', 'What is the 7th field name for GitHub repositories?', repoFields[6])
  const lastRepo = repos.at(-1)
  if (lastRepo)
    push('github', 'What is the name of the last repository in the dataset?', lastRepo.name)
  push('github', 'How many fields does each repository record have?', String(repoFields.length))

  // ========== EVENT LOGS DATASET ==========
  push('event-logs', 'How many log entries are in the dataset?', String(logs.length))
  // Field list includes the optional `error` field, hence "any order" in the prompt
  push('event-logs', 'List the field names for log entries (comma-separated, any order, including optional fields).', logFields.join(','))
  const lastLog = logs.at(-1)
  if (lastLog)
    push('event-logs', 'What is the level of the last log entry in the dataset?', lastLog.level)

  return questions
}

View File

@@ -1,5 +1,5 @@
import type { Dataset, EfficiencyRanking, EvaluationResult, FormatResult, Question } from './types'
import { FORMATTER_DISPLAY_NAMES } from './constants'
import { FORMATTER_DISPLAY_NAMES, QUESTION_TYPE_LABELS, QUESTION_TYPES } from './constants'
import { ACCURACY_DATASETS } from './datasets'
import { models } from './evaluate'
import { supportsCSV } from './formatters'
@@ -22,9 +22,9 @@ export function calculateTokenCounts(
if (formatName === 'csv' && !supportsCSV(dataset))
continue
const formatted = formatter(dataset.data)
const formattedData = formatter(dataset.data)
const key = `${formatName}-${dataset.name}`
tokenCounts[key] = tokenize(formatted)
tokenCounts[key] = tokenize(formattedData)
}
}
@@ -200,16 +200,21 @@ function generateDetailedAccuracyReport(
// Generate performance by model
const modelPerformance = generateModelPerformanceTable(formatResults, results, modelNames)
// Generate question type breakdown
const questionTypeBreakdown = generateQuestionTypeBreakdown(formatResults, results, questions)
const totalQuestions = [...new Set(results.map(r => r.questionId))].length
// Calculate question type distribution
const fieldRetrievalCount = questions.filter(q => q.type === 'field-retrieval').length
const aggregationCount = questions.filter(q => q.type === 'aggregation').length
const filteringCount = questions.filter(q => q.type === 'filtering').length
const structureAwarenessCount = questions.filter(q => q.type === 'structure-awareness').length
const fieldRetrievalPercent = ((fieldRetrievalCount / totalQuestions) * 100).toFixed(0)
const aggregationPercent = ((aggregationCount / totalQuestions) * 100).toFixed(0)
const filteringPercent = ((filteringCount / totalQuestions) * 100).toFixed(0)
const structureAwarenessPercent = ((structureAwarenessCount / totalQuestions) * 100).toFixed(0)
// Calculate dataset sizes
const tabularSize = ACCURACY_DATASETS.find(d => d.name === 'tabular')?.data.employees?.length || 0
@@ -233,7 +238,11 @@ ${modelBreakdown}
${summaryComparison}
<details>
<summary><strong>Performance by dataset and model</strong></summary>
<summary><strong>Performance by dataset, model, and question type</strong></summary>
#### Performance by Question Type
${questionTypeBreakdown}
#### Performance by Dataset
@@ -265,9 +274,9 @@ Six datasets designed to test different structural patterns:
#### Question Types
${totalQuestions} questions are generated dynamically across three categories:
${totalQuestions} questions are generated dynamically across four categories:
\- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
- **Field retrieval (${fieldRetrievalPercent}%)**: Direct value lookups or values that can be read straight off a record (including booleans and simple counts such as array lengths)
- Example: "What is Alice's salary?" → \`75000\`
- Example: "How many items are in order ORD-0042?" → \`3\`
- Example: "What is the customer name for order ORD-0042?" → \`John Doe\`
@@ -281,6 +290,11 @@ ${totalQuestions} questions are generated dynamically across three categories:
- Example: "How many employees in Sales have salary > 80000?" → \`5\`
- Example: "How many active employees have more than 10 years of experience?" → \`8\`
- **Structure awareness (${structureAwarenessPercent}%)**: Tests format-native structural affordances (TOON's [N] count and {fields}, CSV's header row)
- Example: "How many employees are in the dataset?" → \`100\`
- Example: "List the field names for employees" → \`id, name, email, department, salary, yearsExperience, active\`
- Example: "What is the department of the last employee?" → \`Sales\`
#### Evaluation Process
1. **Format conversion**: Each dataset is converted to all ${formatCount} formats (${formatResults.map(f => FORMATTER_DISPLAY_NAMES[f.format] || f.format).join(', ')}).
@@ -413,6 +427,48 @@ ${tableRows}
}).filter(Boolean).join('\n').trim()
}
/**
 * Generate question type breakdown table
 *
 * Builds a Markdown table with one row per question type and one accuracy
 * column per format. Types with no evaluated results are omitted entirely;
 * a format that has no results for a given type renders as `N/A`.
 *
 * @param formatResults - One entry per evaluated format (provides the columns)
 * @param results - All individual evaluation results
 * @param questions - All generated questions (provides type membership)
 * @returns The Markdown table as a trimmed string
 */
function generateQuestionTypeBreakdown(
  formatResults: FormatResult[],
  results: EvaluationResult[],
  questions: Question[],
): string {
  // Build header
  const formatNames = formatResults.map(fr => FORMATTER_DISPLAY_NAMES[fr.format] || fr.format)
  const header = `| Question Type | ${formatNames.join(' | ')} |`
  const separator = `| ------------- | ${formatNames.map(() => '----').join(' | ')} |`

  // Build rows
  const rows = QUESTION_TYPES.map((type) => {
    // Set membership avoids the O(questions × results) Array#includes scan
    const questionIds = new Set(questions.filter(q => q.type === type).map(q => q.id))
    const typeResults = results.filter(r => questionIds.has(r.questionId))
    if (typeResults.length === 0)
      return undefined

    const accuracies = formatResults.map((fr) => {
      const formatTypeResults = typeResults.filter(r => r.format === fr.format)
      if (formatTypeResults.length === 0)
        return 'N/A'
      // Non-empty here, so the division is well-defined
      const correctCount = formatTypeResults.filter(r => r.isCorrect).length
      const accuracy = correctCount / formatTypeResults.length
      return `${(accuracy * 100).toFixed(1)}%`
    })

    return `| ${QUESTION_TYPE_LABELS[type]} | ${accuracies.join(' | ')} |`
  }).filter(Boolean)

  return `
${header}
${separator}
${rows.join('\n')}
`.trim()
}
/**
* Generate per-model performance comparison tables
*/

View File

@@ -1,11 +1,17 @@
import type { DATASET_NAMES, QUESTION_TYPES, STRUCTURE_CLASSES } from './constants'
export type QuestionType = typeof QUESTION_TYPES[number]
export type DatasetName = typeof DATASET_NAMES[number]
export type StructureClass = typeof STRUCTURE_CLASSES[number]
export interface DatasetMetadata {
supportsCSV: boolean
structureClass: 'uniform' | 'semi-uniform' | 'nested' | 'deep'
structureClass: StructureClass
tabularEligibility: number
}
export interface Dataset {
name: string
name: DatasetName
description: string
data: Record<string, any>
metadata: DatasetMetadata
@@ -15,8 +21,8 @@ export interface Question {
id: string
prompt: string
groundTruth: string
type: 'field-retrieval' | 'aggregation' | 'filtering'
dataset: string
type: QuestionType
dataset: DatasetName
}
export interface EvaluationResult {