From a9d52fc69baadbdea58770188d171c113e336312 Mon Sep 17 00:00:00 2001 From: Johann Schopplich Date: Thu, 6 Nov 2025 15:51:31 +0100 Subject: [PATCH] chore: more work on benchmarks --- benchmarks/README.md | 2 +- benchmarks/questions-generated.json | 1416 +++++++++++++++++ .../scripts/token-efficiency-benchmark.ts | 4 +- benchmarks/src/constants.ts | 20 +- benchmarks/src/datasets.ts | 116 +- benchmarks/src/evaluate.ts | 18 +- benchmarks/src/questions/analytics.ts | 29 +- benchmarks/src/questions/event-logs.ts | 49 +- benchmarks/src/questions/github.ts | 23 +- benchmarks/src/questions/index.ts | 3 +- benchmarks/src/questions/nested-config.ts | 118 ++ benchmarks/src/questions/nested.ts | 24 +- benchmarks/src/questions/tabular.ts | 27 +- benchmarks/src/questions/utils.ts | 9 +- benchmarks/src/types.ts | 2 +- 15 files changed, 1647 insertions(+), 213 deletions(-) create mode 100644 benchmarks/questions-generated.json diff --git a/benchmarks/README.md b/benchmarks/README.md index 81e78c5..1309ecb 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -34,7 +34,7 @@ Results are saved to `results/token-efficiency.md`. Tests how well LLMs can answer questions about data in different formats (TOON, JSON, JSON compact, XML, YAML, CSV): -1. Generate ~150-160 questions across 6 datasets (CSV only included for datasets with flat/tabular structure) +1. Generate ~200 questions across 6 datasets (CSV only included for datasets with flat/tabular structure) 2. Convert each dataset to all supported formats 3. Query each LLM with formatted data + question 4. Validate answers using `gpt-5-nano` as judge diff --git a/benchmarks/questions-generated.json b/benchmarks/questions-generated.json new file mode 100644 index 0000000..499eaee --- /dev/null +++ b/benchmarks/questions-generated.json @@ -0,0 +1,1416 @@ +[ + { + "id": "q1", + "prompt": "What is the salary of Constance Mante?", + "groundTruth": "56176", + "type": "field-retrieval", + "dataset": "tabular" + }, + { + "id": "q2", + "prompt": "What department does Alfonso Leffler work in?", + "groundTruth": "Marketing", + "type": "field-retrieval", + "dataset": "tabular" + }, + { + "id": "q3", + "prompt": "What is the email address of Mr. Corey Pfeffer?", + "groundTruth": "lorenza.kunze@yahoo.com", + "type": "field-retrieval", + "dataset": "tabular" + }, + { + "id": "q4", + "prompt": "How many years of experience does Mr. Brendan Harvey have?", + "groundTruth": "22", + "type": "field-retrieval", + "dataset": "tabular" + }, + { + "id": "q5", + "prompt": "Is Tracy Gleason an active employee?", + "groundTruth": "no", + "type": "field-retrieval", + "dataset": "tabular" + }, + { + "id": "q6", + "prompt": "What is the salary of Terri Wilkinson?", + "groundTruth": "133081", + "type": "field-retrieval", + "dataset": "tabular" + }, + { + "id": "q7", + "prompt": "What department does Aubrey Koss work in?", + "groundTruth": "Engineering", + "type": "field-retrieval", + "dataset": "tabular" + }, + { + "id": "q8", + "prompt": "What is the email address of Darren Homenick?", + "groundTruth": "delpha.russel@gmail.com", + "type": "field-retrieval", + "dataset": "tabular" + }, + { + "id": "q9", + "prompt": "How many years of experience does Dr. Ken Heller have?", + "groundTruth": "5", + "type": "field-retrieval", + "dataset": "tabular" + }, + { + "id": "q10", + "prompt": "Is Mr. Wade Collier an active employee?", + "groundTruth": "yes", + "type": "field-retrieval", + "dataset": "tabular" + }, + { + "id": "q11", + "prompt": "What is the salary of Hannah Waelchi?", + "groundTruth": "109064", + "type": "field-retrieval", + "dataset": "tabular" + }, + { + "id": "q12", + "prompt": "What department does Emily Harvey work in?", + "groundTruth": "Operations", + "type": "field-retrieval", + "dataset": "tabular" + }, + { + "id": "q13", + "prompt": "What is the email address of Chester Crist?", + "groundTruth": "henderson70@yahoo.com", + "type": "field-retrieval", + "dataset": "tabular" + }, + { + "id": "q14", + "prompt": "How many years of experience does Barbara Emard have?", + "groundTruth": "23", + "type": "field-retrieval", + "dataset": "tabular" + }, + { + "id": "q15", + "prompt": "How many employees work in Engineering?", + "groundTruth": "17", + "type": "aggregation", + "dataset": "tabular" + }, + { + "id": "q16", + "prompt": "How many employees work in Sales?", + "groundTruth": "17", + "type": "aggregation", + "dataset": "tabular" + }, + { + "id": "q17", + "prompt": "How many employees work in Marketing?", + "groundTruth": "17", + "type": "aggregation", + "dataset": "tabular" + }, + { + "id": "q18", + "prompt": "How many employees work in HR?", + "groundTruth": "17", + "type": "aggregation", + "dataset": "tabular" + }, + { + "id": "q19", + "prompt": "How many employees have a salary greater than 60000?", + "groundTruth": "91", + "type": "aggregation", + "dataset": "tabular" + }, + { + "id": "q20", + "prompt": "How many employees have a salary greater than 80000?", + "groundTruth": "67", + "type": "aggregation", + "dataset": "tabular" + }, + { + "id": "q21", + "prompt": "How many employees have a salary greater than 100000?", + "groundTruth": "41", + "type": "aggregation", + "dataset": "tabular" + }, + { + "id": "q22", + "prompt": "How many employees have a salary greater than 120000?", + "groundTruth": "26", + "type": "aggregation", + "dataset": "tabular" + }, + { + "id": "q23", + "prompt": "How many employees are in the dataset?", + "groundTruth": "100", + "type": "aggregation", + "dataset": "tabular" + }, + { + "id": "q24", + "prompt": "What is the average salary across all employees?", + "groundTruth": "96503", + "type": "aggregation", + "dataset": "tabular" + }, + { + "id": "q25", + "prompt": "How many employees are active?", + "groundTruth": "78", + "type": "aggregation", + "dataset": "tabular" + }, + { + "id": "q26", + "prompt": "How many employees are inactive?", + "groundTruth": "22", + "type": "aggregation", + "dataset": "tabular" + }, + { + "id": "q27", + "prompt": "How many employees in Engineering have a salary greater than 80000?", + "groundTruth": "12", + "type": "filtering", + "dataset": "tabular" + }, + { + "id": "q28", + "prompt": "How many employees in Sales have a salary greater than 80000?", + "groundTruth": "11", + "type": "filtering", + "dataset": "tabular" + }, + { + "id": "q29", + "prompt": "How many employees in Marketing have a salary greater than 80000?", + "groundTruth": "11", + "type": "filtering", + "dataset": "tabular" + }, + { + "id": "q30", + "prompt": "How many employees in HR have a salary greater than 80000?", + "groundTruth": "12", + "type": "filtering", + "dataset": "tabular" + }, + { + "id": "q31", + "prompt": "How many employees in Operations have a salary greater than 80000?", + "groundTruth": "11", + "type": "filtering", + "dataset": "tabular" + }, + { + "id": "q32", + "prompt": "How many active employees have more than 5 years of experience?", + "groundTruth": "63", + "type": "filtering", + "dataset": "tabular" + }, + { + "id": "q33", + "prompt": "How many active employees have more than 10 years of experience?", + "groundTruth": "53", + "type": "filtering", + "dataset": "tabular" + }, + { + "id": "q34", + "prompt": "How many active employees have more than 15 years of experience?", + "groundTruth": "39", + "type": "filtering", + "dataset": "tabular" + }, + { + "id": "q35", + "prompt": "How many employees in Engineering have more than 10 years of experience?", + "groundTruth": "11", + "type": "filtering", + "dataset": "tabular" + }, + { + "id": "q36", + "prompt": "How many employees in Sales have more than 10 years of experience?", + "groundTruth": "8", + "type": "filtering", + "dataset": "tabular" + }, + { + "id": "q37", + "prompt": "How many employees in Marketing have more than 10 years of experience?", + "groundTruth": "15", + "type": "filtering", + "dataset": "tabular" + }, + { + "id": "q38", + "prompt": "How many active employees work in Engineering?", + "groundTruth": "12", + "type": "filtering", + "dataset": "tabular" + }, + { + "id": "q39", + "prompt": "How many active employees work in Sales?", + "groundTruth": "11", + "type": "filtering", + "dataset": "tabular" + }, + { + "id": "q40", + "prompt": "How many active employees work in Marketing?", + "groundTruth": "14", + "type": "filtering", + "dataset": "tabular" + }, + { + "id": "q41", + "prompt": "What is the total for order ORD-0001?", + "groundTruth": "103.86", + "type": "field-retrieval", + "dataset": "nested" + }, + { + "id": "q42", + "prompt": "What is the status of order ORD-0003?", + "groundTruth": "shipped", + "type": "field-retrieval", + "dataset": "nested" + }, + { + "id": "q43", + "prompt": "What is the total for order ORD-0005?", + "groundTruth": "422.5", + "type": "field-retrieval", + "dataset": "nested" + }, + { + "id": "q44", + "prompt": "What is the status of order ORD-0007?", + "groundTruth": "processing", + "type": "field-retrieval", + "dataset": "nested" + }, + { + "id": "q45", + "prompt": "What is the total for order ORD-0009?", + "groundTruth": "1822.85", + "type": "field-retrieval", + "dataset": "nested" + }, + { + "id": "q46", + "prompt": "What is the status of order ORD-0011?", + "groundTruth": "pending", + "type": "field-retrieval", + "dataset": "nested" + }, + { + "id": "q47", + "prompt": "What is the total for order ORD-0013?", + "groundTruth": "1311.35", + "type": "field-retrieval", + "dataset": "nested" + }, + { + "id": "q48", + "prompt": "What is the status of order ORD-0015?", + "groundTruth": "cancelled", + "type": "field-retrieval", + "dataset": "nested" + }, + { + "id": "q49", + "prompt": "What is the customer name for order ORD-0002?", + "groundTruth": "Debbie O'Kon I", + "type": "field-retrieval", + "dataset": "nested" + }, + { + "id": "q50", + "prompt": "What is the customer email for order ORD-0004?", + "groundTruth": "demetris.hoeger-pollich@yahoo.com", + "type": "field-retrieval", + "dataset": "nested" + }, + { + "id": "q51", + "prompt": "What is the order date for order ORD-0006?", + "groundTruth": "2025-09-15", + "type": "field-retrieval", + "dataset": "nested" + }, + { + "id": "q52", + "prompt": "How many items are in order ORD-0008?", + "groundTruth": "3", + "type": "field-retrieval", + "dataset": "nested" + }, + { + "id": "q53", + "prompt": "What is the customer name for order ORD-0010?", + "groundTruth": "Patty Senger", + "type": "field-retrieval", + "dataset": "nested" + }, + { + "id": "q54", + "prompt": "What is the customer email for order ORD-0012?", + "groundTruth": "viva.paucek@gmail.com", + "type": "field-retrieval", + "dataset": "nested" + }, + { + "id": "q55", + "prompt": "What is the order date for order ORD-0014?", + "groundTruth": "2025-09-20", + "type": "field-retrieval", + "dataset": "nested" + }, + { + "id": "q56", + "prompt": "How many items are in order ORD-0016?", + "groundTruth": "2", + "type": "field-retrieval", + "dataset": "nested" + }, + { + "id": "q57", + "prompt": "What is the customer name for order ORD-0018?", + "groundTruth": "Dennis Wunsch", + "type": "field-retrieval", + "dataset": "nested" + }, + { + "id": "q58", + "prompt": "What is the customer email for order ORD-0020?", + "groundTruth": "wilton.oconnell@yahoo.com", + "type": "field-retrieval", + "dataset": "nested" + }, + { + "id": "q59", + "prompt": "How many orders have status \"pending\"?", + "groundTruth": "10", + "type": "aggregation", + "dataset": "nested" + }, + { + "id": "q60", + "prompt": "How many orders have status \"processing\"?", + "groundTruth": "10", + "type": "aggregation", + "dataset": "nested" + }, + { + "id": "q61", + "prompt": "How many orders have status \"shipped\"?", + "groundTruth": "10", + "type": "aggregation", + "dataset": "nested" + }, + { + "id": "q62", + "prompt": "How many orders have status \"delivered\"?", + "groundTruth": "10", + "type": "aggregation", + "dataset": "nested" + }, + { + "id": "q63", + "prompt": "How many orders have status \"cancelled\"?", + "groundTruth": "10", + "type": "aggregation", + "dataset": "nested" + }, + { + "id": "q64", + "prompt": "What is the total revenue across all orders?", + "groundTruth": "34904.81", + "type": "aggregation", + "dataset": "nested" + }, + { + "id": "q65", + "prompt": "What is the average order value?", + "groundTruth": "698.10", + "type": "aggregation", + "dataset": "nested" + }, + { + "id": "q66", + "prompt": "How many orders are in the dataset?", + "groundTruth": "50", + "type": "aggregation", + "dataset": "nested" + }, + { + "id": "q67", + "prompt": "What is the highest order total?", + "groundTruth": "2152.82", + "type": "aggregation", + "dataset": "nested" + }, + { + "id": "q68", + "prompt": "How many orders have a total greater than 200?", + "groundTruth": "43", + "type": "aggregation", + "dataset": "nested" + }, + { + "id": "q69", + "prompt": "How many orders have a total greater than 400?", + "groundTruth": "37", + "type": "aggregation", + "dataset": "nested" + }, + { + "id": "q70", + "prompt": "How many orders have a total greater than 600?", + "groundTruth": "28", + "type": "aggregation", + "dataset": "nested" + }, + { + "id": "q71", + "prompt": "How many orders have status \"pending\" and total greater than 300?", + "groundTruth": "8", + "type": "filtering", + "dataset": "nested" + }, + { + "id": "q72", + "prompt": "How many orders have status \"processing\" and total greater than 300?", + "groundTruth": "6", + "type": "filtering", + "dataset": "nested" + }, + { + "id": "q73", + "prompt": "How many orders have status \"shipped\" and total greater than 300?", + "groundTruth": "10", + "type": "filtering", + "dataset": "nested" + }, + { + "id": "q74", + "prompt": "How many orders have status \"delivered\" and total greater than 300?", + "groundTruth": "9", + "type": "filtering", + "dataset": "nested" + }, + { + "id": "q75", + "prompt": "How many orders have status \"cancelled\" and total greater than 300?", + "groundTruth": "8", + "type": "filtering", + "dataset": "nested" + }, + { + "id": "q76", + "prompt": "How many orders have status \"pending\" and at least 3 items?", + "groundTruth": "3", + "type": "filtering", + "dataset": "nested" + }, + { + "id": "q77", + "prompt": "How many orders have status \"processing\" and at least 3 items?", + "groundTruth": "3", + "type": "filtering", + "dataset": "nested" + }, + { + "id": "q78", + "prompt": "How many orders have status \"shipped\" and at least 3 items?", + "groundTruth": "5", + "type": "filtering", + "dataset": "nested" + }, + { + "id": "q79", + "prompt": "How many orders have a total greater than 300 and at least 3 items?", + "groundTruth": "20", + "type": "filtering", + "dataset": "nested" + }, + { + "id": "q80", + "prompt": "How many orders have a total greater than 500 and at least 3 items?", + "groundTruth": "19", + "type": "filtering", + "dataset": "nested" + }, + { + "id": "q81", + "prompt": "What are the views for 2025-01-01?", + "groundTruth": "4322", + "type": "field-retrieval", + "dataset": "analytics" + }, + { + "id": "q82", + "prompt": "What is the revenue for 2025-01-04?", + "groundTruth": "10432.04", + "type": "field-retrieval", + "dataset": "analytics" + }, + { + "id": "q83", + "prompt": "What is the bounce rate for 2025-01-07?", + "groundTruth": "0.53", + "type": "field-retrieval", + "dataset": "analytics" + }, + { + "id": "q84", + "prompt": "How many conversions were there on 2025-01-10?", + "groundTruth": "32", + "type": "field-retrieval", + "dataset": "analytics" + }, + { + "id": "q85", + "prompt": "What are the views for 2025-01-13?", + "groundTruth": "4096", + "type": "field-retrieval", + "dataset": "analytics" + }, + { + "id": "q86", + "prompt": "What is the revenue for 2025-01-16?", + "groundTruth": "4533.1", + "type": "field-retrieval", + "dataset": "analytics" + }, + { + "id": "q87", + "prompt": "What is the bounce rate for 2025-01-19?", + "groundTruth": "0.63", + "type": "field-retrieval", + "dataset": "analytics" + }, + { + "id": "q88", + "prompt": "How many conversions were there on 2025-01-22?", + "groundTruth": "25", + "type": "field-retrieval", + "dataset": "analytics" + }, + { + "id": "q89", + "prompt": "What are the views for 2025-01-25?", + "groundTruth": "4076", + "type": "field-retrieval", + "dataset": "analytics" + }, + { + "id": "q90", + "prompt": "How many days of data are in the dataset?", + "groundTruth": "60", + "type": "aggregation", + "dataset": "analytics" + }, + { + "id": "q91", + "prompt": "What is the total number of views across all dates?", + "groundTruth": "328320", + "type": "aggregation", + "dataset": "analytics" + }, + { + "id": "q92", + "prompt": "What is the total number of conversions across all dates?", + "groundTruth": "1791", + "type": "aggregation", + "dataset": "analytics" + }, + { + "id": "q93", + "prompt": "What is the total revenue across all dates?", + "groundTruth": "311695.88", + "type": "aggregation", + "dataset": "analytics" + }, + { + "id": "q94", + "prompt": "What is the average bounce rate?", + "groundTruth": "0.53", + "type": "aggregation", + "dataset": "analytics" + }, + { + "id": "q95", + "prompt": "How many days had more than 5000 views?", + "groundTruth": "33", + "type": "aggregation", + "dataset": "analytics" + }, + { + "id": "q96", + "prompt": "How many days had more than 7000 views?", + "groundTruth": "14", + "type": "aggregation", + "dataset": "analytics" + }, + { + "id": "q97", + "prompt": "How many days had more than 10 conversions?", + "groundTruth": "57", + "type": "aggregation", + "dataset": "analytics" + }, + { + "id": "q98", + "prompt": "How many days had more than 30 conversions?", + "groundTruth": "26", + "type": "aggregation", + "dataset": "analytics" + }, + { + "id": "q99", + "prompt": "How many days had more than 6000 views and more than 15 conversions?", + "groundTruth": "20", + "type": "filtering", + "dataset": "analytics" + }, + { + "id": "q100", + "prompt": "How many days had more than 7000 views and more than 15 conversions?", + "groundTruth": "14", + "type": "filtering", + "dataset": "analytics" + }, + { + "id": "q101", + "prompt": "How many days had revenue greater than 500 with views above 6000?", + "groundTruth": "22", + "type": "filtering", + "dataset": "analytics" + }, + { + "id": "q102", + "prompt": "How many days had revenue greater than 1000 with views above 6000?", + "groundTruth": "22", + "type": "filtering", + "dataset": "analytics" + }, + { + "id": "q103", + "prompt": "How many days had revenue greater than 1500 with views above 6000?", + "groundTruth": "22", + "type": "filtering", + "dataset": "analytics" + }, + { + "id": "q104", + "prompt": "How many days had revenue greater than 2000 with views above 6000?", + "groundTruth": "20", + "type": "filtering", + "dataset": "analytics" + }, + { + "id": "q105", + "prompt": "How many days had revenue greater than 2500 with views above 6000?", + "groundTruth": "18", + "type": "filtering", + "dataset": "analytics" + }, + { + "id": "q106", + "prompt": "How many days had more than 250 clicks and more than 15 conversions?", + "groundTruth": "32", + "type": "filtering", + "dataset": "analytics" + }, + { + "id": "q107", + "prompt": "How many days had more than 400 clicks and more than 15 conversions?", + "groundTruth": "9", + "type": "filtering", + "dataset": "analytics" + }, + { + "id": "q108", + "prompt": "How many days had revenue greater than 1000 with bounce rate below 0.5?", + "groundTruth": "22", + "type": "filtering", + "dataset": "analytics" + }, + { + "id": "q109", + "prompt": "How many days had revenue greater than 1500 with bounce rate below 0.5?", + "groundTruth": "22", + "type": "filtering", + "dataset": "analytics" + }, + { + "id": "q110", + "prompt": "How many stars does undefined/freeCodeCamp have?", + "groundTruth": "430886", + "type": "field-retrieval", + "dataset": "github" + }, + { + "id": "q111", + "prompt": "How many forks does undefined/system-design-primer have?", + "groundTruth": "52904", + "type": "field-retrieval", + "dataset": "github" + }, + { + "id": "q112", + "prompt": "How many watchers does undefined/vue have?", + "groundTruth": "5786", + "type": "field-retrieval", + "dataset": "github" + }, + { + "id": "q113", + "prompt": "What is the main branch of undefined/CS-Notes?", + "groundTruth": "master", + "type": "field-retrieval", + "dataset": "github" + }, + { + "id": "q114", + "prompt": "How many stars does undefined/gitignore have?", + "groundTruth": "170327", + "type": "field-retrieval", + "dataset": "github" + }, + { + "id": "q115", + "prompt": "How many forks does undefined/n8n have?", + "groundTruth": "48578", + "type": "field-retrieval", + "dataset": "github" + }, + { + "id": "q116", + "prompt": "How many watchers does undefined/yt-dlp have?", + "groundTruth": "678", + "type": "field-retrieval", + "dataset": "github" + }, + { + "id": "q117", + "prompt": "What is the main branch of undefined/PowerToys?", + "groundTruth": "main", + "type": "field-retrieval", + "dataset": "github" + }, + { + "id": "q118", + "prompt": "How many stars does undefined/free-programming-books-zh_CN have?", + "groundTruth": "115543", + "type": "field-retrieval", + "dataset": "github" + }, + { + "id": "q119", + "prompt": "How many forks does undefined/three.js have?", + "groundTruth": "36054", + "type": "field-retrieval", + "dataset": "github" + }, + { + "id": "q120", + "prompt": "How many watchers does undefined/GitHub-Chinese-Top-Charts have?", + "groundTruth": "2607", + "type": "field-retrieval", + "dataset": "github" + }, + { + "id": "q121", + "prompt": "How many repositories are in the dataset?", + "groundTruth": "100", + "type": "aggregation", + "dataset": "github" + }, + { + "id": "q122", + "prompt": "What is the total number of stars across all repositories?", + "groundTruth": "15413563", + "type": "aggregation", + "dataset": "github" + }, + { + "id": "q123", + "prompt": "What is the total number of forks across all repositories?", + "groundTruth": "2528243", + "type": "aggregation", + "dataset": "github" + }, + { + "id": "q124", + "prompt": "What is the average number of stars per repository?", + "groundTruth": "154136", + "type": "aggregation", + "dataset": "github" + }, + { + "id": "q125", + "prompt": "How many repositories use \"main\" as their default branch?", + "groundTruth": "41", + "type": "aggregation", + "dataset": "github" + }, + { + "id": "q126", + "prompt": "How many repositories use \"master\" as their default branch?", + "groundTruth": "53", + "type": "aggregation", + "dataset": "github" + }, + { + "id": "q127", + "prompt": "How many repositories have more than 100000 stars?", + "groundTruth": "77", + "type": "aggregation", + "dataset": "github" + }, + { + "id": "q128", + "prompt": "How many repositories have more than 150000 stars?", + "groundTruth": "37", + "type": "aggregation", + "dataset": "github" + }, + { + "id": "q129", + "prompt": "How many repositories have more than 200000 stars?", + "groundTruth": "16", + "type": "aggregation", + "dataset": "github" + }, + { + "id": "q130", + "prompt": "How many repositories have more than 20000 forks?", + "groundTruth": "49", + "type": "aggregation", + "dataset": "github" + }, + { + "id": "q131", + "prompt": "How many repositories have more than 35000 forks?", + "groundTruth": "23", + "type": "aggregation", + "dataset": "github" + }, + { + "id": "q132", + "prompt": "How many repositories have more than 50000 forks?", + "groundTruth": "11", + "type": "aggregation", + "dataset": "github" + }, + { + "id": "q133", + "prompt": "How many repositories have more than 5000 watchers?", + "groundTruth": "19", + "type": "aggregation", + "dataset": "github" + }, + { + "id": "q134", + "prompt": "How many repositories have more than 8000 watchers?", + "groundTruth": "4", + "type": "aggregation", + "dataset": "github" + }, + { + "id": "q135", + "prompt": "How many repositories have more than 75000 stars and more than 15000 forks?", + "groundTruth": "57", + "type": "filtering", + "dataset": "github" + }, + { + "id": "q136", + "prompt": "How many repositories have more than 100000 stars and more than 20000 forks?", + "groundTruth": "43", + "type": "filtering", + "dataset": "github" + }, + { + "id": "q137", + "prompt": "How many repositories have more than 150000 stars and more than 30000 forks?", + "groundTruth": "25", + "type": "filtering", + "dataset": "github" + }, + { + "id": "q138", + "prompt": "How many repositories have more than 200000 stars and more than 45000 forks?", + "groundTruth": "6", + "type": "filtering", + "dataset": "github" + }, + { + "id": "q139", + "prompt": "How many repositories have more than 100000 stars and more than 7000 watchers?", + "groundTruth": "6", + "type": "filtering", + "dataset": "github" + }, + { + "id": "q140", + "prompt": "How many repositories have more than 150000 stars and more than 9000 watchers?", + "groundTruth": "1", + "type": "filtering", + "dataset": "github" + }, + { + "id": "q141", + "prompt": "What is the level of the log at 2025-11-02T16:55:04.316Z?", + "groundTruth": "error", + "type": "field-retrieval", + "dataset": "event-logs" + }, + { + "id": "q142", + "prompt": "What is the endpoint for the log at 2025-10-31T02:31:28.977Z?", + "groundTruth": "/api/users", + "type": "field-retrieval", + "dataset": "event-logs" + }, + { + "id": "q143", + "prompt": "What is the status code for the log at 2025-11-01T23:56:56.929Z?", + "groundTruth": "424", + "type": "field-retrieval", + "dataset": "event-logs" + }, + { + "id": "q144", + "prompt": "What is the response time for the log at 2025-11-03T12:14:31.017Z?", + "groundTruth": "2849", + "type": "field-retrieval", + "dataset": "event-logs" + }, + { + "id": "q145", + "prompt": "What is the level of the log at 2025-11-01T22:06:30.814Z?", + "groundTruth": "info", + "type": "field-retrieval", + "dataset": "event-logs" + }, + { + "id": "q146", + "prompt": "What is the endpoint for the log at 2025-11-06T05:48:07.260Z?", + "groundTruth": "/api/orders", + "type": "field-retrieval", + "dataset": "event-logs" + }, + { + "id": "q147", + "prompt": "What is the status code for the log at 2025-11-05T23:46:00.144Z?", + "groundTruth": "435", + "type": "field-retrieval", + "dataset": "event-logs" + }, + { + "id": "q148", + "prompt": "What is the response time for the log at 2025-10-31T23:56:23.022Z?", + "groundTruth": "408", + "type": "field-retrieval", + "dataset": "event-logs" + }, + { + "id": "q149", + "prompt": "What is the level of the log at 2025-11-06T01:23:44.734Z?", + "groundTruth": "error", + "type": "field-retrieval", + "dataset": "event-logs" + }, + { + "id": "q150", + "prompt": "What is the endpoint for the log at 2025-11-03T21:54:27.889Z?", + "groundTruth": "/api/users", + "type": "field-retrieval", + "dataset": "event-logs" + }, + { + "id": "q151", + "prompt": "How many log entries are in the dataset?", + "groundTruth": "75", + "type": "aggregation", + "dataset": "event-logs" + }, + { + "id": "q152", + "prompt": "What is the average response time across all logs?", + "groundTruth": "2453.41", + "type": "aggregation", + "dataset": "event-logs" + }, + { + "id": "q153", + "prompt": "How many log entries have level \"error\"?", + "groundTruth": "29", + "type": "aggregation", + "dataset": "event-logs" + }, + { + "id": "q154", + "prompt": "How many log entries have level \"warn\"?", + "groundTruth": "17", + "type": "aggregation", + "dataset": "event-logs" + }, + { + "id": "q155", + "prompt": "How many log entries have level \"info\"?", + "groundTruth": "29", + "type": "aggregation", + "dataset": "event-logs" + }, + { + "id": "q156", + "prompt": "How many log entries are for endpoint \"/api/products\"?", + "groundTruth": "11", + "type": "aggregation", + "dataset": "event-logs" + }, + { + "id": "q157", + "prompt": "How many log entries are for endpoint \"/api/users\"?", + "groundTruth": "18", + "type": "aggregation", + "dataset": "event-logs" + }, + { + "id": "q158", + "prompt": "How many log entries are for endpoint \"/api/auth\"?", + "groundTruth": "21", + "type": "aggregation", + "dataset": "event-logs" + }, + { + "id": "q159", + "prompt": "How many log entries are for endpoint \"/api/orders\"?", + "groundTruth": "11", + "type": "aggregation", + "dataset": "event-logs" + }, + { + "id": "q160", + "prompt": "How many log entries have a status code indicating an error (>= 400)?", + "groundTruth": "33", + "type": "aggregation", + "dataset": "event-logs" + }, + { + "id": "q161", + "prompt": "How many log entries have a successful status code (200-299)?", + "groundTruth": "42", + "type": "aggregation", + "dataset": "event-logs" + }, + { + "id": "q162", + "prompt": "How many log entries have a retryable error?", + "groundTruth": "25", + "type": "aggregation", + "dataset": "event-logs" + }, + { + "id": "q163", + "prompt": "How many log entries have level \"error\" and status code >= 400?", + "groundTruth": "29", + "type": "filtering", + "dataset": "event-logs" + }, + { + "id": "q164", + "prompt": "How many log entries have level \"warn\" and status code >= 400?", + "groundTruth": "4", + "type": "filtering", + "dataset": "event-logs" + }, + { + "id": "q165", + "prompt": "How many log entries have level \"info\" and status code >= 400?", + "groundTruth": "0", + "type": "filtering", + "dataset": "event-logs" + }, + { + "id": "q166", + "prompt": "How many log entries are for endpoint \"/api/products\" with status code >= 500?", + "groundTruth": "5", + "type": "filtering", + "dataset": "event-logs" + }, + { + "id": "q167", + "prompt": "How many log entries are for endpoint \"/api/users\" with status code >= 500?", + "groundTruth": "2", + "type": "filtering", + "dataset": "event-logs" + }, + { + "id": "q168", + "prompt": "How many log entries are for endpoint \"/api/auth\" with status code >= 500?", + "groundTruth": "3", + "type": "filtering", + "dataset": "event-logs" + }, + { + "id": "q169", + "prompt": "How many log entries for endpoint \"/api/products\" have a retryable error?", + "groundTruth": "4", + "type": "filtering", + "dataset": "event-logs" + }, + { + "id": "q170", + "prompt": "How many log entries for endpoint \"/api/users\" have a retryable error?", + "groundTruth": "5", + "type": "filtering", + "dataset": "event-logs" + }, + { + "id": "q171", + "prompt": "How many log entries for endpoint \"/api/auth\" have a retryable error?", + "groundTruth": "7", + "type": "filtering", + "dataset": "event-logs" + }, + { + "id": "q172", + "prompt": "What is the environment in the configuration?", + "groundTruth": "development", + "type": "field-retrieval", + "dataset": "nested-config" + }, + { + "id": "q173", + "prompt": "What is the database host?", + "groundTruth": "guilty-cake.org", + "type": "field-retrieval", + "dataset": "nested-config" + }, + { + "id": "q174", + "prompt": "What is the database port?", + "groundTruth": "5432", + "type": "field-retrieval", + "dataset": "nested-config" + }, + { + "id": "q175", + "prompt": "What is the maximum connection pool size?", + "groundTruth": "37", + "type": "field-retrieval", + "dataset": "nested-config" + }, + { + "id": "q176", + "prompt": "What is the session duration?", + "groundTruth": "86400", + "type": "field-retrieval", + "dataset": "nested-config" + }, + { + "id": "q177", + "prompt": "What is the minimum connection pool size?", + "groundTruth": "2", + "type": "field-retrieval", + "dataset": "nested-config" + }, + { + "id": "q178", + "prompt": "What is the connection pool idle timeout?", + "groundTruth": "30000", + "type": "field-retrieval", + "dataset": "nested-config" + }, + { + "id": "q179", + "prompt": "What is the database name?", + "groundTruth": "real", + "type": "field-retrieval", + "dataset": "nested-config" + }, + { + "id": "q180", + "prompt": "What is the session refresh threshold?", + "groundTruth": "3600", + "type": "field-retrieval", + "dataset": "nested-config" + }, + { + "id": "q181", + "prompt": "What is the version in the configuration?", + "groundTruth": "6.8.3", + "type": "field-retrieval", + "dataset": "nested-config" + }, + { + "id": "q182", + "prompt": "How many roles are defined in permissions?", + "groundTruth": "3", + "type": "aggregation", + "dataset": "nested-config" + }, + { + "id": "q183", + "prompt": "How many groups are defined in permissions?", + "groundTruth": "2", + "type": "aggregation", + "dataset": "nested-config" + }, + { + "id": "q184", + "prompt": "How many authentication providers are configured?", + "groundTruth": "2", + "type": "aggregation", + "dataset": "nested-config" + }, + { + "id": "q185", + "prompt": "How many feature flags are defined?", + "groundTruth": "2", + "type": "aggregation", + "dataset": "nested-config" + }, + { + "id": "q186", + "prompt": "How many database replicas are configured?", + "groundTruth": "3", + "type": "aggregation", + "dataset": "nested-config" + }, + { + "id": "q187", + "prompt": "How many authentication providers include the \"admin\" scope?", + "groundTruth": "1", + "type": "aggregation", + "dataset": "nested-config" + }, + { + "id": "q188", + "prompt": "How many feature flags are enabled?", + "groundTruth": "0", + "type": "aggregation", + "dataset": "nested-config" + }, + { + "id": "q189", + "prompt": "How many permissions does the admin role have?", + "groundTruth": "5", + "type": "aggregation", + "dataset": "nested-config" + }, + { + "id": "q190", + "prompt": "What is the total number of permissions across all roles?", + "groundTruth": "8", + "type": "aggregation", + "dataset": "nested-config" + }, + { + "id": "q191", + "prompt": "How many distinct permissions are defined across all roles?", + "groundTruth": "5", + "type": "aggregation", + "dataset": "nested-config" + }, + { + "id": "q192", + "prompt": "How many distinct scopes are defined across all authentication providers?", + "groundTruth": "3", + "type": "aggregation", + "dataset": "nested-config" + }, + { + "id": "q193", + "prompt": "What is the total number of variants across all feature flags?", + "groundTruth": "3", + "type": "aggregation", + "dataset": "nested-config" + }, + { + "id": "q194", + "prompt": "How many database replicas have a priority greater than 2?", + "groundTruth": "1", + "type": "aggregation", + "dataset": "nested-config" + }, + { + "id": "q195", + "prompt": "How many feature flags have a rollout percentage greater than 50?", + "groundTruth": "0", + "type": "aggregation", + "dataset": "nested-config" + }, + { + "id": "q196", + "prompt": "How many groups have more than one role assigned?", + "groundTruth": "1", + "type": "aggregation", + "dataset": "nested-config" + }, + { + "id": "q197", + "prompt": "How many feature flags are enabled with rollout greater than 50%?", + "groundTruth": "0", + "type": "filtering", + "dataset": "nested-config" + }, + { + "id": "q198", + "prompt": "How many groups have the admin role?", + "groundTruth": "1", + "type": "filtering", + "dataset": "nested-config" + }, + { + "id": "q199", + "prompt": "How many database replicas have priority greater than 2 and port 5432?", + "groundTruth": "1", + "type": "filtering", + "dataset": "nested-config" + }, + { + "id": "q200", + "prompt": "How many authentication providers have more than 2 scopes?", + "groundTruth": "1", + "type": "filtering", + "dataset": "nested-config" + }, + { + "id": "q201", + "prompt": "How many roles have at least 5 permissions?", + "groundTruth": "1", + "type": "filtering", + "dataset": "nested-config" + }, + { + "id": "q202", + "prompt": "How many feature flags are disabled with rollout less than 25%?", + "groundTruth": "2", + "type": "filtering", + "dataset": "nested-config" + } +] \ No newline at end of file diff --git a/benchmarks/scripts/token-efficiency-benchmark.ts b/benchmarks/scripts/token-efficiency-benchmark.ts index 1e36a13..48d3d0d 100644 --- a/benchmarks/scripts/token-efficiency-benchmark.ts +++ b/benchmarks/scripts/token-efficiency-benchmark.ts @@ -100,7 +100,7 @@ function generateTotalLines( const csvStr = baselineFormat.tokens.toLocaleString('en-US').padStart(TOKEN_PADDING) lines.push(`csv ${csvBar} ${csvStr} tokens`) - const overheadPercent = ((totalToonTokens - baselineFormat.tokens) / totalToonTokens) * 100 + const overheadPercent = ((totalToonTokens - baselineFormat.tokens) / baselineFormat.tokens) * 100 const toonBar = createProgressBar(100, 100, PROGRESS_BAR_WIDTH, PROGRESS_BAR_CONFIG) const toonStr = totalToonTokens.toLocaleString('en-US').padStart(TOKEN_PADDING) lines.push(`toon ${toonBar} ${toonStr} tokens (+${overheadPercent.toFixed(1)}% vs CSV)`) @@ -223,7 +223,7 @@ const flatCharts = flatOnlyDatasets // TOON line with overhead vs CSV const toonOverhead = toon.tokens - csv.tokens - const toonOverheadPercent = (toonOverhead / toon.tokens) * 100 + const toonOverheadPercent = (toonOverhead / csv.tokens) * 100 const toonBar = createProgressBar(100, 100, PROGRESS_BAR_WIDTH, PROGRESS_BAR_CONFIG) const toonStr = toon.tokens.toLocaleString('en-US') const toonVsCSV = toonOverheadPercent >= 0 diff --git a/benchmarks/src/constants.ts b/benchmarks/src/constants.ts index 05daee3..ea1f2d1 100644 --- a/benchmarks/src/constants.ts +++ b/benchmarks/src/constants.ts @@ -101,10 +101,10 @@ export const QUESTION_THRESHOLDS = { */ export const QUESTION_LIMITS = { tabular: { - fieldRetrieval: 20, - aggregationDepartments: 6, - filteringMultiConditionDepartments: 6, - filteringExperience: 4, + fieldRetrieval: 14, + aggregationDepartments: 4, + filteringMultiConditionDepartments: 5, + filteringExperience: 3, filteringDepartmentExp: 3, filteringDepartmentActive: 3, }, @@ -116,7 +116,7 @@ export const QUESTION_LIMITS = { filteringStatusAndItems: 3, }, analytics: { - fieldRetrievalDates: 13, + fieldRetrievalDates: 9, }, github: { fieldRetrievalRepos: 11, @@ -125,12 +125,12 @@ export const QUESTION_LIMITS = { }, eventLogs: { fieldRetrieval: 10, - aggregationEndpoints: 3, - filteringLevelAndStatus: 2, - filteringEndpointAndStatus: 2, + aggregationEndpoints: 4, + filteringLevelAndStatus: 3, + filteringEndpointAndStatus: 3, }, nestedConfig: { - fieldRetrieval: 5, - filteringComplex: 2, + fieldRetrieval: 10, + filteringComplex: 6, }, } as const diff --git a/benchmarks/src/datasets.ts b/benchmarks/src/datasets.ts index e763856..fe7e6fa 100644 --- a/benchmarks/src/datasets.ts +++ b/benchmarks/src/datasets.ts @@ -5,67 +5,6 @@ import githubRepos from '../data/github-repos.json' with { type: 'json' } // Seed for reproducibility faker.seed(12345) -/** - * Calculate the tabular eligibility percentage of a data structure - * - * @remarks - * Recursively analyzes data to determine what percentage of arrays qualify - * for TOON's tabular format (uniform objects with primitive values only). - */ -export function calculateTabularEligibility(data: unknown): number { - let totalArrays = 0 - let tabularArrays = 0 - - function isTabularArray(arr: unknown[]): boolean { - if (arr.length === 0) - return false - - // Check if all elements are objects - if (!arr.every(item => typeof item === 'object' && item !== null && !Array.isArray(item))) - return false - - // Get keys from first object - const firstKeys = Object.keys(arr[0] as Record) - if (firstKeys.length === 0) - return false - - // Check if all objects have the same keys and only primitive values - return arr.every((item) => { - const itemObj = item as Record - const itemKeys = Object.keys(itemObj) - if (itemKeys.length !== firstKeys.length) - return false - if (!firstKeys.every(key => itemKeys.includes(key))) - return false - - // Check if all values are primitives (no nested objects or arrays) - return firstKeys.every((key) => { - const value = itemObj[key] - return value === null || ['string', 'number', 'boolean'].includes(typeof value) - }) - }) - } - - function traverse(obj: unknown): void { - if (Array.isArray(obj)) { - totalArrays++ - if (isTabularArray(obj)) - tabularArrays++ - - // Continue traversing array elements - obj.forEach(item => traverse(item)) - } - else if (typeof obj === 'object' && obj !== null) { - // Traverse object properties - Object.values(obj).forEach(value => traverse(value)) - } - } - - traverse(data) - - return totalArrays === 0 ? 0 : Math.round((tabularArrays / totalArrays) * 100) -} - /** * Employee record structure for tabular dataset */ @@ -275,7 +214,7 @@ const tabularDataset: Dataset = { metadata: { supportsCSV: true, structureClass: 'uniform', - tabularEligibility: 100, + tabularEligibility: 100, // All arrays contain uniform objects with primitive values only }, } @@ -285,38 +224,21 @@ const tabularDataset: Dataset = { const PRODUCT_NAMES = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp'] as const const ORDER_STATUSES = ['pending', 'processing', 'shipped', 'delivered', 'cancelled'] as const -const ORDER_CONSTANTS = { - CUSTOMER_ID_MOD: 20, - MIN_ITEMS: 1, - MAX_ITEMS: 4, - MIN_ITEM_PRICE: 9.99, - MAX_ITEM_PRICE: 199.99, - MIN_ITEM_QUANTITY: 1, - MAX_ITEM_QUANTITY: 5, - SKU_LENGTH: 6, - ORDER_ID_PADDING: 4, - RECENT_DAYS: 90, - TAX_RATE: 0.08, -} as const - function generateOrders(count: number): { orders: Order[] } { return { orders: Array.from({ length: count }, (_, i) => { - const customerId = (i % ORDER_CONSTANTS.CUSTOMER_ID_MOD) + 1 - const itemCount = faker.number.int({ min: ORDER_CONSTANTS.MIN_ITEMS, max: ORDER_CONSTANTS.MAX_ITEMS }) + const customerId = (i % 20) + 1 // Rotate through 20 customers + const itemCount = faker.number.int({ min: 1, max: 4 }) // 1-4 items per order const items = Array.from({ length: itemCount }, (_, j) => { const price = faker.number.float({ - min: ORDER_CONSTANTS.MIN_ITEM_PRICE, - max: ORDER_CONSTANTS.MAX_ITEM_PRICE, + min: 9.99, + max: 199.99, fractionDigits: 2, }) - const quantity = faker.number.int({ - min: ORDER_CONSTANTS.MIN_ITEM_QUANTITY, - max: ORDER_CONSTANTS.MAX_ITEM_QUANTITY, - }) + const quantity = faker.number.int({ min: 1, max: 5 }) return { - sku: `SKU-${faker.string.alphanumeric({ length: ORDER_CONSTANTS.SKU_LENGTH }).toUpperCase()}`, + sku: `SKU-${faker.string.alphanumeric({ length: 6 }).toUpperCase()}`, name: PRODUCT_NAMES[j % PRODUCT_NAMES.length]!, quantity, price, @@ -324,11 +246,11 @@ function generateOrders(count: number): { orders: Order[] } { }) const subtotal = Number(items.reduce((sum, item) => sum + (item.price * item.quantity), 0).toFixed(2)) - const tax = Number((subtotal * ORDER_CONSTANTS.TAX_RATE).toFixed(2)) + const tax = Number((subtotal * 0.08).toFixed(2)) // 8% tax rate const total = Number((subtotal + tax).toFixed(2)) return { - orderId: `ORD-${String(i + 1).padStart(ORDER_CONSTANTS.ORDER_ID_PADDING, '0')}`, + orderId: `ORD-${String(i + 1).padStart(4, '0')}`, customer: { id: customerId, name: faker.person.fullName(), @@ -340,7 +262,7 @@ function generateOrders(count: number): { orders: Order[] } { tax, total, status: ORDER_STATUSES[i % ORDER_STATUSES.length]!, - orderDate: faker.date.recent({ days: ORDER_CONSTANTS.RECENT_DAYS }).toISOString().split('T')[0], + orderDate: faker.date.recent({ days: 90 }).toISOString().split('T')[0], } }), } @@ -359,7 +281,7 @@ const nestedDataset: Dataset = { metadata: { supportsCSV: false, structureClass: 'nested', - tabularEligibility: 33, // orders array is not tabular, but items arrays within are + tabularEligibility: 33, // Top-level orders array has nested objects (not tabular), but nested items arrays are tabular }, } @@ -376,7 +298,7 @@ const analyticsDataset: Dataset = { metadata: { supportsCSV: true, structureClass: 'uniform', - tabularEligibility: 100, + tabularEligibility: 100, // Uniform time-series records with consistent primitive fields }, } @@ -395,7 +317,7 @@ const githubDataset: Dataset = { metadata: { supportsCSV: true, structureClass: 'uniform', - tabularEligibility: 100, + tabularEligibility: 100, // Repository array contains uniform objects with primitive values }, } @@ -597,7 +519,7 @@ const eventLogsDataset: Dataset = { metadata: { supportsCSV: false, structureClass: 'semi-uniform', - tabularEligibility: 50, // ~50% of logs have nested error objects + tabularEligibility: 50, // Top-level logs array is tabular, but ~50% have nested optional error objects }, } @@ -614,7 +536,7 @@ const nestedConfigDataset: Dataset = { metadata: { supportsCSV: false, structureClass: 'deep', - tabularEligibility: 0, // Highly nested, minimal tabular arrays + tabularEligibility: 0, // Deeply nested configuration with no tabular arrays }, } @@ -642,7 +564,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [ metadata: { supportsCSV: true, structureClass: 'uniform', - tabularEligibility: 100, + tabularEligibility: 100, // All arrays contain uniform objects with primitive values only }, }, // Nested: 500 orders @@ -653,7 +575,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [ metadata: { supportsCSV: false, structureClass: 'nested', - tabularEligibility: 33, + tabularEligibility: 33, // Top-level orders array has nested objects (not tabular), but nested items arrays are tabular }, }, // Analytics: 365 days @@ -664,7 +586,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [ metadata: { supportsCSV: true, structureClass: 'uniform', - tabularEligibility: 100, + tabularEligibility: 100, // Uniform time-series records with consistent primitive fields }, }, // GitHub: 100 repos (same as accuracy) @@ -677,7 +599,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [ metadata: { supportsCSV: false, structureClass: 'semi-uniform', - tabularEligibility: 50, + tabularEligibility: 50, // Top-level logs array is tabular, but ~50% have nested optional error objects }, }, // Nested config: 1 config (same as accuracy) diff --git a/benchmarks/src/evaluate.ts b/benchmarks/src/evaluate.ts index ecf24e6..be1c3bf 100644 --- a/benchmarks/src/evaluate.ts +++ b/benchmarks/src/evaluate.ts @@ -4,7 +4,6 @@ import { anthropic } from '@ai-sdk/anthropic' import { google } from '@ai-sdk/google' import { openai } from '@ai-sdk/openai' import { xai } from '@ai-sdk/xai' -import * as prompts from '@clack/prompts' import { generateText } from 'ai' /** @@ -102,17 +101,10 @@ Is the actual answer correct? Consider: Respond with only "YES" or "NO". `.trim() - try { - const { text } = await generateText({ - model: models.find(m => m.modelId === 'gpt-5-nano')!, - prompt, - }) + const { text } = await generateText({ + model: models.find(m => m.modelId === 'gpt-5-nano')!, + prompt, + }) - return text.trim().toUpperCase() === 'YES' - } - catch (error) { - prompts.log.error(`Validation error: ${error}`) - // Fallback to simple string comparison - return actual.toLowerCase().trim() === expected.toLowerCase().trim() - } + return text.trim().toUpperCase() === 'YES' } diff --git a/benchmarks/src/questions/analytics.ts b/benchmarks/src/questions/analytics.ts index 4c58639..c08b263 100644 --- a/benchmarks/src/questions/analytics.ts +++ b/benchmarks/src/questions/analytics.ts @@ -1,7 +1,7 @@ import type { AnalyticsMetric } from '../datasets' import type { Question } from '../types' import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants' -import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils' +import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils' /** * Generate analytics (website metrics) questions @@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: () => string): Question[] { const questions: Question[] = [] - if (metrics.length === 0) - return questions - // Field retrieval: date-based metrics const metricFieldGenerators: Array<(metric: AnalyticsMetric, getId: () => string) => Question> = [ (metric, getId) => new QuestionBuilder() @@ -99,7 +96,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: () // Aggregation: high views/conversions for (const threshold of QUESTION_THRESHOLDS.analytics.views) { - const count = countByPredicate(metrics, m => m.views > threshold) + const count = metrics.filter(m => m.views > threshold).length questions.push( new QuestionBuilder() .id(getId()) @@ -112,7 +109,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: () } for (const threshold of QUESTION_THRESHOLDS.analytics.conversions) { - const count = countByPredicate(metrics, m => m.conversions > threshold) + const count = metrics.filter(m => m.conversions > threshold).length questions.push( new QuestionBuilder() .id(getId()) @@ -126,10 +123,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: () // Filtering: multi-condition (views AND revenue) for (const threshold of QUESTION_THRESHOLDS.analytics.viewsForFiltering) { - const count = countByPredicate( - metrics, + const count = metrics.filter( m => m.views > threshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForFiltering, - ) + ).length questions.push( new QuestionBuilder() .id(getId()) @@ -143,10 +139,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: () // Filtering: revenue thresholds for (const threshold of QUESTION_THRESHOLDS.analytics.revenueThresholds) { - const count = countByPredicate( - metrics, + const count = metrics.filter( m => m.revenue > threshold && m.views > QUESTION_THRESHOLDS.analytics.viewsThresholdForRevenue, - ) + ).length questions.push( new QuestionBuilder() .id(getId()) @@ -160,10 +155,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: () // Filtering: clicks and conversions for (const threshold of QUESTION_THRESHOLDS.analytics.clicksForFiltering) { - const count = countByPredicate( - metrics, + const count = metrics.filter( m => m.clicks > threshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForClickFiltering, - ) + ).length questions.push( new QuestionBuilder() .id(getId()) @@ -177,10 +171,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: () // Filtering: revenue and bounce rate for (const threshold of QUESTION_THRESHOLDS.analytics.revenueForBounceRate) { - const count = countByPredicate( - metrics, + const count = metrics.filter( m => m.revenue > threshold && m.bounceRate < QUESTION_THRESHOLDS.analytics.bounceRateThreshold, - ) + ).length questions.push( new QuestionBuilder() .id(getId()) diff --git a/benchmarks/src/questions/event-logs.ts b/benchmarks/src/questions/event-logs.ts index 3e4650a..2a35e52 100644 --- a/benchmarks/src/questions/event-logs.ts +++ b/benchmarks/src/questions/event-logs.ts @@ -1,7 +1,7 @@ import type { EventLog } from '../datasets' import type { Question } from '../types' import { QUESTION_LIMITS } from '../constants' -import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils' +import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils' /** * Generate event log questions @@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr export function generateEventLogsQuestions(logs: EventLog[], getId: () => string): Question[] { const questions: Question[] = [] - if (logs.length === 0) - return questions - // Field retrieval: log metadata const logFieldGenerators: Array<(log: EventLog, getId: () => string) => Question> = [ (log, getId) => new QuestionBuilder() @@ -76,7 +73,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string // Aggregation: by level const levels = [...new Set(logs.map(l => l.level))] for (const level of levels) { - const count = countByPredicate(logs, l => l.level === level) + const count = logs.filter(l => l.level === level).length questions.push( new QuestionBuilder() .id(getId()) @@ -91,7 +88,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string // Aggregation: by endpoint const endpoints = [...new Set(logs.map(l => l.endpoint))] for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.aggregationEndpoints)) { - const count = countByPredicate(logs, l => l.endpoint === endpoint) + const count = logs.filter(l => l.endpoint === endpoint).length questions.push( new QuestionBuilder() .id(getId()) @@ -104,8 +101,8 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string } // Aggregation: by status code range - const errorCount = countByPredicate(logs, l => l.statusCode >= 400) - const successCount = countByPredicate(logs, l => l.statusCode >= 200 && l.statusCode < 300) + const errorCount = logs.filter(l => l.statusCode >= 400).length + const successCount = logs.filter(l => l.statusCode >= 200 && l.statusCode < 300).length questions.push( new QuestionBuilder() @@ -124,12 +121,21 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string .build(), ) + // Aggregation: retryable errors + const retryableErrorCount = logs.filter(l => l.error?.retryable === true).length + questions.push( + new QuestionBuilder() + .id(getId()) + .prompt('How many log entries have a retryable error?') + .groundTruth(String(retryableErrorCount)) + .type('aggregation') + .dataset('event-logs') + .build(), + ) + // Filtering: multi-condition (level AND status) for (const level of levels.slice(0, QUESTION_LIMITS.eventLogs.filteringLevelAndStatus)) { - const count = countByPredicate( - logs, - l => l.level === level && l.statusCode >= 400, - ) + const count = logs.filter(l => l.level === level && l.statusCode >= 400).length questions.push( new QuestionBuilder() .id(getId()) @@ -143,10 +149,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string // Filtering: endpoint AND status for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointAndStatus)) { - const count = countByPredicate( - logs, - l => l.endpoint === endpoint && l.statusCode >= 500, - ) + const count = logs.filter(l => l.endpoint === endpoint && l.statusCode >= 500).length questions.push( new QuestionBuilder() .id(getId()) @@ -158,5 +161,19 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string ) } + // Filtering: endpoint AND retryable error + for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointAndStatus)) { + const count = logs.filter(l => l.endpoint === endpoint && l.error?.retryable === true).length + questions.push( + new QuestionBuilder() + .id(getId()) + .prompt(`How many log entries for endpoint "${endpoint}" have a retryable error?`) + .groundTruth(String(count)) + .type('filtering') + .dataset('event-logs') + .build(), + ) + } + return questions } diff --git a/benchmarks/src/questions/github.ts b/benchmarks/src/questions/github.ts index f9b4bd3..38c378b 100644 --- a/benchmarks/src/questions/github.ts +++ b/benchmarks/src/questions/github.ts @@ -1,7 +1,7 @@ import type { Repository } from '../datasets' import type { Question } from '../types' import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants' -import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils' +import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils' /** * Generate GitHub repository questions @@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr export function generateGithubQuestions(repos: Repository[], getId: () => string): Question[] { const questions: Question[] = [] - if (repos.length === 0) - return questions - // Field retrieval: repository metadata const repoFieldGenerators: Array<(repo: Repository, getId: () => string) => Question> = [ (repo, getId) => new QuestionBuilder() @@ -92,7 +89,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string // Aggregation: by default branch const branches = [...new Set(repos.map(r => r.defaultBranch))] for (const branch of branches.slice(0, QUESTION_LIMITS.github.aggregationBranches)) { - const count = countByPredicate(repos, r => r.defaultBranch === branch) + const count = repos.filter(r => r.defaultBranch === branch).length questions.push( new QuestionBuilder() .id(getId()) @@ -106,7 +103,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string // Aggregation: high star counts for (const threshold of QUESTION_THRESHOLDS.github.stars) { - const count = countByPredicate(repos, r => r.stars > threshold) + const count = repos.filter(r => r.stars > threshold).length questions.push( new QuestionBuilder() .id(getId()) @@ -120,7 +117,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string // Aggregation: high fork counts for (const threshold of QUESTION_THRESHOLDS.github.forks) { - const count = countByPredicate(repos, r => r.forks > threshold) + const count = repos.filter(r => r.forks > threshold).length questions.push( new QuestionBuilder() .id(getId()) @@ -134,7 +131,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string // Aggregation: high watcher counts for (const threshold of QUESTION_THRESHOLDS.github.watchers) { - const count = countByPredicate(repos, r => r.watchers > threshold) + const count = repos.filter(r => r.watchers > threshold).length questions.push( new QuestionBuilder() .id(getId()) @@ -148,10 +145,9 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string // Filtering: multi-condition (stars AND forks) for (const combo of QUESTION_THRESHOLDS.github.starForkCombinations.slice(0, QUESTION_LIMITS.github.filteringStarsAndForks)) { - const count = countByPredicate( - repos, + const count = repos.filter( r => r.stars > combo.stars && r.forks > combo.forks, - ) + ).length questions.push( new QuestionBuilder() .id(getId()) @@ -165,10 +161,9 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string // Filtering: stars AND watchers for (const combo of QUESTION_THRESHOLDS.github.starWatcherCombinations) { - const count = countByPredicate( - repos, + const count = repos.filter( r => r.stars > combo.stars && r.watchers > combo.watchers, - ) + ).length questions.push( new QuestionBuilder() .id(getId()) diff --git a/benchmarks/src/questions/index.ts b/benchmarks/src/questions/index.ts index 9bac171..63f03c9 100644 --- a/benchmarks/src/questions/index.ts +++ b/benchmarks/src/questions/index.ts @@ -10,10 +10,9 @@ import { generateTabularQuestions } from './tabular' import { createIdGenerator } from './utils' /** - * Generate all questions from datasets + * Generate ~200 questions from all datasets * * @remarks - * Generates ~150-160 questions across different question types and datasets: * - Field Retrieval: Direct field access with no computation * Examples: "What is X's salary?", "What is the status of order Y?" * - Aggregation: Counts, sums, averages, min/max operations (including single-condition filters) diff --git a/benchmarks/src/questions/nested-config.ts b/benchmarks/src/questions/nested-config.ts index 8ebc9f6..6f04dcd 100644 --- a/benchmarks/src/questions/nested-config.ts +++ b/benchmarks/src/questions/nested-config.ts @@ -34,6 +34,26 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined, prompt: 'What is the session duration?', groundTruth: String(config.authentication.session.duration), }, + { + prompt: 'What is the minimum connection pool size?', + groundTruth: String(config.database.pool.min), + }, + { + prompt: 'What is the connection pool idle timeout?', + groundTruth: String(config.database.pool.idleTimeout), + }, + { + prompt: 'What is the database name?', + groundTruth: config.database.name, + }, + { + prompt: 'What is the session refresh threshold?', + groundTruth: String(config.authentication.session.refreshThreshold), + }, + { + prompt: 'What is the version in the configuration?', + groundTruth: config.version, + }, ] for (const q of fieldRetrievalQuestions.slice(0, QUESTION_LIMITS.nestedConfig.fieldRetrieval)) { @@ -93,6 +113,18 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined, .build(), ) + // Aggregation: providers with admin scope + const adminScopeProviderCount = config.authentication.providers.filter(p => p.scopes.includes('admin')).length + questions.push( + new QuestionBuilder() + .id(getId()) + .prompt('How many authentication providers include the "admin" scope?') + .groundTruth(String(adminScopeProviderCount)) + .type('aggregation') + .dataset('nested-config') + .build(), + ) + // Aggregation: feature flag details const enabledFeatures = Object.entries(config.features).filter(([_, f]) => f.enabled).length questions.push( @@ -117,6 +149,67 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined, .build(), ) + // Aggregation: additional nested counts + const totalPermissions = Object.values(config.permissions.roles).reduce((sum, role) => sum + role.permissions.length, 0) + const distinctPermissions = new Set(Object.values(config.permissions.roles).flatMap(r => r.permissions)).size + const distinctScopes = new Set(config.authentication.providers.flatMap(p => p.scopes)).size + const totalVariants = Object.values(config.features).reduce((sum, f) => sum + f.variants.length, 0) + const highPriorityReplicas = config.database.replicas.filter(r => r.priority > 2).length + const featuresWithHighRollout = Object.values(config.features).filter(f => f.rollout > 50).length + const groupsWithMultipleRoles = Object.values(config.permissions.groups).filter(g => g.roles.length > 1).length + + questions.push( + new QuestionBuilder() + .id(getId()) + .prompt('What is the total number of permissions across all roles?') + .groundTruth(String(totalPermissions)) + .type('aggregation') + .dataset('nested-config') + .build(), + new QuestionBuilder() + .id(getId()) + .prompt('How many distinct permissions are defined across all roles?') + .groundTruth(String(distinctPermissions)) + .type('aggregation') + .dataset('nested-config') + .build(), + new QuestionBuilder() + .id(getId()) + .prompt('How many distinct scopes are defined across all authentication providers?') + .groundTruth(String(distinctScopes)) + .type('aggregation') + .dataset('nested-config') + .build(), + new QuestionBuilder() + .id(getId()) + .prompt('What is the total number of variants across all feature flags?') + .groundTruth(String(totalVariants)) + .type('aggregation') + .dataset('nested-config') + .build(), + new QuestionBuilder() + .id(getId()) + .prompt('How many database replicas have a priority greater than 2?') + .groundTruth(String(highPriorityReplicas)) + .type('aggregation') + .dataset('nested-config') + .build(), + new QuestionBuilder() + .id(getId()) + .prompt('How many feature flags have a rollout percentage greater than 50?') + .groundTruth(String(featuresWithHighRollout)) + .type('aggregation') + .dataset('nested-config') + .build(), + new QuestionBuilder() + .id(getId()) + .prompt('How many groups have more than one role assigned?') + .groundTruth(String(groupsWithMultipleRoles)) + .type('aggregation') + .dataset('nested-config') + .build(), + ) + // Filtering: complex multi-condition queries const filteringQuestions = [ { @@ -129,6 +222,31 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined, groundTruth: String(Object.entries(config.permissions.groups) .filter(([_, g]) => g.roles.includes('admin')).length), }, + { + prompt: 'How many database replicas have priority greater than 2 and port 5432?', + groundTruth: String(config.database.replicas + .filter(r => r.priority > 2 && r.port === 5432).length), + }, + { + prompt: 'How many authentication providers have more than 2 scopes?', + groundTruth: String(config.authentication.providers + .filter(p => p.scopes.length > 2).length), + }, + { + prompt: 'How many roles have at least 5 permissions?', + groundTruth: String(Object.values(config.permissions.roles) + .filter(r => r.permissions.length >= 5).length), + }, + { + prompt: 'How many feature flags are disabled with rollout less than 25%?', + groundTruth: String(Object.values(config.features) + .filter(f => !f.enabled && f.rollout < 25).length), + }, + { + prompt: 'How many enabled features have at least 2 variants?', + groundTruth: String(Object.values(config.features) + .filter(f => f.enabled && f.variants.length >= 2).length), + }, ] for (const q of filteringQuestions.slice(0, QUESTION_LIMITS.nestedConfig.filteringComplex)) { diff --git a/benchmarks/src/questions/nested.ts b/benchmarks/src/questions/nested.ts index e54512b..de16c55 100644 --- a/benchmarks/src/questions/nested.ts +++ b/benchmarks/src/questions/nested.ts @@ -1,7 +1,7 @@ import type { Order } from '../datasets' import type { Question } from '../types' import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants' -import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils' +import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils' /** * Generate nested (orders) questions @@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr export function generateNestedQuestions(orders: Order[], getId: () => string): Question[] { const questions: Question[] = [] - if (orders.length === 0) - return questions - // Field retrieval: order totals and statuses const orderFieldGenerators: Array<(order: Order, getId: () => string) => Question> = [ (order, getId) => new QuestionBuilder() @@ -89,7 +86,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q // Count by status const statuses = [...new Set(orders.map(o => o.status))] for (const status of statuses.slice(0, QUESTION_LIMITS.nested.aggregationStatuses)) { - const count = countByPredicate(orders, o => o.status === status) + const count = orders.filter(o => o.status === status).length questions.push( new QuestionBuilder() .id(getId()) @@ -134,7 +131,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q // Aggregation: high-value orders (single-condition filter) for (const threshold of QUESTION_THRESHOLDS.nested.highValueOrders) { - const count = countByPredicate(orders, o => o.total > threshold) + const count = orders.filter(o => o.total > threshold).length questions.push( new QuestionBuilder() .id(getId()) @@ -149,10 +146,9 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q // Filtering: multi-condition queries (status AND value) const orderStatuses = [...new Set(orders.map(o => o.status))] for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndValue)) { - const count = countByPredicate( - orders, + const count = orders.filter( o => o.status === status && o.total > QUESTION_THRESHOLDS.nested.statusValueThreshold, - ) + ).length questions.push( new QuestionBuilder() .id(getId()) @@ -166,10 +162,9 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q // Filtering: status AND items count (multi-condition) for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndItems)) { - const count = countByPredicate( - orders, + const count = orders.filter( o => o.status === status && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold, - ) + ).length questions.push( new QuestionBuilder() .id(getId()) @@ -183,10 +178,9 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q // Filtering: total AND items count (multi-condition) for (const threshold of QUESTION_THRESHOLDS.nested.totalThresholdsForItems) { - const count = countByPredicate( - orders, + const count = orders.filter( o => o.total > threshold && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold, - ) + ).length questions.push( new QuestionBuilder() .id(getId()) diff --git a/benchmarks/src/questions/tabular.ts b/benchmarks/src/questions/tabular.ts index 951bfdb..b9a5a01 100644 --- a/benchmarks/src/questions/tabular.ts +++ b/benchmarks/src/questions/tabular.ts @@ -1,7 +1,7 @@ import type { Employee } from '../datasets' import type { Question } from '../types' import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants' -import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils' +import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils' /** * Generate tabular (employee) questions @@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr export function generateTabularQuestions(employees: Employee[], getId: () => string): Question[] { const questions: Question[] = [] - if (employees.length === 0) - return questions - // Field retrieval: specific employees const fieldGenerators: Array<(emp: Employee, getId: () => string) => Question> = [ (emp, getId) => new QuestionBuilder() @@ -62,7 +59,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str // Aggregation: count by department const departments = [...new Set(employees.map(e => e.department))] for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.aggregationDepartments)) { - const count = countByPredicate(employees, e => e.department === dept) + const count = employees.filter(e => e.department === dept).length questions.push( new QuestionBuilder() .id(getId()) @@ -76,7 +73,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str // Aggregation: salary ranges (single-condition filters) for (const threshold of QUESTION_THRESHOLDS.tabular.salaryRanges) { - const count = countByPredicate(employees, e => e.salary > threshold) + const count = employees.filter(e => e.salary > threshold).length questions.push( new QuestionBuilder() .id(getId()) @@ -91,8 +88,8 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str // Aggregation: totals and averages const totalEmployees = employees.length const avgSalary = Math.round(employees.reduce((sum, e) => sum + e.salary, 0) / totalEmployees) - const activeCount = countByPredicate(employees, e => e.active) - const inactiveCount = countByPredicate(employees, e => !e.active) + const activeCount = employees.filter(e => e.active).length + const inactiveCount = employees.filter(e => !e.active).length questions.push( new QuestionBuilder() @@ -127,10 +124,9 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str // Filtering: count by department with salary filter (multi-condition) for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringMultiConditionDepartments)) { - const count = countByPredicate( - employees, + const count = employees.filter( e => e.department === dept && e.salary > QUESTION_THRESHOLDS.tabular.departmentSalaryThreshold, - ) + ).length questions.push( new QuestionBuilder() .id(getId()) @@ -144,7 +140,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str // Filtering: active employees by experience (multi-condition) for (const exp of QUESTION_THRESHOLDS.tabular.experienceYears.slice(0, QUESTION_LIMITS.tabular.filteringExperience)) { - const count = countByPredicate(employees, e => e.yearsExperience > exp && e.active) + const count = employees.filter(e => e.yearsExperience > exp && e.active).length questions.push( new QuestionBuilder() .id(getId()) @@ -158,10 +154,9 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str // Filtering: department by experience (multi-condition) for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentExp)) { - const count = countByPredicate( - employees, + const count = employees.filter( e => e.department === dept && e.yearsExperience > QUESTION_THRESHOLDS.tabular.departmentExperienceThreshold, - ) + ).length questions.push( new QuestionBuilder() .id(getId()) @@ -175,7 +170,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str // Filtering: department by active status (multi-condition) for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentActive)) { - const count = countByPredicate(employees, e => e.department === dept && e.active) + const count = employees.filter(e => e.department === dept && e.active).length questions.push( new QuestionBuilder() .id(getId()) diff --git a/benchmarks/src/questions/utils.ts b/benchmarks/src/questions/utils.ts index 45c2c58..e3004e6 100644 --- a/benchmarks/src/questions/utils.ts +++ b/benchmarks/src/questions/utils.ts @@ -61,14 +61,7 @@ export class QuestionBuilder { } /** - * Helper: Count items matching a predicate - */ -export function countByPredicate(items: T[], predicate: (item: T) => boolean): number { - return items.filter(predicate).length -} - -/** - * Helper: Rotate through question generators + * Rotate through question generators */ export function rotateQuestions( items: T[], diff --git a/benchmarks/src/types.ts b/benchmarks/src/types.ts index 5676920..e586cb5 100644 --- a/benchmarks/src/types.ts +++ b/benchmarks/src/types.ts @@ -15,7 +15,7 @@ export interface Question { id: string prompt: string groundTruth: string - type: 'field-retrieval' | 'aggregation' | 'filtering' | 'comparison' + type: 'field-retrieval' | 'aggregation' | 'filtering' dataset: string }