From a9d52fc69baadbdea58770188d171c113e336312 Mon Sep 17 00:00:00 2001
From: Johann Schopplich <mail@johannschopplich.com>
Date: Thu, 6 Nov 2025 15:51:31 +0100
Subject: [PATCH] chore: more work on benchmarks

---
 benchmarks/README.md                          |    2 +-
 benchmarks/questions-generated.json           | 1416 +++++++++++++++++
 .../scripts/token-efficiency-benchmark.ts     |    4 +-
 benchmarks/src/constants.ts                   |   20 +-
 benchmarks/src/datasets.ts                    |  116 +-
 benchmarks/src/evaluate.ts                    |   18 +-
 benchmarks/src/questions/analytics.ts         |   29 +-
 benchmarks/src/questions/event-logs.ts        |   49 +-
 benchmarks/src/questions/github.ts            |   23 +-
 benchmarks/src/questions/index.ts             |    3 +-
 benchmarks/src/questions/nested-config.ts     |  118 ++
 benchmarks/src/questions/nested.ts            |   24 +-
 benchmarks/src/questions/tabular.ts           |   27 +-
 benchmarks/src/questions/utils.ts             |    9 +-
 benchmarks/src/types.ts                       |    2 +-
 15 files changed, 1647 insertions(+), 213 deletions(-)
 create mode 100644 benchmarks/questions-generated.json

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 81e78c5..1309ecb 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -34,7 +34,7 @@ Results are saved to `results/token-efficiency.md`.
 
 Tests how well LLMs can answer questions about data in different formats (TOON, JSON, JSON compact, XML, YAML, CSV):
 
-1. Generate ~150-160 questions across 6 datasets (CSV only included for datasets with flat/tabular structure)
+1. Generate ~200 questions across 6 datasets (CSV only included for datasets with flat/tabular structure)
 2. Convert each dataset to all supported formats
 3. Query each LLM with formatted data + question
 4. Validate answers using `gpt-5-nano` as judge
diff --git a/benchmarks/questions-generated.json b/benchmarks/questions-generated.json
new file mode 100644
index 0000000..499eaee
--- /dev/null
+++ b/benchmarks/questions-generated.json
@@ -0,0 +1,1416 @@
+[
+  {
+    "id": "q1",
+    "prompt": "What is the salary of Constance Mante?",
+    "groundTruth": "56176",
+    "type": "field-retrieval",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q2",
+    "prompt": "What department does Alfonso Leffler work in?",
+    "groundTruth": "Marketing",
+    "type": "field-retrieval",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q3",
+    "prompt": "What is the email address of Mr. Corey Pfeffer?",
+    "groundTruth": "lorenza.kunze@yahoo.com",
+    "type": "field-retrieval",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q4",
+    "prompt": "How many years of experience does Mr. Brendan Harvey have?",
+    "groundTruth": "22",
+    "type": "field-retrieval",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q5",
+    "prompt": "Is Tracy Gleason an active employee?",
+    "groundTruth": "no",
+    "type": "field-retrieval",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q6",
+    "prompt": "What is the salary of Terri Wilkinson?",
+    "groundTruth": "133081",
+    "type": "field-retrieval",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q7",
+    "prompt": "What department does Aubrey Koss work in?",
+    "groundTruth": "Engineering",
+    "type": "field-retrieval",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q8",
+    "prompt": "What is the email address of Darren Homenick?",
+    "groundTruth": "delpha.russel@gmail.com",
+    "type": "field-retrieval",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q9",
+    "prompt": "How many years of experience does Dr. Ken Heller have?",
+    "groundTruth": "5",
+    "type": "field-retrieval",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q10",
+    "prompt": "Is Mr. Wade Collier an active employee?",
+    "groundTruth": "yes",
+    "type": "field-retrieval",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q11",
+    "prompt": "What is the salary of Hannah Waelchi?",
+    "groundTruth": "109064",
+    "type": "field-retrieval",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q12",
+    "prompt": "What department does Emily Harvey work in?",
+    "groundTruth": "Operations",
+    "type": "field-retrieval",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q13",
+    "prompt": "What is the email address of Chester Crist?",
+    "groundTruth": "henderson70@yahoo.com",
+    "type": "field-retrieval",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q14",
+    "prompt": "How many years of experience does Barbara Emard have?",
+    "groundTruth": "23",
+    "type": "field-retrieval",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q15",
+    "prompt": "How many employees work in Engineering?",
+    "groundTruth": "17",
+    "type": "aggregation",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q16",
+    "prompt": "How many employees work in Sales?",
+    "groundTruth": "17",
+    "type": "aggregation",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q17",
+    "prompt": "How many employees work in Marketing?",
+    "groundTruth": "17",
+    "type": "aggregation",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q18",
+    "prompt": "How many employees work in HR?",
+    "groundTruth": "17",
+    "type": "aggregation",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q19",
+    "prompt": "How many employees have a salary greater than 60000?",
+    "groundTruth": "91",
+    "type": "aggregation",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q20",
+    "prompt": "How many employees have a salary greater than 80000?",
+    "groundTruth": "67",
+    "type": "aggregation",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q21",
+    "prompt": "How many employees have a salary greater than 100000?",
+    "groundTruth": "41",
+    "type": "aggregation",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q22",
+    "prompt": "How many employees have a salary greater than 120000?",
+    "groundTruth": "26",
+    "type": "aggregation",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q23",
+    "prompt": "How many employees are in the dataset?",
+    "groundTruth": "100",
+    "type": "aggregation",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q24",
+    "prompt": "What is the average salary across all employees?",
+    "groundTruth": "96503",
+    "type": "aggregation",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q25",
+    "prompt": "How many employees are active?",
+    "groundTruth": "78",
+    "type": "aggregation",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q26",
+    "prompt": "How many employees are inactive?",
+    "groundTruth": "22",
+    "type": "aggregation",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q27",
+    "prompt": "How many employees in Engineering have a salary greater than 80000?",
+    "groundTruth": "12",
+    "type": "filtering",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q28",
+    "prompt": "How many employees in Sales have a salary greater than 80000?",
+    "groundTruth": "11",
+    "type": "filtering",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q29",
+    "prompt": "How many employees in Marketing have a salary greater than 80000?",
+    "groundTruth": "11",
+    "type": "filtering",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q30",
+    "prompt": "How many employees in HR have a salary greater than 80000?",
+    "groundTruth": "12",
+    "type": "filtering",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q31",
+    "prompt": "How many employees in Operations have a salary greater than 80000?",
+    "groundTruth": "11",
+    "type": "filtering",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q32",
+    "prompt": "How many active employees have more than 5 years of experience?",
+    "groundTruth": "63",
+    "type": "filtering",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q33",
+    "prompt": "How many active employees have more than 10 years of experience?",
+    "groundTruth": "53",
+    "type": "filtering",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q34",
+    "prompt": "How many active employees have more than 15 years of experience?",
+    "groundTruth": "39",
+    "type": "filtering",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q35",
+    "prompt": "How many employees in Engineering have more than 10 years of experience?",
+    "groundTruth": "11",
+    "type": "filtering",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q36",
+    "prompt": "How many employees in Sales have more than 10 years of experience?",
+    "groundTruth": "8",
+    "type": "filtering",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q37",
+    "prompt": "How many employees in Marketing have more than 10 years of experience?",
+    "groundTruth": "15",
+    "type": "filtering",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q38",
+    "prompt": "How many active employees work in Engineering?",
+    "groundTruth": "12",
+    "type": "filtering",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q39",
+    "prompt": "How many active employees work in Sales?",
+    "groundTruth": "11",
+    "type": "filtering",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q40",
+    "prompt": "How many active employees work in Marketing?",
+    "groundTruth": "14",
+    "type": "filtering",
+    "dataset": "tabular"
+  },
+  {
+    "id": "q41",
+    "prompt": "What is the total for order ORD-0001?",
+    "groundTruth": "103.86",
+    "type": "field-retrieval",
+    "dataset": "nested"
+  },
+  {
+    "id": "q42",
+    "prompt": "What is the status of order ORD-0003?",
+    "groundTruth": "shipped",
+    "type": "field-retrieval",
+    "dataset": "nested"
+  },
+  {
+    "id": "q43",
+    "prompt": "What is the total for order ORD-0005?",
+    "groundTruth": "422.5",
+    "type": "field-retrieval",
+    "dataset": "nested"
+  },
+  {
+    "id": "q44",
+    "prompt": "What is the status of order ORD-0007?",
+    "groundTruth": "processing",
+    "type": "field-retrieval",
+    "dataset": "nested"
+  },
+  {
+    "id": "q45",
+    "prompt": "What is the total for order ORD-0009?",
+    "groundTruth": "1822.85",
+    "type": "field-retrieval",
+    "dataset": "nested"
+  },
+  {
+    "id": "q46",
+    "prompt": "What is the status of order ORD-0011?",
+    "groundTruth": "pending",
+    "type": "field-retrieval",
+    "dataset": "nested"
+  },
+  {
+    "id": "q47",
+    "prompt": "What is the total for order ORD-0013?",
+    "groundTruth": "1311.35",
+    "type": "field-retrieval",
+    "dataset": "nested"
+  },
+  {
+    "id": "q48",
+    "prompt": "What is the status of order ORD-0015?",
+    "groundTruth": "cancelled",
+    "type": "field-retrieval",
+    "dataset": "nested"
+  },
+  {
+    "id": "q49",
+    "prompt": "What is the customer name for order ORD-0002?",
+    "groundTruth": "Debbie O'Kon I",
+    "type": "field-retrieval",
+    "dataset": "nested"
+  },
+  {
+    "id": "q50",
+    "prompt": "What is the customer email for order ORD-0004?",
+    "groundTruth": "demetris.hoeger-pollich@yahoo.com",
+    "type": "field-retrieval",
+    "dataset": "nested"
+  },
+  {
+    "id": "q51",
+    "prompt": "What is the order date for order ORD-0006?",
+    "groundTruth": "2025-09-15",
+    "type": "field-retrieval",
+    "dataset": "nested"
+  },
+  {
+    "id": "q52",
+    "prompt": "How many items are in order ORD-0008?",
+    "groundTruth": "3",
+    "type": "field-retrieval",
+    "dataset": "nested"
+  },
+  {
+    "id": "q53",
+    "prompt": "What is the customer name for order ORD-0010?",
+    "groundTruth": "Patty Senger",
+    "type": "field-retrieval",
+    "dataset": "nested"
+  },
+  {
+    "id": "q54",
+    "prompt": "What is the customer email for order ORD-0012?",
+    "groundTruth": "viva.paucek@gmail.com",
+    "type": "field-retrieval",
+    "dataset": "nested"
+  },
+  {
+    "id": "q55",
+    "prompt": "What is the order date for order ORD-0014?",
+    "groundTruth": "2025-09-20",
+    "type": "field-retrieval",
+    "dataset": "nested"
+  },
+  {
+    "id": "q56",
+    "prompt": "How many items are in order ORD-0016?",
+    "groundTruth": "2",
+    "type": "field-retrieval",
+    "dataset": "nested"
+  },
+  {
+    "id": "q57",
+    "prompt": "What is the customer name for order ORD-0018?",
+    "groundTruth": "Dennis Wunsch",
+    "type": "field-retrieval",
+    "dataset": "nested"
+  },
+  {
+    "id": "q58",
+    "prompt": "What is the customer email for order ORD-0020?",
+    "groundTruth": "wilton.oconnell@yahoo.com",
+    "type": "field-retrieval",
+    "dataset": "nested"
+  },
+  {
+    "id": "q59",
+    "prompt": "How many orders have status \"pending\"?",
+    "groundTruth": "10",
+    "type": "aggregation",
+    "dataset": "nested"
+  },
+  {
+    "id": "q60",
+    "prompt": "How many orders have status \"processing\"?",
+    "groundTruth": "10",
+    "type": "aggregation",
+    "dataset": "nested"
+  },
+  {
+    "id": "q61",
+    "prompt": "How many orders have status \"shipped\"?",
+    "groundTruth": "10",
+    "type": "aggregation",
+    "dataset": "nested"
+  },
+  {
+    "id": "q62",
+    "prompt": "How many orders have status \"delivered\"?",
+    "groundTruth": "10",
+    "type": "aggregation",
+    "dataset": "nested"
+  },
+  {
+    "id": "q63",
+    "prompt": "How many orders have status \"cancelled\"?",
+    "groundTruth": "10",
+    "type": "aggregation",
+    "dataset": "nested"
+  },
+  {
+    "id": "q64",
+    "prompt": "What is the total revenue across all orders?",
+    "groundTruth": "34904.81",
+    "type": "aggregation",
+    "dataset": "nested"
+  },
+  {
+    "id": "q65",
+    "prompt": "What is the average order value?",
+    "groundTruth": "698.10",
+    "type": "aggregation",
+    "dataset": "nested"
+  },
+  {
+    "id": "q66",
+    "prompt": "How many orders are in the dataset?",
+    "groundTruth": "50",
+    "type": "aggregation",
+    "dataset": "nested"
+  },
+  {
+    "id": "q67",
+    "prompt": "What is the highest order total?",
+    "groundTruth": "2152.82",
+    "type": "aggregation",
+    "dataset": "nested"
+  },
+  {
+    "id": "q68",
+    "prompt": "How many orders have a total greater than 200?",
+    "groundTruth": "43",
+    "type": "aggregation",
+    "dataset": "nested"
+  },
+  {
+    "id": "q69",
+    "prompt": "How many orders have a total greater than 400?",
+    "groundTruth": "37",
+    "type": "aggregation",
+    "dataset": "nested"
+  },
+  {
+    "id": "q70",
+    "prompt": "How many orders have a total greater than 600?",
+    "groundTruth": "28",
+    "type": "aggregation",
+    "dataset": "nested"
+  },
+  {
+    "id": "q71",
+    "prompt": "How many orders have status \"pending\" and total greater than 300?",
+    "groundTruth": "8",
+    "type": "filtering",
+    "dataset": "nested"
+  },
+  {
+    "id": "q72",
+    "prompt": "How many orders have status \"processing\" and total greater than 300?",
+    "groundTruth": "6",
+    "type": "filtering",
+    "dataset": "nested"
+  },
+  {
+    "id": "q73",
+    "prompt": "How many orders have status \"shipped\" and total greater than 300?",
+    "groundTruth": "10",
+    "type": "filtering",
+    "dataset": "nested"
+  },
+  {
+    "id": "q74",
+    "prompt": "How many orders have status \"delivered\" and total greater than 300?",
+    "groundTruth": "9",
+    "type": "filtering",
+    "dataset": "nested"
+  },
+  {
+    "id": "q75",
+    "prompt": "How many orders have status \"cancelled\" and total greater than 300?",
+    "groundTruth": "8",
+    "type": "filtering",
+    "dataset": "nested"
+  },
+  {
+    "id": "q76",
+    "prompt": "How many orders have status \"pending\" and at least 3 items?",
+    "groundTruth": "3",
+    "type": "filtering",
+    "dataset": "nested"
+  },
+  {
+    "id": "q77",
+    "prompt": "How many orders have status \"processing\" and at least 3 items?",
+    "groundTruth": "3",
+    "type": "filtering",
+    "dataset": "nested"
+  },
+  {
+    "id": "q78",
+    "prompt": "How many orders have status \"shipped\" and at least 3 items?",
+    "groundTruth": "5",
+    "type": "filtering",
+    "dataset": "nested"
+  },
+  {
+    "id": "q79",
+    "prompt": "How many orders have a total greater than 300 and at least 3 items?",
+    "groundTruth": "20",
+    "type": "filtering",
+    "dataset": "nested"
+  },
+  {
+    "id": "q80",
+    "prompt": "How many orders have a total greater than 500 and at least 3 items?",
+    "groundTruth": "19",
+    "type": "filtering",
+    "dataset": "nested"
+  },
+  {
+    "id": "q81",
+    "prompt": "What are the views for 2025-01-01?",
+    "groundTruth": "4322",
+    "type": "field-retrieval",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q82",
+    "prompt": "What is the revenue for 2025-01-04?",
+    "groundTruth": "10432.04",
+    "type": "field-retrieval",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q83",
+    "prompt": "What is the bounce rate for 2025-01-07?",
+    "groundTruth": "0.53",
+    "type": "field-retrieval",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q84",
+    "prompt": "How many conversions were there on 2025-01-10?",
+    "groundTruth": "32",
+    "type": "field-retrieval",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q85",
+    "prompt": "What are the views for 2025-01-13?",
+    "groundTruth": "4096",
+    "type": "field-retrieval",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q86",
+    "prompt": "What is the revenue for 2025-01-16?",
+    "groundTruth": "4533.1",
+    "type": "field-retrieval",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q87",
+    "prompt": "What is the bounce rate for 2025-01-19?",
+    "groundTruth": "0.63",
+    "type": "field-retrieval",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q88",
+    "prompt": "How many conversions were there on 2025-01-22?",
+    "groundTruth": "25",
+    "type": "field-retrieval",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q89",
+    "prompt": "What are the views for 2025-01-25?",
+    "groundTruth": "4076",
+    "type": "field-retrieval",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q90",
+    "prompt": "How many days of data are in the dataset?",
+    "groundTruth": "60",
+    "type": "aggregation",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q91",
+    "prompt": "What is the total number of views across all dates?",
+    "groundTruth": "328320",
+    "type": "aggregation",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q92",
+    "prompt": "What is the total number of conversions across all dates?",
+    "groundTruth": "1791",
+    "type": "aggregation",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q93",
+    "prompt": "What is the total revenue across all dates?",
+    "groundTruth": "311695.88",
+    "type": "aggregation",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q94",
+    "prompt": "What is the average bounce rate?",
+    "groundTruth": "0.53",
+    "type": "aggregation",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q95",
+    "prompt": "How many days had more than 5000 views?",
+    "groundTruth": "33",
+    "type": "aggregation",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q96",
+    "prompt": "How many days had more than 7000 views?",
+    "groundTruth": "14",
+    "type": "aggregation",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q97",
+    "prompt": "How many days had more than 10 conversions?",
+    "groundTruth": "57",
+    "type": "aggregation",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q98",
+    "prompt": "How many days had more than 30 conversions?",
+    "groundTruth": "26",
+    "type": "aggregation",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q99",
+    "prompt": "How many days had more than 6000 views and more than 15 conversions?",
+    "groundTruth": "20",
+    "type": "filtering",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q100",
+    "prompt": "How many days had more than 7000 views and more than 15 conversions?",
+    "groundTruth": "14",
+    "type": "filtering",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q101",
+    "prompt": "How many days had revenue greater than 500 with views above 6000?",
+    "groundTruth": "22",
+    "type": "filtering",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q102",
+    "prompt": "How many days had revenue greater than 1000 with views above 6000?",
+    "groundTruth": "22",
+    "type": "filtering",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q103",
+    "prompt": "How many days had revenue greater than 1500 with views above 6000?",
+    "groundTruth": "22",
+    "type": "filtering",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q104",
+    "prompt": "How many days had revenue greater than 2000 with views above 6000?",
+    "groundTruth": "20",
+    "type": "filtering",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q105",
+    "prompt": "How many days had revenue greater than 2500 with views above 6000?",
+    "groundTruth": "18",
+    "type": "filtering",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q106",
+    "prompt": "How many days had more than 250 clicks and more than 15 conversions?",
+    "groundTruth": "32",
+    "type": "filtering",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q107",
+    "prompt": "How many days had more than 400 clicks and more than 15 conversions?",
+    "groundTruth": "9",
+    "type": "filtering",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q108",
+    "prompt": "How many days had revenue greater than 1000 with bounce rate below 0.5?",
+    "groundTruth": "22",
+    "type": "filtering",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q109",
+    "prompt": "How many days had revenue greater than 1500 with bounce rate below 0.5?",
+    "groundTruth": "22",
+    "type": "filtering",
+    "dataset": "analytics"
+  },
+  {
+    "id": "q110",
+    "prompt": "How many stars does undefined/freeCodeCamp have?",
+    "groundTruth": "430886",
+    "type": "field-retrieval",
+    "dataset": "github"
+  },
+  {
+    "id": "q111",
+    "prompt": "How many forks does undefined/system-design-primer have?",
+    "groundTruth": "52904",
+    "type": "field-retrieval",
+    "dataset": "github"
+  },
+  {
+    "id": "q112",
+    "prompt": "How many watchers does undefined/vue have?",
+    "groundTruth": "5786",
+    "type": "field-retrieval",
+    "dataset": "github"
+  },
+  {
+    "id": "q113",
+    "prompt": "What is the main branch of undefined/CS-Notes?",
+    "groundTruth": "master",
+    "type": "field-retrieval",
+    "dataset": "github"
+  },
+  {
+    "id": "q114",
+    "prompt": "How many stars does undefined/gitignore have?",
+    "groundTruth": "170327",
+    "type": "field-retrieval",
+    "dataset": "github"
+  },
+  {
+    "id": "q115",
+    "prompt": "How many forks does undefined/n8n have?",
+    "groundTruth": "48578",
+    "type": "field-retrieval",
+    "dataset": "github"
+  },
+  {
+    "id": "q116",
+    "prompt": "How many watchers does undefined/yt-dlp have?",
+    "groundTruth": "678",
+    "type": "field-retrieval",
+    "dataset": "github"
+  },
+  {
+    "id": "q117",
+    "prompt": "What is the main branch of undefined/PowerToys?",
+    "groundTruth": "main",
+    "type": "field-retrieval",
+    "dataset": "github"
+  },
+  {
+    "id": "q118",
+    "prompt": "How many stars does undefined/free-programming-books-zh_CN have?",
+    "groundTruth": "115543",
+    "type": "field-retrieval",
+    "dataset": "github"
+  },
+  {
+    "id": "q119",
+    "prompt": "How many forks does undefined/three.js have?",
+    "groundTruth": "36054",
+    "type": "field-retrieval",
+    "dataset": "github"
+  },
+  {
+    "id": "q120",
+    "prompt": "How many watchers does undefined/GitHub-Chinese-Top-Charts have?",
+    "groundTruth": "2607",
+    "type": "field-retrieval",
+    "dataset": "github"
+  },
+  {
+    "id": "q121",
+    "prompt": "How many repositories are in the dataset?",
+    "groundTruth": "100",
+    "type": "aggregation",
+    "dataset": "github"
+  },
+  {
+    "id": "q122",
+    "prompt": "What is the total number of stars across all repositories?",
+    "groundTruth": "15413563",
+    "type": "aggregation",
+    "dataset": "github"
+  },
+  {
+    "id": "q123",
+    "prompt": "What is the total number of forks across all repositories?",
+    "groundTruth": "2528243",
+    "type": "aggregation",
+    "dataset": "github"
+  },
+  {
+    "id": "q124",
+    "prompt": "What is the average number of stars per repository?",
+    "groundTruth": "154136",
+    "type": "aggregation",
+    "dataset": "github"
+  },
+  {
+    "id": "q125",
+    "prompt": "How many repositories use \"main\" as their default branch?",
+    "groundTruth": "41",
+    "type": "aggregation",
+    "dataset": "github"
+  },
+  {
+    "id": "q126",
+    "prompt": "How many repositories use \"master\" as their default branch?",
+    "groundTruth": "53",
+    "type": "aggregation",
+    "dataset": "github"
+  },
+  {
+    "id": "q127",
+    "prompt": "How many repositories have more than 100000 stars?",
+    "groundTruth": "77",
+    "type": "aggregation",
+    "dataset": "github"
+  },
+  {
+    "id": "q128",
+    "prompt": "How many repositories have more than 150000 stars?",
+    "groundTruth": "37",
+    "type": "aggregation",
+    "dataset": "github"
+  },
+  {
+    "id": "q129",
+    "prompt": "How many repositories have more than 200000 stars?",
+    "groundTruth": "16",
+    "type": "aggregation",
+    "dataset": "github"
+  },
+  {
+    "id": "q130",
+    "prompt": "How many repositories have more than 20000 forks?",
+    "groundTruth": "49",
+    "type": "aggregation",
+    "dataset": "github"
+  },
+  {
+    "id": "q131",
+    "prompt": "How many repositories have more than 35000 forks?",
+    "groundTruth": "23",
+    "type": "aggregation",
+    "dataset": "github"
+  },
+  {
+    "id": "q132",
+    "prompt": "How many repositories have more than 50000 forks?",
+    "groundTruth": "11",
+    "type": "aggregation",
+    "dataset": "github"
+  },
+  {
+    "id": "q133",
+    "prompt": "How many repositories have more than 5000 watchers?",
+    "groundTruth": "19",
+    "type": "aggregation",
+    "dataset": "github"
+  },
+  {
+    "id": "q134",
+    "prompt": "How many repositories have more than 8000 watchers?",
+    "groundTruth": "4",
+    "type": "aggregation",
+    "dataset": "github"
+  },
+  {
+    "id": "q135",
+    "prompt": "How many repositories have more than 75000 stars and more than 15000 forks?",
+    "groundTruth": "57",
+    "type": "filtering",
+    "dataset": "github"
+  },
+  {
+    "id": "q136",
+    "prompt": "How many repositories have more than 100000 stars and more than 20000 forks?",
+    "groundTruth": "43",
+    "type": "filtering",
+    "dataset": "github"
+  },
+  {
+    "id": "q137",
+    "prompt": "How many repositories have more than 150000 stars and more than 30000 forks?",
+    "groundTruth": "25",
+    "type": "filtering",
+    "dataset": "github"
+  },
+  {
+    "id": "q138",
+    "prompt": "How many repositories have more than 200000 stars and more than 45000 forks?",
+    "groundTruth": "6",
+    "type": "filtering",
+    "dataset": "github"
+  },
+  {
+    "id": "q139",
+    "prompt": "How many repositories have more than 100000 stars and more than 7000 watchers?",
+    "groundTruth": "6",
+    "type": "filtering",
+    "dataset": "github"
+  },
+  {
+    "id": "q140",
+    "prompt": "How many repositories have more than 150000 stars and more than 9000 watchers?",
+    "groundTruth": "1",
+    "type": "filtering",
+    "dataset": "github"
+  },
+  {
+    "id": "q141",
+    "prompt": "What is the level of the log at 2025-11-02T16:55:04.316Z?",
+    "groundTruth": "error",
+    "type": "field-retrieval",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q142",
+    "prompt": "What is the endpoint for the log at 2025-10-31T02:31:28.977Z?",
+    "groundTruth": "/api/users",
+    "type": "field-retrieval",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q143",
+    "prompt": "What is the status code for the log at 2025-11-01T23:56:56.929Z?",
+    "groundTruth": "424",
+    "type": "field-retrieval",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q144",
+    "prompt": "What is the response time for the log at 2025-11-03T12:14:31.017Z?",
+    "groundTruth": "2849",
+    "type": "field-retrieval",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q145",
+    "prompt": "What is the level of the log at 2025-11-01T22:06:30.814Z?",
+    "groundTruth": "info",
+    "type": "field-retrieval",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q146",
+    "prompt": "What is the endpoint for the log at 2025-11-06T05:48:07.260Z?",
+    "groundTruth": "/api/orders",
+    "type": "field-retrieval",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q147",
+    "prompt": "What is the status code for the log at 2025-11-05T23:46:00.144Z?",
+    "groundTruth": "435",
+    "type": "field-retrieval",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q148",
+    "prompt": "What is the response time for the log at 2025-10-31T23:56:23.022Z?",
+    "groundTruth": "408",
+    "type": "field-retrieval",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q149",
+    "prompt": "What is the level of the log at 2025-11-06T01:23:44.734Z?",
+    "groundTruth": "error",
+    "type": "field-retrieval",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q150",
+    "prompt": "What is the endpoint for the log at 2025-11-03T21:54:27.889Z?",
+    "groundTruth": "/api/users",
+    "type": "field-retrieval",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q151",
+    "prompt": "How many log entries are in the dataset?",
+    "groundTruth": "75",
+    "type": "aggregation",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q152",
+    "prompt": "What is the average response time across all logs?",
+    "groundTruth": "2453.41",
+    "type": "aggregation",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q153",
+    "prompt": "How many log entries have level \"error\"?",
+    "groundTruth": "29",
+    "type": "aggregation",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q154",
+    "prompt": "How many log entries have level \"warn\"?",
+    "groundTruth": "17",
+    "type": "aggregation",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q155",
+    "prompt": "How many log entries have level \"info\"?",
+    "groundTruth": "29",
+    "type": "aggregation",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q156",
+    "prompt": "How many log entries are for endpoint \"/api/products\"?",
+    "groundTruth": "11",
+    "type": "aggregation",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q157",
+    "prompt": "How many log entries are for endpoint \"/api/users\"?",
+    "groundTruth": "18",
+    "type": "aggregation",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q158",
+    "prompt": "How many log entries are for endpoint \"/api/auth\"?",
+    "groundTruth": "21",
+    "type": "aggregation",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q159",
+    "prompt": "How many log entries are for endpoint \"/api/orders\"?",
+    "groundTruth": "11",
+    "type": "aggregation",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q160",
+    "prompt": "How many log entries have a status code indicating an error (>= 400)?",
+    "groundTruth": "33",
+    "type": "aggregation",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q161",
+    "prompt": "How many log entries have a successful status code (200-299)?",
+    "groundTruth": "42",
+    "type": "aggregation",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q162",
+    "prompt": "How many log entries have a retryable error?",
+    "groundTruth": "25",
+    "type": "aggregation",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q163",
+    "prompt": "How many log entries have level \"error\" and status code >= 400?",
+    "groundTruth": "29",
+    "type": "filtering",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q164",
+    "prompt": "How many log entries have level \"warn\" and status code >= 400?",
+    "groundTruth": "4",
+    "type": "filtering",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q165",
+    "prompt": "How many log entries have level \"info\" and status code >= 400?",
+    "groundTruth": "0",
+    "type": "filtering",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q166",
+    "prompt": "How many log entries are for endpoint \"/api/products\" with status code >= 500?",
+    "groundTruth": "5",
+    "type": "filtering",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q167",
+    "prompt": "How many log entries are for endpoint \"/api/users\" with status code >= 500?",
+    "groundTruth": "2",
+    "type": "filtering",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q168",
+    "prompt": "How many log entries are for endpoint \"/api/auth\" with status code >= 500?",
+    "groundTruth": "3",
+    "type": "filtering",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q169",
+    "prompt": "How many log entries for endpoint \"/api/products\" have a retryable error?",
+    "groundTruth": "4",
+    "type": "filtering",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q170",
+    "prompt": "How many log entries for endpoint \"/api/users\" have a retryable error?",
+    "groundTruth": "5",
+    "type": "filtering",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q171",
+    "prompt": "How many log entries for endpoint \"/api/auth\" have a retryable error?",
+    "groundTruth": "7",
+    "type": "filtering",
+    "dataset": "event-logs"
+  },
+  {
+    "id": "q172",
+    "prompt": "What is the environment in the configuration?",
+    "groundTruth": "development",
+    "type": "field-retrieval",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q173",
+    "prompt": "What is the database host?",
+    "groundTruth": "guilty-cake.org",
+    "type": "field-retrieval",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q174",
+    "prompt": "What is the database port?",
+    "groundTruth": "5432",
+    "type": "field-retrieval",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q175",
+    "prompt": "What is the maximum connection pool size?",
+    "groundTruth": "37",
+    "type": "field-retrieval",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q176",
+    "prompt": "What is the session duration?",
+    "groundTruth": "86400",
+    "type": "field-retrieval",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q177",
+    "prompt": "What is the minimum connection pool size?",
+    "groundTruth": "2",
+    "type": "field-retrieval",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q178",
+    "prompt": "What is the connection pool idle timeout?",
+    "groundTruth": "30000",
+    "type": "field-retrieval",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q179",
+    "prompt": "What is the database name?",
+    "groundTruth": "real",
+    "type": "field-retrieval",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q180",
+    "prompt": "What is the session refresh threshold?",
+    "groundTruth": "3600",
+    "type": "field-retrieval",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q181",
+    "prompt": "What is the version in the configuration?",
+    "groundTruth": "6.8.3",
+    "type": "field-retrieval",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q182",
+    "prompt": "How many roles are defined in permissions?",
+    "groundTruth": "3",
+    "type": "aggregation",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q183",
+    "prompt": "How many groups are defined in permissions?",
+    "groundTruth": "2",
+    "type": "aggregation",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q184",
+    "prompt": "How many authentication providers are configured?",
+    "groundTruth": "2",
+    "type": "aggregation",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q185",
+    "prompt": "How many feature flags are defined?",
+    "groundTruth": "2",
+    "type": "aggregation",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q186",
+    "prompt": "How many database replicas are configured?",
+    "groundTruth": "3",
+    "type": "aggregation",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q187",
+    "prompt": "How many authentication providers include the \"admin\" scope?",
+    "groundTruth": "1",
+    "type": "aggregation",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q188",
+    "prompt": "How many feature flags are enabled?",
+    "groundTruth": "0",
+    "type": "aggregation",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q189",
+    "prompt": "How many permissions does the admin role have?",
+    "groundTruth": "5",
+    "type": "aggregation",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q190",
+    "prompt": "What is the total number of permissions across all roles?",
+    "groundTruth": "8",
+    "type": "aggregation",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q191",
+    "prompt": "How many distinct permissions are defined across all roles?",
+    "groundTruth": "5",
+    "type": "aggregation",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q192",
+    "prompt": "How many distinct scopes are defined across all authentication providers?",
+    "groundTruth": "3",
+    "type": "aggregation",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q193",
+    "prompt": "What is the total number of variants across all feature flags?",
+    "groundTruth": "3",
+    "type": "aggregation",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q194",
+    "prompt": "How many database replicas have a priority greater than 2?",
+    "groundTruth": "1",
+    "type": "aggregation",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q195",
+    "prompt": "How many feature flags have a rollout percentage greater than 50?",
+    "groundTruth": "0",
+    "type": "aggregation",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q196",
+    "prompt": "How many groups have more than one role assigned?",
+    "groundTruth": "1",
+    "type": "aggregation",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q197",
+    "prompt": "How many feature flags are enabled with rollout greater than 50%?",
+    "groundTruth": "0",
+    "type": "filtering",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q198",
+    "prompt": "How many groups have the admin role?",
+    "groundTruth": "1",
+    "type": "filtering",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q199",
+    "prompt": "How many database replicas have priority greater than 2 and port 5432?",
+    "groundTruth": "1",
+    "type": "filtering",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q200",
+    "prompt": "How many authentication providers have more than 2 scopes?",
+    "groundTruth": "1",
+    "type": "filtering",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q201",
+    "prompt": "How many roles have at least 5 permissions?",
+    "groundTruth": "1",
+    "type": "filtering",
+    "dataset": "nested-config"
+  },
+  {
+    "id": "q202",
+    "prompt": "How many feature flags are disabled with rollout less than 25%?",
+    "groundTruth": "2",
+    "type": "filtering",
+    "dataset": "nested-config"
+  }
+]
\ No newline at end of file
diff --git a/benchmarks/scripts/token-efficiency-benchmark.ts b/benchmarks/scripts/token-efficiency-benchmark.ts
index 1e36a13..48d3d0d 100644
--- a/benchmarks/scripts/token-efficiency-benchmark.ts
+++ b/benchmarks/scripts/token-efficiency-benchmark.ts
@@ -100,7 +100,7 @@ function generateTotalLines(
     const csvStr = baselineFormat.tokens.toLocaleString('en-US').padStart(TOKEN_PADDING)
     lines.push(`csv                   ${csvBar}   ${csvStr} tokens`)
 
-    const overheadPercent = ((totalToonTokens - baselineFormat.tokens) / totalToonTokens) * 100
+    const overheadPercent = ((totalToonTokens - baselineFormat.tokens) / baselineFormat.tokens) * 100
     const toonBar = createProgressBar(100, 100, PROGRESS_BAR_WIDTH, PROGRESS_BAR_CONFIG)
     const toonStr = totalToonTokens.toLocaleString('en-US').padStart(TOKEN_PADDING)
     lines.push(`toon                  ${toonBar}   ${toonStr} tokens   (+${overheadPercent.toFixed(1)}% vs CSV)`)
@@ -223,7 +223,7 @@ const flatCharts = flatOnlyDatasets
 
     // TOON line with overhead vs CSV
     const toonOverhead = toon.tokens - csv.tokens
-    const toonOverheadPercent = (toonOverhead / toon.tokens) * 100
+    const toonOverheadPercent = (toonOverhead / csv.tokens) * 100
     const toonBar = createProgressBar(100, 100, PROGRESS_BAR_WIDTH, PROGRESS_BAR_CONFIG)
     const toonStr = toon.tokens.toLocaleString('en-US')
     const toonVsCSV = toonOverheadPercent >= 0
diff --git a/benchmarks/src/constants.ts b/benchmarks/src/constants.ts
index 05daee3..ea1f2d1 100644
--- a/benchmarks/src/constants.ts
+++ b/benchmarks/src/constants.ts
@@ -101,10 +101,10 @@ export const QUESTION_THRESHOLDS = {
  */
 export const QUESTION_LIMITS = {
   tabular: {
-    fieldRetrieval: 20,
-    aggregationDepartments: 6,
-    filteringMultiConditionDepartments: 6,
-    filteringExperience: 4,
+    fieldRetrieval: 14,
+    aggregationDepartments: 4,
+    filteringMultiConditionDepartments: 5,
+    filteringExperience: 3,
     filteringDepartmentExp: 3,
     filteringDepartmentActive: 3,
   },
@@ -116,7 +116,7 @@ export const QUESTION_LIMITS = {
     filteringStatusAndItems: 3,
   },
   analytics: {
-    fieldRetrievalDates: 13,
+    fieldRetrievalDates: 9,
   },
   github: {
     fieldRetrievalRepos: 11,
@@ -125,12 +125,12 @@ export const QUESTION_LIMITS = {
   },
   eventLogs: {
     fieldRetrieval: 10,
-    aggregationEndpoints: 3,
-    filteringLevelAndStatus: 2,
-    filteringEndpointAndStatus: 2,
+    aggregationEndpoints: 4,
+    filteringLevelAndStatus: 3,
+    filteringEndpointAndStatus: 3,
   },
   nestedConfig: {
-    fieldRetrieval: 5,
-    filteringComplex: 2,
+    fieldRetrieval: 10,
+    filteringComplex: 6,
   },
 } as const
diff --git a/benchmarks/src/datasets.ts b/benchmarks/src/datasets.ts
index e763856..fe7e6fa 100644
--- a/benchmarks/src/datasets.ts
+++ b/benchmarks/src/datasets.ts
@@ -5,67 +5,6 @@ import githubRepos from '../data/github-repos.json' with { type: 'json' }
 // Seed for reproducibility
 faker.seed(12345)
 
-/**
- * Calculate the tabular eligibility percentage of a data structure
- *
- * @remarks
- * Recursively analyzes data to determine what percentage of arrays qualify
- * for TOON's tabular format (uniform objects with primitive values only).
- */
-export function calculateTabularEligibility(data: unknown): number {
-  let totalArrays = 0
-  let tabularArrays = 0
-
-  function isTabularArray(arr: unknown[]): boolean {
-    if (arr.length === 0)
-      return false
-
-    // Check if all elements are objects
-    if (!arr.every(item => typeof item === 'object' && item !== null && !Array.isArray(item)))
-      return false
-
-    // Get keys from first object
-    const firstKeys = Object.keys(arr[0] as Record<string, unknown>)
-    if (firstKeys.length === 0)
-      return false
-
-    // Check if all objects have the same keys and only primitive values
-    return arr.every((item) => {
-      const itemObj = item as Record<string, unknown>
-      const itemKeys = Object.keys(itemObj)
-      if (itemKeys.length !== firstKeys.length)
-        return false
-      if (!firstKeys.every(key => itemKeys.includes(key)))
-        return false
-
-      // Check if all values are primitives (no nested objects or arrays)
-      return firstKeys.every((key) => {
-        const value = itemObj[key]
-        return value === null || ['string', 'number', 'boolean'].includes(typeof value)
-      })
-    })
-  }
-
-  function traverse(obj: unknown): void {
-    if (Array.isArray(obj)) {
-      totalArrays++
-      if (isTabularArray(obj))
-        tabularArrays++
-
-      // Continue traversing array elements
-      obj.forEach(item => traverse(item))
-    }
-    else if (typeof obj === 'object' && obj !== null) {
-      // Traverse object properties
-      Object.values(obj).forEach(value => traverse(value))
-    }
-  }
-
-  traverse(data)
-
-  return totalArrays === 0 ? 0 : Math.round((tabularArrays / totalArrays) * 100)
-}
-
 /**
  * Employee record structure for tabular dataset
  */
@@ -275,7 +214,7 @@ const tabularDataset: Dataset = {
   metadata: {
     supportsCSV: true,
     structureClass: 'uniform',
-    tabularEligibility: 100,
+    tabularEligibility: 100, // All arrays contain uniform objects with primitive values only
   },
 }
 
@@ -285,38 +224,21 @@ const tabularDataset: Dataset = {
 const PRODUCT_NAMES = ['Wireless Mouse', 'USB Cable', 'Laptop Stand', 'Keyboard', 'Webcam', 'Headphones', 'Monitor', 'Desk Lamp'] as const
 const ORDER_STATUSES = ['pending', 'processing', 'shipped', 'delivered', 'cancelled'] as const
 
-const ORDER_CONSTANTS = {
-  CUSTOMER_ID_MOD: 20,
-  MIN_ITEMS: 1,
-  MAX_ITEMS: 4,
-  MIN_ITEM_PRICE: 9.99,
-  MAX_ITEM_PRICE: 199.99,
-  MIN_ITEM_QUANTITY: 1,
-  MAX_ITEM_QUANTITY: 5,
-  SKU_LENGTH: 6,
-  ORDER_ID_PADDING: 4,
-  RECENT_DAYS: 90,
-  TAX_RATE: 0.08,
-} as const
-
 function generateOrders(count: number): { orders: Order[] } {
   return {
     orders: Array.from({ length: count }, (_, i) => {
-      const customerId = (i % ORDER_CONSTANTS.CUSTOMER_ID_MOD) + 1
-      const itemCount = faker.number.int({ min: ORDER_CONSTANTS.MIN_ITEMS, max: ORDER_CONSTANTS.MAX_ITEMS })
+      const customerId = (i % 20) + 1 // Rotate through 20 customers
+      const itemCount = faker.number.int({ min: 1, max: 4 }) // 1-4 items per order
 
       const items = Array.from({ length: itemCount }, (_, j) => {
         const price = faker.number.float({
-          min: ORDER_CONSTANTS.MIN_ITEM_PRICE,
-          max: ORDER_CONSTANTS.MAX_ITEM_PRICE,
+          min: 9.99,
+          max: 199.99,
           fractionDigits: 2,
         })
-        const quantity = faker.number.int({
-          min: ORDER_CONSTANTS.MIN_ITEM_QUANTITY,
-          max: ORDER_CONSTANTS.MAX_ITEM_QUANTITY,
-        })
+        const quantity = faker.number.int({ min: 1, max: 5 })
         return {
-          sku: `SKU-${faker.string.alphanumeric({ length: ORDER_CONSTANTS.SKU_LENGTH }).toUpperCase()}`,
+          sku: `SKU-${faker.string.alphanumeric({ length: 6 }).toUpperCase()}`,
           name: PRODUCT_NAMES[j % PRODUCT_NAMES.length]!,
           quantity,
           price,
@@ -324,11 +246,11 @@ function generateOrders(count: number): { orders: Order[] } {
       })
 
       const subtotal = Number(items.reduce((sum, item) => sum + (item.price * item.quantity), 0).toFixed(2))
-      const tax = Number((subtotal * ORDER_CONSTANTS.TAX_RATE).toFixed(2))
+      const tax = Number((subtotal * 0.08).toFixed(2)) // 8% tax rate
       const total = Number((subtotal + tax).toFixed(2))
 
       return {
-        orderId: `ORD-${String(i + 1).padStart(ORDER_CONSTANTS.ORDER_ID_PADDING, '0')}`,
+        orderId: `ORD-${String(i + 1).padStart(4, '0')}`,
         customer: {
           id: customerId,
           name: faker.person.fullName(),
@@ -340,7 +262,7 @@ function generateOrders(count: number): { orders: Order[] } {
         tax,
         total,
         status: ORDER_STATUSES[i % ORDER_STATUSES.length]!,
-        orderDate: faker.date.recent({ days: ORDER_CONSTANTS.RECENT_DAYS }).toISOString().split('T')[0],
+        orderDate: faker.date.recent({ days: 90 }).toISOString().split('T')[0],
       }
     }),
   }
@@ -359,7 +281,7 @@ const nestedDataset: Dataset = {
   metadata: {
     supportsCSV: false,
     structureClass: 'nested',
-    tabularEligibility: 33, // orders array is not tabular, but items arrays within are
+    tabularEligibility: 33, // Top-level orders array has nested objects (not tabular), but nested items arrays are tabular
   },
 }
 
@@ -376,7 +298,7 @@ const analyticsDataset: Dataset = {
   metadata: {
     supportsCSV: true,
     structureClass: 'uniform',
-    tabularEligibility: 100,
+    tabularEligibility: 100, // Uniform time-series records with consistent primitive fields
   },
 }
 
@@ -395,7 +317,7 @@ const githubDataset: Dataset = {
   metadata: {
     supportsCSV: true,
     structureClass: 'uniform',
-    tabularEligibility: 100,
+    tabularEligibility: 100, // Repository array contains uniform objects with primitive values
   },
 }
 
@@ -597,7 +519,7 @@ const eventLogsDataset: Dataset = {
   metadata: {
     supportsCSV: false,
     structureClass: 'semi-uniform',
-    tabularEligibility: 50, // ~50% of logs have nested error objects
+    tabularEligibility: 50, // ~50% of log entries carry an optional nested error object, so the logs array is not uniformly tabular
   },
 }
 
@@ -614,7 +536,7 @@ const nestedConfigDataset: Dataset = {
   metadata: {
     supportsCSV: false,
     structureClass: 'deep',
-    tabularEligibility: 0, // Highly nested, minimal tabular arrays
+    tabularEligibility: 0, // Deeply nested configuration with no tabular arrays
   },
 }
 
@@ -642,7 +564,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
     metadata: {
       supportsCSV: true,
       structureClass: 'uniform',
-      tabularEligibility: 100,
+      tabularEligibility: 100, // All arrays contain uniform objects with primitive values only
     },
   },
   // Nested: 500 orders
@@ -653,7 +575,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
     metadata: {
       supportsCSV: false,
       structureClass: 'nested',
-      tabularEligibility: 33,
+      tabularEligibility: 33, // Top-level orders array has nested objects (not tabular), but nested items arrays are tabular
     },
   },
   // Analytics: 365 days
@@ -664,7 +586,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
     metadata: {
       supportsCSV: true,
       structureClass: 'uniform',
-      tabularEligibility: 100,
+      tabularEligibility: 100, // Uniform time-series records with consistent primitive fields
     },
   },
   // GitHub: 100 repos (same as accuracy)
@@ -677,7 +599,7 @@ export const TOKEN_EFFICIENCY_DATASETS: Dataset[] = [
     metadata: {
       supportsCSV: false,
       structureClass: 'semi-uniform',
-      tabularEligibility: 50,
+      tabularEligibility: 50, // ~50% of log entries carry an optional nested error object, so the logs array is not uniformly tabular
     },
   },
   // Nested config: 1 config (same as accuracy)
diff --git a/benchmarks/src/evaluate.ts b/benchmarks/src/evaluate.ts
index ecf24e6..be1c3bf 100644
--- a/benchmarks/src/evaluate.ts
+++ b/benchmarks/src/evaluate.ts
@@ -4,7 +4,6 @@ import { anthropic } from '@ai-sdk/anthropic'
 import { google } from '@ai-sdk/google'
 import { openai } from '@ai-sdk/openai'
 import { xai } from '@ai-sdk/xai'
-import * as prompts from '@clack/prompts'
 import { generateText } from 'ai'
 
 /**
@@ -102,17 +101,10 @@ Is the actual answer correct? Consider:
 Respond with only "YES" or "NO".
 `.trim()
 
-  try {
-    const { text } = await generateText({
-      model: models.find(m => m.modelId === 'gpt-5-nano')!,
-      prompt,
-    })
+  const { text } = await generateText({
+    model: models.find(m => m.modelId === 'gpt-5-nano')!,
+    prompt,
+  })
 
-    return text.trim().toUpperCase() === 'YES'
-  }
-  catch (error) {
-    prompts.log.error(`Validation error: ${error}`)
-    // Fallback to simple string comparison
-    return actual.toLowerCase().trim() === expected.toLowerCase().trim()
-  }
+  return text.trim().toUpperCase() === 'YES'
 }
diff --git a/benchmarks/src/questions/analytics.ts b/benchmarks/src/questions/analytics.ts
index 4c58639..c08b263 100644
--- a/benchmarks/src/questions/analytics.ts
+++ b/benchmarks/src/questions/analytics.ts
@@ -1,7 +1,7 @@
 import type { AnalyticsMetric } from '../datasets'
 import type { Question } from '../types'
 import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
-import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
+import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
 
 /**
  * Generate analytics (website metrics) questions
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
 export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: () => string): Question[] {
   const questions: Question[] = []
 
-  if (metrics.length === 0)
-    return questions
-
   // Field retrieval: date-based metrics
   const metricFieldGenerators: Array<(metric: AnalyticsMetric, getId: () => string) => Question> = [
     (metric, getId) => new QuestionBuilder()
@@ -99,7 +96,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
 
   // Aggregation: high views/conversions
   for (const threshold of QUESTION_THRESHOLDS.analytics.views) {
-    const count = countByPredicate(metrics, m => m.views > threshold)
+    const count = metrics.filter(m => m.views > threshold).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -112,7 +109,7 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
   }
 
   for (const threshold of QUESTION_THRESHOLDS.analytics.conversions) {
-    const count = countByPredicate(metrics, m => m.conversions > threshold)
+    const count = metrics.filter(m => m.conversions > threshold).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -126,10 +123,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
 
   // Filtering: multi-condition (views AND revenue)
   for (const threshold of QUESTION_THRESHOLDS.analytics.viewsForFiltering) {
-    const count = countByPredicate(
-      metrics,
+    const count = metrics.filter(
       m => m.views > threshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForFiltering,
-    )
+    ).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -143,10 +139,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
 
   // Filtering: revenue thresholds
   for (const threshold of QUESTION_THRESHOLDS.analytics.revenueThresholds) {
-    const count = countByPredicate(
-      metrics,
+    const count = metrics.filter(
       m => m.revenue > threshold && m.views > QUESTION_THRESHOLDS.analytics.viewsThresholdForRevenue,
-    )
+    ).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -160,10 +155,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
 
   // Filtering: clicks and conversions
   for (const threshold of QUESTION_THRESHOLDS.analytics.clicksForFiltering) {
-    const count = countByPredicate(
-      metrics,
+    const count = metrics.filter(
       m => m.clicks > threshold && m.conversions > QUESTION_THRESHOLDS.analytics.conversionsForClickFiltering,
-    )
+    ).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -177,10 +171,9 @@ export function generateAnalyticsQuestions(metrics: AnalyticsMetric[], getId: ()
 
   // Filtering: revenue and bounce rate
   for (const threshold of QUESTION_THRESHOLDS.analytics.revenueForBounceRate) {
-    const count = countByPredicate(
-      metrics,
+    const count = metrics.filter(
       m => m.revenue > threshold && m.bounceRate < QUESTION_THRESHOLDS.analytics.bounceRateThreshold,
-    )
+    ).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
diff --git a/benchmarks/src/questions/event-logs.ts b/benchmarks/src/questions/event-logs.ts
index 3e4650a..2a35e52 100644
--- a/benchmarks/src/questions/event-logs.ts
+++ b/benchmarks/src/questions/event-logs.ts
@@ -1,7 +1,7 @@
 import type { EventLog } from '../datasets'
 import type { Question } from '../types'
 import { QUESTION_LIMITS } from '../constants'
-import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
+import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
 
 /**
  * Generate event log questions
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
 export function generateEventLogsQuestions(logs: EventLog[], getId: () => string): Question[] {
   const questions: Question[] = []
 
-  if (logs.length === 0)
-    return questions
-
   // Field retrieval: log metadata
   const logFieldGenerators: Array<(log: EventLog, getId: () => string) => Question> = [
     (log, getId) => new QuestionBuilder()
@@ -76,7 +73,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
   // Aggregation: by level
   const levels = [...new Set(logs.map(l => l.level))]
   for (const level of levels) {
-    const count = countByPredicate(logs, l => l.level === level)
+    const count = logs.filter(l => l.level === level).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -91,7 +88,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
   // Aggregation: by endpoint
   const endpoints = [...new Set(logs.map(l => l.endpoint))]
   for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.aggregationEndpoints)) {
-    const count = countByPredicate(logs, l => l.endpoint === endpoint)
+    const count = logs.filter(l => l.endpoint === endpoint).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -104,8 +101,8 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
   }
 
   // Aggregation: by status code range
-  const errorCount = countByPredicate(logs, l => l.statusCode >= 400)
-  const successCount = countByPredicate(logs, l => l.statusCode >= 200 && l.statusCode < 300)
+  const errorCount = logs.filter(l => l.statusCode >= 400).length
+  const successCount = logs.filter(l => l.statusCode >= 200 && l.statusCode < 300).length
 
   questions.push(
     new QuestionBuilder()
@@ -124,12 +121,21 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
       .build(),
   )
 
+  // Aggregation: retryable errors
+  const retryableErrorCount = logs.filter(l => l.error?.retryable === true).length
+  questions.push(
+    new QuestionBuilder()
+      .id(getId())
+      .prompt('How many log entries have a retryable error?')
+      .groundTruth(String(retryableErrorCount))
+      .type('aggregation')
+      .dataset('event-logs')
+      .build(),
+  )
+
   // Filtering: multi-condition (level AND status)
   for (const level of levels.slice(0, QUESTION_LIMITS.eventLogs.filteringLevelAndStatus)) {
-    const count = countByPredicate(
-      logs,
-      l => l.level === level && l.statusCode >= 400,
-    )
+    const count = logs.filter(l => l.level === level && l.statusCode >= 400).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -143,10 +149,7 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
 
   // Filtering: endpoint AND status
   for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointAndStatus)) {
-    const count = countByPredicate(
-      logs,
-      l => l.endpoint === endpoint && l.statusCode >= 500,
-    )
+    const count = logs.filter(l => l.endpoint === endpoint && l.statusCode >= 500).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -158,5 +161,19 @@ export function generateEventLogsQuestions(logs: EventLog[], getId: () => string
     )
   }
 
+  // Filtering: endpoint AND retryable error
+  for (const endpoint of endpoints.slice(0, QUESTION_LIMITS.eventLogs.filteringEndpointAndStatus)) {
+    const count = logs.filter(l => l.endpoint === endpoint && l.error?.retryable === true).length
+    questions.push(
+      new QuestionBuilder()
+        .id(getId())
+        .prompt(`How many log entries for endpoint "${endpoint}" have a retryable error?`)
+        .groundTruth(String(count))
+        .type('filtering')
+        .dataset('event-logs')
+        .build(),
+    )
+  }
+
   return questions
 }
diff --git a/benchmarks/src/questions/github.ts b/benchmarks/src/questions/github.ts
index f9b4bd3..38c378b 100644
--- a/benchmarks/src/questions/github.ts
+++ b/benchmarks/src/questions/github.ts
@@ -1,7 +1,7 @@
 import type { Repository } from '../datasets'
 import type { Question } from '../types'
 import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
-import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
+import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
 
 /**
  * Generate GitHub repository questions
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
 export function generateGithubQuestions(repos: Repository[], getId: () => string): Question[] {
   const questions: Question[] = []
 
-  if (repos.length === 0)
-    return questions
-
   // Field retrieval: repository metadata
   const repoFieldGenerators: Array<(repo: Repository, getId: () => string) => Question> = [
     (repo, getId) => new QuestionBuilder()
@@ -92,7 +89,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
   // Aggregation: by default branch
   const branches = [...new Set(repos.map(r => r.defaultBranch))]
   for (const branch of branches.slice(0, QUESTION_LIMITS.github.aggregationBranches)) {
-    const count = countByPredicate(repos, r => r.defaultBranch === branch)
+    const count = repos.filter(r => r.defaultBranch === branch).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -106,7 +103,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
 
   // Aggregation: high star counts
   for (const threshold of QUESTION_THRESHOLDS.github.stars) {
-    const count = countByPredicate(repos, r => r.stars > threshold)
+    const count = repos.filter(r => r.stars > threshold).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -120,7 +117,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
 
   // Aggregation: high fork counts
   for (const threshold of QUESTION_THRESHOLDS.github.forks) {
-    const count = countByPredicate(repos, r => r.forks > threshold)
+    const count = repos.filter(r => r.forks > threshold).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -134,7 +131,7 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
 
   // Aggregation: high watcher counts
   for (const threshold of QUESTION_THRESHOLDS.github.watchers) {
-    const count = countByPredicate(repos, r => r.watchers > threshold)
+    const count = repos.filter(r => r.watchers > threshold).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -148,10 +145,9 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
 
   // Filtering: multi-condition (stars AND forks)
   for (const combo of QUESTION_THRESHOLDS.github.starForkCombinations.slice(0, QUESTION_LIMITS.github.filteringStarsAndForks)) {
-    const count = countByPredicate(
-      repos,
+    const count = repos.filter(
       r => r.stars > combo.stars && r.forks > combo.forks,
-    )
+    ).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -165,10 +161,9 @@ export function generateGithubQuestions(repos: Repository[], getId: () => string
 
   // Filtering: stars AND watchers
   for (const combo of QUESTION_THRESHOLDS.github.starWatcherCombinations) {
-    const count = countByPredicate(
-      repos,
+    const count = repos.filter(
       r => r.stars > combo.stars && r.watchers > combo.watchers,
-    )
+    ).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
diff --git a/benchmarks/src/questions/index.ts b/benchmarks/src/questions/index.ts
index 9bac171..63f03c9 100644
--- a/benchmarks/src/questions/index.ts
+++ b/benchmarks/src/questions/index.ts
@@ -10,10 +10,9 @@ import { generateTabularQuestions } from './tabular'
 import { createIdGenerator } from './utils'
 
 /**
- * Generate all questions from datasets
+ * Generate ~200 questions from all datasets
  *
  * @remarks
- * Generates ~150-160 questions across different question types and datasets:
  * - Field Retrieval: Direct field access with no computation
  *   Examples: "What is X's salary?", "What is the status of order Y?"
  * - Aggregation: Counts, sums, averages, min/max operations (including single-condition filters)
diff --git a/benchmarks/src/questions/nested-config.ts b/benchmarks/src/questions/nested-config.ts
index 8ebc9f6..6f04dcd 100644
--- a/benchmarks/src/questions/nested-config.ts
+++ b/benchmarks/src/questions/nested-config.ts
@@ -34,6 +34,26 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
       prompt: 'What is the session duration?',
       groundTruth: String(config.authentication.session.duration),
     },
+    {
+      prompt: 'What is the minimum connection pool size?',
+      groundTruth: String(config.database.pool.min),
+    },
+    {
+      prompt: 'What is the connection pool idle timeout?',
+      groundTruth: String(config.database.pool.idleTimeout),
+    },
+    {
+      prompt: 'What is the database name?',
+      groundTruth: config.database.name,
+    },
+    {
+      prompt: 'What is the session refresh threshold?',
+      groundTruth: String(config.authentication.session.refreshThreshold),
+    },
+    {
+      prompt: 'What is the version in the configuration?',
+      groundTruth: config.version,
+    },
   ]
 
   for (const q of fieldRetrievalQuestions.slice(0, QUESTION_LIMITS.nestedConfig.fieldRetrieval)) {
@@ -93,6 +113,18 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
       .build(),
   )
 
+  // Aggregation: providers with admin scope
+  const adminScopeProviderCount = config.authentication.providers.filter(p => p.scopes.includes('admin')).length
+  questions.push(
+    new QuestionBuilder()
+      .id(getId())
+      .prompt('How many authentication providers include the "admin" scope?')
+      .groundTruth(String(adminScopeProviderCount))
+      .type('aggregation')
+      .dataset('nested-config')
+      .build(),
+  )
+
   // Aggregation: feature flag details
   const enabledFeatures = Object.entries(config.features).filter(([_, f]) => f.enabled).length
   questions.push(
@@ -117,6 +149,67 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
       .build(),
   )
 
+  // Aggregation: additional nested counts
+  const totalPermissions = Object.values(config.permissions.roles).reduce((sum, role) => sum + role.permissions.length, 0)
+  const distinctPermissions = new Set(Object.values(config.permissions.roles).flatMap(r => r.permissions)).size
+  const distinctScopes = new Set(config.authentication.providers.flatMap(p => p.scopes)).size
+  const totalVariants = Object.values(config.features).reduce((sum, f) => sum + f.variants.length, 0)
+  const highPriorityReplicas = config.database.replicas.filter(r => r.priority > 2).length
+  const featuresWithHighRollout = Object.values(config.features).filter(f => f.rollout > 50).length
+  const groupsWithMultipleRoles = Object.values(config.permissions.groups).filter(g => g.roles.length > 1).length
+
+  questions.push(
+    new QuestionBuilder()
+      .id(getId())
+      .prompt('What is the total number of permissions across all roles?')
+      .groundTruth(String(totalPermissions))
+      .type('aggregation')
+      .dataset('nested-config')
+      .build(),
+    new QuestionBuilder()
+      .id(getId())
+      .prompt('How many distinct permissions are defined across all roles?')
+      .groundTruth(String(distinctPermissions))
+      .type('aggregation')
+      .dataset('nested-config')
+      .build(),
+    new QuestionBuilder()
+      .id(getId())
+      .prompt('How many distinct scopes are defined across all authentication providers?')
+      .groundTruth(String(distinctScopes))
+      .type('aggregation')
+      .dataset('nested-config')
+      .build(),
+    new QuestionBuilder()
+      .id(getId())
+      .prompt('What is the total number of variants across all feature flags?')
+      .groundTruth(String(totalVariants))
+      .type('aggregation')
+      .dataset('nested-config')
+      .build(),
+    new QuestionBuilder()
+      .id(getId())
+      .prompt('How many database replicas have a priority greater than 2?')
+      .groundTruth(String(highPriorityReplicas))
+      .type('aggregation')
+      .dataset('nested-config')
+      .build(),
+    new QuestionBuilder()
+      .id(getId())
+      .prompt('How many feature flags have a rollout percentage greater than 50?')
+      .groundTruth(String(featuresWithHighRollout))
+      .type('aggregation')
+      .dataset('nested-config')
+      .build(),
+    new QuestionBuilder()
+      .id(getId())
+      .prompt('How many groups have more than one role assigned?')
+      .groundTruth(String(groupsWithMultipleRoles))
+      .type('aggregation')
+      .dataset('nested-config')
+      .build(),
+  )
+
   // Filtering: complex multi-condition queries
   const filteringQuestions = [
     {
@@ -129,6 +222,31 @@ export function generateNestedConfigQuestions(config: NestedConfig | undefined,
       groundTruth: String(Object.entries(config.permissions.groups)
         .filter(([_, g]) => g.roles.includes('admin')).length),
     },
+    {
+      prompt: 'How many database replicas have priority greater than 2 and port 5432?',
+      groundTruth: String(config.database.replicas
+        .filter(r => r.priority > 2 && r.port === 5432).length),
+    },
+    {
+      prompt: 'How many authentication providers have more than 2 scopes?',
+      groundTruth: String(config.authentication.providers
+        .filter(p => p.scopes.length > 2).length),
+    },
+    {
+      prompt: 'How many roles have at least 5 permissions?',
+      groundTruth: String(Object.values(config.permissions.roles)
+        .filter(r => r.permissions.length >= 5).length),
+    },
+    {
+      prompt: 'How many feature flags are disabled with rollout less than 25%?',
+      groundTruth: String(Object.values(config.features)
+        .filter(f => !f.enabled && f.rollout < 25).length),
+    },
+    {
+      prompt: 'How many enabled features have at least 2 variants?',
+      groundTruth: String(Object.values(config.features)
+        .filter(f => f.enabled && f.variants.length >= 2).length),
+    },
   ]
 
   for (const q of filteringQuestions.slice(0, QUESTION_LIMITS.nestedConfig.filteringComplex)) {
diff --git a/benchmarks/src/questions/nested.ts b/benchmarks/src/questions/nested.ts
index e54512b..de16c55 100644
--- a/benchmarks/src/questions/nested.ts
+++ b/benchmarks/src/questions/nested.ts
@@ -1,7 +1,7 @@
 import type { Order } from '../datasets'
 import type { Question } from '../types'
 import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
-import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
+import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
 
 /**
  * Generate nested (orders) questions
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
 export function generateNestedQuestions(orders: Order[], getId: () => string): Question[] {
   const questions: Question[] = []
 
-  if (orders.length === 0)
-    return questions
-
   // Field retrieval: order totals and statuses
   const orderFieldGenerators: Array<(order: Order, getId: () => string) => Question> = [
     (order, getId) => new QuestionBuilder()
@@ -89,7 +86,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
   // Count by status
   const statuses = [...new Set(orders.map(o => o.status))]
   for (const status of statuses.slice(0, QUESTION_LIMITS.nested.aggregationStatuses)) {
-    const count = countByPredicate(orders, o => o.status === status)
+    const count = orders.filter(o => o.status === status).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -134,7 +131,7 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
 
   // Aggregation: high-value orders (single-condition filter)
   for (const threshold of QUESTION_THRESHOLDS.nested.highValueOrders) {
-    const count = countByPredicate(orders, o => o.total > threshold)
+    const count = orders.filter(o => o.total > threshold).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -149,10 +146,9 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
   // Filtering: multi-condition queries (status AND value)
   const orderStatuses = [...new Set(orders.map(o => o.status))]
   for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndValue)) {
-    const count = countByPredicate(
-      orders,
+    const count = orders.filter(
       o => o.status === status && o.total > QUESTION_THRESHOLDS.nested.statusValueThreshold,
-    )
+    ).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -166,10 +162,9 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
 
   // Filtering: status AND items count (multi-condition)
   for (const status of orderStatuses.slice(0, QUESTION_LIMITS.nested.filteringStatusAndItems)) {
-    const count = countByPredicate(
-      orders,
+    const count = orders.filter(
       o => o.status === status && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold,
-    )
+    ).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -183,10 +178,9 @@ export function generateNestedQuestions(orders: Order[], getId: () => string): Q
 
   // Filtering: total AND items count (multi-condition)
   for (const threshold of QUESTION_THRESHOLDS.nested.totalThresholdsForItems) {
-    const count = countByPredicate(
-      orders,
+    const count = orders.filter(
       o => o.total > threshold && o.items.length >= QUESTION_THRESHOLDS.nested.itemCountThreshold,
-    )
+    ).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
diff --git a/benchmarks/src/questions/tabular.ts b/benchmarks/src/questions/tabular.ts
index 951bfdb..b9a5a01 100644
--- a/benchmarks/src/questions/tabular.ts
+++ b/benchmarks/src/questions/tabular.ts
@@ -1,7 +1,7 @@
 import type { Employee } from '../datasets'
 import type { Question } from '../types'
 import { QUESTION_LIMITS, QUESTION_THRESHOLDS } from '../constants'
-import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
+import { QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } from './utils'
 
 /**
  * Generate tabular (employee) questions
@@ -9,9 +9,6 @@ import { countByPredicate, QuestionBuilder, rotateQuestions, SAMPLE_STRIDES } fr
 export function generateTabularQuestions(employees: Employee[], getId: () => string): Question[] {
   const questions: Question[] = []
 
-  if (employees.length === 0)
-    return questions
-
   // Field retrieval: specific employees
   const fieldGenerators: Array<(emp: Employee, getId: () => string) => Question> = [
     (emp, getId) => new QuestionBuilder()
@@ -62,7 +59,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
   // Aggregation: count by department
   const departments = [...new Set(employees.map(e => e.department))]
   for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.aggregationDepartments)) {
-    const count = countByPredicate(employees, e => e.department === dept)
+    const count = employees.filter(e => e.department === dept).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -76,7 +73,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
 
   // Aggregation: salary ranges (single-condition filters)
   for (const threshold of QUESTION_THRESHOLDS.tabular.salaryRanges) {
-    const count = countByPredicate(employees, e => e.salary > threshold)
+    const count = employees.filter(e => e.salary > threshold).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -91,8 +88,8 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
   // Aggregation: totals and averages
   const totalEmployees = employees.length
   const avgSalary = Math.round(employees.reduce((sum, e) => sum + e.salary, 0) / totalEmployees)
-  const activeCount = countByPredicate(employees, e => e.active)
-  const inactiveCount = countByPredicate(employees, e => !e.active)
+  const activeCount = employees.filter(e => e.active).length
+  const inactiveCount = employees.filter(e => !e.active).length
 
   questions.push(
     new QuestionBuilder()
@@ -127,10 +124,9 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
 
   // Filtering: count by department with salary filter (multi-condition)
   for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringMultiConditionDepartments)) {
-    const count = countByPredicate(
-      employees,
+    const count = employees.filter(
       e => e.department === dept && e.salary > QUESTION_THRESHOLDS.tabular.departmentSalaryThreshold,
-    )
+    ).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -144,7 +140,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
 
   // Filtering: active employees by experience (multi-condition)
   for (const exp of QUESTION_THRESHOLDS.tabular.experienceYears.slice(0, QUESTION_LIMITS.tabular.filteringExperience)) {
-    const count = countByPredicate(employees, e => e.yearsExperience > exp && e.active)
+    const count = employees.filter(e => e.yearsExperience > exp && e.active).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -158,10 +154,9 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
 
   // Filtering: department by experience (multi-condition)
   for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentExp)) {
-    const count = countByPredicate(
-      employees,
+    const count = employees.filter(
       e => e.department === dept && e.yearsExperience > QUESTION_THRESHOLDS.tabular.departmentExperienceThreshold,
-    )
+    ).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
@@ -175,7 +170,7 @@ export function generateTabularQuestions(employees: Employee[], getId: () => str
 
   // Filtering: department by active status (multi-condition)
   for (const dept of departments.slice(0, QUESTION_LIMITS.tabular.filteringDepartmentActive)) {
-    const count = countByPredicate(employees, e => e.department === dept && e.active)
+    const count = employees.filter(e => e.department === dept && e.active).length
     questions.push(
       new QuestionBuilder()
         .id(getId())
diff --git a/benchmarks/src/questions/utils.ts b/benchmarks/src/questions/utils.ts
index 45c2c58..e3004e6 100644
--- a/benchmarks/src/questions/utils.ts
+++ b/benchmarks/src/questions/utils.ts
@@ -61,14 +61,7 @@ export class QuestionBuilder {
 }
 
 /**
- * Helper: Count items matching a predicate
- */
-export function countByPredicate<T>(items: T[], predicate: (item: T) => boolean): number {
-  return items.filter(predicate).length
-}
-
-/**
- * Helper: Rotate through question generators
+ * Rotate through question generators
  */
 export function rotateQuestions<T>(
   items: T[],
diff --git a/benchmarks/src/types.ts b/benchmarks/src/types.ts
index 5676920..e586cb5 100644
--- a/benchmarks/src/types.ts
+++ b/benchmarks/src/types.ts
@@ -15,7 +15,7 @@ export interface Question {
   id: string
   prompt: string
   groundTruth: string
-  type: 'field-retrieval' | 'aggregation' | 'filtering' | 'comparison'
+  type: 'field-retrieval' | 'aggregation' | 'filtering'
   dataset: string
 }