mirror of
https://github.com/voson-wang/toon.git
synced 2026-01-29 15:24:10 +08:00
test(accuracy): add Grok-4-fast, remove default temperature
This commit is contained in:
109
README.md
109
README.md
@@ -212,7 +212,7 @@ metrics[5]{date,views,clicks,conversions,revenue,bounceRate}:
|
||||
|
||||
### Retrieval Accuracy
|
||||
|
||||
Accuracy across **3 LLMs** on **154 data retrieval questions**:
|
||||
Accuracy across **4 LLMs** on 154 data retrieval questions:
|
||||
|
||||
```
|
||||
gpt-5-nano
|
||||
@@ -222,22 +222,29 @@ gpt-5-nano
|
||||
json ██████████████████░░ 87.7% (135/154)
|
||||
xml █████████████████░░░ 83.8% (129/154)
|
||||
|
||||
gemini-2.5-flash
|
||||
xml ██████████████████░░ 90.3% (139/154)
|
||||
csv ██████████████████░░ 89.0% (137/154)
|
||||
toon █████████████████░░░ 87.0% (134/154)
|
||||
json ████████████████░░░░ 79.2% (122/154)
|
||||
yaml ███████████████░░░░░ 76.0% (117/154)
|
||||
|
||||
claude-haiku-4-5-20251001
|
||||
json ██████████░░░░░░░░░░ 48.7% (75/154)
|
||||
yaml ██████████░░░░░░░░░░ 49.4% (76/154)
|
||||
toon ██████████░░░░░░░░░░ 48.1% (74/154)
|
||||
csv ██████████░░░░░░░░░░ 48.1% (74/154)
|
||||
json █████████░░░░░░░░░░░ 47.4% (73/154)
|
||||
xml █████████░░░░░░░░░░░ 46.8% (72/154)
|
||||
|
||||
gemini-2.5-flash
|
||||
csv ██████████████████░░ 87.7% (135/154)
|
||||
xml █████████████████░░░ 85.1% (131/154)
|
||||
toon █████████████████░░░ 83.8% (129/154)
|
||||
json ████████████████░░░░ 78.6% (121/154)
|
||||
yaml ███████████████░░░░░ 76.6% (118/154)
|
||||
|
||||
grok-4-fast-non-reasoning
|
||||
toon ██████████░░░░░░░░░░ 48.7% (75/154)
|
||||
json ██████████░░░░░░░░░░ 48.1% (74/154)
|
||||
xml █████████░░░░░░░░░░░ 47.4% (73/154)
|
||||
yaml █████████░░░░░░░░░░░ 47.4% (73/154)
|
||||
yaml █████████░░░░░░░░░░░ 46.8% (72/154)
|
||||
csv █████████░░░░░░░░░░░ 45.5% (70/154)
|
||||
```
|
||||
|
||||
**Advantage:** TOON achieves **77.1% accuracy** (vs JSON's 71.9%) while using **46.3% fewer tokens**.
|
||||
**Advantage:** TOON achieves **69.2% accuracy** (vs JSON's 65.4%) while using **46.3% fewer tokens**.
|
||||
|
||||
<details>
|
||||
<summary><strong>Performance by dataset and model</strong></summary>
|
||||
@@ -248,41 +255,41 @@ claude-haiku-4-5-20251001
|
||||
|
||||
| Format | Accuracy | Tokens | Correct/Total |
|
||||
| ------ | -------- | ------ | ------------- |
|
||||
| `csv` | 74.7% | 2,337 | 112/150 |
|
||||
| `toon` | 76.7% | 2,483 | 115/150 |
|
||||
| `yaml` | 70.7% | 4,969 | 106/150 |
|
||||
| `xml` | 77.3% | 7,314 | 116/150 |
|
||||
| `json` | 69.3% | 6,347 | 104/150 |
|
||||
| `csv` | 67.0% | 2,337 | 134/200 |
|
||||
| `toon` | 66.5% | 2,483 | 133/200 |
|
||||
| `yaml` | 65.5% | 4,969 | 131/200 |
|
||||
| `json` | 63.5% | 6,347 | 127/200 |
|
||||
| `xml` | 66.5% | 7,314 | 133/200 |
|
||||
|
||||
##### E-commerce orders with nested structures
|
||||
|
||||
| Format | Accuracy | Tokens | Correct/Total |
|
||||
| ------ | -------- | ------ | ------------- |
|
||||
| `toon` | 80.0% | 5,967 | 96/120 |
|
||||
| `csv` | 75.8% | 6,735 | 91/120 |
|
||||
| `yaml` | 74.2% | 7,328 | 89/120 |
|
||||
| `json` | 79.2% | 9,694 | 95/120 |
|
||||
| `xml` | 78.3% | 10,992 | 94/120 |
|
||||
| `toon` | 78.8% | 5,967 | 126/160 |
|
||||
| `csv` | 71.9% | 6,735 | 115/160 |
|
||||
| `yaml` | 71.9% | 7,328 | 115/160 |
|
||||
| `json` | 73.1% | 9,694 | 117/160 |
|
||||
| `xml` | 73.8% | 10,992 | 118/160 |
|
||||
|
||||
##### Time-series analytics data
|
||||
|
||||
| Format | Accuracy | Tokens | Correct/Total |
|
||||
| ------ | -------- | ------ | ------------- |
|
||||
| `csv` | 75.5% | 1,393 | 77/102 |
|
||||
| `toon` | 76.5% | 1,515 | 78/102 |
|
||||
| `yaml` | 74.5% | 2,938 | 76/102 |
|
||||
| `json` | 76.5% | 3,665 | 78/102 |
|
||||
| `xml` | 74.5% | 4,376 | 76/102 |
|
||||
| `csv` | 67.6% | 1,393 | 92/136 |
|
||||
| `toon` | 67.6% | 1,515 | 92/136 |
|
||||
| `yaml` | 64.7% | 2,938 | 88/136 |
|
||||
| `json` | 68.4% | 3,665 | 93/136 |
|
||||
| `xml` | 66.2% | 4,376 | 90/136 |
|
||||
|
||||
##### Top 100 GitHub repositories
|
||||
|
||||
| Format | Accuracy | Tokens | Correct/Total |
|
||||
| ------ | -------- | ------ | ------------- |
|
||||
| `toon` | 74.4% | 8,745 | 67/90 |
|
||||
| `csv` | 73.3% | 8,513 | 66/90 |
|
||||
| `yaml` | 62.2% | 13,129 | 56/90 |
|
||||
| `json` | 61.1% | 15,145 | 55/90 |
|
||||
| `xml` | 61.1% | 17,095 | 55/90 |
|
||||
| `csv` | 64.2% | 8,513 | 77/120 |
|
||||
| `toon` | 62.5% | 8,745 | 75/120 |
|
||||
| `yaml` | 57.5% | 13,129 | 69/120 |
|
||||
| `json` | 55.0% | 15,145 | 66/120 |
|
||||
| `xml` | 53.3% | 17,095 | 64/120 |
|
||||
|
||||
#### Performance by Model
|
||||
|
||||
@@ -296,24 +303,34 @@ claude-haiku-4-5-20251001
|
||||
| `json` | 87.7% | 135/154 |
|
||||
| `xml` | 83.8% | 129/154 |
|
||||
|
||||
##### gemini-2.5-flash
|
||||
|
||||
| Format | Accuracy | Correct/Total |
|
||||
| ------ | -------- | ------------- |
|
||||
| `xml` | 90.3% | 139/154 |
|
||||
| `csv` | 89.0% | 137/154 |
|
||||
| `toon` | 87.0% | 134/154 |
|
||||
| `json` | 79.2% | 122/154 |
|
||||
| `yaml` | 76.0% | 117/154 |
|
||||
|
||||
##### claude-haiku-4-5-20251001
|
||||
|
||||
| Format | Accuracy | Correct/Total |
|
||||
| ------ | -------- | ------------- |
|
||||
| `json` | 48.7% | 75/154 |
|
||||
| `yaml` | 49.4% | 76/154 |
|
||||
| `toon` | 48.1% | 74/154 |
|
||||
| `csv` | 48.1% | 74/154 |
|
||||
| `json` | 47.4% | 73/154 |
|
||||
| `xml` | 46.8% | 72/154 |
|
||||
|
||||
##### gemini-2.5-flash
|
||||
|
||||
| Format | Accuracy | Correct/Total |
|
||||
| ------ | -------- | ------------- |
|
||||
| `csv` | 87.7% | 135/154 |
|
||||
| `xml` | 85.1% | 131/154 |
|
||||
| `toon` | 83.8% | 129/154 |
|
||||
| `json` | 78.6% | 121/154 |
|
||||
| `yaml` | 76.6% | 118/154 |
|
||||
|
||||
##### grok-4-fast-non-reasoning
|
||||
|
||||
| Format | Accuracy | Correct/Total |
|
||||
| ------ | -------- | ------------- |
|
||||
| `toon` | 48.7% | 75/154 |
|
||||
| `json` | 48.1% | 74/154 |
|
||||
| `xml` | 47.4% | 73/154 |
|
||||
| `yaml` | 47.4% | 73/154 |
|
||||
| `yaml` | 46.8% | 72/154 |
|
||||
| `csv` | 45.5% | 70/154 |
|
||||
|
||||
</details>
|
||||
@@ -360,10 +377,10 @@ Four datasets designed to test different structural patterns:
|
||||
|
||||
#### Models & Configuration
|
||||
|
||||
- **Models tested**: `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `gpt-5-nano`
|
||||
- **Models tested**: `gpt-5-nano`, `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `grok-4-fast-non-reasoning`
|
||||
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
|
||||
- **Temperature**: 0 (for non-reasoning models)
|
||||
- **Total evaluations**: 154 questions × 5 formats × 3 models = 2,310 LLM calls
|
||||
- **Temperature**: Not set (models use their defaults)
|
||||
- **Total evaluations**: 154 questions × 5 formats × 4 models = 3,080 LLM calls
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
# Add keys for the models you're testing
|
||||
OPENAI_API_KEY=
|
||||
ANTHROPIC_API_KEY=
|
||||
GOOGLE_GENERATIVE_AI_API_KEY=
|
||||
XAI_API_KEY=
|
||||
|
||||
108
benchmarks/README.md
Normal file
108
benchmarks/README.md
Normal file
@@ -0,0 +1,108 @@
|
||||
# TOON Benchmarks
|
||||
|
||||
Benchmarks measuring TOON's **token efficiency** and **retrieval accuracy** compared to JSON, XML, YAML, and CSV.
|
||||
|
||||
> [!NOTE]
|
||||
> Results are automatically embedded in the [main README](../README.md#benchmarks). This guide focuses on running the benchmarks locally.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Run token efficiency benchmark
|
||||
pnpm benchmark:token-efficiency
|
||||
|
||||
# Run retrieval accuracy benchmark (requires API keys)
|
||||
pnpm benchmark:accuracy
|
||||
```
|
||||
|
||||
## Token Efficiency Benchmark
|
||||
|
||||
Measures token count reduction across JSON, XML, YAML, CSV, and TOON:
|
||||
|
||||
1. Generate datasets (GitHub repos, analytics, orders)
|
||||
2. Convert to all formats (TOON, JSON, XML, YAML, CSV)
|
||||
3. Tokenize using `gpt-tokenizer` (`o200k_base` encoding)
|
||||
4. Calculate savings and generate report
|
||||
|
||||
```bash
|
||||
pnpm benchmark:token-efficiency
|
||||
```
|
||||
|
||||
Results are saved to `results/token-efficiency.md`.
|
||||
|
||||
## Retrieval Accuracy Benchmark
|
||||
|
||||
Tests how well LLMs can answer questions about data in different formats (TOON, JSON, XML, YAML, CSV):
|
||||
|
||||
1. Generate 154 questions across 4 datasets
|
||||
2. Convert each dataset to all 5 formats
|
||||
3. Query each LLM with formatted data + question
|
||||
4. Validate answers using `gpt-5-nano` as judge
|
||||
5. Aggregate metrics and generate report
|
||||
|
||||
### Setup
|
||||
|
||||
1. Edit [`src/evaluate.ts`](./src/evaluate.ts) and add models to the `models` array:
|
||||
```ts
|
||||
export const models: LanguageModelV2[] = [
|
||||
openai('gpt-5-nano'),
|
||||
anthropic('claude-haiku-4-5-20251001'),
|
||||
google('gemini-2.5-flash'),
|
||||
xai('grok-4-fast-non-reasoning'),
|
||||
// Add your models here
|
||||
]
|
||||
```
|
||||
2. Duplicate `.env.example` to `.env` and add your API keys:
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
### Usage
|
||||
|
||||
```bash
|
||||
# Full benchmark
|
||||
pnpm benchmark:accuracy
|
||||
|
||||
# Dry run (10 questions only, for testing setup)
|
||||
DRY_RUN=true pnpm benchmark:accuracy
|
||||
```
|
||||
|
||||
Running the script will:
|
||||
|
||||
1. Prompt you to select which models to test.
|
||||
2. Skip models with existing results (rerun to overwrite).
|
||||
3. Show progress with rate limiting.
|
||||
4. Save results to `results/accuracy/models/{model-id}.json`.
|
||||
5. Generate report at `results/retrieval-accuracy.md`.
|
||||
|
||||
### Configuration
|
||||
|
||||
Edit [`src/constants.ts`](./src/constants.ts) to adjust:
|
||||
|
||||
- `MODEL_RPM_LIMITS` – Rate limits per model
|
||||
- `DEFAULT_CONCURRENCY` – Parallel tasks (default: 10)
|
||||
- `DRY_RUN_LIMITS` – Questions per dry run (default: 10)
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
scripts/
|
||||
├── accuracy-benchmark.ts # Retrieval accuracy benchmark
|
||||
├── token-efficiency-benchmark.ts # Token counting benchmark
|
||||
└── fetch-github-repos.ts # Update GitHub dataset
|
||||
src/
|
||||
├── constants.ts # Configuration
|
||||
├── datasets.ts # Test data generators
|
||||
├── evaluate.ts # LLM evaluation
|
||||
├── formatters.ts # Format converters
|
||||
├── questions.ts # Question generation
|
||||
├── report.ts # Markdown reports
|
||||
├── storage.ts # Result caching
|
||||
└── utils.ts # Helpers
|
||||
data/
|
||||
└── github-repos.json # Top 100 GitHub repos
|
||||
results/
|
||||
├── token-efficiency.md # Token savings report
|
||||
├── retrieval-accuracy.md # Accuracy report
|
||||
└── accuracy/models/ # Per-model results (JSON)
|
||||
```
|
||||
@@ -5,14 +5,14 @@
|
||||
"scripts": {
|
||||
"benchmark:token-efficiency": "tsx scripts/token-efficiency-benchmark.ts",
|
||||
"benchmark:accuracy": "tsx --env-file=.env scripts/accuracy-benchmark.ts",
|
||||
"fetch:github-repos": "tsx scripts/fetch-github-repos.ts",
|
||||
"test": "vitest"
|
||||
"fetch:github-repos": "tsx scripts/fetch-github-repos.ts"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@ai-sdk/anthropic": "^2.0.37",
|
||||
"@ai-sdk/google": "^2.0.23",
|
||||
"@ai-sdk/openai": "^2.0.53",
|
||||
"@ai-sdk/provider": "^2.0.0",
|
||||
"@ai-sdk/xai": "^2.0.28",
|
||||
"@antfu/eslint-config": "^6.1.0",
|
||||
"@clack/prompts": "^0.11.0",
|
||||
"@faker-js/faker": "^10.1.0",
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,6 +1,6 @@
|
||||
### Retrieval Accuracy
|
||||
|
||||
Accuracy across **3 LLMs** on **154 data retrieval questions**:
|
||||
Accuracy across **4 LLMs** on 154 data retrieval questions:
|
||||
|
||||
```
|
||||
gpt-5-nano
|
||||
@@ -10,22 +10,29 @@ gpt-5-nano
|
||||
json ██████████████████░░ 87.7% (135/154)
|
||||
xml █████████████████░░░ 83.8% (129/154)
|
||||
|
||||
gemini-2.5-flash
|
||||
xml ██████████████████░░ 90.3% (139/154)
|
||||
csv ██████████████████░░ 89.0% (137/154)
|
||||
toon █████████████████░░░ 87.0% (134/154)
|
||||
json ████████████████░░░░ 79.2% (122/154)
|
||||
yaml ███████████████░░░░░ 76.0% (117/154)
|
||||
|
||||
claude-haiku-4-5-20251001
|
||||
json ██████████░░░░░░░░░░ 48.7% (75/154)
|
||||
yaml ██████████░░░░░░░░░░ 49.4% (76/154)
|
||||
toon ██████████░░░░░░░░░░ 48.1% (74/154)
|
||||
csv ██████████░░░░░░░░░░ 48.1% (74/154)
|
||||
json █████████░░░░░░░░░░░ 47.4% (73/154)
|
||||
xml █████████░░░░░░░░░░░ 46.8% (72/154)
|
||||
|
||||
gemini-2.5-flash
|
||||
csv ██████████████████░░ 87.7% (135/154)
|
||||
xml █████████████████░░░ 85.1% (131/154)
|
||||
toon █████████████████░░░ 83.8% (129/154)
|
||||
json ████████████████░░░░ 78.6% (121/154)
|
||||
yaml ███████████████░░░░░ 76.6% (118/154)
|
||||
|
||||
grok-4-fast-non-reasoning
|
||||
toon ██████████░░░░░░░░░░ 48.7% (75/154)
|
||||
json ██████████░░░░░░░░░░ 48.1% (74/154)
|
||||
xml █████████░░░░░░░░░░░ 47.4% (73/154)
|
||||
yaml █████████░░░░░░░░░░░ 47.4% (73/154)
|
||||
yaml █████████░░░░░░░░░░░ 46.8% (72/154)
|
||||
csv █████████░░░░░░░░░░░ 45.5% (70/154)
|
||||
```
|
||||
|
||||
**Advantage:** TOON achieves **77.1% accuracy** (vs JSON's 71.9%) while using **46.3% fewer tokens**.
|
||||
**Advantage:** TOON achieves **69.2% accuracy** (vs JSON's 65.4%) while using **46.3% fewer tokens**.
|
||||
|
||||
<details>
|
||||
<summary><strong>Performance by dataset and model</strong></summary>
|
||||
@@ -36,41 +43,41 @@ claude-haiku-4-5-20251001
|
||||
|
||||
| Format | Accuracy | Tokens | Correct/Total |
|
||||
| ------ | -------- | ------ | ------------- |
|
||||
| `csv` | 74.7% | 2,337 | 112/150 |
|
||||
| `toon` | 76.7% | 2,483 | 115/150 |
|
||||
| `yaml` | 70.7% | 4,969 | 106/150 |
|
||||
| `xml` | 77.3% | 7,314 | 116/150 |
|
||||
| `json` | 69.3% | 6,347 | 104/150 |
|
||||
| `csv` | 67.0% | 2,337 | 134/200 |
|
||||
| `toon` | 66.5% | 2,483 | 133/200 |
|
||||
| `yaml` | 65.5% | 4,969 | 131/200 |
|
||||
| `json` | 63.5% | 6,347 | 127/200 |
|
||||
| `xml` | 66.5% | 7,314 | 133/200 |
|
||||
|
||||
##### E-commerce orders with nested structures
|
||||
|
||||
| Format | Accuracy | Tokens | Correct/Total |
|
||||
| ------ | -------- | ------ | ------------- |
|
||||
| `toon` | 80.0% | 5,967 | 96/120 |
|
||||
| `csv` | 75.8% | 6,735 | 91/120 |
|
||||
| `yaml` | 74.2% | 7,328 | 89/120 |
|
||||
| `json` | 79.2% | 9,694 | 95/120 |
|
||||
| `xml` | 78.3% | 10,992 | 94/120 |
|
||||
| `toon` | 78.8% | 5,967 | 126/160 |
|
||||
| `csv` | 71.9% | 6,735 | 115/160 |
|
||||
| `yaml` | 71.9% | 7,328 | 115/160 |
|
||||
| `json` | 73.1% | 9,694 | 117/160 |
|
||||
| `xml` | 73.8% | 10,992 | 118/160 |
|
||||
|
||||
##### Time-series analytics data
|
||||
|
||||
| Format | Accuracy | Tokens | Correct/Total |
|
||||
| ------ | -------- | ------ | ------------- |
|
||||
| `csv` | 75.5% | 1,393 | 77/102 |
|
||||
| `toon` | 76.5% | 1,515 | 78/102 |
|
||||
| `yaml` | 74.5% | 2,938 | 76/102 |
|
||||
| `json` | 76.5% | 3,665 | 78/102 |
|
||||
| `xml` | 74.5% | 4,376 | 76/102 |
|
||||
| `csv` | 67.6% | 1,393 | 92/136 |
|
||||
| `toon` | 67.6% | 1,515 | 92/136 |
|
||||
| `yaml` | 64.7% | 2,938 | 88/136 |
|
||||
| `json` | 68.4% | 3,665 | 93/136 |
|
||||
| `xml` | 66.2% | 4,376 | 90/136 |
|
||||
|
||||
##### Top 100 GitHub repositories
|
||||
|
||||
| Format | Accuracy | Tokens | Correct/Total |
|
||||
| ------ | -------- | ------ | ------------- |
|
||||
| `toon` | 74.4% | 8,745 | 67/90 |
|
||||
| `csv` | 73.3% | 8,513 | 66/90 |
|
||||
| `yaml` | 62.2% | 13,129 | 56/90 |
|
||||
| `json` | 61.1% | 15,145 | 55/90 |
|
||||
| `xml` | 61.1% | 17,095 | 55/90 |
|
||||
| `csv` | 64.2% | 8,513 | 77/120 |
|
||||
| `toon` | 62.5% | 8,745 | 75/120 |
|
||||
| `yaml` | 57.5% | 13,129 | 69/120 |
|
||||
| `json` | 55.0% | 15,145 | 66/120 |
|
||||
| `xml` | 53.3% | 17,095 | 64/120 |
|
||||
|
||||
#### Performance by Model
|
||||
|
||||
@@ -84,24 +91,34 @@ claude-haiku-4-5-20251001
|
||||
| `json` | 87.7% | 135/154 |
|
||||
| `xml` | 83.8% | 129/154 |
|
||||
|
||||
##### gemini-2.5-flash
|
||||
|
||||
| Format | Accuracy | Correct/Total |
|
||||
| ------ | -------- | ------------- |
|
||||
| `xml` | 90.3% | 139/154 |
|
||||
| `csv` | 89.0% | 137/154 |
|
||||
| `toon` | 87.0% | 134/154 |
|
||||
| `json` | 79.2% | 122/154 |
|
||||
| `yaml` | 76.0% | 117/154 |
|
||||
|
||||
##### claude-haiku-4-5-20251001
|
||||
|
||||
| Format | Accuracy | Correct/Total |
|
||||
| ------ | -------- | ------------- |
|
||||
| `json` | 48.7% | 75/154 |
|
||||
| `yaml` | 49.4% | 76/154 |
|
||||
| `toon` | 48.1% | 74/154 |
|
||||
| `csv` | 48.1% | 74/154 |
|
||||
| `json` | 47.4% | 73/154 |
|
||||
| `xml` | 46.8% | 72/154 |
|
||||
|
||||
##### gemini-2.5-flash
|
||||
|
||||
| Format | Accuracy | Correct/Total |
|
||||
| ------ | -------- | ------------- |
|
||||
| `csv` | 87.7% | 135/154 |
|
||||
| `xml` | 85.1% | 131/154 |
|
||||
| `toon` | 83.8% | 129/154 |
|
||||
| `json` | 78.6% | 121/154 |
|
||||
| `yaml` | 76.6% | 118/154 |
|
||||
|
||||
##### grok-4-fast-non-reasoning
|
||||
|
||||
| Format | Accuracy | Correct/Total |
|
||||
| ------ | -------- | ------------- |
|
||||
| `toon` | 48.7% | 75/154 |
|
||||
| `json` | 48.1% | 74/154 |
|
||||
| `xml` | 47.4% | 73/154 |
|
||||
| `yaml` | 47.4% | 73/154 |
|
||||
| `yaml` | 46.8% | 72/154 |
|
||||
| `csv` | 45.5% | 70/154 |
|
||||
|
||||
</details>
|
||||
@@ -148,9 +165,9 @@ Four datasets designed to test different structural patterns:
|
||||
|
||||
#### Models & Configuration
|
||||
|
||||
- **Models tested**: `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `gpt-5-nano`
|
||||
- **Models tested**: `gpt-5-nano`, `claude-haiku-4-5-20251001`, `gemini-2.5-flash`, `grok-4-fast-non-reasoning`
|
||||
- **Token counting**: Using `gpt-tokenizer` with `o200k_base` encoding (GPT-5 tokenizer)
|
||||
- **Temperature**: 0 (for non-reasoning models)
|
||||
- **Total evaluations**: 154 questions × 5 formats × 3 models = 2,310 LLM calls
|
||||
- **Temperature**: Not set (models use their defaults)
|
||||
- **Total evaluations**: 154 questions × 5 formats × 4 models = 3,080 LLM calls
|
||||
|
||||
</details>
|
||||
|
||||
@@ -84,8 +84,8 @@ for (const model of activeModels) {
|
||||
const rpmLimit = MODEL_RPM_LIMITS[modelId]
|
||||
const queue = new PQueue({
|
||||
concurrency: DEFAULT_CONCURRENCY,
|
||||
intervalCap: rpmLimit,
|
||||
interval: rpmLimit ? 60_000 : undefined,
|
||||
intervalCap: rpmLimit ?? Infinity,
|
||||
interval: rpmLimit ? 60_000 : 0,
|
||||
})
|
||||
|
||||
const evalSpinner = prompts.spinner()
|
||||
|
||||
@@ -15,6 +15,7 @@ export const MODEL_RPM_LIMITS: Record<string, number | undefined> = {
|
||||
'claude-haiku-4-5-20251001': 50,
|
||||
'gemini-2.5-flash': 25,
|
||||
'gpt-5-nano': undefined,
|
||||
'grok-4-fast-non-reasoning': 50,
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -3,6 +3,7 @@ import type { EvaluationResult, Question } from './types'
|
||||
import { anthropic } from '@ai-sdk/anthropic'
|
||||
import { google } from '@ai-sdk/google'
|
||||
import { openai } from '@ai-sdk/openai'
|
||||
import { xai } from '@ai-sdk/xai'
|
||||
import * as prompts from '@clack/prompts'
|
||||
import { generateText } from 'ai'
|
||||
|
||||
@@ -11,8 +12,9 @@ import { generateText } from 'ai'
|
||||
*/
|
||||
export const models: LanguageModelV2[] = [
|
||||
openai('gpt-5-nano'),
|
||||
google('gemini-2.5-flash'),
|
||||
anthropic('claude-haiku-4-5-20251001'),
|
||||
google('gemini-2.5-flash'),
|
||||
xai('grok-4-fast-non-reasoning'),
|
||||
]
|
||||
|
||||
/**
|
||||
@@ -45,16 +47,13 @@ Provide only the direct answer, without any additional explanation or formatting
|
||||
`.trim()
|
||||
|
||||
const startTime = performance.now()
|
||||
const { text, usage } = await generateText({
|
||||
model,
|
||||
prompt,
|
||||
temperature: !model.modelId.startsWith('gpt-5') ? 0 : undefined,
|
||||
})
|
||||
const { text, usage } = await generateText({ model, prompt })
|
||||
|
||||
const actual = text.trim()
|
||||
const latencyMs = performance.now() - startTime
|
||||
|
||||
const isCorrect = await validateAnswer({
|
||||
actual: text.trim(),
|
||||
actual,
|
||||
expected: question.groundTruth,
|
||||
question: question.prompt,
|
||||
})
|
||||
@@ -64,7 +63,7 @@ Provide only the direct answer, without any additional explanation or formatting
|
||||
format: formatName,
|
||||
model: model.modelId,
|
||||
expected: question.groundTruth,
|
||||
actual: text.trim(),
|
||||
actual,
|
||||
isCorrect,
|
||||
inputTokens: usage.inputTokens,
|
||||
outputTokens: usage.outputTokens,
|
||||
|
||||
@@ -3,6 +3,7 @@ import * as fsp from 'node:fs/promises'
|
||||
import * as path from 'node:path'
|
||||
import { BENCHMARKS_DIR } from './constants'
|
||||
import { datasets } from './datasets'
|
||||
import { models } from './evaluate'
|
||||
import { createProgressBar, ensureDir, tokenize } from './utils'
|
||||
|
||||
/**
|
||||
@@ -50,9 +51,8 @@ export function generateMarkdownReport(
|
||||
const toon = formatResults.find(r => r.format === 'toon')
|
||||
const json = formatResults.find(r => r.format === 'json')
|
||||
|
||||
// Build model-by-model breakdown with ASCII bars
|
||||
const modelNames = [...new Set(results.map(r => r.model))].reverse()
|
||||
const modelCount = modelNames.length
|
||||
const modelIds = models.map(m => m.modelId)
|
||||
const modelNames = modelIds.filter(id => results.some(r => r.model === id))
|
||||
|
||||
const modelBreakdown = modelNames.map((modelName, i) => {
|
||||
const modelResults = formatResults.map((fr) => {
|
||||
@@ -183,16 +183,14 @@ ${tableRows}
|
||||
const analyticsSize = datasets.find(d => d.name === 'analytics')?.data.metrics?.length || 0
|
||||
const githubSize = datasets.find(d => d.name === 'github')?.data.repositories?.length || 0
|
||||
|
||||
// Calculate number of formats and models
|
||||
// Calculate number of formats and evaluations
|
||||
const formatCount = formatResults.length
|
||||
const modelsUsed = [...new Set(results.map(r => r.model))]
|
||||
const modelsListStr = modelsUsed.map(m => `\`${m}\``).join(', ')
|
||||
const totalEvaluations = totalQuestions * formatCount * modelsUsed.length
|
||||
const totalEvaluations = totalQuestions * formatCount * modelNames.length
|
||||
|
||||
return `
|
||||
### Retrieval Accuracy
|
||||
|
||||
Accuracy across **${modelCount} ${modelCount === 1 ? 'LLM' : 'LLMs'}** on **${totalQuestions} data retrieval questions**:
|
||||
Accuracy across **${modelNames.length} ${modelNames.length === 1 ? 'LLM' : 'LLMs'}** on ${totalQuestions} data retrieval questions:
|
||||
|
||||
\`\`\`
|
||||
${modelBreakdown}
|
||||
@@ -253,10 +251,10 @@ ${totalQuestions} questions are generated dynamically across three categories:
|
||||
|
||||
#### Models & Configuration
|
||||
|
||||
- **Models tested**: ${modelsListStr}
|
||||
- **Models tested**: ${modelNames.map(m => `\`${m}\``).join(', ')}
|
||||
- **Token counting**: Using \`gpt-tokenizer\` with \`o200k_base\` encoding (GPT-5 tokenizer)
|
||||
- **Temperature**: 0 (for non-reasoning models)
|
||||
- **Total evaluations**: ${totalQuestions} questions × ${formatCount} formats × ${modelsUsed.length} models = ${totalEvaluations.toLocaleString('en-US')} LLM calls
|
||||
- **Temperature**: Not set (models use their defaults)
|
||||
- **Total evaluations**: ${totalQuestions} questions × ${formatCount} formats × ${modelNames.length} models = ${totalEvaluations.toLocaleString('en-US')} LLM calls
|
||||
|
||||
</details>
|
||||
`.trimStart()
|
||||
|
||||
41
pnpm-lock.yaml
generated
41
pnpm-lock.yaml
generated
@@ -53,6 +53,9 @@ importers:
|
||||
'@ai-sdk/provider':
|
||||
specifier: ^2.0.0
|
||||
version: 2.0.0
|
||||
'@ai-sdk/xai':
|
||||
specifier: ^2.0.28
|
||||
version: 2.0.28(zod@4.1.12)
|
||||
'@antfu/eslint-config':
|
||||
specifier: ^6.1.0
|
||||
version: 6.1.0(@vue/compiler-sfc@3.5.22)(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))
|
||||
@@ -110,6 +113,12 @@ packages:
|
||||
peerDependencies:
|
||||
zod: ^3.25.76 || ^4.1.8
|
||||
|
||||
'@ai-sdk/openai-compatible@1.0.23':
|
||||
resolution: {integrity: sha512-nCmdy8/LqaaUQhV4b6LcTFbmdGVy+aAwFCmQWHEyTAwaGfuqrWIwphhFVqqAZwqB+j8Ymy350IpKk1M5P5uuEw==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.25.76 || ^4.1.8
|
||||
|
||||
'@ai-sdk/openai@2.0.53':
|
||||
resolution: {integrity: sha512-GIkR3+Fyif516ftXv+YPSPstnAHhcZxNoR2s8uSHhQ1yBT7I7aQYTVwpjAuYoT3GR+TeP50q7onj2/nDRbT2FQ==}
|
||||
engines: {node: '>=18'}
|
||||
@@ -122,10 +131,22 @@ packages:
|
||||
peerDependencies:
|
||||
zod: ^3.25.76 || ^4.1.8
|
||||
|
||||
'@ai-sdk/provider-utils@3.0.13':
|
||||
resolution: {integrity: sha512-aXFLBLRPTUYA853MJliItefSXeJPl+mg0KSjbToP41kJ+banBmHO8ZPGLJhNqGlCU82o11TYN7G05EREKX8CkA==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.25.76 || ^4.1.8
|
||||
|
||||
'@ai-sdk/provider@2.0.0':
|
||||
resolution: {integrity: sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA==}
|
||||
engines: {node: '>=18'}
|
||||
|
||||
'@ai-sdk/xai@2.0.28':
|
||||
resolution: {integrity: sha512-iVrrX/A3YVzCltW7eO3rYB+QtyIFZYEIllWiE7oM48AO/GX2LJcCd2IBCFY820bu2ZXzlOtGty9JQEJIzP5Gug==}
|
||||
engines: {node: '>=18'}
|
||||
peerDependencies:
|
||||
zod: ^3.25.76 || ^4.1.8
|
||||
|
||||
'@antfu/eslint-config@6.1.0':
|
||||
resolution: {integrity: sha512-m/L9TGvtG3r4tkfq5BY6THz7pk0g6yuJwwA0SkLEDHJJpt0upuABhs8v3SU8yaPtCGUxq8k2QTLMZ3WPg4vSdw==}
|
||||
hasBin: true
|
||||
@@ -2479,6 +2500,12 @@ snapshots:
|
||||
'@ai-sdk/provider-utils': 3.0.12(zod@4.1.12)
|
||||
zod: 4.1.12
|
||||
|
||||
'@ai-sdk/openai-compatible@1.0.23(zod@4.1.12)':
|
||||
dependencies:
|
||||
'@ai-sdk/provider': 2.0.0
|
||||
'@ai-sdk/provider-utils': 3.0.13(zod@4.1.12)
|
||||
zod: 4.1.12
|
||||
|
||||
'@ai-sdk/openai@2.0.53(zod@4.1.12)':
|
||||
dependencies:
|
||||
'@ai-sdk/provider': 2.0.0
|
||||
@@ -2492,10 +2519,24 @@ snapshots:
|
||||
eventsource-parser: 3.0.6
|
||||
zod: 4.1.12
|
||||
|
||||
'@ai-sdk/provider-utils@3.0.13(zod@4.1.12)':
|
||||
dependencies:
|
||||
'@ai-sdk/provider': 2.0.0
|
||||
'@standard-schema/spec': 1.0.0
|
||||
eventsource-parser: 3.0.6
|
||||
zod: 4.1.12
|
||||
|
||||
'@ai-sdk/provider@2.0.0':
|
||||
dependencies:
|
||||
json-schema: 0.4.0
|
||||
|
||||
'@ai-sdk/xai@2.0.28(zod@4.1.12)':
|
||||
dependencies:
|
||||
'@ai-sdk/openai-compatible': 1.0.23(zod@4.1.12)
|
||||
'@ai-sdk/provider': 2.0.0
|
||||
'@ai-sdk/provider-utils': 3.0.13(zod@4.1.12)
|
||||
zod: 4.1.12
|
||||
|
||||
'@antfu/eslint-config@6.1.0(@vue/compiler-sfc@3.5.22)(eslint@9.38.0(jiti@2.6.1))(typescript@5.9.3)(vitest@4.0.3(@types/debug@4.1.12)(@types/node@24.9.1)(jiti@2.6.1)(tsx@4.20.6)(yaml@2.8.1))':
|
||||
dependencies:
|
||||
'@antfu/install-pkg': 1.1.0
|
||||
|
||||
Reference in New Issue
Block a user