mirror of
https://github.com/voson-wang/toon.git
synced 2026-01-29 15:24:10 +08:00
feat(cli): memory-efficient streaming for encoding
This commit is contained in:
@@ -1,8 +1,8 @@
|
||||
# @toon-format/cli
|
||||
|
||||
Command-line tool for converting between JSON and TOON formats.
|
||||
Command-line tool for converting JSON to TOON and back, with token analysis and streaming support.
|
||||
|
||||
[TOON (Token-Oriented Object Notation)](https://toonformat.dev) is a compact, human-readable serialization format designed for passing structured data to Large Language Models with significantly reduced token usage.
|
||||
[TOON (Token-Oriented Object Notation)](https://toonformat.dev) is a compact, human-readable encoding of the JSON data model that minimizes tokens for LLM input. The CLI lets you test conversions, analyze token savings, and integrate TOON into shell pipelines with stdin/stdout support—no code required.
|
||||
|
||||
## Installation
|
||||
|
||||
@@ -79,11 +79,12 @@ toon data.json --stats -o output.toon
|
||||
```
|
||||
|
||||
Example output:
|
||||
|
||||
```
|
||||
✓ Encoded to TOON
|
||||
Input: 15,145 tokens (JSON)
|
||||
Output: 8,745 tokens (TOON)
|
||||
Saved: 6,400 tokens (42.3% reduction)
|
||||
✔ Encoded data.json → output.toon
|
||||
|
||||
ℹ Token estimates: ~15,145 (JSON) → ~8,745 (TOON)
|
||||
✔ Saved ~6,400 tokens (-42.3%)
|
||||
```
|
||||
|
||||
### Alternative Delimiters
|
||||
@@ -115,6 +116,21 @@ cat large-dataset.json | toon --delimiter "\t" > output.toon
|
||||
jq '.results' data.json | toon > filtered.toon
|
||||
```
|
||||
|
||||
### Large Dataset Processing
|
||||
|
||||
The CLI streams output line-by-line without building the full string in memory, making it suitable for processing large datasets:
|
||||
|
||||
```bash
|
||||
# Encode large JSON file with minimal memory usage
|
||||
toon huge-dataset.json -o output.toon
|
||||
|
||||
# Process millions of records efficiently
|
||||
cat million-records.json | toon > output.toon
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> When using `--stats`, the full output string is kept in memory for token counting. Omit `--stats` for maximum memory efficiency with very large datasets.
|
||||
|
||||
### Key Folding (Since v1.5)
|
||||
|
||||
Collapse nested wrapper chains to reduce tokens:
|
||||
@@ -190,6 +206,7 @@ toon data.json --key-folding safe --delimiter "\t" --stats -o output.toon
|
||||
- **Pipeline integration** with existing JSON-based workflows
|
||||
- **Flexible formatting** with delimiter and indentation options
|
||||
- **Key folding** to collapse nested wrappers for additional token savings
|
||||
- **Memory-efficient streaming** that writes TOON output line-by-line instead of building the full output string in memory
|
||||
|
||||
## Related
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import type { FileHandle } from 'node:fs/promises'
|
||||
import type { DecodeOptions, EncodeOptions } from '../../toon/src'
|
||||
import type { InputSource } from './types'
|
||||
import * as fsp from 'node:fs/promises'
|
||||
@@ -34,38 +35,42 @@ export async function encodeToToon(config: {
|
||||
flattenDepth: config.flattenDepth,
|
||||
}
|
||||
|
||||
let toonOutput: string
|
||||
|
||||
// When printing stats, we need the full string for token counting
|
||||
if (config.printStats) {
|
||||
toonOutput = encode(data, encodeOptions)
|
||||
}
|
||||
else {
|
||||
// Use streaming encoder for non-stats path
|
||||
const lines = Array.from(encodeLines(data, encodeOptions))
|
||||
toonOutput = lines.join('\n')
|
||||
}
|
||||
const toonOutput = encode(data, encodeOptions)
|
||||
|
||||
if (config.output) {
|
||||
await fsp.writeFile(config.output, toonOutput, 'utf-8')
|
||||
const relativeInputPath = formatInputLabel(config.input)
|
||||
const relativeOutputPath = path.relative(process.cwd(), config.output)
|
||||
consola.success(`Encoded \`${relativeInputPath}\` → \`${relativeOutputPath}\``)
|
||||
}
|
||||
else {
|
||||
console.log(toonOutput)
|
||||
}
|
||||
if (config.output) {
|
||||
await fsp.writeFile(config.output, toonOutput, 'utf-8')
|
||||
}
|
||||
else {
|
||||
console.log(toonOutput)
|
||||
}
|
||||
|
||||
if (config.printStats) {
|
||||
const jsonTokens = estimateTokenCount(jsonContent)
|
||||
const toonTokens = estimateTokenCount(toonOutput)
|
||||
const diff = jsonTokens - toonTokens
|
||||
const percent = ((diff / jsonTokens) * 100).toFixed(1)
|
||||
|
||||
if (config.output) {
|
||||
const relativeInputPath = formatInputLabel(config.input)
|
||||
const relativeOutputPath = path.relative(process.cwd(), config.output)
|
||||
consola.success(`Encoded \`${relativeInputPath}\` → \`${relativeOutputPath}\``)
|
||||
}
|
||||
|
||||
console.log()
|
||||
consola.info(`Token estimates: ~${jsonTokens} (JSON) → ~${toonTokens} (TOON)`)
|
||||
consola.success(`Saved ~${diff} tokens (-${percent}%)`)
|
||||
}
|
||||
else {
|
||||
// Use streaming encoder for memory-efficient output
|
||||
await writeStreamingToon(encodeLines(data, encodeOptions), config.output)
|
||||
|
||||
if (config.output) {
|
||||
const relativeInputPath = formatInputLabel(config.input)
|
||||
const relativeOutputPath = path.relative(process.cwd(), config.output)
|
||||
consola.success(`Encoded \`${relativeInputPath}\` → \`${relativeOutputPath}\``)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export async function decodeToJson(config: {
|
||||
@@ -102,3 +107,50 @@ export async function decodeToJson(config: {
|
||||
console.log(jsonOutput)
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Chunk size threshold, in string characters. A chunk is flushed as soon as it
 * reaches this length, so a chunk may exceed it by at most one line.
 */
const STREAM_CHUNK_LENGTH = 64 * 1024

/**
 * Groups TOON lines into newline-joined chunks so the sinks issue one write per
 * chunk instead of two writes per line.
 *
 * Concatenating every yielded chunk gives exactly `lines.join('\n')`: the
 * separator is emitted before every line except the first, and no trailing
 * newline is added.
 *
 * @param lines - Iterable of TOON lines (without trailing newlines)
 * @param maxChunkLength - Length at which the pending chunk is flushed
 */
function* batchToonLines(lines: Iterable<string>, maxChunkLength: number): Generator<string> {
  let pending = ''
  let isFirst = true

  for (const line of lines) {
    pending += isFirst ? line : `\n${line}`
    isFirst = false

    if (pending.length >= maxChunkLength) {
      yield pending
      pending = ''
    }
  }

  if (pending.length > 0)
    yield pending
}

/**
 * Resolves when `stream` emits `drain`, or rejects if it emits `error` first.
 * Whichever listener fires removes the other one so no listeners accumulate.
 */
function waitForDrain(stream: NodeJS.WritableStream): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const onDrain = (): void => {
      stream.removeListener('error', onError)
      resolve()
    }
    const onError = (error: Error): void => {
      stream.removeListener('drain', onDrain)
      reject(error)
    }

    stream.once('drain', onDrain)
    stream.once('error', onError)
  })
}

/**
 * Writes TOON lines to a file or stdout using a streaming approach.
 * Lines are written in bounded chunks without building the full string in memory.
 *
 * - File output: the file is opened with flag `'w'`, receives the lines joined
 *   by `\n` with no trailing newline, and is always closed, even when a write
 *   or the line iterator throws.
 * - Stdout output: the same content is written, followed by one final `\n`.
 *   When `process.stdout.write` reports a full buffer, writing pauses until
 *   `drain` so a slow consumer does not force the whole output into memory.
 *
 * @param lines - Iterable of TOON lines (without trailing newlines)
 * @param outputPath - File path to write to, or undefined for stdout
 */
async function writeStreamingToon(
  lines: Iterable<string>,
  outputPath?: string,
): Promise<void> {
  const chunks = batchToonLines(lines, STREAM_CHUNK_LENGTH)

  // Stream to file using fs/promises API
  if (outputPath) {
    const fileHandle: FileHandle = await fsp.open(outputPath, 'w')

    try {
      for (const chunk of chunks)
        await fileHandle.write(chunk)
    }
    finally {
      await fileHandle.close()
    }

    return
  }

  // Stream to stdout, honoring backpressure.
  // Compare against `false` explicitly: only a real "buffer full" signal should
  // make us wait for `drain` (a stubbed `write` may return nothing at all).
  for (const chunk of chunks) {
    if (process.stdout.write(chunk) === false)
      await waitForDrain(process.stdout)
  }

  // Add final newline for stdout
  if (process.stdout.write('\n') === false)
    await waitForDrain(process.stdout)
}
|
||||
|
||||
@@ -33,15 +33,16 @@ describe('toon CLI', () => {
|
||||
}
|
||||
const cleanup = mockStdin(JSON.stringify(data))
|
||||
|
||||
const stdout: string[] = []
|
||||
vi.spyOn(console, 'log').mockImplementation((message?: unknown) => {
|
||||
stdout.push(String(message ?? ''))
|
||||
const writeChunks: string[] = []
|
||||
vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
|
||||
writeChunks.push(String(chunk))
|
||||
return true
|
||||
})
|
||||
|
||||
try {
|
||||
await runCli()
|
||||
expect(stdout).toHaveLength(1)
|
||||
expect(stdout[0]).toBe(encode(data))
|
||||
const fullOutput = writeChunks.join('')
|
||||
expect(fullOutput).toBe(`${encode(data)}\n`)
|
||||
}
|
||||
finally {
|
||||
cleanup()
|
||||
@@ -83,16 +84,17 @@ describe('toon CLI', () => {
|
||||
'input.json': JSON.stringify(data),
|
||||
})
|
||||
|
||||
const stdout: string[] = []
|
||||
vi.spyOn(console, 'log').mockImplementation((message?: unknown) => {
|
||||
stdout.push(String(message ?? ''))
|
||||
const writeChunks: string[] = []
|
||||
vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
|
||||
writeChunks.push(String(chunk))
|
||||
return true
|
||||
})
|
||||
|
||||
try {
|
||||
await context.run(['input.json'])
|
||||
|
||||
expect(stdout).toHaveLength(1)
|
||||
expect(stdout[0]).toBe(encode(data))
|
||||
const fullOutput = writeChunks.join('')
|
||||
expect(fullOutput).toBe(`${encode(data)}\n`)
|
||||
}
|
||||
finally {
|
||||
await context.cleanup()
|
||||
@@ -230,16 +232,17 @@ describe('toon CLI', () => {
|
||||
const data = { items: [1, 2, 3] }
|
||||
const cleanup = mockStdin(JSON.stringify(data))
|
||||
|
||||
const stdout: string[] = []
|
||||
vi.spyOn(console, 'log').mockImplementation((message?: unknown) => {
|
||||
stdout.push(String(message ?? ''))
|
||||
const writeChunks: string[] = []
|
||||
vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
|
||||
writeChunks.push(String(chunk))
|
||||
return true
|
||||
})
|
||||
|
||||
try {
|
||||
await runCli({ rawArgs: ['--delimiter', '|'] })
|
||||
|
||||
expect(stdout).toHaveLength(1)
|
||||
expect(stdout[0]).toBe(encode(data, { delimiter: '|' }))
|
||||
const fullOutput = writeChunks.join('')
|
||||
expect(fullOutput).toBe(`${encode(data, { delimiter: '|' })}\n`)
|
||||
}
|
||||
finally {
|
||||
cleanup()
|
||||
@@ -254,16 +257,17 @@ describe('toon CLI', () => {
|
||||
}
|
||||
const cleanup = mockStdin(JSON.stringify(data))
|
||||
|
||||
const stdout: string[] = []
|
||||
vi.spyOn(console, 'log').mockImplementation((message?: unknown) => {
|
||||
stdout.push(String(message ?? ''))
|
||||
const writeChunks: string[] = []
|
||||
vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
|
||||
writeChunks.push(String(chunk))
|
||||
return true
|
||||
})
|
||||
|
||||
try {
|
||||
await runCli({ rawArgs: ['--indent', '4'] })
|
||||
|
||||
expect(stdout).toHaveLength(1)
|
||||
expect(stdout[0]).toBe(encode(data, { indent: 4 }))
|
||||
const fullOutput = writeChunks.join('')
|
||||
expect(fullOutput).toBe(`${encode(data, { indent: 4 })}\n`)
|
||||
}
|
||||
finally {
|
||||
cleanup()
|
||||
@@ -293,6 +297,138 @@ describe('toon CLI', () => {
|
||||
})
|
||||
})
|
||||
|
||||
// Covers the streaming encode path: file output, stdout output, edge cases
// (empty / single-line documents) and the `--stats` path.
describe('streaming output', () => {
  it('streams large JSON to TOON file with identical output', async () => {
    // Fixture values are derived from the index (not Math.random()) so a
    // failing run can be reproduced with exactly the same data.
    const data = {
      items: Array.from({ length: 1000 }, (_, i) => ({
        id: i,
        name: `Item ${i}`,
        value: (i % 100) / 100,
      })),
    }

    const context = await createCliTestContext({
      'large-input.json': JSON.stringify(data, undefined, 2),
    })

    // Silence the success message while still recording that it was emitted
    const consolaSuccess = vi.spyOn(consola, 'success').mockImplementation(() => undefined)

    try {
      await context.run(['large-input.json', '--output', 'output.toon'])

      const output = await context.read('output.toon')
      // Verify streaming produces identical output to `encode()`
      const expected = encode(data, {
        delimiter: DEFAULT_DELIMITER,
        indent: 2,
      })

      expect(output).toBe(expected)
      expect(consolaSuccess).toHaveBeenCalledWith(expect.stringMatching(/Encoded .* → .*/))
    }
    finally {
      await context.cleanup()
    }
  })

  it('streams to stdout using process.stdout.write', async () => {
    const data = {
      users: [
        { id: 1, name: 'Alice' },
        { id: 2, name: 'Bob' },
      ],
    }

    const context = await createCliTestContext({
      'input.json': JSON.stringify(data),
    })

    // Capture every chunk written to stdout; chunk boundaries are not asserted,
    // only the concatenated result.
    const writeChunks: string[] = []
    const writeSpy = vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
      writeChunks.push(String(chunk))
      return true
    })

    try {
      await context.run(['input.json'])

      expect(writeSpy).toHaveBeenCalled()

      // Verify complete output matches `encode()`
      // (stdout output carries one trailing newline)
      const fullOutput = writeChunks.join('')
      const expected = `${encode(data)}\n`
      expect(fullOutput).toBe(expected)
    }
    finally {
      await context.cleanup()
    }
  })

  it('handles empty object streaming correctly', async () => {
    const data = {}

    const context = await createCliTestContext({
      'empty.json': JSON.stringify(data),
    })

    try {
      await context.run(['empty.json', '--output', 'output.toon'])

      // File output must match `encode()` exactly (no trailing newline added)
      const output = await context.read('output.toon')
      expect(output).toBe(encode(data))
    }
    finally {
      await context.cleanup()
    }
  })

  it('handles single-line output streaming correctly', async () => {
    const data = { key: 'value' }

    const context = await createCliTestContext({
      'single.json': JSON.stringify(data),
    })

    try {
      await context.run(['single.json', '--output', 'output.toon'])

      // File output must match `encode()` exactly
      const output = await context.read('output.toon')
      expect(output).toBe(encode(data))
    }
    finally {
      await context.cleanup()
    }
  })

  it('uses non-streaming path when stats are enabled', async () => {
    const data = {
      items: [
        { id: 1, value: 'test' },
        { id: 2, value: 'data' },
      ],
    }

    const context = await createCliTestContext({
      'input.json': JSON.stringify(data),
    })

    // With `--stats` the TOON text is expected via `console.log`
    const consoleLogSpy = vi.spyOn(console, 'log').mockImplementation(() => undefined)
    const consolaInfo = vi.spyOn(consola, 'info').mockImplementation(() => undefined)
    const consolaSuccess = vi.spyOn(consola, 'success').mockImplementation(() => undefined)

    try {
      await context.run(['input.json', '--stats'])

      expect(consolaInfo).toHaveBeenCalledWith(expect.stringMatching(/Token estimates:/))
      expect(consolaSuccess).toHaveBeenCalledWith(expect.stringMatching(/Saved.*tokens/))
      expect(consoleLogSpy).toHaveBeenCalledWith(encode(data))
    }
    finally {
      await context.cleanup()
    }
  })
})
|
||||
|
||||
describe('error handling', () => {
|
||||
it('rejects invalid delimiter', async () => {
|
||||
const context = await createCliTestContext({
|
||||
|
||||
Reference in New Issue
Block a user