feat(cli): stream output for both encoding and decoding

Author: Johann Schopplich
Date: 2025-11-21 16:52:34 +01:00
Parent: cfbbb09358
Commit: 9ebad53ea3
6 changed files with 486 additions and 24 deletions

@@ -104,20 +104,34 @@ cat data.toon | toon --decode
## Performance
-### Streaming Encoding
+### Streaming Output
-JSON→TOON conversions use line-by-line encoding internally, which avoids holding the entire TOON document in memory. This makes the CLI efficient for large datasets without requiring additional configuration.
+Both encoding and decoding operations use streaming output, writing incrementally without building the full output string in memory. This makes the CLI efficient for large datasets without requiring additional configuration.
+**JSON → TOON (Encode)**
+- Streams TOON lines to output
+- No full TOON string in memory
+**TOON → JSON (Decode)**
+- Streams JSON tokens to output
+- No full JSON string in memory
```bash
# Encode large JSON file with minimal memory usage
toon huge-dataset.json -o output.toon
+# Decode large TOON file with minimal memory usage
+toon huge-dataset.toon -o output.json
# Process millions of records efficiently via stdin
cat million-records.json | toon > output.toon
+cat million-records.toon | toon --decode > output.json
```
+Peak memory usage scales with data depth, not total size. This allows processing arbitrarily large files as long as individual nested structures fit in memory.
::: info Token Statistics
-When using the `--stats` flag, the CLI builds the full TOON string once to compute accurate token counts. For maximum memory efficiency on very large files, omit `--stats`.
+When using the `--stats` flag with encode, the CLI builds the full TOON string once to compute accurate token counts. For maximum memory efficiency on very large files, omit `--stats`.
:::
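Why depth rather than total size bounds memory: a recursive streaming serializer keeps one stack frame per nesting level and hands each chunk to the output as soon as it is produced, so the number of siblings at any level never accumulates. A minimal sketch of the principle (an editorial illustration, not the CLI's actual encoder):

```ts
// Streams nested arrays as JSON tokens. The only state held at any
// moment is the recursion stack: one frame per nesting level.
function* tokens(value: unknown): Iterable<string> {
  if (Array.isArray(value)) {
    yield '['
    for (let i = 0; i < value.length; i++) {
      if (i > 0)
        yield ','
      yield* tokens(value[i]) // recursion depth = nesting depth
    }
    yield ']'
  }
  else {
    yield JSON.stringify(value)
  }
}

// A flat array of a million numbers streams with constant extra memory:
// for (const t of tokens(million)) process.stdout.write(t)
```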
## Options

@@ -118,18 +118,27 @@ jq '.results' data.json | toon > filtered.toon
### Large Dataset Processing
-The CLI streams output line-by-line without building the full string in memory, making it suitable for processing large datasets:
+The CLI uses streaming output for both encoding and decoding, writing incrementally without building the full output string in memory:
```bash
# Encode large JSON file with minimal memory usage
toon huge-dataset.json -o output.toon
-# Process millions of records efficiently
+# Decode large TOON file with streaming JSON output
+toon huge-dataset.toon -o output.json
+# Process millions of records efficiently via stdin
cat million-records.json | toon > output.toon
+cat million-records.toon | toon --decode > output.json
```
+**Memory efficiency:**
+- **Encode (JSON → TOON)**: Streams TOON lines to output without full string in memory
+- **Decode (TOON → JSON)**: Streams JSON tokens to output without full string in memory
+- Peak memory usage scales with data depth, not total size
> [!NOTE]
-> When using `--stats`, the full output string is kept in memory for token counting. Omit `--stats` for maximum memory efficiency with very large datasets.
+> When using `--stats` with encode, the full output string is kept in memory for token counting. Omit `--stats` for maximum memory efficiency with very large datasets.
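The streaming encoder can also be consumed from code. A hedged sketch, assuming `encodeLines` yields TOON lines without trailing newlines (as the CLI's internal writer documents), with a placeholder import specifier:

```ts
import { createWriteStream } from 'node:fs'
import { encodeLines } from '@toon-format/toon' // placeholder import path

// Stream-encode to a file without materializing the TOON document.
const out = createWriteStream('output.toon', 'utf-8')
for (const line of encodeLines({ users: [{ id: 1, name: 'Alice' }] }))
  out.write(`${line}\n`) // re-append the newline each line omits
out.end()
```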
### Key Folding (Since v1.5)
@@ -206,7 +215,7 @@ toon data.json --key-folding safe --delimiter "\t" --stats -o output.toon
- **Pipeline integration** with existing JSON-based workflows
- **Flexible formatting** with delimiter and indentation options
- **Key folding** to collapse nested wrappers for additional token savings
-- **Memory-efficient streaming** for processing large datasets without loading everything into memory
+- **Memory-efficient streaming** for both encode and decode operations, processing large datasets without loading entire outputs into memory
## Related

@@ -7,6 +7,7 @@ import process from 'node:process'
import { consola } from 'consola'
import { estimateTokenCount } from 'tokenx'
import { decode, encode, encodeLines } from '../../toon/src'
+import { jsonStringifyLines } from './json-stringify-stream'
import { formatInputLabel, readInput } from './utils'
export async function encodeToToon(config: {
@@ -62,7 +63,6 @@ export async function encodeToToon(config: {
consola.success(`Saved ~${diff} tokens (-${percent}%)`)
}
else {
-// Use streaming encoder for memory-efficient output
await writeStreamingToon(encodeLines(data, encodeOptions), config.output)
if (config.output) {
@@ -95,25 +95,52 @@ export async function decodeToJson(config: {
throw new Error(`Failed to decode TOON: ${error instanceof Error ? error.message : String(error)}`)
}
-const jsonOutput = JSON.stringify(data, undefined, config.indent)
+await writeStreamingJson(jsonStringifyLines(data, config.indent), config.output)
if (config.output) {
-await fsp.writeFile(config.output, jsonOutput, 'utf-8')
const relativeInputPath = formatInputLabel(config.input)
const relativeOutputPath = path.relative(process.cwd(), config.output)
consola.success(`Decoded \`${relativeInputPath}\` → \`${relativeOutputPath}\``)
}
}
/**
* Writes JSON chunks to a file or stdout using a streaming approach.
* Chunks are written one at a time without building the full string in memory.
*/
async function writeStreamingJson(
chunks: Iterable<string>,
outputPath?: string,
): Promise<void> {
// Stream to file using the fs/promises API
if (outputPath) {
let fileHandle: FileHandle | undefined
try {
fileHandle = await fsp.open(outputPath, 'w')
for (const chunk of chunks) {
await fileHandle.write(chunk)
}
}
finally {
await fileHandle?.close()
}
}
// Stream to stdout
else {
-console.log(jsonOutput)
for (const chunk of chunks) {
process.stdout.write(chunk)
}
// Add final newline for stdout
process.stdout.write('\n')
}
}
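One editorial note on the stdout branch above: the boolean returned by `process.stdout.write()` (the backpressure signal) is ignored, which is usually harmless for a CLI but lets chunks accumulate in the stream's internal buffer when the consumer is slow. A backpressure-aware variant is a small change; this sketch is an assumption-level illustration, not part of this commit:

```ts
import { once } from 'node:events'
import process from 'node:process'

// Pause whenever write() reports a full internal buffer; resume on 'drain'.
async function writeChunksWithBackpressure(chunks: Iterable<string>): Promise<void> {
  for (const chunk of chunks) {
    if (!process.stdout.write(chunk))
      await once(process.stdout, 'drain')
  }
  process.stdout.write('\n')
}
```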
/**
* Writes TOON lines to a file or stdout using a streaming approach.
* Lines are written one at a time without building the full string in memory.
*
* @param lines - Iterable of TOON lines (without trailing newlines)
* @param outputPath - File path to write to, or undefined for stdout
*/
async function writeStreamingToon(
lines: Iterable<string>,

@@ -0,0 +1,161 @@
/**
* Streaming JSON stringifier.
*
* Yields JSON tokens one at a time, allowing streaming output without holding
* the entire JSON string in memory.
*
* @param value - The value to stringify (must be JSON-serializable)
* @param indent - Number of spaces for indentation (0 = compact, >0 = pretty)
* @returns Generator that yields JSON string chunks
*
* @example
* ```ts
* const data = { name: "Alice", scores: [95, 87, 92] }
* for (const chunk of jsonStringifyLines(data, 2)) {
* process.stdout.write(chunk)
* }
* ```
*/
export function* jsonStringifyLines(
value: unknown,
indent: number = 2,
): Iterable<string> {
yield* stringifyValue(value, 0, indent)
}
/**
* Internal generator for recursive stringification.
*/
function* stringifyValue(
value: unknown,
depth: number,
indent: number,
): Iterable<string> {
// Handle null
if (value === null) {
yield 'null'
return
}
const type = typeof value
// Handle primitives
if (type === 'boolean' || type === 'number') {
yield JSON.stringify(value)
return
}
if (type === 'string') {
yield JSON.stringify(value)
return
}
// Handle arrays
if (Array.isArray(value)) {
yield* stringifyArray(value, depth, indent)
return
}
// Handle objects
if (type === 'object') {
yield* stringifyObject(value as Record<string, unknown>, depth, indent)
return
}
// Everything else (undefined, functions, symbols) serializes as null, matching JSON.stringify for array elements
yield 'null'
}
/**
* Stringify an array with proper formatting.
*/
function* stringifyArray(
arr: unknown[],
depth: number,
indent: number,
): Iterable<string> {
if (arr.length === 0) {
yield '[]'
return
}
yield '['
if (indent > 0) {
// Pretty-printed format
for (let i = 0; i < arr.length; i++) {
yield '\n'
yield ' '.repeat((depth + 1) * indent)
yield* stringifyValue(arr[i], depth + 1, indent)
if (i < arr.length - 1) {
yield ','
}
}
yield '\n'
yield ' '.repeat(depth * indent)
yield ']'
}
else {
// Compact format
for (let i = 0; i < arr.length; i++) {
yield* stringifyValue(arr[i], depth + 1, indent)
if (i < arr.length - 1) {
yield ','
}
}
yield ']'
}
}
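To make the chunk granularity concrete, tracing the pretty-printed path for a two-element array gives the sequence below (derived by reading the generator above; `stringifyArray` would need to be exported for this to run):

```ts
const chunks = [...stringifyArray([1, 2], 0, 2)]
// '[', '\n', '  ', '1', ',', '\n', '  ', '2', '\n', '', ']'
console.assert(chunks.join('') === JSON.stringify([1, 2], null, 2))
```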
/**
* Stringify an object with proper formatting.
*/
function* stringifyObject(
obj: Record<string, unknown>,
depth: number,
indent: number,
): Iterable<string> {
// Skip entries JSON.stringify would omit: undefined, function, and symbol values
const keys = Object.keys(obj).filter((key) => {
const valueType = typeof obj[key]
return valueType !== 'undefined' && valueType !== 'function' && valueType !== 'symbol'
})
if (keys.length === 0) {
yield '{}'
return
}
yield '{'
if (indent > 0) {
// Pretty-printed format
for (let i = 0; i < keys.length; i++) {
const key = keys[i]!
const value = obj[key]
yield '\n'
yield ' '.repeat((depth + 1) * indent)
yield JSON.stringify(key)
yield ': '
yield* stringifyValue(value, depth + 1, indent)
if (i < keys.length - 1) {
yield ','
}
}
yield '\n'
yield ' '.repeat(depth * indent)
yield '}'
}
else {
// Compact format
for (let i = 0; i < keys.length; i++) {
const key = keys[i]!
const value = obj[key]
yield JSON.stringify(key)
yield ':'
yield* stringifyValue(value, depth + 1, indent)
if (i < keys.length - 1) {
yield ','
}
}
yield '}'
}
}
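Two limitations worth noting (editorial, inferred from the code above): unlike `JSON.stringify`, the generator never consults `toJSON()` and performs no cycle detection:

```ts
import { jsonStringifyLines } from './json-stringify-stream'

// toJSON() is not consulted: a Date serializes as an empty object,
// where JSON.stringify would produce its ISO-8601 string.
console.log([...jsonStringifyLines(new Date(), 0)].join('')) // {}

// No cycle detection: consuming this would recurse until the stack
// overflows (RangeError: Maximum call stack size exceeded).
const cyclic: Record<string, unknown> = {}
cyclic.self = cyclic
// [...jsonStringifyLines(cyclic, 0)]
```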

@@ -153,15 +153,18 @@ describe('toon CLI', () => {
const cleanup = mockStdin(toonInput)
-const stdout: string[] = []
-vi.spyOn(console, 'log').mockImplementation((message?: unknown) => {
-stdout.push(String(message ?? ''))
+const writeChunks: string[] = []
+vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
+writeChunks.push(String(chunk))
+return true
})
try {
await runCli({ rawArgs: ['--decode'] })
-expect(stdout).toHaveLength(1)
-const result = JSON.parse(stdout?.at(0) ?? '')
+const fullOutput = writeChunks.join('')
+// Remove trailing newline before parsing
+const jsonOutput = fullOutput.endsWith('\n') ? fullOutput.slice(0, -1) : fullOutput
+const result = JSON.parse(jsonOutput)
expect(result).toEqual(data)
}
finally {
@@ -279,16 +282,19 @@ describe('toon CLI', () => {
const toonInput = encode(data)
const cleanup = mockStdin(toonInput)
-const stdout: string[] = []
-vi.spyOn(console, 'log').mockImplementation((message?: unknown) => {
-stdout.push(String(message ?? ''))
+const writeChunks: string[] = []
+vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
+writeChunks.push(String(chunk))
+return true
})
try {
await runCli({ rawArgs: ['--decode', '--no-strict'] })
-expect(stdout).toHaveLength(1)
-const result = JSON.parse(stdout?.at(0) ?? '')
+const fullOutput = writeChunks.join('')
+// Remove trailing newline before parsing
+const jsonOutput = fullOutput.endsWith('\n') ? fullOutput.slice(0, -1) : fullOutput
+const result = JSON.parse(jsonOutput)
expect(result).toEqual(data)
}
finally {

@@ -0,0 +1,245 @@
import { describe, expect, it } from 'vitest'
import { jsonStringifyLines } from '../src/json-stringify-stream'
describe('jsonStringifyLines', () => {
describe('primitives', () => {
it('stringifies null', () => {
expect(join(jsonStringifyLines(null, 0))).toBe(JSON.stringify(null))
expect(join(jsonStringifyLines(null, 2))).toBe(JSON.stringify(null, null, 2))
})
it('stringifies booleans', () => {
expect(join(jsonStringifyLines(true, 0))).toBe(JSON.stringify(true))
expect(join(jsonStringifyLines(false, 0))).toBe(JSON.stringify(false))
expect(join(jsonStringifyLines(true, 2))).toBe(JSON.stringify(true, null, 2))
})
it('stringifies numbers', () => {
expect(join(jsonStringifyLines(0, 0))).toBe(JSON.stringify(0))
expect(join(jsonStringifyLines(42, 0))).toBe(JSON.stringify(42))
expect(join(jsonStringifyLines(-17, 0))).toBe(JSON.stringify(-17))
expect(join(jsonStringifyLines(3.14159, 0))).toBe(JSON.stringify(3.14159))
expect(join(jsonStringifyLines(1e10, 2))).toBe(JSON.stringify(1e10, null, 2))
})
it('stringifies strings', () => {
expect(join(jsonStringifyLines('', 0))).toBe(JSON.stringify(''))
expect(join(jsonStringifyLines('hello', 0))).toBe(JSON.stringify('hello'))
expect(join(jsonStringifyLines('with "quotes"', 0))).toBe(JSON.stringify('with "quotes"'))
expect(join(jsonStringifyLines('with\nnewlines', 2))).toBe(JSON.stringify('with\nnewlines', null, 2))
expect(join(jsonStringifyLines('with\ttabs', 0))).toBe(JSON.stringify('with\ttabs'))
})
it('converts undefined to null', () => {
expect(join(jsonStringifyLines(undefined, 0))).toBe('null')
expect(join(jsonStringifyLines(undefined, 2))).toBe('null')
})
})
describe('empty containers', () => {
it('stringifies empty arrays', () => {
expect(join(jsonStringifyLines([], 0))).toBe(JSON.stringify([], null, 0))
expect(join(jsonStringifyLines([], 2))).toBe(JSON.stringify([], null, 2))
})
it('stringifies empty objects', () => {
expect(join(jsonStringifyLines({}, 0))).toBe(JSON.stringify({}, null, 0))
expect(join(jsonStringifyLines({}, 2))).toBe(JSON.stringify({}, null, 2))
})
})
describe('arrays', () => {
it('stringifies arrays with compact formatting (indent=0)', () => {
const value = [1, 2, 3]
expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
})
it('stringifies arrays with pretty formatting (indent=2)', () => {
const value = [1, 2, 3]
expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
})
it('stringifies mixed-type arrays', () => {
const value = [1, 'two', true, null, { key: 'value' }]
expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
})
it('stringifies nested arrays', () => {
const value = [[1, 2], [3, 4], [5, 6]]
expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
})
it('stringifies deeply nested arrays', () => {
const value = [[[1]], [[2]], [[3]]]
expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
expect(join(jsonStringifyLines(value, 4))).toBe(JSON.stringify(value, null, 4))
})
})
describe('objects', () => {
it('stringifies simple objects with compact formatting', () => {
const value = { a: 1, b: 2, c: 3 }
expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
})
it('stringifies simple objects with pretty formatting', () => {
const value = { a: 1, b: 2, c: 3 }
expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
})
it('stringifies objects with mixed value types', () => {
const value = {
num: 42,
str: 'hello',
bool: true,
nil: null,
arr: [1, 2, 3],
}
expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
})
it('stringifies nested objects', () => {
const value = {
level1: {
level2: {
level3: 'deep',
},
},
}
expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
})
it('preserves key order', () => {
const value = { z: 1, a: 2, m: 3 }
expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
})
it('handles special characters in keys', () => {
const value = {
'normal-key': 1,
'key with spaces': 2,
'key:with:colons': 3,
'key"with"quotes': 4,
}
expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
})
})
describe('complex nested structures', () => {
it('stringifies objects containing arrays', () => {
const value = {
name: 'Alice',
scores: [95, 87, 92],
metadata: {
tags: ['math', 'science'],
},
}
expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
})
it('stringifies arrays of objects', () => {
const value = [
{ id: 1, name: 'Alice' },
{ id: 2, name: 'Bob' },
{ id: 3, name: 'Charlie' },
]
expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
})
it('stringifies deeply nested mixed structures', () => {
const value = {
users: [
{
name: 'Alice',
roles: ['admin', 'user'],
settings: {
theme: 'dark',
notifications: true,
},
},
{
name: 'Bob',
roles: ['user'],
settings: {
theme: 'light',
notifications: false,
},
},
],
count: 2,
}
expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
})
})
describe('indentation levels', () => {
const value = { a: [1, 2], b: { c: 3 } }
it('handles indent=0 (compact)', () => {
expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
})
it('handles indent=2', () => {
expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
})
it('handles indent=4', () => {
expect(join(jsonStringifyLines(value, 4))).toBe(JSON.stringify(value, null, 4))
})
it('handles indent=8', () => {
expect(join(jsonStringifyLines(value, 8))).toBe(JSON.stringify(value, null, 8))
})
})
describe('edge cases', () => {
it('handles arrays with undefined values (converted to null)', () => {
const value = [1, undefined, 3]
const expected = JSON.stringify(value, null, 2)
expect(join(jsonStringifyLines(value, 2))).toBe(expected)
})
it('handles single-element arrays', () => {
const value = [42]
expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
})
it('handles single-property objects', () => {
const value = { only: 'one' }
expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
})
it('handles objects with many properties', () => {
const value: Record<string, number> = {}
for (let i = 0; i < 100; i++) {
value[`key${i}`] = i
}
expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
})
it('handles large arrays', () => {
const value = Array.from({ length: 1000 }, (_, i) => i)
expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
})
})
})
/**
* Joins chunks from an iterable into a single string.
*/
function join(iter: Iterable<string>): string {
return Array.from(iter).join('')
}
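Since the suite's oracle is `JSON.stringify` itself, the whole contract can be restated as a single property. A hedged sketch using fast-check (not a dependency of this commit; API assumed from fast-check v3):

```ts
import fc from 'fast-check'
import { jsonStringifyLines } from '../src/json-stringify-stream'

// For any JSON value and indent level, the joined chunks must equal
// the JSON.stringify output exactly.
fc.assert(
  fc.property(fc.jsonValue(), fc.constantFrom(0, 2, 4), (value, indent) => {
    const joined = Array.from(jsonStringifyLines(value, indent)).join('')
    return joined === JSON.stringify(value, null, indent)
  }),
)
```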