feat(cli): stream output for both encoding and decoding

This commit is contained in:
Johann Schopplich
2025-11-21 16:52:34 +01:00
parent cfbbb09358
commit 9ebad53ea3
6 changed files with 486 additions and 24 deletions

View File

@@ -118,18 +118,27 @@ jq '.results' data.json | toon > filtered.toon
### Large Dataset Processing
The CLI streams output line-by-line without building the full string in memory, making it suitable for processing large datasets:
The CLI uses streaming output for both encoding and decoding, writing incrementally without building the full output string in memory:
```bash
# Encode large JSON file with minimal memory usage
toon huge-dataset.json -o output.toon
# Process millions of records efficiently
# Decode large TOON file with streaming JSON output
toon huge-dataset.toon -o output.json
# Process millions of records efficiently via stdin
cat million-records.json | toon > output.toon
cat million-records.toon | toon --decode > output.json
```
**Memory efficiency:**
- **Encode (JSON → TOON)**: Streams TOON lines to output without full string in memory
- **Decode (TOON → JSON)**: Streams JSON tokens to output without full string in memory
- Peak memory usage scales with data depth, not total size
> [!NOTE]
> When using `--stats`, the full output string is kept in memory for token counting. Omit `--stats` for maximum memory efficiency with very large datasets.
> When using `--stats` with encode, the full output string is kept in memory for token counting. Omit `--stats` for maximum memory efficiency with very large datasets.
### Key Folding (Since v1.5)
@@ -206,7 +215,7 @@ toon data.json --key-folding safe --delimiter "\t" --stats -o output.toon
- **Pipeline integration** with existing JSON-based workflows
- **Flexible formatting** with delimiter and indentation options
- **Key folding** to collapse nested wrappers for additional token savings
- **Memory-efficient streaming** for processing large datasets without loading everything into memory
- **Memory-efficient streaming** for both encode and decode operations - process large datasets without loading entire outputs into memory
## Related

View File

@@ -7,6 +7,7 @@ import process from 'node:process'
import { consola } from 'consola'
import { estimateTokenCount } from 'tokenx'
import { decode, encode, encodeLines } from '../../toon/src'
import { jsonStringifyLines } from './json-stringify-stream'
import { formatInputLabel, readInput } from './utils'
export async function encodeToToon(config: {
@@ -62,7 +63,6 @@ export async function encodeToToon(config: {
consola.success(`Saved ~${diff} tokens (-${percent}%)`)
}
else {
// Use streaming encoder for memory-efficient output
await writeStreamingToon(encodeLines(data, encodeOptions), config.output)
if (config.output) {
@@ -95,25 +95,52 @@ export async function decodeToJson(config: {
throw new Error(`Failed to decode TOON: ${error instanceof Error ? error.message : String(error)}`)
}
const jsonOutput = JSON.stringify(data, undefined, config.indent)
await writeStreamingJson(jsonStringifyLines(data, config.indent), config.output)
if (config.output) {
await fsp.writeFile(config.output, jsonOutput, 'utf-8')
const relativeInputPath = formatInputLabel(config.input)
const relativeOutputPath = path.relative(process.cwd(), config.output)
consola.success(`Decoded \`${relativeInputPath}\`\`${relativeOutputPath}\``)
}
}
/**
 * Writes JSON chunks to a file or stdout using a streaming approach.
 * Chunks are written one at a time without building the full string in memory.
 *
 * @param chunks - Iterable of JSON string chunks (written verbatim, no separators added)
 * @param outputPath - File path to write to, or undefined for stdout
 */
async function writeStreamingJson(
  chunks: Iterable<string>,
  outputPath?: string,
): Promise<void> {
  // Stream to file using fs/promises API
  if (outputPath) {
    let fileHandle: FileHandle | undefined
    try {
      fileHandle = await fsp.open(outputPath, 'w')
      for (const chunk of chunks) {
        await fileHandle.write(chunk)
      }
    }
    finally {
      // Always release the handle, even if open/write failed
      await fileHandle?.close()
    }
  }
  // Stream to stdout
  else {
    for (const chunk of chunks) {
      process.stdout.write(chunk)
    }
    // Add final newline for stdout so shells render the output cleanly
    process.stdout.write('\n')
  }
}
/**
* Writes TOON lines to a file or stdout using streaming approach.
* Lines are written one at a time without building the full string in memory.
*
* @param lines - Iterable of TOON lines (without trailing newlines)
* @param outputPath - File path to write to, or undefined for stdout
*/
async function writeStreamingToon(
lines: Iterable<string>,

View File

@@ -0,0 +1,161 @@
/**
 * Streaming JSON stringifier.
 *
 * Produces the JSON encoding of `value` as a sequence of small string
 * chunks, letting callers write output incrementally instead of holding
 * the complete JSON text in memory.
 *
 * @param value - The value to stringify (must be JSON-serializable)
 * @param indent - Spaces per indentation level (0 = compact, >0 = pretty)
 * @returns Generator that yields JSON string chunks
 *
 * @example
 * ```ts
 * const data = { name: "Alice", scores: [95, 87, 92] }
 * for (const chunk of jsonStringifyLines(data, 2)) {
 *   process.stdout.write(chunk)
 * }
 * ```
 */
export function* jsonStringifyLines(
  value: unknown,
  indent: number = 2,
): Iterable<string> {
  // Delegate to the recursive worker, starting at the root nesting depth.
  yield* stringifyValue(value, 0, indent)
}
/**
 * Internal recursive worker: emits the JSON encoding of a single value.
 */
function* stringifyValue(
  value: unknown,
  depth: number,
  indent: number,
): Iterable<string> {
  // null first — typeof null is 'object' and must not reach the object path
  if (value === null) {
    yield 'null'
    return
  }
  // Arrays also report typeof 'object', so detect them explicitly
  if (Array.isArray(value)) {
    yield* stringifyArray(value, depth, indent)
    return
  }
  switch (typeof value) {
    case 'boolean':
    case 'number':
    case 'string':
      // Primitives serialize exactly like JSON.stringify (incl. NaN -> null)
      yield JSON.stringify(value)
      return
    case 'object':
      yield* stringifyObject(value as Record<string, unknown>, depth, indent)
      return
    default:
      // undefined, functions, and symbols serialize as null here
      yield 'null'
  }
}
/**
 * Emits the JSON encoding of an array, compact or pretty-printed.
 */
function* stringifyArray(
  arr: unknown[],
  depth: number,
  indent: number,
): Iterable<string> {
  // Empty arrays have no newline/indent in either mode
  if (arr.length === 0) {
    yield '[]'
    return
  }
  const lastIndex = arr.length - 1
  yield '['
  if (indent > 0) {
    // Pretty-printed: each element on its own indented line
    const elementPad = ' '.repeat((depth + 1) * indent)
    for (const [index, element] of arr.entries()) {
      yield '\n'
      yield elementPad
      yield* stringifyValue(element, depth + 1, indent)
      if (index < lastIndex) {
        yield ','
      }
    }
    yield '\n'
    yield ' '.repeat(depth * indent)
    yield ']'
  }
  else {
    // Compact: elements separated by bare commas
    for (const [index, element] of arr.entries()) {
      yield* stringifyValue(element, depth + 1, indent)
      if (index < lastIndex) {
        yield ','
      }
    }
    yield ']'
  }
}
/**
 * Emits the JSON encoding of a plain object, compact or pretty-printed.
 *
 * Matches JSON.stringify semantics: properties whose values are undefined,
 * functions, or symbols are omitted entirely (array elements, by contrast,
 * serialize to null — handled in stringifyValue).
 *
 * @param obj - Object to serialize (string keys only; symbol keys are skipped by Object.keys)
 * @param depth - Current nesting depth, used for indentation
 * @param indent - Spaces per indentation level (0 = compact)
 */
function* stringifyObject(
  obj: Record<string, unknown>,
  depth: number,
  indent: number,
): Iterable<string> {
  // Drop keys JSON.stringify would omit, so comma placement stays correct
  const keys = Object.keys(obj).filter((key) => {
    const valueType = typeof obj[key]
    return valueType !== 'undefined' && valueType !== 'function' && valueType !== 'symbol'
  })
  if (keys.length === 0) {
    yield '{}'
    return
  }
  yield '{'
  if (indent > 0) {
    // Pretty-printed format
    for (let i = 0; i < keys.length; i++) {
      const key = keys[i]!
      const value = obj[key]
      yield '\n'
      yield ' '.repeat((depth + 1) * indent)
      yield JSON.stringify(key)
      yield ': '
      yield* stringifyValue(value, depth + 1, indent)
      if (i < keys.length - 1) {
        yield ','
      }
    }
    yield '\n'
    yield ' '.repeat(depth * indent)
    yield '}'
  }
  else {
    // Compact format
    for (let i = 0; i < keys.length; i++) {
      const key = keys[i]!
      const value = obj[key]
      yield JSON.stringify(key)
      yield ':'
      yield* stringifyValue(value, depth + 1, indent)
      if (i < keys.length - 1) {
        yield ','
      }
    }
    yield '}'
  }
}

View File

@@ -153,15 +153,18 @@ describe('toon CLI', () => {
const cleanup = mockStdin(toonInput)
const stdout: string[] = []
vi.spyOn(console, 'log').mockImplementation((message?: unknown) => {
stdout.push(String(message ?? ''))
const writeChunks: string[] = []
vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
writeChunks.push(String(chunk))
return true
})
try {
await runCli({ rawArgs: ['--decode'] })
expect(stdout).toHaveLength(1)
const result = JSON.parse(stdout?.at(0) ?? '')
const fullOutput = writeChunks.join('')
// Remove trailing newline before parsing
const jsonOutput = fullOutput.endsWith('\n') ? fullOutput.slice(0, -1) : fullOutput
const result = JSON.parse(jsonOutput)
expect(result).toEqual(data)
}
finally {
@@ -279,16 +282,19 @@ describe('toon CLI', () => {
const toonInput = encode(data)
const cleanup = mockStdin(toonInput)
const stdout: string[] = []
vi.spyOn(console, 'log').mockImplementation((message?: unknown) => {
stdout.push(String(message ?? ''))
const writeChunks: string[] = []
vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
writeChunks.push(String(chunk))
return true
})
try {
await runCli({ rawArgs: ['--decode', '--no-strict'] })
expect(stdout).toHaveLength(1)
const result = JSON.parse(stdout?.at(0) ?? '')
const fullOutput = writeChunks.join('')
// Remove trailing newline before parsing
const jsonOutput = fullOutput.endsWith('\n') ? fullOutput.slice(0, -1) : fullOutput
const result = JSON.parse(jsonOutput)
expect(result).toEqual(data)
}
finally {

View File

@@ -0,0 +1,245 @@
import { describe, expect, it } from 'vitest'
import { jsonStringifyLines } from '../src/json-stringify-stream'
// Test suite for the streaming JSON stringifier. Every case compares the
// joined chunk stream against native JSON.stringify output — the contract
// the generator must match for both compact (indent=0) and pretty output.
describe('jsonStringifyLines', () => {
  // Scalars must round-trip exactly as JSON.stringify renders them
  describe('primitives', () => {
    it('stringifies null', () => {
      expect(join(jsonStringifyLines(null, 0))).toBe(JSON.stringify(null))
      expect(join(jsonStringifyLines(null, 2))).toBe(JSON.stringify(null, null, 2))
    })
    it('stringifies booleans', () => {
      expect(join(jsonStringifyLines(true, 0))).toBe(JSON.stringify(true))
      expect(join(jsonStringifyLines(false, 0))).toBe(JSON.stringify(false))
      expect(join(jsonStringifyLines(true, 2))).toBe(JSON.stringify(true, null, 2))
    })
    it('stringifies numbers', () => {
      expect(join(jsonStringifyLines(0, 0))).toBe(JSON.stringify(0))
      expect(join(jsonStringifyLines(42, 0))).toBe(JSON.stringify(42))
      expect(join(jsonStringifyLines(-17, 0))).toBe(JSON.stringify(-17))
      expect(join(jsonStringifyLines(3.14159, 0))).toBe(JSON.stringify(3.14159))
      expect(join(jsonStringifyLines(1e10, 2))).toBe(JSON.stringify(1e10, null, 2))
    })
    it('stringifies strings', () => {
      // Escaping (quotes, newlines, tabs) is delegated to JSON.stringify
      expect(join(jsonStringifyLines('', 0))).toBe(JSON.stringify(''))
      expect(join(jsonStringifyLines('hello', 0))).toBe(JSON.stringify('hello'))
      expect(join(jsonStringifyLines('with "quotes"', 0))).toBe(JSON.stringify('with "quotes"'))
      expect(join(jsonStringifyLines('with\nnewlines', 2))).toBe(JSON.stringify('with\nnewlines', null, 2))
      expect(join(jsonStringifyLines('with\ttabs', 0))).toBe(JSON.stringify('with\ttabs'))
    })
    it('converts undefined to null', () => {
      // Note: native JSON.stringify(undefined) returns undefined; the
      // streaming encoder deliberately emits 'null' at the top level
      expect(join(jsonStringifyLines(undefined, 0))).toBe('null')
      expect(join(jsonStringifyLines(undefined, 2))).toBe('null')
    })
  })
  // Empty containers must not gain newlines/indentation in pretty mode
  describe('empty containers', () => {
    it('stringifies empty arrays', () => {
      expect(join(jsonStringifyLines([], 0))).toBe(JSON.stringify([], null, 0))
      expect(join(jsonStringifyLines([], 2))).toBe(JSON.stringify([], null, 2))
    })
    it('stringifies empty objects', () => {
      expect(join(jsonStringifyLines({}, 0))).toBe(JSON.stringify({}, null, 0))
      expect(join(jsonStringifyLines({}, 2))).toBe(JSON.stringify({}, null, 2))
    })
  })
  describe('arrays', () => {
    it('stringifies arrays with compact formatting (indent=0)', () => {
      const value = [1, 2, 3]
      expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
    })
    it('stringifies arrays with pretty formatting (indent=2)', () => {
      const value = [1, 2, 3]
      expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
    })
    it('stringifies mixed-type arrays', () => {
      const value = [1, 'two', true, null, { key: 'value' }]
      expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
      expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
    })
    it('stringifies nested arrays', () => {
      const value = [[1, 2], [3, 4], [5, 6]]
      expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
      expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
    })
    it('stringifies deeply nested arrays', () => {
      const value = [[[1]], [[2]], [[3]]]
      expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
      expect(join(jsonStringifyLines(value, 4))).toBe(JSON.stringify(value, null, 4))
    })
  })
  describe('objects', () => {
    it('stringifies simple objects with compact formatting', () => {
      const value = { a: 1, b: 2, c: 3 }
      expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
    })
    it('stringifies simple objects with pretty formatting', () => {
      const value = { a: 1, b: 2, c: 3 }
      expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
    })
    it('stringifies objects with mixed value types', () => {
      const value = {
        num: 42,
        str: 'hello',
        bool: true,
        nil: null,
        arr: [1, 2, 3],
      }
      expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
      expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
    })
    it('stringifies nested objects', () => {
      const value = {
        level1: {
          level2: {
            level3: 'deep',
          },
        },
      }
      expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
      expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
    })
    it('preserves key order', () => {
      // Insertion order must be kept, same as JSON.stringify
      const value = { z: 1, a: 2, m: 3 }
      expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
      expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
    })
    it('handles special characters in keys', () => {
      const value = {
        'normal-key': 1,
        'key with spaces': 2,
        'key:with:colons': 3,
        'key"with"quotes': 4,
      }
      expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
      expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
    })
  })
  describe('complex nested structures', () => {
    it('stringifies objects containing arrays', () => {
      const value = {
        name: 'Alice',
        scores: [95, 87, 92],
        metadata: {
          tags: ['math', 'science'],
        },
      }
      expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
      expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
    })
    it('stringifies arrays of objects', () => {
      const value = [
        { id: 1, name: 'Alice' },
        { id: 2, name: 'Bob' },
        { id: 3, name: 'Charlie' },
      ]
      expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
      expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
    })
    it('stringifies deeply nested mixed structures', () => {
      const value = {
        users: [
          {
            name: 'Alice',
            roles: ['admin', 'user'],
            settings: {
              theme: 'dark',
              notifications: true,
            },
          },
          {
            name: 'Bob',
            roles: ['user'],
            settings: {
              theme: 'light',
              notifications: false,
            },
          },
        ],
        count: 2,
      }
      expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
      expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
    })
  })
  // Same fixture across the indent widths the CLI accepts
  describe('indentation levels', () => {
    const value = { a: [1, 2], b: { c: 3 } }
    it('handles indent=0 (compact)', () => {
      expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
    })
    it('handles indent=2', () => {
      expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
    })
    it('handles indent=4', () => {
      expect(join(jsonStringifyLines(value, 4))).toBe(JSON.stringify(value, null, 4))
    })
    it('handles indent=8', () => {
      expect(join(jsonStringifyLines(value, 8))).toBe(JSON.stringify(value, null, 8))
    })
  })
  describe('edge cases', () => {
    it('handles arrays with undefined values (converted to null)', () => {
      // JSON.stringify also maps undefined array elements to null
      const value = [1, undefined, 3]
      const expected = JSON.stringify(value, null, 2)
      expect(join(jsonStringifyLines(value, 2))).toBe(expected)
    })
    it('handles single-element arrays', () => {
      const value = [42]
      expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
      expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
    })
    it('handles single-property objects', () => {
      const value = { only: 'one' }
      expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
      expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
    })
    it('handles objects with many properties', () => {
      const value: Record<string, number> = {}
      for (let i = 0; i < 100; i++) {
        value[`key${i}`] = i
      }
      expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
      expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
    })
    it('handles large arrays', () => {
      const value = Array.from({ length: 1000 }, (_, i) => i)
      expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0))
      expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2))
    })
  })
})
/**
 * Concatenates every chunk produced by an iterable into one string.
 */
function join(iter: Iterable<string>): string {
  let result = ''
  for (const chunk of iter) {
    result += chunk
  }
  return result
}