diff --git a/docs/cli/index.md b/docs/cli/index.md index 38225ae..2312413 100644 --- a/docs/cli/index.md +++ b/docs/cli/index.md @@ -104,20 +104,34 @@ cat data.toon | toon --decode ## Performance -### Streaming Encoding +### Streaming Output -JSON→TOON conversions use line-by-line encoding internally, which avoids holding the entire TOON document in memory. This makes the CLI efficient for large datasets without requiring additional configuration. +Both encoding and decoding operations use streaming output, writing incrementally without building the full output string in memory. This makes the CLI efficient for large datasets without requiring additional configuration. + +**JSON → TOON (Encode)** +- Streams TOON lines to output +- No full TOON string in memory + +**TOON → JSON (Decode)** +- Streams JSON tokens to output +- No full JSON string in memory ```bash # Encode large JSON file with minimal memory usage toon huge-dataset.json -o output.toon +# Decode large TOON file with minimal memory usage +toon huge-dataset.toon -o output.json + # Process millions of records efficiently via stdin cat million-records.json | toon > output.toon +cat million-records.toon | toon --decode > output.json ``` +Peak memory usage scales with data depth, not total size. This allows processing arbitrarily large files as long as individual nested structures fit in memory. + ::: info Token Statistics -When using the `--stats` flag, the CLI builds the full TOON string once to compute accurate token counts. For maximum memory efficiency on very large files, omit `--stats`. +When using the `--stats` flag with encode, the CLI builds the full TOON string once to compute accurate token counts. For maximum memory efficiency on very large files, omit `--stats`. 
::: ## Options diff --git a/packages/cli/README.md b/packages/cli/README.md index 1921e95..1ef6eb2 100644 --- a/packages/cli/README.md +++ b/packages/cli/README.md @@ -118,18 +118,27 @@ jq '.results' data.json | toon > filtered.toon ### Large Dataset Processing -The CLI streams output line-by-line without building the full string in memory, making it suitable for processing large datasets: +The CLI uses streaming output for both encoding and decoding, writing incrementally without building the full output string in memory: ```bash # Encode large JSON file with minimal memory usage toon huge-dataset.json -o output.toon -# Process millions of records efficiently +# Decode large TOON file with streaming JSON output +toon huge-dataset.toon -o output.json + +# Process millions of records efficiently via stdin cat million-records.json | toon > output.toon +cat million-records.toon | toon --decode > output.json ``` +**Memory efficiency:** +- **Encode (JSON → TOON)**: Streams TOON lines to output without full string in memory +- **Decode (TOON → JSON)**: Streams JSON tokens to output without full string in memory +- Peak memory usage scales with data depth, not total size + > [!NOTE] -> When using `--stats`, the full output string is kept in memory for token counting. Omit `--stats` for maximum memory efficiency with very large datasets. +> When using `--stats` with encode, the full output string is kept in memory for token counting. Omit `--stats` for maximum memory efficiency with very large datasets. 
### Key Folding (Since v1.5) @@ -206,7 +215,7 @@ toon data.json --key-folding safe --delimiter "\t" --stats -o output.toon - **Pipeline integration** with existing JSON-based workflows - **Flexible formatting** with delimiter and indentation options - **Key folding** to collapse nested wrappers for additional token savings -- **Memory-efficient streaming** for processing large datasets without loading everything into memory +- **Memory-efficient streaming** for both encode and decode operations - process large datasets without loading entire outputs into memory ## Related diff --git a/packages/cli/src/conversion.ts b/packages/cli/src/conversion.ts index 202d770..aa341bb 100644 --- a/packages/cli/src/conversion.ts +++ b/packages/cli/src/conversion.ts @@ -7,6 +7,7 @@ import process from 'node:process' import { consola } from 'consola' import { estimateTokenCount } from 'tokenx' import { decode, encode, encodeLines } from '../../toon/src' +import { jsonStringifyLines } from './json-stringify-stream' import { formatInputLabel, readInput } from './utils' export async function encodeToToon(config: { @@ -62,7 +63,6 @@ export async function encodeToToon(config: { consola.success(`Saved ~${diff} tokens (-${percent}%)`) } else { - // Use streaming encoder for memory-efficient output await writeStreamingToon(encodeLines(data, encodeOptions), config.output) if (config.output) { @@ -95,25 +95,52 @@ export async function decodeToJson(config: { throw new Error(`Failed to decode TOON: ${error instanceof Error ? 
error.message : String(error)}`) } - const jsonOutput = JSON.stringify(data, undefined, config.indent) + await writeStreamingJson(jsonStringifyLines(data, config.indent), config.output) if (config.output) { - await fsp.writeFile(config.output, jsonOutput, 'utf-8') const relativeInputPath = formatInputLabel(config.input) const relativeOutputPath = path.relative(process.cwd(), config.output) consola.success(`Decoded \`${relativeInputPath}\` → \`${relativeOutputPath}\``) } +} + +/** + * Writes JSON chunks to a file or stdout using streaming approach. + * Chunks are written one at a time without building the full string in memory. + */ +async function writeStreamingJson( + chunks: Iterable<string>, + outputPath?: string, +): Promise<void> { + // Stream to file using fs/promises API + if (outputPath) { + let fileHandle: FileHandle | undefined + + try { + fileHandle = await fsp.open(outputPath, 'w') + + for (const chunk of chunks) { + await fileHandle.write(chunk) + } + } + finally { + await fileHandle?.close() + } + } + // Stream to stdout else { - console.log(jsonOutput) + for (const chunk of chunks) { + process.stdout.write(chunk) + } + + // Add final newline for stdout + process.stdout.write('\n') + } } /** * Writes TOON lines to a file or stdout using streaming approach. * Lines are written one at a time without building the full string in memory. - * - * @param lines - Iterable of TOON lines (without trailing newlines) - * @param outputPath - File path to write to, or undefined for stdout */ async function writeStreamingToon( lines: Iterable<string>, diff --git a/packages/cli/src/json-stringify-stream.ts b/packages/cli/src/json-stringify-stream.ts new file mode 100644 index 0000000..611a530 --- /dev/null +++ b/packages/cli/src/json-stringify-stream.ts @@ -0,0 +1,161 @@ +/** + * Streaming JSON stringifier. + * + * Yields JSON tokens one at a time, allowing streaming output without holding + * the entire JSON string in memory. 
+ * + * @param value - The value to stringify (must be JSON-serializable) + * @param indent - Number of spaces for indentation (0 = compact, >0 = pretty) + * @returns Generator that yields JSON string chunks + * + * @example + * ```ts + * const data = { name: "Alice", scores: [95, 87, 92] } + * for (const chunk of jsonStringifyLines(data, 2)) { + * process.stdout.write(chunk) + * } + * ``` + */ +export function* jsonStringifyLines( + value: unknown, + indent: number = 2, +): Iterable<string> { + yield* stringifyValue(value, 0, indent) +} + +/** + * Internal generator for recursive stringification. + */ +function* stringifyValue( + value: unknown, + depth: number, + indent: number, +): Iterable<string> { + // Handle null + if (value === null) { + yield 'null' + return + } + + const type = typeof value + + // Handle primitives + if (type === 'boolean' || type === 'number') { + yield JSON.stringify(value) + return + } + + if (type === 'string') { + yield JSON.stringify(value) + return + } + + // Handle arrays + if (Array.isArray(value)) { + yield* stringifyArray(value, depth, indent) + return + } + + // Handle objects + if (type === 'object') { + yield* stringifyObject(value as Record<string, unknown>, depth, indent) + return + } + + // Undefined, functions, symbols become null in JSON + yield 'null' +} + +/** + * Stringify an array with proper formatting. 
+ */ +function* stringifyArray( + arr: unknown[], + depth: number, + indent: number, +): Iterable<string> { + if (arr.length === 0) { + yield '[]' + return + } + + yield '[' + + if (indent > 0) { + // Pretty-printed format + for (let i = 0; i < arr.length; i++) { + yield '\n' + yield ' '.repeat((depth + 1) * indent) + yield* stringifyValue(arr[i], depth + 1, indent) + if (i < arr.length - 1) { + yield ',' + } + } + yield '\n' + yield ' '.repeat(depth * indent) + yield ']' + } + else { + // Compact format + for (let i = 0; i < arr.length; i++) { + yield* stringifyValue(arr[i], depth + 1, indent) + if (i < arr.length - 1) { + yield ',' + } + } + yield ']' + } +} + +/** + * Stringify an object with proper formatting. + */ +function* stringifyObject( + obj: Record<string, unknown>, + depth: number, + indent: number, +): Iterable<string> { + const keys = Object.keys(obj) + + if (keys.length === 0) { + yield '{}' + return + } + + yield '{' + + if (indent > 0) { + // Pretty-printed format + for (let i = 0; i < keys.length; i++) { + const key = keys[i]! + const value = obj[key] + + yield '\n' + yield ' '.repeat((depth + 1) * indent) + yield JSON.stringify(key) + yield ': ' + yield* stringifyValue(value, depth + 1, indent) + if (i < keys.length - 1) { + yield ',' + } + } + yield '\n' + yield ' '.repeat(depth * indent) + yield '}' + } + else { + // Compact format + for (let i = 0; i < keys.length; i++) { + const key = keys[i]! + const value = obj[key] + + yield JSON.stringify(key) + yield ':' + yield* stringifyValue(value, depth + 1, indent) + if (i < keys.length - 1) { + yield ',' + } + } + yield '}' + } +} diff --git a/packages/cli/test/index.test.ts b/packages/cli/test/index.test.ts index 380ac74..0465a39 100644 --- a/packages/cli/test/index.test.ts +++ b/packages/cli/test/index.test.ts @@ -153,15 +153,18 @@ describe('toon CLI', () => { const cleanup = mockStdin(toonInput) - const stdout: string[] = [] - vi.spyOn(console, 'log').mockImplementation((message?: unknown) => { - stdout.push(String(message ?? 
'')) + const writeChunks: string[] = [] + vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => { + writeChunks.push(String(chunk)) + return true }) try { await runCli({ rawArgs: ['--decode'] }) - expect(stdout).toHaveLength(1) - const result = JSON.parse(stdout?.at(0) ?? '') + const fullOutput = writeChunks.join('') + // Remove trailing newline before parsing + const jsonOutput = fullOutput.endsWith('\n') ? fullOutput.slice(0, -1) : fullOutput + const result = JSON.parse(jsonOutput) expect(result).toEqual(data) } finally { @@ -279,16 +282,19 @@ describe('toon CLI', () => { const toonInput = encode(data) const cleanup = mockStdin(toonInput) - const stdout: string[] = [] - vi.spyOn(console, 'log').mockImplementation((message?: unknown) => { - stdout.push(String(message ?? '')) + const writeChunks: string[] = [] + vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => { + writeChunks.push(String(chunk)) + return true }) try { await runCli({ rawArgs: ['--decode', '--no-strict'] }) - expect(stdout).toHaveLength(1) - const result = JSON.parse(stdout?.at(0) ?? '') + const fullOutput = writeChunks.join('') + // Remove trailing newline before parsing + const jsonOutput = fullOutput.endsWith('\n') ? 
fullOutput.slice(0, -1) : fullOutput + const result = JSON.parse(jsonOutput) expect(result).toEqual(data) } finally { diff --git a/packages/cli/test/json-stringify-stream.test.ts b/packages/cli/test/json-stringify-stream.test.ts new file mode 100644 index 0000000..1af3768 --- /dev/null +++ b/packages/cli/test/json-stringify-stream.test.ts @@ -0,0 +1,245 @@ +import { describe, expect, it } from 'vitest' +import { jsonStringifyLines } from '../src/json-stringify-stream' + +describe('jsonStringifyLines', () => { + describe('primitives', () => { + it('stringifies null', () => { + expect(join(jsonStringifyLines(null, 0))).toBe(JSON.stringify(null)) + expect(join(jsonStringifyLines(null, 2))).toBe(JSON.stringify(null, null, 2)) + }) + + it('stringifies booleans', () => { + expect(join(jsonStringifyLines(true, 0))).toBe(JSON.stringify(true)) + expect(join(jsonStringifyLines(false, 0))).toBe(JSON.stringify(false)) + expect(join(jsonStringifyLines(true, 2))).toBe(JSON.stringify(true, null, 2)) + }) + + it('stringifies numbers', () => { + expect(join(jsonStringifyLines(0, 0))).toBe(JSON.stringify(0)) + expect(join(jsonStringifyLines(42, 0))).toBe(JSON.stringify(42)) + expect(join(jsonStringifyLines(-17, 0))).toBe(JSON.stringify(-17)) + expect(join(jsonStringifyLines(3.14159, 0))).toBe(JSON.stringify(3.14159)) + expect(join(jsonStringifyLines(1e10, 2))).toBe(JSON.stringify(1e10, null, 2)) + }) + + it('stringifies strings', () => { + expect(join(jsonStringifyLines('', 0))).toBe(JSON.stringify('')) + expect(join(jsonStringifyLines('hello', 0))).toBe(JSON.stringify('hello')) + expect(join(jsonStringifyLines('with "quotes"', 0))).toBe(JSON.stringify('with "quotes"')) + expect(join(jsonStringifyLines('with\nnewlines', 2))).toBe(JSON.stringify('with\nnewlines', null, 2)) + expect(join(jsonStringifyLines('with\ttabs', 0))).toBe(JSON.stringify('with\ttabs')) + }) + + it('converts undefined to null', () => { + expect(join(jsonStringifyLines(undefined, 0))).toBe('null') + 
expect(join(jsonStringifyLines(undefined, 2))).toBe('null') + }) + }) + + describe('empty containers', () => { + it('stringifies empty arrays', () => { + expect(join(jsonStringifyLines([], 0))).toBe(JSON.stringify([], null, 0)) + expect(join(jsonStringifyLines([], 2))).toBe(JSON.stringify([], null, 2)) + }) + + it('stringifies empty objects', () => { + expect(join(jsonStringifyLines({}, 0))).toBe(JSON.stringify({}, null, 0)) + expect(join(jsonStringifyLines({}, 2))).toBe(JSON.stringify({}, null, 2)) + }) + }) + + describe('arrays', () => { + it('stringifies arrays with compact formatting (indent=0)', () => { + const value = [1, 2, 3] + expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0)) + }) + + it('stringifies arrays with pretty formatting (indent=2)', () => { + const value = [1, 2, 3] + expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2)) + }) + + it('stringifies mixed-type arrays', () => { + const value = [1, 'two', true, null, { key: 'value' }] + expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0)) + expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2)) + }) + + it('stringifies nested arrays', () => { + const value = [[1, 2], [3, 4], [5, 6]] + expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0)) + expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2)) + }) + + it('stringifies deeply nested arrays', () => { + const value = [[[1]], [[2]], [[3]]] + expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2)) + expect(join(jsonStringifyLines(value, 4))).toBe(JSON.stringify(value, null, 4)) + }) + }) + + describe('objects', () => { + it('stringifies simple objects with compact formatting', () => { + const value = { a: 1, b: 2, c: 3 } + expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0)) + }) + + it('stringifies simple objects with pretty formatting', () => { + 
const value = { a: 1, b: 2, c: 3 } + expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2)) + }) + + it('stringifies objects with mixed value types', () => { + const value = { + num: 42, + str: 'hello', + bool: true, + nil: null, + arr: [1, 2, 3], + } + expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0)) + expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2)) + }) + + it('stringifies nested objects', () => { + const value = { + level1: { + level2: { + level3: 'deep', + }, + }, + } + expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0)) + expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2)) + }) + + it('preserves key order', () => { + const value = { z: 1, a: 2, m: 3 } + expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0)) + expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2)) + }) + + it('handles special characters in keys', () => { + const value = { + 'normal-key': 1, + 'key with spaces': 2, + 'key:with:colons': 3, + 'key"with"quotes': 4, + } + expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0)) + expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2)) + }) + }) + + describe('complex nested structures', () => { + it('stringifies objects containing arrays', () => { + const value = { + name: 'Alice', + scores: [95, 87, 92], + metadata: { + tags: ['math', 'science'], + }, + } + expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0)) + expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2)) + }) + + it('stringifies arrays of objects', () => { + const value = [ + { id: 1, name: 'Alice' }, + { id: 2, name: 'Bob' }, + { id: 3, name: 'Charlie' }, + ] + expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0)) + expect(join(jsonStringifyLines(value, 
2))).toBe(JSON.stringify(value, null, 2)) + }) + + it('stringifies deeply nested mixed structures', () => { + const value = { + users: [ + { + name: 'Alice', + roles: ['admin', 'user'], + settings: { + theme: 'dark', + notifications: true, + }, + }, + { + name: 'Bob', + roles: ['user'], + settings: { + theme: 'light', + notifications: false, + }, + }, + ], + count: 2, + } + expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0)) + expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2)) + }) + }) + + describe('indentation levels', () => { + const value = { a: [1, 2], b: { c: 3 } } + + it('handles indent=0 (compact)', () => { + expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0)) + }) + + it('handles indent=2', () => { + expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2)) + }) + + it('handles indent=4', () => { + expect(join(jsonStringifyLines(value, 4))).toBe(JSON.stringify(value, null, 4)) + }) + + it('handles indent=8', () => { + expect(join(jsonStringifyLines(value, 8))).toBe(JSON.stringify(value, null, 8)) + }) + }) + + describe('edge cases', () => { + it('handles arrays with undefined values (converted to null)', () => { + const value = [1, undefined, 3] + const expected = JSON.stringify(value, null, 2) + expect(join(jsonStringifyLines(value, 2))).toBe(expected) + }) + + it('handles single-element arrays', () => { + const value = [42] + expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0)) + expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2)) + }) + + it('handles single-property objects', () => { + const value = { only: 'one' } + expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0)) + expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2)) + }) + + it('handles objects with many properties', () => { + const value: Record<string, number> = {} + for (let i = 0; i < 100; i++) { 
+ value[`key${i}`] = i + } + expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0)) + expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2)) + }) + + it('handles large arrays', () => { + const value = Array.from({ length: 1000 }, (_, i) => i) + expect(join(jsonStringifyLines(value, 0))).toBe(JSON.stringify(value, null, 0)) + expect(join(jsonStringifyLines(value, 2))).toBe(JSON.stringify(value, null, 2)) + }) + }) +}) + +/** + * Joins chunks from an iterable into a single string. + */ +function join(iter: Iterable<string>): string { + return Array.from(iter).join('') +}