From 1c003c6118896cfe7efe20233189995a33569e2b Mon Sep 17 00:00:00 2001
From: Johann Schopplich
Date: Fri, 21 Nov 2025 14:02:22 +0100
Subject: [PATCH] feat(cli): memory-efficient streaming for encoding

---
 README.md                       |  13 +++
 docs/cli/index.md               |  19 +++-
 docs/guide/llm-prompts.md       |  23 +++++
 docs/reference/api.md           |   8 +-
 packages/cli/README.md          |  29 ++++--
 packages/cli/src/conversion.ts  |  90 ++++++++++++----
 packages/cli/test/index.test.ts | 176 ++++++++++++++++++++++++++++----
 7 files changed, 308 insertions(+), 50 deletions(-)

diff --git a/README.md b/README.md
index 36e59f9..0c14804 100644
--- a/README.md
+++ b/README.md
@@ -764,6 +764,19 @@ console.log(encode(data))
 // 2,Bob,user
 ```
 
+**Streaming large datasets:**
+
+```ts
+import { encodeLines } from '@toon-format/toon'
+
+const largeData = await fetchThousandsOfRecords()
+
+// Memory-efficient streaming for large data
+for (const line of encodeLines(largeData)) {
+  process.stdout.write(`${line}\n`)
+}
+```
+
 ## Playgrounds
 
 Experiment with TOON format interactively using these community-built tools for token comparison, format conversion, and validation:
diff --git a/docs/cli/index.md b/docs/cli/index.md
index ffb0e94..38225ae 100644
--- a/docs/cli/index.md
+++ b/docs/cli/index.md
@@ -1,6 +1,6 @@
 # Command Line Interface
 
-The `@toon-format/cli` package provides a command-line interface for encoding JSON to TOON and decoding TOON back to JSON. Use it for quick conversions without writing code, estimating token savings before sending data to LLMs, or integrating TOON into shell pipelines with tools like curl and jq. It supports stdin/stdout workflows, multiple delimiter options, token statistics, and all encoding/decoding features available in the library.
+The `@toon-format/cli` package provides a command-line interface for encoding JSON to TOON and decoding TOON back to JSON. Use it to analyze token savings before integrating TOON into your application, or to process JSON data through TOON in shell pipelines using stdin/stdout with tools like curl and jq. The CLI supports token statistics, streaming for large datasets, and all encoding options available in the library.
 
 The CLI is built on top of the `@toon-format/toon` TypeScript implementation and adheres to the [latest specification](/reference/spec).
 
@@ -108,6 +108,14 @@ cat data.toon | toon --decode
 
 JSON→TOON conversions use line-by-line encoding internally, which avoids holding the entire TOON document in memory. This makes the CLI efficient for large datasets without requiring additional configuration.
 
+```bash
+# Encode large JSON file with minimal memory usage
+toon huge-dataset.json -o output.toon
+
+# Process millions of records efficiently via stdin
+cat million-records.json | toon > output.toon
+```
+
 ::: info Token Statistics
 When using the `--stats` flag, the CLI builds the full TOON string once to compute accurate token counts. For maximum memory efficiency on very large files, omit `--stats`.
 :::
@@ -139,6 +147,15 @@ toon data.json --stats -o output.toon
 
 This helps you estimate token cost savings before sending data to LLMs.
 
+Example output:
+
+```
+✔ Encoded data.json → output.toon
+
+ℹ Token estimates: ~15,145 (JSON) → ~8,745 (TOON)
+✔ Saved ~6,400 tokens (-42.3%)
+```
+
 ### Alternative Delimiters
 
 TOON supports three delimiters: comma (default), tab, and pipe. Alternative delimiters can provide additional token savings in specific contexts.
diff --git a/docs/guide/llm-prompts.md b/docs/guide/llm-prompts.md
index 4e9afeb..ba82352 100644
--- a/docs/guide/llm-prompts.md
+++ b/docs/guide/llm-prompts.md
@@ -95,6 +95,29 @@ const toon = encode(data, { delimiter: '\t' })
 
 Tell the model "fields are tab-separated" when using tabs. For more on delimiters, see the [Format Overview](/guide/format-overview#delimiter-options).
 
+## Streaming Large Outputs
+
+When working with large datasets (thousands of records or deeply nested structures), use `encodeLines()` to stream TOON output line-by-line instead of building the full string in memory.
+
+```ts
+import { encodeLines } from '@toon-format/toon'
+
+const largeData = await fetchThousandsOfRecords()
+
+// Stream large dataset without loading full string in memory
+for (const line of encodeLines(largeData, { delimiter: '\t' })) {
+  process.stdout.write(`${line}\n`)
+}
+```
+
+The CLI also supports streaming for memory-efficient JSON-to-TOON conversion:
+
+```bash
+toon large-dataset.json --output output.toon
+```
+
+This streaming approach prevents out-of-memory errors when preparing large context windows for LLMs. For complete details on `encodeLines()`, see the [API reference](/reference/api#encodelines).
+
 ## Tips and Pitfalls
 
 **Show, don't describe.** Don't explain TOON syntax in detail – just show an example. Models learn the pattern from context. A simple code block with 2-5 rows is more effective than paragraphs of explanation.
diff --git a/docs/reference/api.md b/docs/reference/api.md
index f08ba00..bbcb4c7 100644
--- a/docs/reference/api.md
+++ b/docs/reference/api.md
@@ -129,14 +129,14 @@ encode(data, { delimiter: '\t', keyFolding: 'safe' })
 
 ## `encodeLines(value, options?)`
 
-Converts any JSON-serializable value to TOON format as a sequence of lines, without building the full string in memory. Suitable for streaming large outputs to files, HTTP responses, or process stdout.
+**Preferred method for streaming TOON output.** Converts any JSON-serializable value to TOON format as a sequence of lines, without building the full string in memory. Suitable for streaming large outputs to files, HTTP responses, or process stdout.
 
 ```ts
 import { encodeLines } from '@toon-format/toon'
 
-// Stream to stdout
+// Stream to stdout (Node.js)
 for (const line of encodeLines(data)) {
-  console.log(line)
+  process.stdout.write(`${line}\n`)
 }
 
 // Write to file line-by-line
@@ -158,7 +158,7 @@ const lineArray = Array.from(encodeLines(data))
 
 ### Return Value
 
-Returns an `Iterable<string>` that yields TOON lines one at a time. Each yielded string is a single line without a trailing newline character.
+Returns an `Iterable<string>` that yields TOON lines one at a time. **Each yielded string is a single line without a trailing newline character** — you must add `\n` when writing to streams or stdout.
 
 ::: info Relationship to `encode()`
 `encode(value, options)` is equivalent to:
diff --git a/packages/cli/README.md b/packages/cli/README.md
index 64efbbf..1921e95 100644
--- a/packages/cli/README.md
+++ b/packages/cli/README.md
@@ -1,8 +1,8 @@
 # @toon-format/cli
 
-Command-line tool for converting between JSON and TOON formats.
+Command-line tool for converting JSON to TOON and back, with token analysis and streaming support.
 
-[TOON (Token-Oriented Object Notation)](https://toonformat.dev) is a compact, human-readable serialization format designed for passing structured data to Large Language Models with significantly reduced token usage.
+[TOON (Token-Oriented Object Notation)](https://toonformat.dev) is a compact, human-readable encoding of the JSON data model that minimizes tokens for LLM input. The CLI lets you test conversions, analyze token savings, and integrate TOON into shell pipelines with stdin/stdout support—no code required.
 
 ## Installation
 
@@ -79,11 +79,12 @@ toon data.json --stats -o output.toon
 ```
 
 Example output:
+
 ```
-✓ Encoded to TOON
-  Input: 15,145 tokens (JSON)
-  Output: 8,745 tokens (TOON)
-  Saved: 6,400 tokens (42.3% reduction)
+✔ Encoded data.json → output.toon
+
+ℹ Token estimates: ~15,145 (JSON) → ~8,745 (TOON)
+✔ Saved ~6,400 tokens (-42.3%)
 ```
 
 ### Alternative Delimiters
@@ -115,6 +116,21 @@ cat large-dataset.json | toon --delimiter "\t" > output.toon
 jq '.results' data.json | toon > filtered.toon
 ```
 
+### Large Dataset Processing
+
+The CLI streams output line-by-line without building the full string in memory, making it suitable for processing large datasets:
+
+```bash
+# Encode large JSON file with minimal memory usage
+toon huge-dataset.json -o output.toon
+
+# Process millions of records efficiently
+cat million-records.json | toon > output.toon
+```
+
+> [!NOTE]
+> When using `--stats`, the full output string is kept in memory for token counting. Omit `--stats` for maximum memory efficiency with very large datasets.
+
 ### Key Folding (Since v1.5)
 
 Collapse nested wrapper chains to reduce tokens:
@@ -190,6 +206,7 @@ toon data.json --key-folding safe --delimiter "\t" --stats -o output.toon
 - **Pipeline integration** with existing JSON-based workflows
 - **Flexible formatting** with delimiter and indentation options
 - **Key folding** to collapse nested wrappers for additional token savings
+- **Memory-efficient streaming** for processing large datasets without loading everything into memory
 
 ## Related
diff --git a/packages/cli/src/conversion.ts b/packages/cli/src/conversion.ts
index d3210d4..202d770 100644
--- a/packages/cli/src/conversion.ts
+++ b/packages/cli/src/conversion.ts
@@ -1,3 +1,4 @@
+import type { FileHandle } from 'node:fs/promises'
 import type { DecodeOptions, EncodeOptions } from '../../toon/src'
 import type { InputSource } from './types'
 import * as fsp from 'node:fs/promises'
@@ -34,38 +35,42 @@ export async function encodeToToon(config: {
     flattenDepth: config.flattenDepth,
   }
 
-  let toonOutput: string
-
   // When printing stats, we need the full string for token counting
   if (config.printStats) {
-    toonOutput = encode(data, encodeOptions)
-  }
-  else {
-    // Use streaming encoder for non-stats path
-    const lines = Array.from(encodeLines(data, encodeOptions))
-    toonOutput = lines.join('\n')
-  }
+    const toonOutput = encode(data, encodeOptions)
 
-  if (config.output) {
-    await fsp.writeFile(config.output, toonOutput, 'utf-8')
-    const relativeInputPath = formatInputLabel(config.input)
-    const relativeOutputPath = path.relative(process.cwd(), config.output)
-    consola.success(`Encoded \`${relativeInputPath}\` → \`${relativeOutputPath}\``)
-  }
-  else {
-    console.log(toonOutput)
-  }
+    if (config.output) {
+      await fsp.writeFile(config.output, toonOutput, 'utf-8')
+    }
+    else {
+      console.log(toonOutput)
+    }
 
-  if (config.printStats) {
     const jsonTokens = estimateTokenCount(jsonContent)
     const toonTokens = estimateTokenCount(toonOutput)
     const diff = jsonTokens - toonTokens
     const percent = ((diff / jsonTokens) * 100).toFixed(1)
 
+    if (config.output) {
+      const relativeInputPath = formatInputLabel(config.input)
+      const relativeOutputPath = path.relative(process.cwd(), config.output)
+      consola.success(`Encoded \`${relativeInputPath}\` → \`${relativeOutputPath}\``)
+    }
+
     console.log()
     consola.info(`Token estimates: ~${jsonTokens} (JSON) → ~${toonTokens} (TOON)`)
     consola.success(`Saved ~${diff} tokens (-${percent}%)`)
   }
+  else {
+    // Use streaming encoder for memory-efficient output
+    await writeStreamingToon(encodeLines(data, encodeOptions), config.output)
+
+    if (config.output) {
+      const relativeInputPath = formatInputLabel(config.input)
+      const relativeOutputPath = path.relative(process.cwd(), config.output)
+      consola.success(`Encoded \`${relativeInputPath}\` → \`${relativeOutputPath}\``)
+    }
+  }
 }
 
 export async function decodeToJson(config: {
@@ -102,3 +107,50 @@
     console.log(jsonOutput)
   }
 }
+
+/**
+ * Writes TOON lines to a file or stdout using a streaming approach.
+ * Lines are written one at a time without building the full string in memory.
+ *
+ * @param lines - Iterable of TOON lines (without trailing newlines)
+ * @param outputPath - File path to write to, or undefined for stdout
+ */
+async function writeStreamingToon(
+  lines: Iterable<string>,
+  outputPath?: string,
+): Promise<void> {
+  let isFirst = true
+
+  // Stream to file using the fs/promises API
+  if (outputPath) {
+    let fileHandle: FileHandle | undefined
+
+    try {
+      fileHandle = await fsp.open(outputPath, 'w')
+
+      for (const line of lines) {
+        if (!isFirst)
+          await fileHandle.write('\n')
+
+        await fileHandle.write(line)
+        isFirst = false
+      }
+    }
+    finally {
+      await fileHandle?.close()
+    }
+  }
+  // Stream to stdout
+  else {
+    for (const line of lines) {
+      if (!isFirst)
+        process.stdout.write('\n')
+
+      process.stdout.write(line)
+      isFirst = false
+    }
+
+    // Add final newline for stdout
+    process.stdout.write('\n')
+  }
+}
diff --git a/packages/cli/test/index.test.ts b/packages/cli/test/index.test.ts
index e29a1b6..380ac74 100644
--- a/packages/cli/test/index.test.ts
+++ b/packages/cli/test/index.test.ts
@@ -33,15 +33,16 @@ describe('toon CLI', () => {
     }
 
     const cleanup = mockStdin(JSON.stringify(data))
-    const stdout: string[] = []
-    vi.spyOn(console, 'log').mockImplementation((message?: unknown) => {
-      stdout.push(String(message ?? ''))
+    const writeChunks: string[] = []
+    vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
+      writeChunks.push(String(chunk))
+      return true
     })
 
     try {
       await runCli()
-      expect(stdout).toHaveLength(1)
-      expect(stdout[0]).toBe(encode(data))
+      const fullOutput = writeChunks.join('')
+      expect(fullOutput).toBe(`${encode(data)}\n`)
     }
     finally {
       cleanup()
@@ -83,16 +84,17 @@ describe('toon CLI', () => {
       'input.json': JSON.stringify(data),
     })
 
-    const stdout: string[] = []
-    vi.spyOn(console, 'log').mockImplementation((message?: unknown) => {
-      stdout.push(String(message ?? ''))
+    const writeChunks: string[] = []
+    vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
+      writeChunks.push(String(chunk))
+      return true
     })
 
     try {
       await context.run(['input.json'])
 
-      expect(stdout).toHaveLength(1)
-      expect(stdout[0]).toBe(encode(data))
+      const fullOutput = writeChunks.join('')
+      expect(fullOutput).toBe(`${encode(data)}\n`)
     }
     finally {
       await context.cleanup()
@@ -230,16 +232,17 @@ describe('toon CLI', () => {
 
     const data = { items: [1, 2, 3] }
     const cleanup = mockStdin(JSON.stringify(data))
-    const stdout: string[] = []
-    vi.spyOn(console, 'log').mockImplementation((message?: unknown) => {
-      stdout.push(String(message ?? ''))
+    const writeChunks: string[] = []
+    vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
+      writeChunks.push(String(chunk))
+      return true
     })
 
     try {
       await runCli({ rawArgs: ['--delimiter', '|'] })
 
-      expect(stdout).toHaveLength(1)
-      expect(stdout[0]).toBe(encode(data, { delimiter: '|' }))
+      const fullOutput = writeChunks.join('')
+      expect(fullOutput).toBe(`${encode(data, { delimiter: '|' })}\n`)
     }
     finally {
       cleanup()
@@ -254,16 +257,17 @@ describe('toon CLI', () => {
     const data = {
      level1: {
       level2: { value: 'deep' },
      },
     }
     const cleanup = mockStdin(JSON.stringify(data))
-    const stdout: string[] = []
-    vi.spyOn(console, 'log').mockImplementation((message?: unknown) => {
-      stdout.push(String(message ?? ''))
+    const writeChunks: string[] = []
+    vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
+      writeChunks.push(String(chunk))
+      return true
     })
 
     try {
       await runCli({ rawArgs: ['--indent', '4'] })
-      expect(stdout).toHaveLength(1)
-      expect(stdout[0]).toBe(encode(data, { indent: 4 }))
+      const fullOutput = writeChunks.join('')
+      expect(fullOutput).toBe(`${encode(data, { indent: 4 })}\n`)
     }
     finally {
       cleanup()
@@ -293,6 +297,138 @@ describe('toon CLI', () => {
     })
   })
 
+  describe('streaming output', () => {
+    it('streams large JSON to TOON file with identical output', async () => {
+      const data = {
+        items: Array.from({ length: 1000 }, (_, i) => ({
+          id: i,
+          name: `Item ${i}`,
+          value: Math.random(),
+        })),
+      }
+
+      const context = await createCliTestContext({
+        'large-input.json': JSON.stringify(data, undefined, 2),
+      })
+
+      const consolaSuccess = vi.spyOn(consola, 'success').mockImplementation(() => undefined)
+
+      try {
+        await context.run(['large-input.json', '--output', 'output.toon'])
+
+        const output = await context.read('output.toon')
+        // Verify streaming produces identical output to `encode()`
+        const expected = encode(data, {
+          delimiter: DEFAULT_DELIMITER,
+          indent: 2,
+        })
+
+        expect(output).toBe(expected)
+        expect(consolaSuccess).toHaveBeenCalledWith(expect.stringMatching(/Encoded .* → .*/))
+      }
+      finally {
+        await context.cleanup()
+      }
+    })
+
+    it('streams to stdout using process.stdout.write', async () => {
+      const data = {
+        users: [
+          { id: 1, name: 'Alice' },
+          { id: 2, name: 'Bob' },
+        ],
+      }
+
+      const context = await createCliTestContext({
+        'input.json': JSON.stringify(data),
+      })
+
+      const writeChunks: string[] = []
+      const writeSpy = vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
+        writeChunks.push(String(chunk))
+        return true
+      })
+
+      try {
+        await context.run(['input.json'])
+
+        expect(writeSpy).toHaveBeenCalled()
+
+        // Verify complete output matches `encode()`
+        const fullOutput = writeChunks.join('')
+        const expected = `${encode(data)}\n`
+        expect(fullOutput).toBe(expected)
+      }
+      finally {
+        await context.cleanup()
+      }
+    })
+
+    it('handles empty object streaming correctly', async () => {
+      const data = {}
+
+      const context = await createCliTestContext({
+        'empty.json': JSON.stringify(data),
+      })
+
+      try {
+        await context.run(['empty.json', '--output', 'output.toon'])
+
+        const output = await context.read('output.toon')
+        expect(output).toBe(encode(data))
+      }
+      finally {
+        await context.cleanup()
+      }
+    })
+
+    it('handles single-line output streaming correctly', async () => {
+      const data = { key: 'value' }
+
+      const context = await createCliTestContext({
+        'single.json': JSON.stringify(data),
+      })
+
+      try {
+        await context.run(['single.json', '--output', 'output.toon'])
+
+        const output = await context.read('output.toon')
+        expect(output).toBe(encode(data))
+      }
+      finally {
+        await context.cleanup()
+      }
+    })
+
+    it('uses non-streaming path when stats are enabled', async () => {
+      const data = {
+        items: [
+          { id: 1, value: 'test' },
+          { id: 2, value: 'data' },
+        ],
+      }
+
+      const context = await createCliTestContext({
+        'input.json': JSON.stringify(data),
+      })
+
+      const consoleLogSpy = vi.spyOn(console, 'log').mockImplementation(() => undefined)
+      const consolaInfo = vi.spyOn(consola, 'info').mockImplementation(() => undefined)
+      const consolaSuccess = vi.spyOn(consola, 'success').mockImplementation(() => undefined)
+
+      try {
+        await context.run(['input.json', '--stats'])
+
+        expect(consolaInfo).toHaveBeenCalledWith(expect.stringMatching(/Token estimates:/))
+        expect(consolaSuccess).toHaveBeenCalledWith(expect.stringMatching(/Saved.*tokens/))
+        expect(consoleLogSpy).toHaveBeenCalledWith(encode(data))
+      }
+      finally {
+        await context.cleanup()
+      }
+    })
+  })
+
   describe('error handling', () => {
     it('rejects invalid delimiter', async () => {
       const context = await createCliTestContext({