feat(cli): memory-efficient streaming for encoding

This commit is contained in:
Johann Schopplich
2025-11-21 14:02:22 +01:00
parent be8bcfe9b2
commit 1c003c6118
7 changed files with 308 additions and 50 deletions

View File

@@ -1,8 +1,8 @@
# @toon-format/cli
Command-line tool for converting between JSON and TOON formats.
Command-line tool for converting JSON to TOON and back, with token analysis and streaming support.
[TOON (Token-Oriented Object Notation)](https://toonformat.dev) is a compact, human-readable serialization format designed for passing structured data to Large Language Models with significantly reduced token usage.
[TOON (Token-Oriented Object Notation)](https://toonformat.dev) is a compact, human-readable encoding of the JSON data model that minimizes tokens for LLM input. The CLI lets you test conversions, analyze token savings, and integrate TOON into shell pipelines with stdin/stdout support—no code required.
## Installation
@@ -79,11 +79,12 @@ toon data.json --stats -o output.toon
```
Example output:
```
Encoded to TOON
Input: 15,145 tokens (JSON)
Output: 8,745 tokens (TOON)
Saved: 6,400 tokens (42.3% reduction)
Encoded data.json → output.toon
Token estimates: ~15,145 (JSON) → ~8,745 (TOON)
Saved ~6,400 tokens (-42.3%)
```
### Alternative Delimiters
@@ -115,6 +116,21 @@ cat large-dataset.json | toon --delimiter "\t" > output.toon
jq '.results' data.json | toon > filtered.toon
```
### Large Dataset Processing
The CLI streams output line-by-line without building the full string in memory, making it suitable for processing large datasets:
```bash
# Encode large JSON file with minimal memory usage
toon huge-dataset.json -o output.toon
# Process millions of records efficiently
cat million-records.json | toon > output.toon
```
> [!NOTE]
> When using `--stats`, the full output string is kept in memory for token counting. Omit `--stats` for maximum memory efficiency with very large datasets.
### Key Folding (Since v1.5)
Collapse nested wrapper chains to reduce tokens:
@@ -190,6 +206,7 @@ toon data.json --key-folding safe --delimiter "\t" --stats -o output.toon
- **Pipeline integration** with existing JSON-based workflows
- **Flexible formatting** with delimiter and indentation options
- **Key folding** to collapse nested wrappers for additional token savings
- **Memory-efficient streaming** for processing large datasets without loading everything into memory
## Related

View File

@@ -1,3 +1,4 @@
import type { FileHandle } from 'node:fs/promises'
import type { DecodeOptions, EncodeOptions } from '../../toon/src'
import type { InputSource } from './types'
import * as fsp from 'node:fs/promises'
@@ -34,38 +35,42 @@ export async function encodeToToon(config: {
flattenDepth: config.flattenDepth,
}
let toonOutput: string
// When printing stats, we need the full string for token counting
if (config.printStats) {
toonOutput = encode(data, encodeOptions)
}
else {
// Use streaming encoder for non-stats path
const lines = Array.from(encodeLines(data, encodeOptions))
toonOutput = lines.join('\n')
}
const toonOutput = encode(data, encodeOptions)
if (config.output) {
await fsp.writeFile(config.output, toonOutput, 'utf-8')
const relativeInputPath = formatInputLabel(config.input)
const relativeOutputPath = path.relative(process.cwd(), config.output)
consola.success(`Encoded \`${relativeInputPath}\`\`${relativeOutputPath}\``)
}
else {
console.log(toonOutput)
}
if (config.output) {
await fsp.writeFile(config.output, toonOutput, 'utf-8')
}
else {
console.log(toonOutput)
}
if (config.printStats) {
const jsonTokens = estimateTokenCount(jsonContent)
const toonTokens = estimateTokenCount(toonOutput)
const diff = jsonTokens - toonTokens
const percent = ((diff / jsonTokens) * 100).toFixed(1)
if (config.output) {
const relativeInputPath = formatInputLabel(config.input)
const relativeOutputPath = path.relative(process.cwd(), config.output)
consola.success(`Encoded \`${relativeInputPath}\`\`${relativeOutputPath}\``)
}
console.log()
consola.info(`Token estimates: ~${jsonTokens} (JSON) → ~${toonTokens} (TOON)`)
consola.success(`Saved ~${diff} tokens (-${percent}%)`)
}
else {
// Use streaming encoder for memory-efficient output
await writeStreamingToon(encodeLines(data, encodeOptions), config.output)
if (config.output) {
const relativeInputPath = formatInputLabel(config.input)
const relativeOutputPath = path.relative(process.cwd(), config.output)
consola.success(`Encoded \`${relativeInputPath}\`\`${relativeOutputPath}\``)
}
}
}
export async function decodeToJson(config: {
@@ -102,3 +107,50 @@ export async function decodeToJson(config: {
console.log(jsonOutput)
}
}
/**
 * Writes TOON lines to a file or stdout using a streaming approach.
 * Lines are written one at a time without building the full string in memory.
 *
 * Output matches the non-streaming path: lines are joined with `\n`, and a
 * final trailing newline is appended for stdout only (mirroring `console.log`).
 *
 * @param lines - Iterable of TOON lines (without trailing newlines)
 * @param outputPath - File path to write to, or undefined for stdout
 */
async function writeStreamingToon(
  lines: Iterable<string>,
  outputPath?: string,
): Promise<void> {
  let isFirst = true

  if (outputPath) {
    // Stream to file via a file handle so each line is flushed incrementally
    let fileHandle: FileHandle | undefined
    try {
      fileHandle = await fsp.open(outputPath, 'w')
      for (const line of lines) {
        // Prefix the separator onto the line instead of issuing a second
        // awaited write per iteration
        await fileHandle.write(isFirst ? line : `\n${line}`)
        isFirst = false
      }
    }
    finally {
      await fileHandle?.close()
    }
  }
  // Stream to stdout
  else {
    for (const line of lines) {
      const chunk = isFirst ? line : `\n${line}`
      // Honor backpressure: `write()` returning false means the internal
      // buffer is full. Without waiting for 'drain', a slow consumer would
      // cause the entire output to accumulate in memory, defeating the
      // purpose of streaming.
      if (!process.stdout.write(chunk))
        await new Promise<void>(resolve => process.stdout.once('drain', () => resolve()))
      isFirst = false
    }
    // Final newline matches `console.log` behavior of the non-streaming path
    process.stdout.write('\n')
  }
}

View File

@@ -33,15 +33,16 @@ describe('toon CLI', () => {
}
const cleanup = mockStdin(JSON.stringify(data))
const stdout: string[] = []
vi.spyOn(console, 'log').mockImplementation((message?: unknown) => {
stdout.push(String(message ?? ''))
const writeChunks: string[] = []
vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
writeChunks.push(String(chunk))
return true
})
try {
await runCli()
expect(stdout).toHaveLength(1)
expect(stdout[0]).toBe(encode(data))
const fullOutput = writeChunks.join('')
expect(fullOutput).toBe(`${encode(data)}\n`)
}
finally {
cleanup()
@@ -83,16 +84,17 @@ describe('toon CLI', () => {
'input.json': JSON.stringify(data),
})
const stdout: string[] = []
vi.spyOn(console, 'log').mockImplementation((message?: unknown) => {
stdout.push(String(message ?? ''))
const writeChunks: string[] = []
vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
writeChunks.push(String(chunk))
return true
})
try {
await context.run(['input.json'])
expect(stdout).toHaveLength(1)
expect(stdout[0]).toBe(encode(data))
const fullOutput = writeChunks.join('')
expect(fullOutput).toBe(`${encode(data)}\n`)
}
finally {
await context.cleanup()
@@ -230,16 +232,17 @@ describe('toon CLI', () => {
const data = { items: [1, 2, 3] }
const cleanup = mockStdin(JSON.stringify(data))
const stdout: string[] = []
vi.spyOn(console, 'log').mockImplementation((message?: unknown) => {
stdout.push(String(message ?? ''))
const writeChunks: string[] = []
vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
writeChunks.push(String(chunk))
return true
})
try {
await runCli({ rawArgs: ['--delimiter', '|'] })
expect(stdout).toHaveLength(1)
expect(stdout[0]).toBe(encode(data, { delimiter: '|' }))
const fullOutput = writeChunks.join('')
expect(fullOutput).toBe(`${encode(data, { delimiter: '|' })}\n`)
}
finally {
cleanup()
@@ -254,16 +257,17 @@ describe('toon CLI', () => {
}
const cleanup = mockStdin(JSON.stringify(data))
const stdout: string[] = []
vi.spyOn(console, 'log').mockImplementation((message?: unknown) => {
stdout.push(String(message ?? ''))
const writeChunks: string[] = []
vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
writeChunks.push(String(chunk))
return true
})
try {
await runCli({ rawArgs: ['--indent', '4'] })
expect(stdout).toHaveLength(1)
expect(stdout[0]).toBe(encode(data, { indent: 4 }))
const fullOutput = writeChunks.join('')
expect(fullOutput).toBe(`${encode(data, { indent: 4 })}\n`)
}
finally {
cleanup()
@@ -293,6 +297,138 @@ describe('toon CLI', () => {
})
})
describe('streaming output', () => {
  it('streams large JSON to TOON file with identical output', async () => {
    const records = Array.from({ length: 1000 }, (_, i) => ({
      id: i,
      name: `Item ${i}`,
      value: Math.random(),
    }))
    const data = { items: records }
    const ctx = await createCliTestContext({
      'large-input.json': JSON.stringify(data, undefined, 2),
    })
    const successSpy = vi.spyOn(consola, 'success').mockImplementation(() => undefined)
    try {
      await ctx.run(['large-input.json', '--output', 'output.toon'])
      // Streaming must produce byte-identical output to `encode()`
      const expected = encode(data, {
        delimiter: DEFAULT_DELIMITER,
        indent: 2,
      })
      expect(await ctx.read('output.toon')).toBe(expected)
      expect(successSpy).toHaveBeenCalledWith(expect.stringMatching(/Encoded .* → .*/))
    }
    finally {
      await ctx.cleanup()
    }
  })
  it('streams to stdout using process.stdout.write', async () => {
    const data = {
      users: [
        { id: 1, name: 'Alice' },
        { id: 2, name: 'Bob' },
      ],
    }
    const ctx = await createCliTestContext({
      'input.json': JSON.stringify(data),
    })
    const chunks: string[] = []
    const stdoutSpy = vi.spyOn(process.stdout, 'write').mockImplementation((chunk) => {
      chunks.push(String(chunk))
      return true
    })
    try {
      await ctx.run(['input.json'])
      expect(stdoutSpy).toHaveBeenCalled()
      // Reassembled chunks must equal the non-streaming `encode()` result
      expect(chunks.join('')).toBe(`${encode(data)}\n`)
    }
    finally {
      await ctx.cleanup()
    }
  })
  it('handles empty object streaming correctly', async () => {
    const data = {}
    const ctx = await createCliTestContext({
      'empty.json': JSON.stringify(data),
    })
    try {
      await ctx.run(['empty.json', '--output', 'output.toon'])
      expect(await ctx.read('output.toon')).toBe(encode(data))
    }
    finally {
      await ctx.cleanup()
    }
  })
  it('handles single-line output streaming correctly', async () => {
    const data = { key: 'value' }
    const ctx = await createCliTestContext({
      'single.json': JSON.stringify(data),
    })
    try {
      await ctx.run(['single.json', '--output', 'output.toon'])
      expect(await ctx.read('output.toon')).toBe(encode(data))
    }
    finally {
      await ctx.cleanup()
    }
  })
  it('uses non-streaming path when stats are enabled', async () => {
    const data = {
      items: [
        { id: 1, value: 'test' },
        { id: 2, value: 'data' },
      ],
    }
    const ctx = await createCliTestContext({
      'input.json': JSON.stringify(data),
    })
    const logSpy = vi.spyOn(console, 'log').mockImplementation(() => undefined)
    const infoSpy = vi.spyOn(consola, 'info').mockImplementation(() => undefined)
    const successSpy = vi.spyOn(consola, 'success').mockImplementation(() => undefined)
    try {
      await ctx.run(['input.json', '--stats'])
      // Stats path keeps the full string in memory and prints via console.log
      expect(logSpy).toHaveBeenCalledWith(encode(data))
      expect(infoSpy).toHaveBeenCalledWith(expect.stringMatching(/Token estimates:/))
      expect(successSpy).toHaveBeenCalledWith(expect.stringMatching(/Saved.*tokens/))
    }
    finally {
      await ctx.cleanup()
    }
  })
})
describe('error handling', () => {
it('rejects invalid delimiter', async () => {
const context = await createCliTestContext({