diff --git a/README.md b/README.md index 0c14804..a1358ac 100644 --- a/README.md +++ b/README.md @@ -777,6 +777,46 @@ for (const line of encodeLines(largeData)) { } ``` +**Streaming decode:** + +```ts +import { decodeFromLines, decodeStreamSync } from '@toon-format/toon' + +// 1. Lines → value (build full JSON value) +const value = decodeFromLines([ + 'users[2]{id,name}:', + ' 1,Alice', + ' 2,Bob', +]) +// { users: [{ id: 1, name: 'Alice' }, { id: 2, name: 'Bob' }] } + +// 2. Lines → events (for custom streaming consumers) +const lines = [ + 'users[2]{id,name}:', + ' 1,Alice', + ' 2,Bob', +] +for (const event of decodeStreamSync(lines)) { + // { type: 'startObject' }, { type: 'key', key: 'users' }, ... +} +``` + +**Async streaming decode:** + +```ts +// 3. Async streaming from files or network +import { createReadStream } from 'node:fs' +import { createInterface } from 'node:readline' +import { decodeStream } from '@toon-format/toon' + +const fileStream = createReadStream('data.toon', 'utf-8') +const rl = createInterface({ input: fileStream }) + +for await (const event of decodeStream(rl)) { + // Process events as they arrive +} +``` + ## Playgrounds Experiment with TOON format interactively using these community-built tools for token comparison, format conversion, and validation: diff --git a/docs/cli/index.md b/docs/cli/index.md index 2312413..7001de7 100644 --- a/docs/cli/index.md +++ b/docs/cli/index.md @@ -108,19 +108,25 @@ cat data.toon | toon --decode Both encoding and decoding operations use streaming output, writing incrementally without building the full output string in memory. This makes the CLI efficient for large datasets without requiring additional configuration. -**JSON → TOON (Encode)** -- Streams TOON lines to output -- No full TOON string in memory +**JSON → TOON (Encode)**: -**TOON → JSON (Decode)** -- Streams JSON tokens to output -- No full JSON string in memory +- Streams TOON lines to output. +- No full TOON string in memory. + +**TOON → JSON (Decode)**: + +- Uses the same event-based streaming decoder as the `decodeStream` API in `@toon-format/toon`. +- Streams JSON tokens to output. +- No full JSON string in memory. +- When `--expand-paths safe` is enabled, falls back to non-streaming decode internally to apply deep-merge expansion before writing JSON. + +Process large files with minimal memory usage: ```bash -# Encode large JSON file with minimal memory usage +# Encode large JSON file toon huge-dataset.json -o output.toon -# Decode large TOON file with minimal memory usage +# Decode large TOON file toon huge-dataset.toon -o output.json # Process millions of records efficiently via stdin diff --git a/docs/guide/getting-started.md b/docs/guide/getting-started.md index 1a0ae8d..cedd77b 100644 --- a/docs/guide/getting-started.md +++ b/docs/guide/getting-started.md @@ -237,3 +237,5 @@ Round-tripping is lossless: `decode(encode(x))` always equals `x` (after normali ## Where to Go Next Now that you've seen your first TOON document, read the [Format Overview](/guide/format-overview) for complete syntax details (objects, arrays, quoting rules, key folding), then explore [Using TOON with LLMs](/guide/llm-prompts) to see how to use it effectively in prompts. For implementation details, check the [API reference](/reference/api) (TypeScript) or the [specification](/reference/spec) (language-agnostic normative rules). + +For large datasets or streaming use-cases, see `encodeLines`, `decodeFromLines`, and `decodeStream` in the [API reference](/reference/api). 
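+
+If you want a quick taste before diving into the reference, here is a minimal sketch of streaming encode to a file (the sample data is illustrative, and it assumes the lines yielded by `encodeLines` do not include trailing newlines):
+
+```ts
+import { createWriteStream } from 'node:fs'
+import { encodeLines } from '@toon-format/toon'
+
+// Illustrative payload: replace with your own data source
+const largeData = { users: [{ id: 1, name: 'Alice' }, { id: 2, name: 'Bob' }] }
+
+const out = createWriteStream('large-dataset.toon')
+for (const line of encodeLines(largeData)) {
+  // Assumption: yielded lines have no trailing newline, so add one when writing
+  out.write(`${line}\n`)
+}
+out.end()
+```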
diff --git a/docs/guide/llm-prompts.md b/docs/guide/llm-prompts.md index ba82352..7ab2d30 100644 --- a/docs/guide/llm-prompts.md +++ b/docs/guide/llm-prompts.md @@ -118,6 +118,31 @@ toon large-dataset.json --output output.toon This streaming approach prevents out-of-memory errors when preparing large context windows for LLMs. For complete details on `encodeLines()`, see the [API reference](/reference/api#encodelines). +**Consuming streaming LLM outputs:** If your LLM client exposes streaming text and you buffer by lines, you can decode TOON incrementally: + +```ts +import { decodeFromLines } from '@toon-format/toon' + +// Buffer streaming response into lines +const lines: string[] = [] +let buffer = '' + +for await (const chunk of modelStream) { + buffer += chunk + let index: number + + while ((index = buffer.indexOf('\n')) !== -1) { + lines.push(buffer.slice(0, index)) + buffer = buffer.slice(index + 1) + } +} + +// Decode buffered lines +const data = decodeFromLines(lines) +``` + +For streaming decode APIs, see [`decodeFromLines()`](/reference/api#decodeFromLines-lines-options) and [`decodeStream()`](/reference/api#decodeStream-source-options). + ## Tips and Pitfalls **Show, don't describe.** Don't explain TOON syntax in detail – just show an example. Models learn the pattern from context. A simple code block with 2-5 rows is more effective than paragraphs of explanation. diff --git a/docs/reference/api.md b/docs/reference/api.md index bbcb4c7..9796508 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -300,6 +300,227 @@ decode(toon, { expandPaths: 'safe', strict: false }) ``` ::: +## `decodeFromLines(lines, options?)` + +Decodes TOON format from pre-split lines into a JavaScript value. This is a streaming-friendly wrapper around the event-based decoder that builds the full value in memory. + +Useful when you already have lines as an array or iterable (e.g., from file streams, readline interfaces, or network responses) and want the standard decode behavior with path expansion support. + +### Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `lines` | `Iterable` | Iterable of TOON lines (without trailing newlines) | +| `options` | `DecodeOptions?` | Optional decoding configuration (see below) | + +### Options + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `indent` | `number` | `2` | Expected number of spaces per indentation level | +| `strict` | `boolean` | `true` | Enable strict validation (array counts, indentation, delimiter consistency) | +| `expandPaths` | `'off'` \| `'safe'` | `'off'` | Enable path expansion to reconstruct dotted keys into nested objects | + +### Return Value + +Returns a `JsonValue` (the parsed JavaScript value: object, array, or primitive). 
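+
+For well-formed input this should match what `decode` returns on the joined text. A quick sanity-check sketch (the sample document is illustrative):
+
+```ts
+import { decode, decodeFromLines } from '@toon-format/toon'
+
+const toon = 'users[2]{id,name}:\n  1,Alice\n  2,Bob'
+
+// Both paths should yield the same value for well-formed input
+const fromString = decode(toon)
+const fromLines = decodeFromLines(toon.split('\n'))
+console.log(JSON.stringify(fromString) === JSON.stringify(fromLines)) // expected: true
+```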
+ +### Example + +**Basic usage with arrays:** + +```ts +import { decodeFromLines } from '@toon-format/toon' + +const lines = ['name: Alice', 'age: 30'] +const value = decodeFromLines(lines) +// { name: 'Alice', age: 30 } +``` + +**Streaming from Node.js readline:** + +```ts +import { createReadStream } from 'node:fs' +import { createInterface } from 'node:readline' +import { decodeFromLines } from '@toon-format/toon' + +const rl = createInterface({ + input: createReadStream('data.toon'), + crlfDelay: Infinity, +}) + +const value = decodeFromLines(rl) +console.log(value) +``` + +**With path expansion:** + +```ts +const lines = ['user.name: Alice', 'user.age: 30'] +const value = decodeFromLines(lines, { expandPaths: 'safe' }) +// { user: { name: 'Alice', age: 30 } } +``` + +## `decodeStreamSync(lines, options?)` + +Synchronously decodes TOON lines into a stream of JSON events. This function yields structured events that represent the JSON data model without building the full value tree. + +Useful for streaming processing, custom transformations, or memory-efficient parsing of large datasets where you don't need the full value in memory. + +::: info Event Streaming +This is a low-level API that returns individual parse events. For most use cases, [`decodeFromLines()`](#decodeFromLines-lines-options) or [`decode()`](#decode-input-options) are more convenient. + +Path expansion (`expandPaths: 'safe'`) is **not supported** in streaming mode since it requires the full value tree. +::: + +### Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `lines` | `Iterable` | Iterable of TOON lines (without trailing newlines) | +| `options` | `DecodeStreamOptions?` | Optional streaming decoding configuration (see below) | + +### Options + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `indent` | `number` | `2` | Expected number of spaces per indentation level | +| `strict` | `boolean` | `true` | Enable strict validation (array counts, indentation, delimiter consistency) | + +### Return Value + +Returns an `Iterable` that yields structured events. + +### Event Types + +Events represent the structure of the JSON data model: + +```ts +type JsonStreamEvent + = | { type: 'startObject' } + | { type: 'endObject' } + | { type: 'startArray' } + | { type: 'endArray' } + | { type: 'key', key: string } + | { type: 'primitive', value: JsonPrimitive } + +type JsonPrimitive = string | number | boolean | null +``` + +### Example + +**Basic event streaming:** + +```ts +import { decodeStreamSync } from '@toon-format/toon' + +const lines = ['name: Alice', 'age: 30'] + +for (const event of decodeStreamSync(lines)) { + console.log(event) +} + +// Output: +// { type: 'startObject' } +// { type: 'key', key: 'name' } +// { type: 'primitive', value: 'Alice' } +// { type: 'key', key: 'age' } +// { type: 'primitive', value: 30 } +// { type: 'endObject' } +``` + +**Custom processing:** + +```ts +import { decodeStreamSync } from '@toon-format/toon' + +const lines = ['users[2]{id,name}:', ' 1,Alice', ' 2,Bob'] +let userCount = 0 + +for (const event of decodeStreamSync(lines)) { + if (event.type === 'endObject' && userCount < 2) { + userCount++ + console.log(`Processed user ${userCount}`) + } +} +``` + +## `decodeStream(source, options?)` + +Asynchronously decodes TOON lines into a stream of JSON events. This is the async version of [`decodeStreamSync()`](#decodeStreamSync-lines-options), supporting both synchronous and asynchronous iterables. 
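+
+Because any async iterable of lines works, sources beyond Node streams can be adapted as well. A sketch, assuming a `fetch` response with a TOON body (the `linesFrom` helper below is illustrative and not part of the package):
+
+```ts
+import { decodeStream } from '@toon-format/toon'
+
+// Illustrative helper: split a web ReadableStream of bytes into text lines
+async function* linesFrom(body: ReadableStream<Uint8Array>): AsyncIterable<string> {
+  const reader = body.pipeThrough(new TextDecoderStream()).getReader()
+  let buffer = ''
+
+  while (true) {
+    const { value, done } = await reader.read()
+    if (done)
+      break
+
+    buffer += value
+    let index: number
+
+    while ((index = buffer.indexOf('\n')) !== -1) {
+      yield buffer.slice(0, index)
+      buffer = buffer.slice(index + 1)
+    }
+  }
+
+  // Emit the final line if the body does not end with a newline
+  if (buffer.length > 0)
+    yield buffer
+}
+
+const response = await fetch('https://example.com/data.toon')
+for await (const event of decodeStream(linesFrom(response.body!))) {
+  // Process events as they arrive
+}
+```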
+ +Useful for processing file streams, network responses, or other async sources where you want to handle data incrementally as it arrives. + +### Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `source` | `AsyncIterable` \| `Iterable` | Async or sync iterable of TOON lines (without trailing newlines) | +| `options` | `DecodeStreamOptions?` | Optional streaming decoding configuration (see below) | + +### Options + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `indent` | `number` | `2` | Expected number of spaces per indentation level | +| `strict` | `boolean` | `true` | Enable strict validation (array counts, indentation, delimiter consistency) | + +### Return Value + +Returns an `AsyncIterable` that yields structured events asynchronously. + +### Example + +**Streaming from file:** + +```ts +import { createReadStream } from 'node:fs' +import { createInterface } from 'node:readline' +import { decodeStream } from '@toon-format/toon' + +const fileStream = createReadStream('data.toon', 'utf-8') +const rl = createInterface({ input: fileStream, crlfDelay: Infinity }) + +for await (const event of decodeStream(rl)) { + console.log(event) + // Process events as they arrive +} +``` + +**Processing events incrementally:** + +```ts +import { decodeStream } from '@toon-format/toon' + +const lines = getAsyncLineSource() // AsyncIterable + +for await (const event of decodeStream(lines, { strict: true })) { + if (event.type === 'key' && event.key === 'id') { + // Next event will be the id value + const valueEvent = await decodeStream(lines).next() + if (valueEvent.value?.type === 'primitive') { + console.log('Found ID:', valueEvent.value.value) + } + } +} +``` + +**Auto-detection of sync/async sources:** + +```ts +// Works with sync iterables +const syncLines = ['name: Alice', 'age: 30'] +for await (const event of decodeStream(syncLines)) { + console.log(event) +} + +// Works with async iterables +const asyncLines = readLinesFromNetwork() +for await (const event of decodeStream(asyncLines)) { + console.log(event) +} +``` + ## Round-Trip Compatibility TOON provides lossless round-trips after normalization: diff --git a/eslint.config.mjs b/eslint.config.mjs index 4ad5b7d..a1f447f 100644 --- a/eslint.config.mjs +++ b/eslint.config.mjs @@ -1,10 +1,15 @@ // @ts-check import antfu from '@antfu/eslint-config' -export default antfu().append({ +export default antfu({ + rules: { + 'no-cond-assign': 'off', + }, +}).append({ files: ['README.md', 'SPEC.md', '**/docs/**/*'], rules: { - 'yaml/quotes': 'off', + 'import/no-duplicates': 'off', 'style/no-tabs': 'off', + 'yaml/quotes': 'off', }, }) diff --git a/packages/cli/README.md b/packages/cli/README.md index 1ef6eb2..a895829 100644 --- a/packages/cli/README.md +++ b/packages/cli/README.md @@ -134,8 +134,9 @@ cat million-records.toon | toon --decode > output.json **Memory efficiency:** - **Encode (JSON → TOON)**: Streams TOON lines to output without full string in memory -- **Decode (TOON → JSON)**: Streams JSON tokens to output without full string in memory +- **Decode (TOON → JSON)**: Uses the same event-based streaming decoder as the `decodeStream` API in `@toon-format/toon`, streaming JSON tokens to output without full string in memory - Peak memory usage scales with data depth, not total size +- When `--expand-paths safe` is enabled, decode falls back to non-streaming mode internally to apply deep-merge expansion before writing JSON > [!NOTE] > When using `--stats` with encode, the 
full output string is kept in memory for token counting. Omit `--stats` for maximum memory efficiency with very large datasets. diff --git a/packages/cli/src/conversion.ts b/packages/cli/src/conversion.ts index aa341bb..ad7b393 100644 --- a/packages/cli/src/conversion.ts +++ b/packages/cli/src/conversion.ts @@ -1,14 +1,15 @@ import type { FileHandle } from 'node:fs/promises' -import type { DecodeOptions, EncodeOptions } from '../../toon/src' +import type { DecodeOptions, DecodeStreamOptions, EncodeOptions } from '../../toon/src' import type { InputSource } from './types' import * as fsp from 'node:fs/promises' import * as path from 'node:path' import process from 'node:process' import { consola } from 'consola' import { estimateTokenCount } from 'tokenx' -import { decode, encode, encodeLines } from '../../toon/src' +import { decode, decodeStream, encode, encodeLines } from '../../toon/src' +import { jsonStreamFromEvents } from './json-from-events' import { jsonStringifyLines } from './json-stringify-stream' -import { formatInputLabel, readInput } from './utils' +import { formatInputLabel, readInput, readLinesFromSource } from './utils' export async function encodeToToon(config: { input: InputSource @@ -80,22 +81,43 @@ export async function decodeToJson(config: { strict: NonNullable expandPaths?: NonNullable }): Promise { - const toonContent = await readInput(config.input) + // Path expansion requires full value in memory, so use non-streaming path + if (config.expandPaths === 'safe') { + const toonContent = await readInput(config.input) - let data: unknown - try { - const decodeOptions: DecodeOptions = { - indent: config.indent, - strict: config.strict, - expandPaths: config.expandPaths, + let data: unknown + try { + const decodeOptions: DecodeOptions = { + indent: config.indent, + strict: config.strict, + expandPaths: config.expandPaths, + } + data = decode(toonContent, decodeOptions) + } + catch (error) { + throw new Error(`Failed to decode TOON: ${error instanceof Error ? error.message : String(error)}`) } - data = decode(toonContent, decodeOptions) - } - catch (error) { - throw new Error(`Failed to decode TOON: ${error instanceof Error ? error.message : String(error)}`) - } - await writeStreamingJson(jsonStringifyLines(data, config.indent), config.output) + await writeStreamingJson(jsonStringifyLines(data, config.indent), config.output) + } + else { + try { + const lineSource = readLinesFromSource(config.input) + + const decodeStreamOptions: DecodeStreamOptions = { + indent: config.indent, + strict: config.strict, + } + + const events = decodeStream(lineSource, decodeStreamOptions) + const jsonChunks = jsonStreamFromEvents(events, config.indent) + + await writeStreamingJson(jsonChunks, config.output) + } + catch (error) { + throw new Error(`Failed to decode TOON: ${error instanceof Error ? error.message : String(error)}`) + } + } if (config.output) { const relativeInputPath = formatInputLabel(config.input) @@ -109,7 +131,7 @@ export async function decodeToJson(config: { * Chunks are written one at a time without building the full string in memory. 
*/ async function writeStreamingJson( - chunks: Iterable, + chunks: AsyncIterable | Iterable, outputPath?: string, ): Promise { // Stream to file using fs/promises API @@ -119,7 +141,7 @@ async function writeStreamingJson( try { fileHandle = await fsp.open(outputPath, 'w') - for (const chunk of chunks) { + for await (const chunk of chunks) { await fileHandle.write(chunk) } } @@ -129,7 +151,7 @@ async function writeStreamingJson( } // Stream to stdout else { - for (const chunk of chunks) { + for await (const chunk of chunks) { process.stdout.write(chunk) } diff --git a/packages/cli/src/json-from-events.ts b/packages/cli/src/json-from-events.ts new file mode 100644 index 0000000..c2477b8 --- /dev/null +++ b/packages/cli/src/json-from-events.ts @@ -0,0 +1,217 @@ +import type { JsonStreamEvent } from '../../toon/src/types' + +/** + * Context for tracking JSON structure state during event streaming. + */ +type JsonContext + = | { type: 'object', needsComma: boolean, expectValue: boolean } + | { type: 'array', needsComma: boolean } + +/** + * Converts a stream of `JsonStreamEvent` into formatted JSON string chunks. + * + * Similar to `jsonStringifyLines` but driven by events instead of a value tree. + * Useful for streaming TOON decode directly to JSON output without building + * the full data structure in memory. + * + * @param events - Async iterable of JSON stream events + * @param indent - Number of spaces for indentation (0 = compact, >0 = pretty) + * @returns Async iterable of JSON string chunks + * + * @example + * ```ts + * const lines = readLinesFromSource(input) + * const events = decodeStream(lines) + * for await (const chunk of jsonStreamFromEvents(events, 2)) { + * process.stdout.write(chunk) + * } + * ``` + */ +export async function* jsonStreamFromEvents( + events: AsyncIterable, + indent: number = 2, +): AsyncIterable { + const stack: JsonContext[] = [] + let depth = 0 + + for await (const event of events) { + const parent = stack.length > 0 ? stack[stack.length - 1] : undefined + + switch (event.type) { + case 'startObject': { + // Emit comma if needed (inside array or after previous object field value) + if (parent) { + if (parent.type === 'array' && parent.needsComma) { + yield ',' + } + else if (parent.type === 'object' && !parent.expectValue) { + // Object field value already emitted, this is a nested object after a key + // The comma is handled by the key event + } + } + + // Emit newline and indent for pretty printing + if (indent > 0 && parent) { + if (parent.type === 'array') { + yield '\n' + yield ' '.repeat(depth * indent) + } + } + + yield '{' + stack.push({ type: 'object', needsComma: false, expectValue: false }) + depth++ + break + } + + case 'endObject': { + const context = stack.pop() + if (!context || context.type !== 'object') { + throw new Error('Mismatched endObject event') + } + + depth-- + + // Emit newline and indent for closing brace (pretty print) + if (indent > 0 && context.needsComma) { + yield '\n' + yield ' '.repeat(depth * indent) + } + + yield '}' + + // Mark parent as needing comma for next item + const newParent = stack.length > 0 ? 
stack[stack.length - 1] : undefined + if (newParent) { + if (newParent.type === 'object') { + newParent.expectValue = false + newParent.needsComma = true + } + else if (newParent.type === 'array') { + newParent.needsComma = true + } + } + break + } + + case 'startArray': { + // Emit comma if needed + if (parent) { + if (parent.type === 'array' && parent.needsComma) { + yield ',' + } + } + + // Emit newline and indent for pretty printing + if (indent > 0 && parent) { + if (parent.type === 'array') { + yield '\n' + yield ' '.repeat(depth * indent) + } + } + + yield '[' + stack.push({ + type: 'array', + needsComma: false, + }) + depth++ + break + } + + case 'endArray': { + const context = stack.pop() + if (!context || context.type !== 'array') { + throw new Error('Mismatched endArray event') + } + + depth-- + + // Emit newline and indent for closing bracket (pretty print) + if (indent > 0 && context.needsComma) { + yield '\n' + yield ' '.repeat(depth * indent) + } + + yield ']' + + // Mark parent as needing comma for next item + const newParent = stack.length > 0 ? stack[stack.length - 1] : undefined + if (newParent) { + if (newParent.type === 'object') { + newParent.expectValue = false + newParent.needsComma = true + } + else if (newParent.type === 'array') { + newParent.needsComma = true + } + } + break + } + + case 'key': { + if (!parent || parent.type !== 'object') { + throw new Error('Key event outside of object context') + } + + // Emit comma before this field if needed + if (parent.needsComma) { + yield ',' + } + + // Emit newline and indent (pretty print) + if (indent > 0) { + yield '\n' + yield ' '.repeat(depth * indent) + } + + // Emit key + yield JSON.stringify(event.key) + yield indent > 0 ? ': ' : ':' + + parent.expectValue = true + parent.needsComma = true + break + } + + case 'primitive': { + // Emit comma if needed + if (parent) { + if (parent.type === 'array' && parent.needsComma) { + yield ',' + } + else if (parent.type === 'object' && !parent.expectValue) { + // This shouldn't happen in well-formed events + throw new Error('Primitive event in object without preceding key') + } + } + + // Emit newline and indent for array items (pretty print) + if (indent > 0 && parent && parent.type === 'array') { + yield '\n' + yield ' '.repeat(depth * indent) + } + + // Emit primitive value + yield JSON.stringify(event.value) + + // Update parent context + if (parent) { + if (parent.type === 'object') { + parent.expectValue = false + // needsComma already true from key event + } + else if (parent.type === 'array') { + parent.needsComma = true + } + } + break + } + } + } + + // Ensure stack is empty + if (stack.length !== 0) { + throw new Error('Incomplete event stream: unclosed objects or arrays') + } +} diff --git a/packages/cli/src/utils.ts b/packages/cli/src/utils.ts index 9605d90..da15990 100644 --- a/packages/cli/src/utils.ts +++ b/packages/cli/src/utils.ts @@ -1,4 +1,5 @@ import type { InputSource } from './types' +import { createReadStream } from 'node:fs' import * as fsp from 'node:fs/promises' import * as path from 'node:path' import process from 'node:process' @@ -77,3 +78,32 @@ function readFromStdin(): Promise { stdin.resume() }) } + +export async function* readLinesFromSource(source: InputSource): AsyncIterable { + const stream = source.type === 'stdin' + ? 
process.stdin + : createReadStream(source.path, { encoding: 'utf-8' }) + + // Explicitly set encoding for stdin + if (source.type === 'stdin') { + stream.setEncoding('utf-8') + } + + let buffer = '' + + for await (const chunk of stream) { + buffer += chunk + let index: number + + while ((index = buffer.indexOf('\n')) !== -1) { + const line = buffer.slice(0, index) + buffer = buffer.slice(index + 1) + yield line + } + } + + // Emit last line if buffer is not empty and doesn't end with newline + if (buffer.length > 0) { + yield buffer + } +} diff --git a/packages/toon/src/decode/decoders.ts b/packages/toon/src/decode/decoders.ts index d3dad68..58a15be 100644 --- a/packages/toon/src/decode/decoders.ts +++ b/packages/toon/src/decode/decoders.ts @@ -1,69 +1,231 @@ -import type { ArrayHeaderInfo, Depth, JsonArray, JsonObject, JsonPrimitive, JsonValue, ParsedLine, ResolvedDecodeOptions } from '../types' -import type { ObjectWithQuotedKeys } from './expand' -import type { LineCursor } from './scanner' -import { COLON, DEFAULT_DELIMITER, DOT, LIST_ITEM_PREFIX } from '../constants' +import type { ArrayHeaderInfo, DecodeStreamOptions, Depth, JsonStreamEvent, ParsedLine } from '../types' +import type { StreamingScanState } from './scanner' +import { COLON, DEFAULT_DELIMITER, LIST_ITEM_MARKER, LIST_ITEM_PREFIX } from '../constants' import { findClosingQuote } from '../shared/string-utils' -import { QUOTED_KEY_MARKER } from './expand' -import { isArrayHeaderAfterHyphen, isObjectFirstFieldAfterHyphen, mapRowValuesToPrimitives, parseArrayHeaderLine, parseDelimitedValues, parseKeyToken, parsePrimitiveToken } from './parser' +import { isArrayHeaderContent, isKeyValueContent, mapRowValuesToPrimitives, parseArrayHeaderLine, parseDelimitedValues, parseKeyToken, parsePrimitiveToken } from './parser' +import { createScanState, parseLinesAsync, parseLinesSync } from './scanner' import { assertExpectedCount, validateNoBlankLinesInRange, validateNoExtraListItems, validateNoExtraTabularRows } from './validation' -// #region Entry decoding +interface DecoderContext { indent: number, strict: boolean } -export function decodeValueFromLines(cursor: LineCursor, options: ResolvedDecodeOptions): JsonValue { - const first = cursor.peek() - if (!first) { - throw new ReferenceError('No content to decode') +// #region Streaming line cursor + +class StreamingLineCursor { + private buffer: ParsedLine[] = [] + private generator: Iterator | AsyncIterator + private done = false + private lastLine: ParsedLine | undefined + private scanState: StreamingScanState + + constructor( + generator: Iterator | AsyncIterator, + scanState: StreamingScanState, + ) { + this.generator = generator + this.scanState = scanState } - // Check for root array - if (isArrayHeaderAfterHyphen(first.content)) { - const headerInfo = parseArrayHeaderLine(first.content, DEFAULT_DELIMITER) - if (headerInfo) { - cursor.advance() // Move past the header line - return decodeArrayFromHeader(headerInfo.header, headerInfo.inlineValues, cursor, 0, options) + getBlankLines() { + return this.scanState.blankLines + } + + async peek(): Promise { + if (this.buffer.length > 0) { + return this.buffer[0] } - } - // Check for single primitive value - if (cursor.length === 1 && !isKeyValueLine(first)) { - return parsePrimitiveToken(first.content.trim()) - } - - // Default to object - return decodeObject(cursor, 0, options) -} - -function isKeyValueLine(line: ParsedLine): boolean { - const content = line.content - // Look for unquoted colon or quoted key followed by colon - if 
(content.startsWith('"')) { - // Quoted key - find the closing quote - const closingQuoteIndex = findClosingQuote(content, 0) - if (closingQuoteIndex === -1) { - return false + if (this.done) { + return undefined } - // Check if colon exists after quoted key (may have array/brace syntax between) - return content.slice(closingQuoteIndex + 1).includes(COLON) + + const result = await this.generator.next() + if (result.done) { + this.done = true + return undefined + } + + this.buffer.push(result.value) + return result.value } - else { - // Unquoted key - look for first colon not inside quotes - return content.includes(COLON) + + async next(): Promise { + const line = await this.peek() + if (line !== undefined) { + this.buffer.shift() + this.lastLine = line + } + + return line + } + + async advance(): Promise { + await this.next() + } + + current(): ParsedLine | undefined { + return this.lastLine + } + + async atEnd(): Promise { + return (await this.peek()) === undefined + } + + peekSync(): ParsedLine | undefined { + if (this.buffer.length > 0) { + return this.buffer[0] + } + + if (this.done) { + return undefined + } + + const result = (this.generator as Iterator).next() + if (result.done) { + this.done = true + return undefined + } + + this.buffer.push(result.value) + return result.value + } + + nextSync(): ParsedLine | undefined { + const line = this.peekSync() + if (line !== undefined) { + this.buffer.shift() + this.lastLine = line + } + + return line + } + + advanceSync(): void { + this.nextSync() + } + + atEndSync(): boolean { + return this.peekSync() === undefined } } // #endregion -// #region Object decoding +// #region Synchronous streaming decode -function decodeObject(cursor: LineCursor, baseDepth: Depth, options: ResolvedDecodeOptions): JsonObject { - const obj: JsonObject = {} - const quotedKeys: Set = new Set() +export function* decodeStreamSync( + source: Iterable, + options?: DecodeStreamOptions, +): Generator { + // Validate options + if (options?.expandPaths !== undefined) { + throw new Error('expandPaths is not supported in streaming decode') + } - // Detect the actual depth of the first field (may differ from baseDepth in nested structures) + const resolvedOptions: DecoderContext = { + indent: options?.indent ?? 2, + strict: options?.strict ?? 
true, + } + + const scanState = createScanState() + const lineGenerator = parseLinesSync(source, resolvedOptions.indent, resolvedOptions.strict, scanState) + const cursor = new StreamingLineCursor(lineGenerator, scanState) + + // Get first line to determine root form + const first = cursor.peekSync() + if (!first) { + // Empty input decodes to empty object + yield { type: 'startObject' } + yield { type: 'endObject' } + return + } + + // Check for root array + if (isArrayHeaderContent(first.content)) { + const headerInfo = parseArrayHeaderLine(first.content, DEFAULT_DELIMITER) + if (headerInfo) { + cursor.advanceSync() + yield* decodeArrayFromHeaderSync(headerInfo.header, headerInfo.inlineValues, cursor, 0, resolvedOptions) + return + } + } + + // Check for single primitive + cursor.advanceSync() + const hasMore = !cursor.atEndSync() + if (!hasMore && !isKeyValueLineSync(first)) { + // Single non-key-value line is root primitive + yield { type: 'primitive', value: parsePrimitiveToken(first.content.trim()) } + return + } + + // Root object + yield { type: 'startObject' } + yield* decodeKeyValueSync(first.content, cursor, 0, resolvedOptions) + + // Process remaining object fields + while (!cursor.atEndSync()) { + const line = cursor.peekSync() + if (!line || line.depth !== 0) { + break + } + + cursor.advanceSync() + yield* decodeKeyValueSync(line.content, cursor, 0, resolvedOptions) + } + + yield { type: 'endObject' } +} + +function* decodeKeyValueSync( + content: string, + cursor: StreamingLineCursor, + baseDepth: Depth, + options: DecoderContext, +): Generator { + // Check for array header first + const arrayHeader = parseArrayHeaderLine(content, DEFAULT_DELIMITER) + if (arrayHeader && arrayHeader.header.key) { + yield { type: 'key', key: arrayHeader.header.key } + yield* decodeArrayFromHeaderSync(arrayHeader.header, arrayHeader.inlineValues, cursor, baseDepth, options) + return + } + + // Regular key-value pair + const { key, isQuoted } = parseKeyToken(content, 0) + const colonIndex = content.indexOf(COLON, key.length) + const rest = colonIndex >= 0 ? content.slice(colonIndex + 1).trim() : '' + + yield isQuoted ? 
{ type: 'key', key, wasQuoted: true } : { type: 'key', key } + + // No value after colon - expect nested object or empty + if (!rest) { + const nextLine = cursor.peekSync() + if (nextLine && nextLine.depth > baseDepth) { + yield { type: 'startObject' } + yield* decodeObjectFieldsSync(cursor, baseDepth + 1, options) + yield { type: 'endObject' } + return + } + + // Empty object + yield { type: 'startObject' } + yield { type: 'endObject' } + return + } + + // Inline primitive value + yield { type: 'primitive', value: parsePrimitiveToken(rest) } +} + +function* decodeObjectFieldsSync( + cursor: StreamingLineCursor, + baseDepth: Depth, + options: DecoderContext, +): Generator { let computedDepth: Depth | undefined - while (!cursor.atEnd()) { - const line = cursor.peek() + while (!cursor.atEndSync()) { + const line = cursor.peekSync() if (!line || line.depth < baseDepth) { break } @@ -73,105 +235,51 @@ function decodeObject(cursor: LineCursor, baseDepth: Depth, options: ResolvedDec } if (line.depth === computedDepth) { - cursor.advance() - const { key, value, isQuoted } = decodeKeyValue(line.content, cursor, computedDepth, options) - obj[key] = value - - // Track quoted dotted keys for expansion phase - if (isQuoted && key.includes(DOT)) { - quotedKeys.add(key) - } + cursor.advanceSync() + yield* decodeKeyValueSync(line.content, cursor, computedDepth, options) } else { - // Different depth (shallower or deeper) - stop object parsing break } } - - // Attach quoted key metadata if any were found - if (quotedKeys.size > 0) { - (obj as ObjectWithQuotedKeys)[QUOTED_KEY_MARKER] = quotedKeys - } - - return obj } -function decodeKeyValue( - content: string, - cursor: LineCursor, - baseDepth: Depth, - options: ResolvedDecodeOptions, -): { key: string, value: JsonValue, followDepth: Depth, isQuoted: boolean } { - // Check for array header first (before parsing key) - const arrayHeader = parseArrayHeaderLine(content, DEFAULT_DELIMITER) - if (arrayHeader && arrayHeader.header.key) { - const decodedValue = decodeArrayFromHeader(arrayHeader.header, arrayHeader.inlineValues, cursor, baseDepth, options) - // After an array, subsequent fields are at baseDepth + 1 (where array content is) - return { - key: arrayHeader.header.key, - value: decodedValue, - followDepth: baseDepth + 1, - isQuoted: false, // Array keys parsed separately in `parseArrayHeaderLine` - } - } - - // Regular key-value pair - const { key, end, isQuoted } = parseKeyToken(content, 0) - const rest = content.slice(end).trim() - - // No value after colon - expect nested object or empty - if (!rest) { - const nextLine = cursor.peek() - if (nextLine && nextLine.depth > baseDepth) { - const nested = decodeObject(cursor, baseDepth + 1, options) - return { key, value: nested, followDepth: baseDepth + 1, isQuoted } - } - // Empty object - return { key, value: {}, followDepth: baseDepth + 1, isQuoted } - } - - // Inline primitive value - const decodedValue = parsePrimitiveToken(rest) - return { key, value: decodedValue, followDepth: baseDepth + 1, isQuoted } -} - -// #endregion - -// #region Array decoding - -function decodeArrayFromHeader( +function* decodeArrayFromHeaderSync( header: ArrayHeaderInfo, inlineValues: string | undefined, - cursor: LineCursor, + cursor: StreamingLineCursor, baseDepth: Depth, - options: ResolvedDecodeOptions, -): JsonArray { + options: DecoderContext, +): Generator { + yield { type: 'startArray', length: header.length } + // Inline primitive array if (inlineValues) { - // For inline arrays, cursor should already be advanced 
or will be by caller - return decodeInlinePrimitiveArray(header, inlineValues, options) + yield* decodeInlinePrimitiveArraySync(header, inlineValues, options) + yield { type: 'endArray' } + return } - // For multi-line arrays (tabular or list), the cursor should already be positioned - // at the array header line, but we haven't advanced past it yet - // Tabular array if (header.fields && header.fields.length > 0) { - return decodeTabularArray(header, cursor, baseDepth, options) + yield* decodeTabularArraySync(header, cursor, baseDepth, options) + yield { type: 'endArray' } + return } // List array - return decodeListArray(header, cursor, baseDepth, options) + yield* decodeListArraySync(header, cursor, baseDepth, options) + yield { type: 'endArray' } } -function decodeInlinePrimitiveArray( +function* decodeInlinePrimitiveArraySync( header: ArrayHeaderInfo, inlineValues: string, - options: ResolvedDecodeOptions, -): JsonPrimitive[] { + options: DecoderContext, +): Generator { if (!inlineValues.trim()) { assertExpectedCount(0, header.length, 'inline array items', options) - return [] + return } const values = parseDelimitedValues(inlineValues, header.delimiter) @@ -179,158 +287,133 @@ function decodeInlinePrimitiveArray( assertExpectedCount(primitives.length, header.length, 'inline array items', options) - return primitives + for (const primitive of primitives) { + yield { type: 'primitive', value: primitive } + } } -function decodeListArray( +function* decodeTabularArraySync( header: ArrayHeaderInfo, - cursor: LineCursor, + cursor: StreamingLineCursor, baseDepth: Depth, - options: ResolvedDecodeOptions, -): JsonValue[] { - const items: JsonValue[] = [] - const itemDepth = baseDepth + 1 - - // Track line range for blank line validation - let startLine: number | undefined - let endLine: number | undefined - - while (!cursor.atEnd() && items.length < header.length) { - const line = cursor.peek() - if (!line || line.depth < itemDepth) { - break - } - - // Check for list item (with or without space after hyphen) - const isListItem = line.content.startsWith(LIST_ITEM_PREFIX) || line.content === '-' - - if (line.depth === itemDepth && isListItem) { - // Track first and last item line numbers - if (startLine === undefined) { - startLine = line.lineNumber - } - endLine = line.lineNumber - - const item = decodeListItem(cursor, itemDepth, options) - items.push(item) - - // Update endLine to the current cursor position (after item was decoded) - const currentLine = cursor.current() - if (currentLine) { - endLine = currentLine.lineNumber - } - } - else { - break - } - } - - assertExpectedCount(items.length, header.length, 'list array items', options) - - // In strict mode, check for blank lines inside the array - if (options.strict && startLine !== undefined && endLine !== undefined) { - validateNoBlankLinesInRange( - startLine, // From first item line - endLine, // To last item line - cursor.getBlankLines(), - options.strict, - 'list array', - ) - } - - // In strict mode, check for extra items - if (options.strict) { - validateNoExtraListItems(cursor, itemDepth, header.length) - } - - return items -} - -function decodeTabularArray( - header: ArrayHeaderInfo, - cursor: LineCursor, - baseDepth: Depth, - options: ResolvedDecodeOptions, -): JsonObject[] { - const objects: JsonObject[] = [] + options: DecoderContext, +): Generator { const rowDepth = baseDepth + 1 - - // Track line range for blank line validation + let rowCount = 0 let startLine: number | undefined let endLine: number | undefined - while 
(!cursor.atEnd() && objects.length < header.length) { - const line = cursor.peek() + while (!cursor.atEndSync() && rowCount < header.length) { + const line = cursor.peekSync() if (!line || line.depth < rowDepth) { break } if (line.depth === rowDepth) { - // Track first and last row line numbers if (startLine === undefined) { startLine = line.lineNumber } endLine = line.lineNumber - cursor.advance() + cursor.advanceSync() const values = parseDelimitedValues(line.content, header.delimiter) assertExpectedCount(values.length, header.fields!.length, 'tabular row values', options) const primitives = mapRowValuesToPrimitives(values) - const obj: JsonObject = {} + yield { type: 'startObject' } for (let i = 0; i < header.fields!.length; i++) { - obj[header.fields![i]!] = primitives[i]! + yield { type: 'key', key: header.fields![i]! } + yield { type: 'primitive', value: primitives[i]! } } + yield { type: 'endObject' } - objects.push(obj) + rowCount++ } else { break } } - assertExpectedCount(objects.length, header.length, 'tabular rows', options) + assertExpectedCount(rowCount, header.length, 'tabular rows', options) - // In strict mode, check for blank lines inside the array if (options.strict && startLine !== undefined && endLine !== undefined) { - validateNoBlankLinesInRange( - startLine, // From first row line - endLine, // To last row line - cursor.getBlankLines(), - options.strict, - 'tabular array', - ) + validateNoBlankLinesInRange(startLine, endLine, cursor.getBlankLines(), options.strict, 'tabular array') } - // In strict mode, check for extra rows if (options.strict) { - validateNoExtraTabularRows(cursor, rowDepth, header) + const nextLine = cursor.peekSync() + validateNoExtraTabularRows(nextLine, rowDepth, header) } - - return objects } -// #endregion - -// #region List item decoding - -function decodeListItem( - cursor: LineCursor, +function* decodeListArraySync( + header: ArrayHeaderInfo, + cursor: StreamingLineCursor, baseDepth: Depth, - options: ResolvedDecodeOptions, -): JsonValue { - const line = cursor.next() + options: DecoderContext, +): Generator { + const itemDepth = baseDepth + 1 + let itemCount = 0 + let startLine: number | undefined + let endLine: number | undefined + + while (!cursor.atEndSync() && itemCount < header.length) { + const line = cursor.peekSync() + if (!line || line.depth < itemDepth) { + break + } + + const isListItem = line.content.startsWith(LIST_ITEM_PREFIX) || line.content === LIST_ITEM_MARKER + + if (line.depth === itemDepth && isListItem) { + if (startLine === undefined) { + startLine = line.lineNumber + } + endLine = line.lineNumber + + yield* decodeListItemSync(cursor, itemDepth, options) + + const currentLine = cursor.current() + if (currentLine) { + endLine = currentLine.lineNumber + } + + itemCount++ + } + else { + break + } + } + + assertExpectedCount(itemCount, header.length, 'list array items', options) + + if (options.strict && startLine !== undefined && endLine !== undefined) { + validateNoBlankLinesInRange(startLine, endLine, cursor.getBlankLines(), options.strict, 'list array') + } + + if (options.strict) { + const nextLine = cursor.peekSync() + validateNoExtraListItems(nextLine, itemDepth, header.length) + } +} + +function* decodeListItemSync( + cursor: StreamingLineCursor, + baseDepth: Depth, + options: DecoderContext, +): Generator { + const line = cursor.nextSync() if (!line) { throw new ReferenceError('Expected list item') } - // Check for list item (with or without space after hyphen) let afterHyphen: string - // Empty list item should 
be an empty object - if (line.content === '-') { - return {} + if (line.content === LIST_ITEM_MARKER) { + yield { type: 'startObject' } + yield { type: 'endObject' } + return } else if (line.content.startsWith(LIST_ITEM_PREFIX)) { afterHyphen = line.content.slice(LIST_ITEM_PREFIX.length) @@ -339,73 +422,408 @@ function decodeListItem( throw new SyntaxError(`Expected list item to start with "${LIST_ITEM_PREFIX}"`) } - // Empty content after list item should also be an empty object if (!afterHyphen.trim()) { - return {} + yield { type: 'startObject' } + yield { type: 'endObject' } + return } // Check for array header after hyphen - if (isArrayHeaderAfterHyphen(afterHyphen)) { + if (isArrayHeaderContent(afterHyphen)) { const arrayHeader = parseArrayHeaderLine(afterHyphen, DEFAULT_DELIMITER) if (arrayHeader) { - return decodeArrayFromHeader(arrayHeader.header, arrayHeader.inlineValues, cursor, baseDepth, options) + yield* decodeArrayFromHeaderSync(arrayHeader.header, arrayHeader.inlineValues, cursor, baseDepth, options) + return } } // Check for object first field after hyphen - if (isObjectFirstFieldAfterHyphen(afterHyphen)) { - return decodeObjectFromListItem(line, cursor, baseDepth, options) + if (isKeyValueContent(afterHyphen)) { + yield { type: 'startObject' } + yield* decodeKeyValueSync(afterHyphen, cursor, baseDepth, options) + + // Read subsequent fields + const followDepth = baseDepth + 1 + while (!cursor.atEndSync()) { + const nextLine = cursor.peekSync() + if (!nextLine || nextLine.depth < followDepth) { + break + } + + if (nextLine.depth === followDepth && !nextLine.content.startsWith(LIST_ITEM_PREFIX)) { + cursor.advanceSync() + yield* decodeKeyValueSync(nextLine.content, cursor, followDepth, options) + } + else { + break + } + } + + yield { type: 'endObject' } + return } // Primitive value - return parsePrimitiveToken(afterHyphen) + yield { type: 'primitive', value: parsePrimitiveToken(afterHyphen) } } -function decodeObjectFromListItem( - firstLine: ParsedLine, - cursor: LineCursor, - baseDepth: Depth, - options: ResolvedDecodeOptions, -): JsonObject { - const afterHyphen = firstLine.content.slice(LIST_ITEM_PREFIX.length) - const { key, value, followDepth, isQuoted } = decodeKeyValue(afterHyphen, cursor, baseDepth, options) +function isKeyValueLineSync(line: ParsedLine): boolean { + const content = line.content + if (content.startsWith('"')) { + const closingQuoteIndex = findClosingQuote(content, 0) + if (closingQuoteIndex === -1) { + return false + } + return content.slice(closingQuoteIndex + 1).includes(COLON) + } + else { + return content.includes(COLON) + } +} - const obj: JsonObject = { [key]: value } - const quotedKeys: Set = new Set() +// #endregion - // Track if first key was quoted and dotted - if (isQuoted && key.includes(DOT)) { - quotedKeys.add(key) +// #region Asynchronous streaming decode + +export async function* decodeStream( + source: AsyncIterable | Iterable, + options?: DecodeStreamOptions, +): AsyncGenerator { + // Validate options + if (options?.expandPaths !== undefined) { + throw new Error('expandPaths is not supported in streaming decode') } - // Read subsequent fields - while (!cursor.atEnd()) { - const line = cursor.peek() - if (!line || line.depth < followDepth) { + const resolvedOptions = { + indent: options?.indent ?? 2, + strict: options?.strict ?? 
true, + } + + const scanState = createScanState() + + // Determine if source is async or sync + if (Symbol.asyncIterator in source) { + const lineGenerator = parseLinesAsync(source, resolvedOptions.indent, resolvedOptions.strict, scanState) + const cursor = new StreamingLineCursor(lineGenerator, scanState) + + // Get first line to determine root form + const first = await cursor.peek() + if (!first) { + // Empty input decodes to empty object (matches decode('') behavior) + yield { type: 'startObject' } + yield { type: 'endObject' } + return + } + + // Check for root array + if (isArrayHeaderContent(first.content)) { + const headerInfo = parseArrayHeaderLine(first.content, DEFAULT_DELIMITER) + if (headerInfo) { + await cursor.advance() + yield* decodeArrayFromHeaderAsync(headerInfo.header, headerInfo.inlineValues, cursor, 0, resolvedOptions) + return + } + } + + // Check for single primitive + await cursor.advance() + const hasMore = !(await cursor.atEnd()) + if (!hasMore && !isKeyValueLineSync(first)) { + yield { type: 'primitive', value: parsePrimitiveToken(first.content.trim()) } + return + } + + // Root object + yield { type: 'startObject' } + yield* decodeKeyValueAsync(first.content, cursor, 0, resolvedOptions) + + // Process remaining object fields + while (!(await cursor.atEnd())) { + const line = await cursor.peek() + if (!line || line.depth !== 0) { + break + } + await cursor.advance() + yield* decodeKeyValueAsync(line.content, cursor, 0, resolvedOptions) + } + + yield { type: 'endObject' } + } + else { + // Sync source, delegate to sync generator + yield* decodeStreamSync(source as Iterable, options) + } +} + +async function* decodeKeyValueAsync( + content: string, + cursor: StreamingLineCursor, + baseDepth: Depth, + options: DecoderContext, +): AsyncGenerator { + // Check for array header first + const arrayHeader = parseArrayHeaderLine(content, DEFAULT_DELIMITER) + if (arrayHeader && arrayHeader.header.key) { + yield { type: 'key', key: arrayHeader.header.key } + yield* decodeArrayFromHeaderAsync(arrayHeader.header, arrayHeader.inlineValues, cursor, baseDepth, options) + return + } + + // Regular key-value pair + const { key, isQuoted } = parseKeyToken(content, 0) + const colonIndex = content.indexOf(COLON, key.length) + const rest = colonIndex >= 0 ? content.slice(colonIndex + 1).trim() : '' + + yield isQuoted ? 
{ type: 'key', key, wasQuoted: true } : { type: 'key', key } + + // No value after colon - expect nested object or empty + if (!rest) { + const nextLine = await cursor.peek() + if (nextLine && nextLine.depth > baseDepth) { + yield { type: 'startObject' } + yield* decodeObjectFieldsAsync(cursor, baseDepth + 1, options) + yield { type: 'endObject' } + return + } + + // Empty object + yield { type: 'startObject' } + yield { type: 'endObject' } + return + } + + // Inline primitive value + yield { type: 'primitive', value: parsePrimitiveToken(rest) } +} + +async function* decodeObjectFieldsAsync( + cursor: StreamingLineCursor, + baseDepth: Depth, + options: DecoderContext, +): AsyncGenerator { + let computedDepth: Depth | undefined + + while (!(await cursor.atEnd())) { + const line = await cursor.peek() + if (!line || line.depth < baseDepth) { break } - if (line.depth === followDepth && !line.content.startsWith(LIST_ITEM_PREFIX)) { - cursor.advance() - const { key: k, value: v, isQuoted: kIsQuoted } = decodeKeyValue(line.content, cursor, followDepth, options) - obj[k] = v + if (computedDepth === undefined && line.depth >= baseDepth) { + computedDepth = line.depth + } - // Track quoted dotted keys - if (kIsQuoted && k.includes(DOT)) { - quotedKeys.add(k) + if (line.depth === computedDepth) { + await cursor.advance() + yield* decodeKeyValueAsync(line.content, cursor, computedDepth, options) + } + else { + break + } + } +} + +async function* decodeArrayFromHeaderAsync( + header: ArrayHeaderInfo, + inlineValues: string | undefined, + cursor: StreamingLineCursor, + baseDepth: Depth, + options: DecoderContext, +): AsyncGenerator { + yield { type: 'startArray', length: header.length } + + // Inline primitive array + if (inlineValues) { + yield* decodeInlinePrimitiveArraySync(header, inlineValues, options) + yield { type: 'endArray' } + return + } + + // Tabular array + if (header.fields && header.fields.length > 0) { + yield* decodeTabularArrayAsync(header, cursor, baseDepth, options) + yield { type: 'endArray' } + return + } + + // List array + yield* decodeListArrayAsync(header, cursor, baseDepth, options) + yield { type: 'endArray' } +} + +async function* decodeTabularArrayAsync( + header: ArrayHeaderInfo, + cursor: StreamingLineCursor, + baseDepth: Depth, + options: DecoderContext, +): AsyncGenerator { + const rowDepth = baseDepth + 1 + let rowCount = 0 + let startLine: number | undefined + let endLine: number | undefined + + while (!(await cursor.atEnd()) && rowCount < header.length) { + const line = await cursor.peek() + if (!line || line.depth < rowDepth) { + break + } + + if (line.depth === rowDepth) { + if (startLine === undefined) { + startLine = line.lineNumber } + endLine = line.lineNumber + + await cursor.advance() + const values = parseDelimitedValues(line.content, header.delimiter) + assertExpectedCount(values.length, header.fields!.length, 'tabular row values', options) + + const primitives = mapRowValuesToPrimitives(values) + + yield { type: 'startObject' } + for (let i = 0; i < header.fields!.length; i++) { + yield { type: 'key', key: header.fields![i]! } + yield { type: 'primitive', value: primitives[i]! 
} + } + yield { type: 'endObject' } + + rowCount++ } else { break } } - // Attach quoted key metadata if any were found - if (quotedKeys.size > 0) { - (obj as ObjectWithQuotedKeys)[QUOTED_KEY_MARKER] = quotedKeys + assertExpectedCount(rowCount, header.length, 'tabular rows', options) + + if (options.strict && startLine !== undefined && endLine !== undefined) { + validateNoBlankLinesInRange(startLine, endLine, cursor.getBlankLines(), options.strict, 'tabular array') } - return obj + if (options.strict) { + const nextLine = await cursor.peek() + validateNoExtraTabularRows(nextLine, rowDepth, header) + } +} + +async function* decodeListArrayAsync( + header: ArrayHeaderInfo, + cursor: StreamingLineCursor, + baseDepth: Depth, + options: DecoderContext, +): AsyncGenerator { + const itemDepth = baseDepth + 1 + let itemCount = 0 + let startLine: number | undefined + let endLine: number | undefined + + while (!(await cursor.atEnd()) && itemCount < header.length) { + const line = await cursor.peek() + if (!line || line.depth < itemDepth) { + break + } + + const isListItem = line.content.startsWith(LIST_ITEM_PREFIX) || line.content === LIST_ITEM_MARKER + + if (line.depth === itemDepth && isListItem) { + if (startLine === undefined) { + startLine = line.lineNumber + } + endLine = line.lineNumber + + yield* decodeListItemAsync(cursor, itemDepth, options) + + const currentLine = cursor.current() + if (currentLine) { + endLine = currentLine.lineNumber + } + + itemCount++ + } + else { + break + } + } + + assertExpectedCount(itemCount, header.length, 'list array items', options) + + if (options.strict && startLine !== undefined && endLine !== undefined) { + validateNoBlankLinesInRange(startLine, endLine, cursor.getBlankLines(), options.strict, 'list array') + } + + if (options.strict) { + const nextLine = await cursor.peek() + validateNoExtraListItems(nextLine, itemDepth, header.length) + } +} + +async function* decodeListItemAsync( + cursor: StreamingLineCursor, + baseDepth: Depth, + options: DecoderContext, +): AsyncGenerator { + const line = await cursor.next() + if (!line) { + throw new ReferenceError('Expected list item') + } + + let afterHyphen: string + + if (line.content === LIST_ITEM_MARKER) { + yield { type: 'startObject' } + yield { type: 'endObject' } + return + } + else if (line.content.startsWith(LIST_ITEM_PREFIX)) { + afterHyphen = line.content.slice(LIST_ITEM_PREFIX.length) + } + else { + throw new SyntaxError(`Expected list item to start with "${LIST_ITEM_PREFIX}"`) + } + + if (!afterHyphen.trim()) { + yield { type: 'startObject' } + yield { type: 'endObject' } + return + } + + // Check for array header after hyphen + if (isArrayHeaderContent(afterHyphen)) { + const arrayHeader = parseArrayHeaderLine(afterHyphen, DEFAULT_DELIMITER) + if (arrayHeader) { + yield* decodeArrayFromHeaderAsync(arrayHeader.header, arrayHeader.inlineValues, cursor, baseDepth, options) + return + } + } + + // Check for object first field after hyphen + if (isKeyValueContent(afterHyphen)) { + yield { type: 'startObject' } + yield* decodeKeyValueAsync(afterHyphen, cursor, baseDepth, options) + + // Read subsequent fields + const followDepth = baseDepth + 1 + while (!(await cursor.atEnd())) { + const nextLine = await cursor.peek() + if (!nextLine || nextLine.depth < followDepth) { + break + } + + if (nextLine.depth === followDepth && !nextLine.content.startsWith(LIST_ITEM_PREFIX)) { + await cursor.advance() + yield* decodeKeyValueAsync(nextLine.content, cursor, followDepth, options) + } + else { + break + } + } + + 
yield { type: 'endObject' } + return + } + + // Primitive value + yield { type: 'primitive', value: parsePrimitiveToken(afterHyphen) } } // #endregion diff --git a/packages/toon/src/decode/event-builder.ts b/packages/toon/src/decode/event-builder.ts new file mode 100644 index 0000000..5e3a40c --- /dev/null +++ b/packages/toon/src/decode/event-builder.ts @@ -0,0 +1,334 @@ +import type { JsonObject, JsonStreamEvent, JsonValue } from '../types' +import { QUOTED_KEY_MARKER } from './expand' + +// #region Build context types + +/** + * Stack context for building JSON values from events. + */ +type BuildContext + = | { type: 'object', obj: JsonObject, currentKey?: string, quotedKeys: Set } + | { type: 'array', arr: JsonValue[] } + +// #endregion + +// #region Synchronous AST builder + +export function buildValueFromEvents(events: Iterable): JsonValue { + const stack: BuildContext[] = [] + let root: JsonValue | undefined + + for (const event of events) { + switch (event.type) { + case 'startObject': { + const obj: JsonObject = {} + const quotedKeys = new Set() + + if (stack.length === 0) { + // Root object + stack.push({ type: 'object', obj, quotedKeys }) + } + else { + const parent = stack[stack.length - 1]! + if (parent.type === 'object') { + if (parent.currentKey === undefined) { + throw new Error('Object startObject event without preceding key') + } + + parent.obj[parent.currentKey] = obj + parent.currentKey = undefined + } + else if (parent.type === 'array') { + parent.arr.push(obj) + } + + stack.push({ type: 'object', obj, quotedKeys }) + } + + break + } + + case 'endObject': { + if (stack.length === 0) { + throw new Error('Unexpected endObject event') + } + + const context = stack.pop()! + if (context.type !== 'object') { + throw new Error('Mismatched endObject event') + } + + // Attach quoted keys metadata if any keys were quoted + if (context.quotedKeys.size > 0) { + Object.defineProperty(context.obj, QUOTED_KEY_MARKER, { + value: context.quotedKeys, + enumerable: false, + writable: false, + configurable: false, + }) + } + + if (stack.length === 0) { + root = context.obj + } + + break + } + + case 'startArray': { + const arr: JsonValue[] = [] + + if (stack.length === 0) { + // Root array + stack.push({ type: 'array', arr }) + } + else { + const parent = stack[stack.length - 1]! + if (parent.type === 'object') { + if (parent.currentKey === undefined) { + throw new Error('Array startArray event without preceding key') + } + parent.obj[parent.currentKey] = arr + parent.currentKey = undefined + } + else if (parent.type === 'array') { + parent.arr.push(arr) + } + + stack.push({ type: 'array', arr }) + } + + break + } + + case 'endArray': { + if (stack.length === 0) { + throw new Error('Unexpected endArray event') + } + + const context = stack.pop()! + if (context.type !== 'array') { + throw new Error('Mismatched endArray event') + } + + if (stack.length === 0) { + root = context.arr + } + + break + } + + case 'key': { + if (stack.length === 0) { + throw new Error('Key event outside of object context') + } + + const parent = stack[stack.length - 1]! + if (parent.type !== 'object') { + throw new Error('Key event in non-object context') + } + + parent.currentKey = event.key + + // Track quoted keys for path expansion + if (event.wasQuoted) { + parent.quotedKeys.add(event.key) + } + + break + } + + case 'primitive': { + if (stack.length === 0) { + // Root primitive + root = event.value + } + else { + const parent = stack[stack.length - 1]! 
+ if (parent.type === 'object') { + if (parent.currentKey === undefined) { + throw new Error('Primitive event without preceding key in object') + } + parent.obj[parent.currentKey] = event.value + parent.currentKey = undefined + } + else if (parent.type === 'array') { + parent.arr.push(event.value) + } + } + + break + } + } + } + + if (stack.length !== 0) { + throw new Error('Incomplete event stream: stack not empty at end') + } + + if (root === undefined) { + throw new Error('No root value built from events') + } + + return root +} + +// #endregion + +// #region Asynchronous AST builder + +export async function buildValueFromEventsAsync(events: AsyncIterable): Promise { + const stack: BuildContext[] = [] + let root: JsonValue | undefined + + for await (const event of events) { + switch (event.type) { + case 'startObject': { + const obj: JsonObject = {} + const quotedKeys = new Set() + + if (stack.length === 0) { + stack.push({ type: 'object', obj, quotedKeys }) + } + else { + const parent = stack[stack.length - 1]! + if (parent.type === 'object') { + if (parent.currentKey === undefined) { + throw new Error('Object startObject event without preceding key') + } + parent.obj[parent.currentKey] = obj + parent.currentKey = undefined + } + else if (parent.type === 'array') { + parent.arr.push(obj) + } + + stack.push({ type: 'object', obj, quotedKeys }) + } + + break + } + + case 'endObject': { + if (stack.length === 0) { + throw new Error('Unexpected endObject event') + } + + const context = stack.pop()! + if (context.type !== 'object') { + throw new Error('Mismatched endObject event') + } + + // Attach quoted keys metadata if any keys were quoted + if (context.quotedKeys.size > 0) { + Object.defineProperty(context.obj, QUOTED_KEY_MARKER, { + value: context.quotedKeys, + enumerable: false, + writable: false, + configurable: false, + }) + } + + if (stack.length === 0) { + root = context.obj + } + + break + } + + case 'startArray': { + const arr: JsonValue[] = [] + if (stack.length === 0) { + stack.push({ type: 'array', arr }) + } + else { + const parent = stack[stack.length - 1]! + if (parent.type === 'object') { + if (parent.currentKey === undefined) { + throw new Error('Array startArray event without preceding key') + } + parent.obj[parent.currentKey] = arr + parent.currentKey = undefined + } + else if (parent.type === 'array') { + parent.arr.push(arr) + } + + stack.push({ type: 'array', arr }) + } + + break + } + + case 'endArray': { + if (stack.length === 0) { + throw new Error('Unexpected endArray event') + } + + const context = stack.pop()! + if (context.type !== 'array') { + throw new Error('Mismatched endArray event') + } + + if (stack.length === 0) { + root = context.arr + } + + break + } + + case 'key': { + if (stack.length === 0) { + throw new Error('Key event outside of object context') + } + + const parent = stack[stack.length - 1]! + if (parent.type !== 'object') { + throw new Error('Key event in non-object context') + } + + parent.currentKey = event.key + + // Track quoted keys for path expansion + if (event.wasQuoted) { + parent.quotedKeys.add(event.key) + } + + break + } + + case 'primitive': { + if (stack.length === 0) { + root = event.value + } + else { + const parent = stack[stack.length - 1]! 
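+          // Mirrors the sync builder: attach the primitive to the pending object key or the current array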
+ if (parent.type === 'object') { + if (parent.currentKey === undefined) { + throw new Error('Primitive event without preceding key in object') + } + parent.obj[parent.currentKey] = event.value + parent.currentKey = undefined + } + else if (parent.type === 'array') { + parent.arr.push(event.value) + } + } + + break + } + } + } + + if (stack.length !== 0) { + throw new Error('Incomplete event stream: stack not empty at end') + } + + if (root === undefined) { + throw new Error('No root value built from events') + } + + return root +} + +// #endregion diff --git a/packages/toon/src/decode/expand.ts b/packages/toon/src/decode/expand.ts index 5ee82c4..1713b18 100644 --- a/packages/toon/src/decode/expand.ts +++ b/packages/toon/src/decode/expand.ts @@ -12,7 +12,7 @@ import { isIdentifierSegment } from '../shared/validation' export const QUOTED_KEY_MARKER: unique symbol = Symbol('quotedKey') /** - * Type for objects that may have quoted key metadata attached. + * Objects that may have quoted key metadata attached. */ export interface ObjectWithQuotedKeys extends JsonObject { [QUOTED_KEY_MARKER]?: Set @@ -226,6 +226,10 @@ function mergeObjects( // #endregion +// #region Type Guards + function canMerge(a: JsonValue, b: JsonValue): a is JsonObject { return isJsonObject(a) && isJsonObject(b) } + +// #endregion diff --git a/packages/toon/src/decode/parser.ts b/packages/toon/src/decode/parser.ts index bc3d648..69a2f31 100644 --- a/packages/toon/src/decode/parser.ts +++ b/packages/toon/src/decode/parser.ts @@ -305,11 +305,11 @@ export function parseKeyToken(content: string, start: number): { key: string, en // #region Array content detection helpers -export function isArrayHeaderAfterHyphen(content: string): boolean { +export function isArrayHeaderContent(content: string): boolean { return content.trim().startsWith(OPEN_BRACKET) && findUnquotedChar(content, COLON) !== -1 } -export function isObjectFirstFieldAfterHyphen(content: string): boolean { +export function isKeyValueContent(content: string): boolean { return findUnquotedChar(content, COLON) !== -1 } diff --git a/packages/toon/src/decode/scanner.ts b/packages/toon/src/decode/scanner.ts index cb7c9a3..e69f6e5 100644 --- a/packages/toon/src/decode/scanner.ts +++ b/packages/toon/src/decode/scanner.ts @@ -1,109 +1,109 @@ import type { BlankLineInfo, Depth, ParsedLine } from '../types' import { SPACE, TAB } from '../constants' -export interface ScanResult { - lines: ParsedLine[] +// #region Scan state + +export interface StreamingScanState { + lineNumber: number blankLines: BlankLineInfo[] } -export class LineCursor { - private lines: ParsedLine[] - private index: number - private blankLines: BlankLineInfo[] - - constructor(lines: ParsedLine[], blankLines: BlankLineInfo[] = []) { - this.lines = lines - this.index = 0 - this.blankLines = blankLines - } - - getBlankLines(): BlankLineInfo[] { - return this.blankLines - } - - peek(): ParsedLine | undefined { - return this.lines[this.index] - } - - next(): ParsedLine | undefined { - return this.lines[this.index++] - } - - current(): ParsedLine | undefined { - return this.index > 0 ? this.lines[this.index - 1] : undefined - } - - advance(): void { - this.index++ - } - - atEnd(): boolean { - return this.index >= this.lines.length - } - - get length(): number { - return this.lines.length - } - - peekAtDepth(targetDepth: Depth): ParsedLine | undefined { - const line = this.peek() - return line?.depth === targetDepth ? 
line : undefined +export function createScanState(): StreamingScanState { + return { + lineNumber: 0, + blankLines: [], } } -export function toParsedLines(source: string, indentSize: number, strict: boolean): ScanResult { - if (!source.trim()) { - return { lines: [], blankLines: [] } +// #endregion + +// #region Line parsing + +export function parseLineIncremental( + raw: string, + state: StreamingScanState, + indentSize: number, + strict: boolean, +): ParsedLine | undefined { + state.lineNumber++ + const lineNumber = state.lineNumber + + // Count leading spaces + let indent = 0 + while (indent < raw.length && raw[indent] === SPACE) { + indent++ } - const lines = source.split('\n') - const parsed: ParsedLine[] = [] - const blankLines: BlankLineInfo[] = [] - - for (let i = 0; i < lines.length; i++) { - const raw = lines[i]! - const lineNumber = i + 1 - let indent = 0 - while (indent < raw.length && raw[indent] === SPACE) { - indent++ - } - - const content = raw.slice(indent) - - // Track blank lines - if (!content.trim()) { - const depth = computeDepthFromIndent(indent, indentSize) - blankLines.push({ lineNumber, indent, depth }) - continue - } + const content = raw.slice(indent) + // Track blank lines + if (!content.trim()) { const depth = computeDepthFromIndent(indent, indentSize) - - // Strict mode validation - if (strict) { - // Find the full leading whitespace region (spaces and tabs) - let whitespaceEndIndex = 0 - while (whitespaceEndIndex < raw.length && (raw[whitespaceEndIndex] === SPACE || raw[whitespaceEndIndex] === TAB)) { - whitespaceEndIndex++ - } - - // Check for tabs in leading whitespace (before actual content) - if (raw.slice(0, whitespaceEndIndex).includes(TAB)) { - throw new SyntaxError(`Line ${lineNumber}: Tabs are not allowed in indentation in strict mode`) - } - - // Check for exact multiples of indentSize - if (indent > 0 && indent % indentSize !== 0) { - throw new SyntaxError(`Line ${lineNumber}: Indentation must be exact multiple of ${indentSize}, but found ${indent} spaces`) - } - } - - parsed.push({ raw, indent, content, depth, lineNumber }) + state.blankLines.push({ lineNumber, indent, depth }) + return undefined } - return { lines: parsed, blankLines } + const depth = computeDepthFromIndent(indent, indentSize) + + // Strict mode validation + if (strict) { + // Find the full leading whitespace region (spaces and tabs) + let whitespaceEndIndex = 0 + while ( + whitespaceEndIndex < raw.length + && (raw[whitespaceEndIndex] === SPACE || raw[whitespaceEndIndex] === TAB) + ) { + whitespaceEndIndex++ + } + + // Check for tabs in leading whitespace (before actual content) + if (raw.slice(0, whitespaceEndIndex).includes(TAB)) { + throw new SyntaxError(`Line ${lineNumber}: Tabs are not allowed in indentation in strict mode`) + } + + // Check for exact multiples of indentSize + if (indent > 0 && indent % indentSize !== 0) { + throw new SyntaxError( + `Line ${lineNumber}: Indentation must be exact multiple of ${indentSize}, but found ${indent} spaces`, + ) + } + } + + return { raw, indent, content, depth, lineNumber } +} + +export function* parseLinesSync( + source: Iterable, + indentSize: number, + strict: boolean, + state: StreamingScanState, +): Generator { + for (const raw of source) { + const parsedLine = parseLineIncremental(raw, state, indentSize, strict) + + if (parsedLine !== undefined) { + yield parsedLine + } + } +} + +export async function* parseLinesAsync( + source: AsyncIterable, + indentSize: number, + strict: boolean, + state: StreamingScanState, +): 
AsyncGenerator { + for await (const raw of source) { + const parsedLine = parseLineIncremental(raw, state, indentSize, strict) + + if (parsedLine !== undefined) { + yield parsedLine + } + } } function computeDepthFromIndent(indentSpaces: number, indentSize: number): Depth { return Math.floor(indentSpaces / indentSize) } + +// #endregion diff --git a/packages/toon/src/decode/validation.ts b/packages/toon/src/decode/validation.ts index fa438af..9024308 100644 --- a/packages/toon/src/decode/validation.ts +++ b/packages/toon/src/decode/validation.ts @@ -1,7 +1,8 @@ -import type { ArrayHeaderInfo, BlankLineInfo, Delimiter, Depth, ResolvedDecodeOptions } from '../types' -import type { LineCursor } from './scanner' +import type { ArrayHeaderInfo, BlankLineInfo, Delimiter, Depth, ParsedLine } from '../types' import { COLON, LIST_ITEM_PREFIX } from '../constants' +// #region Count and structure validation + /** * Asserts that the actual count matches the expected count in strict mode. */ @@ -9,7 +10,7 @@ export function assertExpectedCount( actual: number, expected: number, itemType: string, - options: ResolvedDecodeOptions, + options: { strict: boolean }, ): void { if (options.strict && actual !== expected) { throw new RangeError(`Expected ${expected} ${itemType}, but got ${actual}`) @@ -20,11 +21,10 @@ export function assertExpectedCount( * Validates that there are no extra list items beyond the expected count. */ export function validateNoExtraListItems( - cursor: LineCursor, + nextLine: ParsedLine | undefined, itemDepth: Depth, expectedCount: number, ): void { - const nextLine = cursor.peek() if (nextLine?.depth === itemDepth && nextLine.content.startsWith(LIST_ITEM_PREFIX)) { throw new RangeError(`Expected ${expectedCount} list array items, but found more`) } @@ -34,11 +34,10 @@ export function validateNoExtraListItems( * Validates that there are no extra tabular rows beyond the expected count. */ export function validateNoExtraTabularRows( - cursor: LineCursor, + nextLine: ParsedLine | undefined, rowDepth: Depth, header: ArrayHeaderInfo, ): void { - const nextLine = cursor.peek() if ( nextLine?.depth === rowDepth && !nextLine.content.startsWith(LIST_ITEM_PREFIX) @@ -62,8 +61,6 @@ export function validateNoBlankLinesInRange( return // Find blank lines within the range - // Note: We don't filter by depth because ANY blank line between array items is an error, - // regardless of its indentation level const firstBlank = blankLines.find( blank => blank.lineNumber > startLine && blank.lineNumber < endLine, ) @@ -75,6 +72,10 @@ export function validateNoBlankLinesInRange( } } +// #endregion + +// #region Row classification helpers + /** * Checks if a line is a data row (vs a key-value pair) in a tabular array. 
*/ @@ -95,3 +96,5 @@ function isDataRow(content: string, delimiter: Delimiter): boolean { // Colon before delimiter or no delimiter = key-value pair return false } + +// #endregion diff --git a/packages/toon/src/index.ts b/packages/toon/src/index.ts index b28c379..754c8e5 100644 --- a/packages/toon/src/index.ts +++ b/packages/toon/src/index.ts @@ -1,55 +1,27 @@ -import type { DecodeOptions, EncodeOptions, JsonValue, ResolvedDecodeOptions, ResolvedEncodeOptions } from './types' +import type { DecodeOptions, DecodeStreamOptions, EncodeOptions, JsonStreamEvent, JsonValue, ResolvedDecodeOptions, ResolvedEncodeOptions } from './types' import { DEFAULT_DELIMITER } from './constants' -import { decodeValueFromLines } from './decode/decoders' +import { decodeStream as decodeStreamCore, decodeStreamSync as decodeStreamSyncCore } from './decode/decoders' +import { buildValueFromEvents } from './decode/event-builder' import { expandPathsSafe } from './decode/expand' -import { LineCursor, toParsedLines } from './decode/scanner' import { encodeJsonValue } from './encode/encoders' import { normalizeValue } from './encode/normalize' export { DEFAULT_DELIMITER, DELIMITERS } from './constants' export type { DecodeOptions, + DecodeStreamOptions, Delimiter, DelimiterKey, EncodeOptions, JsonArray, JsonObject, JsonPrimitive, + JsonStreamEvent, JsonValue, ResolvedDecodeOptions, ResolvedEncodeOptions, } from './types' -/** - * Encodes a JavaScript value into TOON format as a sequence of lines. - * - * This function yields TOON lines one at a time without building the full string, - * making it suitable for streaming large outputs to files, HTTP responses, or process stdout. - * - * @param input - Any JavaScript value (objects, arrays, primitives) - * @param options - Optional encoding configuration - * @returns Iterable of TOON lines (without trailing newlines) - * - * @example - * ```ts - * // Stream to stdout - * for (const line of encodeLines({ name: 'Alice', age: 30 })) { - * console.log(line) - * } - * - * // Collect to array - * const lines = Array.from(encodeLines(data)) - * - * // Equivalent to encode() - * const toonString = Array.from(encodeLines(data, options)).join('\n') - * ``` - */ -export function encodeLines(input: unknown, options?: EncodeOptions): Iterable { - const normalizedValue = normalizeValue(input) - const resolvedOptions = resolveOptions(options) - return encodeJsonValue(normalizedValue, resolvedOptions, 0) -} - /** * Encodes a JavaScript value into TOON format string. * @@ -94,15 +66,69 @@ export function encode(input: unknown, options?: EncodeOptions): string { * ``` */ export function decode(input: string, options?: DecodeOptions): JsonValue { - const resolvedOptions = resolveDecodeOptions(options) - const scanResult = toParsedLines(input, resolvedOptions.indent, resolvedOptions.strict) + const lines = input.split('\n') + return decodeFromLines(lines, options) +} - if (scanResult.lines.length === 0) { - return {} +/** + * Encodes a JavaScript value into TOON format as a sequence of lines. + * + * This function yields TOON lines one at a time without building the full string, + * making it suitable for streaming large outputs to files, HTTP responses, or process stdout. 
+ * + * @param input - Any JavaScript value (objects, arrays, primitives) + * @param options - Optional encoding configuration + * @returns Iterable of TOON lines (without trailing newlines) + * + * @example + * ```ts + * // Stream to stdout + * for (const line of encodeLines({ name: 'Alice', age: 30 })) { + * console.log(line) + * } + * + * // Collect to array + * const lines = Array.from(encodeLines(data)) + * + * // Equivalent to encode() + * const toonString = Array.from(encodeLines(data, options)).join('\n') + * ``` + */ +export function encodeLines(input: unknown, options?: EncodeOptions): Iterable { + const normalizedValue = normalizeValue(input) + const resolvedOptions = resolveOptions(options) + return encodeJsonValue(normalizedValue, resolvedOptions, 0) +} + +/** + * Decodes TOON format from pre-split lines into a JavaScript value. + * + * This is a convenience wrapper around the streaming decoder that builds + * the full value in memory. Useful when you already have lines as an array + * or iterable and want the standard decode behavior with path expansion support. + * + * @param lines - Iterable of TOON lines (without newlines) + * @param options - Optional decoding configuration (supports expandPaths) + * @returns Parsed JavaScript value (object, array, or primitive) + * + * @example + * ```ts + * const lines = ['name: Alice', 'age: 30'] + * decodeFromLines(lines) + * // { name: 'Alice', age: 30 } + * ``` + */ +export function decodeFromLines(lines: Iterable, options?: DecodeOptions): JsonValue { + const resolvedOptions = resolveDecodeOptions(options) + + // Use streaming decoder without expandPaths + const streamOptions: DecodeStreamOptions = { + indent: resolvedOptions.indent, + strict: resolvedOptions.strict, } - const cursor = new LineCursor(scanResult.lines, scanResult.blankLines) - const decodedValue = decodeValueFromLines(cursor, resolvedOptions) + const events = decodeStreamSyncCore(lines, streamOptions) + const decodedValue = buildValueFromEvents(events) // Apply path expansion if enabled if (resolvedOptions.expandPaths === 'safe') { @@ -112,6 +138,72 @@ export function decode(input: string, options?: DecodeOptions): JsonValue { return decodedValue } +/** + * Synchronously decodes TOON lines into a stream of JSON events. + * + * This function yields structured events (startObject, endObject, startArray, endArray, + * key, primitive) that represent the JSON data model without building the full value tree. + * Useful for streaming processing, custom transformations, or memory-efficient parsing. + * + * @remarks + * Path expansion (`expandPaths: 'safe'`) is not supported in streaming mode. + * + * @param lines - Iterable of TOON lines (without newlines) + * @param options - Optional decoding configuration (expandPaths not supported) + * @returns Iterable of JSON stream events + * + * @example + * ```ts + * const lines = ['name: Alice', 'age: 30'] + * for (const event of decodeStreamSync(lines)) { + * console.log(event) + * // { type: 'startObject' } + * // { type: 'key', key: 'name' } + * // { type: 'primitive', value: 'Alice' } + * // ... + * } + * ``` + */ +export function decodeStreamSync(lines: Iterable, options?: DecodeStreamOptions): Iterable { + return decodeStreamSyncCore(lines, options) +} + +/** + * Asynchronously decodes TOON lines into a stream of JSON events. + * + * This function yields structured events (startObject, endObject, startArray, endArray, + * key, primitive) that represent the JSON data model without building the full value tree. 
+ * Supports both sync and async iterables for maximum flexibility with file streams, + * network responses, or other async sources. + * + * @remarks + * Path expansion (`expandPaths: 'safe'`) is not supported in streaming mode. + * + * @param source - Async or sync iterable of TOON lines (without newlines) + * @param options - Optional decoding configuration (expandPaths not supported) + * @returns Async iterable of JSON stream events + * + * @example + * ```ts + * const fileStream = createReadStream('data.toon', 'utf-8') + * const lines = splitLines(fileStream) // Async iterable of lines + * + * for await (const event of decodeStream(lines)) { + * console.log(event) + * // { type: 'startObject' } + * // { type: 'key', key: 'name' } + * // { type: 'primitive', value: 'Alice' } + * // ... + * } + * ``` + */ +export function decodeStream( + source: AsyncIterable | Iterable, + options?: DecodeStreamOptions, +): AsyncIterable { + return decodeStreamCore(source, options) +} + function resolveOptions(options?: EncodeOptions): ResolvedEncodeOptions { return { indent: options?.indent ?? 2, diff --git a/packages/toon/src/types.ts b/packages/toon/src/types.ts index a3fd38c..9e38a2a 100644 --- a/packages/toon/src/types.ts +++ b/packages/toon/src/types.ts @@ -69,6 +69,32 @@ export interface DecodeOptions { export type ResolvedDecodeOptions = Readonly> +/** + * Options for streaming decode operations. + * + * @remarks + * Path expansion is not supported in streaming mode. + */ +export interface DecodeStreamOptions extends Omit { + /** + * Path expansion is not supported in streaming decode. + * This option is explicitly omitted. + */ + expandPaths?: never +} + +// #endregion + +// #region Streaming decoder types + +export type JsonStreamEvent + = | { type: 'startObject' } + | { type: 'endObject' } + | { type: 'startArray', length: number } + | { type: 'endArray' } + | { type: 'key', key: string, wasQuoted?: boolean } + | { type: 'primitive', value: JsonPrimitive } + // #endregion // #region Decoder parsing types diff --git a/packages/toon/test/decodeStream.test.ts b/packages/toon/test/decodeStream.test.ts new file mode 100644 index 0000000..8744255 --- /dev/null +++ b/packages/toon/test/decodeStream.test.ts @@ -0,0 +1,343 @@ +import { describe, expect, it } from 'vitest' +import { buildValueFromEvents } from '../src/decode/event-builder' +import { decode, decodeFromLines, decodeStreamSync } from '../src/index' + +describe('streaming decode', () => { + describe('decodeStreamSync', () => { + it('should decode simple object', () => { + const input = 'name: Alice\nage: 30' + const lines = input.split('\n') + const events = Array.from(decodeStreamSync(lines)) + + expect(events).toEqual([ + { type: 'startObject' }, + { type: 'key', key: 'name' }, + { type: 'primitive', value: 'Alice' }, + { type: 'key', key: 'age' }, + { type: 'primitive', value: 30 }, + { type: 'endObject' }, + ]) + }) + + it('should decode nested object', () => { + const input = 'user:\n name: Alice\n age: 30' + const lines = input.split('\n') + const events = Array.from(decodeStreamSync(lines)) + + expect(events).toEqual([ + { type: 'startObject' }, + { type: 'key', key: 'user' }, + { type: 'startObject' }, + { type: 'key', key: 'name' }, + { type: 'primitive', value: 'Alice' }, + { type: 'key', key: 'age' }, + { type: 'primitive', value: 30 }, + { type: 'endObject' }, + { type: 'endObject' }, + ]) + }) + + it('should decode inline primitive array', () => { + const input = 'scores[3]: 95, 87, 92' + const lines = input.split('\n') + const 
events = Array.from(decodeStreamSync(lines)) + + expect(events).toEqual([ + { type: 'startObject' }, + { type: 'key', key: 'scores' }, + { type: 'startArray', length: 3 }, + { type: 'primitive', value: 95 }, + { type: 'primitive', value: 87 }, + { type: 'primitive', value: 92 }, + { type: 'endArray' }, + { type: 'endObject' }, + ]) + }) + + it('should decode list array', () => { + const input = 'items[2]:\n - Apple\n - Banana' + const lines = input.split('\n') + const events = Array.from(decodeStreamSync(lines)) + + expect(events).toEqual([ + { type: 'startObject' }, + { type: 'key', key: 'items' }, + { type: 'startArray', length: 2 }, + { type: 'primitive', value: 'Apple' }, + { type: 'primitive', value: 'Banana' }, + { type: 'endArray' }, + { type: 'endObject' }, + ]) + }) + + it('should decode tabular array', () => { + const input = 'users[2]{name,age}:\n Alice, 30\n Bob, 25' + const lines = input.split('\n') + const events = Array.from(decodeStreamSync(lines)) + + expect(events).toEqual([ + { type: 'startObject' }, + { type: 'key', key: 'users' }, + { type: 'startArray', length: 2 }, + { type: 'startObject' }, + { type: 'key', key: 'name' }, + { type: 'primitive', value: 'Alice' }, + { type: 'key', key: 'age' }, + { type: 'primitive', value: 30 }, + { type: 'endObject' }, + { type: 'startObject' }, + { type: 'key', key: 'name' }, + { type: 'primitive', value: 'Bob' }, + { type: 'key', key: 'age' }, + { type: 'primitive', value: 25 }, + { type: 'endObject' }, + { type: 'endArray' }, + { type: 'endObject' }, + ]) + }) + + it('should decode root primitive', () => { + const input = 'Hello World' + const lines = input.split('\n') + const events = Array.from(decodeStreamSync(lines)) + + expect(events).toEqual([ + { type: 'primitive', value: 'Hello World' }, + ]) + }) + + it('should decode root array', () => { + const input = '[2]:\n - Apple\n - Banana' + const lines = input.split('\n') + const events = Array.from(decodeStreamSync(lines)) + + expect(events).toEqual([ + { type: 'startArray', length: 2 }, + { type: 'primitive', value: 'Apple' }, + { type: 'primitive', value: 'Banana' }, + { type: 'endArray' }, + ]) + }) + + it('should decode empty input as empty object', () => { + const lines: string[] = [] + const events = Array.from(decodeStreamSync(lines)) + + expect(events).toEqual([ + { type: 'startObject' }, + { type: 'endObject' }, + ]) + }) + + it('should throw on expandPaths option', () => { + const input = 'name: Alice' + const lines = input.split('\n') + + expect(() => Array.from(decodeStreamSync(lines, { expandPaths: 'safe' } as any))) + .toThrow('expandPaths is not supported in streaming decode') + }) + + it('should enforce strict mode validation', () => { + const input = 'items[2]:\n - Apple' + const lines = input.split('\n') + + expect(() => Array.from(decodeStreamSync(lines, { strict: true }))) + .toThrow() + }) + + it('should allow count mismatch in non-strict mode', () => { + const input = 'items[2]:\n - Apple' + const lines = input.split('\n') + + // Should not throw in non-strict mode + const events = Array.from(decodeStreamSync(lines, { strict: false })) + + expect(events).toBeDefined() + expect(events[0]).toEqual({ type: 'startObject' }) + }) + }) + + describe('buildValueFromEvents', () => { + it('should build object from events', () => { + const events = [ + { type: 'startObject' as const }, + { type: 'key' as const, key: 'name' }, + { type: 'primitive' as const, value: 'Alice' }, + { type: 'key' as const, key: 'age' }, + { type: 'primitive' as const, value: 30 }, + { 
type: 'endObject' as const }, + ] + + const result = buildValueFromEvents(events) + + expect(result).toEqual({ name: 'Alice', age: 30 }) + }) + + it('should build nested object from events', () => { + const events = [ + { type: 'startObject' as const }, + { type: 'key' as const, key: 'user' }, + { type: 'startObject' as const }, + { type: 'key' as const, key: 'name' }, + { type: 'primitive' as const, value: 'Alice' }, + { type: 'endObject' as const }, + { type: 'endObject' as const }, + ] + + const result = buildValueFromEvents(events) + + expect(result).toEqual({ user: { name: 'Alice' } }) + }) + + it('should build array from events', () => { + const events = [ + { type: 'startArray' as const, length: 3 }, + { type: 'primitive' as const, value: 1 }, + { type: 'primitive' as const, value: 2 }, + { type: 'primitive' as const, value: 3 }, + { type: 'endArray' as const }, + ] + + const result = buildValueFromEvents(events) + + expect(result).toEqual([1, 2, 3]) + }) + + it('should build primitive from events', () => { + const events = [ + { type: 'primitive' as const, value: 'Hello' }, + ] + + const result = buildValueFromEvents(events) + + expect(result).toEqual('Hello') + }) + + it('should throw on incomplete event stream', () => { + const events = [ + { type: 'startObject' as const }, + { type: 'key' as const, key: 'name' }, + // Missing primitive and endObject + ] + + expect(() => buildValueFromEvents(events)) + .toThrow('Incomplete event stream') + }) + }) + + describe('decodeFromLines', () => { + it('should produce same result as decode', () => { + const input = 'name: Alice\nage: 30\nscores[3]: 95, 87, 92' + const lines = input.split('\n') + + const fromLines = decodeFromLines(lines) + const fromString = decode(input) + + expect(fromLines).toEqual(fromString) + }) + + it('should support expandPaths option', () => { + const input = 'user.name: Alice\nuser.age: 30' + const lines = input.split('\n') + + const result = decodeFromLines(lines, { expandPaths: 'safe' }) + + expect(result).toEqual({ + user: { + name: 'Alice', + age: 30, + }, + }) + }) + + it('should handle complex nested structures', () => { + const input = [ + 'users[2]:', + ' - name: Alice', + ' scores[3]: 95, 87, 92', + ' - name: Bob', + ' scores[3]: 88, 91, 85', + ].join('\n') + + const fromLines = decodeFromLines(input.split('\n')) + const fromString = decode(input) + + expect(fromLines).toEqual(fromString) + expect(fromLines).toEqual({ + users: [ + { name: 'Alice', scores: [95, 87, 92] }, + { name: 'Bob', scores: [88, 91, 85] }, + ], + }) + }) + + it('should handle tabular arrays', () => { + const input = [ + 'users[3]{name,age,city}:', + ' Alice, 30, NYC', + ' Bob, 25, LA', + ' Charlie, 35, SF', + ].join('\n') + + const fromLines = decodeFromLines(input.split('\n')) + const fromString = decode(input) + + expect(fromLines).toEqual(fromString) + expect(fromLines).toEqual({ + users: [ + { name: 'Alice', age: 30, city: 'NYC' }, + { name: 'Bob', age: 25, city: 'LA' }, + { name: 'Charlie', age: 35, city: 'SF' }, + ], + }) + }) + }) + + describe('streaming equivalence', () => { + // Test that streaming produces same results as non-streaming for various inputs + const testCases = [ + { + name: 'simple object', + input: 'name: Alice\nage: 30', + }, + { + name: 'nested objects', + input: 'user:\n profile:\n name: Alice\n age: 30', + }, + { + name: 'mixed structures', + input: 'name: Alice\nscores[3]: 95, 87, 92\naddress:\n city: NYC\n zip: 10001', + }, + { + name: 'list array with objects', + input: 'users[2]:\n - name: Alice\n 
age: 30\n - name: Bob\n age: 25', + }, + { + name: 'root primitive number', + input: '42', + }, + { + name: 'root primitive string', + input: 'Hello World', + }, + { + name: 'root primitive boolean', + input: 'true', + }, + { + name: 'root primitive null', + input: 'null', + }, + ] + + for (const testCase of testCases) { + it(`should match decode() for: ${testCase.name}`, () => { + const lines = testCase.input.split('\n') + const streamResult = decodeFromLines(lines) + const regularResult = decode(testCase.input) + + expect(streamResult).toEqual(regularResult) + }) + } + }) +})
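
The event stream returned by `decodeStreamSync` can also be consumed directly, without ever materializing the decoded value. A minimal sketch (the input lines and the depth-counting logic are illustrative, not part of the library) that counts tabular rows by tracking nesting depth from `startObject`/`startArray` and `endObject`/`endArray` events:

```ts
import { decodeStreamSync } from '@toon-format/toon'

const lines = [
  'users[2]{id,name}:',
  '  1,Alice',
  '  2,Bob',
]

// Depth 1 = root object, depth 2 = the `users` array, depth 3 = each row object.
let depth = 0
let rows = 0

for (const event of decodeStreamSync(lines)) {
  if (event.type === 'startObject' || event.type === 'startArray') {
    depth++
    if (event.type === 'startObject' && depth === 3) {
      rows++
    }
  }
  else if (event.type === 'endObject' || event.type === 'endArray') {
    depth--
  }
}

// rows === 2
```

When the full value is needed, `decodeFromLines` performs the equivalent work by feeding the same events into `buildValueFromEvents`.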