feat(specs): add delimiter details to array header

This commit is contained in:
Johann Schopplich
2025-10-25 12:19:42 +02:00
parent 3aaa7354ab
commit 22e23f6f1b
4 changed files with 49 additions and 44 deletions

View File

@@ -234,7 +234,8 @@ const data = {
id: 123, id: 123,
name: 'Ada', name: 'Ada',
tags: ['admin', 'ops'], tags: ['admin', 'ops'],
active: true active: true,
preferences: []
} }
} }
@@ -249,6 +250,7 @@ user:
name: Ada name: Ada
tags[2]: admin,ops tags[2]: admin,ops
active: true active: true
preferences[0]:
``` ```
## Canonical Formatting Rules ## Canonical Formatting Rules
@@ -260,7 +262,9 @@ TOON formatting is deterministic and minimal:
- `key: value` for primitives (single space after colon). - `key: value` for primitives (single space after colon).
- `key:` for nested/empty objects (no trailing space on that line). - `key:` for nested/empty objects (no trailing space on that line).
- **Arrays**: - **Arrays**:
- Primitive arrays inline: `key[N]: v1,v2` (no spaces after commas). - Delimiter encoding: Comma delimiters are implicit in array headers (e.g., `tags[3]:`, `items[2]{id,name}:`). Tab and pipe delimiters are explicitly shown in array headers (e.g., `tags[3|]:`, `items[2<tab>]{id<tab>name}:`, where `<tab>` stands for a literal tab character).
- Primitive arrays inline: `key[N]: v1,v2` (comma) or `key[N<delim>]: v1<delim>v2` (tab/pipe).
- Tabular arrays: `key[N]{f1,f2}: …` (comma) or `key[N<delim>]{f1<delim>f2}: …` (tab/pipe).
- List items: two spaces, hyphen, space (`" - …"`). - List items: two spaces, hyphen, space (`" - …"`).
- **Whitespace invariants**: - **Whitespace invariants**:
- No trailing spaces at end of any line. - No trailing spaces at end of any line.
@@ -306,7 +310,7 @@ user:
### Arrays ### Arrays
> [!TIP] > [!TIP]
> TOON includes the array length in brackets (e.g., `items[3]` or `[2]`). This explicit count helps LLMs track the number of elements, reducing errors when generating or validating structured output. > TOON includes the array length in brackets (e.g., `items[3]`). When using comma delimiters (default), the delimiter is implicit. When using tab or pipe delimiters, the delimiter is explicitly shown in the header (e.g., `tags[2|]` or `[2<tab>]`, where `<tab>` stands for a literal tab character). This encoding helps LLMs identify the delimiter and track the number of elements, reducing errors when generating or validating structured output.
#### Primitive Arrays (Inline) #### Primitive Arrays (Inline)
@@ -454,8 +458,8 @@ String values are quoted when any of the following is true:
| Starts with `"- "` (list-like) | `"- item"` | | Starts with `"- "` (list-like) | `"- item"` |
| Looks like structural token | `"[5]"`, `"{key}"`, `"[3]: x,y"` | | Looks like structural token | `"[5]"`, `"{key}"`, `"[3]: x,y"` |
> [!NOTE] > [!IMPORTANT]
> **Delimiter-aware quoting:** The quoting rules are context-sensitive. When using tab or pipe delimiters, commas don't need quoting. Only the active delimiter triggers quoting – this applies to both array values and object values. > **Delimiter-aware quoting:** Unquoted strings never contain `:` or the active delimiter. This makes TOON reliably parseable with simple heuristics: split key/value on first `: `, and split array values on the delimiter declared in the array header. When using tab or pipe delimiters, commas don't need quoting – only the active delimiter triggers quoting for both array values and object values.
#### Examples #### Examples
@@ -478,9 +482,10 @@ For arrays of objects to use the efficient tabular format, all of the following
| All elements are objects | No primitives in the array | | All elements are objects | No primitives in the array |
| Identical key sets | No missing or extra keys across rows | | Identical key sets | No missing or extra keys across rows |
| Primitive values only | No nested arrays or objects | | Primitive values only | No nested arrays or objects |
| Header delimiter | Comma is implicit in headers (`[N]{f1,f2}`); tab and pipe are explicit (`[N<tab>]{f1<tab>f2}`, `[N\|]{f1\|f2}`), where `<tab>` stands for a literal tab character |
| Header key order | Taken from the first object | | Header key order | Taken from the first object |
| Header key quoting | Same rules as object keys | | Header key quoting | Same rules as object keys; keys containing the active delimiter must be quoted |
| Row value quoting | Same rules as string values | | Row value quoting | Same rules as string values; values containing the active delimiter must be quoted |
If any condition fails, TOON falls back to list format. If any condition fails, TOON falls back to list format.
@@ -568,7 +573,7 @@ console.log(encode(data, { delimiter: '\t' }))
**Output:** **Output:**
``` ```
items[2]{sku,name,qty,price}: items[2 ]{sku name qty price}:
A1 Widget 2 9.99 A1 Widget 2 9.99
B2 Gadget 1 14.5 B2 Gadget 1 14.5
``` ```
@@ -577,6 +582,7 @@ items[2]{sku,name,qty,price}:
- Tabs are single characters and often tokenize more efficiently than commas. - Tabs are single characters and often tokenize more efficiently than commas.
- Tabs rarely appear in natural text, reducing the need for quote-escaping. - Tabs rarely appear in natural text, reducing the need for quote-escaping.
- The delimiter is explicitly encoded in the array header, making it self-descriptive.
**Considerations:** **Considerations:**
@@ -594,7 +600,7 @@ console.log(encode(data, { delimiter: '|' }))
**Output:** **Output:**
``` ```
items[2]{sku,name,qty,price}: items[2|]{sku|name|qty|price}:
A1|Widget|2|9.99 A1|Widget|2|9.99
B2|Gadget|1|14.5 B2|Gadget|1|14.5
``` ```

View File

@@ -88,13 +88,8 @@ export function encodeArray(
options: ResolvedEncodeOptions, options: ResolvedEncodeOptions,
): void { ): void {
if (value.length === 0) { if (value.length === 0) {
if (key === undefined) { const header = formatHeader(0, key ? { key, delimiter: options.delimiter } : { delimiter: options.delimiter })
writer.push(depth, '[0]:') writer.push(depth, header)
}
else {
const encodedKey = encodeKey(key)
writer.push(depth, `${encodedKey}[0]:`)
}
return return
} }
@@ -155,7 +150,7 @@ export function encodeArrayOfArraysAsListItems(
depth: Depth, depth: Depth,
options: ResolvedEncodeOptions, options: ResolvedEncodeOptions,
): void { ): void {
const header = formatHeader(values.length, prefix ? { key: prefix } : undefined) const header = formatHeader(values.length, prefix ? { key: prefix, delimiter: options.delimiter } : { delimiter: options.delimiter })
writer.push(depth, header) writer.push(depth, header)
for (const arr of values) { for (const arr of values) {
@@ -167,7 +162,7 @@ export function encodeArrayOfArraysAsListItems(
} }
export function formatInlineArray(values: readonly JsonPrimitive[], delimiter: string, prefix?: string): string { export function formatInlineArray(values: readonly JsonPrimitive[], delimiter: string, prefix?: string): string {
const header = formatHeader(values.length, prefix ? { key: prefix } : undefined) const header = formatHeader(values.length, prefix ? { key: prefix, delimiter } : { delimiter })
const joinedValue = joinEncodedValues(values, delimiter) const joinedValue = joinEncodedValues(values, delimiter)
// Only add space if there are values // Only add space if there are values
if (values.length === 0) { if (values.length === 0) {
@@ -188,7 +183,7 @@ export function encodeArrayOfObjectsAsTabular(
depth: Depth, depth: Depth,
options: ResolvedEncodeOptions, options: ResolvedEncodeOptions,
): void { ): void {
const headerStr = formatHeader(rows.length, { key: prefix, fields: header }) const headerStr = formatHeader(rows.length, { key: prefix, fields: header, delimiter: options.delimiter })
writer.push(depth, `${headerStr}`) writer.push(depth, `${headerStr}`)
writeTabularRows(rows, header, writer, depth + 1, options) writeTabularRows(rows, header, writer, depth + 1, options)
@@ -259,7 +254,7 @@ export function encodeMixedArrayAsListItems(
depth: Depth, depth: Depth,
options: ResolvedEncodeOptions, options: ResolvedEncodeOptions,
): void { ): void {
const header = formatHeader(items.length, prefix ? { key: prefix } : undefined) const header = formatHeader(items.length, prefix ? { key: prefix, delimiter: options.delimiter } : { delimiter: options.delimiter })
writer.push(depth, header) writer.push(depth, header)
for (const item of items) { for (const item of items) {
@@ -307,7 +302,7 @@ export function encodeObjectAsListItem(obj: JsonObject, writer: LineWriter, dept
const header = detectTabularHeader(firstValue) const header = detectTabularHeader(firstValue)
if (header) { if (header) {
// Tabular format for uniform arrays of objects // Tabular format for uniform arrays of objects
const headerStr = formatHeader(firstValue.length, { key: firstKey, fields: header }) const headerStr = formatHeader(firstValue.length, { key: firstKey, fields: header, delimiter: options.delimiter })
writer.push(depth, `${LIST_ITEM_PREFIX}${headerStr}`) writer.push(depth, `${LIST_ITEM_PREFIX}${headerStr}`)
writeTabularRows(firstValue, header, writer, depth + 1, options) writeTabularRows(firstValue, header, writer, depth + 1, options)
} }

View File

@@ -2,6 +2,7 @@ import type { JsonPrimitive } from './types'
import { import {
BACKSLASH, BACKSLASH,
COMMA, COMMA,
DEFAULT_DELIMITER,
DOUBLE_QUOTE, DOUBLE_QUOTE,
FALSE_LITERAL, FALSE_LITERAL,
LIST_ITEM_MARKER, LIST_ITEM_MARKER,
@@ -139,10 +140,12 @@ export function formatHeader(
options?: { options?: {
key?: string key?: string
fields?: readonly string[] fields?: readonly string[]
delimiter?: string
}, },
): string { ): string {
const key = options?.key const key = options?.key
const fields = options?.fields const fields = options?.fields
const delimiter = options?.delimiter ?? COMMA
let header = '' let header = ''
@@ -150,11 +153,12 @@ export function formatHeader(
header += encodeKey(key) header += encodeKey(key)
} }
header += `[${length}]` // Only include delimiter if it's not the default (comma)
header += `[${length}${delimiter !== DEFAULT_DELIMITER ? delimiter : ''}]`
if (fields) { if (fields) {
const quotedFields = fields.map(f => encodeKey(f)) const quotedFields = fields.map(f => encodeKey(f))
header += `{${quotedFields.join(',')}}` header += `{${quotedFields.join(delimiter)}}`
} }
header += ':' header += ':'

View File

@@ -597,12 +597,12 @@ describe('delimiter options', () => {
{ delimiter: ',' as const, name: 'comma', expected: 'admin,ops,dev' }, { delimiter: ',' as const, name: 'comma', expected: 'admin,ops,dev' },
])('encodes primitive arrays with $name', ({ delimiter, expected }) => { ])('encodes primitive arrays with $name', ({ delimiter, expected }) => {
const obj = { tags: ['admin', 'ops', 'dev'] } const obj = { tags: ['admin', 'ops', 'dev'] }
expect(encode(obj, { delimiter })).toBe(`tags[3]: ${expected}`) expect(encode(obj, { delimiter })).toBe(`tags[3${delimiter !== ',' ? delimiter : ''}]: ${expected}`)
}) })
it.each([ it.each([
{ delimiter: '\t' as const, name: 'tab', expected: 'items[2]{sku,qty,price}:\n A1\t2\t9.99\n B2\t1\t14.5' }, { delimiter: '\t' as const, name: 'tab', expected: 'items[2\t]{sku\tqty\tprice}:\n A1\t2\t9.99\n B2\t1\t14.5' },
{ delimiter: '|' as const, name: 'pipe', expected: 'items[2]{sku,qty,price}:\n A1|2|9.99\n B2|1|14.5' }, { delimiter: '|' as const, name: 'pipe', expected: 'items[2|]{sku|qty|price}:\n A1|2|9.99\n B2|1|14.5' },
])('encodes tabular arrays with $name', ({ delimiter, expected }) => { ])('encodes tabular arrays with $name', ({ delimiter, expected }) => {
const obj = { const obj = {
items: [ items: [
@@ -614,8 +614,8 @@ describe('delimiter options', () => {
}) })
it.each([ it.each([
{ delimiter: '\t' as const, name: 'tab', expected: 'pairs[2]:\n - [2]: a\tb\n - [2]: c\td' }, { delimiter: '\t' as const, name: 'tab', expected: 'pairs[2\t]:\n - [2\t]: a\tb\n - [2\t]: c\td' },
{ delimiter: '|' as const, name: 'pipe', expected: 'pairs[2]:\n - [2]: a|b\n - [2]: c|d' }, { delimiter: '|' as const, name: 'pipe', expected: 'pairs[2|]:\n - [2|]: a|b\n - [2|]: c|d' },
])('encodes nested arrays with $name', ({ delimiter, expected }) => { ])('encodes nested arrays with $name', ({ delimiter, expected }) => {
const obj = { pairs: [['a', 'b'], ['c', 'd']] } const obj = { pairs: [['a', 'b'], ['c', 'd']] }
expect(encode(obj, { delimiter })).toBe(expected) expect(encode(obj, { delimiter })).toBe(expected)
@@ -626,12 +626,12 @@ describe('delimiter options', () => {
{ delimiter: '|' as const, name: 'pipe' }, { delimiter: '|' as const, name: 'pipe' },
])('encodes root arrays with $name', ({ delimiter }) => { ])('encodes root arrays with $name', ({ delimiter }) => {
const arr = ['x', 'y', 'z'] const arr = ['x', 'y', 'z']
expect(encode(arr, { delimiter })).toBe(`[3]: x${delimiter}y${delimiter}z`) expect(encode(arr, { delimiter })).toBe(`[3${delimiter}]: x${delimiter}y${delimiter}z`)
}) })
it.each([ it.each([
{ delimiter: '\t' as const, name: 'tab', expected: '[2]{id}:\n 1\n 2' }, { delimiter: '\t' as const, name: 'tab', expected: '[2\t]{id}:\n 1\n 2' },
{ delimiter: '|' as const, name: 'pipe', expected: '[2]{id}:\n 1\n 2' }, { delimiter: '|' as const, name: 'pipe', expected: '[2|]{id}:\n 1\n 2' },
])('encodes root arrays of objects with $name', ({ delimiter, expected }) => { ])('encodes root arrays of objects with $name', ({ delimiter, expected }) => {
const arr = [{ id: 1 }, { id: 2 }] const arr = [{ id: 1 }, { id: 2 }]
expect(encode(arr, { delimiter })).toBe(expected) expect(encode(arr, { delimiter })).toBe(expected)
@@ -643,14 +643,14 @@ describe('delimiter options', () => {
{ delimiter: '\t' as const, name: 'tab', char: '\t', input: ['a', 'b\tc', 'd'], expected: 'a\t"b\\tc"\td' }, { delimiter: '\t' as const, name: 'tab', char: '\t', input: ['a', 'b\tc', 'd'], expected: 'a\t"b\\tc"\td' },
{ delimiter: '|' as const, name: 'pipe', char: '|', input: ['a', 'b|c', 'd'], expected: 'a|"b|c"|d' }, { delimiter: '|' as const, name: 'pipe', char: '|', input: ['a', 'b|c', 'd'], expected: 'a|"b|c"|d' },
])('quotes strings containing $name', ({ delimiter, input, expected }) => { ])('quotes strings containing $name', ({ delimiter, input, expected }) => {
expect(encode({ items: input }, { delimiter })).toBe(`items[${input.length}]: ${expected}`) expect(encode({ items: input }, { delimiter })).toBe(`items[${input.length}${delimiter}]: ${expected}`)
}) })
it.each([ it.each([
{ delimiter: '\t' as const, name: 'tab', input: ['a,b', 'c,d'], expected: 'a,b\tc,d' }, { delimiter: '\t' as const, name: 'tab', input: ['a,b', 'c,d'], expected: 'a,b\tc,d' },
{ delimiter: '|' as const, name: 'pipe', input: ['a,b', 'c,d'], expected: 'a,b|c,d' }, { delimiter: '|' as const, name: 'pipe', input: ['a,b', 'c,d'], expected: 'a,b|c,d' },
])('does not quote commas with $name', ({ delimiter, input, expected }) => { ])('does not quote commas with $name', ({ delimiter, input, expected }) => {
expect(encode({ items: input }, { delimiter })).toBe(`items[${input.length}]: ${expected}`) expect(encode({ items: input }, { delimiter })).toBe(`items[${input.length}${delimiter}]: ${expected}`)
}) })
it('quotes tabular values containing the delimiter', () => { it('quotes tabular values containing the delimiter', () => {
@@ -661,7 +661,7 @@ describe('delimiter options', () => {
], ],
} }
expect(encode(obj, { delimiter: ',' })).toBe('items[2]{id,note}:\n 1,"a,b"\n 2,"c,d"') expect(encode(obj, { delimiter: ',' })).toBe('items[2]{id,note}:\n 1,"a,b"\n 2,"c,d"')
expect(encode(obj, { delimiter: '\t' })).toBe('items[2]{id,note}:\n 1\ta,b\n 2\tc,d') expect(encode(obj, { delimiter: '\t' })).toBe('items[2\t]{id\tnote}:\n 1\ta,b\n 2\tc,d')
}) })
it('does not quote commas in object values with non-comma delimiter', () => { it('does not quote commas in object values with non-comma delimiter', () => {
@@ -670,22 +670,22 @@ describe('delimiter options', () => {
}) })
it('quotes nested array values containing the delimiter', () => { it('quotes nested array values containing the delimiter', () => {
expect(encode({ pairs: [['a', 'b|c']] }, { delimiter: '|' })).toBe('pairs[1]:\n - [2]: a|"b|c"') expect(encode({ pairs: [['a', 'b|c']] }, { delimiter: '|' })).toBe('pairs[1|]:\n - [2|]: a|"b|c"')
expect(encode({ pairs: [['a', 'b\tc']] }, { delimiter: '\t' })).toBe('pairs[1]:\n - [2]: a\t"b\\tc"') expect(encode({ pairs: [['a', 'b\tc']] }, { delimiter: '\t' })).toBe('pairs[1\t]:\n - [2\t]: a\t"b\\tc"')
}) })
}) })
describe('delimiter-independent quoting rules', () => { describe('delimiter-independent quoting rules', () => {
it('preserves ambiguity quoting regardless of delimiter', () => { it('preserves ambiguity quoting regardless of delimiter', () => {
const obj = { items: ['true', '42', '-3.14'] } const obj = { items: ['true', '42', '-3.14'] }
expect(encode(obj, { delimiter: '|' })).toBe('items[3]: "true"|"42"|"-3.14"') expect(encode(obj, { delimiter: '|' })).toBe('items[3|]: "true"|"42"|"-3.14"')
expect(encode(obj, { delimiter: '\t' })).toBe('items[3]: "true"\t"42"\t"-3.14"') expect(encode(obj, { delimiter: '\t' })).toBe('items[3\t]: "true"\t"42"\t"-3.14"')
}) })
it('preserves structural quoting regardless of delimiter', () => { it('preserves structural quoting regardless of delimiter', () => {
const obj = { items: ['[5]', '{key}', '- item'] } const obj = { items: ['[5]', '{key}', '- item'] }
expect(encode(obj, { delimiter: '|' })).toBe('items[3]: "[5]"|"{key}"|"- item"') expect(encode(obj, { delimiter: '|' })).toBe('items[3|]: "[5]"|"{key}"|"- item"')
expect(encode(obj, { delimiter: '\t' })).toBe('items[3]: "[5]"\t"{key}"\t"- item"') expect(encode(obj, { delimiter: '\t' })).toBe('items[3\t]: "[5]"\t"{key}"\t"- item"')
}) })
it('quotes keys containing the delimiter', () => { it('quotes keys containing the delimiter', () => {
@@ -695,13 +695,13 @@ describe('delimiter options', () => {
it('quotes tabular headers containing the delimiter', () => { it('quotes tabular headers containing the delimiter', () => {
const obj = { items: [{ 'a|b': 1 }, { 'a|b': 2 }] } const obj = { items: [{ 'a|b': 1 }, { 'a|b': 2 }] }
expect(encode(obj, { delimiter: '|' })).toBe('items[2]{"a|b"}:\n 1\n 2') expect(encode(obj, { delimiter: '|' })).toBe('items[2|]{"a|b"}:\n 1\n 2')
}) })
it('always uses commas in tabular headers regardless of delimiter', () => { it('header uses the active delimiter', () => {
const obj = { items: [{ a: 1, b: 2 }, { a: 3, b: 4 }] } const obj = { items: [{ a: 1, b: 2 }, { a: 3, b: 4 }] }
expect(encode(obj, { delimiter: '|' })).toBe('items[2]{a,b}:\n 1|2\n 3|4') expect(encode(obj, { delimiter: '|' })).toBe('items[2|]{a|b}:\n 1|2\n 3|4')
expect(encode(obj, { delimiter: '\t' })).toBe('items[2]{a,b}:\n 1\t2\n 3\t4') expect(encode(obj, { delimiter: '\t' })).toBe('items[2\t]{a\tb}:\n 1\t2\n 3\t4')
}) })
}) })