feat(specs): add delimiter details to array header

This commit is contained in:
Johann Schopplich
2025-10-25 12:19:42 +02:00
parent 3aaa7354ab
commit 22e23f6f1b
4 changed files with 49 additions and 44 deletions

View File

@@ -234,7 +234,8 @@ const data = {
id: 123,
name: 'Ada',
tags: ['admin', 'ops'],
active: true
active: true,
preferences: []
}
}
@@ -249,6 +250,7 @@ user:
name: Ada
tags[2]: admin,ops
active: true
preferences[0]:
```
## Canonical Formatting Rules
@@ -260,7 +262,9 @@ TOON formatting is deterministic and minimal:
- `key: value` for primitives (single space after colon).
- `key:` for nested/empty objects (no trailing space on that line).
- **Arrays**:
- Primitive arrays inline: `key[N]: v1,v2` (no spaces after commas).
- Delimiter encoding: Comma delimiters are implicit in array headers (e.g., `tags[3]:`, `items[2]{id,name}:`). Tab and pipe delimiters are explicitly shown in array headers (e.g., `tags[3|]:`, `items[2 ]{id name}:`).
- Primitive arrays inline: `key[N]: v1,v2` (comma) or `key[N<delim>]: v1<delim>v2` (tab/pipe).
- Tabular arrays: `key[N]{f1,f2}: …` (comma) or `key[N<delim>]{f1<delim>f2}: …` (tab/pipe).
- List items: two spaces, hyphen, space (`"  - …"`).
- **Whitespace invariants**:
- No trailing spaces at end of any line.
@@ -306,7 +310,7 @@ user:
### Arrays
> [!TIP]
> TOON includes the array length in brackets (e.g., `items[3]` or `[2]`). This explicit count helps LLMs track the number of elements, reducing errors when generating or validating structured output.
> TOON includes the array length in brackets (e.g., `items[3]`). When using comma delimiters (default), the delimiter is implicit. When using tab or pipe delimiters, the delimiter is explicitly shown in the header (e.g., `tags[2|]` or `[2 ]`). This encoding helps LLMs identify the delimiter and track the number of elements, reducing errors when generating or validating structured output.
#### Primitive Arrays (Inline)
@@ -454,8 +458,8 @@ String values are quoted when any of the following is true:
| Starts with `"- "` (list-like) | `"- item"` |
| Looks like structural token | `"[5]"`, `"{key}"`, `"[3]: x,y"` |
> [!NOTE]
> **Delimiter-aware quoting:** The quoting rules are context-sensitive. When using tab or pipe delimiters, commas don't need quoting. Only the active delimiter triggers quoting; this applies to both array values and object values.
> [!IMPORTANT]
> **Delimiter-aware quoting:** Unquoted strings never contain `:` or the active delimiter. This makes TOON reliably parseable with simple heuristics: split key/value on first `: `, and split array values on the delimiter declared in the array header. When using tab or pipe delimiters, commas don't need quoting – only the active delimiter triggers quoting for both array values and object values.
#### Examples
@@ -478,9 +482,10 @@ For arrays of objects to use the efficient tabular format, all of the following
| All elements are objects | No primitives in the array |
| Identical key sets | No missing or extra keys across rows |
| Primitive values only | No nested arrays or objects |
| Header delimiter | Comma is implicit in headers (`[N]{f1,f2}`); tab and pipe are explicit (`[N ]{f1 f2}`, `[N|]{f1|f2}`) |
| Header key order | Taken from the first object |
| Header key quoting | Same rules as object keys |
| Row value quoting | Same rules as string values |
| Header key quoting | Same rules as object keys; keys containing the active delimiter must be quoted |
| Row value quoting | Same rules as string values; values containing the active delimiter must be quoted |
If any condition fails, TOON falls back to list format.
@@ -568,7 +573,7 @@ console.log(encode(data, { delimiter: '\t' }))
**Output:**
```
items[2]{sku,name,qty,price}:
items[2 ]{sku name qty price}:
A1 Widget 2 9.99
B2 Gadget 1 14.5
```
@@ -577,6 +582,7 @@ items[2]{sku,name,qty,price}:
- Tabs are single characters and often tokenize more efficiently than commas.
- Tabs rarely appear in natural text, reducing the need for quote-escaping.
- The delimiter is explicitly encoded in the array header, making it self-descriptive.
**Considerations:**
@@ -594,7 +600,7 @@ console.log(encode(data, { delimiter: '|' }))
**Output:**
```
items[2]{sku,name,qty,price}:
items[2|]{sku|name|qty|price}:
A1|Widget|2|9.99
B2|Gadget|1|14.5
```

View File

@@ -88,13 +88,8 @@ export function encodeArray(
options: ResolvedEncodeOptions,
): void {
if (value.length === 0) {
if (key === undefined) {
writer.push(depth, '[0]:')
}
else {
const encodedKey = encodeKey(key)
writer.push(depth, `${encodedKey}[0]:`)
}
const header = formatHeader(0, key ? { key, delimiter: options.delimiter } : { delimiter: options.delimiter })
writer.push(depth, header)
return
}
@@ -155,7 +150,7 @@ export function encodeArrayOfArraysAsListItems(
depth: Depth,
options: ResolvedEncodeOptions,
): void {
const header = formatHeader(values.length, prefix ? { key: prefix } : undefined)
const header = formatHeader(values.length, prefix ? { key: prefix, delimiter: options.delimiter } : { delimiter: options.delimiter })
writer.push(depth, header)
for (const arr of values) {
@@ -167,7 +162,7 @@ export function encodeArrayOfArraysAsListItems(
}
export function formatInlineArray(values: readonly JsonPrimitive[], delimiter: string, prefix?: string): string {
const header = formatHeader(values.length, prefix ? { key: prefix } : undefined)
const header = formatHeader(values.length, prefix ? { key: prefix, delimiter } : { delimiter })
const joinedValue = joinEncodedValues(values, delimiter)
// Only add space if there are values
if (values.length === 0) {
@@ -188,7 +183,7 @@ export function encodeArrayOfObjectsAsTabular(
depth: Depth,
options: ResolvedEncodeOptions,
): void {
const headerStr = formatHeader(rows.length, { key: prefix, fields: header })
const headerStr = formatHeader(rows.length, { key: prefix, fields: header, delimiter: options.delimiter })
writer.push(depth, `${headerStr}`)
writeTabularRows(rows, header, writer, depth + 1, options)
@@ -259,7 +254,7 @@ export function encodeMixedArrayAsListItems(
depth: Depth,
options: ResolvedEncodeOptions,
): void {
const header = formatHeader(items.length, prefix ? { key: prefix } : undefined)
const header = formatHeader(items.length, prefix ? { key: prefix, delimiter: options.delimiter } : { delimiter: options.delimiter })
writer.push(depth, header)
for (const item of items) {
@@ -307,7 +302,7 @@ export function encodeObjectAsListItem(obj: JsonObject, writer: LineWriter, dept
const header = detectTabularHeader(firstValue)
if (header) {
// Tabular format for uniform arrays of objects
const headerStr = formatHeader(firstValue.length, { key: firstKey, fields: header })
const headerStr = formatHeader(firstValue.length, { key: firstKey, fields: header, delimiter: options.delimiter })
writer.push(depth, `${LIST_ITEM_PREFIX}${headerStr}`)
writeTabularRows(firstValue, header, writer, depth + 1, options)
}

View File

@@ -2,6 +2,7 @@ import type { JsonPrimitive } from './types'
import {
BACKSLASH,
COMMA,
DEFAULT_DELIMITER,
DOUBLE_QUOTE,
FALSE_LITERAL,
LIST_ITEM_MARKER,
@@ -139,10 +140,12 @@ export function formatHeader(
options?: {
key?: string
fields?: readonly string[]
delimiter?: string
},
): string {
const key = options?.key
const fields = options?.fields
const delimiter = options?.delimiter ?? COMMA
let header = ''
@@ -150,11 +153,12 @@ export function formatHeader(
header += encodeKey(key)
}
header += `[${length}]`
// Only include delimiter if it's not the default (comma)
header += `[${length}${delimiter !== DEFAULT_DELIMITER ? delimiter : ''}]`
if (fields) {
const quotedFields = fields.map(f => encodeKey(f))
header += `{${quotedFields.join(',')}}`
header += `{${quotedFields.join(delimiter)}}`
}
header += ':'

View File

@@ -597,12 +597,12 @@ describe('delimiter options', () => {
{ delimiter: ',' as const, name: 'comma', expected: 'admin,ops,dev' },
])('encodes primitive arrays with $name', ({ delimiter, expected }) => {
const obj = { tags: ['admin', 'ops', 'dev'] }
expect(encode(obj, { delimiter })).toBe(`tags[3]: ${expected}`)
expect(encode(obj, { delimiter })).toBe(`tags[3${delimiter !== ',' ? delimiter : ''}]: ${expected}`)
})
it.each([
{ delimiter: '\t' as const, name: 'tab', expected: 'items[2]{sku,qty,price}:\n A1\t2\t9.99\n B2\t1\t14.5' },
{ delimiter: '|' as const, name: 'pipe', expected: 'items[2]{sku,qty,price}:\n A1|2|9.99\n B2|1|14.5' },
{ delimiter: '\t' as const, name: 'tab', expected: 'items[2\t]{sku\tqty\tprice}:\n A1\t2\t9.99\n B2\t1\t14.5' },
{ delimiter: '|' as const, name: 'pipe', expected: 'items[2|]{sku|qty|price}:\n A1|2|9.99\n B2|1|14.5' },
])('encodes tabular arrays with $name', ({ delimiter, expected }) => {
const obj = {
items: [
@@ -614,8 +614,8 @@ describe('delimiter options', () => {
})
it.each([
{ delimiter: '\t' as const, name: 'tab', expected: 'pairs[2]:\n - [2]: a\tb\n - [2]: c\td' },
{ delimiter: '|' as const, name: 'pipe', expected: 'pairs[2]:\n - [2]: a|b\n - [2]: c|d' },
{ delimiter: '\t' as const, name: 'tab', expected: 'pairs[2\t]:\n - [2\t]: a\tb\n - [2\t]: c\td' },
{ delimiter: '|' as const, name: 'pipe', expected: 'pairs[2|]:\n - [2|]: a|b\n - [2|]: c|d' },
])('encodes nested arrays with $name', ({ delimiter, expected }) => {
const obj = { pairs: [['a', 'b'], ['c', 'd']] }
expect(encode(obj, { delimiter })).toBe(expected)
@@ -626,12 +626,12 @@ describe('delimiter options', () => {
{ delimiter: '|' as const, name: 'pipe' },
])('encodes root arrays with $name', ({ delimiter }) => {
const arr = ['x', 'y', 'z']
expect(encode(arr, { delimiter })).toBe(`[3]: x${delimiter}y${delimiter}z`)
expect(encode(arr, { delimiter })).toBe(`[3${delimiter}]: x${delimiter}y${delimiter}z`)
})
it.each([
{ delimiter: '\t' as const, name: 'tab', expected: '[2]{id}:\n 1\n 2' },
{ delimiter: '|' as const, name: 'pipe', expected: '[2]{id}:\n 1\n 2' },
{ delimiter: '\t' as const, name: 'tab', expected: '[2\t]{id}:\n 1\n 2' },
{ delimiter: '|' as const, name: 'pipe', expected: '[2|]{id}:\n 1\n 2' },
])('encodes root arrays of objects with $name', ({ delimiter, expected }) => {
const arr = [{ id: 1 }, { id: 2 }]
expect(encode(arr, { delimiter })).toBe(expected)
@@ -643,14 +643,14 @@ describe('delimiter options', () => {
{ delimiter: '\t' as const, name: 'tab', char: '\t', input: ['a', 'b\tc', 'd'], expected: 'a\t"b\\tc"\td' },
{ delimiter: '|' as const, name: 'pipe', char: '|', input: ['a', 'b|c', 'd'], expected: 'a|"b|c"|d' },
])('quotes strings containing $name', ({ delimiter, input, expected }) => {
expect(encode({ items: input }, { delimiter })).toBe(`items[${input.length}]: ${expected}`)
expect(encode({ items: input }, { delimiter })).toBe(`items[${input.length}${delimiter}]: ${expected}`)
})
it.each([
{ delimiter: '\t' as const, name: 'tab', input: ['a,b', 'c,d'], expected: 'a,b\tc,d' },
{ delimiter: '|' as const, name: 'pipe', input: ['a,b', 'c,d'], expected: 'a,b|c,d' },
])('does not quote commas with $name', ({ delimiter, input, expected }) => {
expect(encode({ items: input }, { delimiter })).toBe(`items[${input.length}]: ${expected}`)
expect(encode({ items: input }, { delimiter })).toBe(`items[${input.length}${delimiter}]: ${expected}`)
})
it('quotes tabular values containing the delimiter', () => {
@@ -661,7 +661,7 @@ describe('delimiter options', () => {
],
}
expect(encode(obj, { delimiter: ',' })).toBe('items[2]{id,note}:\n 1,"a,b"\n 2,"c,d"')
expect(encode(obj, { delimiter: '\t' })).toBe('items[2]{id,note}:\n 1\ta,b\n 2\tc,d')
expect(encode(obj, { delimiter: '\t' })).toBe('items[2\t]{id\tnote}:\n 1\ta,b\n 2\tc,d')
})
it('does not quote commas in object values with non-comma delimiter', () => {
@@ -670,22 +670,22 @@ describe('delimiter options', () => {
})
it('quotes nested array values containing the delimiter', () => {
expect(encode({ pairs: [['a', 'b|c']] }, { delimiter: '|' })).toBe('pairs[1]:\n - [2]: a|"b|c"')
expect(encode({ pairs: [['a', 'b\tc']] }, { delimiter: '\t' })).toBe('pairs[1]:\n - [2]: a\t"b\\tc"')
expect(encode({ pairs: [['a', 'b|c']] }, { delimiter: '|' })).toBe('pairs[1|]:\n - [2|]: a|"b|c"')
expect(encode({ pairs: [['a', 'b\tc']] }, { delimiter: '\t' })).toBe('pairs[1\t]:\n - [2\t]: a\t"b\\tc"')
})
})
describe('delimiter-independent quoting rules', () => {
it('preserves ambiguity quoting regardless of delimiter', () => {
const obj = { items: ['true', '42', '-3.14'] }
expect(encode(obj, { delimiter: '|' })).toBe('items[3]: "true"|"42"|"-3.14"')
expect(encode(obj, { delimiter: '\t' })).toBe('items[3]: "true"\t"42"\t"-3.14"')
expect(encode(obj, { delimiter: '|' })).toBe('items[3|]: "true"|"42"|"-3.14"')
expect(encode(obj, { delimiter: '\t' })).toBe('items[3\t]: "true"\t"42"\t"-3.14"')
})
it('preserves structural quoting regardless of delimiter', () => {
const obj = { items: ['[5]', '{key}', '- item'] }
expect(encode(obj, { delimiter: '|' })).toBe('items[3]: "[5]"|"{key}"|"- item"')
expect(encode(obj, { delimiter: '\t' })).toBe('items[3]: "[5]"\t"{key}"\t"- item"')
expect(encode(obj, { delimiter: '|' })).toBe('items[3|]: "[5]"|"{key}"|"- item"')
expect(encode(obj, { delimiter: '\t' })).toBe('items[3\t]: "[5]"\t"{key}"\t"- item"')
})
it('quotes keys containing the delimiter', () => {
@@ -695,13 +695,13 @@ describe('delimiter options', () => {
it('quotes tabular headers containing the delimiter', () => {
const obj = { items: [{ 'a|b': 1 }, { 'a|b': 2 }] }
expect(encode(obj, { delimiter: '|' })).toBe('items[2]{"a|b"}:\n 1\n 2')
expect(encode(obj, { delimiter: '|' })).toBe('items[2|]{"a|b"}:\n 1\n 2')
})
it('always uses commas in tabular headers regardless of delimiter', () => {
it('header uses the active delimiter', () => {
const obj = { items: [{ a: 1, b: 2 }, { a: 3, b: 4 }] }
expect(encode(obj, { delimiter: '|' })).toBe('items[2]{a,b}:\n 1|2\n 3|4')
expect(encode(obj, { delimiter: '\t' })).toBe('items[2]{a,b}:\n 1\t2\n 3\t4')
expect(encode(obj, { delimiter: '|' })).toBe('items[2|]{a|b}:\n 1|2\n 3|4')
expect(encode(obj, { delimiter: '\t' })).toBe('items[2\t]{a\tb}:\n 1\t2\n 3\t4')
})
})