feat(specs): add delimiter details to array header

2026-01-29 23:34:10 +08:00 · 2025-10-25 12:19:42 +02:00
parent 3aaa7354ab
commit 22e23f6f1b
4 changed files with 49 additions and 44 deletions
--- a/README.md
+++ b/README.md
@@ -234,7 +234,8 @@ const data = {
    id: 123,
    name: 'Ada',
    tags: ['admin', 'ops'],
-    active: true
+    active: true,
    preferences: []
  }
 }
@@ -249,6 +250,7 @@ user:
  name: Ada
  tags[2]: admin,ops
  active: true
  preferences[0]:
 ```
 ## Canonical Formatting Rules
@@ -260,7 +262,9 @@ TOON formatting is deterministic and minimal:
  - `key: value` for primitives (single space after colon).
  - `key:` for nested/empty objects (no trailing space on that line).
 - **Arrays**:
-  - Primitive arrays inline: `key[N]: v1,v2` (no spaces after commas).
+  - Delimiter encoding: Comma delimiters are implicit in array headers (e.g., `tags[3]:`, `items[2]{id,name}:`). Tab and pipe delimiters are explicitly shown in array headers (e.g., `tags[3|]:`, `items[2	]{id	name}:`).
  - Primitive arrays inline: `key[N]: v1,v2` (comma) or `key[N<delim>]: v1<delim>v2` (tab/pipe).
  - Tabular arrays: `key[N]{f1,f2}: …` (comma) or `key[N<delim>]{f1<delim>f2}: …` (tab/pipe).
  - List items: two spaces, hyphen, space (`"  - …"`).
 - **Whitespace invariants**:
  - No trailing spaces at end of any line.
@@ -306,7 +310,7 @@ user:
 ### Arrays
 > [!TIP]
-> TOON includes the array length in brackets (e.g., `items[3]` or `[2]`). This explicit count helps LLMs track the number of elements, reducing errors when generating or validating structured output.
+> TOON includes the array length in brackets (e.g., `items[3]`). When using comma delimiters (default), the delimiter is implicit. When using tab or pipe delimiters, the delimiter is explicitly shown in the header (e.g., `tags[2|]` or `[2	]`). This encoding helps LLMs identify the delimiter and track the number of elements, reducing errors when generating or validating structured output.
 #### Primitive Arrays (Inline)
@@ -454,8 +458,8 @@ String values are quoted when any of the following is true:
 | Starts with `"- "` (list-like) | `"- item"` |
 | Looks like structural token | `"[5]"`, `"{key}"`, `"[3]: x,y"` |
-> [!NOTE]
+> [!IMPORTANT]
-> **Delimiter-aware quoting:** The quoting rules are context-sensitive. When using tab or pipe delimiters, commas don't need quoting. Only the active delimiter triggers quoting – this applies to both array values and object values.
+> **Delimiter-aware quoting:** Unquoted strings never contain `:` or the active delimiter. This makes TOON reliably parseable with simple heuristics: split key/value on the first `: `, and split array values on the delimiter declared in the array header. When using tab or pipe delimiters, commas don't need quoting – only the active delimiter triggers quoting, for both array values and object values.
 #### Examples
@@ -478,9 +482,10 @@ For arrays of objects to use the efficient tabular format, all of the following
 | All elements are objects | No primitives in the array |
 | Identical key sets | No missing or extra keys across rows |
 | Primitive values only | No nested arrays or objects |
 | Header delimiter | Comma is implicit in headers (`[N]{f1,f2}`); tab and pipe are explicit (`[N	]{f1	f2}`, `[N|]{f1|f2}`) |
 | Header key order | Taken from the first object |
-| Header key quoting | Same rules as object keys |
+| Header key quoting | Same rules as object keys; keys containing the active delimiter must be quoted |
-| Row value quoting | Same rules as string values |
+| Row value quoting | Same rules as string values; values containing the active delimiter must be quoted |
 If any condition fails, TOON falls back to list format.
@@ -568,7 +573,7 @@ console.log(encode(data, { delimiter: '\t' }))
 **Output:**
 ```
-items[2]{sku,name,qty,price}:
+items[2	]{sku	name	qty	price}:
  A1	Widget	2	9.99
  B2	Gadget	1	14.5
 ```
@@ -577,6 +582,7 @@ items[2]{sku,name,qty,price}:
 - Tabs are single characters and often tokenize more efficiently than commas.
 - Tabs rarely appear in natural text, reducing the need for quote-escaping.
 - The delimiter is explicitly encoded in the array header, making it self-descriptive.
 **Considerations:**
@@ -594,7 +600,7 @@ console.log(encode(data, { delimiter: '|' }))
 **Output:**
 ```
-items[2]{sku,name,qty,price}:
+items[2|]{sku|name|qty|price}:
  A1|Widget|2|9.99
  B2|Gadget|1|14.5
 ```
--- a/src/encoders.ts
+++ b/src/encoders.ts
@@ -88,13 +88,8 @@ export function encodeArray(
  options: ResolvedEncodeOptions,
 ): void {
  if (value.length === 0) {
-    if (key === undefined) {
+    const header = formatHeader(0, key ? { key, delimiter: options.delimiter } : { delimiter: options.delimiter })
-      writer.push(depth, '[0]:')
+    writer.push(depth, header)
    }
    else {
      const encodedKey = encodeKey(key)
      writer.push(depth, `${encodedKey}[0]:`)
    }
    return
  }
@@ -155,7 +150,7 @@ export function encodeArrayOfArraysAsListItems(
  depth: Depth,
  options: ResolvedEncodeOptions,
 ): void {
-  const header = formatHeader(values.length, prefix ? { key: prefix } : undefined)
+  const header = formatHeader(values.length, prefix ? { key: prefix, delimiter: options.delimiter } : { delimiter: options.delimiter })
  writer.push(depth, header)
  for (const arr of values) {
@@ -167,7 +162,7 @@ export function encodeArrayOfArraysAsListItems(
 }
 export function formatInlineArray(values: readonly JsonPrimitive[], delimiter: string, prefix?: string): string {
-  const header = formatHeader(values.length, prefix ? { key: prefix } : undefined)
+  const header = formatHeader(values.length, prefix ? { key: prefix, delimiter } : { delimiter })
  const joinedValue = joinEncodedValues(values, delimiter)
  // Only add space if there are values
  if (values.length === 0) {
@@ -188,7 +183,7 @@ export function encodeArrayOfObjectsAsTabular(
  depth: Depth,
  options: ResolvedEncodeOptions,
 ): void {
-  const headerStr = formatHeader(rows.length, { key: prefix, fields: header })
+  const headerStr = formatHeader(rows.length, { key: prefix, fields: header, delimiter: options.delimiter })
  writer.push(depth, `${headerStr}`)
  writeTabularRows(rows, header, writer, depth + 1, options)
@@ -259,7 +254,7 @@ export function encodeMixedArrayAsListItems(
  depth: Depth,
  options: ResolvedEncodeOptions,
 ): void {
-  const header = formatHeader(items.length, prefix ? { key: prefix } : undefined)
+  const header = formatHeader(items.length, prefix ? { key: prefix, delimiter: options.delimiter } : { delimiter: options.delimiter })
  writer.push(depth, header)
  for (const item of items) {
@@ -307,7 +302,7 @@ export function encodeObjectAsListItem(obj: JsonObject, writer: LineWriter, dept
      const header = detectTabularHeader(firstValue)
      if (header) {
        // Tabular format for uniform arrays of objects
-        const headerStr = formatHeader(firstValue.length, { key: firstKey, fields: header })
+        const headerStr = formatHeader(firstValue.length, { key: firstKey, fields: header, delimiter: options.delimiter })
        writer.push(depth, `${LIST_ITEM_PREFIX}${headerStr}`)
        writeTabularRows(firstValue, header, writer, depth + 1, options)
      }
--- a/src/primitives.ts
+++ b/src/primitives.ts
@@ -2,6 +2,7 @@ import type { JsonPrimitive } from './types'
 import {
  BACKSLASH,
  COMMA,
  DEFAULT_DELIMITER,
  DOUBLE_QUOTE,
  FALSE_LITERAL,
  LIST_ITEM_MARKER,
@@ -139,10 +140,12 @@ export function formatHeader(
  options?: {
    key?: string
    fields?: readonly string[]
    delimiter?: string
  },
 ): string {
  const key = options?.key
  const fields = options?.fields
  const delimiter = options?.delimiter ?? COMMA
  let header = ''
@@ -150,11 +153,12 @@ export function formatHeader(
    header += encodeKey(key)
  }
-  header += `[${length}]`
+  // Only include delimiter if it's not the default (comma)
  header += `[${length}${delimiter !== DEFAULT_DELIMITER ? delimiter : ''}]`
  if (fields) {
    const quotedFields = fields.map(f => encodeKey(f))
-    header += `{${quotedFields.join(',')}}`
+    header += `{${quotedFields.join(delimiter)}}`
  }
  header += ':'
--- a/test/index.test.ts
+++ b/test/index.test.ts
@@ -597,12 +597,12 @@ describe('delimiter options', () => {
      { delimiter: ',' as const, name: 'comma', expected: 'admin,ops,dev' },
    ])('encodes primitive arrays with $name', ({ delimiter, expected }) => {
      const obj = { tags: ['admin', 'ops', 'dev'] }
-      expect(encode(obj, { delimiter })).toBe(`tags[3]: ${expected}`)
+      expect(encode(obj, { delimiter })).toBe(`tags[3${delimiter !== ',' ? delimiter : ''}]: ${expected}`)
    })
    it.each([
-      { delimiter: '\t' as const, name: 'tab', expected: 'items[2]{sku,qty,price}:\n  A1\t2\t9.99\n  B2\t1\t14.5' },
+      { delimiter: '\t' as const, name: 'tab', expected: 'items[2\t]{sku\tqty\tprice}:\n  A1\t2\t9.99\n  B2\t1\t14.5' },
-      { delimiter: '|' as const, name: 'pipe', expected: 'items[2]{sku,qty,price}:\n  A1|2|9.99\n  B2|1|14.5' },
+      { delimiter: '|' as const, name: 'pipe', expected: 'items[2|]{sku|qty|price}:\n  A1|2|9.99\n  B2|1|14.5' },
    ])('encodes tabular arrays with $name', ({ delimiter, expected }) => {
      const obj = {
        items: [
@@ -614,8 +614,8 @@ describe('delimiter options', () => {
    })
    it.each([
-      { delimiter: '\t' as const, name: 'tab', expected: 'pairs[2]:\n  - [2]: a\tb\n  - [2]: c\td' },
+      { delimiter: '\t' as const, name: 'tab', expected: 'pairs[2\t]:\n  - [2\t]: a\tb\n  - [2\t]: c\td' },
-      { delimiter: '|' as const, name: 'pipe', expected: 'pairs[2]:\n  - [2]: a|b\n  - [2]: c|d' },
+      { delimiter: '|' as const, name: 'pipe', expected: 'pairs[2|]:\n  - [2|]: a|b\n  - [2|]: c|d' },
    ])('encodes nested arrays with $name', ({ delimiter, expected }) => {
      const obj = { pairs: [['a', 'b'], ['c', 'd']] }
      expect(encode(obj, { delimiter })).toBe(expected)
@@ -626,12 +626,12 @@ describe('delimiter options', () => {
      { delimiter: '|' as const, name: 'pipe' },
    ])('encodes root arrays with $name', ({ delimiter }) => {
      const arr = ['x', 'y', 'z']
-      expect(encode(arr, { delimiter })).toBe(`[3]: x${delimiter}y${delimiter}z`)
+      expect(encode(arr, { delimiter })).toBe(`[3${delimiter}]: x${delimiter}y${delimiter}z`)
    })
    it.each([
-      { delimiter: '\t' as const, name: 'tab', expected: '[2]{id}:\n  1\n  2' },
+      { delimiter: '\t' as const, name: 'tab', expected: '[2\t]{id}:\n  1\n  2' },
-      { delimiter: '|' as const, name: 'pipe', expected: '[2]{id}:\n  1\n  2' },
+      { delimiter: '|' as const, name: 'pipe', expected: '[2|]{id}:\n  1\n  2' },
    ])('encodes root arrays of objects with $name', ({ delimiter, expected }) => {
      const arr = [{ id: 1 }, { id: 2 }]
      expect(encode(arr, { delimiter })).toBe(expected)
@@ -643,14 +643,14 @@ describe('delimiter options', () => {
      { delimiter: '\t' as const, name: 'tab', char: '\t', input: ['a', 'b\tc', 'd'], expected: 'a\t"b\\tc"\td' },
      { delimiter: '|' as const, name: 'pipe', char: '|', input: ['a', 'b|c', 'd'], expected: 'a|"b|c"|d' },
    ])('quotes strings containing $name', ({ delimiter, input, expected }) => {
-      expect(encode({ items: input }, { delimiter })).toBe(`items[${input.length}]: ${expected}`)
+      expect(encode({ items: input }, { delimiter })).toBe(`items[${input.length}${delimiter}]: ${expected}`)
    })
    it.each([
      { delimiter: '\t' as const, name: 'tab', input: ['a,b', 'c,d'], expected: 'a,b\tc,d' },
      { delimiter: '|' as const, name: 'pipe', input: ['a,b', 'c,d'], expected: 'a,b|c,d' },
    ])('does not quote commas with $name', ({ delimiter, input, expected }) => {
-      expect(encode({ items: input }, { delimiter })).toBe(`items[${input.length}]: ${expected}`)
+      expect(encode({ items: input }, { delimiter })).toBe(`items[${input.length}${delimiter}]: ${expected}`)
    })
    it('quotes tabular values containing the delimiter', () => {
@@ -661,7 +661,7 @@ describe('delimiter options', () => {
        ],
      }
      expect(encode(obj, { delimiter: ',' })).toBe('items[2]{id,note}:\n  1,"a,b"\n  2,"c,d"')
-      expect(encode(obj, { delimiter: '\t' })).toBe('items[2]{id,note}:\n  1\ta,b\n  2\tc,d')
+      expect(encode(obj, { delimiter: '\t' })).toBe('items[2\t]{id\tnote}:\n  1\ta,b\n  2\tc,d')
    })
    it('does not quote commas in object values with non-comma delimiter', () => {
@@ -670,22 +670,22 @@ describe('delimiter options', () => {
    })
    it('quotes nested array values containing the delimiter', () => {
-      expect(encode({ pairs: [['a', 'b|c']] }, { delimiter: '|' })).toBe('pairs[1]:\n  - [2]: a|"b|c"')
+      expect(encode({ pairs: [['a', 'b|c']] }, { delimiter: '|' })).toBe('pairs[1|]:\n  - [2|]: a|"b|c"')
-      expect(encode({ pairs: [['a', 'b\tc']] }, { delimiter: '\t' })).toBe('pairs[1]:\n  - [2]: a\t"b\\tc"')
+      expect(encode({ pairs: [['a', 'b\tc']] }, { delimiter: '\t' })).toBe('pairs[1\t]:\n  - [2\t]: a\t"b\\tc"')
    })
  })
  describe('delimiter-independent quoting rules', () => {
    it('preserves ambiguity quoting regardless of delimiter', () => {
      const obj = { items: ['true', '42', '-3.14'] }
-      expect(encode(obj, { delimiter: '|' })).toBe('items[3]: "true"|"42"|"-3.14"')
+      expect(encode(obj, { delimiter: '|' })).toBe('items[3|]: "true"|"42"|"-3.14"')
-      expect(encode(obj, { delimiter: '\t' })).toBe('items[3]: "true"\t"42"\t"-3.14"')
+      expect(encode(obj, { delimiter: '\t' })).toBe('items[3\t]: "true"\t"42"\t"-3.14"')
    })
    it('preserves structural quoting regardless of delimiter', () => {
      const obj = { items: ['[5]', '{key}', '- item'] }
-      expect(encode(obj, { delimiter: '|' })).toBe('items[3]: "[5]"|"{key}"|"- item"')
+      expect(encode(obj, { delimiter: '|' })).toBe('items[3|]: "[5]"|"{key}"|"- item"')
-      expect(encode(obj, { delimiter: '\t' })).toBe('items[3]: "[5]"\t"{key}"\t"- item"')
+      expect(encode(obj, { delimiter: '\t' })).toBe('items[3\t]: "[5]"\t"{key}"\t"- item"')
    })
    it('quotes keys containing the delimiter', () => {
@@ -695,13 +695,13 @@ describe('delimiter options', () => {
    it('quotes tabular headers containing the delimiter', () => {
      const obj = { items: [{ 'a|b': 1 }, { 'a|b': 2 }] }
-      expect(encode(obj, { delimiter: '|' })).toBe('items[2]{"a|b"}:\n  1\n  2')
+      expect(encode(obj, { delimiter: '|' })).toBe('items[2|]{"a|b"}:\n  1\n  2')
    })
-    it('always uses commas in tabular headers regardless of delimiter', () => {
+    it('header uses the active delimiter', () => {
      const obj = { items: [{ a: 1, b: 2 }, { a: 3, b: 4 }] }
-      expect(encode(obj, { delimiter: '|' })).toBe('items[2]{a,b}:\n  1|2\n  3|4')
+      expect(encode(obj, { delimiter: '|' })).toBe('items[2|]{a|b}:\n  1|2\n  3|4')
-      expect(encode(obj, { delimiter: '\t' })).toBe('items[2]{a,b}:\n  1\t2\n  3\t4')
+      expect(encode(obj, { delimiter: '\t' })).toBe('items[2\t]{a\tb}:\n  1\t2\n  3\t4')
    })
  })