From ee31be3bdc6de7a84255ebbf03b9fedea473a551 Mon Sep 17 00:00:00 2001 From: Johann Schopplich Date: Wed, 29 Oct 2025 13:05:42 +0100 Subject: [PATCH] test: add case for unquoted invalid numeric formats as strings --- src/decode/decoders.ts | 36 ++-------- src/decode/parser.ts | 97 ++------------------------- src/decode/utils.ts | 96 --------------------------- src/decode/validation.ts | 52 +++++++-------- src/encode/encoders.ts | 25 +------ src/encode/normalize.ts | 7 +- src/encode/primitives.ts | 88 +------------------------ src/index.ts | 8 +-- src/shared/literal-utils.ts | 28 ++++++++ src/shared/string-utils.ts | 127 ++++++++++++++++++++++++++++++++++++ src/shared/validation.ts | 84 ++++++++++++++++++++++++ test/decode.test.ts | 8 +++ 12 files changed, 292 insertions(+), 364 deletions(-) delete mode 100644 src/decode/utils.ts create mode 100644 src/shared/literal-utils.ts create mode 100644 src/shared/string-utils.ts create mode 100644 src/shared/validation.ts diff --git a/src/decode/decoders.ts b/src/decode/decoders.ts index 878e6b3..2e72739 100644 --- a/src/decode/decoders.ts +++ b/src/decode/decoders.ts @@ -1,35 +1,9 @@ -import type { - ArrayHeaderInfo, - Delimiter, - Depth, - JsonArray, - JsonObject, - JsonPrimitive, - JsonValue, - ParsedLine, - ResolvedDecodeOptions, -} from '../types' +import type { ArrayHeaderInfo, Delimiter, Depth, JsonArray, JsonObject, JsonPrimitive, JsonValue, ParsedLine, ResolvedDecodeOptions } from '../types' import type { LineCursor } from './scanner' -import { - COLON, - DEFAULT_DELIMITER, - LIST_ITEM_PREFIX, -} from '../constants' -import { - isArrayHeaderAfterHyphen, - isObjectFirstFieldAfterHyphen, - mapRowValuesToPrimitives, - parseArrayHeaderLine, - parseDelimitedValues, - parseKeyToken, - parsePrimitiveToken, -} from './parser' -import { findClosingQuote } from './utils' -import { - assertExpectedCount, - validateNoExtraListItems, - validateNoExtraTabularRows, -} from './validation' +import { COLON, DEFAULT_DELIMITER, LIST_ITEM_PREFIX } from '../constants' +import { findClosingQuote } from '../shared/string-utils' +import { isArrayHeaderAfterHyphen, isObjectFirstFieldAfterHyphen, mapRowValuesToPrimitives, parseArrayHeaderLine, parseDelimitedValues, parseKeyToken, parsePrimitiveToken } from './parser' +import { assertExpectedCount, validateNoExtraListItems, validateNoExtraTabularRows } from './validation' // #region Entry decoding diff --git a/src/decode/parser.ts b/src/decode/parser.ts index ef6da09..e1f1c39 100644 --- a/src/decode/parser.ts +++ b/src/decode/parser.ts @@ -1,27 +1,7 @@ -import type { - ArrayHeaderInfo, - Delimiter, - JsonPrimitive, -} from '../types' -import { - BACKSLASH, - CARRIAGE_RETURN, - CLOSE_BRACE, - CLOSE_BRACKET, - COLON, - DELIMITERS, - DOUBLE_QUOTE, - FALSE_LITERAL, - HASH, - NEWLINE, - NULL_LITERAL, - OPEN_BRACE, - OPEN_BRACKET, - PIPE, - TAB, - TRUE_LITERAL, -} from '../constants' -import { findClosingQuote, hasUnquotedChar } from './utils' +import type { ArrayHeaderInfo, Delimiter, JsonPrimitive } from '../types' +import { BACKSLASH, CLOSE_BRACE, CLOSE_BRACKET, COLON, DELIMITERS, DOUBLE_QUOTE, FALSE_LITERAL, HASH, NULL_LITERAL, OPEN_BRACE, OPEN_BRACKET, PIPE, TAB, TRUE_LITERAL } from '../constants' +import { isBooleanOrNullLiteral, isNumericLiteral } from '../shared/literal-utils' +import { findClosingQuote, findUnquotedChar, unescapeString } from '../shared/string-utils' // #region Array header parsing @@ -224,24 +204,6 @@ export function parsePrimitiveToken(token: string): JsonPrimitive { return trimmed } -export function isBooleanOrNullLiteral(token: string): boolean { - return token === TRUE_LITERAL || token === FALSE_LITERAL || token === NULL_LITERAL -} - -export function isNumericLiteral(token: string): boolean { - if (!token) - return false - - // Must not have leading zeros (except for "0" itself or decimals like "0.5") - if (token.length > 1 && token[0] === '0' && token[1] !== '.') { - return false - } - - // Check if it's a valid number - const num = Number(token) - return !Number.isNaN(num) && Number.isFinite(num) -} - export function parseStringLiteral(token: string): string { const trimmed = token.trim() @@ -265,53 +227,6 @@ export function parseStringLiteral(token: string): string { return trimmed } -export function unescapeString(value: string): string { - let result = '' - let i = 0 - - while (i < value.length) { - if (value[i] === BACKSLASH) { - if (i + 1 >= value.length) { - throw new SyntaxError('Invalid escape sequence: backslash at end of string') - } - - const next = value[i + 1] - if (next === 'n') { - result += NEWLINE - i += 2 - continue - } - if (next === 't') { - result += TAB - i += 2 - continue - } - if (next === 'r') { - result += CARRIAGE_RETURN - i += 2 - continue - } - if (next === BACKSLASH) { - result += BACKSLASH - i += 2 - continue - } - if (next === DOUBLE_QUOTE) { - result += DOUBLE_QUOTE - i += 2 - continue - } - - throw new SyntaxError(`Invalid escape sequence: \\${next}`) - } - - result += value[i] - i++ - } - - return result -} - export function parseUnquotedKey(content: string, start: number): { key: string, end: number } { let end = start while (end < content.length && content[end] !== COLON) { @@ -367,11 +282,11 @@ export function parseKeyToken(content: string, start: number): { key: string, en // #region Array content detection helpers export function isArrayHeaderAfterHyphen(content: string): boolean { - return content.trim().startsWith(OPEN_BRACKET) && hasUnquotedChar(content, COLON) + return content.trim().startsWith(OPEN_BRACKET) && findUnquotedChar(content, COLON) !== -1 } export function isObjectFirstFieldAfterHyphen(content: string): boolean { - return hasUnquotedChar(content, COLON) + return findUnquotedChar(content, COLON) !== -1 } // #endregion diff --git a/src/decode/utils.ts b/src/decode/utils.ts deleted file mode 100644 index 7746cb5..0000000 --- a/src/decode/utils.ts +++ /dev/null @@ -1,96 +0,0 @@ -import { BACKSLASH, DOUBLE_QUOTE } from '../constants' - -/** - * Finds the index of the closing double quote in a string, accounting for escape sequences. - * - * @param content The string to search in - * @param start The index of the opening quote - * @returns The index of the closing quote, or -1 if not found - */ -export function findClosingQuote(content: string, start: number): number { - let i = start + 1 - while (i < content.length) { - if (content[i] === BACKSLASH && i + 1 < content.length) { - // Skip escaped character - i += 2 - continue - } - if (content[i] === DOUBLE_QUOTE) { - return i - } - i++ - } - return -1 // Not found -} - -/** - * Checks if a string contains a specific character outside of quoted sections. - * - * @param content The string to check - * @param char The character to look for - * @returns true if the character exists outside quotes, false otherwise - */ -export function hasUnquotedChar(content: string, char: string): boolean { - return findUnquotedChar(content, char) !== -1 -} - -/** - * Finds the index of a specific character outside of quoted sections. - * - * @param content The string to search in - * @param char The character to look for - * @param start Optional starting index (defaults to 0) - * @returns The index of the character, or -1 if not found outside quotes - */ -export function findUnquotedChar(content: string, char: string, start = 0): number { - let inQuotes = false - let i = start - - while (i < content.length) { - if (content[i] === BACKSLASH && i + 1 < content.length && inQuotes) { - // Skip escaped character - i += 2 - continue - } - - if (content[i] === DOUBLE_QUOTE) { - inQuotes = !inQuotes - i++ - continue - } - - if (content[i] === char && !inQuotes) { - return i - } - - i++ - } - - return -1 -} - -/** - * Checks if a string starts and ends with double quotes. - * - * @param content The string to check - * @returns true if the string is quoted, false otherwise - */ -export function isQuotedString(content: string): boolean { - const trimmed = content.trim() - return trimmed.startsWith(DOUBLE_QUOTE) && trimmed.endsWith(DOUBLE_QUOTE) && trimmed.length >= 2 -} - -/** - * Skips whitespace characters starting from a given index. - * - * @param content The string to process - * @param start The starting index - * @returns The index of the first non-whitespace character, or content.length if all whitespace - */ -export function skipWhitespace(content: string, start: number): number { - let i = start - while (i < content.length && /\s/.test(content[i]!)) { - i++ - } - return i -} diff --git a/src/decode/validation.ts b/src/decode/validation.ts index 0a35b7b..cc8bad4 100644 --- a/src/decode/validation.ts +++ b/src/decode/validation.ts @@ -7,7 +7,7 @@ import { COLON, LIST_ITEM_PREFIX } from '../constants' * * @param actual The actual count * @param expected The expected count - * @param itemType The type of items being counted (e.g., 'list array items', 'tabular rows') + * @param itemType The type of items being counted (e.g., `list array items`, `tabular rows`) * @param options Decode options * @throws RangeError if counts don't match in strict mode */ @@ -44,31 +44,6 @@ export function validateNoExtraListItems( } } -/** - * Checks if a line represents a data row (as opposed to a key-value pair) in a tabular array. - * - * @param content The line content - * @param delimiter The delimiter used in the table - * @returns true if the line is a data row, false if it's a key-value pair - */ -export function isDataRow(content: string, delimiter: Delimiter): boolean { - const colonPos = content.indexOf(COLON) - const delimiterPos = content.indexOf(delimiter) - - // No colon = definitely a data row - if (colonPos === -1) { - return true - } - - // Has delimiter and it comes before colon = data row - if (delimiterPos !== -1 && delimiterPos < colonPos) { - return true - } - - // Colon before delimiter or no delimiter = key-value pair - return false -} - /** * Validates that there are no extra tabular rows beyond the expected count. * @@ -95,3 +70,28 @@ export function validateNoExtraTabularRows( throw new RangeError(`Expected ${header.length} tabular rows, but found more`) } } + +/** + * Checks if a line represents a data row (as opposed to a key-value pair) in a tabular array. + * + * @param content The line content + * @param delimiter The delimiter used in the table + * @returns true if the line is a data row, false if it's a key-value pair + */ +function isDataRow(content: string, delimiter: Delimiter): boolean { + const colonPos = content.indexOf(COLON) + const delimiterPos = content.indexOf(delimiter) + + // No colon = definitely a data row + if (colonPos === -1) { + return true + } + + // Has delimiter and it comes before colon = data row + if (delimiterPos !== -1 && delimiterPos < colonPos) { + return true + } + + // Colon before delimiter or no delimiter = key-value pair + return false +} diff --git a/src/encode/encoders.ts b/src/encode/encoders.ts index a3f1a13..e517f22 100644 --- a/src/encode/encoders.ts +++ b/src/encode/encoders.ts @@ -1,26 +1,7 @@ -import type { - Depth, - JsonArray, - JsonObject, - JsonPrimitive, - JsonValue, - ResolvedEncodeOptions, -} from '../types' +import type { Depth, JsonArray, JsonObject, JsonPrimitive, JsonValue, ResolvedEncodeOptions } from '../types' import { LIST_ITEM_MARKER } from '../constants' -import { - isArrayOfArrays, - isArrayOfObjects, - isArrayOfPrimitives, - isJsonArray, - isJsonObject, - isJsonPrimitive, -} from './normalize' -import { - encodeAndJoinPrimitives, - encodeKey, - encodePrimitive, - formatHeader, -} from './primitives' +import { isArrayOfArrays, isArrayOfObjects, isArrayOfPrimitives, isJsonArray, isJsonObject, isJsonPrimitive } from './normalize' +import { encodeAndJoinPrimitives, encodeKey, encodePrimitive, formatHeader } from './primitives' import { LineWriter } from './writer' // #region Encode normalized JsonValue diff --git a/src/encode/normalize.ts b/src/encode/normalize.ts index 3d980c8..a4aff86 100644 --- a/src/encode/normalize.ts +++ b/src/encode/normalize.ts @@ -1,9 +1,4 @@ -import type { - JsonArray, - JsonObject, - JsonPrimitive, - JsonValue, -} from '../types' +import type { JsonArray, JsonObject, JsonPrimitive, JsonValue } from '../types' // #region Normalization (unknown → JsonValue) diff --git a/src/encode/primitives.ts b/src/encode/primitives.ts index dca2958..d4c2a62 100644 --- a/src/encode/primitives.ts +++ b/src/encode/primitives.ts @@ -1,14 +1,7 @@ import type { JsonPrimitive } from '../types' -import { - BACKSLASH, - COMMA, - DEFAULT_DELIMITER, - DOUBLE_QUOTE, - FALSE_LITERAL, - LIST_ITEM_MARKER, - NULL_LITERAL, - TRUE_LITERAL, -} from '../constants' +import { COMMA, DEFAULT_DELIMITER, DOUBLE_QUOTE, NULL_LITERAL } from '../constants' +import { escapeString } from '../shared/string-utils' +import { isSafeUnquoted, isValidUnquotedKey } from '../shared/validation' // #region Primitive encoding @@ -36,74 +29,6 @@ export function encodeStringLiteral(value: string, delimiter: string = COMMA): s return `${DOUBLE_QUOTE}${escapeString(value)}${DOUBLE_QUOTE}` } -export function escapeString(value: string): string { - return value - .replace(/\\/g, `${BACKSLASH}${BACKSLASH}`) - .replace(/"/g, `${BACKSLASH}${DOUBLE_QUOTE}`) - .replace(/\n/g, `${BACKSLASH}n`) - .replace(/\r/g, `${BACKSLASH}r`) - .replace(/\t/g, `${BACKSLASH}t`) -} - -export function isSafeUnquoted(value: string, delimiter: string = COMMA): boolean { - if (!value) { - return false - } - - if (isPaddedWithWhitespace(value)) { - return false - } - - if (value === TRUE_LITERAL || value === FALSE_LITERAL || value === NULL_LITERAL) { - return false - } - - if (isNumericLike(value)) { - return false - } - - // Check for colon (always structural) - if (value.includes(':')) { - return false - } - - // Check for quotes and backslash (always need escaping) - if (value.includes('"') || value.includes('\\')) { - return false - } - - // Check for brackets and braces (always structural) - if (/[[\]{}]/.test(value)) { - return false - } - - // Check for control characters (newline, carriage return, tab - always need quoting/escaping) - if (/[\n\r\t]/.test(value)) { - return false - } - - // Check for the active delimiter - if (value.includes(delimiter)) { - return false - } - - // Check for hyphen at start (list marker) - if (value.startsWith(LIST_ITEM_MARKER)) { - return false - } - - return true -} - -function isNumericLike(value: string): boolean { - // Match numbers like: 42, -3.14, 1e-6, 05, etc. - return /^-?\d+(?:\.\d+)?(?:e[+-]?\d+)?$/i.test(value) || /^0\d+$/.test(value) -} - -function isPaddedWithWhitespace(value: string): boolean { - return value !== value.trim() -} - // #endregion // #region Key encoding @@ -116,10 +41,6 @@ export function encodeKey(key: string): string { return `${DOUBLE_QUOTE}${escapeString(key)}${DOUBLE_QUOTE}` } -function isValidUnquotedKey(key: string): boolean { - return /^[A-Z_][\w.]*$/i.test(key) -} - // #endregion // #region Value joining @@ -132,9 +53,6 @@ export function encodeAndJoinPrimitives(values: readonly JsonPrimitive[], delimi // #region Header formatters -/** - * Header formatter for arrays and tables with optional key prefix and field names - */ export function formatHeader( length: number, options?: { diff --git a/src/index.ts b/src/index.ts index 626051e..7649c5e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,10 +1,4 @@ -import type { - DecodeOptions, - EncodeOptions, - JsonValue, - ResolvedDecodeOptions, - ResolvedEncodeOptions, -} from './types' +import type { DecodeOptions, EncodeOptions, JsonValue, ResolvedDecodeOptions, ResolvedEncodeOptions } from './types' import { DEFAULT_DELIMITER } from './constants' import { decodeValueFromLines } from './decode/decoders' import { LineCursor, toParsedLines } from './decode/scanner' diff --git a/src/shared/literal-utils.ts b/src/shared/literal-utils.ts new file mode 100644 index 0000000..201ea53 --- /dev/null +++ b/src/shared/literal-utils.ts @@ -0,0 +1,28 @@ +import { FALSE_LITERAL, NULL_LITERAL, TRUE_LITERAL } from '../constants' + +/** + * Checks if a token is a boolean or null literal (`true`, `false`, `null`). + */ +export function isBooleanOrNullLiteral(token: string): boolean { + return token === TRUE_LITERAL || token === FALSE_LITERAL || token === NULL_LITERAL +} + +/** + * Checks if a token represents a valid numeric literal. + * + * @remarks + * Rejects numbers with leading zeros (except `"0"` itself or decimals like `"0.5"`). + */ +export function isNumericLiteral(token: string): boolean { + if (!token) + return false + + // Must not have leading zeros (except for `"0"` itself or decimals like `"0.5"`) + if (token.length > 1 && token[0] === '0' && token[1] !== '.') { + return false + } + + // Check if it's a valid number + const num = Number(token) + return !Number.isNaN(num) && Number.isFinite(num) +} diff --git a/src/shared/string-utils.ts b/src/shared/string-utils.ts new file mode 100644 index 0000000..04c84c3 --- /dev/null +++ b/src/shared/string-utils.ts @@ -0,0 +1,127 @@ +import { BACKSLASH, CARRIAGE_RETURN, DOUBLE_QUOTE, NEWLINE, TAB } from '../constants' + +/** + * Escapes special characters in a string for encoding. + * + * @remarks + * Handles backslashes, quotes, newlines, carriage returns, and tabs. + */ +export function escapeString(value: string): string { + return value + .replace(/\\/g, `${BACKSLASH}${BACKSLASH}`) + .replace(/"/g, `${BACKSLASH}${DOUBLE_QUOTE}`) + .replace(/\n/g, `${BACKSLASH}n`) + .replace(/\r/g, `${BACKSLASH}r`) + .replace(/\t/g, `${BACKSLASH}t`) +} + +/** + * Unescapes a string by processing escape sequences. + * + * @remarks + * Handles `\n`, `\t`, `\r`, `\\`, and `\"` escape sequences. + */ +export function unescapeString(value: string): string { + let result = '' + let i = 0 + + while (i < value.length) { + if (value[i] === BACKSLASH) { + if (i + 1 >= value.length) { + throw new SyntaxError('Invalid escape sequence: backslash at end of string') + } + + const next = value[i + 1] + if (next === 'n') { + result += NEWLINE + i += 2 + continue + } + if (next === 't') { + result += TAB + i += 2 + continue + } + if (next === 'r') { + result += CARRIAGE_RETURN + i += 2 + continue + } + if (next === BACKSLASH) { + result += BACKSLASH + i += 2 + continue + } + if (next === DOUBLE_QUOTE) { + result += DOUBLE_QUOTE + i += 2 + continue + } + + throw new SyntaxError(`Invalid escape sequence: \\${next}`) + } + + result += value[i] + i++ + } + + return result +} + +/** + * Finds the index of the closing double quote in a string, accounting for escape sequences. + * + * @param content The string to search in + * @param start The index of the opening quote + * @returns The index of the closing quote, or -1 if not found + */ +export function findClosingQuote(content: string, start: number): number { + let i = start + 1 + while (i < content.length) { + if (content[i] === BACKSLASH && i + 1 < content.length) { + // Skip escaped character + i += 2 + continue + } + if (content[i] === DOUBLE_QUOTE) { + return i + } + i++ + } + return -1 // Not found +} + +/** + * Finds the index of a specific character outside of quoted sections. + * + * @param content The string to search in + * @param char The character to look for + * @param start Optional starting index (defaults to 0) + * @returns The index of the character, or -1 if not found outside quotes + */ +export function findUnquotedChar(content: string, char: string, start = 0): number { + let inQuotes = false + let i = start + + while (i < content.length) { + if (content[i] === BACKSLASH && i + 1 < content.length && inQuotes) { + // Skip escaped character + i += 2 + continue + } + + if (content[i] === DOUBLE_QUOTE) { + inQuotes = !inQuotes + i++ + continue + } + + if (content[i] === char && !inQuotes) { + return i + } + + i++ + } + + return -1 +} diff --git a/src/shared/validation.ts b/src/shared/validation.ts new file mode 100644 index 0000000..22cfefc --- /dev/null +++ b/src/shared/validation.ts @@ -0,0 +1,84 @@ +import { COMMA, LIST_ITEM_MARKER } from '../constants' +import { isBooleanOrNullLiteral } from './literal-utils' + +/** + * Checks if a key can be used without quotes. + * + * @remarks + * Valid unquoted keys must start with a letter or underscore, + * followed by letters, digits, underscores, or dots. + */ +export function isValidUnquotedKey(key: string): boolean { + return /^[A-Z_][\w.]*$/i.test(key) +} + +/** + * Determines if a string value can be safely encoded without quotes. + * + * @remarks + * A string needs quoting if it: + * - Is empty + * - Has leading or trailing whitespace + * - Could be confused with a literal (boolean, null, number) + * - Contains structural characters (colons, brackets, braces) + * - Contains quotes or backslashes (need escaping) + * - Contains control characters (newlines, tabs, etc.) + * - Contains the active delimiter + * - Starts with a list marker (hyphen) + */ +export function isSafeUnquoted(value: string, delimiter: string = COMMA): boolean { + if (!value) { + return false + } + + if (value !== value.trim()) { + return false + } + + // Check if it looks like any literal value (boolean, null, or numeric) + if (isBooleanOrNullLiteral(value) || isNumericLike(value)) { + return false + } + + // Check for colon (always structural) + if (value.includes(':')) { + return false + } + + // Check for quotes and backslash (always need escaping) + if (value.includes('"') || value.includes('\\')) { + return false + } + + // Check for brackets and braces (always structural) + if (/[[\]{}]/.test(value)) { + return false + } + + // Check for control characters (newline, carriage return, tab - always need quoting/escaping) + if (/[\n\r\t]/.test(value)) { + return false + } + + // Check for the active delimiter + if (value.includes(delimiter)) { + return false + } + + // Check for hyphen at start (list marker) + if (value.startsWith(LIST_ITEM_MARKER)) { + return false + } + + return true +} + +/** + * Checks if a string looks like a number. + * + * @remarks + * Match numbers like `42`, `-3.14`, `1e-6`, `05`, etc. + */ +function isNumericLike(value: string): boolean { + return /^-?\d+(?:\.\d+)?(?:e[+-]?\d+)?$/i.test(value) || /^0\d+$/.test(value) +} diff --git a/test/decode.test.ts b/test/decode.test.ts index 005ee02..67a958c 100644 --- a/test/decode.test.ts +++ b/test/decode.test.ts @@ -32,6 +32,14 @@ describe('primitives', () => { expect(decode('null')).toBe(null) }) + it('treats unquoted invalid numeric formats as strings', () => { + expect(decode('05')).toBe('05') + expect(decode('007')).toBe('007') + expect(decode('0123')).toBe('0123') + expect(decode('a: 05')).toEqual({ a: '05' }) + expect(decode('nums[3]: 05,007,0123')).toEqual({ nums: ['05', '007', '0123'] }) + }) + it('respects ambiguity quoting (quoted primitives remain strings)', () => { expect(decode('"true"')).toBe('true') expect(decode('"false"')).toBe('false')