Skip to content

Commit f0608aa

Browse files
authored
Merge pull request #22 from jg-rp/unescape-strings
Unescape strings without JSON.parse
2 parents a2d342a + 715b583 commit f0608aa

File tree

5 files changed

+279
-14
lines changed

5 files changed

+279
-14
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# JSON P3 Change Log
22

3+
## Version 1.3.4
4+
5+
**Fixes**
6+
7+
- Fixed decoding of JSONPath escape sequences (those found in name selectors and string literals). Previously we relied on `JSON.parse()` to unescape strings. We now use our own `unescapeString()` function, which rejects invalid code points and invalid surrogate pairs. See [jsonpath-compliance-test-suite #87](https://github.com/jsonpath-standard/jsonpath-compliance-test-suite/pull/87).
8+
39
## Version 1.3.3
410

511
**Fixes**

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "json-p3",
3-
"version": "1.3.3",
3+
"version": "1.3.4",
44
"author": "James Prior",
55
"license": "MIT",
66
"description": "JSONPath, JSON Pointer and JSON Patch",

src/path/parse.ts

Lines changed: 210 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ export class Parser {
251251
new NameSelector(
252252
this.environment,
253253
stream.current,
254-
this.decodeString(stream.current, true),
254+
this.decodeString(stream.current),
255255
false,
256256
),
257257
);
@@ -278,7 +278,7 @@ export class Parser {
278278
new KeySelector(
279279
this.environment,
280280
stream.current,
281-
this.decodeString(stream.current, true),
281+
this.decodeString(stream.current),
282282
false,
283283
),
284284
);
@@ -529,21 +529,211 @@ export class Parser {
529529
return left;
530530
}
531531

532-
protected decodeString(token: Token, isName: boolean = false): string {
533-
try {
534-
return JSON.parse(
535-
token.kind === TokenKind.SINGLE_QUOTE_STRING
536-
? `"${token.value.replaceAll('"', '\\"').replaceAll("\\'", "'")}"`
537-
: `"${token.value}"`,
532+
/**
 * Decode the raw text of a string token into the string it represents.
 *
 * A single-quoted token value is first rewritten to double-quoted escaping
 * (`"` becomes `\"` and `\'` becomes `'`) so that `unescapeString()` only
 * has to deal with one escaping convention.
 *
 * @param token - The string token to decode; its `kind` and `value` are read.
 * @returns The unescaped string.
 * @throws JSONPathSyntaxError if `unescapeString()` rejects the value.
 */
protected decodeString(token: Token): string {
  const value =
    token.kind === TokenKind.SINGLE_QUOTE_STRING
      ? token.value.replaceAll('"', '\\"').replaceAll("\\'", "'")
      : token.value;

  return this.unescapeString(value, token);
}
540+
541+
/**
 * Replace the escape sequences in _value_ with the characters they denote.
 *
 * Recognized sequences are `\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`
 * and `\uXXXX` (including `\uXXXX\uXXXX` surrogate pairs, decoded by
 * `decodeHexChar()`). Any other character is checked with
 * `stringFromCodePoint()` and then copied through unchanged.
 *
 * @param value - The string to unescape, using double-quoted escaping.
 * @param token - The token _value_ came from, used for error reporting.
 * @returns A copy of _value_ with its escape sequences decoded.
 * @throws JSONPathSyntaxError if an escape sequence is unknown, incomplete
 *   or malformed, or if a character is rejected by `stringFromCodePoint()`.
 */
protected unescapeString(value: string, token: Token): string {
  const rv: string[] = [];
  const length = value.length;
  let index = 0;

  while (index < length) {
    const ch = value[index];

    if (ch !== "\\") {
      // Called for validation only: throws if the character is not allowed
      // to appear unescaped. The return value is deliberately ignored.
      this.stringFromCodePoint(ch.codePointAt(0), token);
      rv.push(ch);
      index += 1;
      continue;
    }

    // Handle escape sequences
    index += 1; // Move past '\'

    switch (value[index]) {
      case '"':
        rv.push('"');
        break;
      case "\\":
        rv.push("\\");
        break;
      case "/":
        rv.push("/");
        break;
      case "b":
        rv.push("\x08");
        break;
      case "f":
        rv.push("\x0C");
        break;
      case "n":
        rv.push("\n");
        break;
      case "r":
        rv.push("\r");
        break;
      case "t":
        rv.push("\t");
        break;
      case "u": {
        // `end` is the index of the last hex digit consumed; the shared
        // `index += 1` below then steps past it.
        const [codepoint, end] = this.decodeHexChar(value, index, token);
        index = end;
        rv.push(this.stringFromCodePoint(codepoint, token));
        break;
      }
      default:
        // Expected to be unreachable for unknown escapes, which the lexer
        // should already have rejected. A backslash at the very end of
        // _value_ also lands here.
        throw new JSONPathSyntaxError(
          `unknown escape sequence at index ${token.index + index - 1}`,
          token,
        );
    }

    index += 1;
  }

  return rv.join("");
}
599+
600+
/**
 * Decode a `\uXXXX` or `\uXXXX\uXXXX` escape sequence from _value_.
 *
 * A single escape must not be a low surrogate. A high surrogate must be
 * followed immediately by a second `\uXXXX` escape holding a low surrogate,
 * and the two are combined into one code point.
 *
 * @param value - A string value containing the sequence to decode.
 * @param index - The index in _value_ of the `u` that follows the backslash.
 * @param token - The token for the string value, used for error reporting.
 * @returns A tuple of the decoded code point and the index of the last hex
 *   digit consumed.
 * @throws JSONPathSyntaxError if the sequence is truncated, contains
 *   non-hex digits, or is not a valid surrogate combination.
 */
protected decodeHexChar(
  value: string,
  index: number,
  token: Token,
): [number, number] {
  const length = value.length;

  // Four hex digits must follow the 'u' at _index_.
  if (index + 4 >= length) {
    throw new JSONPathSyntaxError(
      `incomplete escape sequence at index ${token.index + index - 1}`,
      token,
    );
  }

  index += 1; // Move past 'u'
  let codepoint = this.parseHexDigits(value.slice(index, index + 4), token);

  if (isLowSurrogate(codepoint)) {
    throw new JSONPathSyntaxError(
      `unexpected low surrogate codepoint at index ${token.index + index - 2}`,
      token,
    );
  }

  if (isHighSurrogate(codepoint)) {
    // Expect a surrogate pair: `\u` at index + 4 and index + 5, followed by
    // four more hex digits ending at index + 9.
    if (
      !(
        index + 9 < length &&
        value[index + 4] === "\\" &&
        value[index + 5] === "u"
      )
    ) {
      throw new JSONPathSyntaxError(
        `incomplete escape sequence at index ${token.index + index - 2}`,
        token,
      );
    }

    const lowSurrogate = this.parseHexDigits(
      value.slice(index + 6, index + 10),
      token,
    );

    if (!isLowSurrogate(lowSurrogate)) {
      throw new JSONPathSyntaxError(
        `unexpected codepoint at index ${token.index + index + 4}`,
        token,
      );
    }

    // Combine the low ten bits of each surrogate and add the 0x10000 offset.
    codepoint =
      0x10000 + (((codepoint & 0x03ff) << 10) | (lowSurrogate & 0x03ff));

    return [codepoint, index + 9];
  }

  return [codepoint, index + 3];
}
667+
668+
/**
 * Parse a hexadecimal string as an integer.
 *
 * Only `0`-`9`, `a`-`f` and `A`-`F` are accepted. An empty string yields 0.
 *
 * @param digits - Hexadecimal digit string.
 * @param token - The token for the string value, used for error reporting.
 * @returns The number representation of _digits_.
 * @throws JSONPathSyntaxError if _digits_ contains anything other than a
 *   hexadecimal digit.
 *
 * Note that we're not using `parseInt(digits, 16)` because it accepts `+`
 * and `-` and things we don't allow.
 */
protected parseHexDigits(digits: string, token: Token): number {
  let codepoint = 0;

  // Read UTF-16 code units directly. Every accepted digit is ASCII and any
  // non-ASCII unit is rejected, so no byte encoding step is needed.
  for (let i = 0; i < digits.length; i += 1) {
    const unit = digits.charCodeAt(i);
    let nibble: number;

    if (unit >= 48 && unit <= 57) {
      nibble = unit - 48; // '0'-'9'
    } else if (unit >= 97 && unit <= 102) {
      nibble = unit - 97 + 10; // 'a'-'f'
    } else if (unit >= 65 && unit <= 70) {
      nibble = unit - 65 + 10; // 'A'-'F'
    } else {
      throw new JSONPathSyntaxError(
        "invalid \\uXXXX escape sequence",
        token,
      );
    }

    codepoint = (codepoint << 4) | nibble;
  }

  return codepoint;
}
721+
722+
/** Check the codepoint is valid and return its string representation. */
723+
protected stringFromCodePoint(
724+
codepoint: number | undefined,
725+
token: Token,
726+
): string {
727+
if (codepoint === undefined || codepoint <= 0x1f) {
728+
throw new JSONPathSyntaxError(`invalid character`, token);
729+
}
730+
731+
try {
732+
return String.fromCodePoint(codepoint);
733+
} catch {
734+
// This should not be reachable.
735+
throw new JSONPathSyntaxError("invalid escape sequence", token);
736+
}
547737
}
548738

549739
protected throwForNonComparable(expr: FilterExpression): void {
@@ -577,3 +767,11 @@ export class Parser {
577767
}
578768
}
579769
}
770+
771+
/**
 * Test whether _codepoint_ is a UTF-16 high (leading) surrogate.
 *
 * @param codepoint - The code point to test.
 * @returns `true` if _codepoint_ is in the range 0xD800 to 0xDBFF inclusive.
 */
export function isHighSurrogate(codepoint: number): boolean {
  return codepoint >= 0xd800 && codepoint <= 0xdbff;
}
774+
775+
/**
 * Test whether _codepoint_ is a UTF-16 low (trailing) surrogate.
 *
 * @param codepoint - The code point to test.
 * @returns `true` if _codepoint_ is in the range 0xDC00 to 0xDFFF inclusive.
 */
export function isLowSurrogate(codepoint: number): boolean {
  return codepoint >= 0xdc00 && codepoint <= 0xdfff;
}

tests/path/cts

tests/path/errors.test.ts

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,3 +123,64 @@ describe("filter expression EOF", () => {
123123
);
124124
});
125125
});
126+
127+
// Each case checks both the error class and the exact message raised for a
// malformed escape sequence inside a quoted name selector.
describe("escape sequence decode errors", () => {
  const env = new JSONPathEnvironment();

  test("unknown escape sequence", () => {
    const query = String.raw`$['ab\xc']`;
    // From the lexer
    expect(() => env.query(query, {})).toThrow(JSONPathSyntaxError);
    expect(() => env.query(query, {})).toThrow(
      "invalid escape ('['ab\\xc']':6)",
    );
  });

  test("incomplete \\u escape sequence at end of string", () => {
    const query = String.raw`$['abc\u263']`;
    expect(() => env.query(query, {})).toThrow(JSONPathSyntaxError);
    expect(() => env.query(query, {})).toThrow(
      "incomplete escape sequence at index 6 ('$['abc\\u2':3)",
    );
  });

  test("incomplete surrogate pair at end of string", () => {
    const query = String.raw`$['abc\uD83D\uDE0']`;
    expect(() => env.query(query, {})).toThrow(JSONPathSyntaxError);
    expect(() => env.query(query, {})).toThrow(
      "incomplete escape sequence at index 6 ('$['abc\\uD':3)",
    );
  });

  test("high high surrogate pair", () => {
    const query = String.raw`$['ab\uD800\uD800c']`;
    expect(() => env.query(query, {})).toThrow(JSONPathSyntaxError);
    expect(() => env.query(query, {})).toThrow(
      "unexpected codepoint at index 11 ('$['ab\\uD8':3)",
    );
  });

  test("high surrogate followed by non-surrogate", () => {
    const query = String.raw`$['ab\uD800\u263Ac']`;
    expect(() => env.query(query, {})).toThrow(JSONPathSyntaxError);
    expect(() => env.query(query, {})).toThrow(
      "unexpected codepoint at index 11 ('$['ab\\uD8':3)",
    );
  });

  test("just a low surrogate", () => {
    const query = String.raw`$['ab\uDC00c']`;
    expect(() => env.query(query, {})).toThrow(JSONPathSyntaxError);
    expect(() => env.query(query, {})).toThrow(
      "unexpected low surrogate codepoint at index 5 ('$['ab\\uDC':3)",
    );
  });

  test("non-hex digits", () => {
    const query = String.raw`$['ab\u263Xc']`;
    expect(() => env.query(query, {})).toThrow(JSONPathSyntaxError);
    expect(() => env.query(query, {})).toThrow(
      "invalid \\uXXXX escape sequence ('$['ab\\u26':3)",
    );
  });
});

0 commit comments

Comments
 (0)