Skip to content

Commit 37aa85f

Browse files
committed
JS: Fix parsing of non-BMP chars before a quantifier
1 parent 8fcf7a2 commit 37aa85f

File tree

4 files changed

+20
-3
lines changed

4 files changed

+20
-3
lines changed

javascript/extractor/src/com/semmle/js/parser/RegExpParser.java

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,10 +326,21 @@ private RegExpTerm parseAtom() {
326326
this.error(Error.UNEXPECTED_CHARACTER, endPos);
327327
endPos = startPos + 1; // To ensure progress, make sure we parse at least one character.
328328
}
329+
// Check if the end of the constant belongs under an upcoming quantifier.
329330
if (endPos != startPos + 1
330331
&& endPos < src.length()
331332
&& "*+?{".indexOf(src.charAt(endPos)) != -1) {
332-
endPos--; // Last constant belongs under an upcoming quantifier.
333+
if (Character.isLowSurrogate(src.charAt(endPos - 1))
334+
&& Character.isHighSurrogate(src.charAt(endPos - 2))) {
335+
// Don't split the surrogate pair.
336+
if (endPos == startPos + 2) {
337+
// The whole constant is a single wide character.
338+
} else {
339+
endPos -= 2; // Last 2 characters belong to an upcoming quantifier.
340+
}
341+
} else {
342+
endPos--; // Last character belongs to an upcoming quantifier.
343+
}
333344
}
334345
String str = src.substring(startPos, endPos);
335346
this.pos = endPos;
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
| tst.js:1:1:1:9 | /[\\u12340\\udf40-\\u12345\\udf45]/ | Character class with supplementary characters in non-unicode literal. |
1+
| tst.js:1:1:1:9 | /[\\u12340\\udf40-\\u12345\\udf45]/ | Split supplementary character in non-unicode literal. |
2+
| tst.js:3:1:3:5 | /\\u12340\\udf40+/ | Split supplementary character in non-unicode literal. |

javascript/ql/test/library-tests/RegExp/MissingUnicodeFlag/MissingUnicodeFlag.ql

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,7 @@ where wideConstant.getLiteral() = literal and
88
wideConstant.getParent() instanceof RegExpCharacterClass
99
or
1010
wideConstant.getParent() instanceof RegExpCharacterRange
11+
or
12+
wideConstant.getParent() instanceof RegExpQuantifier
1113
)
12-
select literal, "Character class with supplementary characters in non-unicode literal."
14+
select literal, "Split supplementary character in non-unicode literal."
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
/[𒍀-𒍅]/; // NOT OK
22
/[𒍀-𒍅]/u; // OK
3+
/𒍀+/; // NOT OK
4+
/𒍀+/u; // OK
5+
/(𒍀)+/; // OK

0 commit comments

Comments (0)