Skip to content

Commit 217eda3

Browse files
authored
Merge pull request #2252 from asger-semmle/regexp
JS: Parse regular expressions from string literals
2 parents 73d9cc2 + 7a489af commit 217eda3

File tree

101 files changed

+8900
-5309
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

101 files changed

+8900
-5309
lines changed

change-notes/1.23/analysis-javascript.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,11 @@
5353
## Changes to libraries
5454

5555
* `Expr.getDocumentation()` now handles chain assignments.
56+
* String literals are now parsed as regular expressions.
57+
Consequently, a `RegExpTerm` may occur as part of a string literal or
58+
as a regular expression literal. Queries that search for regular expressions may need to
59+
use `RegExpTerm.isPartOfRegExpLiteral` or `RegExpTerm.isUsedAsRegExp` to restrict the search.
60+
A regular expression AST can be obtained from a string literal using `StringLiteral.asRegExp`.
5661

5762
## Removal of deprecated queries
5863

javascript/extractor/src/com/semmle/js/ast/Literal.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,11 @@ public boolean isRegExp() {
5959
return tokenType == TokenType.regexp;
6060
}
6161

62+
/** Is this a string literal? */
63+
public boolean isStringLiteral() {
64+
return tokenType == TokenType.string;
65+
}
66+
6267
/** The value of this literal expressed as a string. */
6368
public String getStringValue() {
6469
// regular expressions may have a null value; use the raw value instead

javascript/extractor/src/com/semmle/js/extractor/ASTExtractor.java

Lines changed: 88 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -516,10 +516,97 @@ public Label visit(Literal nd, Context c) {
516516
String valueString = nd.getStringValue();
517517

518518
trapwriter.addTuple("literals", valueString, source, key);
519-
if (nd.isRegExp()) regexpExtractor.extract(source.substring(1, source.lastIndexOf('/')), nd);
519+
if (nd.isRegExp()) {
520+
OffsetTranslation offsets = new OffsetTranslation();
521+
offsets.set(0, 1); // skip the initial '/'
522+
regexpExtractor.extract(source.substring(1, source.lastIndexOf('/')), offsets, nd, false);
523+
} else if (nd.isStringLiteral() && !c.isInsideType()) {
524+
regexpExtractor.extract(valueString, makeStringLiteralOffsets(nd.getRaw()), nd, true);
525+
}
520526
return key;
521527
}
522528

529+
private boolean isOctalDigit(char ch) {
530+
return '0' <= ch && ch <= '7';
531+
}
532+
533+
/**
534+
* Builds a translation from offsets in a string value back to its original raw literal text
535+
* (including quotes).
536+
*
537+
* <p>This is not a 1:1 mapping since escape sequences take up more characters in the raw
538+
* literal than in the resulting string value. This mapping includes the surrounding quotes.
539+
*
540+
* <p>For example: for the raw literal value <code>'x\.y'</code> (quotes included), the <code>y
541+
* </code> at index 2 in <code>x.y</code> maps to index 4 in the raw literal.
542+
*/
543+
public OffsetTranslation makeStringLiteralOffsets(String rawLiteral) {
544+
OffsetTranslation offsets = new OffsetTranslation();
545+
offsets.set(0, 1); // Skip the initial quote
546+
// Invariant: raw character at 'pos' corresponds to decoded character at 'pos - delta'
547+
int pos = 1;
548+
int delta = 1;
549+
while (pos < rawLiteral.length() - 1) {
550+
if (rawLiteral.charAt(pos) != '\\') {
551+
++pos;
552+
continue;
553+
}
554+
final int length; // Length of the escape sequence, including slash.
555+
int outputLength = 1; // Number characters the sequence expands to.
556+
char ch = rawLiteral.charAt(pos + 1);
557+
if ('0' <= ch && ch <= '7') {
558+
// Octal escape: \N, \NN, or \NNN
559+
int firstDigit = pos + 1;
560+
int end = firstDigit;
561+
int maxEnd = Math.min(firstDigit + (ch <= '3' ? 3 : 2), rawLiteral.length());
562+
while (end < maxEnd && isOctalDigit(rawLiteral.charAt(end))) {
563+
++end;
564+
}
565+
length = end - pos;
566+
} else if (ch == 'x') {
567+
// Hex escape: \xNN
568+
length = 4;
569+
} else if (ch == 'u' && pos + 2 < rawLiteral.length()) {
570+
if (rawLiteral.charAt(pos + 2) == '{') {
571+
// Variable-length unicode escape: \U{N...}
572+
// Scan for the ending '}'
573+
int firstDigit = pos + 3;
574+
int end = firstDigit;
575+
int leadingZeros = 0;
576+
while (end < rawLiteral.length() && rawLiteral.charAt(end) == '0') {
577+
++end;
578+
++leadingZeros;
579+
}
580+
while (end < rawLiteral.length() && rawLiteral.charAt(end) != '}') {
581+
++end;
582+
}
583+
int numDigits = end - firstDigit;
584+
if (numDigits - leadingZeros > 4) {
585+
outputLength = 2; // Encoded as a surrogate pair
586+
}
587+
++end; // Include '}' character
588+
length = end - pos;
589+
} else {
590+
// Fixed-length unicode escape: \UNNNN
591+
length = 6;
592+
}
593+
} else {
594+
// Simple escape: \n or similar.
595+
length = 2;
596+
}
597+
int end = pos + length;
598+
if (end > rawLiteral.length()) {
599+
end = rawLiteral.length();
600+
}
601+
int outputPos = pos - delta;
602+
// Map the next character to the adjusted offset.
603+
offsets.set(outputPos + outputLength, end);
604+
delta += length - outputLength;
605+
pos = end;
606+
}
607+
return offsets;
608+
}
609+
523610
@Override
524611
public Label visit(MemberExpression nd, Context c) {
525612
Label key = super.visit(nd, c);

javascript/extractor/src/com/semmle/js/extractor/LocationManager.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,8 @@ public void setHasLocationTable(String hasLocation) {
6565

6666
/**
6767
* Emit location information for an AST node. The node's location is translated from the parser's
68-
* 0-based column numbering scheme into our 1-based scheme and then emitted as a snippet location.
68+
* 0-based column numbering scheme with exclusive offsets into our 1-based scheme with inclusive
69+
* end-offsets and then emitted as a snippet location.
6970
*/
7071
public void emitNodeLocation(SourceElement nd, Label lbl) {
7172
int sl = nd.getLoc().getStart().getLine(),
@@ -86,7 +87,15 @@ public void emitNodeLocation(SourceElement nd, Label lbl) {
8687
emitSnippetLocation(lbl, sl, sc, el, ec);
8788
}
8889

89-
/** Emit a relative location in the current snippet. */
90+
/**
91+
* Emit a relative location in the current snippet.
92+
*
93+
* @param lbl label to associate with the location
94+
* @param sl start line (1-based)
95+
* @param sc start column (1-based, inclusive)
96+
* @param el end line (1-based)
97+
* @param ec end column (1-based, inclusive)
98+
*/
9099
public void emitSnippetLocation(Label lbl, int sl, int sc, int el, int ec) {
91100
Position start = translatePosition(new Position(sl, sc, -1));
92101
Position end = translatePosition(new Position(el, ec, -1));

javascript/extractor/src/com/semmle/js/extractor/Main.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ public class Main {
3737
* A version identifier that should be updated every time the extractor changes in such a way that
3838
* it may produce different tuples for the same file under the same {@link ExtractorConfig}.
3939
*/
40-
public static final String EXTRACTOR_VERSION = "2019-10-08";
40+
public static final String EXTRACTOR_VERSION = "2019-10-23";
4141

4242
public static final Pattern NEWLINE = Pattern.compile("\n");
4343

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
package com.semmle.js.extractor;
2+
3+
import com.semmle.util.data.IntList;
4+
5+
/**
6+
* A mapping of some source range into a set of intervals in an output source range.
7+
*
8+
* <p>The mapping is constructed by adding "anchors": input/output pairs that correspond to the
9+
* beginning of an interval, which is assumed to end at the next anchor.
10+
*/
11+
public class OffsetTranslation {
12+
private IntList anchors = IntList.create();
13+
private IntList deltas = IntList.create();
14+
15+
/** Returns the mapping of x. */
16+
public int get(int x) {
17+
int index = anchors.binarySearch(x);
18+
if (index < 0) {
19+
// The insertion point is -index - 1.
20+
// Get the index immediately before that.
21+
index = -index - 2;
22+
if (index < 0) {
23+
// If queried before the first anchor, use the first anchor anyway.
24+
index = 0;
25+
}
26+
}
27+
return x + deltas.get(index);
28+
}
29+
30+
/**
31+
* Maps the given input offset to the given output offset.
32+
*
33+
* <p>This is added as an anchor. Any offset is mapped based on its closest preceding anchor.
34+
*/
35+
public void set(int from, int to) {
36+
anchors.add(from);
37+
deltas.add(to - from);
38+
}
39+
}

javascript/extractor/src/com/semmle/js/extractor/RegExpExtractor.java

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ public class RegExpExtractor {
5151
private final LocationManager locationManager;
5252
private final RegExpParser parser = new RegExpParser();
5353
private Position literalStart;
54+
private OffsetTranslation offsets;
5455

5556
public RegExpExtractor(TrapWriter trapwriter, LocationManager locationManager) {
5657
this.trapwriter = trapwriter;
@@ -119,10 +120,14 @@ private Label extractTerm(RegExpTerm term, Label parent, int idx) {
119120
}
120121

121122
public void emitLocation(SourceElement term, Label lbl) {
123+
int col = literalStart.getColumn();
122124
int sl, sc, el, ec;
123125
sl = el = literalStart.getLine();
124-
sc = literalStart.getColumn() + 2 + term.getLoc().getStart().getColumn();
125-
ec = literalStart.getColumn() + 1 + term.getLoc().getEnd().getColumn();
126+
sc = col + offsets.get(term.getLoc().getStart().getColumn());
127+
ec = col + offsets.get(term.getLoc().getEnd().getColumn());
128+
sc += 1; // convert to 1-based
129+
ec += 1; // convert to 1-based
130+
ec -= 1; // convert to inclusive
126131
locationManager.emitSnippetLocation(lbl, sl, sc, el, ec);
127132
}
128133

@@ -341,9 +346,16 @@ public void visit(CharacterClassRange nd) {
341346
}
342347
}
343348

344-
public void extract(String src, Node parent) {
345-
this.literalStart = parent.getLoc().getStart();
349+
public void extract(
350+
String src, OffsetTranslation offsets, Node parent, boolean isSpeculativeParsing) {
346351
Result res = parser.parse(src);
352+
353+
if (isSpeculativeParsing && res.getErrors().size() > 0) {
354+
return;
355+
}
356+
357+
this.literalStart = parent.getLoc().getStart();
358+
this.offsets = offsets;
347359
RegExpTerm ast = res.getAST();
348360
new V().visit(ast, trapwriter.localID(parent), 0);
349361

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
package com.semmle.js.extractor.test;
2+
3+
import com.semmle.js.extractor.OffsetTranslation;
4+
import org.junit.Assert;
5+
import org.junit.Test;
6+
7+
public class OffsetTranslationTest {
8+
@Test
9+
public void testBasic() {
10+
OffsetTranslation table = new OffsetTranslation();
11+
table.set(0, 10);
12+
table.set(100, 250);
13+
Assert.assertEquals(10, table.get(0));
14+
Assert.assertEquals(15, table.get(5));
15+
Assert.assertEquals(85, table.get(75));
16+
Assert.assertEquals(109, table.get(99));
17+
Assert.assertEquals(250, table.get(100));
18+
Assert.assertEquals(251, table.get(101));
19+
}
20+
21+
@Test
22+
public void testLookupBefore() {
23+
OffsetTranslation table = new OffsetTranslation();
24+
table.set(0, 10);
25+
table.set(100, 250);
26+
Assert.assertEquals(9, table.get(-1));
27+
}
28+
29+
@Test
30+
public void testIdentity() {
31+
OffsetTranslation table = new OffsetTranslation();
32+
table.set(0, 0);
33+
Assert.assertEquals(0, table.get(0));
34+
Assert.assertEquals(75, table.get(75));
35+
}
36+
37+
@Test
38+
public void testDuplicateAnchor() {
39+
OffsetTranslation table = new OffsetTranslation();
40+
table.set(0, 0);
41+
table.set(10, 100);
42+
table.set(10, 100);
43+
table.set(20, 150);
44+
Assert.assertEquals(1, table.get(1));
45+
Assert.assertEquals(100, table.get(10));
46+
Assert.assertEquals(101, table.get(11));
47+
Assert.assertEquals(150, table.get(20));
48+
Assert.assertEquals(151, table.get(21));
49+
}
50+
}

javascript/extractor/src/com/semmle/js/parser/RegExpParser.java

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -314,9 +314,40 @@ private RegExpTerm parseAtom() {
314314
return this.finishTerm(new Group(loc, capture, number, name, dis));
315315
}
316316

317-
char c = this.nextChar();
318-
if ("^$\\.*+?()[]{}|".indexOf(c) != -1) this.error(Error.UNEXPECTED_CHARACTER, this.pos - 1);
319-
return this.finishTerm(new Constant(loc, String.valueOf(c)));
317+
// Parse consecutive constants into a single Constant node.
318+
// Due to speculative parsing of string literals, this part of the code is fairly hot.
319+
int startPos = this.pos;
320+
int endPos = startPos;
321+
while (endPos < src.length()) {
322+
if ("^$\\.*+?()[]{}|".indexOf(src.charAt(endPos)) != -1) break;
323+
++endPos;
324+
}
325+
if (startPos == endPos) {
326+
this.error(Error.UNEXPECTED_CHARACTER, endPos);
327+
endPos = startPos + 1; // To ensure progress, make sure we parse at least one character.
328+
}
329+
// Check if the end of the constant belongs under an upcoming quantifier.
330+
if (endPos != startPos + 1
331+
&& endPos < src.length()
332+
&& "*+?{".indexOf(src.charAt(endPos)) != -1) {
333+
if (Character.isLowSurrogate(src.charAt(endPos - 1))
334+
&& Character.isHighSurrogate(src.charAt(endPos - 2))) {
335+
// Don't split the surrogate pair.
336+
if (endPos == startPos + 2) {
337+
// The whole constant is a single wide character.
338+
} else {
339+
endPos -= 2; // Last 2 characters belong to an upcoming quantifier.
340+
}
341+
} else {
342+
endPos--; // Last character belongs to an upcoming quantifier.
343+
}
344+
}
345+
String str = src.substring(startPos, endPos);
346+
this.pos = endPos;
347+
loc.setEnd(pos());
348+
loc.setSource(str);
349+
// Do not call finishTerm as it will create another copy of 'str'.
350+
return new Constant(loc, str);
320351
}
321352

322353
private RegExpTerm parseAtomEscape(SourceLocation loc, boolean inCharClass) {
@@ -469,6 +500,11 @@ private RegExpTerm parseCharacterClassAtom() {
469500
if (this.match("b")) return this.finishTerm(new ControlEscape(loc, "\b", 8, "\\b"));
470501
return this.finishTerm(this.parseAtomEscape(loc, true));
471502
}
472-
return this.finishTerm(new Constant(loc, String.valueOf(c)));
503+
String value = String.valueOf(c);
504+
// Extract a surrogate pair as a single constant.
505+
if (Character.isHighSurrogate(c) && Character.isLowSurrogate(peekChar(true))) {
506+
value += this.nextChar();
507+
}
508+
return this.finishTerm(new Constant(loc, value));
473509
}
474510
}

0 commit comments

Comments
 (0)