github
diff --git a/‎change-notes/1.23/analysis-javascript.md‎
Lines changed: 5 additions & 0 deletions b/‎change-notes/1.23/analysis-javascript.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎javascript/extractor/src/com/semmle/js/ast/Literal.java‎
Lines changed: 5 additions & 0 deletions b/‎javascript/extractor/src/com/semmle/js/ast/Literal.java‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎javascript/extractor/src/com/semmle/js/extractor/ASTExtractor.java‎
Lines changed: 88 additions & 1 deletion b/‎javascript/extractor/src/com/semmle/js/extractor/ASTExtractor.java‎
Lines changed: 88 additions & 1 deletion
diff --git a/‎javascript/extractor/src/com/semmle/js/extractor/LocationManager.java‎
Lines changed: 11 additions & 2 deletions b/‎javascript/extractor/src/com/semmle/js/extractor/LocationManager.java‎
Lines changed: 11 additions & 2 deletions
diff --git a/‎javascript/extractor/src/com/semmle/js/extractor/Main.java‎
Lines changed: 1 addition & 1 deletion b/‎javascript/extractor/src/com/semmle/js/extractor/Main.java‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎javascript/extractor/src/com/semmle/js/extractor/OffsetTranslation.java‎
Lines changed: 39 additions & 0 deletions b/‎javascript/extractor/src/com/semmle/js/extractor/OffsetTranslation.java‎
Lines changed: 39 additions & 0 deletions
diff --git a/‎javascript/extractor/src/com/semmle/js/extractor/RegExpExtractor.java‎
Lines changed: 16 additions & 4 deletions b/‎javascript/extractor/src/com/semmle/js/extractor/RegExpExtractor.java‎
Lines changed: 16 additions & 4 deletions
diff --git a/‎javascript/extractor/src/com/semmle/js/extractor/test/OffsetTranslationTest.java‎
Lines changed: 50 additions & 0 deletions b/‎javascript/extractor/src/com/semmle/js/extractor/test/OffsetTranslationTest.java‎
Lines changed: 50 additions & 0 deletions
diff --git a/‎javascript/extractor/src/com/semmle/js/parser/RegExpParser.java‎
Lines changed: 40 additions & 4 deletions b/‎javascript/extractor/src/com/semmle/js/parser/RegExpParser.java‎
Lines changed: 40 additions & 4 deletions
@@ -53,6 +53,11 @@
 ## Changes to libraries
 
 * `Expr.getDocumentation()` now handles chain assignments.
+* String literals are now parsed as regular expressions.
+  Consequently, a `RegExpTerm` may occur as part of a string literal or
+  as a regular expression literal. Queries that search for regular expressions may need to
+  use `RegExpTerm.isPartOfRegExpLiteral` or `RegExpTerm.isUsedAsRegExp` to restrict the search.
+  A regular expression AST can be obtained from a string literal using `StringLiteral.asRegExp`.
 
 ## Removal of deprecated queries
 
 
@@ -59,6 +59,11 @@ public boolean isRegExp() {
     return tokenType == TokenType.regexp;
   }
 
+  /**
+   * Is this a string literal?
+   *
+   * @return {@code true} if the token type of this literal is {@code TokenType.string}
+   */
+  public boolean isStringLiteral() {
+    return tokenType == TokenType.string;
+  }
+
   /** The value of this literal expressed as a string. */
   public String getStringValue() {
     // regular expressions may have a null value; use the raw value instead
 
@@ -516,10 +516,97 @@ public Label visit(Literal nd, Context c) {
       String valueString = nd.getStringValue();
 
       trapwriter.addTuple("literals", valueString, source, key);
-      if (nd.isRegExp()) regexpExtractor.extract(source.substring(1, source.lastIndexOf('/')), nd);
+      if (nd.isRegExp()) {
+        OffsetTranslation offsets = new OffsetTranslation();
+        offsets.set(0, 1); // skip the initial '/'
+        regexpExtractor.extract(source.substring(1, source.lastIndexOf('/')), offsets, nd, false);
+      } else if (nd.isStringLiteral() && !c.isInsideType()) {
+        regexpExtractor.extract(valueString, makeStringLiteralOffsets(nd.getRaw()), nd, true);
+      }
       return key;
     }
 
+    /** Checks whether {@code ch} is one of the octal digits {@code '0'} to {@code '7'}. */
+    private boolean isOctalDigit(char ch) {
+      return ch >= '0' && ch <= '7';
+    }
+
+    /**
+     * Builds a translation from offsets in a string value back to its original raw literal text
+     * (including quotes).
+     *
+     * <p>This is not a 1:1 mapping since escape sequences take up more characters in the raw
+     * literal than in the resulting string value. This mapping includes the surrounding quotes.
+     *
+     * <p>For example: for the raw literal value <code>'x\.y'</code> (quotes included), the <code>y
+     * </code> at index 2 in <code>x.y</code> maps to index 4 in the raw literal.
+     *
+     * <p>One anchor is recorded after every escape sequence, so that the decoded character that
+     * follows the escape maps to the raw offset just past it.
+     *
+     * <p>NOTE(review): every escape is counted as producing at least one decoded character. A
+     * backslash directly followed by a line terminator (a line continuation) presumably decodes
+     * to nothing, which would shift later offsets by one; confirm whether that case matters here.
+     *
+     * @param rawLiteral the literal text, including its surrounding quotes
+     * @return a translation from offsets in the decoded value to offsets in {@code rawLiteral}
+     */
+    public OffsetTranslation makeStringLiteralOffsets(String rawLiteral) {
+      OffsetTranslation offsets = new OffsetTranslation();
+      offsets.set(0, 1); // Skip the initial quote
+      // Invariant: raw character at 'pos' corresponds to decoded character at 'pos - delta'
+      // Scanning starts right after the opening quote, hence both start at 1.
+      int pos = 1;
+      int delta = 1;
+      // The last raw character (the closing quote) is never examined as the start of an escape.
+      while (pos < rawLiteral.length() - 1) {
+        if (rawLiteral.charAt(pos) != '\\') {
+          // Ordinary character: one raw character per decoded character, no anchor needed.
+          ++pos;
+          continue;
+        }
+        final int length; // Length of the escape sequence, including slash.
+        int outputLength = 1; // Number of characters the sequence expands to.
+        char ch = rawLiteral.charAt(pos + 1);
+        if ('0' <= ch && ch <= '7') {
+          // Octal escape: \N, \NN, or \NNN
+          // A leading digit of 0-3 allows up to three digits in total, 4-7 only up to two.
+          int firstDigit = pos + 1;
+          int end = firstDigit;
+          int maxEnd = Math.min(firstDigit + (ch <= '3' ? 3 : 2), rawLiteral.length());
+          while (end < maxEnd && isOctalDigit(rawLiteral.charAt(end))) {
+            ++end;
+          }
+          length = end - pos;
+        } else if (ch == 'x') {
+          // Hex escape: \xNN
+          length = 4;
+        } else if (ch == 'u' && pos + 2 < rawLiteral.length()) {
+          // The comments below spell the escape with a capital U on purpose: a backslash followed
+          // by a lowercase 'u' is processed as a Unicode escape by the Java compiler even inside
+          // a comment, and would not compile unless followed by four hex digits.
+          if (rawLiteral.charAt(pos + 2) == '{') {
+            // Variable-length unicode escape: \U{N...}
+            // Scan for the ending '}'
+            int firstDigit = pos + 3;
+            int end = firstDigit;
+            int leadingZeros = 0;
+            // Leading zeros are counted separately since they do not affect the code point.
+            while (end < rawLiteral.length() && rawLiteral.charAt(end) == '0') {
+              ++end;
+              ++leadingZeros;
+            }
+            while (end < rawLiteral.length() && rawLiteral.charAt(end) != '}') {
+              ++end;
+            }
+            // 'numDigits' includes the leading zeros; more than four significant hex digits
+            // means a code point above 0xFFFF.
+            int numDigits = end - firstDigit;
+            if (numDigits - leadingZeros > 4) {
+              outputLength = 2; // Encoded as a surrogate pair
+            }
+            ++end; // Include '}' character
+            length = end - pos;
+          } else {
+            // Fixed-length unicode escape: \UNNNN
+            length = 6;
+          }
+        } else {
+          // Simple escape: \n or similar.
+          length = 2;
+        }
+        // Clamp to the literal's length in case the escape sequence is cut short.
+        int end = pos + length;
+        if (end > rawLiteral.length()) {
+          end = rawLiteral.length();
+        }
+        int outputPos = pos - delta;
+        // Map the next character to the adjusted offset.
+        offsets.set(outputPos + outputLength, end);
+        // The escape used 'length' raw characters for 'outputLength' decoded ones.
+        delta += length - outputLength;
+        pos = end;
+      }
+      return offsets;
+    }
+
     @Override
     public Label visit(MemberExpression nd, Context c) {
       Label key = super.visit(nd, c);
 
@@ -65,7 +65,8 @@ public void setHasLocationTable(String hasLocation) {
 
   /**
    * Emit location information for an AST node. The node's location is translated from the parser's
-   * 0-based column numbering scheme into our 1-based scheme and then emitted as a snippet location.
+   * 0-based column numbering scheme with exclusive end-offsets into our 1-based scheme with
+   * inclusive end-offsets and then emitted as a snippet location.
    */
   public void emitNodeLocation(SourceElement nd, Label lbl) {
     int sl = nd.getLoc().getStart().getLine(),
@@ -86,7 +87,15 @@ public void emitNodeLocation(SourceElement nd, Label lbl) {
     emitSnippetLocation(lbl, sl, sc, el, ec);
   }
 
-  /** Emit a relative location in the current snippet. */
+  /**
+   * Emit a relative location in the current snippet.
+   *
+   * @param lbl label to associate with the location
+   * @param sl start line (1-based)
+   * @param sc start column (1-based, inclusive)
+   * @param el end line (1-based)
+   * @param ec end column (1-based, inclusive)
+   */
   public void emitSnippetLocation(Label lbl, int sl, int sc, int el, int ec) {
     Position start = translatePosition(new Position(sl, sc, -1));
     Position end = translatePosition(new Position(el, ec, -1));
 
@@ -37,7 +37,7 @@ public class Main {
    * A version identifier that should be updated every time the extractor changes in such a way that
    * it may produce different tuples for the same file under the same {@link ExtractorConfig}.
    */
-  public static final String EXTRACTOR_VERSION = "2019-10-08";
+  public static final String EXTRACTOR_VERSION = "2019-10-23";
 
   public static final Pattern NEWLINE = Pattern.compile("\n");
 
 
@@ -0,0 +1,39 @@
+package com.semmle.js.extractor;
+
+import com.semmle.util.data.IntList;
+
+/**
+ * A mapping of some source range into a set of intervals in an output source range.
+ *
+ * <p>The mapping is constructed by adding "anchors": input/output pairs that correspond to the
+ * beginning of an interval, which is assumed to end at the next anchor.
+ *
+ * <p>Anchors must be added in non-decreasing order of their input offset, since lookups use a
+ * binary search over the anchors. A translation without any anchors is the identity mapping.
+ */
+public class OffsetTranslation {
+  private final IntList anchors = IntList.create();
+  private final IntList deltas = IntList.create();
+
+  /** Number of anchors added so far. */
+  private int size = 0;
+
+  /**
+   * Returns the mapping of x.
+   *
+   * <p>The offset is shifted by the delta of the closest anchor at or before {@code x}. If {@code
+   * x} lies before the first anchor, the first anchor is used. If no anchor has been added,
+   * {@code x} is returned unchanged.
+   *
+   * @param x an offset in the input range
+   * @return the corresponding offset in the output range
+   */
+  public int get(int x) {
+    if (size == 0) {
+      // Nothing to translate by; avoid reading from the empty delta list.
+      return x;
+    }
+    int index = anchors.binarySearch(x);
+    if (index < 0) {
+      // The insertion point is -index - 1.
+      // Get the index immediately before that, but if queried before the first anchor,
+      // use the first anchor anyway.
+      index = Math.max(-index - 2, 0);
+    }
+    return x + deltas.get(index);
+  }
+
+  /**
+   * Maps the given input offset to the given output offset.
+   *
+   * <p>This is added as an anchor. Any offset is mapped based on its closest preceding anchor.
+   *
+   * @param from the input offset at which the new interval begins
+   * @param to the output offset that {@code from} maps to
+   * @throws IllegalArgumentException if {@code from} is smaller than the previously added anchor
+   */
+  public void set(int from, int to) {
+    if (size > 0 && from < anchors.get(size - 1)) {
+      // An out-of-order anchor would silently corrupt the binary search in get().
+      throw new IllegalArgumentException(
+          "Anchors must be added in non-decreasing order, but "
+              + from
+              + " was added after "
+              + anchors.get(size - 1));
+    }
+    anchors.add(from);
+    deltas.add(to - from);
+    ++size;
+  }
+}
@@ -51,6 +51,7 @@ public class RegExpExtractor {
   private final LocationManager locationManager;
   private final RegExpParser parser = new RegExpParser();
   private Position literalStart;
+  private OffsetTranslation offsets;
 
   public RegExpExtractor(TrapWriter trapwriter, LocationManager locationManager) {
     this.trapwriter = trapwriter;
@@ -119,10 +120,14 @@ private Label extractTerm(RegExpTerm term, Label parent, int idx) {
   }
 
+  /**
+   * Emits a snippet location for {@code term} and associates it with {@code lbl}.
+   *
+   * <p>The start and end columns of the term are passed through {@code offsets} and then added
+   * to the column at which the enclosing literal starts. Both ends are reported on the line on
+   * which the literal starts.
+   *
+   * @param term the element whose location is emitted
+   * @param lbl the label to associate with the location
+   */
   public void emitLocation(SourceElement term, Label lbl) {
+    int col = literalStart.getColumn();
     int sl, sc, el, ec;
+    // NOTE(review): start and end line are both the literal's start line, so a term in a literal
+    // that spans several lines would presumably get a wrong position; confirm.
     sl = el = literalStart.getLine();
-    sc = literalStart.getColumn() + 2 + term.getLoc().getStart().getColumn();
-    ec = literalStart.getColumn() + 1 + term.getLoc().getEnd().getColumn();
+    // Translate the term's columns with 'offsets', then shift by the literal's start column.
+    sc = col + offsets.get(term.getLoc().getStart().getColumn());
+    ec = col + offsets.get(term.getLoc().getEnd().getColumn());
+    sc += 1; // convert to 1-based
+    ec += 1; // convert to 1-based
+    ec -= 1; // convert to inclusive
     locationManager.emitSnippetLocation(lbl, sl, sc, el, ec);
   }
 
@@ -341,9 +346,16 @@ public void visit(CharacterClassRange nd) {
     }
   }
 
-  public void extract(String src, Node parent) {
-    this.literalStart = parent.getLoc().getStart();
+  public void extract(
+      String src, OffsetTranslation offsets, Node parent, boolean isSpeculativeParsing) {
     Result res = parser.parse(src);
+
+    if (isSpeculativeParsing && res.getErrors().size() > 0) {
+      return;
+    }
+
+    this.literalStart = parent.getLoc().getStart();
+    this.offsets = offsets;
     RegExpTerm ast = res.getAST();
     new V().visit(ast, trapwriter.localID(parent), 0);
 
 
@@ -0,0 +1,50 @@
+package com.semmle.js.extractor.test;
+
+import com.semmle.js.extractor.OffsetTranslation;
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit tests for the anchor-based lookups of {@link OffsetTranslation}. */
+public class OffsetTranslationTest {
+  /**
+   * Builds a translation from a flat list of {@code from, to} pairs, adding one anchor per pair
+   * in the order given.
+   */
+  private static OffsetTranslation translationOf(int... pairs) {
+    OffsetTranslation translation = new OffsetTranslation();
+    for (int i = 0; i < pairs.length; i += 2) {
+      translation.set(pairs[i], pairs[i + 1]);
+    }
+    return translation;
+  }
+
+  /** Offsets are shifted by the delta of the closest anchor at or before them. */
+  @Test
+  public void testBasic() {
+    OffsetTranslation translation = translationOf(0, 10, 100, 250);
+    Assert.assertEquals(10, translation.get(0));
+    Assert.assertEquals(15, translation.get(5));
+    Assert.assertEquals(85, translation.get(75));
+    Assert.assertEquals(109, translation.get(99));
+    Assert.assertEquals(250, translation.get(100));
+    Assert.assertEquals(251, translation.get(101));
+  }
+
+  /** An offset before the first anchor is translated using the first anchor. */
+  @Test
+  public void testLookupBefore() {
+    OffsetTranslation translation = translationOf(0, 10, 100, 250);
+    Assert.assertEquals(9, translation.get(-1));
+  }
+
+  /** A single anchor that maps zero to zero leaves every offset unchanged. */
+  @Test
+  public void testIdentity() {
+    OffsetTranslation translation = translationOf(0, 0);
+    Assert.assertEquals(0, translation.get(0));
+    Assert.assertEquals(75, translation.get(75));
+  }
+
+  /** Adding the same anchor twice does not disturb lookups around it. */
+  @Test
+  public void testDuplicateAnchor() {
+    OffsetTranslation translation = translationOf(0, 0, 10, 100, 10, 100, 20, 150);
+    Assert.assertEquals(1, translation.get(1));
+    Assert.assertEquals(100, translation.get(10));
+    Assert.assertEquals(101, translation.get(11));
+    Assert.assertEquals(150, translation.get(20));
+    Assert.assertEquals(151, translation.get(21));
+  }
+}
@@ -314,9 +314,40 @@ private RegExpTerm parseAtom() {
       return this.finishTerm(new Group(loc, capture, number, name, dis));
     }
 
-    char c = this.nextChar();
-    if ("^$\\.*+?()[]{}|".indexOf(c) != -1) this.error(Error.UNEXPECTED_CHARACTER, this.pos - 1);
-    return this.finishTerm(new Constant(loc, String.valueOf(c)));
+    // Parse consecutive constants into a single Constant node.
+    // Due to speculative parsing of string literals, this part of the code is fairly hot.
+    int startPos = this.pos;
+    int endPos = startPos;
+    while (endPos < src.length()) {
+      if ("^$\\.*+?()[]{}|".indexOf(src.charAt(endPos)) != -1) break;
+      ++endPos;
+    }
+    if (startPos == endPos) {
+      this.error(Error.UNEXPECTED_CHARACTER, endPos);
+      endPos = startPos + 1; // To ensure progress, make sure we parse at least one character.
+    }
+    // Check if the end of the constant belongs under an upcoming quantifier.
+    if (endPos != startPos + 1
+        && endPos < src.length()
+        && "*+?{".indexOf(src.charAt(endPos)) != -1) {
+      if (Character.isLowSurrogate(src.charAt(endPos - 1))
+          && Character.isHighSurrogate(src.charAt(endPos - 2))) {
+        // Don't split the surrogate pair.
+        if (endPos == startPos + 2) {
+          // The whole constant is a single wide character.
+        } else {
+          endPos -= 2; // Last 2 characters belong to an upcoming quantifier.
+        }
+      } else {
+        endPos--; // Last character belongs to an upcoming quantifier.
+      }
+    }
+    String str = src.substring(startPos, endPos);
+    this.pos = endPos;
+    loc.setEnd(pos());
+    loc.setSource(str);
+    // Do not call finishTerm as it will create another copy of 'str'.
+    return new Constant(loc, str);
   }
 
   private RegExpTerm parseAtomEscape(SourceLocation loc, boolean inCharClass) {
@@ -469,6 +500,11 @@ private RegExpTerm parseCharacterClassAtom() {
       if (this.match("b")) return this.finishTerm(new ControlEscape(loc, "\b", 8, "\\b"));
       return this.finishTerm(this.parseAtomEscape(loc, true));
     }
-    return this.finishTerm(new Constant(loc, String.valueOf(c)));
+    String value = String.valueOf(c);
+    // Extract a surrogate pair as a single constant.
+    if (Character.isHighSurrogate(c) && Character.isLowSurrogate(peekChar(true))) {
+      value += this.nextChar();
+    }
+    return this.finishTerm(new Constant(loc, value));
   }
 }