Merge pull request #4067 from erik-krogh/noBin

codeql-ci · web-flow · commit 29183fa0a1d0 · 2020-08-20T23:07:02.000+01:00
Approved by esbena
diff --git a/javascript/extractor/src/com/semmle/js/extractor/FileExtractor.java b/javascript/extractor/src/com/semmle/js/extractor/FileExtractor.java
@@ -41,8 +41,66 @@ public class FileExtractor {
   public static final Pattern JSON_OBJECT_START =
       Pattern.compile("^(?s)\\s*\\{\\s*\"([^\"]|\\\\.)*\"\\s*:.*");
 
-  /** The charset for decoding UTF-8 strings. */
-  private static final Charset UTF8_CHARSET = Charset.forName("UTF-8");
+  /**
+   * Reports whether the given bytes look like something other than printable UTF-8 text.
+   *
+   * <p>Scanning starts after any BOM recognised by {@code skipBOM} and covers the bytes up to
+   * index {@code length}. The result is true when an unprintable ASCII control character or a
+   * malformed multi-byte sequence is found. A sequence that is merely cut off by the end of the
+   * scanned range is not treated as malformed.
+   */
+  private static boolean hasUnprintableUtf8(byte[] bytes, int length) {
+    int pos = skipBOM(bytes, length);
+    while (pos < length) {
+      // Mask to 0..255 so the range comparisons below see an unsigned value.
+      int lead = bytes[pos++] & 0xff;
+      if (lead < 0x80) {
+        // 0xxxxxxx is ASCII. Codes 0-31 are unprintable except for the whitespace
+        // characters 9-13; 127 is the unprintable DEL character.
+        boolean whitespace = 9 <= lead && lead <= 13;
+        if (lead == 127 || lead < 32 && !whitespace) {
+          return true;
+        }
+        continue;
+      }
+
+      // Lead byte of a multi-byte code point: derive the number of continuation bytes.
+      int continuation;
+      if (lead < 0xc0 || lead >= 0xf8) {
+        return true; // 10xxxxxx and 11111xxx are not valid here.
+      } else if (lead < 0xe0) {
+        continuation = 1; // 110xxxxx 10xxxxxx
+      } else if (lead < 0xf0) {
+        continuation = 2; // 1110xxxx 10xxxxxx 10xxxxxx
+      } else {
+        continuation = 3; // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      }
+
+      // Each continuation byte must be of the form 10xxxxxx.
+      for (int k = 0; k < continuation; ++k) {
+        if (pos >= length) {
+          // The scanned range ends in the middle of a code point.
+          return false;
+        }
+        int next = bytes[pos++] & 0xff;
+        if ((next & 0xc0) != 0x80) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  /** Returns 2 if the bytes start with a two-byte FE FF or FF FE BOM, otherwise 0. */
+  private static int skipBOM(byte[] bytes, int length) {
+    if (length < 2) {
+      return 0;
+    }
+    // Only the two-byte UTF-16 byte order marks are recognised, in either byte order.
+    boolean bigEndian = bytes[0] == (byte) 0xfe && bytes[1] == (byte) 0xff;
+    boolean littleEndian = !bigEndian && bytes[0] == (byte) 0xff && bytes[1] == (byte) 0xfe;
+    return bigEndian || littleEndian ? 2 : 0;
+  }
 
   /** Information about supported file types. */
   public static enum FileType {
@@ -66,6 +124,10 @@ public IExtractor mkExtractor(ExtractorConfig config, ExtractorState state) {
 
       @Override
       protected boolean contains(File f, String lcExt, ExtractorConfig config) {
+        if (isBinaryFile(f, lcExt, config)) {
+          return false;
+        }
+
         if (super.contains(f, lcExt, config)) return true;
 
         // detect Node.js scripts that are meant to be run from
@@ -90,6 +152,32 @@ protected boolean contains(File f, String lcExt, ExtractorConfig config) {
       public String toString() {
         return "javascript";
       }
+
+      /** How many leading bytes of a file are read to decide whether it is a binary file. */
+      private static final int fileHeaderSize = 128;
+
+      /**
+       * Returns true if the first {@code fileHeaderSize} bytes of {@code f} contain invalid UTF-8
+       * or unprintable characters. The result is false when the default encoding is not UTF-8,
+       * when no bytes can be read, or when reading fails with an {@code IOException}.
+       */
+      private boolean isBinaryFile(File f, String lcExt, ExtractorConfig config) {
+        String utf8 = StandardCharsets.UTF_8.name();
+        if (!config.getDefaultEncoding().equals(utf8)) {
+          return false;
+        }
+
+        try (FileInputStream in = new FileInputStream(f)) {
+          byte[] header = new byte[fileHeaderSize];
+          int count = in.read(header);
+          // A count of -1 signals end of file before any byte was read. Otherwise the file is
+          // binary exactly when the header holds invalid or unprintable UTF-8.
+          return count != -1 && hasUnprintableUtf8(header, count);
+        } catch (IOException e) {
+          Exceptions.ignore(e, "Let extractor handle this one.");
+        }
+        return false;
+      }
     },
 
     JSON(".json") {
@@ -160,7 +248,7 @@ private boolean hasBadFileHeader(File f, String lcExt, ExtractorConfig config) {
           if (length == -1) return false;
 
           // Avoid invalid or unprintable UTF-8 files.
-          if (config.getDefaultEncoding().equals("UTF-8") && hasUnprintableUtf8(bytes, length)) {
+          if (config.getDefaultEncoding().equals(StandardCharsets.UTF_8.name()) && hasUnprintableUtf8(bytes, length)) {
             return true;
           }
 
@@ -182,17 +270,6 @@ private boolean hasBadFileHeader(File f, String lcExt, ExtractorConfig config) {
         return false;
       }
 
-      /** Returns the index after the initial BOM, if any, otherwise 0. */
-      private int skipBOM(byte[] bytes, int length) {
-        if (length >= 2
-            && (bytes[0] == (byte) 0xfe && bytes[1] == (byte) 0xff
-                || bytes[0] == (byte) 0xff && bytes[1] == (byte) 0xfe)) {
-          return 2;
-        } else {
-          return 0;
-        }
-      }
-
       private boolean isXml(byte[] bytes, int length) {
         int startIndex = skipBOM(bytes, length);
         // Check for `<` encoded in Ascii/UTF-8 or litte-endian UTF-16.
@@ -211,56 +288,6 @@ private boolean isTouchstone(byte[] bytes, int length) {
         return s.startsWith("! TOUCHSTONE file ") || s.startsWith("[Version] 2.0");
       }
 
-      /**
-       * Returns true if the byte sequence contains invalid UTF-8 or unprintable ASCII characters.
-       */
-      private boolean hasUnprintableUtf8(byte[] bytes, int length) {
-        // Constants for bytes with N high-order 1-bits.
-        // They are typed as `int` as the subsequent byte-to-int promotion would
-        // otherwise fill the high-order `int` bits with 1s.
-        final int high1 = 0b10000000;
-        final int high2 = 0b11000000;
-        final int high3 = 0b11100000;
-        final int high4 = 0b11110000;
-        final int high5 = 0b11111000;
-
-        int startIndex = skipBOM(bytes, length);
-        for (int i = startIndex; i < length; ++i) {
-          int b = bytes[i];
-          if ((b & high1) == 0) { // 0xxxxxxx is an ASCII character
-            // ASCII values 0-31 are unprintable, except 9-13 are whitespace.
-            // 127 is the unprintable DEL character.
-            if (b <= 8 || 14 <= b && b <= 31 || b == 127) {
-              return true;
-            }
-          } else {
-            // Check for malformed UTF-8 multibyte code point
-            int trailingBytes = 0;
-            if ((b & high3) == high2) {
-              trailingBytes = 1; // 110xxxxx 10xxxxxx
-            } else if ((b & high4) == high3) {
-              trailingBytes = 2; // 1110xxxx 10xxxxxx 10xxxxxx
-            } else if ((b & high5) == high4) {
-              trailingBytes = 3; // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-            } else {
-              return true; // 10xxxxxx and 11111xxx are not valid here.
-            }
-            // Trailing bytes must be of form 10xxxxxx
-            while (trailingBytes > 0) {
-              ++i;
-              --trailingBytes;
-              if (i >= length) {
-                return false;
-              }
-              if ((bytes[i] & high2) != high1) {
-                return true;
-              }
-            }
-          }
-        }
-        return false;
-      }
-
       /**
        * Returns true if the byte sequence starts with a shebang line that is not recognized as a
        * JavaScript interpreter.
@@ -288,7 +315,7 @@ private boolean hasUnrecognizedShebang(byte[] bytes, int length) {
         // Extract the shebang text
         int startOfText = startIndex + "#!".length();
         int lengthOfText = endOfLine - startOfText;
-        String text = new String(bytes, startOfText, lengthOfText, UTF8_CHARSET);
+        String text = new String(bytes, startOfText, lengthOfText, StandardCharsets.UTF_8);
         // Check if the shebang is a recognized JavaScript intepreter.
         return !NODE_INVOCATION.matcher(text).find();
       }
diff --git a/javascript/extractor/src/com/semmle/js/extractor/Main.java b/javascript/extractor/src/com/semmle/js/extractor/Main.java
@@ -43,7 +43,7 @@ public class Main {
    * A version identifier that should be updated every time the extractor changes in such a way that
    * it may produce different tuples for the same file under the same {@link ExtractorConfig}.
    */
-  public static final String EXTRACTOR_VERSION = "2020-08-19";
+  public static final String EXTRACTOR_VERSION = "2020-08-20-2";
 
   public static final Pattern NEWLINE = Pattern.compile("\n");
 
diff --git a/javascript/ql/test/library-tests/Files/binary.js b/javascript/ql/test/library-tests/Files/binary.js