Skip to content

Commit 29183fa

Browse files
authored
Merge pull request #4067 from erik-krogh/noBin
Approved by esbena
2 parents 508ade2 + cef681d commit 29183fa

File tree

3 files changed

+93
-66
lines changed

3 files changed

+93
-66
lines changed

javascript/extractor/src/com/semmle/js/extractor/FileExtractor.java

Lines changed: 92 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,66 @@ public class FileExtractor {
4141
public static final Pattern JSON_OBJECT_START =
4242
Pattern.compile("^(?s)\\s*\\{\\s*\"([^\"]|\\\\.)*\"\\s*:.*");
4343

44-
/** The charset for decoding UTF-8 strings. */
45-
private static final Charset UTF8_CHARSET = Charset.forName("UTF-8");
44+
/**
45+
* Returns true if the byte sequence contains invalid UTF-8 or unprintable ASCII characters.
46+
*/
47+
private static boolean hasUnprintableUtf8(byte[] bytes, int length) {
48+
// Constants for bytes with N high-order 1-bits.
49+
// They are typed as `int` as the subsequent byte-to-int promotion would
50+
// otherwise fill the high-order `int` bits with 1s.
51+
final int high1 = 0b10000000;
52+
final int high2 = 0b11000000;
53+
final int high3 = 0b11100000;
54+
final int high4 = 0b11110000;
55+
final int high5 = 0b11111000;
56+
57+
int startIndex = skipBOM(bytes, length);
58+
for (int i = startIndex; i < length; ++i) {
59+
int b = bytes[i];
60+
if ((b & high1) == 0) { // 0xxxxxxx is an ASCII character
61+
// ASCII values 0-31 are unprintable, except 9-13 are whitespace.
62+
// 127 is the unprintable DEL character.
63+
if (b <= 8 || 14 <= b && b <= 31 || b == 127) {
64+
return true;
65+
}
66+
} else {
67+
// Check for malformed UTF-8 multibyte code point
68+
int trailingBytes = 0;
69+
if ((b & high3) == high2) {
70+
trailingBytes = 1; // 110xxxxx 10xxxxxx
71+
} else if ((b & high4) == high3) {
72+
trailingBytes = 2; // 1110xxxx 10xxxxxx 10xxxxxx
73+
} else if ((b & high5) == high4) {
74+
trailingBytes = 3; // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
75+
} else {
76+
return true; // 10xxxxxx and 11111xxx are not valid here.
77+
}
78+
// Trailing bytes must be of form 10xxxxxx
79+
while (trailingBytes > 0) {
80+
++i;
81+
--trailingBytes;
82+
if (i >= length) {
83+
return false;
84+
}
85+
if ((bytes[i] & high2) != high1) {
86+
return true;
87+
}
88+
}
89+
}
90+
}
91+
return false;
92+
}
93+
94+
/** Returns the index after the initial BOM, if any, otherwise 0. */
95+
private static int skipBOM(byte[] bytes, int length) {
96+
if (length >= 2
97+
&& (bytes[0] == (byte) 0xfe && bytes[1] == (byte) 0xff
98+
|| bytes[0] == (byte) 0xff && bytes[1] == (byte) 0xfe)) {
99+
return 2;
100+
} else {
101+
return 0;
102+
}
103+
}
46104

47105
/** Information about supported file types. */
48106
public static enum FileType {
@@ -66,6 +124,10 @@ public IExtractor mkExtractor(ExtractorConfig config, ExtractorState state) {
66124

67125
@Override
68126
protected boolean contains(File f, String lcExt, ExtractorConfig config) {
127+
if (isBinaryFile(f, lcExt, config)) {
128+
return false;
129+
}
130+
69131
if (super.contains(f, lcExt, config)) return true;
70132

71133
// detect Node.js scripts that are meant to be run from
@@ -90,6 +152,32 @@ protected boolean contains(File f, String lcExt, ExtractorConfig config) {
90152
public String toString() {
91153
return "javascript";
92154
}
155+
156+
/** Number of bytes to read from the beginning of a ".js" file to detect if it is a binary file. */
157+
private static final int fileHeaderSize = 128;
158+
159+
/** Computes if `f` is a binary file based on whether the initial `fileHeaderSize` bytes are printable UTF-8 chars. */
160+
private boolean isBinaryFile(File f, String lcExt, ExtractorConfig config) {
161+
if (!config.getDefaultEncoding().equals(StandardCharsets.UTF_8.name())) {
162+
return false;
163+
}
164+
try (FileInputStream fis = new FileInputStream(f)) {
165+
byte[] bytes = new byte[fileHeaderSize];
166+
int length = fis.read(bytes);
167+
168+
if (length == -1) return false;
169+
170+
// Avoid invalid or unprintable UTF-8 files.
171+
if (hasUnprintableUtf8(bytes, length)) {
172+
return true;
173+
}
174+
175+
return false;
176+
} catch (IOException e) {
177+
Exceptions.ignore(e, "Let extractor handle this one.");
178+
}
179+
return false;
180+
}
93181
},
94182

95183
JSON(".json") {
@@ -160,7 +248,7 @@ private boolean hasBadFileHeader(File f, String lcExt, ExtractorConfig config) {
160248
if (length == -1) return false;
161249

162250
// Avoid invalid or unprintable UTF-8 files.
163-
if (config.getDefaultEncoding().equals("UTF-8") && hasUnprintableUtf8(bytes, length)) {
251+
if (config.getDefaultEncoding().equals(StandardCharsets.UTF_8.name()) && hasUnprintableUtf8(bytes, length)) {
164252
return true;
165253
}
166254

@@ -182,17 +270,6 @@ private boolean hasBadFileHeader(File f, String lcExt, ExtractorConfig config) {
182270
return false;
183271
}
184272

185-
/** Returns the index after the initial BOM, if any, otherwise 0. */
186-
private int skipBOM(byte[] bytes, int length) {
187-
if (length >= 2
188-
&& (bytes[0] == (byte) 0xfe && bytes[1] == (byte) 0xff
189-
|| bytes[0] == (byte) 0xff && bytes[1] == (byte) 0xfe)) {
190-
return 2;
191-
} else {
192-
return 0;
193-
}
194-
}
195-
196273
private boolean isXml(byte[] bytes, int length) {
197274
int startIndex = skipBOM(bytes, length);
198275
// Check for `<` encoded in ASCII/UTF-8 or little-endian UTF-16.
@@ -211,56 +288,6 @@ private boolean isTouchstone(byte[] bytes, int length) {
211288
return s.startsWith("! TOUCHSTONE file ") || s.startsWith("[Version] 2.0");
212289
}
213290

214-
/**
215-
* Returns true if the byte sequence contains invalid UTF-8 or unprintable ASCII characters.
216-
*/
217-
private boolean hasUnprintableUtf8(byte[] bytes, int length) {
218-
// Constants for bytes with N high-order 1-bits.
219-
// They are typed as `int` as the subsequent byte-to-int promotion would
220-
// otherwise fill the high-order `int` bits with 1s.
221-
final int high1 = 0b10000000;
222-
final int high2 = 0b11000000;
223-
final int high3 = 0b11100000;
224-
final int high4 = 0b11110000;
225-
final int high5 = 0b11111000;
226-
227-
int startIndex = skipBOM(bytes, length);
228-
for (int i = startIndex; i < length; ++i) {
229-
int b = bytes[i];
230-
if ((b & high1) == 0) { // 0xxxxxxx is an ASCII character
231-
// ASCII values 0-31 are unprintable, except 9-13 are whitespace.
232-
// 127 is the unprintable DEL character.
233-
if (b <= 8 || 14 <= b && b <= 31 || b == 127) {
234-
return true;
235-
}
236-
} else {
237-
// Check for malformed UTF-8 multibyte code point
238-
int trailingBytes = 0;
239-
if ((b & high3) == high2) {
240-
trailingBytes = 1; // 110xxxxx 10xxxxxx
241-
} else if ((b & high4) == high3) {
242-
trailingBytes = 2; // 1110xxxx 10xxxxxx 10xxxxxx
243-
} else if ((b & high5) == high4) {
244-
trailingBytes = 3; // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
245-
} else {
246-
return true; // 10xxxxxx and 11111xxx are not valid here.
247-
}
248-
// Trailing bytes must be of form 10xxxxxx
249-
while (trailingBytes > 0) {
250-
++i;
251-
--trailingBytes;
252-
if (i >= length) {
253-
return false;
254-
}
255-
if ((bytes[i] & high2) != high1) {
256-
return true;
257-
}
258-
}
259-
}
260-
}
261-
return false;
262-
}
263-
264291
/**
265292
* Returns true if the byte sequence starts with a shebang line that is not recognized as a
266293
* JavaScript interpreter.
@@ -288,7 +315,7 @@ private boolean hasUnrecognizedShebang(byte[] bytes, int length) {
288315
// Extract the shebang text
289316
int startOfText = startIndex + "#!".length();
290317
int lengthOfText = endOfLine - startOfText;
291-
String text = new String(bytes, startOfText, lengthOfText, UTF8_CHARSET);
318+
String text = new String(bytes, startOfText, lengthOfText, StandardCharsets.UTF_8);
292319
// Check if the shebang is a recognized JavaScript interpreter.
293320
return !NODE_INVOCATION.matcher(text).find();
294321
}

javascript/extractor/src/com/semmle/js/extractor/Main.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ public class Main {
4343
* A version identifier that should be updated every time the extractor changes in such a way that
4444
* it may produce different tuples for the same file under the same {@link ExtractorConfig}.
4545
*/
46-
public static final String EXTRACTOR_VERSION = "2020-08-19";
46+
public static final String EXTRACTOR_VERSION = "2020-08-20-2";
4747

4848
public static final Pattern NEWLINE = Pattern.compile("\n");
4949

80 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)