@@ -41,8 +41,66 @@ public class FileExtractor {
4141 public static final Pattern JSON_OBJECT_START =
4242 Pattern .compile ("^(?s)\\ s*\\ {\\ s*\" ([^\" ]|\\ \\ .)*\" \\ s*:.*" );
4343
44- /** The charset for decoding UTF-8 strings. */
45- private static final Charset UTF8_CHARSET = Charset .forName ("UTF-8" );
44+ /**
45+ * Returns true if the byte sequence contains invalid UTF-8 or unprintable ASCII characters.
46+ */
47+ private static boolean hasUnprintableUtf8 (byte [] bytes , int length ) {
48+ // Constants for bytes with N high-order 1-bits.
49+ // They are typed as `int` as the subsequent byte-to-int promotion would
50+ // otherwise fill the high-order `int` bits with 1s.
51+ final int high1 = 0b10000000;
52+ final int high2 = 0b11000000;
53+ final int high3 = 0b11100000;
54+ final int high4 = 0b11110000;
55+ final int high5 = 0b11111000;
56+
57+ int startIndex = skipBOM (bytes , length );
58+ for (int i = startIndex ; i < length ; ++i ) {
59+ int b = bytes [i ];
60+ if ((b & high1 ) == 0 ) { // 0xxxxxxx is an ASCII character
61+ // ASCII values 0-31 are unprintable, except 9-13 are whitespace.
62+ // 127 is the unprintable DEL character.
63+ if (b <= 8 || 14 <= b && b <= 31 || b == 127 ) {
64+ return true ;
65+ }
66+ } else {
67+ // Check for malformed UTF-8 multibyte code point
68+ int trailingBytes = 0 ;
69+ if ((b & high3 ) == high2 ) {
70+ trailingBytes = 1 ; // 110xxxxx 10xxxxxx
71+ } else if ((b & high4 ) == high3 ) {
72+ trailingBytes = 2 ; // 1110xxxx 10xxxxxx 10xxxxxx
73+ } else if ((b & high5 ) == high4 ) {
74+ trailingBytes = 3 ; // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
75+ } else {
76+ return true ; // 10xxxxxx and 11111xxx are not valid here.
77+ }
78+ // Trailing bytes must be of form 10xxxxxx
79+ while (trailingBytes > 0 ) {
80+ ++i ;
81+ --trailingBytes ;
82+ if (i >= length ) {
83+ return false ;
84+ }
85+ if ((bytes [i ] & high2 ) != high1 ) {
86+ return true ;
87+ }
88+ }
89+ }
90+ }
91+ return false ;
92+ }
93+
94+ /** Returns the index after the initial BOM, if any, otherwise 0. */
95+ private static int skipBOM (byte [] bytes , int length ) {
96+ if (length >= 2
97+ && (bytes [0 ] == (byte ) 0xfe && bytes [1 ] == (byte ) 0xff
98+ || bytes [0 ] == (byte ) 0xff && bytes [1 ] == (byte ) 0xfe )) {
99+ return 2 ;
100+ } else {
101+ return 0 ;
102+ }
103+ }
46104
47105 /** Information about supported file types. */
48106 public static enum FileType {
@@ -66,6 +124,10 @@ public IExtractor mkExtractor(ExtractorConfig config, ExtractorState state) {
66124
67125 @ Override
68126 protected boolean contains (File f , String lcExt , ExtractorConfig config ) {
127+ if (isBinaryFile (f , lcExt , config )) {
128+ return false ;
129+ }
130+
69131 if (super .contains (f , lcExt , config )) return true ;
70132
71133 // detect Node.js scripts that are meant to be run from
@@ -90,6 +152,32 @@ protected boolean contains(File f, String lcExt, ExtractorConfig config) {
90152 public String toString () {
91153 return "javascript" ;
92154 }
155+
156+ /** Number of bytes to read from the beginning of a ".js" file to detect if it is a binary file. */
157+ private static final int fileHeaderSize = 128 ;
158+
159+ /** Computes if `f` is a binary file based on whether the initial `fileHeaderSize` bytes are printable UTF-8 chars. */
160+ private boolean isBinaryFile (File f , String lcExt , ExtractorConfig config ) {
161+ if (!config .getDefaultEncoding ().equals (StandardCharsets .UTF_8 .name ())) {
162+ return false ;
163+ }
164+ try (FileInputStream fis = new FileInputStream (f )) {
165+ byte [] bytes = new byte [fileHeaderSize ];
166+ int length = fis .read (bytes );
167+
168+ if (length == -1 ) return false ;
169+
170+ // Avoid invalid or unprintable UTF-8 files.
171+ if (hasUnprintableUtf8 (bytes , length )) {
172+ return true ;
173+ }
174+
175+ return false ;
176+ } catch (IOException e ) {
177+ Exceptions .ignore (e , "Let extractor handle this one." );
178+ }
179+ return false ;
180+ }
93181 },
94182
95183 JSON (".json" ) {
@@ -160,7 +248,7 @@ private boolean hasBadFileHeader(File f, String lcExt, ExtractorConfig config) {
160248 if (length == -1 ) return false ;
161249
162250 // Avoid invalid or unprintable UTF-8 files.
163- if (config .getDefaultEncoding ().equals ("UTF-8" ) && hasUnprintableUtf8 (bytes , length )) {
251+ if (config .getDefaultEncoding ().equals (StandardCharsets . UTF_8 . name () ) && hasUnprintableUtf8 (bytes , length )) {
164252 return true ;
165253 }
166254
@@ -182,17 +270,6 @@ private boolean hasBadFileHeader(File f, String lcExt, ExtractorConfig config) {
182270 return false ;
183271 }
184272
185- /** Returns the index after the initial BOM, if any, otherwise 0. */
186- private int skipBOM (byte [] bytes , int length ) {
187- if (length >= 2
188- && (bytes [0 ] == (byte ) 0xfe && bytes [1 ] == (byte ) 0xff
189- || bytes [0 ] == (byte ) 0xff && bytes [1 ] == (byte ) 0xfe )) {
190- return 2 ;
191- } else {
192- return 0 ;
193- }
194- }
195-
196273 private boolean isXml (byte [] bytes , int length ) {
197274 int startIndex = skipBOM (bytes , length );
198275 // Check for `<` encoded in ASCII/UTF-8 or little-endian UTF-16.
@@ -211,56 +288,6 @@ private boolean isTouchstone(byte[] bytes, int length) {
211288 return s .startsWith ("! TOUCHSTONE file " ) || s .startsWith ("[Version] 2.0" );
212289 }
213290
214- /**
215- * Returns true if the byte sequence contains invalid UTF-8 or unprintable ASCII characters.
216- */
217- private boolean hasUnprintableUtf8 (byte [] bytes , int length ) {
218- // Constants for bytes with N high-order 1-bits.
219- // They are typed as `int` as the subsequent byte-to-int promotion would
220- // otherwise fill the high-order `int` bits with 1s.
221- final int high1 = 0b10000000;
222- final int high2 = 0b11000000;
223- final int high3 = 0b11100000;
224- final int high4 = 0b11110000;
225- final int high5 = 0b11111000;
226-
227- int startIndex = skipBOM (bytes , length );
228- for (int i = startIndex ; i < length ; ++i ) {
229- int b = bytes [i ];
230- if ((b & high1 ) == 0 ) { // 0xxxxxxx is an ASCII character
231- // ASCII values 0-31 are unprintable, except 9-13 are whitespace.
232- // 127 is the unprintable DEL character.
233- if (b <= 8 || 14 <= b && b <= 31 || b == 127 ) {
234- return true ;
235- }
236- } else {
237- // Check for malformed UTF-8 multibyte code point
238- int trailingBytes = 0 ;
239- if ((b & high3 ) == high2 ) {
240- trailingBytes = 1 ; // 110xxxxx 10xxxxxx
241- } else if ((b & high4 ) == high3 ) {
242- trailingBytes = 2 ; // 1110xxxx 10xxxxxx 10xxxxxx
243- } else if ((b & high5 ) == high4 ) {
244- trailingBytes = 3 ; // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
245- } else {
246- return true ; // 10xxxxxx and 11111xxx are not valid here.
247- }
248- // Trailing bytes must be of form 10xxxxxx
249- while (trailingBytes > 0 ) {
250- ++i ;
251- --trailingBytes ;
252- if (i >= length ) {
253- return false ;
254- }
255- if ((bytes [i ] & high2 ) != high1 ) {
256- return true ;
257- }
258- }
259- }
260- }
261- return false ;
262- }
263-
264291 /**
265292 * Returns true if the byte sequence starts with a shebang line that is not recognized as a
266293 * JavaScript interpreter.
@@ -288,7 +315,7 @@ private boolean hasUnrecognizedShebang(byte[] bytes, int length) {
288315 // Extract the shebang text
289316 int startOfText = startIndex + "#!" .length ();
290317 int lengthOfText = endOfLine - startOfText ;
291- String text = new String (bytes , startOfText , lengthOfText , UTF8_CHARSET );
318+ String text = new String (bytes , startOfText , lengthOfText , StandardCharsets . UTF_8 );
292319 // Check if the shebang is a recognized JavaScript interpreter.
293320 return !NODE_INVOCATION .matcher (text ).find ();
294321 }
0 commit comments