11mod extractor;
22mod trap;
33
4+ #[ macro_use]
5+ extern crate lazy_static;
46extern crate num_cpus;
57
68use clap:: arg;
9+ use encoding:: { self } ;
710use rayon:: prelude:: * ;
11+ use std:: borrow:: Cow ;
812use std:: fs;
913use std:: io:: BufRead ;
1014use std:: path:: { Path , PathBuf } ;
@@ -39,6 +43,21 @@ fn num_codeql_threads() -> usize {
3943 }
4044}
4145
46+ lazy_static ! {
47+ static ref CP_NUMBER : regex:: Regex = regex:: Regex :: new( "cp([0-9]+)" ) . unwrap( ) ;
48+ }
49+
50+ fn encoding_from_name ( encoding_name : & str ) -> Option < & ( dyn encoding:: Encoding + Send + Sync ) > {
51+ match encoding:: label:: encoding_from_whatwg_label ( encoding_name) {
52+ s @ Some ( _) => s,
53+ None => CP_NUMBER . captures ( encoding_name) . and_then ( |cap| {
54+ encoding:: label:: encoding_from_windows_code_page (
55+ str:: parse ( cap. get ( 1 ) . unwrap ( ) . as_str ( ) ) . unwrap ( ) ,
56+ )
57+ } ) ,
58+ }
59+ }
60+
4261fn main ( ) -> std:: io:: Result < ( ) > {
4362 tracing_subscriber:: fmt ( )
4463 . with_target ( false )
@@ -104,6 +123,7 @@ fn main() -> std::io::Result<()> {
104123 let path = PathBuf :: from ( line) . canonicalize ( ) ?;
105124 let src_archive_file = path_for ( & src_archive_dir, & path, "" ) ;
106125 let mut source = std:: fs:: read ( & path) ?;
126+ let mut needs_conversion = false ;
107127 let code_ranges;
108128 let mut trap_writer = trap:: Writer :: new ( ) ;
109129 if path. extension ( ) . map_or ( false , |x| x == "erb" ) {
@@ -132,6 +152,43 @@ fn main() -> std::io::Result<()> {
132152 }
133153 code_ranges = ranges;
134154 } else {
155+ if let Some ( encoding_name) = scan_coding_comment ( & source) {
156+ // If the input is already UTF-8 then there is no need to recode the source
157+ // If the declared encoding is 'binary' or 'ascii-8bit' then it is not clear how
158+ // to interpret characters. In this case it is probably best to leave the input
159+ // unchanged.
160+ if !encoding_name. eq_ignore_ascii_case ( "utf-8" )
161+ && !encoding_name. eq_ignore_ascii_case ( "ascii-8bit" )
162+ && !encoding_name. eq_ignore_ascii_case ( "binary" )
163+ {
164+ if let Some ( encoding) = encoding_from_name ( & encoding_name) {
165+ needs_conversion =
166+ encoding. whatwg_name ( ) . unwrap_or_default ( ) != "utf-8" ;
167+ if needs_conversion {
168+ match encoding
169+ . decode ( & source, encoding:: types:: DecoderTrap :: Replace )
170+ {
171+ Ok ( str) => source = str. as_bytes ( ) . to_owned ( ) ,
172+ Err ( msg) => {
173+ needs_conversion = false ;
174+ tracing:: warn!(
175+ "{}: character decoding failure: {} ({})" ,
176+ & path. to_string_lossy( ) ,
177+ msg,
178+ & encoding_name
179+ ) ;
180+ }
181+ }
182+ }
183+ } else {
184+ tracing:: warn!(
185+ "{}: unknown character encoding: '{}'" ,
186+ & path. to_string_lossy( ) ,
187+ & encoding_name
188+ ) ;
189+ }
190+ }
191+ }
135192 code_ranges = vec ! [ ] ;
136193 }
137194 extractor:: extract (
@@ -144,7 +201,11 @@ fn main() -> std::io::Result<()> {
144201 & code_ranges,
145202 ) ?;
146203 std:: fs:: create_dir_all ( & src_archive_file. parent ( ) . unwrap ( ) ) ?;
147- std:: fs:: copy ( & path, & src_archive_file) ?;
204+ if needs_conversion {
205+ std:: fs:: write ( & src_archive_file, & source) ?;
206+ } else {
207+ std:: fs:: copy ( & path, & src_archive_file) ?;
208+ }
148209 write_trap ( & trap_dir, path, & trap_writer, trap_compression)
149210 } )
150211 . expect ( "failed to extract files" ) ;
@@ -255,3 +316,143 @@ fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {
255316 }
256317 result
257318}
319+
/// Returns the position of the first byte at or after `index` that is not
/// white space, where a line feed does *not* count as white space.
///
/// The bytes skipped are space, tab, vertical tab, form feed and carriage
/// return. If `index` is already at or beyond the end of `content` it is
/// returned unchanged.
fn skip_space(content: &[u8], index: usize) -> usize {
    let is_blank = |byte: &u8| matches!(*byte, b' ' | b'\t' | 0x0b | 0x0c | b'\r');
    match content.get(index..) {
        Some(rest) => index + rest.iter().take_while(|byte| is_blank(byte)).count(),
        // `index` lies past the end of the buffer: nothing to skip.
        None => index,
    }
}
333+
334+ fn scan_coding_comment ( content : & [ u8 ] ) -> std:: option:: Option < Cow < str > > {
335+ let mut index = 0 ;
336+ // skip UTF-8 BOM marker if there is one
337+ if content. len ( ) >= 3 && content[ 0 ] == 0xef && content[ 1 ] == 0xbb && content[ 2 ] == 0xbf {
338+ index += 3 ;
339+ }
340+ // skip #! line if there is one
341+ if index + 1 < content. len ( )
342+ && content[ index] as char == '#'
343+ && content[ index + 1 ] as char == '!'
344+ {
345+ index += 2 ;
346+ while index < content. len ( ) && content[ index] as char != '\n' {
347+ index += 1
348+ }
349+ index += 1
350+ }
351+ index = skip_space ( content, index) ;
352+
353+ if index >= content. len ( ) || content[ index] as char != '#' {
354+ return None ;
355+ }
356+ index += 1 ;
357+
358+ const CODING : [ char ; 12 ] = [ 'C' , 'c' , 'O' , 'o' , 'D' , 'd' , 'I' , 'i' , 'N' , 'n' , 'G' , 'g' ] ;
359+ let mut word_index = 0 ;
360+ while index < content. len ( ) && word_index < CODING . len ( ) && content[ index] as char != '\n' {
361+ if content[ index] as char == CODING [ word_index]
362+ || content[ index] as char == CODING [ word_index + 1 ]
363+ {
364+ word_index += 2
365+ } else {
366+ word_index = 0 ;
367+ }
368+ index += 1 ;
369+ }
370+ if word_index < CODING . len ( ) {
371+ return None ;
372+ }
373+ index = skip_space ( content, index) ;
374+
375+ if index < content. len ( ) && content[ index] as char != ':' && content[ index] as char != '=' {
376+ return None ;
377+ }
378+ index += 1 ;
379+ index = skip_space ( content, index) ;
380+
381+ let start = index;
382+ while index < content. len ( ) {
383+ let c = content[ index] as char ;
384+ if c == '-' || c == '_' || c. is_ascii_alphanumeric ( ) {
385+ index += 1 ;
386+ } else {
387+ break ;
388+ }
389+ }
390+ if index > start {
391+ return Some ( String :: from_utf8_lossy ( & content[ start..index] ) ) ;
392+ }
393+ None
394+ }
395+
/// Checks `scan_coding_comment` against a table of inputs and the encoding
/// name each one is expected to yield (`None` when no magic comment should be
/// recognised).
#[test]
fn test_scan_coding_comment() {
    let cases: &[(&str, Option<&str>)] = &[
        ("# encoding: utf-8", Some("utf-8")),
        ("#coding:utf-8", Some("utf-8")),
        // only the first comment line is inspected
        ("# foo\n # encoding: utf-8", None),
        // the first declaration on the line wins
        ("# encoding: latin1 encoding: utf-8", Some("latin1")),
        // the name is returned as written, whether or not it is a known encoding
        ("# encoding: nonsense", Some("nonsense")),
        ("# coding = utf-8", Some("utf-8")),
        ("# CODING = utf-8", Some("utf-8")),
        ("# CoDiNg = utf-8", Some("utf-8")),
        ("# blah blahblahcoding = utf-8", Some("utf-8")),
        // unicode BOM is ignored, with or without white space after it
        ("\u{FEFF}# encoding: utf-8", Some("utf-8")),
        ("\u{FEFF} # encoding: utf-8", Some("utf-8")),
        ("#! /usr/bin/env ruby\n # encoding: utf-8", Some("utf-8")),
        // the #! must directly follow the BOM to be treated as a #! line
        ("\u{FEFF}#! /usr/bin/env ruby\n # encoding: utf-8", Some("utf-8")),
        // A #! must be the first thing on a line, otherwise it's a normal comment
        (" #! /usr/bin/env ruby encoding = utf-8", Some("utf-8")),
        (" #! /usr/bin/env ruby \n # encoding = utf-8", None),
    ];

    for &(text, expected) in cases {
        let result = scan_coding_comment(text.as_bytes());
        assert_eq!(result.as_deref(), expected, "input: {:?}", text);
    }
}
0 commit comments