11mod extractor;
22
3+ #[ macro_use]
4+ extern crate lazy_static;
35extern crate num_cpus;
46
57use clap:: arg;
8+ use encoding:: { self } ;
69use flate2:: write:: GzEncoder ;
710use rayon:: prelude:: * ;
11+ use std:: borrow:: Cow ;
812use std:: fs;
913use std:: io:: { BufRead , BufWriter } ;
1014use std:: path:: { Path , PathBuf } ;
@@ -75,6 +79,25 @@ fn num_codeql_threads() -> usize {
7579 }
7680}
7781
82+ lazy_static ! {
83+ static ref CP_NUMBER : regex:: Regex = regex:: Regex :: new( "cp([0-9]+)" ) . unwrap( ) ;
84+ }
85+
86+ fn encoding_from_name ( encoding_name : & str ) -> Option < & ( dyn encoding:: Encoding + Send + Sync ) > {
87+ match encoding:: label:: encoding_from_whatwg_label ( & encoding_name) {
88+ Some ( e) => return Some ( e) ,
89+ None => {
90+ if let Some ( cap) = CP_NUMBER . captures ( & encoding_name) {
91+ return encoding:: label:: encoding_from_windows_code_page (
92+ str:: parse ( cap. get ( 1 ) . unwrap ( ) . as_str ( ) ) . unwrap ( ) ,
93+ ) ;
94+ } else {
95+ return None ;
96+ }
97+ }
98+ }
99+ }
100+
78101fn main ( ) -> std:: io:: Result < ( ) > {
79102 tracing_subscriber:: fmt ( )
80103 . with_target ( false )
@@ -140,6 +163,7 @@ fn main() -> std::io::Result<()> {
140163 let path = PathBuf :: from ( line) . canonicalize ( ) ?;
141164 let src_archive_file = path_for ( & src_archive_dir, & path, "" ) ;
142165 let mut source = std:: fs:: read ( & path) ?;
166+ let mut needs_conversion = false ;
143167 let code_ranges;
144168 let mut trap_writer = extractor:: new_trap_writer ( ) ;
145169 if path. extension ( ) . map_or ( false , |x| x == "erb" ) {
@@ -168,6 +192,43 @@ fn main() -> std::io::Result<()> {
168192 }
169193 code_ranges = ranges;
170194 } else {
195+ if let Some ( encoding_name) = scan_coding_comment ( & source) {
196+ // If the input is already UTF-8 then there is no need to recode the source
197+ // If the declared encoding is 'binary' or 'ascii-8bit' then it is not clear how
198+ // to interpret characters. In this case it is probably best to leave the input
199+ // unchanged.
200+ if !encoding_name. eq_ignore_ascii_case ( "utf-8" )
201+ && !encoding_name. eq_ignore_ascii_case ( "ascii-8bit" )
202+ && !encoding_name. eq_ignore_ascii_case ( "binary" )
203+ {
204+ if let Some ( encoding) = encoding_from_name ( & encoding_name) {
205+ needs_conversion =
206+ encoding. whatwg_name ( ) . unwrap_or_default ( ) != "utf-8" ;
207+ if needs_conversion {
208+ match encoding
209+ . decode ( & source, encoding:: types:: DecoderTrap :: Replace )
210+ {
211+ Ok ( str) => source = str. as_bytes ( ) . to_owned ( ) ,
212+ Err ( msg) => {
213+ needs_conversion = false ;
214+ tracing:: warn!(
215+ "{}: character decoding failure: {} ({})" ,
216+ & path. to_string_lossy( ) ,
217+ msg,
218+ & encoding_name
219+ ) ;
220+ }
221+ }
222+ }
223+ } else {
224+ tracing:: warn!(
225+ "{}: unknown character encoding: '{}'" ,
226+ & path. to_string_lossy( ) ,
227+ & encoding_name
228+ ) ;
229+ }
230+ }
231+ }
171232 code_ranges = vec ! [ ] ;
172233 }
173234 extractor:: extract (
@@ -180,7 +241,11 @@ fn main() -> std::io::Result<()> {
180241 & code_ranges,
181242 ) ?;
182243 std:: fs:: create_dir_all ( & src_archive_file. parent ( ) . unwrap ( ) ) ?;
183- std:: fs:: copy ( & path, & src_archive_file) ?;
244+ if needs_conversion {
245+ std:: fs:: write ( & src_archive_file, & source) ?;
246+ } else {
247+ std:: fs:: copy ( & path, & src_archive_file) ?;
248+ }
184249 write_trap ( & trap_dir, path, trap_writer, & trap_compression)
185250 } )
186251 . expect ( "failed to extract files" ) ;
@@ -299,3 +364,143 @@ fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {
299364 }
300365 result
301366}
367+
/// Returns the position of the first byte at or after `index` that is not
/// horizontal white space.
///
/// White space here means space, tab, vertical tab, form feed and carriage
/// return; a line feed is deliberately *not* skipped. If `index` is at or past
/// the end of `content`, it is returned unchanged.
fn skip_space(content: &[u8], index: usize) -> usize {
    let blanks = content
        .iter()
        .skip(index)
        .take_while(|&&byte| matches!(byte, b' ' | b'\t' | 0x0b | 0x0c | b'\r'))
        .count();
    index + blanks
}
381+
382+ fn scan_coding_comment ( content : & [ u8 ] ) -> std:: option:: Option < Cow < str > > {
383+ let mut index = 0 ;
384+ // skip UTF-8 BOM marker if there is one
385+ if content. len ( ) >= 3 && content[ 0 ] == 0xef && content[ 1 ] == 0xbb && content[ 2 ] == 0xbf {
386+ index += 3 ;
387+ }
388+ // skip #! line if there is one
389+ if index + 1 < content. len ( )
390+ && content[ index] as char == '#'
391+ && content[ index + 1 ] as char == '!'
392+ {
393+ index += 2 ;
394+ while index < content. len ( ) && content[ index] as char != '\n' {
395+ index += 1
396+ }
397+ index += 1
398+ }
399+ index = skip_space ( content, index) ;
400+
401+ if index >= content. len ( ) || content[ index] as char != '#' {
402+ return None ;
403+ }
404+ index += 1 ;
405+
406+ const CODING : [ char ; 12 ] = [ 'C' , 'c' , 'O' , 'o' , 'D' , 'd' , 'I' , 'i' , 'N' , 'n' , 'G' , 'g' ] ;
407+ let mut word_index = 0 ;
408+ while index < content. len ( ) && word_index < CODING . len ( ) && content[ index] as char != '\n' {
409+ if content[ index] as char == CODING [ word_index]
410+ || content[ index] as char == CODING [ word_index + 1 ]
411+ {
412+ word_index += 2
413+ } else {
414+ word_index = 0 ;
415+ }
416+ index += 1 ;
417+ }
418+ if word_index < CODING . len ( ) {
419+ return None ;
420+ }
421+ index = skip_space ( content, index) ;
422+
423+ if index < content. len ( ) && content[ index] as char != ':' && content[ index] as char != '=' {
424+ return None ;
425+ }
426+ index += 1 ;
427+ index = skip_space ( content, index) ;
428+
429+ let start = index;
430+ while index < content. len ( ) {
431+ let c = content[ index] as char ;
432+ if c == '-' || c == '_' || c. is_ascii_alphanumeric ( ) {
433+ index += 1 ;
434+ } else {
435+ break ;
436+ }
437+ }
438+ if index > start {
439+ return Some ( String :: from_utf8_lossy ( & content[ start..index] ) ) ;
440+ }
441+ None
442+ }
443+
/// Checks `scan_coding_comment` against a table of inputs and the encoding name
/// (if any) each one is expected to declare.
#[test]
fn test_scan_coding_comment() {
    let cases: &[(&str, Option<&str>)] = &[
        ("# encoding: utf-8", Some("utf-8")),
        ("#coding:utf-8", Some("utf-8")),
        // the scan stops at the end of the first comment line
        ("# foo\n # encoding: utf-8", None),
        // the first declaration on the line wins
        ("# encoding: latin1 encoding: utf-8", Some("latin1")),
        // the name is returned as written, whether or not it is a known encoding
        ("# encoding: nonsense", Some("nonsense")),
        ("# coding = utf-8", Some("utf-8")),
        // the keyword is matched case-insensitively
        ("# CODING = utf-8", Some("utf-8")),
        ("# CoDiNg = utf-8", Some("utf-8")),
        // the keyword may be preceded by arbitrary text
        ("# blah blahblahcoding = utf-8", Some("utf-8")),
        // unicode BOM is ignored, with or without white space after it
        ("\u{FEFF}# encoding: utf-8", Some("utf-8")),
        ("\u{FEFF} # encoding: utf-8", Some("utf-8")),
        // a #! line is skipped, also when it directly follows a BOM
        ("#! /usr/bin/env ruby\n # encoding: utf-8", Some("utf-8")),
        (
            "\u{FEFF}#! /usr/bin/env ruby\n # encoding: utf-8",
            Some("utf-8"),
        ),
        // A #! must be the first thing on a line, otherwise it's a normal comment
        (" #! /usr/bin/env ruby encoding = utf-8", Some("utf-8")),
        (" #! /usr/bin/env ruby \n # encoding = utf-8", None),
    ];

    for &(text, expected) in cases {
        let result = scan_coding_comment(text.as_bytes());
        assert_eq!(result.as_deref(), expected, "input: {:?}", text);
    }
}
0 commit comments