Ruby: handle magic coding: comments

aibaars · aibaars · commit 7be106d7bba3 · 2022-07-21T16:33:18.000+02:00
diff --git a/ruby/Cargo.lock b/ruby/Cargo.lock
diff --git a/ruby/extractor/Cargo.toml b/ruby/extractor/Cargo.toml
@@ -18,3 +18,5 @@ tracing-subscriber = { version = "0.3.3", features = ["env-filter"] }
 rayon = "1.5.0"
 num_cpus = "1.13.0"
 regex = "1.5.5"
+encoding = "0.2"
+lazy_static = "1.4.0"
diff --git a/ruby/extractor/src/main.rs b/ruby/extractor/src/main.rs
@@ -1,10 +1,14 @@
 mod extractor;
 
+#[macro_use]
+extern crate lazy_static;
 extern crate num_cpus;
 
 use clap::arg;
+use encoding::{self};
 use flate2::write::GzEncoder;
 use rayon::prelude::*;
+use std::borrow::Cow;
 use std::fs;
 use std::io::{BufRead, BufWriter};
 use std::path::{Path, PathBuf};
@@ -75,6 +79,25 @@ fn num_codeql_threads() -> usize {
     }
 }
 
+lazy_static! {
+    // Matches a complete Windows code page name such as `cp932` or `CP932`; the digits are
+    // captured in group 1.
+    static ref CP_NUMBER: regex::Regex = regex::Regex::new("(?i)^cp([0-9]+)$").unwrap();
+}
+
+/// Resolves the encoding named in a magic comment.
+///
+/// The name is looked up as a WHATWG label first. If that fails and the whole name has the
+/// form `cp<digits>` (in any letter case) it is looked up as a Windows code page number.
+///
+/// Returns `None` for names that are not recognised, including code page numbers that are
+/// too large to be parsed.
+fn encoding_from_name(encoding_name: &str) -> Option<&(dyn encoding::Encoding + Send + Sync)> {
+    encoding::label::encoding_from_whatwg_label(encoding_name).or_else(|| {
+        let digits = CP_NUMBER.captures(encoding_name)?.get(1)?.as_str();
+        // The digits come from the file being analysed, so a number that overflows must be
+        // treated as an unknown encoding rather than a panic.
+        let code_page = digits.parse().ok()?;
+        encoding::label::encoding_from_windows_code_page(code_page)
+    })
+}
+
 fn main() -> std::io::Result<()> {
     tracing_subscriber::fmt()
         .with_target(false)
@@ -140,6 +163,7 @@ fn main() -> std::io::Result<()> {
             let path = PathBuf::from(line).canonicalize()?;
             let src_archive_file = path_for(&src_archive_dir, &path, "");
             let mut source = std::fs::read(&path)?;
+            let mut needs_conversion = false;
             let code_ranges;
             let mut trap_writer = extractor::new_trap_writer();
             if path.extension().map_or(false, |x| x == "erb") {
@@ -168,6 +192,43 @@ fn main() -> std::io::Result<()> {
                 }
                 code_ranges = ranges;
             } else {
+                if let Some(encoding_name) = scan_coding_comment(&source) {
+                    // If the input is already UTF-8 then there is no need to recode the source
+                    // If the declared encoding is 'binary' or 'ascii-8bit' then it is not clear how
+                    // to interpret characters. In this case it is probably best to leave the input
+                    // unchanged.
+                    if !encoding_name.eq_ignore_ascii_case("utf-8")
+                        && !encoding_name.eq_ignore_ascii_case("ascii-8bit")
+                        && !encoding_name.eq_ignore_ascii_case("binary")
+                    {
+                        if let Some(encoding) = encoding_from_name(&encoding_name) {
+                            needs_conversion =
+                                encoding.whatwg_name().unwrap_or_default() != "utf-8";
+                            if needs_conversion {
+                                match encoding
+                                    .decode(&source, encoding::types::DecoderTrap::Replace)
+                                {
+                                    Ok(str) => source = str.as_bytes().to_owned(),
+                                    Err(msg) => {
+                                        needs_conversion = false;
+                                        tracing::warn!(
+                                            "{}: character decoding failure: {} ({})",
+                                            &path.to_string_lossy(),
+                                            msg,
+                                            &encoding_name
+                                        );
+                                    }
+                                }
+                            }
+                        } else {
+                            tracing::warn!(
+                                "{}: unknown character encoding: '{}'",
+                                &path.to_string_lossy(),
+                                &encoding_name
+                            );
+                        }
+                    }
+                }
                 code_ranges = vec![];
             }
             extractor::extract(
@@ -180,7 +241,11 @@ fn main() -> std::io::Result<()> {
                 &code_ranges,
             )?;
             std::fs::create_dir_all(&src_archive_file.parent().unwrap())?;
-            std::fs::copy(&path, &src_archive_file)?;
+            if needs_conversion {
+                std::fs::write(&src_archive_file, &source)?;
+            } else {
+                std::fs::copy(&path, &src_archive_file)?;
+            }
             write_trap(&trap_dir, path, trap_writer, &trap_compression)
         })
         .expect("failed to extract files");
@@ -299,3 +364,143 @@ fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {
     }
     result
 }
+
+/// Returns the index of the first byte at or after `index` that is not white space.
+///
+/// Space, tab, vertical tab, form feed and carriage return are skipped. A line feed is not
+/// skipped, so the scan never moves past the end of the current line. An `index` beyond the
+/// end of `content` is returned unchanged.
+fn skip_space(content: &[u8], index: usize) -> usize {
+    let skipped = content.get(index..).map_or(0, |rest| {
+        rest.iter()
+            // every ASCII white space byte except `\n`
+            .take_while(|&&byte| matches!(byte, b' ' | b'\t' | 0x0b | 0x0c | b'\r'))
+            .count()
+    });
+    index + skipped
+}
+
+/// Extracts the encoding name declared by a Ruby magic `coding` comment, if there is one.
+///
+/// Only the first line is inspected, or the second line when the input starts with a `#!`
+/// line. A leading UTF-8 byte order mark is ignored. The comment has to contain the word
+/// `coding` (in any letter case, possibly as the tail of a longer word such as `encoding` or
+/// `fileencoding`), followed by optional white space, a `:` or `=`, optional white space and
+/// the encoding name. The name consists of ASCII letters, digits, `-` and `_`.
+///
+/// Returns `None` when no such declaration is found.
+fn scan_coding_comment(content: &[u8]) -> std::option::Option<Cow<str>> {
+    let mut index = 0;
+    // skip UTF-8 BOM marker if there is one
+    if content.starts_with(&[0xef, 0xbb, 0xbf]) {
+        index += 3;
+    }
+    // skip #! line if there is one
+    if content[index..].starts_with(b"#!") {
+        index = match content[index..].iter().position(|&b| b == b'\n') {
+            Some(offset) => index + offset + 1,
+            None => content.len(),
+        };
+    }
+    index = skip_space(content, index);
+
+    // the line has to be a comment
+    if content.get(index) != Some(&b'#') {
+        return None;
+    }
+    index += 1;
+
+    // Look for "coding" (ignoring case) anywhere in the rest of the comment line. Every
+    // position is tested so that a keyword directly following a partial match is still found,
+    // as in "# ccoding: utf-8".
+    const KEYWORD: &[u8] = b"coding";
+    let line_len = content[index..]
+        .iter()
+        .position(|&b| b == b'\n')
+        .unwrap_or(content.len() - index);
+    let keyword_offset = content[index..index + line_len]
+        .windows(KEYWORD.len())
+        .position(|window| window.eq_ignore_ascii_case(KEYWORD))?;
+    index = skip_space(content, index + keyword_offset + KEYWORD.len());
+
+    // the keyword has to be followed by a separator
+    if !matches!(content.get(index), Some(&b':') | Some(&b'=')) {
+        return None;
+    }
+    index = skip_space(content, index + 1);
+
+    // the encoding name runs up to the first byte that cannot be part of a name
+    let name_len = content[index..]
+        .iter()
+        .take_while(|&&b| b == b'-' || b == b'_' || b.is_ascii_alphanumeric())
+        .count();
+    if name_len == 0 {
+        return None;
+    }
+    Some(String::from_utf8_lossy(&content[index..index + name_len]))
+}
+
+#[test]
+fn test_scan_coding_comment() {
+    // Each entry pairs a source text with the encoding name that should be reported for it.
+    let cases: &[(&str, Option<&str>)] = &[
+        ("# encoding: utf-8", Some("utf-8")),
+        ("#coding:utf-8", Some("utf-8")),
+        // only the first comment line is considered
+        ("# foo\n# encoding: utf-8", None),
+        // the first declaration on the line wins
+        ("# encoding: latin1 encoding: utf-8", Some("latin1")),
+        // the name is not validated by the scanner
+        ("# encoding: nonsense", Some("nonsense")),
+        ("# coding = utf-8", Some("utf-8")),
+        ("# CODING = utf-8", Some("utf-8")),
+        ("# CoDiNg = utf-8", Some("utf-8")),
+        ("# blah blahblahcoding = utf-8", Some("utf-8")),
+        // unicode BOM is ignored
+        ("\u{FEFF}# encoding: utf-8", Some("utf-8")),
+        ("\u{FEFF} # encoding: utf-8", Some("utf-8")),
+        ("#! /usr/bin/env ruby\n # encoding: utf-8", Some("utf-8")),
+        ("\u{FEFF}#! /usr/bin/env ruby\n # encoding: utf-8", Some("utf-8")),
+        // A #! must be the first thing on a line, otherwise it's a normal comment
+        (" #! /usr/bin/env ruby encoding = utf-8", Some("utf-8")),
+        (" #! /usr/bin/env ruby \n # encoding = utf-8", None),
+    ];
+
+    for &(text, expected) in cases.iter() {
+        let result = scan_coding_comment(text.as_bytes());
+        assert_eq!(result.as_deref(), expected, "source: {:?}", text);
+    }
+}
diff --git a/ruby/ql/test/library-tests/ast/Ast.expected b/ruby/ql/test/library-tests/ast/Ast.expected
@@ -1762,6 +1762,12 @@ escape_sequences/escapes.rb:
 #   93|   getStmt: [SymbolLiteral] :"\C-?"
 #   93|     getComponent: [StringEscapeSequenceComponent] \C
 #   93|     getComponent: [StringTextComponent] -?
+misc/iso-8859-15.rb:
+#    1| [Toplevel] iso-8859-15.rb
+#    4|   getStmt: [MethodCall] call to print
+#    4|     getReceiver: [SelfVariableAccess] self
+#    4|     getArgument: [StringLiteral] "EUR = €"
+#    4|       getComponent: [StringTextComponent] EUR = €
 literals/literals.rb:
 #    1| [Toplevel] literals.rb
 #    2|   getStmt: [NilLiteral] nil
diff --git a/ruby/ql/test/library-tests/ast/TreeSitter.expected b/ruby/ql/test/library-tests/ast/TreeSitter.expected
@@ -4604,6 +4604,17 @@ literals/literals.rb:
 #  193|    cat file.txt
 #  193|   
 #  195|   1: [HeredocEnd] SCRIPT
+misc/iso-8859-15.rb:
+#    1| [Program] Program
+#    4|   0: [Call] Call
+#    4|     0: [Identifier] print
+#    4|     1: [ArgumentList] ArgumentList
+#    4|       0: [String] String
+#    4|         0: [ReservedWord] "
+#    4|         1: [StringContent] EUR = €
+#    4|         2: [ReservedWord] "
+#    1| [Comment] #! /usr/bin/ruby
+#    2| [Comment] # coding: iso-8859-15
 misc/misc.erb:
 #    2| [Program] Program
 #    2|   0: [Call] Call
diff --git a/ruby/ql/test/library-tests/ast/ValueText.expected b/ruby/ql/test/library-tests/ast/ValueText.expected
@@ -717,6 +717,7 @@ exprValue
 | literals/literals.rb:198:8:198:8 | 5 | 5 | int |
 | literals/literals.rb:199:2:199:2 | :y | :y | symbol |
 | literals/literals.rb:199:7:199:7 | :Z | :Z | symbol |
+| misc/iso-8859-15.rb:4:7:4:17 | "EUR = \u20ac" | EUR = \u20ac | string |
 | misc/misc.erb:2:15:2:37 | "main_include_admin.js" | main_include_admin.js | string |
 | misc/misc.rb:1:7:1:11 | "bar" | bar | string |
 | misc/misc.rb:3:7:3:9 | foo | foo | string |
@@ -1592,6 +1593,7 @@ exprCfgNodeValue
 | literals/literals.rb:198:8:198:8 | 5 | 5 | int |
 | literals/literals.rb:199:2:199:2 | :y | :y | symbol |
 | literals/literals.rb:199:7:199:7 | :Z | :Z | symbol |
+| misc/iso-8859-15.rb:4:7:4:17 | "EUR = \u20ac" | EUR = \u20ac | string |
 | misc/misc.erb:2:15:2:37 | "main_include_admin.js" | main_include_admin.js | string |
 | misc/misc.rb:1:7:1:11 | "bar" | bar | string |
 | misc/misc.rb:3:7:3:9 | foo | foo | string |
diff --git a/ruby/ql/test/library-tests/ast/misc/iso-8859-15.rb b/ruby/ql/test/library-tests/ast/misc/iso-8859-15.rb
@@ -0,0 +1,4 @@
+#! /usr/bin/ruby
+# coding: iso-8859-15
+
+print "EUR = �"

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +#! /usr/bin/ruby
 +# coding: iso-8859-15
++
 +print "EUR = ¤"