ruby · kddnewton · Jan 3, 2025 · Dec 23, 2024 · Dec 30, 2024 · Dec 30, 2024
diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb
@@ -200,7 +200,10 @@ class Lexer
           :tEQL, :tLPAREN, :tLPAREN2, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
         ]
 
-        private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES
+        # Heredocs are complex and require us to keep track of a bit of info to refer to later
+        HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
+
+        private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
 
         # The Parser::Source::Buffer that the tokens were lexed from.
         attr_reader :source_buffer
@@ -230,7 +233,8 @@ def to_a
           index = 0
           length = lexed.length
 
-          heredoc_identifier_stack = []
+          heredoc_stack = Array.new
+          quote_stack = Array.new
 
           while index < length
             token, state = lexed[index]
@@ -299,9 +303,6 @@ def to_a
             when :tSPACE
               value = nil
             when :tSTRING_BEG
-              if token.type == :HEREDOC_START
-                heredoc_identifier_stack.push(value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier])
-              end
               next_token = lexed[index][0]
               next_next_token = lexed[index + 1][0]
               basic_quotes = ["\"", "'"].include?(value)
@@ -312,47 +313,90 @@ def to_a
                 value = ""
                 location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
                 index += 1
-              elsif basic_quotes && next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END
-                # the parser gem doesn't simplify strings when its value ends in a newline
-                unless (string_value = next_token.value).end_with?("\n")
-                  next_location = token.location.join(next_next_token.location)
-                  value = string_value.gsub("\\\\", "\\")
-                  type = :tSTRING
-                  location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
-                  index += 2
+              elsif value.start_with?("'", '"', "%")
+                if next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END
+                  # the parser gem doesn't simplify strings when its value ends in a newline
+                  if !(string_value = next_token.value).end_with?("\n") && basic_quotes
+                    next_location = token.location.join(next_next_token.location)
+                    value = unescape_string(string_value, value)
+                    type = :tSTRING
+                    location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
+                    index += 2
+                    tokens << [type, [value, location]]
+
+                    next
+                  end
                 end
-              elsif value.start_with?("<<")
+
+                quote_stack.push(value)
+              elsif token.type == :HEREDOC_START
                 quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
+                heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
+                heredoc = HeredocData.new(
+                  identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
+                  common_whitespace: 0,
+                )
+
                 if quote == "`"
                   type = :tXSTRING_BEG
-                  value = "<<`"
+                end
+
+                # The parser gem trims whitespace from squiggly heredocs. We must record
+                # the amount of common leading whitespace so it can be removed later.
+                if heredoc_type == "~" || heredoc_type == "`"
+                  heredoc.common_whitespace = calculate_heredoc_whitespace(index)
+                end
+
+                if quote == "'" || quote == '"' || quote == "`"
+                  value = "<<#{quote}"
                 else
-                  value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}"
+                  value = '<<"'
                 end
+
+                heredoc_stack.push(heredoc)
+                quote_stack.push(value)
               end
             when :tSTRING_CONTENT
-              unless (lines = token.value.lines).one?
+              if (lines = token.value.lines).one?
+                # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
+                is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
+                # The parser gem only removes indentation when the heredoc is not nested
+                not_nested = heredoc_stack.size == 1
+                if is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
+                  value = trim_heredoc_whitespace(value, current_heredoc)
+                end
+
+                value = unescape_string(value, quote_stack.last)
+              else
+                # When the parser gem encounters a line continuation inside of a multiline string,
+                # it emits a single string node. The backslash (and remaining newline) is removed.
+                current_line = +""
+                adjustment = 0
                 start_offset = offset_cache[token.location.start_offset]
-                lines.map do |line|
-                  newline = line.end_with?("\r\n") ? "\r\n" : "\n"
+                emit = false
+
+                lines.each.with_index do |line, index|
                   chomped_line = line.chomp
-                  if match = chomped_line.match(/(?<backslashes>\\+)\z/)
-                    adjustment = match[:backslashes].size / 2
-                    adjusted_line = chomped_line.delete_suffix("\\" * adjustment)
-                    if match[:backslashes].size.odd?
-                      adjusted_line.delete_suffix!("\\")
-                      adjustment += 2
-                    else
-                      adjusted_line << newline
-                    end
+
+                  # When the line ends with an odd number of backslashes, it must be a line continuation.
+                  if chomped_line[/\\{1,}\z/]&.length&.odd?
+                    chomped_line.delete_suffix!("\\")
+                    current_line << chomped_line
+                    adjustment += 2
+                    # If the string ends with a line continuation emit the remainder
+                    emit = index == lines.count - 1
                   else
-                    adjusted_line = line
-                    adjustment = 0
+                    current_line << line
+                    emit = true
                   end
 
-                  end_offset = start_offset + adjusted_line.bytesize + adjustment
-                  tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
-                  start_offset = end_offset
+                  if emit
+                    end_offset = start_offset + current_line.bytesize + adjustment
+                    tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
+                    start_offset = end_offset
+                    current_line = +""
+                    adjustment = 0
+                  end
                 end
                 next
               end
@@ -361,12 +405,14 @@ def to_a
             when :tSTRING_END
               if token.type == :HEREDOC_END && value.end_with?("\n")
                 newline_length = value.end_with?("\r\n") ? 2 : 1
-                value = heredoc_identifier_stack.pop
+                value = heredoc_stack.pop.identifier
                 location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - newline_length])
               elsif token.type == :REGEXP_END
                 value = value[0]
                 location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])
               end
+
+              quote_stack.pop
             when :tSYMBEG
               if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
                 next_location = token.location.join(next_token.location)
@@ -375,6 +421,8 @@ def to_a
                 value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
                 location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
                 index += 1
+              else
+                quote_stack.push(value)
               end
             when :tFID
               if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
@@ -384,10 +432,15 @@ def to_a
               if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :STRING_END
                 type = :tBACK_REF2
               end
+              quote_stack.push(value)
             when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
               if (next_token = lexed[index][0]) && next_token.type == :WORDS_SEP
                 index += 1
               end
+
+              quote_stack.push(value)
+            when :tREGEXP_BEG
+              quote_stack.push(value)
             end
 
             tokens << [type, [value, location]]
@@ -443,6 +496,104 @@ def parse_rational(value)
         rescue ArgumentError
           0r
         end
+
+        # Wonky heredoc tab/spaces rules.
+        # https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558
+        def calculate_heredoc_whitespace(heredoc_token_index)
+          next_token_index = heredoc_token_index
+          nesting_level = 0
+          previous_line = -1
+          result = Float::MAX
+
+          while (lexed[next_token_index] && next_token = lexed[next_token_index][0])
+            next_token_index += 1
+            next_next_token = lexed[next_token_index] && lexed[next_token_index][0]
+
+            # String content inside nested heredocs and interpolation is ignored
+            if next_token.type == :HEREDOC_START || next_token.type == :EMBEXPR_BEGIN
+              nesting_level += 1
+            elsif next_token.type == :HEREDOC_END || next_token.type == :EMBEXPR_END
+              nesting_level -= 1
+              # When we encountered the matching heredoc end, we can exit
+              break if nesting_level == -1
+            elsif next_token.type == :STRING_CONTENT && nesting_level == 0
+              common_whitespace = 0
+              next_token.value[/^\s*/].each_char do |char|
+                if char == "\t"
+                  common_whitespace = (common_whitespace / 8 + 1) * 8;
+                else
+                  common_whitespace += 1
+                end
+              end
+
+              is_first_token_on_line = next_token.location.start_line != previous_line
+              # Whitespace is significant if followed by interpolation
+              whitespace_only = common_whitespace == next_token.value.length && next_next_token&.location&.start_line != next_token.location.start_line
+              if is_first_token_on_line && !whitespace_only && common_whitespace < result
+                result = common_whitespace
+                previous_line = next_token.location.start_line
+              end
+            end
+          end
+          result
+        end
+
+        # Wonky heredoc tab/spaces rules.
+        # https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545
+        def trim_heredoc_whitespace(string, heredoc)
+          trimmed_whitespace = 0
+          trimmed_characters = 0
+          while (string[trimmed_characters] == "\t" || string[trimmed_characters] == " ") && trimmed_whitespace < heredoc.common_whitespace
+            if string[trimmed_characters] == "\t"
+              trimmed_whitespace = (trimmed_whitespace / 8 + 1) * 8;
+              break if trimmed_whitespace > heredoc.common_whitespace
+            else
+              trimmed_whitespace += 1
+            end
+            trimmed_characters += 1
+          end
+
+          string[trimmed_characters..]
+        end
+
+        # Escape sequences that have special and should appear unescaped in the resulting string.
+        ESCAPES = {
+          "a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
+          "n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t",
+          "v" => "\v", "\\\\" => "\\"
+        }.freeze
+        private_constant :ESCAPES
+
+        # Opening bracket delimiters mapped to their closing counterparts. When a
+        # literal is opened with one of these, the closing one may be escaped as well.
+        DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze
+        private_constant :DELIMITER_SYMETRY
+
+        # TODO: Does not handle "\u1234" and other longer-form escapes.
+        def unescape_string(string, quote)
+          # In single-quoted heredocs, everything is taken literally.
+          return string if quote == "<<'"
+
+          # TODO: Implement regexp escaping
+          return string if quote == "/" || quote.start_with?("%r")
+
+          if quote == "'" || quote.start_with?("%q") || quote.start_with?("%w") || quote.start_with?("%i")
+            if quote == "'"
+              delimiter = "'"
+            else
+              delimiter = quote[2]
+            end
+
+            delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
+            string.gsub(/\\([\\#{delimiters}])/, '\1')
+          else
+            # When double-quoted, escape sequences can be written literally. For example, "\\n" becomes "\n",
+            # and "\\\\n" becomes "\\n". Unknown escapes sequences, like "\\o" simply become "o".
+            string.gsub(/\\./) do |match|
+              ESCAPES[match[1]] || match[1]
+            end
+          end
+        end
       end
     end
   end

diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb
@@ -93,7 +93,6 @@ class ParserTest < TestCase
       "methods.txt",
       "strings.txt",
       "tilde_heredocs.txt",
-      "xstring_with_backslash.txt",
       "seattlerb/backticks_interpolation_line.txt",
       "seattlerb/bug169.txt",
       "seattlerb/case_in.txt",
@@ -102,55 +101,30 @@ class ParserTest < TestCase
       "seattlerb/difficult6__7.txt",
       "seattlerb/difficult6__8.txt",
       "seattlerb/dsym_esc_to_sym.txt",
-      "seattlerb/heredoc__backslash_dos_format.txt",
-      "seattlerb/heredoc_backslash_nl.txt",
-      "seattlerb/heredoc_squiggly_blank_line_plus_interpolation.txt",
-      "seattlerb/heredoc_squiggly_blank_lines.txt",
-      "seattlerb/heredoc_squiggly_interp.txt",
-      "seattlerb/heredoc_squiggly_tabs_extra.txt",
-      "seattlerb/heredoc_squiggly_tabs.txt",
-      "seattlerb/heredoc_squiggly_visually_blank_lines.txt",
-      "seattlerb/heredoc_squiggly.txt",
       "seattlerb/heredoc_unicode.txt",
-      "seattlerb/heredoc_with_carriage_return_escapes_windows.txt",
-      "seattlerb/heredoc_with_carriage_return_escapes.txt",
-      "seattlerb/heredoc_with_interpolation_and_carriage_return_escapes_windows.txt",
-      "seattlerb/heredoc_with_interpolation_and_carriage_return_escapes.txt",
       "seattlerb/module_comments.txt",
       "seattlerb/parse_line_block_inline_comment_leading_newlines.txt",
       "seattlerb/parse_line_block_inline_comment.txt",
       "seattlerb/parse_line_block_inline_multiline_comment.txt",
-      "seattlerb/parse_line_dstr_escaped_newline.txt",
       "seattlerb/parse_line_heredoc.txt",
-      "seattlerb/parse_line_multiline_str_literal_n.txt",
-      "seattlerb/parse_line_str_with_newline_escape.txt",
       "seattlerb/pct_w_heredoc_interp_nested.txt",
-      "seattlerb/qw_escape_term.txt",
       "seattlerb/read_escape_unicode_curlies.txt",
       "seattlerb/read_escape_unicode_h4.txt",
       "seattlerb/required_kwarg_no_value.txt",
       "seattlerb/slashy_newlines_within_string.txt",
-      "seattlerb/str_double_escaped_newline.txt",
       "seattlerb/str_evstr_escape.txt",
-      "seattlerb/str_newline_hash_line_number.txt",
       "seattlerb/TestRubyParserShared.txt",
       "unparser/corpus/literal/assignment.txt",
-      "unparser/corpus/literal/dstr.txt",
-      "unparser/corpus/semantic/opasgn.txt",
       "whitequark/args.txt",
       "whitequark/beginless_erange_after_newline.txt",
       "whitequark/beginless_irange_after_newline.txt",
       "whitequark/bug_ascii_8bit_in_literal.txt",
       "whitequark/bug_def_no_paren_eql_begin.txt",
-      "whitequark/dedenting_heredoc.txt",
-      "whitequark/dedenting_non_interpolating_heredoc_line_continuation.txt",
       "whitequark/forward_arg_with_open_args.txt",
       "whitequark/lbrace_arg_after_command_args.txt",
       "whitequark/multiple_pattern_matches.txt",
       "whitequark/newline_in_hash_argument.txt",
       "whitequark/parser_bug_640.txt",
-      "whitequark/parser_drops_truncated_parts_of_squiggly_heredoc.txt",
-      "whitequark/ruby_bug_11990.txt",
       "whitequark/ruby_bug_14690.txt",
       "whitequark/ruby_bug_9669.txt",
       "whitequark/slash_newline_in_heredocs.txt",