diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb index b03b1406cd..d50305e632 100644 --- a/lib/prism/translation/parser/lexer.rb +++ b/lib/prism/translation/parser/lexer.rb @@ -200,7 +200,10 @@ class Lexer :tEQL, :tLPAREN, :tLPAREN2, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS ] - private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES + # Heredocs are complex and require us to keep track of a bit of info to refer to later + HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true) + + private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData # The Parser::Source::Buffer that the tokens were lexed from. attr_reader :source_buffer @@ -230,7 +233,8 @@ def to_a index = 0 length = lexed.length - heredoc_identifier_stack = [] + heredoc_stack = Array.new + quote_stack = Array.new while index < length token, state = lexed[index] @@ -299,9 +303,6 @@ def to_a when :tSPACE value = nil when :tSTRING_BEG - if token.type == :HEREDOC_START - heredoc_identifier_stack.push(value.match(/<<[-~]?["'`]?(?.*?)["'`]?\z/)[:heredoc_identifier]) - end next_token = lexed[index][0] next_next_token = lexed[index + 1][0] basic_quotes = ["\"", "'"].include?(value) @@ -312,47 +313,90 @@ def to_a value = "" location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset]) index += 1 - elsif basic_quotes && next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END - # the parser gem doesn't simplify strings when its value ends in a newline - unless (string_value = next_token.value).end_with?("\n") - next_location = token.location.join(next_next_token.location) - value = string_value.gsub("\\\\", "\\") - type = :tSTRING - location = Range.new(source_buffer, 
offset_cache[next_location.start_offset], offset_cache[next_location.end_offset]) - index += 2 + elsif value.start_with?("'", '"', "%") + if next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END + # the parser gem doesn't simplify strings when its value ends in a newline + if !(string_value = next_token.value).end_with?("\n") && basic_quotes + next_location = token.location.join(next_next_token.location) + value = unescape_string(string_value, value) + type = :tSTRING + location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset]) + index += 2 + tokens << [type, [value, location]] + + next + end end - elsif value.start_with?("<<") + + quote_stack.push(value) + elsif token.type == :HEREDOC_START quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2] + heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : "" + heredoc = HeredocData.new( + identifier: value.match(/<<[-~]?["'`]?(?.*?)["'`]?\z/)[:heredoc_identifier], + common_whitespace: 0, + ) + if quote == "`" type = :tXSTRING_BEG - value = "<<`" + end + + # The parser gem trims whitespace from squiggly heredocs. We must record + # the most common whitespace to later remove. + if heredoc_type == "~" || heredoc_type == "`" + heredoc.common_whitespace = calculate_heredoc_whitespace(index) + end + + if quote == "'" || quote == '"' || quote == "`" + value = "<<#{quote}" else - value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}" + value = '<<"' end + + heredoc_stack.push(heredoc) + quote_stack.push(value) end when :tSTRING_CONTENT - unless (lines = token.value.lines).one? + if (lines = token.value.lines).one? + # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line. 
+ is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line + # The parser gem only removes indentation when the heredoc is not nested + not_nested = heredoc_stack.size == 1 + if is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0 + value = trim_heredoc_whitespace(value, current_heredoc) + end + + value = unescape_string(value, quote_stack.last) + else + # When the parser gem encounters a line continuation inside of a multiline string, + # it emits a single string node. The backslash (and remaining newline) is removed. + current_line = +"" + adjustment = 0 start_offset = offset_cache[token.location.start_offset] - lines.map do |line| - newline = line.end_with?("\r\n") ? "\r\n" : "\n" + emit = false + + lines.each.with_index do |line, index| chomped_line = line.chomp - if match = chomped_line.match(/(?\\+)\z/) - adjustment = match[:backslashes].size / 2 - adjusted_line = chomped_line.delete_suffix("\\" * adjustment) - if match[:backslashes].size.odd? - adjusted_line.delete_suffix!("\\") - adjustment += 2 - else - adjusted_line << newline - end + + # When the line ends with an odd number of backslashes, it must be a line continuation. + if chomped_line[/\\{1,}\z/]&.length&.odd? 
+ chomped_line.delete_suffix!("\\") + current_line << chomped_line + adjustment += 2 + # If the string ends with a line continuation emit the remainder + emit = index == lines.count - 1 else - adjusted_line = line - adjustment = 0 + current_line << line + emit = true end - end_offset = start_offset + adjusted_line.bytesize + adjustment - tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]] - start_offset = end_offset + if emit + end_offset = start_offset + current_line.bytesize + adjustment + tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]] + start_offset = end_offset + current_line = +"" + adjustment = 0 + end end next end @@ -361,12 +405,14 @@ def to_a when :tSTRING_END if token.type == :HEREDOC_END && value.end_with?("\n") newline_length = value.end_with?("\r\n") ? 2 : 1 - value = heredoc_identifier_stack.pop + value = heredoc_stack.pop.identifier location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - newline_length]) elsif token.type == :REGEXP_END value = value[0] location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1]) end + + quote_stack.pop when :tSYMBEG if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END next_location = token.location.join(next_token.location) @@ -375,6 +421,8 @@ def to_a value = { "~@" => "~", "!@" => "!" }.fetch(value, value) location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset]) index += 1 + else + quote_stack.push(value) end when :tFID if !tokens.empty? 
# Computes how many columns of leading whitespace are shared by the lines of
# a squiggly heredoc, so the same amount can later be removed from each line.
#
# Scanning starts at +lexed[heredoc_token_index]+ and stops at the matching
# HEREDOC_END. The index is therefore expected to point at the first token
# *after* the heredoc opener; the opener itself would be counted as a nested
# heredoc. String content inside nested heredocs and interpolation is ignored.
#
# Tabs advance to the next multiple of 8 columns ("wonky heredoc tab/spaces
# rules"), see
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558
#
# Returns the smallest indentation width found, or Float::MAX when no line
# contributed one (e.g. the body is empty or consists of blank lines only).
def calculate_heredoc_whitespace(heredoc_token_index)
  cursor = heredoc_token_index
  depth = 0
  previous_line = -1
  result = Float::MAX

  while (token = lexed[cursor]&.first)
    cursor += 1
    following = lexed[cursor]&.first

    case token.type
    when :HEREDOC_START, :EMBEXPR_BEGIN
      depth += 1
    when :HEREDOC_END, :EMBEXPR_END
      depth -= 1
      # When we encountered the matching heredoc end, we can exit
      break if depth == -1
    when :STRING_CONTENT
      next unless depth.zero?

      line = token.location.start_line
      first_on_line = line != previous_line
      # Remember the line of *every* content token, not only of those that
      # lowered the minimum. Otherwise content following an interpolation
      # ("    b#{x} c") would be mistaken for the start of a new line.
      previous_line = line

      leading = token.value[/\A\s*/]
      # A token made up of whitespace only does not count, unless it is
      # followed by interpolation on the same line, which makes it significant.
      # Compare character counts here: the column width differs from the
      # character count as soon as a tab is involved.
      blank = leading.length == token.value.length && following&.location&.start_line != line
      next if blank || !first_on_line

      width = 0
      leading.each_char do |char|
        width = char == "\t" ? (width / 8 + 1) * 8 : width + 1
      end
      result = width if width < result
    end
  end

  result
end

# Removes up to +heredoc.common_whitespace+ columns of leading spaces and tabs
# from +string+ and returns the remainder as a new string.
#
# A tab jumps to the next multiple of 8 columns. When that jump would go past
# the requested width, the tab is kept and trimming stops.
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545
def trim_heredoc_whitespace(string, heredoc)
  limit = heredoc.common_whitespace
  columns = 0
  consumed = 0

  while columns < limit
    case string[consumed]
    when " "
      columns += 1
    when "\t"
      columns = (columns / 8 + 1) * 8
      break if columns > limit
    else
      break
    end
    consumed += 1
  end

  string[consumed..]
end

# Escape sequences that have special meaning and should appear unescaped in
# the resulting string. Keys are the single character following the backslash.
ESCAPES = {
  "a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
  "n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t",
  "v" => "\v", "\\" => "\\"
}.freeze
private_constant :ESCAPES

# When one of these delimiters is encountered, then the other
# one is allowed to be escaped as well.
DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze
private_constant :DELIMITER_SYMETRY

# Converts the raw source text of a string part into the value it denotes.
#
# +string+ is the raw content, +quote+ the text that opened the enclosing
# literal (for example "'", "\"", ":'", "%q(", "<<\"" or "/").
#
# * +nil+ quote, single-quoted heredocs and regexps: returned unchanged.
# * Non-interpolating literals ('...', :'...', %q, %w, %i, %s): only an
#   escaped backslash or an escaped delimiter loses its backslash.
# * Everything else is treated as double-quoted: known escapes from ESCAPES
#   are substituted, unknown ones ("\\o") simply drop the backslash.
#
# TODO: Does not handle "\u1234" and other longer-form escapes.
def unescape_string(string, quote)
  # No opening delimiter was recorded; leave the content as lexed instead of
  # failing with NoMethodError on nil.
  return string if quote.nil?

  # In single-quoted heredocs, everything is taken literally.
  return string if quote == "<<'"

  # TODO: Implement regexp escaping
  return string if quote == "/" || quote.start_with?("%r")

  if quote == "'" || quote == ":'" || quote.start_with?("%q", "%w", "%i", "%s")
    delimiter = quote.start_with?("%") ? quote[2] : "'"
    delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
    string.gsub(/\\([\\#{delimiters}])/, '\1')
  else
    # When double-quoted, escape sequences can be written literally. For example, "\\n" becomes "\n",
    # and "\\\\n" becomes "\\n". Unknown escape sequences, like "\\o", simply become "o".
    string.gsub(/\\./) { |match| ESCAPES.fetch(match[1], match[1]) }
  end
end
"seattlerb/slashy_newlines_within_string.txt", - "seattlerb/str_double_escaped_newline.txt", "seattlerb/str_evstr_escape.txt", - "seattlerb/str_newline_hash_line_number.txt", "seattlerb/TestRubyParserShared.txt", "unparser/corpus/literal/assignment.txt", - "unparser/corpus/literal/dstr.txt", - "unparser/corpus/semantic/opasgn.txt", "whitequark/args.txt", "whitequark/beginless_erange_after_newline.txt", "whitequark/beginless_irange_after_newline.txt", "whitequark/bug_ascii_8bit_in_literal.txt", "whitequark/bug_def_no_paren_eql_begin.txt", - "whitequark/dedenting_heredoc.txt", - "whitequark/dedenting_non_interpolating_heredoc_line_continuation.txt", "whitequark/forward_arg_with_open_args.txt", "whitequark/lbrace_arg_after_command_args.txt", "whitequark/multiple_pattern_matches.txt", "whitequark/newline_in_hash_argument.txt", "whitequark/parser_bug_640.txt", - "whitequark/parser_drops_truncated_parts_of_squiggly_heredoc.txt", - "whitequark/ruby_bug_11990.txt", "whitequark/ruby_bug_14690.txt", "whitequark/ruby_bug_9669.txt", "whitequark/slash_newline_in_heredocs.txt",