diff --git a/lib/prism/translation/parser/compiler.rb b/lib/prism/translation/parser/compiler.rb index 54e08eb991..a20136a03d 100644 --- a/lib/prism/translation/parser/compiler.rb +++ b/lib/prism/translation/parser/compiler.rb @@ -1511,13 +1511,11 @@ def visit_redo_node(node) # /foo/ # ^^^^^ def visit_regular_expression_node(node) - content = node.content parts = - if content.include?("\n") - offset = node.content_loc.start_offset - content.lines.map do |line| - builder.string_internal([line, srange_offsets(offset, offset += line.bytesize)]) - end + if node.content == "" + [] + elsif node.content.include?("\n") + string_nodes_from_line_continuations(node, node.content_loc.start_offset, node.opening) else [builder.string_internal(token(node.content_loc))] end @@ -2074,55 +2072,7 @@ def visit_heredoc(node) node.parts.each do |part| pushing = if part.is_a?(StringNode) && part.unescaped.include?("\n") - unescaped = part.unescaped.lines - escaped = part.content.lines - - escaped_lengths = [] - normalized_lengths = [] - # Keeps track of where an unescaped line should start a new token. An unescaped - # \n would otherwise be indistinguishable from the actual newline at the end of - # of the line. The parser gem only emits a new string node at "real" newlines, - # line continuations don't start a new node as well. - do_next_tokens = [] - - if node.opening.end_with?("'") - escaped.each do |line| - escaped_lengths << line.bytesize - normalized_lengths << chomped_bytesize(line) - do_next_tokens << true - end - else - escaped - .chunk_while { |before, after| before[/(\\*)\r?\n$/, 1]&.length&.odd? || false } - .each do |lines| - escaped_lengths << lines.sum(&:bytesize) - normalized_lengths << lines.sum { |line| chomped_bytesize(line) } - unescaped_lines_count = lines.sum do |line| - line.scan(/(\\*)n/).count { |(backslashes)| backslashes&.length&.odd? || false } - end - do_next_tokens.concat(Array.new(unescaped_lines_count + 1, false)) - do_next_tokens[-1] = true - end - end - - start_offset = part.location.start_offset - current_line = +"" - current_normalized_length = 0 - - unescaped.filter_map.with_index do |unescaped_line, index| - current_line << unescaped_line - current_normalized_length += normalized_lengths.fetch(index, 0) - - if do_next_tokens[index] - inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)]) - start_offset += escaped_lengths.fetch(index, 0) - current_line = +"" - current_normalized_length = 0 - inner_part - else - nil - end - end + string_nodes_from_line_continuations(part, part.location.start_offset, node.opening) else [visit(part)] end @@ -2172,6 +2122,59 @@ def within_pattern parser.pattern_variables.pop end end + + # Create parser string nodes from a single prism node. The parser gem + # "glues" strings together when a line continuation is encountered. + def string_nodes_from_line_continuations(node, start_offset, opening) + unescaped = node.unescaped.lines + escaped = node.content.lines + + escaped_lengths = [] + normalized_lengths = [] + # Keeps track of where an unescaped line should start a new token. An unescaped + # \n would otherwise be indistinguishable from the actual newline at the end of + # of the line. The parser gem only emits a new string node at "real" newlines, + # line continuations don't start a new node as well. + do_next_tokens = [] + + if opening.end_with?("'") + escaped.each do |line| + escaped_lengths << line.bytesize + normalized_lengths << chomped_bytesize(line) + do_next_tokens << true + end + else + escaped + .chunk_while { |before, after| before[/(\\*)\r?\n$/, 1]&.length&.odd? || false } + .each do |lines| + escaped_lengths << lines.sum(&:bytesize) + normalized_lengths << lines.sum { |line| chomped_bytesize(line) } + unescaped_lines_count = lines.sum do |line| + line.scan(/(\\*)n/).count { |(backslashes)| backslashes&.length&.odd? || false } + end + do_next_tokens.concat(Array.new(unescaped_lines_count + 1, false)) + do_next_tokens[-1] = true + end + end + + current_line = +"" + current_normalized_length = 0 + + unescaped.filter_map.with_index do |unescaped_line, index| + current_line << unescaped_line + current_normalized_length += normalized_lengths.fetch(index, 0) + + if do_next_tokens[index] + inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)]) + start_offset += escaped_lengths.fetch(index, 0) + current_line = +"" + current_normalized_length = 0 + inner_part + else + nil + end + end + end end end end diff --git a/test/prism/fixtures/regex.txt b/test/prism/fixtures/regex.txt index 4623733f58..712bfc081a 100644 --- a/test/prism/fixtures/regex.txt +++ b/test/prism/fixtures/regex.txt @@ -46,3 +46,13 @@ tap { /(?)/ =~ to_s } def foo(nil:) = /(?)/ =~ "" /(?-x:#)/x + +/a +b\ +c\ +d\\\ +e\\ +f\ +/ + +// diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb index b440c8955f..f8a59d36a0 100644 --- a/test/prism/ruby/parser_test.rb +++ b/test/prism/ruby/parser_test.rb @@ -59,7 +59,6 @@ class ParserTest < TestCase # These files are either failing to parse or failing to translate, so we'll # skip them for now. skip_all = skip_incorrect | [ - "regex.txt", "unescaping.txt", "seattlerb/bug190.txt", "seattlerb/heredoc_with_extra_carriage_returns_windows.txt", diff --git a/test/prism/snapshots/regex.txt b/test/prism/snapshots/regex.txt index c90e689760..e8835648ee 100644 --- a/test/prism/snapshots/regex.txt +++ b/test/prism/snapshots/regex.txt @@ -1,10 +1,10 @@ -@ ProgramNode (location: (1,0)-(48,10)) +@ ProgramNode (location: (1,0)-(58,2)) ├── flags: ∅ ├── locals: [:foo, :ab, :abc, :a] └── statements: - @ StatementsNode (location: (1,0)-(48,10)) + @ StatementsNode (location: (1,0)-(58,2)) ├── flags: ∅ - └── body: (length: 26) + └── body: (length: 28) ├── @ CallNode (location: (1,0)-(1,9)) │ ├── flags: newline, ignore_visibility │ ├── receiver: ∅ @@ -537,9 +537,21 @@ │ ├── rparen_loc: (46,12)-(46,13) = ")" │ ├── equal_loc: (46,14)-(46,15) = "=" │ └── end_keyword_loc: ∅ - └── @ RegularExpressionNode (location: (48,0)-(48,10)) - ├── flags: newline, static_literal, extended, forced_us_ascii_encoding - ├── opening_loc: (48,0)-(48,1) = "/" - ├── content_loc: (48,1)-(48,8) = "(?-x:#)" - ├── closing_loc: (48,8)-(48,10) = "/x" - └── unescaped: "(?-x:#)" + ├── @ RegularExpressionNode (location: (48,0)-(48,10)) + │ ├── flags: newline, static_literal, extended, forced_us_ascii_encoding + │ ├── opening_loc: (48,0)-(48,1) = "/" + │ ├── content_loc: (48,1)-(48,8) = "(?-x:#)" + │ ├── closing_loc: (48,8)-(48,10) = "/x" + │ └── unescaped: "(?-x:#)" + ├── @ RegularExpressionNode (location: (50,0)-(56,1)) + │ ├── flags: newline, static_literal, forced_us_ascii_encoding + │ ├── opening_loc: (50,0)-(50,1) = "/" + │ ├── content_loc: (50,1)-(56,0) = "a\nb\\\nc\\\nd\\\\\\\ne\\\\\nf\\\n" + │ ├── closing_loc: (56,0)-(56,1) = "/" + │ └── unescaped: "a\nbcd\\\\e\\\\\nf" + └── @ RegularExpressionNode (location: (58,0)-(58,2)) + ├── flags: newline, static_literal, forced_us_ascii_encoding + ├── opening_loc: (58,0)-(58,1) = "/" + ├── content_loc: (58,1)-(58,1) = "" + ├── closing_loc: (58,1)-(58,2) = "/" + └── unescaped: ""