From 6bba1c54e18e4577bf55cdb06c4f7f5b0c8d7b64 Mon Sep 17 00:00:00 2001 From: Earlopain <14981592+Earlopain@users.noreply.github.com> Date: Mon, 6 Jan 2025 20:18:03 +0100 Subject: [PATCH 1/2] Fix parser translator ast for regex with line continuation Turns out, the vast majority of work was already done with handling the same for heredocs I'm confident this should also apply to actual string nodes (there's even a todo for it) but no tests change if I apply it there too, so I can't say for sure if the logic would be correct. The individual test files are a bit too large, maybe something else would break that currently passes. Leaving it for later to look more closely into that. --- lib/prism/translation/parser/compiler.rb | 111 ++++++++++++----------- test/prism/fixtures/regex.txt | 8 ++ test/prism/ruby/parser_test.rb | 1 - test/prism/snapshots/regex.txt | 24 +++-- 4 files changed, 79 insertions(+), 65 deletions(-) diff --git a/lib/prism/translation/parser/compiler.rb b/lib/prism/translation/parser/compiler.rb index 54e08eb991..c6a7154625 100644 --- a/lib/prism/translation/parser/compiler.rb +++ b/lib/prism/translation/parser/compiler.rb @@ -1511,13 +1511,9 @@ def visit_redo_node(node) # /foo/ # ^^^^^ def visit_regular_expression_node(node) - content = node.content parts = - if content.include?("\n") - offset = node.content_loc.start_offset - content.lines.map do |line| - builder.string_internal([line, srange_offsets(offset, offset += line.bytesize)]) - end + if node.content.include?("\n") + string_nodes_from_line_continuations(node, node.content_loc.start_offset, node.opening) else [builder.string_internal(token(node.content_loc))] end @@ -2074,55 +2070,7 @@ def visit_heredoc(node) node.parts.each do |part| pushing = if part.is_a?(StringNode) && part.unescaped.include?("\n") - unescaped = part.unescaped.lines - escaped = part.content.lines - - escaped_lengths = [] - normalized_lengths = [] - # Keeps track of where an unescaped line should start a new token. An unescaped - # \n would otherwise be indistinguishable from the actual newline at the end of - # of the line. The parser gem only emits a new string node at "real" newlines, - # line continuations don't start a new node as well. - do_next_tokens = [] - - if node.opening.end_with?("'") - escaped.each do |line| - escaped_lengths << line.bytesize - normalized_lengths << chomped_bytesize(line) - do_next_tokens << true - end - else - escaped - .chunk_while { |before, after| before[/(\\*)\r?\n$/, 1]&.length&.odd? || false } - .each do |lines| - escaped_lengths << lines.sum(&:bytesize) - normalized_lengths << lines.sum { |line| chomped_bytesize(line) } - unescaped_lines_count = lines.sum do |line| - line.scan(/(\\*)n/).count { |(backslashes)| backslashes&.length&.odd? || false } - end - do_next_tokens.concat(Array.new(unescaped_lines_count + 1, false)) - do_next_tokens[-1] = true - end - end - - start_offset = part.location.start_offset - current_line = +"" - current_normalized_length = 0 - - unescaped.filter_map.with_index do |unescaped_line, index| - current_line << unescaped_line - current_normalized_length += normalized_lengths.fetch(index, 0) - - if do_next_tokens[index] - inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)]) - start_offset += escaped_lengths.fetch(index, 0) - current_line = +"" - current_normalized_length = 0 - inner_part - else - nil - end - end + string_nodes_from_line_continuations(part, part.location.start_offset, node.opening) else [visit(part)] end @@ -2172,6 +2120,59 @@ def within_pattern parser.pattern_variables.pop end end + + # Create parser string nodes from a single prism node. The parser gem + # "glues" strings together when a line continuation is encountered. + def string_nodes_from_line_continuations(node, start_offset, opening) + unescaped = node.unescaped.lines + escaped = node.content.lines + + escaped_lengths = [] + normalized_lengths = [] + # Keeps track of where an unescaped line should start a new token. An unescaped + # \n would otherwise be indistinguishable from the actual newline at the end of + # of the line. The parser gem only emits a new string node at "real" newlines, + # line continuations don't start a new node as well. + do_next_tokens = [] + + if opening.end_with?("'") + escaped.each do |line| + escaped_lengths << line.bytesize + normalized_lengths << chomped_bytesize(line) + do_next_tokens << true + end + else + escaped + .chunk_while { |before, after| before[/(\\*)\r?\n$/, 1]&.length&.odd? || false } + .each do |lines| + escaped_lengths << lines.sum(&:bytesize) + normalized_lengths << lines.sum { |line| chomped_bytesize(line) } + unescaped_lines_count = lines.sum do |line| + line.scan(/(\\*)n/).count { |(backslashes)| backslashes&.length&.odd? || false } + end + do_next_tokens.concat(Array.new(unescaped_lines_count + 1, false)) + do_next_tokens[-1] = true + end + end + + current_line = +"" + current_normalized_length = 0 + + unescaped.filter_map.with_index do |unescaped_line, index| + current_line << unescaped_line + current_normalized_length += normalized_lengths.fetch(index, 0) + + if do_next_tokens[index] + inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)]) + start_offset += escaped_lengths.fetch(index, 0) + current_line = +"" + current_normalized_length = 0 + inner_part + else + nil + end + end + end end end end diff --git a/test/prism/fixtures/regex.txt b/test/prism/fixtures/regex.txt index 4623733f58..85e600fbdd 100644 --- a/test/prism/fixtures/regex.txt +++ b/test/prism/fixtures/regex.txt @@ -46,3 +46,11 @@ tap { /(?)/ =~ to_s } def foo(nil:) = /(?)/ =~ "" /(?-x:#)/x + +/a +b\ +c\ +d\\\ +e\\ +f\ +/ diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb index b440c8955f..f8a59d36a0 100644 --- a/test/prism/ruby/parser_test.rb +++ b/test/prism/ruby/parser_test.rb @@ -59,7 +59,6 @@ class ParserTest < TestCase # These files are either failing to parse or failing to translate, so we'll # skip them for now. skip_all = skip_incorrect | [ - "regex.txt", "unescaping.txt", "seattlerb/bug190.txt", "seattlerb/heredoc_with_extra_carriage_returns_windows.txt", diff --git a/test/prism/snapshots/regex.txt b/test/prism/snapshots/regex.txt index c90e689760..46a85252f2 100644 --- a/test/prism/snapshots/regex.txt +++ b/test/prism/snapshots/regex.txt @@ -1,10 +1,10 @@ -@ ProgramNode (location: (1,0)-(48,10)) +@ ProgramNode (location: (1,0)-(56,1)) ├── flags: ∅ ├── locals: [:foo, :ab, :abc, :a] └── statements: - @ StatementsNode (location: (1,0)-(48,10)) + @ StatementsNode (location: (1,0)-(56,1)) ├── flags: ∅ - └── body: (length: 26) + └── body: (length: 27) ├── @ CallNode (location: (1,0)-(1,9)) │ ├── flags: newline, ignore_visibility │ ├── receiver: ∅ @@ -537,9 +537,15 @@ │ ├── rparen_loc: (46,12)-(46,13) = ")" │ ├── equal_loc: (46,14)-(46,15) = "=" │ └── end_keyword_loc: ∅ - └── @ RegularExpressionNode (location: (48,0)-(48,10)) - ├── flags: newline, static_literal, extended, forced_us_ascii_encoding - ├── opening_loc: (48,0)-(48,1) = "/" - ├── content_loc: (48,1)-(48,8) = "(?-x:#)" - ├── closing_loc: (48,8)-(48,10) = "/x" - └── unescaped: "(?-x:#)" + ├── @ RegularExpressionNode (location: (48,0)-(48,10)) + │ ├── flags: newline, static_literal, extended, forced_us_ascii_encoding + │ ├── opening_loc: (48,0)-(48,1) = "/" + │ ├── content_loc: (48,1)-(48,8) = "(?-x:#)" + │ ├── closing_loc: (48,8)-(48,10) = "/x" + │ └── unescaped: "(?-x:#)" + └── @ RegularExpressionNode (location: (50,0)-(56,1)) + ├── flags: newline, static_literal, forced_us_ascii_encoding + ├── opening_loc: (50,0)-(50,1) = "/" + ├── content_loc: (50,1)-(56,0) = "a\nb\\\nc\\\nd\\\\\\\ne\\\\\nf\\\n" + ├── closing_loc: (56,0)-(56,1) = "/" + └── unescaped: "a\nbcd\\\\e\\\\\nf" From 1166db13dde3b34513c9c4cfe02d1644743817d8 Mon Sep 17 00:00:00 2001 From: Earlopain <14981592+Earlopain@users.noreply.github.com> Date: Mon, 6 Jan 2025 20:19:21 +0100 Subject: [PATCH 2/2] Fix parser translator ast for empty regex In that specific case, no string node is emitted --- lib/prism/translation/parser/compiler.rb | 4 +++- test/prism/fixtures/regex.txt | 2 ++ test/prism/snapshots/regex.txt | 22 ++++++++++++++-------- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/lib/prism/translation/parser/compiler.rb b/lib/prism/translation/parser/compiler.rb index c6a7154625..a20136a03d 100644 --- a/lib/prism/translation/parser/compiler.rb +++ b/lib/prism/translation/parser/compiler.rb @@ -1512,7 +1512,9 @@ def visit_redo_node(node) # ^^^^^ def visit_regular_expression_node(node) parts = - if node.content.include?("\n") + if node.content == "" + [] + elsif node.content.include?("\n") string_nodes_from_line_continuations(node, node.content_loc.start_offset, node.opening) else [builder.string_internal(token(node.content_loc))] diff --git a/test/prism/fixtures/regex.txt b/test/prism/fixtures/regex.txt index 85e600fbdd..712bfc081a 100644 --- a/test/prism/fixtures/regex.txt +++ b/test/prism/fixtures/regex.txt @@ -54,3 +54,5 @@ d\\\ e\\ f\ / + +// diff --git a/test/prism/snapshots/regex.txt b/test/prism/snapshots/regex.txt index 46a85252f2..e8835648ee 100644 --- a/test/prism/snapshots/regex.txt +++ b/test/prism/snapshots/regex.txt @@ -1,10 +1,10 @@ -@ ProgramNode (location: (1,0)-(56,1)) +@ ProgramNode (location: (1,0)-(58,2)) ├── flags: ∅ ├── locals: [:foo, :ab, :abc, :a] └── statements: - @ StatementsNode (location: (1,0)-(56,1)) + @ StatementsNode (location: (1,0)-(58,2)) ├── flags: ∅ - └── body: (length: 27) + └── body: (length: 28) ├── @ CallNode (location: (1,0)-(1,9)) │ ├── flags: newline, ignore_visibility │ ├── receiver: ∅ @@ -543,9 +543,15 @@ │ ├── content_loc: (48,1)-(48,8) = "(?-x:#)" │ ├── closing_loc: (48,8)-(48,10) = "/x" │ └── unescaped: "(?-x:#)" - └── @ RegularExpressionNode (location: (50,0)-(56,1)) + ├── @ RegularExpressionNode (location: (50,0)-(56,1)) + │ ├── flags: newline, static_literal, forced_us_ascii_encoding + │ ├── opening_loc: (50,0)-(50,1) = "/" + │ ├── content_loc: (50,1)-(56,0) = "a\nb\\\nc\\\nd\\\\\\\ne\\\\\nf\\\n" + │ ├── closing_loc: (56,0)-(56,1) = "/" + │ └── unescaped: "a\nbcd\\\\e\\\\\nf" + └── @ RegularExpressionNode (location: (58,0)-(58,2)) ├── flags: newline, static_literal, forced_us_ascii_encoding - ├── opening_loc: (50,0)-(50,1) = "/" - ├── content_loc: (50,1)-(56,0) = "a\nb\\\nc\\\nd\\\\\\\ne\\\\\nf\\\n" - ├── closing_loc: (56,0)-(56,1) = "/" - └── unescaped: "a\nbcd\\\\e\\\\\nf" + ├── opening_loc: (58,0)-(58,1) = "/" + ├── content_loc: (58,1)-(58,1) = "" + ├── closing_loc: (58,1)-(58,2) = "/" + └── unescaped: ""