From 8593d83c1b5130c9c8b437502d661f36301698c6 Mon Sep 17 00:00:00 2001 From: tompng Date: Thu, 15 Jan 2026 04:18:02 +0900 Subject: [PATCH] RubyLex Ripper to Prism Remove remaining ripper dependency in: syntax check, code continue/termination check, show-source, regexp-completion and string-like command arg parse. --- lib/irb.rb | 16 +- lib/irb/command/internal_helpers.rb | 9 +- lib/irb/completion.rb | 15 +- lib/irb/ruby-lex.rb | 354 +++++++++-------------- lib/irb/source_finder.rb | 16 +- test/irb/test_irb.rb | 2 +- test/irb/test_ruby_lex.rb | 95 +++--- test/irb/yamatanooroti/test_rendering.rb | 24 +- 8 files changed, 215 insertions(+), 316 deletions(-) diff --git a/lib/irb.rb b/lib/irb.rb index b4616df93..6f39d4953 100644 --- a/lib/irb.rb +++ b/lib/irb.rb @@ -6,7 +6,6 @@ # require "prism" -require "ripper" require "reline" require_relative "irb/init" @@ -252,11 +251,10 @@ def read_input_nomultiline(prompt) code << line return code if command?(code) - tokens, opens, terminated = @scanner.check_code_state(code, local_variables: @context.local_variables) + continue, opens, terminated = @scanner.check_code_state(code, local_variables: @context.local_variables) return code if terminated line_offset += 1 - continue = @scanner.should_continue?(tokens) prompt = generate_prompt(opens, continue, line_offset) end end @@ -317,7 +315,7 @@ def configure_io else next true if command?(code) - _tokens, _opens, terminated = @scanner.check_code_state(code, local_variables: @context.local_variables) + _continue, _opens, terminated = @scanner.check_code_state(code, local_variables: @context.local_variables) terminated end end @@ -325,13 +323,17 @@ def configure_io if @context.io.respond_to?(:dynamic_prompt) @context.io.dynamic_prompt do |lines| code = lines.map{ |l| l + "\n" }.join - tokens = RubyLex.ripper_lex_without_warning(code, local_variables: @context.local_variables) parse_lex_result = Prism.parse_lex(code, scopes: [@context.local_variables]) line_results = IRB::NestingParser.parse_by_line(parse_lex_result) + + tokens = parse_lex_result.value[1].map(&:first) + tokens_by_line = tokens.group_by {|t| t.location.start_line - 1 } + tokens_until_line = [] line_results.map.with_index do |(_prev_opens, next_opens, _min_depth), line_num_offset| - tokens_until_line << tokens.shift while !tokens.empty? && tokens.first.pos[0] <= line_num_offset + 1 - continue = @scanner.should_continue?(tokens_until_line) + line = lines[line_num_offset] + tokens_until_line.concat(tokens_by_line[line_num_offset] || []) + continue = @scanner.should_continue?(tokens_until_line, line, line_num_offset + 1) generate_prompt(next_opens, continue, line_num_offset) end end diff --git a/lib/irb/command/internal_helpers.rb b/lib/irb/command/internal_helpers.rb index a01ddb1d4..60914ebe1 100644 --- a/lib/irb/command/internal_helpers.rb +++ b/lib/irb/command/internal_helpers.rb @@ -1,5 +1,7 @@ # frozen_string_literal: true +require 'prism' + module IRB module Command # Internal use only, for default command's backward compatibility. @@ -7,9 +9,10 @@ module RubyArgsExtractor # :nodoc: def unwrap_string_literal(str) return if str.empty? - sexp = Ripper.sexp(str) - if sexp && sexp.size == 2 && sexp.last&.first&.first == :string_literal - @irb_context.workspace.binding.eval(str).to_s + result = Prism.parse(str) + body = result.value.statements.body + if result.success? && body.size == 1 && body.first.is_a?(Prism::StringNode) + body.first.unescaped else str end diff --git a/lib/irb/completion.rb b/lib/irb/completion.rb index 40a7d3b53..6c2706f16 100644 --- a/lib/irb/completion.rb +++ b/lib/irb/completion.rb @@ -166,22 +166,13 @@ def complete_require_path(target, preposing, postposing) else return nil # It's not String literal end - tokens = RubyLex.ripper_lex_without_warning(preposing.rstrip) - tok = nil - tokens.reverse_each do |t| - unless [:on_lparen, :on_sp, :on_ignored_sp, :on_nl, :on_ignored_nl, :on_comment].include?(t.event) - tok = t - break - end - end - return unless tok&.event == :on_ident && tok.state == Ripper::EXPR_CMDARG - case tok.tok - when 'require' + case preposing + when /(^|[^\w])require\(? *\z/ retrieve_files_to_require_from_load_path.filter_map { |path| quote + path if path.start_with?(actual_target) } - when 'require_relative' + when /(^|[^\w])require_relative\(? *\z/ retrieve_files_to_require_relative_from_current_dir.filter_map { |path| quote + path if path.start_with?(actual_target) } diff --git a/lib/irb/ruby-lex.rb b/lib/irb/ruby-lex.rb index ca70ed90c..ca28a25b7 100644 --- a/lib/irb/ruby-lex.rb +++ b/lib/irb/ruby-lex.rb @@ -5,7 +5,6 @@ # require "prism" -require "ripper" require "jruby" if RUBY_ENGINE == "jruby" require_relative "nesting_parser" @@ -97,213 +96,149 @@ def compile_with_errors_suppressed(code, line_no: 1) end result end - - def generate_local_variables_assign_code(local_variables) - # Some reserved words could be a local variable - # Example: def f(if: 1); binding.irb; end - # These reserved words should be removed from assignment code - local_variables -= RESERVED_WORDS - "#{local_variables.join('=')}=nil;" unless local_variables.empty? - end - - # Some part of the code is not included in Ripper's token. - # Example: DATA part, token after heredoc_beg when heredoc has unclosed embexpr. - # With interpolated tokens, tokens.map(&:tok).join will be equal to code. - def interpolate_ripper_ignored_tokens(code, tokens) - line_positions = [0] - code.lines.each do |line| - line_positions << line_positions.last + line.bytesize - end - prev_byte_pos = 0 - interpolated = [] - prev_line = 1 - tokens.each do |t| - line, col = t.pos - byte_pos = line_positions[line - 1] + col - if prev_byte_pos < byte_pos - tok = code.byteslice(prev_byte_pos...byte_pos) - pos = [prev_line, prev_byte_pos - line_positions[prev_line - 1]] - interpolated << Ripper::Lexer::Elem.new(pos, :on_ignored_by_ripper, tok, 0) - prev_line += tok.count("\n") - end - interpolated << t - prev_byte_pos = byte_pos + t.tok.bytesize - prev_line += t.tok.count("\n") - end - if prev_byte_pos < code.bytesize - tok = code.byteslice(prev_byte_pos..) - pos = [prev_line, prev_byte_pos - line_positions[prev_line - 1]] - interpolated << Ripper::Lexer::Elem.new(pos, :on_ignored_by_ripper, tok, 0) - end - interpolated - end - - def ripper_lex_without_warning(code, local_variables: []) - verbose, $VERBOSE = $VERBOSE, nil - lvars_code = generate_local_variables_assign_code(local_variables) - original_code = code - if lvars_code - code = "#{lvars_code}\n#{code}" - line_no = 0 - else - line_no = 1 - end - - compile_with_errors_suppressed(code, line_no: line_no) do |inner_code, line_no| - lexer = Ripper::Lexer.new(inner_code, '-', line_no) - tokens = [] - lexer.scan.each do |t| - next if t.pos.first == 0 - prev_tk = tokens.last - position_overlapped = prev_tk && t.pos[0] == prev_tk.pos[0] && t.pos[1] < prev_tk.pos[1] + prev_tk.tok.bytesize - if position_overlapped - tokens[-1] = t if ERROR_TOKENS.include?(prev_tk.event) && !ERROR_TOKENS.include?(t.event) - else - tokens << t - end - end - interpolate_ripper_ignored_tokens(original_code, tokens) - end - ensure - $VERBOSE = verbose - end end def check_code_state(code, local_variables:) - tokens = self.class.ripper_lex_without_warning(code, local_variables: local_variables) - opens = NestingParser.open_nestings(Prism.parse_lex(code, scopes: [local_variables])) - [tokens, opens, code_terminated?(code, tokens, opens, local_variables: local_variables)] + parse_lex_result = Prism.parse_lex(code, scopes: [local_variables]) + + opens = NestingParser.open_nestings(parse_lex_result) + lines = code.lines + tokens = parse_lex_result.value[1].map(&:first).sort_by {|t| t.location.start_offset } + continue = should_continue?(tokens, lines.last, lines.size) + [continue, opens, code_terminated?(code, continue, opens, local_variables: local_variables)] end - def code_terminated?(code, tokens, opens, local_variables:) + def code_terminated?(code, continue, opens, local_variables:) case check_code_syntax(code, local_variables: local_variables) when :unrecoverable_error - true + return true when :recoverable_error - false + return false when :other_error - opens.empty? && !should_continue?(tokens) + opens.empty? && !continue when :valid - !should_continue?(tokens) + !continue end end def assignment_expression?(code, local_variables:) - # Try to parse the code and check if the last of possibly multiple - # expressions is an assignment type. - - # If the expression is invalid, Ripper.sexp should return nil which will - # result in false being returned. Any valid expression should return an - # s-expression where the second element of the top level array is an - # array of parsed expressions. The first element of each expression is the - # expression's type. - verbose, $VERBOSE = $VERBOSE, nil - code = "#{RubyLex.generate_local_variables_assign_code(local_variables) || 'nil;'}\n#{code}" - # Get the last node_type of the line. drop(1) is to ignore the local_variables_assign_code part. - node_type = Ripper.sexp(code)&.dig(1)&.drop(1)&.dig(-1, 0) - ASSIGNMENT_NODE_TYPES.include?(node_type) - ensure - $VERBOSE = verbose + # Parse the code and check if the last of possibly multiple + # expressions is an assignment node. + program_node = Prism.parse(code, scopes: [local_variables]).value + node = program_node.statements.body.last + case node + when nil + # Empty code, comment-only code or invalid code + false + when Prism::CallNode + # a.b = 1, a[b] = 1 + # Prism::CallNode#equal_loc is only available in prism >= 1.7.0 + if node.name == :[]= + # Distinguish between `a[k] = v` from `a.[]= k, v`, `a.[]=(k, v)` + node.opening == '[' + else + node.name.end_with?('=') + end + when Prism::MatchWriteNode + # /(?)/ =~ a, Class name is *WriteNode but not an assignment. + false + else + # a = 1, @a = 1, $a = 1, @@a = 1, A = 1, a += 1, a &&= 1, a.b += 1, and so on + node.class.name.match?(/WriteNode/) + end end - def should_continue?(tokens) - # Look at the last token and check if IRB need to continue reading next line. - # Example code that should continue: `a\` `a +` `a.` - # Trailing spaces, newline, comments are skipped - return true if tokens.last&.event == :on_sp && tokens.last.tok == "\\\n" - - tokens.reverse_each do |token| - case token.event - when :on_sp, :on_nl, :on_ignored_nl, :on_comment, :on_embdoc_beg, :on_embdoc, :on_embdoc_end - # Skip - when :on_regexp_end, :on_heredoc_end, :on_semicolon - # State is EXPR_BEG but should not continue - return false + def should_continue?(tokens, line, line_num) + # Check if the line ends with \\. Then IRB should continue reading next line. + # Space and backslash are not included in Prism token, so find trailing text after last non-newline token position. + trailing = line + tokens.reverse_each do |t| + break if t.location.start_line < line_num + if t.location.start_line == line_num && t.type != :IGNORED_NEWLINE && t.type != :NEWLINE && t.type != :EOF + trailing = line.byteslice(t.location.end_column..) + break + end + end + return true if trailing.match?(/\A\s*\\\n?\z/) + + # "1 + \n" and "foo.\n" should continue. + pos = tokens.size - 1 + ignored_newline_found = false + while pos >= 0 + case tokens[pos].type + when :EMBDOC_BEGIN, :EMBDOC_LINE, :EMBDOC_END, :COMMENT, :EOF + pos -= 1 + when :IGNORED_NEWLINE + pos -= 1 + ignored_newline_found = true else - # Endless range should not continue - return false if token.event == :on_op && token.tok.match?(/\A\.\.\.?\z/) - - # EXPR_DOT and most of the EXPR_BEG should continue - return token.state.anybits?(Ripper::EXPR_BEG | Ripper::EXPR_DOT) + break end end - false + + # If IGNORED_NEWLINE token is following non-newline non-semicolon token, it should continue. + # Special case: treat `1..` and `1...` as not continuing. + ignored_newline_found && pos >= 0 && !%i[DOT_DOT DOT_DOT_DOT NEWLINE SEMICOLON].include?(tokens[pos].type) end def check_code_syntax(code, local_variables:) - lvars_code = RubyLex.generate_local_variables_assign_code(local_variables) - code = "#{lvars_code}\n#{code}" - - begin # check if parser error are available - verbose, $VERBOSE = $VERBOSE, nil - case RUBY_ENGINE - when 'ruby' - self.class.compile_with_errors_suppressed(code) do |inner_code, line_no| - RubyVM::InstructionSequence.compile(inner_code, nil, nil, line_no) - end - when 'jruby' - JRuby.compile_ir(code) + result = Prism.lex(code, scopes: [local_variables]) + return :valid if result.success? + + # Get the token excluding trailing comments and newlines + # to compare error location with the last or second-last meaningful token location + tokens = result.value.map(&:first) + until tokens.empty? + case tokens.last.type + when :COMMENT, :NEWLINE, :IGNORED_NEWLINE, :EMBDOC_BEGIN, :EMBDOC_LINE, :EMBDOC_END, :EOF + tokens.pop else - catch(:valid) do - eval("BEGIN { throw :valid, true }\n#{code}") - false - end + break end - rescue EncodingError - # This is for a hash with invalid encoding symbol, {"\xAE": 1} - :unrecoverable_error - rescue SyntaxError => e - case e.message - when /unexpected keyword_end/ - # "syntax error, unexpected keyword_end" - # - # example: - # if ( - # end - # - # example: - # end - return :unrecoverable_error - when /unexpected '\.'/ - # "syntax error, unexpected '.'" - # - # example: - # . - return :unrecoverable_error - when /unexpected tREGEXP_BEG/ - # "syntax error, unexpected tREGEXP_BEG, expecting keyword_do or '{' or '('" - # - # example: - # method / f / + end + + unknown = false + result.errors.each do |error| + case error.message + when /unexpected character literal|incomplete expression at|unexpected .%.|too short escape sequence/i + # Ignore these errors. Likely to appear only at the end of code. + # `[a, b ?` unexpected character literal, incomplete expression at + # `p a, %` unexpected '%' + # `/\u` too short escape sequence + when /unexpected write target/i + # `a,b` recoverable by `=v` + # `a,b,` recoverable by `c=v` + tok = tokens.last + tok = tokens[-2] if tok&.type == :COMMA + return :unrecoverable_error if tok && error.location.end_offset < tok.location.end_offset + when /(invalid|unexpected) (?:break|next|redo)/i + # Hard to check correctly, so treat it as always recoverable. + # `(break;1)` recoverable by `.f while true` + when / meets end of file|end-of-input|unterminated |cannot parse|could not parse/i + # These are recoverable errors if there is no other unrecoverable error + # `/aaa` unterminated regexp meets end of file + # `def f` unexpected end-of-input + # `"#{` unterminated string + # `:"aa` cannot parse the string part + # `def f =` could not parse the endless method body + when /is not allowed/i + # `@@` `$--` return :unrecoverable_error - when /unterminated (?:string|regexp) meets end of file/ - # "unterminated regexp meets end of file" - # - # example: - # / - # - # "unterminated string meets end of file" - # - # example: - # ' - return :recoverable_error - when /unexpected end-of-input/ - # "syntax error, unexpected end-of-input, expecting keyword_end" - # - # example: - # if true - # hoge - # if false - # fuga - # end - return :recoverable_error + when /unexpected |invalid |dynamic constant assignment|can't set variable|can't change the value|is not valid to get|variable capture in alternative pattern/i + # Likely to be unrecoverable except when the error is at the last token location. + # Unexpected: `class a`, `tap(&`, `def f(a,` + # Invalid: `a ? b :`, `/\u{`, `"\M-` + # `a,B` recoverable by `.c=v` dynamic constant assignment + # `a,$1` recoverable by `.f=v` Can't set variable + # `a,self` recoverable by `.f=v` Can't change the value of self + # `p foo?:` recoverable by `v` is not valid to get + # `x in 1|{x:` recoverable by `1}` variable capture in alternative pattern + return :unrecoverable_error if tokens.last && error.location.end_offset <= tokens.last.location.start_offset else - return :other_error + unknown = true end - ensure - $VERBOSE = verbose end - :valid + unknown ? :other_error : :recoverable_error end def calc_indent_level(opens) @@ -456,43 +391,36 @@ def ltype_from_open_nestings(opens) end end + # Check if the node on the last line is connected to previous line. + # Connected example: + # foo + # .bar; baz + # Not connected example: + # foo + # bar + # If it's connected, return the last line string. Otherwise, return false. def check_termination_in_prev_line(code, local_variables:) - tokens = self.class.ripper_lex_without_warning(code, local_variables: local_variables) - past_first_newline = false - index = tokens.rindex do |t| - # traverse first token before last line - if past_first_newline - if t.tok.include?("\n") - true - end - elsif t.tok.include?("\n") - past_first_newline = true - false - else - false - end - end + lines = code.lines + return false if lines.size < 2 - if index - first_token = nil - last_line_tokens = tokens[(index + 1)..(tokens.size - 1)] - last_line_tokens.each do |t| - unless [:on_sp, :on_ignored_sp, :on_comment].include?(t.event) - first_token = t - break - end - end + prev_line_result = Prism.parse(lines[...-1].join, scopes: [local_variables]) + return false unless prev_line_result.success? - if first_token && first_token.state != Ripper::EXPR_DOT - tokens_without_last_line = tokens[0..index] - code_without_last_line = tokens_without_last_line.map(&:tok).join - opens_without_last_line = NestingParser.open_nestings(Prism.parse_lex(code_without_last_line, scopes: [local_variables])) - if code_terminated?(code_without_last_line, tokens_without_last_line, opens_without_last_line, local_variables: local_variables) - return last_line_tokens.map(&:tok).join - end - end + prev_nodes = prev_line_result.value.statements.body + whole_nodes = Prism.parse(code, scopes: [local_variables]).value.statements.body + + return false if whole_nodes.size < prev_nodes.size + return false unless prev_nodes.zip(whole_nodes).all? do |a, b| + a.location == b.location end - false + + # If the last line only contain comments, treat it as not connected to handle this case: + # receiver + # # comment + # .method + return false if lines.last.match?(/\A\s*#/) + + lines.last end end # :startdoc: diff --git a/lib/irb/source_finder.rb b/lib/irb/source_finder.rb index 1a6382089..0321b1bd9 100644 --- a/lib/irb/source_finder.rb +++ b/lib/irb/source_finder.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true -require_relative "ruby-lex" +require 'prism' module IRB class SourceFinder @@ -44,21 +44,13 @@ def colorized_content private def find_end - lex = RubyLex.new code = file_content lines = code.lines[(@line - 1)..-1] - tokens = RubyLex.ripper_lex_without_warning(lines.join) - prev_tokens = [] # chunk with line number - tokens.chunk { |tok| tok.pos[0] }.each do |lnum, chunk| - code = lines[0..lnum].join - prev_tokens.concat chunk - continue = lex.should_continue?(prev_tokens) - syntax = lex.check_code_syntax(code, local_variables: []) - if !continue && syntax == :valid - return @line + lnum - end + lines.each_with_index do |line, index| + sub_code = lines.take(index + 1).join + return @line + index if Prism.parse_success?(sub_code) end @line end diff --git a/test/irb/test_irb.rb b/test/irb/test_irb.rb index 5f854df85..cd45fda29 100644 --- a/test/irb/test_irb.rb +++ b/test/irb/test_irb.rb @@ -686,7 +686,7 @@ def assert_rows_with_correct_indents(rows_with_spaces, assert_indent_level: fals def assert_indent_level(lines, expected) code = lines.map { |l| "#{l}\n" }.join # code should end with "\n" - _tokens, opens, _ = @irb.scanner.check_code_state(code, local_variables: []) + _continue, opens, _ = @irb.scanner.check_code_state(code, local_variables: []) indent_level = @irb.scanner.calc_indent_level(opens) error_message = "Calculated the wrong number of indent level for:\n #{lines.join("\n")}" assert_equal(expected, indent_level, error_message) diff --git a/test/irb/test_ruby_lex.rb b/test/irb/test_ruby_lex.rb index 533e27443..90ec37184 100644 --- a/test/irb/test_ruby_lex.rb +++ b/test/irb/test_ruby_lex.rb @@ -13,27 +13,6 @@ def teardown restore_encodings end - def test_interpolate_token_with_heredoc_and_unclosed_embexpr - code = <<~'EOC' - ①+< a = A.new => # irb(main):008> - irb(main):009> a - irb(main):010> .a - irb(main):011> .b + irb(main):009* a + irb(main):010* .a + irb(main):011* .b irb(main):012> .itself => true irb(main):013> @@ -219,26 +219,26 @@ class A def b; self; end; def c; true; end; end; irb(main):007> a = A.new => # irb(main):008> - irb(main):009> a - irb(main):010> .b - irb(main):011> # aaa + irb(main):009* a + irb(main):010* .b + irb(main):011* # aaa irb(main):012> .c => true irb(main):013> - irb(main):014> (a) + irb(main):014* (a) irb(main):015> &.b() => # irb(main):016> irb(main):017> class A def b; self; end; def c; true; end; end; irb(main):018> a = A.new => # - irb(main):019> a - irb(main):020> .b - irb(main):021> # aaa + irb(main):019* a + irb(main):020* .b + irb(main):021* # aaa irb(main):022> .c => true - irb(main):023> (a) - irb(main):024> &.b() + irb(main):023* (a) + irb(main):024* &.b() irb(main):025> .itself => # irb(main):026>