관리-도구
편집 파일: clean_document.rb
# frozen_string_literal: true

module SyntaxSuggest
  # Parses and sanitizes source into a lexically aware document
  #
  # Internally the document is represented by an array with each
  # index containing a CodeLine correlating to a line from the source code.
  #
  # There are three main phases in the algorithm:
  #
  # 1. Sanitize/format input source
  # 2. Search for invalid blocks
  # 3. Format invalid blocks into something meaningful
  #
  # This class handles the first part.
  #
  # The reason this class exists is to format input source
  # for better/easier/cleaner exploration.
  #
  # The CodeSearch class operates at the line level so
  # we must be careful to not introduce lines that look
  # valid by themselves, but when removed will trigger syntax errors
  # or strange behavior.
  #
  # ## Join Trailing slashes
  #
  # Code with a trailing slash is logically treated as a single line:
  #
  #   1 it "code can be split" \
  #   2    "across multiple lines" do
  #
  # In this case removing line 2 would add a syntax error. We get around
  # this by internally joining the two lines into a single "line" object
  #
  # ## Logically Consecutive lines
  #
  # Code that can be broken over multiple
  # lines such as method calls are on different lines:
  #
  #   1 User.
  #   2   where(name: "schneems").
  #   3   first
  #
  # Removing line 2 can introduce a syntax error. To fix this, all lines
  # are joined into one.
  #
  # ## Heredocs
  #
  # A heredoc is a way of defining a multi-line string. They can cause many
  # problems. If left as a single line, the parser would try to parse the contents
  # as ruby code rather than as a string. Even without this problem, we still
  # hit an issue with indentation:
  #
  #   1 foo = <<~HEREDOC
  #   2  "Be yourself; everyone else is already taken."
  #   3    ― Oscar Wilde
  #   4      puts "I look like ruby code" # but i'm still a heredoc
  #   5 HEREDOC
  #
  # If we didn't join these lines then our algorithm would think that line 4
  # is separate from the rest, has a higher indentation, then look at it first
  # and remove it.
  #
  # If the code evaluates line 5 by itself it will think line 5 is a constant,
  # remove it, and introduce a syntax error.
  #
  # All of these problems are fixed by joining the whole heredoc into a single
  # line.
  #
  # ## Comments and whitespace
  #
  # Comments can throw off the way the lexer tells us that the line
  # logically belongs with the next line. This is valid ruby but
  # results in a different lex output than before:
  #
  #   1 User.
  #   2   where(name: "schneems").
  #   3   # Comment here
  #   4   first
  #
  # To handle this we can replace comment lines with empty lines
  # and then re-lex the source. This removal and re-lexing preserves
  # line index and document size, but generates an easier to work with
  # document.
  #
  class CleanDocument
    # Strips comment-only lines from the source and builds the
    # internal array of CodeLines from what is left.
    #
    # @param source [String] the raw source code to clean
    def initialize(source:)
      lines = clean_sweep(source: source)
      @document = CodeLine.from_source(lines.join, lines: lines)
    end

    # Call all of the document "cleaners"
    # and return self
    def call
      join_trailing_slash!
      join_consecutive!
      join_heredoc!

      self
    end

    # Return an array of CodeLines in the
    # document
    def lines
      @document
    end

    # Renders the document back to a string
    def to_s
      @document.join
    end

    # Remove comments
    #
    # replace with empty newlines
    #
    #   source = <<~'EOM'
    #     # Comment 1
    #     puts "hello"
    #     # Comment 2
    #     puts "world"
    #   EOM
    #
    #   lines = CleanDocument.new(source: source).lines
    #   expect(lines[0].to_s).to eq("\n")
    #   expect(lines[1].to_s).to eq("puts \"hello\"\n")
    #   expect(lines[2].to_s).to eq("\n")
    #   expect(lines[3].to_s).to eq("puts \"world\"\n")
    #
    # Important: This must be done before lexing.
    #
    # After this change is made, we lex the document because
    # removing comments can change how the doc is parsed.
    #
    # For example:
    #
    #   values = LexAll.new(source: <<~EOM)
    #     User.
    #       # comment
    #       where(name: 'schneems')
    #   EOM
    #   expect(
    #     values.count {|v| v.type == :on_ignored_nl}
    #   ).to eq(1)
    #
    # After the comment is removed:
    #
    #   values = LexAll.new(source: <<~EOM)
    #     User.
    #
    #       where(name: 'schneems')
    #   EOM
    #   expect(
    #     values.count {|v| v.type == :on_ignored_nl}
    #   ).to eq(2)
    #
    # @param source [String] the raw source code
    # @return [Array<String>] one entry per input line; comment-only
    #   lines are replaced by the record separator so the line count
    #   is preserved
    def clean_sweep(source:)
      # Match comments, but not HEREDOC strings with #{variable} interpolation
      # https://rubular.com/r/HPwtW9OYxKUHXQ
      source.lines.map do |line|
        if line.match?(/^\s*#([^{].*|)$/)
          # `$/` is the same separator `String#lines` splits on by default
          $/
        else
          line
        end
      end
    end

    # Smushes all heredoc lines into one line
    #
    #   source = <<~'EOM'
    #     foo = <<~HEREDOC
    #        lol
    #        hehehe
    #     HEREDOC
    #   EOM
    #
    #   lines = CleanDocument.new(source: source).join_heredoc!.lines
    #   expect(lines[0].to_s).to eq(source)
    #   expect(lines[1].to_s).to eq("")
    def join_heredoc!
      # A stack is used so that nested heredocs pair each end token
      # with the most recently opened beginning
      start_index_stack = []
      heredoc_beg_end_index = []
      lines.each do |line|
        line.lex.each do |lex_value|
          case lex_value.type
          when :on_heredoc_beg
            start_index_stack << line.index
          when :on_heredoc_end
            start_index = start_index_stack.pop
            end_index = line.index
            heredoc_beg_end_index << [start_index, end_index]
          end
        end
      end

      heredoc_groups = heredoc_beg_end_index.map do |start_index, end_index|
        @document[start_index..end_index]
      end

      join_groups(heredoc_groups)
      self
    end

    # Smushes logically "consecutive" lines
    #
    #   source = <<~'EOM'
    #     User.
    #       where(name: 'schneems').
    #       first
    #   EOM
    #
    #   lines = CleanDocument.new(source: source).join_consecutive!.lines
    #   expect(lines[0].to_s).to eq(source)
    #   expect(lines[1].to_s).to eq("")
    #
    # The one known case this doesn't handle is:
    #
    #     Ripper.lex <<~EOM
    #       a &&
    #        b ||
    #        c
    #     EOM
    #
    # For some reason this introduces `on_ignore_newline` but with BEG type
    #
    def join_consecutive!
      consecutive_groups = @document.select(&:ignore_newline_not_beg?).map do |code_line|
        take_while_including(code_line.index..) do |line|
          line.ignore_newline_not_beg?
        end
      end

      join_groups(consecutive_groups)
      self
    end

    # Join lines with a trailing slash
    #
    #   source = <<~'EOM'
    #     it "code can be split" \
    #        "across multiple lines" do
    #   EOM
    #
    #   lines = CleanDocument.new(source: source).join_trailing_slash!.lines
    #   expect(lines[0].to_s).to eq(source)
    #   expect(lines[1].to_s).to eq("")
    def join_trailing_slash!
      trailing_groups = @document.select(&:trailing_slash?).map do |code_line|
        take_while_including(code_line.index..) { |x| x.trailing_slash? }
      end

      join_groups(trailing_groups)
      self
    end

    # Helper method for joining "groups" of lines
    #
    # Input is expected to be type Array<Array<CodeLine>>
    #
    # The outer array holds the various "groups" while the
    # inner array holds code lines.
    #
    # All code lines are "joined" into the first line in
    # their group.
    #
    # To preserve document size, empty lines are placed
    # in the place of the lines that were "joined"
    #
    # @param groups [Array<Array<CodeLine>>] groups of lines to merge
    # @return [self]
    def join_groups(groups)
      groups.each do |group|
        first_line = group.first

        # Handle the case of multiple groups in a row
        # if one is already replaced, move on
        next if @document[first_line.index].empty?

        # Join group into the first line
        @document[first_line.index] = CodeLine.new(
          lex: group.flat_map(&:lex),
          line: group.join,
          index: first_line.index
        )

        # Hide the rest of the lines
        group[1..].each do |joined_line|
          # The above lines already have newlines in them, if add more
          # then there will be double newline, use an empty line instead
          @document[joined_line.index] = CodeLine.new(line: "", index: joined_line.index, lex: [])
        end
      end

      self
    end

    # Helper method for grabbing elements from document
    #
    # Like `take_while` except when it stops
    # iterating, it also returns the line
    # that caused it to stop
    #
    # @param range [Range] the slice of the document to walk,
    #   defaults to the whole document
    # @yield [line] each element of the slice in order
    # @return [Array] the leading elements for which the block was
    #   truthy, plus the first element for which it was not
    def take_while_including(range = 0..)
      take_next_and_stop = false
      @document[range].take_while do |line|
        # A bare `next` hands nil back to `take_while`, which ends it
        next if take_next_and_stop

        take_next_and_stop = !(yield line)
        true
      end
    end
  end
end