diff options
33 files changed, 4180 insertions, 11 deletions
@@ -1,13 +1,10 @@ -.DS_Store -.*~ +.* +bench/example.* coverage pkg spec/reports doc Gemfile.lock -.rvmrc -.ruby-gemset -.ruby-version test/executable/source.rb.html test/executable/source.rb.json test/scanners diff --git a/lib/coderay.rb b/lib/coderay.rb index c3de20b..c1c9e34 100644 --- a/lib/coderay.rb +++ b/lib/coderay.rb @@ -153,6 +153,10 @@ module CodeRay autoload :Encoders, coderay_path('encoders') autoload :Styles, coderay_path('styles') + # DSL Scanner + autoload :RuleBasedScanner, coderay_path('rule_based_scanner') + autoload :StateBasedScanner, coderay_path('state_based_scanner') + # convenience access and reusable Encoder/Scanner pair autoload :Duo, coderay_path('duo') diff --git a/lib/coderay/encoders/debug.rb b/lib/coderay/encoders/debug.rb index f4db330..6b680fc 100644 --- a/lib/coderay/encoders/debug.rb +++ b/lib/coderay/encoders/debug.rb @@ -15,9 +15,12 @@ module Encoders register_for :debug + attr_reader :size + FILE_EXTENSION = 'raydebug' def text_token text, kind + @size += 1 if kind == :space @out << text else @@ -43,6 +46,13 @@ module Encoders @out << ']' end + protected + + def setup options + super + @size = 0 + end + end end diff --git a/lib/coderay/encoders/debug_lint.rb b/lib/coderay/encoders/debug_lint.rb index a4eba2c..497d8c5 100644 --- a/lib/coderay/encoders/debug_lint.rb +++ b/lib/coderay/encoders/debug_lint.rb @@ -29,7 +29,7 @@ module Encoders end def end_group kind - raise Lint::IncorrectTokenGroupNesting, 'We are inside %s, not %p (end_group)' % [@opened.reverse.map(&:inspect).join(' < '), kind] if @opened.last != kind + raise Lint::IncorrectTokenGroupNesting, 'We are inside %p, not %p (end_group)' % [@opened.reverse, kind] if @opened.last != kind @opened.pop super end @@ -40,7 +40,7 @@ module Encoders end def end_line kind - raise Lint::IncorrectTokenGroupNesting, 'We are inside %s, not %p (end_line)' % [@opened.reverse.map(&:inspect).join(' < '), kind] if @opened.last != kind + raise Lint::IncorrectTokenGroupNesting, 'We are 
inside %p, not %p (end_line)' % [@opened.reverse, kind] if @opened.last != kind @opened.pop super end diff --git a/lib/coderay/rule_based_scanner.rb b/lib/coderay/rule_based_scanner.rb new file mode 100644 index 0000000..0eb9222 --- /dev/null +++ b/lib/coderay/rule_based_scanner.rb @@ -0,0 +1,378 @@ +require 'set' + +module CodeRay + module Scanners + class RuleBasedScanner < Scanner + + Pattern = Struct.new :pattern + Groups = Struct.new :token_kinds + Kind = Struct.new :token_kind + Push = Struct.new :state, :group + Pop = Struct.new :group + PushState = Struct.new :state + PopState = Class.new + Check = Struct.new :condition + CheckIf = Class.new Check + CheckUnless = Class.new Check + ValueSetter = Struct.new :targets, :value + Increment = Struct.new :targets, :operation, :value + Continue = Class.new + + class << self + attr_accessor :states + + def state *names, &block + @code ||= "" + + @code << "when #{names.map(&:inspect).join(', ')}\n" + + @first = true + instance_eval(&block) + @code << " else\n" + @code << " puts \"no match for \#{state.inspect} => skip char\"\n" if $DEBUG + @code << " encoder.text_token getch, :error\n" + @code << " end\n" + @code << " \n" + end + + def on? pattern + pattern_expression = pattern.inspect + @code << " #{'els' unless @first}if check(#{pattern_expression})\n" + + @first = true + yield + @code << " end\n" + + @first = false + end + + def on *pattern_and_actions + if index = pattern_and_actions.find_index { |item| !(item.is_a?(Check) || item.is_a?(Regexp) || item.is_a?(Pattern)) } + conditions = pattern_and_actions[0..index - 1] or raise 'I need conditions or a pattern!' + actions = pattern_and_actions[index..-1] or raise 'I need actions!' 
+ else + raise "invalid rule structure: #{pattern_and_actions.map(&:class)}" + end + + condition_expressions = [] + if conditions + for condition in conditions + case condition + when CheckIf + case condition.condition + when Proc + condition_expressions << "#{make_callback(condition.condition)}" + when Symbol + condition_expressions << "#{condition.condition}" + else + raise "I don't know how to evaluate this check_if condition: %p" % [condition.condition] + end + when CheckUnless + case condition.condition + when Proc + condition_expressions << "!#{make_callback(condition.condition)}" + when Symbol + condition_expressions << "!#{condition.condition}" + else + raise "I don't know how to evaluate this check_unless condition: %p" % [condition.condition] + end + when Pattern + case condition.pattern + when Proc + condition_expressions << "match = scan(#{make_callback(condition.pattern)})" + else + raise "I don't know how to evaluate this pattern: %p" % [condition.pattern] + end + when Regexp + condition_expressions << "match = scan(#{condition.inspect})" + else + raise "I don't know how to evaluate this pattern/condition: %p" % [condition] + end + end + end + + @code << " #{'els' unless @first}if #{condition_expressions.join(' && ')}\n" + + for action in actions + case action + when String + raise + @code << " p 'evaluate #{action.inspect}'\n" if $DEBUG + @code << " #{action}\n" + + when Symbol + @code << " p 'text_token %p %p' % [match, #{action.inspect}]\n" if $DEBUG + @code << " encoder.text_token match, #{action.inspect}\n" + when Kind + case action.token_kind + when Proc + @code << " encoder.text_token match, kind = #{make_callback(action.token_kind)}\n" + else + raise "I don't know how to evaluate this kind: %p" % [action.token_kind] + end + when Groups + @code << " p 'text_tokens %p in groups %p' % [match, #{action.token_kinds.inspect}]\n" if $DEBUG + action.token_kinds.each_with_index do |kind, i| + @code << " encoder.text_token self[#{i + 1}], 
#{kind.inspect} if self[#{i + 1}]\n" + end + + when Push, PushState + case action.state + when String + raise + @code << " p 'push %p' % [#{action.state}]\n" if $DEBUG + @code << " state = #{action.state}\n" + @code << " states << state\n" + when Symbol + @code << " p 'push %p' % [#{action.state.inspect}]\n" if $DEBUG + @code << " state = #{action.state.inspect}\n" + @code << " states << state\n" + when Proc + @code << " if new_state = #{make_callback(action.state)}\n" + @code << " state = new_state\n" + @code << " states << new_state\n" + @code << " end\n" + else + raise "I don't know how to evaluate this push state: %p" % [action.state] + end + if action.is_a? Push + if action.state == action.group + @code << " encoder.begin_group state\n" + else + case action.state + when Symbol + @code << " p 'begin group %p' % [#{action.group.inspect}]\n" if $DEBUG + @code << " encoder.begin_group #{action.group.inspect}\n" + when Proc + @code << " encoder.begin_group #{make_callback(action.group)}\n" + else + raise "I don't know how to evaluate this push state: %p" % [action.state] + end + end + end + when Pop, PopState + @code << " p 'pop %p' % [states.last]\n" if $DEBUG + if action.is_a? 
Pop + if action.group + case action.group + when Symbol + @code << " encoder.end_group #{action.group.inspect}\n" + else + raise "I don't know how to evaluate this pop group: %p" % [action.group] + end + @code << " states.pop\n" + else + @code << " encoder.end_group states.pop\n" + end + else + @code << " states.pop\n" + end + @code << " state = states.last\n" + + when ValueSetter + case action.value + when Proc + @code << " #{action.targets.join(' = ')} = #{make_callback(action.value)}\n" + when Symbol + @code << " #{action.targets.join(' = ')} = #{action.value}\n" + else + @code << " #{action.targets.join(' = ')} = #{action.value.inspect}\n" + end + + when Increment + case action.value + when Proc + @code << " #{action.targets.join(' = ')} #{action.operation}= #{make_callback(action.value)}\n" + when Symbol + @code << " #{action.targets.join(' = ')} #{action.operation}= #{action.value}\n" + else + @code << " #{action.targets.join(' = ')} #{action.operation}= #{action.value.inspect}\n" + end + + when Proc + @code << " #{make_callback(action)}\n" + + when Continue + @code << " next\n" + + else + raise "I don't know how to evaluate this action: %p" % [action] + end + end + + @first = false + end + + def groups *token_kinds + Groups.new token_kinds + end + + def pattern pattern = nil, &block + Pattern.new pattern || block + end + + def kind token_kind = nil, &block + Kind.new token_kind || block + end + + def push state = nil, group = state, &block + raise 'push requires a state or a block; got nothing' unless state || block + Push.new state || block, group || block + end + + def pop group = nil + Pop.new group + end + + def push_state state = nil, &block + raise 'push_state requires a state or a block; got nothing' unless state || block + PushState.new state || block + end + + def pop_state + PopState.new + end + + def check_if value = nil, &callback + CheckIf.new value || callback + end + + def check_unless value = nil, &callback + CheckUnless.new value || callback 
+ end + + def flag_on *flags + flags.each { |name| variables << name } + ValueSetter.new Array(flags), true + end + + def flag_off *flags + flags.each { |name| variables << name } + ValueSetter.new Array(flags), false + end + + def set flag, value = nil, &callback + variables << flag + ValueSetter.new [flag], value || callback || true + end + + def unset *flags + flags.each { |name| variables << name } + ValueSetter.new Array(flags), nil + end + + def increment *counters + counters.each { |name| variables << name } + Increment.new Array(counters), :+, 1 + end + + def decrement *counters + counters.each { |name| variables << name } + Increment.new Array(counters), :-, 1 + end + + def continue + Continue.new + end + + def define_scan_tokens! + if ENV['PUTS'] + puts CodeRay.scan(scan_tokens_code, :ruby).terminal + puts "callbacks: #{callbacks.size}" + end + + class_eval scan_tokens_code + end + + protected + + def callbacks + @callbacks ||= {} + end + + def variables + @variables ||= Set.new + end + + def additional_variables + variables - %i(encoder options state states match kind) + end + + def make_callback block + base_name = "__callback_line_#{block.source_location.last}" + callback_name = base_name + counter = 'a' + while callbacks.key?(callback_name) + callback_name = "#{base_name}_#{counter}" + counter.succ! + end + + callbacks[callback_name] = define_method(callback_name, &block) + + parameters = block.parameters + + if parameters.empty? + callback_name + else + parameter_names = parameters.map(&:last) + parameter_names.each { |name| variables << name } + "#{callback_name}(#{parameter_names.join(', ')})" + end + end + + def scan_tokens_code + <<-"RUBY" + def scan_tokens encoder, options + state = options[:state] || @state + +#{ restore_local_variables_code.chomp.gsub(/^/, ' ' * 3) } + + states = [state] + + until eos? 
+ case state +#{ @code.chomp.gsub(/^/, ' ' * 4) } + else + raise_inspect 'Unknown state: %p' % [state], encoder + end + end + + if options[:keep_state] + @state = state + end + +#{ close_groups_code.chomp.gsub(/^/, ' ' * 3) } + + encoder + end + RUBY + end + + def restore_local_variables_code + additional_variables.sort.map { |name| "#{name} = @#{name}" }.join("\n") + end + + def close_groups_code + "close_groups(encoder, states)" + end + end + + def scan_tokens tokens, options + self.class.define_scan_tokens! + + scan_tokens tokens, options + end + + protected + + def setup + @state = :initial + end + + def close_groups encoder, states + # TODO + end + + end + end +end diff --git a/lib/coderay/scanners/_map.rb b/lib/coderay/scanners/_map.rb index a240298..61079d5 100644 --- a/lib/coderay/scanners/_map.rb +++ b/lib/coderay/scanners/_map.rb @@ -10,6 +10,11 @@ module Scanners :eruby => :erb, :irb => :ruby, :javascript => :java_script, + :javascript1 => :java_script1, + :javascript2 => :java_script2, + :javascript3 => :java_script3, + :javascript4 => :java_script4, + :javascript5 => :java_script5, :js => :java_script, :pascal => :delphi, :patch => :diff, diff --git a/lib/coderay/scanners/c2.rb b/lib/coderay/scanners/c2.rb new file mode 100644 index 0000000..3103e54 --- /dev/null +++ b/lib/coderay/scanners/c2.rb @@ -0,0 +1,110 @@ +module CodeRay +module Scanners + + # Scanner for C. 
+ class C2 < RuleBasedScanner + + register_for :c2 + file_extension 'c' + + KEYWORDS = [ + 'asm', 'break', 'case', 'continue', 'default', 'do', + 'else', 'enum', 'for', 'goto', 'if', 'return', + 'sizeof', 'struct', 'switch', 'typedef', 'union', 'while', + 'restrict', # added in C99 + ] # :nodoc: + + PREDEFINED_TYPES = [ + 'int', 'long', 'short', 'char', + 'signed', 'unsigned', 'float', 'double', + 'bool', 'complex', # added in C99 + ] # :nodoc: + + PREDEFINED_CONSTANTS = [ + 'EOF', 'NULL', + 'true', 'false', # added in C99 + ] # :nodoc: + DIRECTIVES = [ + 'auto', 'extern', 'register', 'static', 'void', + 'const', 'volatile', # added in C89 + 'inline', # added in C99 + ] # :nodoc: + + IDENT_KIND = WordList.new(:ident). + add(KEYWORDS, :keyword). + add(PREDEFINED_TYPES, :predefined_type). + add(DIRECTIVES, :directive). + add(PREDEFINED_CONSTANTS, :predefined_constant) # :nodoc: + + ESCAPE = / [rbfntv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc: + + protected + + state :initial do + on check_if(:in_preproc_line), %r/ \s*? \n \s* /x, :space, flag_off(:in_preproc_line), set(:label_expected, :label_expected_before_preproc_line) + on %r/ \s+ | \\\n /x, :space + + on %r/ [-+*=<>?:;,!&^|()\[\]{}~%]+ | \/(?![\/*])=? | \.(?!\d) /x, :operator, set(:label_expected) { |match, case_expected| match =~ /[;\{\}]/ || case_expected && match =~ /:/ }, flag_off(:case_expected) + + on %r/ (?: case | default ) \b /x, :keyword, flag_on(:case_expected), flag_off(:label_expected) + on check_if(:label_expected), check_unless(:in_preproc_line), %r/ [A-Za-z_][A-Za-z_0-9]*+ :(?!:) /x, kind { |match| + kind = IDENT_KIND[match.chop] + kind == :ident ? :label : kind + }, set(:label_expected) { |kind| kind == :label } + on %r/ [A-Za-z_][A-Za-z_0-9]* /x, kind { |match| IDENT_KIND[match] }, flag_off(:label_expected) + + on %r/(L)?(")/, push(:string), groups(:modifier, :delimiter) + + on %r/ L?' (?: [^\'\n\\] | \\ #{ESCAPE} )? 
'? /x, :char, flag_off(:label_expected) + on %r/0[xX][0-9A-Fa-f]+/, :hex, flag_off(:label_expected) + on %r/(?:0[0-7]+)(?![89.eEfF])/, :octal, flag_off(:label_expected) + on %r/(?:\d+)(?![.eEfF])L?L?/, :integer, flag_off(:label_expected) + on %r/\d[fF]?|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/, :float, flag_off(:label_expected) + + on %r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .* ) !mx, :comment + on %r/ \# \s* if \s* 0 /x, -> (match) { + match << scan_until(/ ^\# (?:elif|else|endif) .*? $ | \z /mx) unless eos? + }, :comment + on %r/ \# [ \t]* include\b /x, :preprocessor, flag_on(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected), push_state(:include_expected) + on %r/ \# [ \t]* \w* /x, :preprocessor, flag_on(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected) + + on %r/\$/, :ident + end + + state :string do + on %r/[^\\\n"]+/, :content + on %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mx, :char + on %r/"/, :delimiter, pop, flag_off(:label_expected) + on %r/ \\ /x, pop, :error, flag_off(:label_expected) + on %r/ $ /x, pop, flag_off(:label_expected) + end + + state :include_expected do + on %r/<[^>\n]+>?|"[^"\n\\]*(?:\\.[^"\n\\]*)*"?/, :include, pop_state + on %r/ \s*? \n \s* /x, :space, pop_state + on %r/\s+/, :space + on %r//, pop_state # TODO: add otherwise method for this + end + + protected + + def setup + super + + @label_expected = true + @case_expected = false + @label_expected_before_preproc_line = nil + @in_preproc_line = false + end + + def close_groups encoder, states + if states.last == :string + encoder.end_group :string + end + end + + end + +end +end diff --git a/lib/coderay/scanners/c3.rb b/lib/coderay/scanners/c3.rb new file mode 100644 index 0000000..49555ca --- /dev/null +++ b/lib/coderay/scanners/c3.rb @@ -0,0 +1,112 @@ +module CodeRay +module Scanners + + # Scanner for C. 
+ class C3 < RuleBasedScanner + + register_for :c3 + file_extension 'c' + + KEYWORDS = [ + 'asm', 'break', 'case', 'continue', 'default', 'do', + 'else', 'enum', 'for', 'goto', 'if', 'return', + 'sizeof', 'struct', 'switch', 'typedef', 'union', 'while', + 'restrict', # added in C99 + ] # :nodoc: + + PREDEFINED_TYPES = [ + 'int', 'long', 'short', 'char', + 'signed', 'unsigned', 'float', 'double', + 'bool', 'complex', # added in C99 + ] # :nodoc: + + PREDEFINED_CONSTANTS = [ + 'EOF', 'NULL', + 'true', 'false', # added in C99 + ] # :nodoc: + DIRECTIVES = [ + 'auto', 'extern', 'register', 'static', 'void', + 'const', 'volatile', # added in C89 + 'inline', # added in C99 + ] # :nodoc: + + IDENT_KIND = WordList.new(:ident). + add(KEYWORDS, :keyword). + add(PREDEFINED_TYPES, :predefined_type). + add(DIRECTIVES, :directive). + add(PREDEFINED_CONSTANTS, :predefined_constant) # :nodoc: + + ESCAPE = / [rbfntv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc: + + protected + + state :initial do + on check_if(:in_preproc_line), %r/ \s*? \n \s* /x, :space, unset(:in_preproc_line), set(:label_expected, :label_expected_before_preproc_line) + on %r/ \s+ | \\\n /x, :space + + on %r/ [-+*=<>?:;,!&^|()\[\]{}~%]+ | \/(?![\/*])=? | \.(?!\d) /x, :operator, set(:label_expected) { |match, case_expected| + match =~ /[;\{\}]/ || case_expected && match =~ /:/ + }, unset(:case_expected) + + on %r/ (?: case | default ) \b /x, :keyword, set(:case_expected), unset(:label_expected) + on check_if(:label_expected), check_unless(:in_preproc_line), %r/ [A-Za-z_][A-Za-z_0-9]*+ :(?!:) /x, kind { |match| + kind = IDENT_KIND[match.chop] + kind == :ident ? :label : kind + }, set(:label_expected) { |kind| kind == :label } + on %r/ [A-Za-z_][A-Za-z_0-9]* /x, kind { |match| IDENT_KIND[match] }, unset(:label_expected) + + on %r/(L)?(")/, push(:string), groups(:modifier, :delimiter) + + on %r/ L?' (?: [^\'\n\\] | \\ #{ESCAPE} )? '? 
/x, :char, unset(:label_expected) + on %r/0[xX][0-9A-Fa-f]+/, :hex, unset(:label_expected) + on %r/(?:0[0-7]+)(?![89.eEfF])/, :octal, unset(:label_expected) + on %r/(?:\d+)(?![.eEfF])L?L?/, :integer, unset(:label_expected) + on %r/\d[fF]?|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/, :float, unset(:label_expected) + + on %r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .* ) !mx, :comment + on %r/ \# \s* if \s* 0 /x, -> (match) { + match << scan_until(/ ^\# (?:elif|else|endif) .*? $ | \z /mx) unless eos? + }, :comment + on %r/ \# [ \t]* include\b /x, :preprocessor, set(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected), push_state(:include_expected) + on %r/ \# [ \t]* \w* /x, :preprocessor, set(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected) + + on %r/\$/, :ident + end + + state :string do + on %r/[^\\\n"]+/, :content + on %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mx, :char + on %r/"/, :delimiter, pop, unset(:label_expected) + on %r/ \\ /x, pop, :error, unset(:label_expected) + on %r/ $ /x, pop, unset(:label_expected) + end + + state :include_expected do + on %r/<[^>\n]+>?|"[^"\n\\]*(?:\\.[^"\n\\]*)*"?/, :include, pop_state + on %r/ \s*? \n \s* /x, :space, pop_state + on %r/\s+/, :space + on %r//, pop_state # TODO: add otherwise method for this + end + + protected + + def setup + super + + @label_expected = true + @case_expected = false + @label_expected_before_preproc_line = nil + @in_preproc_line = false + end + + def close_groups encoder, states + if states.last == :string + encoder.end_group :string + end + end + + end + +end +end diff --git a/lib/coderay/scanners/c4.rb b/lib/coderay/scanners/c4.rb new file mode 100644 index 0000000..ff67e49 --- /dev/null +++ b/lib/coderay/scanners/c4.rb @@ -0,0 +1,126 @@ +module CodeRay +module Scanners + + # Scanner for C. 
+ class C4 < StateBasedScanner + + register_for :c4 + file_extension 'c' + + KEYWORDS = [ + 'asm', 'break', 'case', 'continue', 'default', 'do', + 'else', 'enum', 'for', 'goto', 'if', 'return', + 'sizeof', 'struct', 'switch', 'typedef', 'union', 'while', + 'restrict', # added in C99 + ] # :nodoc: + + PREDEFINED_TYPES = [ + 'int', 'long', 'short', 'char', + 'signed', 'unsigned', 'float', 'double', + 'bool', 'complex', # added in C99 + ] # :nodoc: + + PREDEFINED_CONSTANTS = [ + 'EOF', 'NULL', + 'true', 'false', # added in C99 + ] # :nodoc: + DIRECTIVES = [ + 'auto', 'extern', 'register', 'static', 'void', + 'const', 'volatile', # added in C89 + 'inline', # added in C99 + ] # :nodoc: + + IDENT_KIND = WordList.new(:ident). + add(KEYWORDS, :keyword). + add(PREDEFINED_TYPES, :predefined_type). + add(DIRECTIVES, :directive). + add(PREDEFINED_CONSTANTS, :predefined_constant) # :nodoc: + + ESCAPE = / [rbfntv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc: + + protected + + state :initial do + check in_preproc_line? do + skip %r/ \s*? \n \s* /x, :space do + unset :in_preproc_line + expect :label if label_expected_before_preproc_line? + end + end + + skip %r/ \s+ | \\\n /x, :space + + on %r/ [-+*=<>?:;,!&^|()\[\]{}~%]+ | \/(?![\/*])=? | \.(?!\d) /x, :operator do |match, case_expected| + expect :label if match =~ /[;\{\}]/ || expected?(:case) && match =~ /:/ + end + + on %r/ (?: case | default ) \b /x, :keyword do + expect :case + end + + check label_expected?, !in_preproc_line? do + on %r/ [A-Za-z_][A-Za-z_0-9]*+ :(?!:) /x, -> match { + kind = IDENT_KIND[match.chop] + kind == :ident ? :label : kind + } do |kind| + expect :label if kind == :label + end + end + + on %r/ [A-Za-z_][A-Za-z_0-9]* /x, IDENT_KIND + + on %r/(L)?(")/, push(:string), groups(:modifier, :delimiter) + + on %r/ L?' (?: [^\'\n\\] | \\ #{ESCAPE} )? '? 
/x, :char + on %r/0[xX][0-9A-Fa-f]+/, :hex + on %r/(?:0[0-7]+)(?![89.eEfF])/, :octal + on %r/(?:\d+)(?![.eEfF])L?L?/, :integer + on %r/\d[fF]?|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/, :float + + skip %r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .* ) !mx, :comment + on %r/ \# \s* if \s* 0 /x, -> (match) { + match << scan_until(/ ^\# (?:elif|else|endif) .*? $ | \z /mx) unless eos? + }, :comment + on %r/ \# [ \t]* include\b /x, :preprocessor, set(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected), push(:include) + on %r/ \# [ \t]* \w* /x, :preprocessor, set(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected) + + on %r/\$/, :ident + end + + group_state :string do + on %r/[^\\\n"]+/, :content + on %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mx, :char + on %r/"/, :delimiter, pop + on %r/ \\ /x, pop, :error + on %r/ $ /x, pop + end + + state :include do + on %r/<[^>\n]+>?|"[^"\n\\]*(?:\\.[^"\n\\]*)*"?/, :include, pop + on %r/ \s*? 
\n \s* /x, :space, pop + on %r/\s+/, :space + otherwise pop + end + + protected + + def setup + super + + @label_expected = true + @case_expected = false + @label_expected_before_preproc_line = nil + @in_preproc_line = false + end + + def close_groups encoder, states + if states.last == :string + encoder.end_group :string + end + end + + end + +end +end diff --git a/lib/coderay/scanners/css2.rb b/lib/coderay/scanners/css2.rb new file mode 100644 index 0000000..0c0d4a0 --- /dev/null +++ b/lib/coderay/scanners/css2.rb @@ -0,0 +1,90 @@ +module CodeRay +module Scanners + + class CSS2 < RuleBasedScanner + + register_for :css2 + + KINDS_NOT_LOC = [ + :comment, + :class, :pseudo_class, :tag, + :id, :directive, + :key, :value, :operator, :color, :float, :string, + :error, :important, :type, + ] # :nodoc: + + module RE # :nodoc: + Hex = /[0-9a-fA-F]/ + Unicode = /\\#{Hex}{1,6}\b/ # differs from standard because it allows uppercase hex too + Escape = /#{Unicode}|\\[^\n0-9a-fA-F]/ + NMChar = /[-_a-zA-Z0-9]/ + NMStart = /[_a-zA-Z]/ + String1 = /(")((?:[^\n\\"]+|\\\n|#{Escape})+)?(")?/ # TODO: buggy regexp + String2 = /(')((?:[^\n\\']+|\\\n|#{Escape})+)?(')?/ # TODO: buggy regexp + String = /#{String1}|#{String2}/ + + HexColor = /#(?:#{Hex}{6}|#{Hex}{3})/ + + Num = /-?(?:[0-9]*\.[0-9]+|[0-9]+)n?/ + Name = /#{NMChar}+/ + Ident = /-?#{NMStart}#{NMChar}*/ + AtKeyword = /@#{Ident}/ + Percentage = /#{Num}%/ + + reldimensions = %w[em ex px] + absdimensions = %w[in cm mm pt pc] + Unit = Regexp.union(*(reldimensions + absdimensions + %w[s dpi dppx deg])) + + Dimension = /#{Num}#{Unit}/ + + Function = /((?:url|alpha|attr|counters?)\()((?:[^)\n]|\\\))+)?(\))?/ + + Id = /(?!#{HexColor}\b(?!-))##{Name}/ + Class = /\.#{Name}/ + PseudoClass = /::?#{Ident}/ + AttributeSelector = /(\[)([^\]]+)?(\])?/ + end + + state :initial do + on %r/\s+/, :space + + on check_if(:block), check_if(:value_expected), %r/(?>#{RE::Ident})(?!\()/x, :value + on check_if(:block), %r/(?>#{RE::Ident})(?!\()/x, :key + 
+ on check_unless(:block), %r/(?>#{RE::Ident})(?!\()|\*/x, :tag + on check_unless(:block), RE::Class, :class + on check_unless(:block), RE::Id, :id + on check_unless(:block), RE::PseudoClass, :pseudo_class + # TODO: Improve highlighting inside of attribute selectors. + on check_unless(:block), RE::AttributeSelector, groups(:operator, :attribute_name, :operator) + on check_unless(:block), %r/(@media)(\s+)?(#{RE::Ident})?(\s+)?(\{)?/, groups(:directive, :space, :type, :space, :operator) + + on %r/\/\*(?:.*?\*\/|\z)/m, :comment + on %r/\{/, :operator, flag_off(:value_expected), flag_on(:block) + on %r/\}/, :operator, flag_off(:value_expected), flag_off(:block) + on RE::String1, push(:string), groups(:delimiter, :content, :delimiter), pop + on RE::String2, push(:string), groups(:delimiter, :content, :delimiter), pop + on RE::Function, push(:function), groups(:delimiter, :content, :delimiter), pop + on %r/(?: #{RE::Dimension} | #{RE::Percentage} | #{RE::Num} )/x, :float + on RE::HexColor, :color + on %r/! 
*important/, :important + on %r/(?:rgb|hsl)a?\([^()\n]*\)?/, :color + on RE::AtKeyword, :directive + on %r/:/, :operator, flag_on(:value_expected) + on %r/;/, :operator, flag_off(:value_expected) + on %r/ [+>~,.=()\/] /x, :operator + end + + protected + + def setup + super + + @value_expected = false + @block = false + end + + end + +end +end diff --git a/lib/coderay/scanners/java_script.rb b/lib/coderay/scanners/java_script.rb index 9eb0a0a..5e27813 100644 --- a/lib/coderay/scanners/java_script.rb +++ b/lib/coderay/scanners/java_script.rb @@ -100,7 +100,6 @@ module Scanners # TODO: scan over nested tags xml_scanner.tokenize match, :tokens => encoder value_expected = false - next elsif match = scan(/ [-+*=<>?:;,!&^|(\[{~%]+ | \.(?!\d) /x) value_expected = true diff --git a/lib/coderay/scanners/java_script1.rb b/lib/coderay/scanners/java_script1.rb new file mode 100644 index 0000000..4fe59ba --- /dev/null +++ b/lib/coderay/scanners/java_script1.rb @@ -0,0 +1,238 @@ +# like java_script.rb +# - but uses instance instead of local variables for flags +module CodeRay +module Scanners + + # Scanner for JavaScript. + # + # Aliases: +ecmascript+, +ecma_script+, +javascript+ + class JavaScript1 < Scanner + + register_for :java_script1 + file_extension 'js' + + # The actual JavaScript keywords. + KEYWORDS = %w[ + break case catch continue default delete do else + finally for function if in instanceof new + return switch throw try typeof var void while with + ] # :nodoc: + PREDEFINED_CONSTANTS = %w[ + false null true undefined NaN Infinity + ] # :nodoc: + + MAGIC_VARIABLES = %w[ this arguments ] # :nodoc: arguments was introduced in JavaScript 1.4 + + KEYWORDS_EXPECTING_VALUE = WordList.new.add %w[ + case delete in instanceof new return throw typeof with + ] # :nodoc: + + # Reserved for future use. 
+ RESERVED_WORDS = %w[ + abstract boolean byte char class debugger double enum export extends + final float goto implements import int interface long native package + private protected public short static super synchronized throws transient + volatile + ] # :nodoc: + + IDENT_KIND = WordList.new(:ident). + add(RESERVED_WORDS, :reserved). + add(PREDEFINED_CONSTANTS, :predefined_constant). + add(MAGIC_VARIABLES, :local_variable). + add(KEYWORDS, :keyword) # :nodoc: + + ESCAPE = / [bfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc: + REGEXP_ESCAPE = / [bBdDsSwW] /x # :nodoc: + STRING_CONTENT_PATTERN = { + "'" => /[^\\']+/, + '"' => /[^\\"]+/, + '/' => /[^\\\/]+/, + } # :nodoc: + KEY_CHECK_PATTERN = { + "'" => / (?> [^\\']* (?: \\. [^\\']* )* ) ' \s* : /mx, + '"' => / (?> [^\\"]* (?: \\. [^\\"]* )* ) " \s* : /mx, + } # :nodoc: + + protected + + def setup + @state = :initial + end + + def scan_tokens encoder, options + + state, @string_delimiter = options[:state] || @state + if @string_delimiter + encoder.begin_group state + end + + @value_expected = true + @key_expected = false + @function_expected = false + + until eos? + + case state + + when :initial + + if match = scan(/ \s+ | \\\n /x) + @value_expected = true if !@value_expected && match.index(?\n) + encoder.text_token match, :space + + elsif match = scan(%r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? 
\*/ | .*() ) !mx) + @value_expected = true + encoder.text_token match, :comment + state = :open_multi_line_comment if self[1] + + elsif check(/\.?\d/) + @key_expected = @value_expected = false + if match = scan(/0[xX][0-9A-Fa-f]+/) + encoder.text_token match, :hex + elsif match = scan(/(?>0[0-7]+)(?![89.eEfF])/) + encoder.text_token match, :octal + elsif match = scan(/\d+[fF]|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/) + encoder.text_token match, :float + elsif match = scan(/\d+/) + encoder.text_token match, :integer + end + + elsif @value_expected && match = scan(/<([[:alpha:]]\w*) (?: [^\/>]*\/> | .*?<\/\1>)/xim) + # TODO: scan over nested tags + xml_scanner.tokenize match, :tokens => encoder + @value_expected = false + + elsif match = scan(/ [-+*=<>?:;,!&^|(\[{~%]+ | \.(?!\d) /x) + @value_expected = true + last_operator = match[-1] + @key_expected = (last_operator == ?{) || (last_operator == ?,) + @function_expected = false + encoder.text_token match, :operator + + elsif match = scan(/ [)\]}]+ /x) + @function_expected = @key_expected = @value_expected = false + encoder.text_token match, :operator + + elsif match = scan(/ [$a-zA-Z_][A-Za-z_0-9$]* /x) + kind = IDENT_KIND[match] + @value_expected = (kind == :keyword) && KEYWORDS_EXPECTING_VALUE[match] + # TODO: labels + if kind == :ident + if match.index(?$) # $ allowed inside an identifier + kind = :predefined + elsif @function_expected + kind = :function + elsif check(/\s*[=:]\s*function\b/) + kind = :function + elsif @key_expected && check(/\s*:/) + kind = :key + end + end + @function_expected = (kind == :keyword) && (match == 'function') + @key_expected = false + encoder.text_token match, kind + + elsif match = scan(/["']/) + if @key_expected && check(KEY_CHECK_PATTERN[match]) + state = :key + else + state = :string + end + encoder.begin_group state + @string_delimiter = match + encoder.text_token match, :delimiter + + elsif @value_expected && (match = scan(/\//)) + encoder.begin_group :regexp + state 
= :regexp + @string_delimiter = '/' + encoder.text_token match, :delimiter + + elsif match = scan(/ \/ /x) + @value_expected = true + @key_expected = false + encoder.text_token match, :operator + + else + encoder.text_token getch, :error + + end + + when :string, :regexp, :key + if match = scan(STRING_CONTENT_PATTERN[@string_delimiter]) + encoder.text_token match, :content + elsif match = scan(/["'\/]/) + encoder.text_token match, :delimiter + if state == :regexp + modifiers = scan(/[gim]+/) + encoder.text_token modifiers, :modifier if modifiers && !modifiers.empty? + end + encoder.end_group state + @string_delimiter = nil + @key_expected = @value_expected = false + state = :initial + elsif state != :regexp && (match = scan(/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mox)) + if @string_delimiter == "'" && !(match == "\\\\" || match == "\\'") + encoder.text_token match, :content + else + encoder.text_token match, :char + end + elsif state == :regexp && match = scan(/ \\ (?: #{ESCAPE} | #{REGEXP_ESCAPE} | #{UNICODE_ESCAPE} ) /mox) + encoder.text_token match, :char + elsif match = scan(/\\./m) + encoder.text_token match, :content + elsif match = scan(/ \\ | $ /x) + encoder.end_group state + encoder.text_token match, :error unless match.empty? + @string_delimiter = nil + @key_expected = @value_expected = false + state = :initial + else + raise_inspect "else case #{@string_delimiter} reached; %p not handled." % peek(1), encoder + end + + when :open_multi_line_comment + if match = scan(%r! .*? \*/ !mx) + state = :initial + else + match = scan(%r! .+ !mx) + end + @value_expected = true + encoder.text_token match, :comment if match + + else + #:nocov: + raise_inspect 'Unknown state: %p' % [state], encoder + #:nocov: + + end + + end + + if options[:keep_state] + @state = state, @string_delimiter + end + + if [:string, :regexp].include? state + encoder.end_group state + end + + encoder + end + + protected + + def reset_instance + super + @xml_scanner.reset if defined? 
@xml_scanner + end + + def xml_scanner + @xml_scanner ||= CodeRay.scanner :xml, :tokens => @tokens, :keep_tokens => true, :keep_state => false + end + + end + +end +end diff --git a/lib/coderay/scanners/java_script2.rb b/lib/coderay/scanners/java_script2.rb new file mode 100644 index 0000000..42fa640 --- /dev/null +++ b/lib/coderay/scanners/java_script2.rb @@ -0,0 +1,240 @@ +# like java_script.rb +# - but uses instance instead of local variables for flags +# - but uses the same rule logic as java_script4.rb +# - also uses states array push/pop +module CodeRay +module Scanners + + # Scanner for JavaScript. + # + # Aliases: +ecmascript+, +ecma_script+, +javascript+ + class JavaScript2 < Scanner + + register_for :java_script2 + file_extension 'js' + + # The actual JavaScript keywords. + KEYWORDS = %w[ + break case catch continue default delete do else + finally for function if in instanceof new + return switch throw try typeof var void while with + ] # :nodoc: + PREDEFINED_CONSTANTS = %w[ + false null true undefined NaN Infinity + ] # :nodoc: + + MAGIC_VARIABLES = %w[ this arguments ] # :nodoc: arguments was introduced in JavaScript 1.4 + + KEYWORDS_EXPECTING_VALUE = WordList.new.add %w[ + case delete in instanceof new return throw typeof with + ] # :nodoc: + + # Reserved for future use. + RESERVED_WORDS = %w[ + abstract boolean byte char class debugger double enum export extends + final float goto implements import int interface long native package + private protected public short static super synchronized throws transient + volatile + ] # :nodoc: + + IDENT_KIND = WordList.new(:ident). + add(RESERVED_WORDS, :reserved). + add(PREDEFINED_CONSTANTS, :predefined_constant). + add(MAGIC_VARIABLES, :local_variable). 
# Tokenizes the rest of the input as JavaScript and reports every token to
# +encoder+.
#
# encoder:: receives the +text_token+, +begin_group+ and +end_group+ calls.
# options:: +:state+ may hold a <tt>[state, string_delimiter]</tt> pair (or a
#           bare state) to resume from; without it <tt>@state</tt> is used.
#           With +:keep_state+ the final pair is stored in <tt>@state</tt>.
#
# Returns +encoder+.
def scan_tokens encoder, options

  state, @string_delimiter = options[:state] || @state
  if @string_delimiter
    # Resuming inside a string/regexp/key: re-open the group for this run.
    encoder.begin_group state
  end

  @value_expected = true
  @key_expected = false
  @function_expected = false

  # The stack always has :initial at the bottom. Seeding it with only the
  # resumed state would leave it empty (state == nil) once that group is
  # closed, and the next iteration would raise "Unknown state: nil".
  states = [:initial]
  states << state unless state == :initial

  until eos?

    case state

    when :initial

      if match = scan(/ \s+ | \\\n /x)
        encoder.text_token match, :space
        # A line break means the next "/" starts a value (regexp), not a division.
        @value_expected = true if !@value_expected && match.index(?\n)

      elsif match = scan(%r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .*() ) !mx)
        encoder.text_token match, :comment
        @value_expected = true
        # Group 1 is only set for an unterminated block comment; the dedicated
        # state for that case is currently disabled:
        # state = :open_multi_line_comment if self[1]

      elsif check(/\.?\d/)
        if match = scan(/0[xX][0-9A-Fa-f]+/)
          encoder.text_token match, :hex
        elsif match = scan(/(?>0[0-7]+)(?![89.eEfF])/)
          encoder.text_token match, :octal
        elsif match = scan(/\d+[fF]|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/)
          encoder.text_token match, :float
        elsif match = scan(/\d+/)
          encoder.text_token match, :integer
        end
        @key_expected = @value_expected = false

      elsif @value_expected && match = scan(/<([[:alpha:]]\w*) (?: [^\/>]*\/> | .*?<\/\1>)/xim)
        # TODO: scan over nested tags
        xml_scanner.tokenize match, :tokens => encoder
        @value_expected = false

      elsif match = scan(/ [-+*=<>?:;,!&^|(\[{~%]+ | \.(?!\d) /x)
        encoder.text_token match, :operator
        @value_expected = true
        # After "{" or "," an object literal key may follow.
        @key_expected = /[{,]$/ === match
        @function_expected = false

      elsif match = scan(/ [)\]}]+ /x)
        encoder.text_token match, :operator
        @function_expected = @key_expected = @value_expected = false

      elsif match = scan(/ [$a-zA-Z_][A-Za-z_0-9$]* /x)
        kind = IDENT_KIND[match]
        @value_expected = (kind == :keyword) && KEYWORDS_EXPECTING_VALUE[match]
        # TODO: labels
        if kind == :ident
          if match.index(?$)  # $ allowed inside an identifier
            kind = :predefined
          elsif @function_expected
            kind = :function
          elsif check(/\s*[=:]\s*function\b/)
            kind = :function
          elsif @key_expected && check(/\s*:/)
            kind = :key
          end
        end
        encoder.text_token match, kind
        @function_expected = (kind == :keyword) && (match == 'function')
        @key_expected = false

      elsif match = scan(/["']/)
        state = (@key_expected && check(KEY_CHECK_PATTERN[match])) ? :key : :string
        states << state
        encoder.begin_group state
        @string_delimiter = match
        encoder.text_token match, :delimiter

      elsif @value_expected && (match = scan(/\//))
        state = :regexp
        states << state
        encoder.begin_group state
        @string_delimiter = '/'
        encoder.text_token match, :delimiter

      elsif match = scan(/ \/ /x)
        @value_expected = true
        @key_expected = false
        encoder.text_token match, :operator

      else
        encoder.text_token getch, :error

      end

    when :string, :regexp, :key
      if match = scan(STRING_CONTENT_PATTERN[@string_delimiter])
        encoder.text_token match, :content
      elsif match = scan(/["'\/]/)
        encoder.text_token match, :delimiter
        if match == '/'
          modifiers = scan(/[gim]+/)
          encoder.text_token modifiers, :modifier if modifiers && !modifiers.empty?
        end
        @string_delimiter = nil
        @key_expected = @value_expected = false
        encoder.end_group states.pop
        state = states.last
      elsif state != :regexp && (match = scan(/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mox))
        if @string_delimiter == "'" && !(match == "\\\\" || match == "\\'")
          encoder.text_token match, :content
        else
          encoder.text_token match, :char
        end
      elsif state == :regexp && match = scan(/ \\ (?: #{ESCAPE} | #{REGEXP_ESCAPE} | #{UNICODE_ESCAPE} ) /mox)
        encoder.text_token match, :char
      elsif match = scan(/\\./m)
        encoder.text_token match, :content
      elsif match = scan(/ \\ | $ /x)
        # A lone backslash or the end of the line terminates the literal.
        encoder.end_group states.pop
        state = states.last
        encoder.text_token match, :error unless match.empty?
        @string_delimiter = nil
        @key_expected = @value_expected = false
      else
        raise_inspect "else case #{@string_delimiter} reached; %p not handled." % peek(1), encoder
      end

    # when :open_multi_line_comment
    #   if match = scan(%r! .*? \*/ !mx)
    #     states.pop
    #     state = states.last
    #   else
    #     match = scan(%r! .+ !mx)
    #   end
    #   @value_expected = true
    #   encoder.text_token match, :comment if match

    else
      #:nocov:
      raise_inspect 'Unknown state: %p' % [state], encoder
      #:nocov:

    end

  end

  if options[:keep_state]
    @state = state, @string_delimiter
  end

  # Close whatever group is still open so the token stream stays balanced;
  # :key is opened with begin_group as well and must be closed the same way.
  if [:string, :regexp, :key].include? state
    encoder.end_group state
  end

  encoder
end
+ add(RESERVED_WORDS, :reserved). + add(PREDEFINED_CONSTANTS, :predefined_constant). + add(MAGIC_VARIABLES, :local_variable). + add(KEYWORDS, :keyword) # :nodoc: + + ESCAPE = / [bfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc: + REGEXP_ESCAPE = / [bBdDsSwW] /x # :nodoc: + STRING_CONTENT_PATTERN = { + "'" => /[^\\']+/, + '"' => /[^\\"]+/, + '/' => /[^\\\/]+/, + } # :nodoc: + KEY_CHECK_PATTERN = { + "'" => / (?> [^\\']* (?: \\. [^\\']* )* ) ' \s* : /mx, + '"' => / (?> [^\\"]* (?: \\. [^\\"]* )* ) " \s* : /mx, + } # :nodoc: + + protected + + def setup + @state = :initial + end + + def scan_tokens encoder, options + + state, string_delimiter = options[:state] || @state + if string_delimiter + encoder.begin_group state + end + + value_expected = true + key_expected = false + function_expected = false + + states = [state] + + until eos? + + case state + + when :initial + + if match = scan(/ \s+ | \\\n /x) + encoder.text_token match, :space + value_expected = true if !value_expected && match.index(?\n) + + elsif match = scan(%r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? 
\*/ | .*() ) !mx) + encoder.text_token match, :comment + value_expected = true + # state = :open_multi_line_comment if self[1] + + elsif check(/\.?\d/) + if match = scan(/0[xX][0-9A-Fa-f]+/) + encoder.text_token match, :hex + elsif match = scan(/(?>0[0-7]+)(?![89.eEfF])/) + encoder.text_token match, :octal + elsif match = scan(/\d+[fF]|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/) + encoder.text_token match, :float + elsif match = scan(/\d+/) + encoder.text_token match, :integer + end + key_expected = value_expected = false + + elsif value_expected && match = scan(/<([[:alpha:]]\w*) (?: [^\/>]*\/> | .*?<\/\1>)/xim) + # TODO: scan over nested tags + xml_scanner.tokenize match, :tokens => encoder + value_expected = false + + elsif match = scan(/ [-+*=<>?:;,!&^|(\[{~%]+ | \.(?!\d) /x) + encoder.text_token match, :operator + value_expected = true + key_expected = /[{,]$/ === match + function_expected = false + + elsif match = scan(/ [)\]}]+ /x) + encoder.text_token match, :operator + function_expected = key_expected = value_expected = false + + elsif match = scan(/ [$a-zA-Z_][A-Za-z_0-9$]* /x) + kind = IDENT_KIND[match] + value_expected = (kind == :keyword) && KEYWORDS_EXPECTING_VALUE[match] + # TODO: labels + if kind == :ident + if match.index(?$) # $ allowed inside an identifier + kind = :predefined + elsif function_expected + kind = :function + elsif check(/\s*[=:]\s*function\b/) + kind = :function + elsif key_expected && check(/\s*:/) + kind = :key + end + end + encoder.text_token match, kind + function_expected = (kind == :keyword) && (match == 'function') + key_expected = false + + elsif match = scan(/["']/) + state = (key_expected && check(KEY_CHECK_PATTERN[match])) ? 
:key : :string + states << state + encoder.begin_group state + string_delimiter = match + encoder.text_token match, :delimiter + + elsif value_expected && (match = scan(/\//)) + state = :regexp + states << state + encoder.begin_group state + string_delimiter = '/' + encoder.text_token match, :delimiter + + elsif match = scan(/ \/ /x) + value_expected = true + key_expected = false + encoder.text_token match, :operator + + else + encoder.text_token getch, :error + + end + + when :string, :regexp, :key + if match = scan(STRING_CONTENT_PATTERN[string_delimiter]) + encoder.text_token match, :content + elsif match = scan(/["'\/]/) + encoder.text_token match, :delimiter + if match == '/' + modifiers = scan(/[gim]+/) + encoder.text_token modifiers, :modifier if modifiers && !modifiers.empty? + end + string_delimiter = nil + key_expected = value_expected = false + encoder.end_group states.pop + state = states.last + elsif state != :regexp && (match = scan(/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mox)) + if string_delimiter == "'" && !(match == "\\\\" || match == "\\'") + encoder.text_token match, :content + else + encoder.text_token match, :char + end + elsif state == :regexp && match = scan(/ \\ (?: #{ESCAPE} | #{REGEXP_ESCAPE} | #{UNICODE_ESCAPE} ) /mox) + encoder.text_token match, :char + elsif match = scan(/\\./m) + encoder.text_token match, :content + elsif match = scan(/ \\ | $ /x) + encoder.end_group states.pop + state = states.last + encoder.text_token match, :error unless match.empty? + string_delimiter = nil + key_expected = value_expected = false + else + raise_inspect "else case #{string_delimiter} reached; %p not handled." % peek(1), encoder + end + + # when :open_multi_line_comment + # if match = scan(%r! .*? \*/ !mx) + # states.pop + # state = states.last + # else + # match = scan(%r! 
# Base class for scanners that are declared with a small rule DSL
# (+state+, +on+, +on?+, +push+, +pop+, +check_if+).
#
# The DSL does not interpret rules at scan time. Every call appends Ruby
# source to the class variable <tt>@@code</tt>: one +when+ branch per +state+
# and one +if+/+elsif+ branch per rule. The subclass is expected to splice
# that source into its own +scan_tokens+ definition.
#
# NOTE(review): <tt>@@code</tt> and <tt>@@first</tt> are class variables, so
# every subclass appends to the same buffer. That is fine for a single
# subclass; it is kept because subclasses read <tt>@@code</tt> directly.
class RuleBasedScanner5 < Scanner

  # Marks a rule argument as a precondition. +condition+ is either a Symbol
  # (inserted verbatim into the generated condition) or a Proc.
  CheckIf = Struct.new :condition

  class << self
    attr_accessor :states

    # Opens a +when+ branch for the given state names and evaluates the block,
    # whose +on+/+on?+ calls add the rules. Input that matches no rule is
    # emitted one character at a time as an :error token.
    def state *names, &block
      @@code ||= ""

      @@code << "when #{names.map(&:inspect).join(', ')}\n"

      @@first = true
      instance_eval(&block)
      @@code << " else\n"
      # @@code << " raise 'no match for #{names.map(&:inspect).join(', ')}'\n"
      @@code << " encoder.text_token getch, :error\n"
      @@code << " end\n"
      @@code << " \n"
    end

    # Groups the rules declared in the block behind a non-consuming
    # <tt>check(pattern)</tt> test.
    def on? pattern
      pattern_expression = pattern.inspect
      @@code << " #{'els' unless @@first}if check(#{pattern_expression})\n"

      @@first = true
      yield
      @@code << " end\n"

      @@first = false
    end

    # Adds one rule: <tt>on [check_if(...), ...] pattern, action, ...</tt>
    #
    # Leading CheckIf items are preconditions. The first other item is the
    # pattern (a Regexp, or a Proc returning one). Everything after it is an
    # action: a Symbol (token kind), the result of +push+/+pop+, or a Proc
    # taking up to <tt>(match, encoder)</tt>.
    #
    # Raises RuntimeError for a missing pattern, a rule without actions, or an
    # item of an unsupported type.
    def on *pattern_and_actions
      index = pattern_and_actions.find_index { |item| !item.is_a?(CheckIf) }
      raise 'I need a pattern!' unless index && pattern_and_actions[index]

      preconditions = pattern_and_actions.first(index)
      pattern       = pattern_and_actions[index]
      actions       = pattern_and_actions.drop(index + 1)
      # A rule without actions would consume input without emitting a token.
      raise 'I need actions!' if actions.empty?

      # Callbacks are registered in declaration order (preconditions, pattern,
      # actions) because their generated names depend on that order.
      precondition_expression = preconditions.map { |precondition| precondition_code(precondition) }.join
      pattern_expression      = pattern_code(pattern)

      @@code << " #{'els' unless @@first}if #{precondition_expression}match = scan(#{pattern_expression})\n"

      actions.each { |action| emit_action(action) }

      @@first = false
    end

    # Action: enter +state+ (or the state returned by the block) and open a
    # token group for it.
    def push state = nil, &block
      raise 'push requires a state or a block; got nothing' unless state || block
      [:push, state || block]
    end

    # Action: close the current token group and return to the previous state.
    def pop
      [:pop]
    end

    # Wraps a Symbol or block as a precondition for +on+.
    def check_if value = nil, &callback
      CheckIf.new value || callback
    end

    protected

    # Returns the source fragment <tt>"<condition> && "</tt> for one precondition.
    def precondition_code precondition
      condition = precondition.condition

      case condition
      when Proc
        callback = make_callback(condition)
        arguments =
          case condition.arity
          when 0 then ''
          when 1 then '(state)'
          else
            raise "I got %p arguments for precondition: %p, but I only know how to evaluate 0..1" % [condition.arity, callback]
          end
        "#{callback}#{arguments} && "
      when Symbol
        "#{condition} && "
      else
        raise "I don't know how to evaluate this check_if precondition: %p" % [condition]
      end
    end

    # Returns the source for the argument of the generated <tt>scan(...)</tt> call.
    def pattern_code pattern
      case pattern
      when Regexp
        pattern.inspect
      when Proc
        make_callback(pattern).to_s
      else
        raise "I don't know how to evaluate this pattern: %p" % [pattern]
      end
    end

    # Appends the source for a single action of a rule.
    def emit_action action
      case action
      when Symbol
        @@code << " p 'text_token %p %p' % [match, #{action.inspect}]\n" if $DEBUG
        @@code << " encoder.text_token match, #{action.inspect}\n"
      when Array
        emit_stack_action action
      when Proc
        callback = make_callback(action)
        arguments =
          case action.arity
          when 0 then ''
          when 1 then '(match)'
          when 2 then '(match, encoder)'
          else
            raise "I got %p arguments for action: %p, but I only know how to evaluate 0..2" % [action.arity, callback]
          end
        @@code << " p 'calling %p'\n" % [callback] if $DEBUG
        @@code << " #{callback}#{arguments}\n"
      else
        raise "I don't know how to evaluate this action: %p" % [action]
      end
    end

    # Appends the source for a +push+ or +pop+ action.
    def emit_stack_action action
      case action.first
      when :push
        target = action.last
        case target
        when Symbol
          @@code << " p 'push %p' % [#{target.inspect}]\n" if $DEBUG
          @@code << " state = #{target.inspect}\n"
        when Proc
          callback = make_callback(target)
          arguments =
            case target.arity
            when 0 then ''
            when 1 then '(match)'
            else
              raise "I got %p arguments for push: %p, but I only know how to evaluate 0..1" % [target.arity, callback]
            end
          @@code << " p 'push %p' % [#{callback}]\n" if $DEBUG
          @@code << " state = #{callback}#{arguments}\n"
        else
          raise "I don't know how to evaluate this push state: %p" % [target]
        end
        @@code << " states << state\n"
        @@code << " encoder.begin_group state\n"
      when :pop
        @@code << " p 'pop %p' % [states.last]\n" if $DEBUG
        @@code << " encoder.end_group states.pop\n"
        @@code << " state = states.last\n"
      else
        # Previously an unknown Array action was dropped without notice.
        raise "I don't know how to evaluate this action: %p" % [action]
      end
    end

    # Defines +block+ as an instance method named after the line it was
    # written on (with a suffix when several share a line) and returns the
    # method name as a Symbol, ready to be interpolated into generated code.
    def make_callback block
      @callbacks ||= {}

      base_name = "__callback_line_#{block.source_location.last}"
      name = base_name
      counter = 'a'
      while @callbacks.key?(name)
        name = "#{base_name}_#{counter}"
        counter = counter.succ
      end

      define_method(name, &block)
      @callbacks[name] = name.to_sym
    end
  end
end
+ KEYWORDS = %w[ + break case catch continue default delete do else + finally for function if in instanceof new + return switch throw try typeof var void while with + ] # :nodoc: + PREDEFINED_CONSTANTS = %w[ + false null true undefined NaN Infinity + ] # :nodoc: + + MAGIC_VARIABLES = %w[ this arguments ] # :nodoc: arguments was introduced in JavaScript 1.4 + + KEYWORDS_EXPECTING_VALUE = WordList.new.add %w[ + case delete in instanceof new return throw typeof with + ] # :nodoc: + + # Reserved for future use. + RESERVED_WORDS = %w[ + abstract boolean byte char class debugger double enum export extends + final float goto implements import int interface long native package + private protected public short static super synchronized throws transient + volatile + ] # :nodoc: + + IDENT_KIND = WordList.new(:ident). + add(RESERVED_WORDS, :reserved). + add(PREDEFINED_CONSTANTS, :predefined_constant). + add(MAGIC_VARIABLES, :local_variable). + add(KEYWORDS, :keyword) # :nodoc: + + ESCAPE = / [bfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc: + REGEXP_ESCAPE = / [bBdDsSwW] /x # :nodoc: + STRING_CONTENT_PATTERN = { + "'" => /[^\\']+/, + '"' => /[^\\"]+/, + '/' => /[^\\\/]+/, + } # :nodoc: + KEY_CHECK_PATTERN = { + "'" => / (?> [^\\']* (?: \\. [^\\']* )* ) ' \s* : /mx, + '"' => / (?> [^\\"]* (?: \\. [^\\"]* )* ) " \s* : /mx, + } # :nodoc: + + state :initial do + # on %r/ [ \t]* \n \s* /x, :space, -> { @value_expected = true } + # on %r/ [ \t]+ | \\\n /x, :space + on %r/ \s+ | \\\n /x, :space, -> (match) { @value_expected = true if !@value_expected && match.index(?\n) } + + on %r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .*() ) !mx, :comment, -> { @value_expected = true } + # state = :open_multi_line_comment if self[1] + + on? 
%r/\.?\d/ do + on %r/0[xX][0-9A-Fa-f]+/, :hex, -> { @key_expected = @value_expected = false } + on %r/(?>0[0-7]+)(?![89.eEfF])/, :octal, -> { @key_expected = @value_expected = false } + on %r/\d+[fF]|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/, :float, -> { @key_expected = @value_expected = false } + on %r/\d+/, :integer, -> { @key_expected = @value_expected = false } + end + + on check_if(:@value_expected), %r/<([[:alpha:]]\w*) (?: [^\/>]*\/> | .*?<\/\1>)/xim, -> (match, encoder) do + # TODO: scan over nested tags + xml_scanner.tokenize match, :tokens => encoder + @value_expected = false + end + + on %r/ [-+*=<>?:;,!&^|(\[{~%]+ | \.(?!\d) /x, :operator, -> (match) do + @value_expected = true + @key_expected = /[{,]$/ === match + @function_expected = false + end + + on %r/ [)\]}]+ /x, :operator, -> { @function_expected = @key_expected = @value_expected = false } + + on %r/ [$a-zA-Z_][A-Za-z_0-9$]* /x, -> (match, encoder) do + kind = IDENT_KIND[match] + @value_expected = (kind == :keyword) && KEYWORDS_EXPECTING_VALUE[match] + # TODO: labels + if kind == :ident + if match.index(?$) # $ allowed inside an identifier + kind = :predefined + elsif @function_expected + kind = :function + elsif check(/\s*[=:]\s*function\b/) + kind = :function + elsif @key_expected && check(/\s*:/) + kind = :key + end + end + encoder.text_token match, kind + @function_expected = (kind == :keyword) && (match == 'function') + @key_expected = false + end + + on %r/["']/, push { |match| + @string_delimiter = match + @key_expected && check(KEY_CHECK_PATTERN[match]) ? 
:key : :string + }, :delimiter + + on check_if(:@value_expected), %r/\//, push(:regexp), :delimiter, -> { @string_delimiter = '/' } + + on %r/ \/ /x, :operator, -> { @value_expected = true; @key_expected = false } + end + + state :string, :regexp, :key do + on -> { STRING_CONTENT_PATTERN[@string_delimiter] }, :content + # on 'STRING_CONTENT_PATTERN[@string_delimiter]', :content + + # on %r/\//, :delimiter, -> (match, encoder) do + # modifiers = scan(/[gim]+/) + # encoder.text_token modifiers, :modifier if modifiers && !modifiers.empty? + # @string_delimiter = nil + # @key_expected = @value_expected = false + # end, pop + # + # on %r/["']/, :delimiter, -> do + # @string_delimiter = nil + # @key_expected = @value_expected = false + # end, pop + + on %r/["'\/]/, :delimiter, -> (match, encoder) do + if match == '/' + modifiers = scan(/[gim]+/) + encoder.text_token modifiers, :modifier if modifiers && !modifiers.empty? + end + @string_delimiter = nil + @key_expected = @value_expected = false + end, pop + + on check_if { |state| state != :regexp }, %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mox, -> (match, encoder) do + if @string_delimiter == "'" && !(match == "\\\\" || match == "\\'") + encoder.text_token match, :content + else + encoder.text_token match, :char + end + end + + on check_if { |state| state == :regexp }, %r/ \\ (?: #{ESCAPE} | #{REGEXP_ESCAPE} | #{UNICODE_ESCAPE} ) /mox, :char + on %r/\\./m, :content + on %r/ \\ /x, pop, :error, -> do + @string_delimiter = nil + @key_expected = @value_expected = false + end + end + + # state :open_multi_line_comment do + # on %r! .*? \*/ !mx, :initial # don't consume! + # on %r/ .+ /mx, :comment, -> { @value_expected = true } + # + # # if match = scan(%r! .*? \*/ !mx) + # # state = :initial + # # else + # # match = scan(%r! 
# Source of JavaScript4#scan_tokens, assembled from the rules collected in
# @@code by the state/on DSL calls above and compiled with class_eval below.
#
# def_line records the line of the generated "def" so that class_eval can
# report backtraces against this file.
#
# The state stack always has :initial at the bottom. Seeding it with only a
# resumed :string/:regexp/:key state would leave it empty once that group is
# popped, and the scanner would then fail with "Unknown state: nil".
scan_tokens_code = <<-"RUBY"
def scan_tokens encoder, options#{ def_line = __LINE__; nil }
  state, @string_delimiter = options[:state] || @state
  if @string_delimiter
    encoder.begin_group state
  end

  @value_expected = true
  @key_expected = false
  @function_expected = false

  states = [:initial]
  states << state unless state == :initial

  until eos?

    case state

#{ @@code.chomp.gsub(/^/, ' ') }
    else
      raise_inspect 'Unknown state: %p' % [state], encoder

    end

  end

  if options[:keep_state]
    @state = state, @string_delimiter
  end

  if [:string, :regexp, :key].include? state
    encoder.end_group state
  end

  encoder
end
RUBY

# Debugging aid: set the PUTS environment variable to dump the generated code.
if ENV['PUTS']
  puts scan_tokens_code
  puts "callbacks: #{(@callbacks || {}).size}"
end
class_eval scan_tokens_code, __FILE__, def_line
# Rules for code outside of string and regexp literals.
#
# NOTE(review): flag_on, flag_off, set, kind, push and check_if come from
# RuleBasedScanner, which is not visible here. The comments below assume they
# behave as their names suggest - confirm against that class.
state :initial do
  # A line break makes a following "/" start a regexp instead of a division.
  on %r/ \s+ | \\\n /x, :space, set(:value_expected) { |match, value_expected| value_expected || match.index(?\n) }
  # A value may follow a comment. The sibling JavaScript scanners all set
  # value_expected to true here, so this rule turns the flag on as well.
  on %r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .*() ) !mx, :comment, flag_on(:value_expected)
  # state = :open_multi_line_comment if self[1]

  on? %r/\.?\d/ do
    on %r/0[xX][0-9A-Fa-f]+/, :hex, flag_off(:key_expected, :value_expected)
    on %r/(?>0[0-7]+)(?![89.eEfF])/, :octal, flag_off(:key_expected, :value_expected)
    on %r/\d+[fF]|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/, :float, flag_off(:key_expected, :value_expected)
    on %r/\d+/, :integer, flag_off(:key_expected, :value_expected)
  end

  on check_if(:value_expected), %r/<([[:alpha:]]\w*) (?: [^\/>]*\/> | .*?<\/\1>)/xim, -> (match, encoder) do
    # TODO: scan over nested tags
    xml_scanner.tokenize match, :tokens => encoder
  end, flag_off(:value_expected)

  # Operators are split into two rules: only a run ending in "{" or ","
  # makes an object literal key possible.
  on %r/ [-+*=<>?:;,!&^|(\[{~%]++ (?<![{,]) | \.+(?!\d) /x, :operator, flag_on(:value_expected), flag_off(:key_expected, :function_expected)
  on %r/ [-+*=<>?:;,!&^|(\[{~%]*+ (?<=[{,]) /x, :operator, flag_on(:value_expected, :key_expected), flag_off(:function_expected)
  on %r/ [)\]}]+ /x, :operator, flag_off(:function_expected, :key_expected, :value_expected)

  on %r/ function (?![A-Za-z_0-9$]) /x, :keyword, flag_on(:function_expected), flag_off(:key_expected, :value_expected)
  on %r/ [$a-zA-Z_][A-Za-z_0-9$]* /x, kind { |match, function_expected, key_expected|
    kind = IDENT_KIND[match]
    # TODO: labels
    if kind == :ident
      if match.index(?$)  # $ allowed inside an identifier
        kind = :predefined
      elsif function_expected
        kind = :function
      elsif check(/\s*[=:]\s*function\b/)
        kind = :function
      elsif key_expected && check(/\s*:/)
        kind = :key
      end
    end

    kind
  }, flag_off(:function_expected, :key_expected), set(:value_expected) { |match| KEYWORDS_EXPECTING_VALUE[match] }

  on %r/["']/, push { |match, key_expected| key_expected && check(KEY_CHECK_PATTERN[match]) ? :key : :string }, :delimiter, set(:string_delimiter) { |match| match }
  on check_if(:value_expected), %r/\//, push(:regexp), :delimiter

  on %r/\//, :operator, flag_on(:value_expected), flag_off(:key_expected)
end
@xml_scanner + end + + def xml_scanner + @xml_scanner ||= CodeRay.scanner :xml, :tokens => @tokens, :keep_tokens => true, :keep_state => false + end + + end + +end +end diff --git a/lib/coderay/scanners/json1.rb b/lib/coderay/scanners/json1.rb new file mode 100644 index 0000000..d44f6ba --- /dev/null +++ b/lib/coderay/scanners/json1.rb @@ -0,0 +1,100 @@ +module CodeRay +module Scanners + + # Scanner for JSON (JavaScript Object Notation). + class JSON1 < Scanner + + register_for :json1 + file_extension 'json' + + KINDS_NOT_LOC = [ + :float, :char, :content, :delimiter, + :error, :integer, :operator, :value, + ] # :nodoc: + + ESCAPE = / [bfnrt\\"\/] /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} /x # :nodoc: + KEY = / (?> (?: [^\\"]+ | \\. )* ) " \s* : /mx + + protected + + def setup + @state = :initial + end + + # See http://json.org/ for a definition of the JSON lexic/grammar. + def scan_tokens encoder, options + state = options[:state] || @state + + if [:string, :key].include? state + encoder.begin_group state + end + + until eos? + + case state + + when :initial + if match = scan(/ \s+ /x) + encoder.text_token match, :space + elsif match = scan(/ " (?=#{KEY}) /ox) + state = :key + encoder.begin_group :key + encoder.text_token match, :delimiter + elsif match = scan(/ " /x) + state = :string + encoder.begin_group :string + encoder.text_token match, :delimiter + elsif match = scan(/ [:,\[{\]}] /x) + encoder.text_token match, :operator + elsif match = scan(/ true | false | null /x) + encoder.text_token match, :value + elsif match = scan(/ -? (?: 0 | [1-9]\d* ) (?: \.\d+ (?: [eE][-+]? \d+ )? | [eE][-+]? \d+ ) /x) + encoder.text_token match, :float + elsif match = scan(/ -? 
(?: 0 | [1-9]\d* ) /x) + encoder.text_token match, :integer + else + encoder.text_token getch, :error + end + + when :string, :key + if match = scan(/ [^\\"]+ /x) + encoder.text_token match, :content + elsif match = scan(/ " /x) + encoder.text_token match, :delimiter + encoder.end_group state + state = :initial + elsif match = scan(/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /ox) + encoder.text_token match, :char + elsif match = scan(/ \\. /mx) + encoder.text_token match, :content + elsif match = scan(/ \\ /x) + encoder.end_group state + state = :initial + encoder.text_token match, :error + else + raise_inspect "else case \" reached; %p not handled." % peek(1), encoder + end + + else + raise_inspect 'Unknown state: %p' % [state], encoder + + end + + end + + if options[:keep_state] + @state = state + end + + if [:string, :key].include? state + encoder.end_group state + end + + encoder + end + + end + +end +end diff --git a/lib/coderay/scanners/json2.rb b/lib/coderay/scanners/json2.rb new file mode 100644 index 0000000..6d7adc8 --- /dev/null +++ b/lib/coderay/scanners/json2.rb @@ -0,0 +1,131 @@ +module CodeRay +module Scanners + + class RuleBasedScanner2 < Scanner + class << self + attr_accessor :states + + def state *names, &block + @@states ||= {} + + @@rules = [] + + instance_eval(&block) + + for name in names + @@states[name] = @@rules + end + + @@rules = nil + end + + def token pattern, *actions + @@rules << [pattern, *actions] + end + + def push_group name + [:begin_group, name] + end + + def pop_group + [:end_group] + end + end + end + + # Scanner for JSON (JavaScript Object Notation). + class JSON2 < RuleBasedScanner2 + + register_for :json2 + file_extension 'json' + + KINDS_NOT_LOC = [ + :float, :char, :content, :delimiter, + :error, :integer, :operator, :value, + ] # :nodoc: + + ESCAPE = / [bfnrt\\"\/] /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} /x # :nodoc: + KEY = / (?> (?: [^\\"]+ | \\. 
)* ) " \s* : /mx + + state :initial do + token %r/ \s+ /x, :space + + token %r/ " (?=#{KEY}) /x, push_group(:key), :delimiter + token %r/ " /x, push_group(:string), :delimiter + + token %r/ [:,\[{\]}] /x, :operator + + token %r/ true | false | null /x, :value + token %r/ -? (?: 0 | [1-9]\d* ) (?: \.\d+ (?: [eE][-+]? \d+ )? | [eE][-+]? \d+ ) /x, :float + token %r/ -? (?: 0 | [1-9]\d* ) /x, :integer + end + + state :string, :key do + token %r/ [^\\"]+ /x, :content + + token %r/ " /x, :delimiter, pop_group + + token %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /x, :char + token %r/ \\. /mx, :content + token %r/ \\ /x, pop_group, :error + + # token %r/$/, end_group + end + + protected + + def setup + @state = :initial + end + + # See http://json.org/ for a definition of the JSON lexic/grammar. + def scan_tokens encoder, options + state = options[:state] || @state + + if [:string, :key].include? state + encoder.begin_group state + end + + states = [state] + + until eos? + for pattern, *actions in @@states[state] + if match = scan(pattern) + for action in actions + case action + when Symbol + encoder.text_token match, action + when Array + case action.first + when :begin_group + encoder.begin_group action.last + state = action.last + states << state + when :end_group + encoder.end_group states.pop + state = states.last + end + end + end + + break + end + end && encoder.text_token(getch, :error) + end + + if options[:keep_state] + @state = state + end + + if [:string, :key].include? 
state + encoder.end_group state + end + + encoder + end + + end + +end +end diff --git a/lib/coderay/scanners/json3.rb b/lib/coderay/scanners/json3.rb new file mode 100644 index 0000000..cf0c1f0 --- /dev/null +++ b/lib/coderay/scanners/json3.rb @@ -0,0 +1,143 @@ +module CodeRay +module Scanners + + class RuleBasedScanner3 < Scanner + class << self + attr_accessor :states + + def state *names, &block + @@code ||= "" + + @@code << "when #{names.map(&:inspect).join(', ')}\n" + + @@first = true + instance_eval(&block) + @@code << " else\n" + # @@code << " raise 'no match for #{names.map(&:inspect).join(', ')}'\n" + @@code << " encoder.text_token getch, :error\n" + @@code << " end\n" + @@code << " \n" + end + + def token pattern, *actions + @@code << " #{'els' unless @@first}if match = scan(#{pattern.inspect})\n" + + for action in actions + case action + when Symbol + @@code << " p 'text_token %p %p' % [match, #{action.inspect}]\n" if $DEBUG + @@code << " encoder.text_token match, #{action.inspect}\n" + when Array + case action.first + when :begin_group + @@code << " p 'begin_group %p' % [#{action.last.inspect}]\n" if $DEBUG + @@code << " state = #{action.last.inspect}\n" + @@code << " states << #{action.last.inspect}\n" + @@code << " encoder.begin_group #{action.last.inspect}\n" + when :end_group + @@code << " p 'end_group %p' % [states.last]\n" if $DEBUG + @@code << " encoder.end_group states.pop\n" + @@code << " state = states.last\n" + end + end + end + + @@first = false + end + + def push_group name + [:begin_group, name] + end + + def pop_group + [:end_group] + end + end + end + + # Scanner for JSON (JavaScript Object Notation). + class JSON3 < RuleBasedScanner3 + + register_for :json3 + file_extension 'json' + + KINDS_NOT_LOC = [ + :float, :char, :content, :delimiter, + :error, :integer, :operator, :value, + ] # :nodoc: + + ESCAPE = / [bfnrt\\"\/] /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} /x # :nodoc: + KEY = / (?> (?: [^\\"]+ | \\. 
)* ) " \s* : /mx + + state :initial do + token %r/ \s+ /x, :space + + token %r/ [:,\[{\]}] /x, :operator + + token %r/ " (?=#{KEY}) /x, push_group(:key), :delimiter + token %r/ " /x, push_group(:string), :delimiter + + token %r/ true | false | null /x, :value + token %r/ -? (?: 0 | [1-9]\d* ) (?: \.\d+ (?: e[-+]? \d+ )? | e[-+]? \d+ ) /ix, :float + token %r/ -? (?: 0 | [1-9]\d* ) (?: e[+-] \d+ )? /ix, :integer + end + + state :key, :string do + token %r/ [^\\"]+ /x, :content + + token %r/ " /x, :delimiter, pop_group + + token %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /x, :char + token %r/ \\. /mx, :content + token %r/ \\ /x, pop_group, :error + end + + protected + + def setup + @state = :initial + end + + # See http://json.org/ for a definition of the JSON lexic/grammar. + scan_tokens_code = <<-"RUBY" + def scan_tokens encoder, options + state = options[:state] || @state + + if [:string, :key].include? state + encoder.begin_group state + end + + states = [state] + + until eos? + + case state + +#{ @@code.chomp.gsub(/^/, ' ') } + else + raise_inspect 'Unknown state: %p' % [state], encoder + + end + + end + + if options[:keep_state] + @state = state + end + + if [:string, :key].include? 
state + encoder.end_group state + end + + encoder + end + RUBY + + # puts scan_tokens_code + class_eval scan_tokens_code + + end + +end +end diff --git a/lib/coderay/scanners/json4.rb b/lib/coderay/scanners/json4.rb new file mode 100644 index 0000000..5cb3afb --- /dev/null +++ b/lib/coderay/scanners/json4.rb @@ -0,0 +1,143 @@ +module CodeRay +module Scanners + + class RuleBasedScanner4 < Scanner + class << self + attr_accessor :states + + def state *names, &block + @@code ||= "" + + @@code << "when #{names.map(&:inspect).join(', ')}\n" + + @@first = true + instance_eval(&block) + @@code << " else\n" + # @@code << " raise 'no match for #{names.map(&:inspect).join(', ')}'\n" + @@code << " encoder.text_token getch, :error\n" + @@code << " end\n" + @@code << " \n" + end + + def token pattern, *actions + @@code << " #{'els' unless @@first}if match = scan(#{pattern.inspect})\n" + + for action in actions + case action + when Symbol + @@code << " p 'text_token %p %p' % [match, #{action.inspect}]\n" if $DEBUG + @@code << " encoder.text_token match, #{action.inspect}\n" + when Array + case action.first + when :push + @@code << " p 'push %p' % [#{action.last.inspect}]\n" if $DEBUG + @@code << " state = #{action.last.inspect}\n" + @@code << " states << state\n" + @@code << " encoder.begin_group state\n" + when :pop + @@code << " p 'pop %p' % [states.last]\n" if $DEBUG + @@code << " encoder.end_group states.pop\n" + @@code << " state = states.last\n" + end + end + end + + @@first = false + end + + def push state + [:push, state] + end + + def pop + [:pop] + end + end + end + + # Scanner for JSON (JavaScript Object Notation). + class JSON4 < RuleBasedScanner4 + + register_for :json4 + file_extension 'json' + + KINDS_NOT_LOC = [ + :float, :char, :content, :delimiter, + :error, :integer, :operator, :value, + ] # :nodoc: + + ESCAPE = / [bfnrt\\"\/] /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} /x # :nodoc: + KEY = / (?> (?: [^\\"]+ | \\. 
)* ) " \s* : /mx + + state :initial do + token %r/ \s+ /x, :space + + token %r/ [:,\[{\]}] /x, :operator + + token %r/ " (?=#{KEY}) /x, push(:key), :delimiter + token %r/ " /x, push(:string), :delimiter + + token %r/ true | false | null /x, :value + token %r/ -? (?: 0 | [1-9]\d* ) (?: \.\d+ (?: e[-+]? \d+ )? | e[-+]? \d+ ) /ix, :float + token %r/ -? (?: 0 | [1-9]\d* ) (?: e[+-] \d+ )? /ix, :integer + end + + state :key, :string do + token %r/ [^\\"]+ /x, :content + + token %r/ " /x, :delimiter, pop + + token %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /x, :char + token %r/ \\. /mx, :content + token %r/ \\ /x, :error, pop + end + + protected + + def setup + @state = :initial + end + + # See http://json.org/ for a definition of the JSON lexic/grammar. + scan_tokens_code = <<-"RUBY" + def scan_tokens encoder, options + state = options[:state] || @state + + if [:string, :key].include? state + encoder.begin_group state + end + + states = [state] + + until eos? + + case state + +#{ @@code.chomp.gsub(/^/, ' ') } + else + raise_inspect 'Unknown state: %p' % [state], encoder + + end + + end + + if options[:keep_state] + @state = state + end + + if [:string, :key].include? state + encoder.end_group state + end + + encoder + end + RUBY + + # puts scan_tokens_code + class_eval scan_tokens_code + + end + +end +end diff --git a/lib/coderay/scanners/json5.rb b/lib/coderay/scanners/json5.rb new file mode 100644 index 0000000..8b0a8bd --- /dev/null +++ b/lib/coderay/scanners/json5.rb @@ -0,0 +1,53 @@ +module CodeRay +module Scanners + + # Scanner for JSON (JavaScript Object Notation). + # + # See http://json.org/ for a definition of the JSON lexic/grammar. + class JSON5 < RuleBasedScanner + + register_for :json5 + file_extension 'json' + + KINDS_NOT_LOC = [ + :float, :char, :content, :delimiter, + :error, :integer, :operator, :value, + ] # :nodoc: + + ESCAPE = / [bfnrt\\"\/] /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} /x # :nodoc: + KEY = / (?> (?: [^\\"]+ | \\. 
)* ) " \s* : /mx + + state :initial do + on %r/ \s+ /x, :space + + on %r/ [:,\[{\]}] /x, :operator + + on %r/ " (?=#{KEY}) /x, push(:key), :delimiter + on %r/ " /x, push(:string), :delimiter + + on %r/ true | false | null /x, :value + on %r/ -? (?: 0 | [1-9]\d* ) (?: \.\d+ (?: e[-+]? \d+ )? | e[-+]? \d+ ) /ix, :float + on %r/ -? (?: 0 | [1-9]\d* ) (?: e[+-] \d+ )? /ix, :integer + end + + state :key, :string do + on %r/ [^\\"]+ /x, :content + + on %r/ " /x, :delimiter, pop + + on %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /x, :char + on %r/ \\. /mx, :content + on %r/ \\ /x, :error, pop + end + + def close_groups encoder, states + if [:string, :key].include? states.last + encoder.end_group states.last + end + end + + end + +end +end diff --git a/lib/coderay/scanners/lua.rb b/lib/coderay/scanners/lua.rb index fb1e45a..81d7dae 100644 --- a/lib/coderay/scanners/lua.rb +++ b/lib/coderay/scanners/lua.rb @@ -76,7 +76,7 @@ module Scanners encoder.text_token(match, :comment) elsif match = scan(/\[=*\[/) # [[ long (possibly multiline) string ]] - num_equals = match.count("=") # Number must match for comment end + num_equals = match.count("=") # Number must match for string end encoder.begin_group(:string) encoder.text_token(match, :delimiter) state = :long_string diff --git a/lib/coderay/scanners/lua2.rb b/lib/coderay/scanners/lua2.rb new file mode 100644 index 0000000..fa20e9b --- /dev/null +++ b/lib/coderay/scanners/lua2.rb @@ -0,0 +1,157 @@ +# encoding: utf-8 + +module CodeRay +module Scanners + + # Scanner for the Lua[http://lua.org] programming lanuage. + # + # The language’s complete syntax is defined in + # {the Lua manual}[http://www.lua.org/manual/5.2/manual.html], + # which is what this scanner tries to conform to. + class Lua2 < RuleBasedScanner + + register_for :lua2 + file_extension 'lua' + title 'Lua' + + # Keywords used in Lua. 
+ KEYWORDS = %w[and break do else elseif end + for function goto if in + local not or repeat return + then until while + ] + + # Constants set by the Lua core. + PREDEFINED_CONSTANTS = %w[false true nil] + + # The expressions contained in this array are parts of Lua’s `basic' + # library. Although it’s not entirely necessary to load that library, + # it is highly recommended and one would have to provide own implementations + # of some of these expressions if one does not do so. They however aren’t + # keywords, neither are they constants, but nearly predefined, so they + # get tagged as `predefined' rather than anything else. + # + # This list excludes values of form `_UPPERCASE' because the Lua manual + # requires such identifiers to be reserved by Lua anyway and they are + # highlighted directly accordingly, without the need for specific + # identifiers to be listed here. + PREDEFINED_EXPRESSIONS = %w[ + assert collectgarbage dofile error getmetatable + ipairs load loadfile next pairs pcall print + rawequal rawget rawlen rawset select setmetatable + tonumber tostring type xpcall + ] + + # Automatic token kind selection for normal words. + IDENT_KIND = CodeRay::WordList.new(:ident). + add(KEYWORDS, :keyword). + add(PREDEFINED_CONSTANTS, :predefined_constant). 
+ add(PREDEFINED_EXPRESSIONS, :predefined) + + state :initial, :map do + on %r/\-\-\[\=*\[/, push(:long_comment, :comment), :delimiter, #--[[ long (possibly multiline) comment ]] + set(:num_equals, -> (match) { match.count('=') }) # Number must match for comment end + on %r/--.*$/, :comment # --Lua comment + on %r/\[=*\[/, push(:long_string, :string), :delimiter, # [[ long (possibly multiline) string ]] + set(:num_equals, -> (match) { match.count('=') }) # Number must match for string end + on %r/::\s*[a-zA-Z_][a-zA-Z0-9_]+\s*::/, :label # ::goto_label:: + on %r/_[A-Z]+/, :predefined # _UPPERCASE are names reserved for Lua + on check_if { |brace_depth| brace_depth > 0 }, %r/([a-zA-Z_][a-zA-Z0-9_]*) (\s+)?(=)/x, groups(:key, :space, :operator) + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, kind { |match| IDENT_KIND[match] }, push_state { |match, kind| # Normal letters (or letters followed by digits) + # Extra highlighting for entities following certain keywords + if kind == :keyword && match == 'function' + :function_expected + elsif kind == :keyword && match == 'goto' + :goto_label_expected + elsif kind == :keyword && match == 'local' + :local_var_expected + end + } + + on %r/\{/, push(:map), kind { |brace_depth| brace_depth > 0 ? :inline_delimiter : :delimiter }, increment(:brace_depth) # Opening table brace { + on check_if { |brace_depth| brace_depth == 1 }, %r/\}/, :delimiter, pop, decrement(:brace_depth) # Closing table brace } + on check_if { |brace_depth| brace_depth == 0 }, %r/\}/, :error # Mismatched brace + on %r/\}/, :inline_delimiter, pop, decrement(:brace_depth) + + on %r/"/, push(:double_quoted_string, :string), :delimiter # String delimiters " and ' + on %r/'/, push(:single_quoted_string, :string), :delimiter + # ↓Prefix hex number ←|→ decimal number + on %r/-? (?:0x\h* \. \h+ (?:p[+\-]?\d+)? | \d*\.\d+ (?:e[+\-]?\d+)?)/ix, :float # hexadecimal constants have no E power, decimal ones no P power + # ↓Prefix hex number ←|→ decimal number + on %r/-? 
(?:0x\h+ (?:p[+\-]?\d+)? | \d+ (?:e[+\-]?\d+)?)/ix, :integer # hexadecimal constants have no E power, decimal ones no P power + on %r/[\+\-\*\/%^\#=~<>\(\)\[\]:;,] | \.(?!\d)/x, :operator # Operators + on %r/\s+/, :space # Space + end + + state :function_expected do + on %r/\(.*?\)/m, :operator, pop_state # x = function() # "Anonymous" function without explicit name + on %r/[a-zA-Z_] (?:[a-zA-Z0-9_\.] (?!\.\d))* [\.\:]/x, :ident # function tbl.subtbl.foo() | function tbl:foo() # Colon only allowed as last separator + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :function, pop_state # function foo() + on %r/\s+/, :space # Between the `function' keyword and the ident may be any amount of whitespace + end + + state :goto_label_expected do + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :label, pop_state + on %r/\s+/, :space # Between the `goto' keyword and the label may be any amount of whitespace + end + + state :local_var_expected do + on %r/function/, :keyword, pop_state, push_state(:function_expected) # local function ... 
+ on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :local_variable + on %r/,/, :operator + on %r/\=/, :operator, pop_state + on %r/\n/, :space, pop_state + on %r/\s+/, :space + end + + state :long_comment do + on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:comment) + on %r/.*/m, :error, pop(:comment) + end + + state :long_string do + on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:string) # Long strings do not interpret any escape sequences + on %r/.*/m, :error, pop(:string) + end + + state :single_quoted_string do + on %r/[^\\'\n]+/, :content # Everything except \ and the start delimiter character is string content (newlines are only allowed if preceeded by \ or \z) + on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char + on %r/'/, :delimiter, pop(:string) + on %r/\n/, :error, pop(:string) # Lua forbids unescaped newlines in normal non-long strings + # encoder.text_token("\\n\n", :error) # Visually appealing error indicator--otherwise users may wonder whether the highlighter cannot highlight multine strings + end + + state :double_quoted_string do + on %r/[^\\"\n]+/, :content # Everything except \ and the start delimiter character is string content (newlines are only allowed if preceeded by \ or \z) + on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char + on %r/"/, :delimiter, pop(:string) + on %r/\n/, :error, pop(:string) # Lua forbids unescaped newlines in normal non-long strings + # encoder.text_token("\\n\n", :error) # Visually appealing error indicator--otherwise users may wonder whether the highlighter cannot highlight multine strings + end + + protected + + def setup + super + + @brace_depth = 0 + @num_equals = nil + end + + def close_groups encoder, states + states.reverse_each do |state| + case state + when :long_string, :single_quoted_string, :double_quoted_string + encoder.end_group :string + when :long_comment + encoder.end_group :long_comment + when :map + 
encoder.end_group :map + end + end + end + end + +end +end diff --git a/lib/coderay/scanners/lua2b.rb b/lib/coderay/scanners/lua2b.rb new file mode 100644 index 0000000..9e2b1fe --- /dev/null +++ b/lib/coderay/scanners/lua2b.rb @@ -0,0 +1,157 @@ +# encoding: utf-8 + +module CodeRay +module Scanners + + # Scanner for the Lua[http://lua.org] programming lanuage. + # + # The language’s complete syntax is defined in + # {the Lua manual}[http://www.lua.org/manual/5.2/manual.html], + # which is what this scanner tries to conform to. + class Lua2 < RuleBasedScanner + + register_for :lua2 + file_extension 'lua' + title 'Lua' + + # Keywords used in Lua. + KEYWORDS = %w[and break do else elseif end + for function goto if in + local not or repeat return + then until while + ] + + # Constants set by the Lua core. + PREDEFINED_CONSTANTS = %w[false true nil] + + # The expressions contained in this array are parts of Lua’s `basic' + # library. Although it’s not entirely necessary to load that library, + # it is highly recommended and one would have to provide own implementations + # of some of these expressions if one does not do so. They however aren’t + # keywords, neither are they constants, but nearly predefined, so they + # get tagged as `predefined' rather than anything else. + # + # This list excludes values of form `_UPPERCASE' because the Lua manual + # requires such identifiers to be reserved by Lua anyway and they are + # highlighted directly accordingly, without the need for specific + # identifiers to be listed here. + PREDEFINED_EXPRESSIONS = %w[ + assert collectgarbage dofile error getmetatable + ipairs load loadfile next pairs pcall print + rawequal rawget rawlen rawset select setmetatable + tonumber tostring type xpcall + ] + + # Automatic token kind selection for normal words. + IDENT_KIND = CodeRay::WordList.new(:ident). + add(KEYWORDS, :keyword). + add(PREDEFINED_CONSTANTS, :predefined_constant). 
+ add(PREDEFINED_EXPRESSIONS, :predefined) + + state :initial, :map => :map do + on %r/\-\-\[\=*\[/, push(:long_comment, :comment), :delimiter, #--[[ long (possibly multiline) comment ]] + set(:num_equals, -> (match) { match.count('=') }) # Number must match for comment end + on %r/--.*$/, :comment # --Lua comment + on %r/\[=*\[/, push(:long_string, :string), :delimiter, # [[ long (possibly multiline) string ]] + set(:num_equals, -> (match) { match.count('=') }) # Number must match for string end + on %r/::\s*[a-zA-Z_][a-zA-Z0-9_]+\s*::/, :label # ::goto_label:: + on %r/_[A-Z]+/, :predefined # _UPPERCASE are names reserved for Lua + on check_if { |brace_depth| brace_depth > 0 }, %r/([a-zA-Z_][a-zA-Z0-9_]*) (\s+)?(=)/x, groups(:key, :space, :operator) + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, kind { |match| IDENT_KIND[match] }, push_state { |match, kind| # Normal letters (or letters followed by digits) + # Extra highlighting for entities following certain keywords + if kind == :keyword && match == 'function' + :function_expected + elsif kind == :keyword && match == 'goto' + :goto_label_expected + elsif kind == :keyword && match == 'local' + :local_var_expected + end + } + + on %r/\{/, push(:map), kind { |brace_depth| brace_depth > 0 ? :inline_delimiter : :delimiter }, increment(:brace_depth) # Opening table brace { + on check_if { |brace_depth| brace_depth == 1 }, %r/\}/, :delimiter, pop, decrement(:brace_depth) # Closing table brace } + on check_if { |brace_depth| brace_depth == 0 }, %r/\}/, :error # Mismatched brace + on %r/\}/, :inline_delimiter, pop, decrement(:brace_depth) + + on %r/"/, push(:double_quoted_string, :string), :delimiter # String delimiters " and ' + on %r/'/, push(:single_quoted_string, :string), :delimiter + # ↓Prefix hex number ←|→ decimal number + on %r/-? (?:0x\h* \. \h+ (?:p[+\-]?\d+)? | \d*\.\d+ (?:e[+\-]?\d+)?)/ix, :float # hexadecimal constants have no E power, decimal ones no P power + # ↓Prefix hex number ←|→ decimal number + on %r/-? 
(?:0x\h+ (?:p[+\-]?\d+)? | \d+ (?:e[+\-]?\d+)?)/ix, :integer # hexadecimal constants have no E power, decimal ones no P power + on %r/[\+\-\*\/%^\#=~<>\(\)\[\]:;,] | \.(?!\d)/x, :operator # Operators + on %r/\s+/, :space # Space + end + + state :function_expected do + on %r/\(.*?\)/m, :operator, pop_state # x = function() # "Anonymous" function without explicit name + on %r/[a-zA-Z_] (?:[a-zA-Z0-9_\.] (?!\.\d))* [\.\:]/x, :ident # function tbl.subtbl.foo() | function tbl:foo() # Colon only allowed as last separator + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :function, pop_state # function foo() + on %r/\s+/, :space # Between the `function' keyword and the ident may be any amount of whitespace + end + + state :goto_label_expected do + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :label, pop_state + on %r/\s+/, :space # Between the `goto' keyword and the label may be any amount of whitespace + end + + state :local_var_expected do + on %r/function/, :keyword, pop_state, push_state(:function_expected) # local function ... 
+ on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :local_variable + on %r/,/, :operator + on %r/\=/, :operator, pop_state + on %r/\n/, :space, pop_state + on %r/\s+/, :space + end + + state :long_comment => :comment do + on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:comment) + on %r/.*/m, :error, pop(:comment) + end + + state :long_string => :string do + on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:string) # Long strings do not interpret any escape sequences + on %r/.*/m, :error, pop(:string) + end + + state :single_quoted_string => :string do + on %r/[^\\'\n]+/, :content # Everything except \ and the start delimiter character is string content (newlines are only allowed if preceeded by \ or \z) + on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char + on %r/'/, :delimiter, pop(:string) + on %r/\n/, :error, pop(:string) # Lua forbids unescaped newlines in normal non-long strings + # encoder.text_token("\\n\n", :error) # Visually appealing error indicator--otherwise users may wonder whether the highlighter cannot highlight multine strings + end + + state :double_quoted_string => :string do + on %r/[^\\"\n]+/, :content # Everything except \ and the start delimiter character is string content (newlines are only allowed if preceeded by \ or \z) + on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char + on %r/"/, :delimiter, pop(:string) + on %r/\n/, :error, pop(:string) # Lua forbids unescaped newlines in normal non-long strings + # encoder.text_token("\\n\n", :error) # Visually appealing error indicator--otherwise users may wonder whether the highlighter cannot highlight multine strings + end + + protected + + def setup + super + + @brace_depth = 0 + @num_equals = nil + end + + def close_groups encoder, states + states.reverse_each do |state| + case state + when :long_string, :single_quoted_string, :double_quoted_string + encoder.end_group :string + when :long_comment + 
encoder.end_group :long_comment + when :map + encoder.end_group :map + end + end + end + end + +end +end diff --git a/lib/coderay/scanners/lua3.rb b/lib/coderay/scanners/lua3.rb new file mode 100644 index 0000000..d2d4280 --- /dev/null +++ b/lib/coderay/scanners/lua3.rb @@ -0,0 +1,142 @@ +# encoding: utf-8 +# Pseudocode: states optionally define groups, comments removed, counter definition? + +module CodeRay +module Scanners + + # Scanner for the Lua[http://lua.org] programming lanuage. + # + # The language’s complete syntax is defined in + # {the Lua manual}[http://www.lua.org/manual/5.2/manual.html], + # which is what this scanner tries to conform to. + class Lua3 < RuleBasedScannerX + + register_for :lua3 + file_extension 'lua' + title 'Lua' + + # Keywords used in Lua. + KEYWORDS = %w[and break do else elseif end + for function goto if in + local not or repeat return + then until while + ] + + # Constants set by the Lua core. + PREDEFINED_CONSTANTS = %w[false true nil] + + # The expressions contained in this array are parts of Lua’s `basic' + # library. Although it’s not entirely necessary to load that library, + # it is highly recommended and one would have to provide own implementations + # of some of these expressions if one does not do so. They however aren’t + # keywords, neither are they constants, but nearly predefined, so they + # get tagged as `predefined' rather than anything else. + # + # This list excludes values of form `_UPPERCASE' because the Lua manual + # requires such identifiers to be reserved by Lua anyway and they are + # highlighted directly accordingly, without the need for specific + # identifiers to be listed here. + PREDEFINED_EXPRESSIONS = %w[ + assert collectgarbage dofile error getmetatable + ipairs load loadfile next pairs pcall print + rawequal rawget rawlen rawset select setmetatable + tonumber tostring type xpcall + ] + + # Automatic token kind selection for normal words. + IDENT_KIND = CodeRay::WordList.new(:ident). 
+ add(KEYWORDS, :keyword). + add(PREDEFINED_CONSTANTS, :predefined_constant). + add(PREDEFINED_EXPRESSIONS, :predefined) + + protected + + # Scanner initialization. + def setup + super + @brace_depth = 0 + @num_equals = nil + end + + counter :brace_depth + + state :initial, :map => :map do + on %r/\-\-\[\=*\[/, push(:long_comment), :delimiter, set(:num_equals, -> (match) { match.count('=') }) + on %r/--.*$/, :comment + on %r/\[=*\[/, push(:long_string), :delimiter, set(:num_equals, -> (match) { match.count('=') }) + on %r/::\s*[a-zA-Z_][a-zA-Z0-9_]+\s*::/, :label + on %r/_[A-Z]+/, :predefined + on check_if(:brace_depth, :>, 0), %r/([a-zA-Z_][a-zA-Z0-9_]*) (\s+)?(=)/x, groups(:key, :space, :operator) + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, kind { |match| IDENT_KIND[match] }, push_state { |match, kind| + if kind == :keyword && match == 'function' + :function_expected + elsif kind == :keyword && match == 'goto' + :goto_label_expected + elsif kind == :keyword && match == 'local' + :local_var_expected + end + } + + on %r/\{/, push(:map), kind { |brace_depth| brace_depth > 0 ? :inline_delimiter : :delimiter }, increment(:brace_depth) + on check_if(:brace_depth, :==, 1), %r/\}/, :delimiter, pop, decrement(:brace_depth) + on check_if(:brace_depth, :==, 0), %r/\}/, :error + on %r/\}/, :inline_delimiter, pop, decrement(:brace_depth) + + on %r/"/, push(:double_quoted_string), :delimiter + on %r/'/, push(:single_quoted_string), :delimiter + + on %r/-? (?:0x\h* \. \h+ (?:p[+\-]?\d+)? | \d*\.\d+ (?:e[+\-]?\d+)?)/ix, :float + + on %r/-? (?:0x\h+ (?:p[+\-]?\d+)? | \d+ (?:e[+\-]?\d+)?)/ix, :integer + on %r/[\+\-\*\/%^\#=~<>\(\)\[\]:;,] | \.(?!\d)/x, :operator + on %r/\s+/, :space + end + + state :function_expected do + on %r/\(.*?\)/m, :operator, pop + on %r/[a-zA-Z_] (?:[a-zA-Z0-9_\.] 
(?!\.\d))* [\.\:]/x, :ident + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :function, pop + on %r/\s+/, :space + end + + state :goto_label_expected do + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :label, pop + on %r/\s+/, :space + end + + state :local_var_expected do + on %r/function/, :keyword, pop, push(:function_expected) + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :local_variable + on %r/,/, :operator + on %r/\=/, :operator, pop + on %r/\n/, :space, pop + on %r/\s+/, :space + end + + state :long_comment => :comment do + on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:comment) + on %r/.*/m, :error, pop(:comment) + end + + state :long_string => :string do + on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:string) + on %r/.*/m, :error, pop(:string) + end + + state :single_quoted_string => :string do + on %r/[^\\'\n]+/, :content + on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char + on %r/'/, :delimiter, pop(:string) + on %r/\n/, :error, pop(:string) + end + + state :double_quoted_string => :string do + on %r/[^\\"\n]+/, :content + on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char + on %r/"/, :delimiter, pop(:string) + on %r/\n/, :error, pop(:string) + end + end + +end +end diff --git a/lib/coderay/scanners/lua4.rb b/lib/coderay/scanners/lua4.rb new file mode 100644 index 0000000..0315d34 --- /dev/null +++ b/lib/coderay/scanners/lua4.rb @@ -0,0 +1,89 @@ +# encoding: utf-8 + +module CodeRay +module Scanners + + # Scanner for the Lua[http://lua.org] programming lanuage. + # + # The language’s complete syntax is defined in + # {the Lua manual}[http://www.lua.org/manual/5.2/manual.html], + # which is what this scanner tries to conform to. 
+ class Lua4 < RuleBasedScanner + + register_for :lua4 + file_extension 'lua' + title 'Lua' + + protected + + state :initial do + on %r'#!(.*?)$', :doctype + on %r//, push_state(:base) + end + + state :base do + on %r'--\[(=*)\[.*?\]\1\]'m, :comment + on %r'--.*$', :comment + + on %r'(\d*\.\d+|\d+\.\d*)(e[+-]?\d+)?'i, :float + on %r'\d+e[+-]?\d+'i, :float + on %r'0x[0-9a-f]*'i, :hex + on %r'\d+', :integer + + on %r'\n', :space + on %r'[^\S\n]', :space + # multiline strings + on %r'\[(=*)\[.*?\]\1\]'m, :string + + on %r'(==|~=|<=|>=|\.\.\.|\.\.|[=+\-*/%^<>#!.\\:])', :operator + on %r'[\[\]{}().,:;]', :operator + on %r'(and|or|not)\b', :operator + + on %r'(break|do|else|elseif|end|for|if|in|repeat|return|then|until|while)\b', :keyword + on %r'(local)\b', :keyword + on %r'(true|false|nil)\b', :predefined_constant + + on %r'(function)\b', :keyword, push_state(:funcname) + + on %r'[A-Za-z_]\w*(\.[A-Za-z_]\w*)?', :ident + + # on %r"'", :string, combined(:stringescape, :sqs) + on %r"'", :string, push_state(:sqs) + # on %r'"', :string, combined(:stringescape, :dqs) + on %r'"', :string, push_state(:dqs) + end + + state :funcname do + on %r'\s+', :space + on %r'(?:([A-Za-z_]\w*)(\.))?([A-Za-z_]\w*)', groups(:class, :operator, :function), pop_state + # inline function + on %r'\(', :operator, pop_state + end + + # if I understand correctly, every character is valid in a lua string, + # so this state is only for later corrections + # state :string do + # on %r'.', :string + # end + + # state :stringescape do + # on %r/\\([abfnrtv\\"']|\d{1,3})/, :escape + # end + + state :sqs do + on %r"'", :string, pop_state + # include(:string) + on %r/\\([abfnrtv\\"']|\d{1,3})/, :escape + on %r'.', :string + end + + state :dqs do + on %r'"', :string, pop_state + # include(:string) + on %r/\\([abfnrtv\\"']|\d{1,3})/, :escape + on %r'.', :string + end + end + +end +end diff --git a/lib/coderay/simple_scanner.rb b/lib/coderay/simple_scanner.rb new file mode 100644 index 0000000..6873f88 --- 
/dev/null +++ b/lib/coderay/simple_scanner.rb @@ -0,0 +1,40 @@ +require 'set' + +module CodeRay + module Scanners + class SimpleScanner < Scanner + extend SimpleScannerDSL + + class << self + def define_scan_tokens! + if ENV['PUTS'] + puts CodeRay.scan(scan_tokens_code, :ruby).terminal + puts "callbacks: #{callbacks.size}" + end + + class_eval <<-RUBY +def scan_tokens encoder, options +#{ scan_tokens_code.chomp.gsub(/^/, ' ' * 2) } +end + RUBY + end + end + + def scan_tokens tokens, options + self.class.define_scan_tokens! + + scan_tokens tokens, options + end + + protected + + def setup + @state = :initial + end + + def close_groups encoder, states + # TODO + end + end + end +end
\ No newline at end of file diff --git a/lib/coderay/simple_scanner_dsl.rb b/lib/coderay/simple_scanner_dsl.rb new file mode 100644 index 0000000..b3c8c57 --- /dev/null +++ b/lib/coderay/simple_scanner_dsl.rb @@ -0,0 +1,381 @@ +require 'set' + +module CodeRay + module Scanners + module SimpleScannerDSL + Pattern = Struct.new :pattern + Groups = Struct.new :token_kinds + Kind = Struct.new :token_kind + Push = Struct.new :state, :group + Pop = Struct.new :group + PushState = Struct.new :state + PopState = Class.new + Check = Struct.new :condition + CheckIf = Class.new Check + CheckUnless = Class.new Check + ValueSetter = Struct.new :targets, :value + Increment = Struct.new :targets, :operation, :value + Continue = Class.new + + State = Struct.new :names, :block, :dsl do + def initialize(*) + super + eval + end + + def eval + @first = true + + @code = "" + instance_eval(&block) + end + + def code + <<-RUBY +when #{names.map(&:inspect).join(', ')} +#{ rules_code.chomp.gsub(/^/, ' ') } + else +#{ handle_unexpected_char_code.chomp.gsub(/^/, ' ' * 2) } + end + RUBY + end + + protected + + def rules_code + @code + end + + def handle_unexpected_char_code + ''.tap do |code| + code << 'puts "no match for #{state.inspect} => skip char"' << "\n" if $DEBUG + code << 'encoder.text_token getch, :error' + end + end + + public + + def on? pattern + pattern_expression = pattern.inspect + @code << "#{'els' unless @first}if check(#{pattern_expression})\n" + + @first = true + yield + @code << "end\n" + + @first = false + end + + def on *pattern_and_actions + if index = pattern_and_actions.find_index { |item| !(item.is_a?(Check) || item.is_a?(Regexp) || item.is_a?(Pattern)) } + conditions = pattern_and_actions[0..index - 1] or raise 'I need conditions or a pattern!' + actions = pattern_and_actions[index..-1] or raise 'I need actions!' 
+ else + raise "invalid rule structure: #{pattern_and_actions.map(&:class)}" + end + + condition_expressions = [] + if conditions + for condition in conditions + case condition + when CheckIf + case condition.condition + when Proc + condition_expressions << "#{dsl.add_callback(condition.condition)}" + when Symbol + condition_expressions << "#{condition.condition}" + else + raise "I don't know how to evaluate this check_if condition: %p" % [condition.condition] + end + when CheckUnless + case condition.condition + when Proc + condition_expressions << "!#{dsl.add_callback(condition.condition)}" + when Symbol + condition_expressions << "!#{condition.condition}" + else + raise "I don't know how to evaluate this check_unless condition: %p" % [condition.condition] + end + when Pattern + case condition.pattern + when Proc + condition_expressions << "match = scan(#{dsl.add_callback(condition.pattern)})" + else + raise "I don't know how to evaluate this pattern: %p" % [condition.pattern] + end + when Regexp + condition_expressions << "match = scan(#{condition.inspect})" + else + raise "I don't know how to evaluate this pattern/condition: %p" % [condition] + end + end + end + + @code << "#{'els' unless @first}if #{condition_expressions.join(' && ')}\n" + + for action in actions + case action + when String + raise + @code << "p 'evaluate #{action.inspect}'\n" if $DEBUG + @code << "#{action}\n" + + when Symbol + @code << "p 'text_token %p %p' % [match, #{action.inspect}]\n" if $DEBUG + @code << "encoder.text_token match, #{action.inspect}\n" + when Kind + case action.token_kind + when Proc + @code << "encoder.text_token match, kind = #{dsl.add_callback(action.token_kind)}\n" + else + raise "I don't know how to evaluate this kind: %p" % [action.token_kind] + end + when Groups + @code << "p 'text_tokens %p in groups %p' % [match, #{action.token_kinds.inspect}]\n" if $DEBUG + action.token_kinds.each_with_index do |kind, i| + @code << "encoder.text_token self[#{i + 1}], 
#{kind.inspect} if self[#{i + 1}]\n" + end + + when Push, PushState + case action.state + when String + raise + @code << "p 'push %p' % [#{action.state}]\n" if $DEBUG + @code << "state = #{action.state}\n" + @code << "states << state\n" + when Symbol + @code << "p 'push %p' % [#{action.state.inspect}]\n" if $DEBUG + @code << "state = #{action.state.inspect}\n" + @code << "states << state\n" + when Proc + @code << "if new_state = #{dsl.add_callback(action.state)}\n" + @code << " state = new_state\n" + @code << " states << new_state\n" + @code << "end\n" + else + raise "I don't know how to evaluate this push state: %p" % [action.state] + end + if action.is_a? Push + if action.state == action.group + @code << "encoder.begin_group state\n" + else + case action.state + when Symbol + @code << "p 'begin group %p' % [#{action.group.inspect}]\n" if $DEBUG + @code << "encoder.begin_group #{action.group.inspect}\n" + when Proc + @code << "encoder.begin_group #{dsl.add_callback(action.group)}\n" + else + raise "I don't know how to evaluate this push state: %p" % [action.state] + end + end + end + when Pop, PopState + @code << "p 'pop %p' % [states.last]\n" if $DEBUG + if action.is_a? 
Pop + if action.group + case action.group + when Symbol + @code << "encoder.end_group #{action.group.inspect}\n" + else + raise "I don't know how to evaluate this pop group: %p" % [action.group] + end + @code << "states.pop\n" + else + @code << "encoder.end_group states.pop\n" + end + else + @code << "states.pop\n" + end + @code << "state = states.last\n" + + when ValueSetter + case action.value + when Proc + @code << "#{action.targets.join(' = ')} = #{dsl.add_callback(action.value)}\n" + when Symbol + @code << "#{action.targets.join(' = ')} = #{action.value}\n" + else + @code << "#{action.targets.join(' = ')} = #{action.value.inspect}\n" + end + + when Increment + case action.value + when Proc + @code << "#{action.targets.join(' = ')} #{action.operation}= #{dsl.add_callback(action.value)}\n" + when Symbol + @code << "#{action.targets.join(' = ')} #{action.operation}= #{action.value}\n" + else + @code << "#{action.targets.join(' = ')} #{action.operation}= #{action.value.inspect}\n" + end + + when Proc + @code << "#{dsl.add_callback(action)}\n" + + when Continue + @code << "next\n" + + else + raise "I don't know how to evaluate this action: %p" % [action] + end + end + + @first = false + end + + def groups *token_kinds + Groups.new token_kinds + end + + def pattern pattern = nil, &block + Pattern.new pattern || block + end + + def kind token_kind = nil, &block + Kind.new token_kind || block + end + + def push state = nil, group = state, &block + raise 'push requires a state or a block; got nothing' unless state || block + Push.new state || block, group || block + end + + def pop group = nil + Pop.new group + end + + def push_state state = nil, &block + raise 'push_state requires a state or a block; got nothing' unless state || block + PushState.new state || block + end + + def pop_state + PopState.new + end + + def check_if value = nil, &callback + CheckIf.new value || callback + end + + def check_unless value = nil, &callback + CheckUnless.new value || callback + 
end + + def flag_on *flags + flags.each { |name| dsl.add_variable name } + ValueSetter.new Array(flags), true + end + + def flag_off *flags + flags.each { |name| dsl.add_variable name } + ValueSetter.new Array(flags), false + end + + def set flag, value = nil, &callback + dsl.add_variable flag + ValueSetter.new [flag], value || callback + end + + def unset *flags + flags.each { |name| dsl.add_variable name } + ValueSetter.new Array(flags), nil + end + + def increment *counters + counters.each { |name| dsl.add_variable name } + Increment.new Array(counters), :+, 1 + end + + def decrement *counters + counters.each { |name| dsl.add_variable name } + Increment.new Array(counters), :-, 1 + end + + def continue + Continue.new + end + end + + attr_accessor :states + + def state *names, &block + @states ||= [] + @states << State.new(names, block, self) + end + + def add_callback block + base_name = "__callback_line_#{block.source_location.last}" + callback_name = base_name + counter = 'a' + while callbacks.key?(callback_name) + callback_name = "#{base_name}_#{counter}" + counter.succ! + end + + callbacks[callback_name] = define_method(callback_name, &block) + + parameters = block.parameters + + if parameters.empty? + callback_name + else + parameter_names = parameters.map(&:last) + parameter_names.each { |name| variables << name } + "#{callback_name}(#{parameter_names.join(', ')})" + end + end + + def add_variable name + variables << name + end + + protected + + def callbacks + @callbacks ||= {} + end + + def variables + @variables ||= Set.new + end + + def additional_variables + variables - %i(encoder options state states match kind) + end + + def scan_tokens_code + <<-"RUBY" +state = options[:state] || @state +states = [state] +#{ restore_local_variables_code.chomp } + +until eos? 
+ case state +#{ states_code.chomp.gsub(/^/, ' ') } + else + raise_inspect 'Unknown state: %p' % [state], encoder + end +end + +@state = state if options[:keep_state] + +#{ close_groups_code.chomp } + +encoder + RUBY + end + + def restore_local_variables_code + additional_variables.sort.map { |name| "#{name} = @#{name}" }.join("\n") + end + + def states_code + @states.map(&:code)[0,1].join + end + + def close_groups_code + 'close_groups(encoder, states)' + end + end + end +end
\ No newline at end of file diff --git a/lib/coderay/state_based_scanner.rb b/lib/coderay/state_based_scanner.rb new file mode 100644 index 0000000..b196adc --- /dev/null +++ b/lib/coderay/state_based_scanner.rb @@ -0,0 +1,394 @@ +require 'set' + +module CodeRay + module Scanners + class StateBasedScanner < Scanner + class State + attr_reader :names + attr_reader :rules + attr_reader :scanner + + def initialize scanner, names, &block + @scanner = scanner + @names = names + + @rules = [] + @check = nil + + instance_eval(&block) + end + + def rules_code + <<-RUBY +when #{names.map(&:inspect).join(', ')} +#{rules.map.with_index { |rule, index| rule.code(first: index.zero?) }.join} + else + puts "no match for \#{state.inspect} => skip character" if $DEBUG + encoder.text_token getch, :error + end + + RUBY + end + + protected + + # structure + def check *conditions, &block + return @check unless conditions.any? || block + raise "Can't nest check yet" if @check + + @check = Conditions.new(conditions) + instance_eval(&block) + @check = nil + end + + # rules + def on pattern, *actions, &block + @rules << Rule.new(self, pattern, *actions, check: @check, &block) + end + + def skip pattern, *actions, &block + @rules << Rule.new(self, pattern, *actions, check: @check, skip: true, &block) + end + + def otherwise *actions, &block + @rules << Rule.new(self, //, *actions, check: @check, skip: true, &block) + end + + # actions + def push state + Push.new(state) + end + + def pop + Pop.new + end + + def kind token_kind = nil, &block + Kind.new token_kind || scanner.callback(block) + end + + def groups *token_kinds + Groups.new(token_kinds) + end + + def set target, value = nil, &block + Setter.new target, value || block || true + end + + def callback block + scanner.callback(block) + end + + # magic flag getters + def method_missing method, *args, &block + method_name = method.to_s + if method_name.end_with?('?') + Getter.new(scanner.variable(method_name.chomp('?'))) + else + super + 
end + end + end + + class GroupState < State + end + + class Rule + attr_reader :pattern + attr_reader :actions + attr_reader :check + attr_reader :state + + def initialize state, pattern, *actions, check:, skip: false, &block + @state = state + @pattern = (skip ? Skip : Scan).new(pattern) + @actions = *build_actions(actions, block) + @check = check + + raise [pattern, *actions, check, skip, block].inspect if check == false + end + + def code first: + <<-RUBI + #{'els' unless first}if #{condition_expression} +#{actions_code.gsub(/^/, ' ' * 2)} + RUBI + end + + def skip? + @pattern.is_a?(Skip) + end + + protected + + def condition_expression + [check, pattern].compact.map(&:code).join(' && ') + end + + def actions_code + actions.map(&:code).join("\n") + end + + def build_actions actions, block + actions += [block] if block + + actions.map do |action| + case action + when Symbol + Token.new(action) + when Proc + state.instance_eval do + callback action + end + when WordList + state.instance_eval do + kind { |match| action[match] } + end + when Push, Pop, Groups, Kind, Setter + action + else + raise "Don't know how to build action for %p (%p)" % [action, action.class] + end + end + end + end + + # conditions + class Conditions < Struct.new(:conditions) + def code + "#{conditions.map(&:code).join(' && ')}" + end + end + + class Scan < Struct.new(:pattern) + def code + "match = scan(#{pattern.inspect})" + end + end + + class Skip < Scan + end + + class Getter < Struct.new(:name, :negative) + def code + "#{negative && '!'}#{name}" + end + + def !@ + negative + end + + protected + + def negative + @negative ||= Getter.new(name, :negative) + end + end + + # actions + class Push < Struct.new :state + def code + "push" + end + end + + class Pop < Class.new + def code + "pop" + end + end + + class Groups < Struct.new(:token_kinds) + def code + "groups" + end + end + + class Setter < Struct.new(:name, :value) + def code + "set" + end + end + + + class Kind < 
Struct.new(:token_kind) + def code + case token_kind + when Callback + "encoder.text_token match, kind = #{token_kind.code}\n" + else + raise "I don't know how to evaluate this kind: %p" % [token_kind] + end + end + end + + class Token < Struct.new(:name) + def code + "encoder.text_token match, #{name.inspect}" + end + end + + class Callback < Struct.new(:name, :block) + def code + if parameter_names.empty? + name + else + "#{name}(#{parameter_names.join(', ')})" + end + end + + protected + + def parameter_names + block.parameters.map(&:last) + end + end + + class << self + def states + @states ||= {} + end + + def scan_tokens tokens, options + self.class.define_scan_tokens! + + scan_tokens tokens, options + end + + def define_scan_tokens! + if ENV['PUTS'] + puts CodeRay.scan(scan_tokens_code, :ruby).terminal + puts "callbacks: #{callbacks.size}" + end + + class_eval scan_tokens_code + end + + def variable name + variables << name.to_sym + + name + end + + def callback block + return unless block + + callback_name = name_for_callback(block) + callbacks[callback_name] = define_method(callback_name, &block) + block.parameters.map(&:last).each { |name| variable name } + + Callback.new(callback_name, block) + end + + protected + + def state *names, state_class: State, &block + state_class.new(self, names, &block).tap do |state| + for name in names + states[name] = state + end + end + end + + def group_state *names, &block + state(*names, state_class: GroupState, &block) + end + + def callbacks + @callbacks ||= {} + end + + def variables + @variables ||= Set.new + end + + def additional_variables + variables - %i(encoder options state states match kind) + end + + def name_for_callback block + base_name = "__callback_line_#{block.source_location.last}" + callback_name = base_name + counter = 'a' + + while callbacks.key?(callback_name) + callback_name = "#{base_name}_#{counter}" + counter.succ! 
+ end + + callback_name + end + + def scan_tokens_code + <<-"RUBY" + def scan_tokens encoder, options + state = options[:state] || @state + +#{ restore_local_variables_code.chomp.gsub(/^/, ' ' * 3) } + + states = [state] + + until eos? + case state +#{ states_code.chomp.gsub(/^/, ' ' * 4) } + else + raise_inspect 'Unknown state: %p' % [state], encoder + end + end + + if options[:keep_state] + @state = state + end + +#{ close_groups_code.chomp.gsub(/^/, ' ' * 3) } + + encoder + end + RUBY + end + + def states_code + states.values.map(&:rules_code).join + end + + def restore_local_variables_code + additional_variables.sort.map { |name| "#{name} = @#{name}" }.join("\n") + end + + def close_groups_code + "close_groups(encoder, states)" + end + end + + def scan_tokens tokens, options + self.class.define_scan_tokens! + + scan_tokens tokens, options + end + + protected + + def setup + @state = :initial + reset_expectations + end + + def close_groups encoder, states + # TODO + end + + def expect kind + @expected = kind + end + + def expected? kind + @expected == kind + end + + def reset_expectations + @expected = nil + end + end + end +end diff --git a/lib/coderay/version.rb b/lib/coderay/version.rb index f5e7a39..ed87d63 100644 --- a/lib/coderay/version.rb +++ b/lib/coderay/version.rb @@ -1,3 +1,3 @@ module CodeRay - VERSION = '1.1.2' + VERSION = '2.0.0' end diff --git a/rake_tasks/test.rake b/rake_tasks/test.rake index 1a23a5b..58e6daa 100644 --- a/rake_tasks/test.rake +++ b/rake_tasks/test.rake @@ -37,7 +37,7 @@ Please rename or remove it and run again to use the GitHub repository: else puts 'Downloading scanner test suite...' 
sh 'git clone https://github.com/rubychan/coderay-scanner-tests.git test/scanners/' - end + end unless ENV['SKIP_UPDATE_SCANNER_SUITE'] end namespace :scanner do @@ -48,6 +48,11 @@ Please rename or remove it and run again to use the GitHub repository: task lang => :update_scanner_suite do ruby "./test/scanners/suite.rb #{lang}" end + (1..5).each do |i| + task "#{lang}:#{i}" => :update_scanner_suite do + ruby "./test/scanners/suite.rb #{lang}:#{i}" + end + end end end diff --git a/spec/simple_scanner_spec.rb b/spec/simple_scanner_spec.rb new file mode 100644 index 0000000..088343c --- /dev/null +++ b/spec/simple_scanner_spec.rb @@ -0,0 +1,28 @@ +RSpec.describe CodeRay::Scanners::SimpleScanner do + let(:scanner) { Class.new described_class } + + describe '#scan_tokens_code' do + subject { scanner.send :scan_tokens_code } + it 'lets you define states' do + is_expected.to eq <<-RUBY +state = options[:state] || @state +states = [state] + + +until eos? + case state + + else + raise_inspect 'Unknown state: %p' % [state], encoder + end +end + +@state = state if options[:keep_state] + +close_groups(encoder, states) + +encoder + RUBY + end + end +end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb new file mode 100644 index 0000000..49b6a0e --- /dev/null +++ b/spec/spec_helper.rb @@ -0,0 +1,96 @@ +# This file was generated by the `rspec --init` command. Conventionally, all +# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`. +# The generated `.rspec` file contains `--require spec_helper` which will cause +# this file to always be loaded, without a need to explicitly require it in any +# files. +# +# Given that it is always loaded, you are encouraged to keep this file as +# light-weight as possible. Requiring heavyweight dependencies from this file +# will add to the boot time of your test suite on EVERY test run, even for an +# individual file that may not need all of that loaded. 
Instead, consider making +# a separate helper file that requires the additional dependencies and performs +# the additional setup, and require it from the spec files that actually need +# it. +# +# The `.rspec` file also contains a few flags that are not defaults but that +# users commonly want. +# +# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration +RSpec.configure do |config| + # rspec-expectations config goes here. You can use an alternate + # assertion/expectation library such as wrong or the stdlib/minitest + # assertions if you prefer. + config.expect_with :rspec do |expectations| + # This option will default to `true` in RSpec 4. It makes the `description` + # and `failure_message` of custom matchers include text for helper methods + # defined using `chain`, e.g.: + # be_bigger_than(2).and_smaller_than(4).description + # # => "be bigger than 2 and smaller than 4" + # ...rather than: + # # => "be bigger than 2" + expectations.include_chain_clauses_in_custom_matcher_descriptions = true + end + + # rspec-mocks config goes here. You can use an alternate test double + # library (such as bogus or mocha) by changing the `mock_with` option here. + config.mock_with :rspec do |mocks| + # Prevents you from mocking or stubbing a method that does not exist on + # a real object. This is generally recommended, and will default to + # `true` in RSpec 4. + mocks.verify_partial_doubles = true + end + + # These two settings work together to allow you to limit a spec run + # to individual examples or groups you care about by tagging them with + # `:focus` metadata. When nothing is tagged with `:focus`, all examples + # get run. + config.filter_run :focus + config.run_all_when_everything_filtered = true + + # Allows RSpec to persist some state between runs in order to support + # the `--only-failures` and `--next-failure` CLI options. We recommend + # you configure your source control system to ignore this file. 
+ config.example_status_persistence_file_path = "spec/examples.txt" + + # Limits the available syntax to the non-monkey patched syntax that is + # recommended. For more details, see: + # - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/ + # - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/ + # - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode + config.disable_monkey_patching! + + # This setting enables warnings. It's recommended, but in some cases may + # be too noisy due to issues in dependencies. + config.warnings = true + + # Many RSpec users commonly either run the entire suite or an individual + # file, and it's useful to allow more verbose output when running an + # individual spec file. + if config.files_to_run.one? + # Use the documentation formatter for detailed output, + # unless a formatter has already been configured + # (e.g. via a command-line flag). + config.default_formatter = 'doc' + end + + # Print the 10 slowest examples and example groups at the + # end of the spec run, to help surface which specs are running + # particularly slow. + config.profile_examples = 10 + + # Run specs in random order to surface order dependencies. If you find an + # order dependency and want to debug it, you can fix the order by providing + # the seed, which is printed after each run. + # --seed 1234 + config.order = :random + + # Seed global randomization in this process using the `--seed` CLI option. + # Setting this allows you to use `--seed` to deterministically reproduce + # test failures related to randomization by passing the same `--seed` value + # as the one that triggered the failure. + Kernel.srand config.seed +end + +$LOAD_PATH << 'lib/coderay' + +require 'coderay' |