summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Kornelius Kalnbach <murphy@rubychan.de> 2017-11-05 16:13:51 +0100
committer Kornelius Kalnbach <murphy@rubychan.de> 2017-11-05 16:13:51 +0100
commit 579c00bab003658e1bffd989ab15719a188cb7b0 (patch)
tree d1a9e476b21ba03b841ac4d17510675ee8fea867
parent 8d46c46bf63173a554cea4292b458293ccefb0a4 (diff)
download coderay-579c00bab003658e1bffd989ab15719a188cb7b0.tar.gz
testing SingleStateRuleBasedScanner; not faster :(
-rw-r--r-- lib/coderay.rb 1
-rw-r--r-- lib/coderay/scanners/_map.rb 1
-rw-r--r-- lib/coderay/scanners/java_script6.rb 162
-rw-r--r-- lib/coderay/single_state_rule_based_scanner.rb 370
-rw-r--r-- rake_tasks/test.rake 2
5 files changed, 535 insertions, 1 deletions
diff --git a/lib/coderay.rb b/lib/coderay.rb
index c1c9e34..e43f6bb 100644
--- a/lib/coderay.rb
+++ b/lib/coderay.rb
@@ -155,6 +155,7 @@ module CodeRay
# DSL Scanner
autoload :RuleBasedScanner, coderay_path('rule_based_scanner')
+ autoload :SingleStateRuleBasedScanner, coderay_path('single_state_rule_based_scanner')
autoload :StateBasedScanner, coderay_path('state_based_scanner')
# convenience access and reusable Encoder/Scanner pair
diff --git a/lib/coderay/scanners/_map.rb b/lib/coderay/scanners/_map.rb
index 61079d5..82fb17f 100644
--- a/lib/coderay/scanners/_map.rb
+++ b/lib/coderay/scanners/_map.rb
@@ -15,6 +15,7 @@ module Scanners
:javascript3 => :java_script3,
:javascript4 => :java_script4,
:javascript5 => :java_script5,
+ :javascript6 => :java_script6,
:js => :java_script,
:pascal => :delphi,
:patch => :diff,
diff --git a/lib/coderay/scanners/java_script6.rb b/lib/coderay/scanners/java_script6.rb
new file mode 100644
index 0000000..b745bd4
--- /dev/null
+++ b/lib/coderay/scanners/java_script6.rb
@@ -0,0 +1,162 @@
+# TODO: string_delimiter should be part of the state: push(:regexp, '/'), check_if -> (state, delimiter) { … }
+module CodeRay
+module Scanners
+
+ # Scanner for JavaScript.
+ #
+ # Alias: +javascript6+
+ class JavaScript6 < SingleStateRuleBasedScanner
+
+ register_for :java_script6
+ file_extension 'js'
+
+ # The actual JavaScript keywords.
+ KEYWORDS = %w[
+ break case catch continue default delete do else
+ finally for function if in instanceof new
+ return switch throw try typeof var void while with
+ ] # :nodoc:
+ # Identifiers that IDENT_KIND classifies as :predefined_constant.
+ PREDEFINED_CONSTANTS = %w[
+ false null true undefined NaN Infinity
+ ] # :nodoc:
+
+ # Identifiers that IDENT_KIND classifies as :local_variable.
+ MAGIC_VARIABLES = %w[ this arguments ] # :nodoc: arguments was introduced in JavaScript 1.4
+
+ # Looked up by the identifier rule of the :initial state; its result is
+ # stored in value_expected after an identifier has been scanned.
+ KEYWORDS_EXPECTING_VALUE = WordList.new.add %w[
+ case delete in instanceof new return throw typeof with
+ ] # :nodoc:
+
+ # Reserved for future use.
+ RESERVED_WORDS = %w[
+ abstract boolean byte char class debugger double enum export extends
+ final float goto implements import int interface long native package
+ private protected public short static super synchronized throws transient
+ volatile
+ ] # :nodoc:
+
+ # Maps an identifier to its token kind; words in none of the lists below
+ # get the default kind :ident.
+ IDENT_KIND = WordList.new(:ident).
+ add(RESERVED_WORDS, :reserved).
+ add(PREDEFINED_CONSTANTS, :predefined_constant).
+ add(MAGIC_VARIABLES, :local_variable).
+ add(KEYWORDS, :keyword) # :nodoc:
+
+ # Escape bodies; the :string/:key and :regexp states interpolate these
+ # after a literal backslash.
+ ESCAPE = / [bfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc:
+ UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc:
+ REGEXP_ESCAPE = / [bBdDsSwW] /x # :nodoc:
+ # Per delimiter: a run of characters containing neither a backslash nor
+ # that delimiter.
+ STRING_CONTENT_PATTERN = {
+ "'" => /[^\\']+/,
+ '"' => /[^\\"]+/,
+ '/' => /[^\\\/]+/,
+ } # :nodoc:
+ # Per quote character: lookahead used right after an opening quote to test
+ # whether the string body, its closing quote and optional whitespace are
+ # followed by a colon (the quote rule then pushes :key instead of :string).
+ KEY_CHECK_PATTERN = {
+ "'" => / (?> [^\\']* (?: \\. [^\\']* )* ) ' \s* : /mx,
+ '"' => / (?> [^\\"]* (?: \\. [^\\"]* )* ) " \s* : /mx,
+ } # :nodoc:
+
+ # Rules for code outside of strings and regexp literals. The rules are
+ # compiled into one if/elsif chain, so they are tried in the order written.
+ state :initial do
+ # Whitespace or a backslash-newline; value_expected stays set, or becomes
+ # truthy when the matched text contains a newline.
+ on %r/ \s+ | \\\n /x, :space, set(:value_expected) { |match, value_expected| value_expected || match.index(?\n) }
+ # Line comments and block comments (a block comment without a closing
+ # delimiter runs to the end of the input); clears value_expected.
+ on %r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .*() ) !mx, :comment, flag_off(:value_expected)
+ # state = :open_multi_line_comment if self[1]
+
+ # Guard: the numeric rules are only tried when a digit (optionally
+ # preceded by a dot) is next.
+ on? %r/\.?\d/ do
+ on %r/0[xX][0-9A-Fa-f]+/, :hex, flag_off(:key_expected, :value_expected)
+ on %r/(?>0[0-7]+)(?![89.eEfF])/, :octal, flag_off(:key_expected, :value_expected)
+ on %r/\d+[fF]|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/, :float, flag_off(:key_expected, :value_expected)
+ on %r/\d+/, :integer, flag_off(:key_expected, :value_expected)
+ end
+
+ # Only where a value is expected: a self-closing tag, or a tag up to the
+ # first matching closing tag, is handed to the XML scanner.
+ on check_if(:value_expected), %r/<([[:alpha:]]\w*) (?: [^\/>]*\/> | .*?<\/\1>)/xim, -> (match, encoder) do
+ # TODO: scan over nested tags
+ xml_scanner.tokenize match, :tokens => encoder
+ end, flag_off(:value_expected)
+
+ # Operator run that does not end in "{" or "," (or dots not followed by a
+ # digit): a value may follow, a key may not.
+ on %r/ [-+*=<>?:;,!&^|(\[{~%]++ (?<![{,]) | \.+(?!\d) /x, :operator, flag_on(:value_expected), flag_off(:key_expected, :function_expected)
+ # Operator run ending in "{" or ",": a value or an object key may follow.
+ on %r/ [-+*=<>?:;,!&^|(\[{~%]*+ (?<=[{,]) /x, :operator, flag_on(:value_expected, :key_expected), flag_off(:function_expected)
+ # Closing brackets clear all three flags.
+ on %r/ [)\]}]+ /x, :operator, flag_off(:function_expected, :key_expected, :value_expected)
+
+ # The "function" keyword marks the next identifier as a function name.
+ on %r/ function (?![A-Za-z_0-9$]) /x, :keyword, flag_on(:function_expected), flag_off(:key_expected, :value_expected)
+ # Any other identifier: the kind comes from IDENT_KIND and, for plain
+ # identifiers, is refined by the flags and a lookahead.
+ on %r/ [$a-zA-Z_][A-Za-z_0-9$]* /x, kind { |match, function_expected, key_expected|
+ kind = IDENT_KIND[match]
+ # TODO: labels
+ if kind == :ident
+ if match.index(?$) # $ allowed inside an identifier
+ kind = :predefined
+ elsif function_expected
+ kind = :function
+ elsif check(/\s*[=:]\s*function\b/)
+ kind = :function
+ elsif key_expected && check(/\s*:/)
+ kind = :key
+ end
+ end
+
+ kind
+ }, flag_off(:function_expected, :key_expected), set(:value_expected) { |match| KEYWORDS_EXPECTING_VALUE[match] }
+
+ # Opening quote: enter :key when a key is expected and the lookahead finds
+ # a colon after the closing quote, otherwise :string; remember the quote
+ # character in string_delimiter.
+ on %r/["']/, push { |match, key_expected| key_expected && check(KEY_CHECK_PATTERN[match]) ? :key : :string }, :delimiter, set(:string_delimiter) { |match| match }
+ # A slash where a value is expected opens a regexp literal ...
+ on check_if(:value_expected), %r/\//, push(:regexp), :delimiter
+
+ # ... any other slash is an operator.
+ on %r/\//, :operator, flag_on(:value_expected), flag_off(:key_expected)
+ end
+
+ # Rules inside a quoted string or quoted object key.
+ state :string, :key do
+ # Plain content; the pattern is picked at scan time from the delimiter
+ # that opened the string, and it also consumes the other quote character.
+ on pattern { |string_delimiter| STRING_CONTENT_PATTERN[string_delimiter] }, :content
+ # A quote that the content rule did not consume closes the string.
+ on %r/["']/, :delimiter, unset(:string_delimiter), flag_off(:key_expected, :value_expected), pop
+ # Known escapes are :char, except that inside single quotes only \\ and
+ # \' count; every other known escape is reported as :content there.
+ on %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /x, kind { |match, string_delimiter|
+ string_delimiter == "'" && !(match == "\\\\" || match == "\\'") ? :content : :char
+ }
+ # Backslash followed by any other character.
+ on %r/ \\. /mx, :content
+ # A backslash that none of the rules above matched: flag an error and
+ # leave the string.
+ on %r/ \\ /x, unset(:string_delimiter), flag_off(:key_expected, :value_expected), pop, :error
+ end
+
+ # Rules inside a regexp literal.
+ state :regexp do
+ # Anything up to the next backslash or slash.
+ on STRING_CONTENT_PATTERN['/'], :content
+ # Closing slash plus optional g/i/m modifiers, emitted as two tokens.
+ on %r/(\/)([gim]+)?/, groups(:delimiter, :modifier), flag_off(:key_expected, :value_expected), pop
+ # Known string, regexp and unicode escapes.
+ on %r/ \\ (?: #{ESCAPE} | #{REGEXP_ESCAPE} | #{UNICODE_ESCAPE} ) /x, :char
+ # Backslash followed by any other character.
+ on %r/\\./m, :content
+ # A backslash that none of the rules above matched: flag an error and
+ # leave the regexp.
+ on %r/ \\ /x, pop, :error, flag_off(:key_expected, :value_expected)
+ end
+
+ # state :open_multi_line_comment do
+ # on %r! .*? \*/ !mx, :initial # don't consume!
+ # on %r/ .+ /mx, :comment, -> { value_expected = true }
+ #
+ # # if match = scan(%r! .*? \*/ !mx)
+ # # state = :initial
+ # # else
+ # # match = scan(%r! .+ !mx)
+ # # end
+ # # value_expected = true
+ # # encoder.text_token match, :comment if match
+ # end
+
+ protected
+
+ def setup
+ super
+
+ @string_delimiter = nil
+ @value_expected = true
+ @key_expected = false
+ @function_expected = false
+ end
+
+ def close_groups encoder, state
+ if [:string, :key, :regexp].include? state
+ encoder.end_group state
+ end
+ end
+
+ def reset_instance
+ super
+ @xml_scanner.reset if defined? @xml_scanner
+ end
+
+ def xml_scanner
+ @xml_scanner ||= CodeRay.scanner :xml, :tokens => @tokens, :keep_tokens => true, :keep_state => false
+ end
+
+ end
+
+end
+end
diff --git a/lib/coderay/single_state_rule_based_scanner.rb b/lib/coderay/single_state_rule_based_scanner.rb
new file mode 100644
index 0000000..cd8d4a4
--- /dev/null
+++ b/lib/coderay/single_state_rule_based_scanner.rb
@@ -0,0 +1,370 @@
+require 'set'
+
+module CodeRay
+ module Scanners
+ class SingleStateRuleBasedScanner < Scanner
+
+ # Value objects produced by the DSL helpers below and consumed by +on+.
+ #
+ # Condition: a pattern computed at scan time by a callback.
+ Pattern = Struct.new :pattern
+ # Action: one token kind per capture group of the matched pattern.
+ Groups = Struct.new :token_kinds
+ # Action: token kind computed by a callback.
+ Kind = Struct.new :token_kind
+ # Action: switch to +state+ and begin a token group.
+ Push = Struct.new :state, :group
+ # Action: end a token group and return to the :initial state.
+ Pop = Struct.new :group
+ # Action: switch to +state+ without beginning a group.
+ PushState = Struct.new :state
+ # Action: return to the :initial state without ending a group.
+ PopState = Class.new
+ # Conditions that do not scan: a flag name or a callback.
+ Check = Struct.new :condition
+ CheckIf = Class.new Check
+ CheckUnless = Class.new Check
+ # Action: assign +value+ to every variable named in +targets+.
+ ValueSetter = Struct.new :targets, :value
+ # Action: apply "<operation>= value" to the variables named in +targets+.
+ Increment = Struct.new :targets, :operation, :value
+ # Action: emits +next+ into the generated scan loop.
+ Continue = Class.new
+
+ class << self
+ attr_accessor :states
+
+ def state *names, &block
+ @code ||= ""
+
+ @code << "when #{names.map(&:inspect).join(', ')}\n"
+
+ @first = true
+ instance_eval(&block)
+ @code << " else\n"
+ @code << " puts \"no match for \#{state.inspect} => skip char\"\n" if $DEBUG
+ @code << " encoder.text_token getch, :error\n"
+ @code << " end\n"
+ @code << " \n"
+ end
+
+ def on? pattern
+ pattern_expression = pattern.inspect
+ @code << " #{'els' unless @first}if check(#{pattern_expression})\n"
+
+ @first = true
+ yield
+ @code << " end\n"
+
+ @first = false
+ end
+
+ def on *pattern_and_actions
+ if index = pattern_and_actions.find_index { |item| !(item.is_a?(Check) || item.is_a?(Regexp) || item.is_a?(Pattern)) }
+ conditions = pattern_and_actions[0..index - 1] or raise 'I need conditions or a pattern!'
+ actions = pattern_and_actions[index..-1] or raise 'I need actions!'
+ else
+ raise "invalid rule structure: #{pattern_and_actions.map(&:class)}"
+ end
+
+ condition_expressions = []
+ if conditions
+ for condition in conditions
+ case condition
+ when CheckIf
+ case condition.condition
+ when Proc
+ condition_expressions << "#{make_callback(condition.condition)}"
+ when Symbol
+ condition_expressions << "#{condition.condition}"
+ else
+ raise "I don't know how to evaluate this check_if condition: %p" % [condition.condition]
+ end
+ when CheckUnless
+ case condition.condition
+ when Proc
+ condition_expressions << "!#{make_callback(condition.condition)}"
+ when Symbol
+ condition_expressions << "!#{condition.condition}"
+ else
+ raise "I don't know how to evaluate this check_unless condition: %p" % [condition.condition]
+ end
+ when Pattern
+ case condition.pattern
+ when Proc
+ condition_expressions << "match = scan(#{make_callback(condition.pattern)})"
+ else
+ raise "I don't know how to evaluate this pattern: %p" % [condition.pattern]
+ end
+ when Regexp
+ condition_expressions << "match = scan(#{condition.inspect})"
+ else
+ raise "I don't know how to evaluate this pattern/condition: %p" % [condition]
+ end
+ end
+ end
+
+ @code << " #{'els' unless @first}if #{condition_expressions.join(' && ')}\n"
+
+ for action in actions
+ case action
+ when String
+ raise
+ @code << " p 'evaluate #{action.inspect}'\n" if $DEBUG
+ @code << " #{action}\n"
+
+ when Symbol
+ @code << " p 'text_token %p %p' % [match, #{action.inspect}]\n" if $DEBUG
+ @code << " encoder.text_token match, #{action.inspect}\n"
+ when Kind
+ case action.token_kind
+ when Proc
+ @code << " encoder.text_token match, kind = #{make_callback(action.token_kind)}\n"
+ else
+ raise "I don't know how to evaluate this kind: %p" % [action.token_kind]
+ end
+ when Groups
+ @code << " p 'text_tokens %p in groups %p' % [match, #{action.token_kinds.inspect}]\n" if $DEBUG
+ action.token_kinds.each_with_index do |kind, i|
+ @code << " encoder.text_token self[#{i + 1}], #{kind.inspect} if self[#{i + 1}]\n"
+ end
+
+ when Push, PushState
+ case action.state
+ when String
+ raise
+ @code << " p 'push %p' % [#{action.state}]\n" if $DEBUG
+ @code << " state = #{action.state}\n"
+ when Symbol
+ @code << " p 'push %p' % [#{action.state.inspect}]\n" if $DEBUG
+ @code << " state = #{action.state.inspect}\n"
+ when Proc
+ @code << " if new_state = #{make_callback(action.state)}\n"
+ @code << " state = new_state\n"
+ @code << " end\n"
+ else
+ raise "I don't know how to evaluate this push state: %p" % [action.state]
+ end
+ if action.is_a? Push
+ if action.state == action.group
+ @code << " encoder.begin_group state\n"
+ else
+ case action.state
+ when Symbol
+ @code << " p 'begin group %p' % [#{action.group.inspect}]\n" if $DEBUG
+ @code << " encoder.begin_group #{action.group.inspect}\n"
+ when Proc
+ @code << " encoder.begin_group #{make_callback(action.group)}\n"
+ else
+ raise "I don't know how to evaluate this push state: %p" % [action.state]
+ end
+ end
+ end
+ when Pop, PopState
+ @code << " p 'pop %p' % [state]\n" if $DEBUG
+ if action.is_a? Pop
+ if action.group
+ case action.group
+ when Symbol
+ @code << " encoder.end_group #{action.group.inspect}\n"
+ else
+ raise "I don't know how to evaluate this pop group: %p" % [action.group]
+ end
+ else
+ @code << " encoder.end_group state\n"
+ end
+ end
+ @code << " state = :initial\n"
+
+ when ValueSetter
+ case action.value
+ when Proc
+ @code << " #{action.targets.join(' = ')} = #{make_callback(action.value)}\n"
+ when Symbol
+ @code << " #{action.targets.join(' = ')} = #{action.value}\n"
+ else
+ @code << " #{action.targets.join(' = ')} = #{action.value.inspect}\n"
+ end
+
+ when Increment
+ case action.value
+ when Proc
+ @code << " #{action.targets.join(' = ')} #{action.operation}= #{make_callback(action.value)}\n"
+ when Symbol
+ @code << " #{action.targets.join(' = ')} #{action.operation}= #{action.value}\n"
+ else
+ @code << " #{action.targets.join(' = ')} #{action.operation}= #{action.value.inspect}\n"
+ end
+
+ when Proc
+ @code << " #{make_callback(action)}\n"
+
+ when Continue
+ @code << " next\n"
+
+ else
+ raise "I don't know how to evaluate this action: %p" % [action]
+ end
+ end
+
+ @first = false
+ end
+
+ def groups *token_kinds
+ Groups.new token_kinds
+ end
+
+ def pattern pattern = nil, &block
+ Pattern.new pattern || block
+ end
+
+ def kind token_kind = nil, &block
+ Kind.new token_kind || block
+ end
+
+ def push state = nil, group = state, &block
+ raise 'push requires a state or a block; got nothing' unless state || block
+ Push.new state || block, group || block
+ end
+
+ def pop group = nil
+ Pop.new group
+ end
+
+ def push_state state = nil, &block
+ raise 'push_state requires a state or a block; got nothing' unless state || block
+ PushState.new state || block
+ end
+
+ # Action: return to the :initial state without ending a token group.
+ def pop_state
+ PopState.new
+ end
+
+ def check_if value = nil, &callback
+ CheckIf.new value || callback
+ end
+
+ def check_unless value = nil, &callback
+ CheckUnless.new value || callback
+ end
+
+ def flag_on *flags
+ flags.each { |name| variables << name }
+ ValueSetter.new Array(flags), true
+ end
+
+ def flag_off *flags
+ flags.each { |name| variables << name }
+ ValueSetter.new Array(flags), false
+ end
+
+ def set flag, value = nil, &callback
+ variables << flag
+ ValueSetter.new [flag], value || callback || true
+ end
+
+ def unset *flags
+ flags.each { |name| variables << name }
+ ValueSetter.new Array(flags), nil
+ end
+
+ def increment *counters
+ counters.each { |name| variables << name }
+ Increment.new Array(counters), :+, 1
+ end
+
+ def decrement *counters
+ counters.each { |name| variables << name }
+ Increment.new Array(counters), :-, 1
+ end
+
+ # Action: emits +next+ into the generated scan loop.
+ def continue
+ Continue.new
+ end
+
+ def define_scan_tokens!
+ if ENV['PUTS']
+ puts CodeRay.scan(scan_tokens_code, :ruby).terminal
+ puts "callbacks: #{callbacks.size}"
+ end
+
+ class_eval scan_tokens_code
+ end
+
+ protected
+
+ # Registry of the callback methods defined by +make_callback+, keyed by
+ # method name; created on first use.
+ def callbacks
+ @callbacks ||= {}
+ end
+
+ # Set of variable names referenced by the rules (flags, counters and
+ # callback parameter names); created on first use.
+ def variables
+ @variables ||= Set.new
+ end
+
+ def additional_variables
+ variables - %i(encoder options state match kind)
+ end
+
+ def make_callback block
+ base_name = "__callback_line_#{block.source_location.last}"
+ callback_name = base_name
+ counter = 'a'
+ while callbacks.key?(callback_name)
+ callback_name = "#{base_name}_#{counter}"
+ counter.succ!
+ end
+
+ callbacks[callback_name] = define_method(callback_name, &block)
+
+ parameters = block.parameters
+
+ if parameters.empty?
+ callback_name
+ else
+ parameter_names = parameters.map(&:last)
+ parameter_names.each { |name| variables << name }
+ "#{callback_name}(#{parameter_names.join(', ')})"
+ end
+ end
+
+ # Returns the Ruby source of the generated +scan_tokens+ method. The
+ # method takes its start state from options[:state] or @state, copies the
+ # additional variables from instance variables into locals, dispatches on
+ # the state with the compiled rules until the end of input, stores the
+ # state back when options[:keep_state] is set, runs the close-groups code
+ # and returns the encoder.
+ def scan_tokens_code
+ <<-"RUBY"
+ def scan_tokens encoder, options
+ state = options[:state] || @state
+
+#{ restore_local_variables_code.chomp.gsub(/^/, ' ' * 3) }
+
+ until eos?
+ case state
+#{ @code.chomp.gsub(/^/, ' ' * 4) }
+ else
+ raise_inspect 'Unknown state: %p' % [state], encoder
+ end
+ end
+
+ if options[:keep_state]
+ @state = state
+ end
+
+#{ close_groups_code.chomp.gsub(/^/, ' ' * 3) }
+
+ encoder
+ end
+ RUBY
+ end
+
+ def restore_local_variables_code
+ additional_variables.sort.map { |name| "#{name} = @#{name}" }.join("\n")
+ end
+
+ def close_groups_code
+ "close_groups(encoder, state)"
+ end
+ end
+
+ def scan_tokens tokens, options
+ self.class.define_scan_tokens!
+
+ scan_tokens tokens, options
+ end
+
+ protected
+
+ # Scanning starts in the :initial state.
+ def setup
+ @state = :initial
+ end
+
+ # Called by the generated scan_tokens after the scan loop with the final
+ # state. Does nothing here; subclasses may override it to end groups that
+ # are still open.
+ def close_groups encoder, state
+ # TODO
+ end
+
+ end
+ end
+end
diff --git a/rake_tasks/test.rake b/rake_tasks/test.rake
index 58e6daa..6468790 100644
--- a/rake_tasks/test.rake
+++ b/rake_tasks/test.rake
@@ -48,7 +48,7 @@ Please rename or remove it and run again to use the GitHub repository:
task lang => :update_scanner_suite do
ruby "./test/scanners/suite.rb #{lang}"
end
- (1..5).each do |i|
+ (1..6).each do |i|
task "#{lang}:#{i}" => :update_scanner_suite do
ruby "./test/scanners/suite.rb #{lang}:#{i}"
end