summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKornelius Kalnbach <murphy@rubychan.de>2017-04-09 18:38:25 +0200
committerKornelius Kalnbach <murphy@rubychan.de>2017-04-09 18:38:25 +0200
commit7a02cdded08dd232319eae17998abc877efb58cb (patch)
tree5f529d1052ef2b605e3096dfa9e818769ba1cb4a
parent548e2d0aea6b4c18a2f3e8203241fcaedb10bc8d (diff)
downloadcoderay-7a02cdded08dd232319eae17998abc877efb58cb.tar.gz
working towards DSL scanner
-rw-r--r--lib/coderay.rb1
-rw-r--r--lib/coderay/scanners/c3.rb112
-rw-r--r--lib/coderay/scanners/c4.rb126
-rw-r--r--lib/coderay/scanners/lua2b.rb157
-rw-r--r--lib/coderay/scanners/lua3.rb142
-rw-r--r--lib/coderay/scanners/lua4.rb89
-rw-r--r--lib/coderay/state_based_scanner.rb394
7 files changed, 1021 insertions, 0 deletions
diff --git a/lib/coderay.rb b/lib/coderay.rb
index 5c923f5..c1c9e34 100644
--- a/lib/coderay.rb
+++ b/lib/coderay.rb
@@ -155,6 +155,7 @@ module CodeRay
# DSL Scanner
autoload :RuleBasedScanner, coderay_path('rule_based_scanner')
+ autoload :StateBasedScanner, coderay_path('state_based_scanner')
# convenience access and reusable Encoder/Scanner pair
autoload :Duo, coderay_path('duo')
diff --git a/lib/coderay/scanners/c3.rb b/lib/coderay/scanners/c3.rb
new file mode 100644
index 0000000..49555ca
--- /dev/null
+++ b/lib/coderay/scanners/c3.rb
@@ -0,0 +1,112 @@
+module CodeRay
+module Scanners
+
+ # Scanner for C.
+ class C3 < RuleBasedScanner
+
+ register_for :c3
+ file_extension 'c'
+
+ KEYWORDS = [
+ 'asm', 'break', 'case', 'continue', 'default', 'do',
+ 'else', 'enum', 'for', 'goto', 'if', 'return',
+ 'sizeof', 'struct', 'switch', 'typedef', 'union', 'while',
+ 'restrict', # added in C99
+ ] # :nodoc:
+
+ PREDEFINED_TYPES = [
+ 'int', 'long', 'short', 'char',
+ 'signed', 'unsigned', 'float', 'double',
+ 'bool', 'complex', # added in C99
+ ] # :nodoc:
+
+ PREDEFINED_CONSTANTS = [
+ 'EOF', 'NULL',
+ 'true', 'false', # added in C99
+ ] # :nodoc:
+ DIRECTIVES = [
+ 'auto', 'extern', 'register', 'static', 'void',
+ 'const', 'volatile', # added in C89
+ 'inline', # added in C99
+ ] # :nodoc:
+
+ IDENT_KIND = WordList.new(:ident).
+ add(KEYWORDS, :keyword).
+ add(PREDEFINED_TYPES, :predefined_type).
+ add(DIRECTIVES, :directive).
+ add(PREDEFINED_CONSTANTS, :predefined_constant) # :nodoc:
+
+ ESCAPE = / [rbfntv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc:
+ UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc:
+
+ protected
+
+ state :initial do
+ on check_if(:in_preproc_line), %r/ \s*? \n \s* /x, :space, unset(:in_preproc_line), set(:label_expected, :label_expected_before_preproc_line)
+ on %r/ \s+ | \\\n /x, :space
+
+ on %r/ [-+*=<>?:;,!&^|()\[\]{}~%]+ | \/(?![\/*])=? | \.(?!\d) /x, :operator, set(:label_expected) { |match, case_expected|
+ match =~ /[;\{\}]/ || case_expected && match =~ /:/
+ }, unset(:case_expected)
+
+ on %r/ (?: case | default ) \b /x, :keyword, set(:case_expected), unset(:label_expected)
+ on check_if(:label_expected), check_unless(:in_preproc_line), %r/ [A-Za-z_][A-Za-z_0-9]*+ :(?!:) /x, kind { |match|
+ kind = IDENT_KIND[match.chop]
+ kind == :ident ? :label : kind
+ }, set(:label_expected) { |kind| kind == :label }
+ on %r/ [A-Za-z_][A-Za-z_0-9]* /x, kind { |match| IDENT_KIND[match] }, unset(:label_expected)
+
+ on %r/(L)?(")/, push(:string), groups(:modifier, :delimiter)
+
+ on %r/ L?' (?: [^\'\n\\] | \\ #{ESCAPE} )? '? /x, :char, unset(:label_expected)
+ on %r/0[xX][0-9A-Fa-f]+/, :hex, unset(:label_expected)
+ on %r/(?:0[0-7]+)(?![89.eEfF])/, :octal, unset(:label_expected)
+ on %r/(?:\d+)(?![.eEfF])L?L?/, :integer, unset(:label_expected)
+ on %r/\d[fF]?|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/, :float, unset(:label_expected)
+
+ on %r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .* ) !mx, :comment
+ on %r/ \# \s* if \s* 0 /x, -> (match) {
+ match << scan_until(/ ^\# (?:elif|else|endif) .*? $ | \z /mx) unless eos?
+ }, :comment
+ on %r/ \# [ \t]* include\b /x, :preprocessor, set(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected), push_state(:include_expected)
+ on %r/ \# [ \t]* \w* /x, :preprocessor, set(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected)
+
+ on %r/\$/, :ident
+ end
+
+ state :string do
+ on %r/[^\\\n"]+/, :content
+ on %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mx, :char
+ on %r/"/, :delimiter, pop, unset(:label_expected)
+ on %r/ \\ /x, pop, :error, unset(:label_expected)
+ on %r/ $ /x, pop, unset(:label_expected)
+ end
+
+ state :include_expected do
+ on %r/<[^>\n]+>?|"[^"\n\\]*(?:\\.[^"\n\\]*)*"?/, :include, pop_state
+ on %r/ \s*? \n \s* /x, :space, pop_state
+ on %r/\s+/, :space
+ on %r//, pop_state # TODO: add otherwise method for this
+ end
+
+ protected
+
+ # Initializes the flag variables that the rules of this scanner read and write.
+ def setup
+ super
+
+ # Read by the label rule through check_if(:label_expected); starts out true.
+ @label_expected = true
+ # Set by the case/default rule and passed to the operator rule's block.
+ @case_expected = false
+ # Written by the preprocessor rules via set(:label_expected_before_preproc_line, :label_expected).
+ @label_expected_before_preproc_line = nil
+ # Set by the preprocessor rules, unset by the first rule of the :initial state.
+ @in_preproc_line = false
+ end
+
+ # Ends the :string group when the state stack finishes inside a string.
+ def close_groups encoder, states
+   encoder.end_group :string if states.last == :string
+ end
+
+ end
+
+end
+end
diff --git a/lib/coderay/scanners/c4.rb b/lib/coderay/scanners/c4.rb
new file mode 100644
index 0000000..ff67e49
--- /dev/null
+++ b/lib/coderay/scanners/c4.rb
@@ -0,0 +1,126 @@
+module CodeRay
+module Scanners
+
+ # Scanner for C.
+ class C4 < StateBasedScanner
+
+ register_for :c4
+ file_extension 'c'
+
+ KEYWORDS = [
+ 'asm', 'break', 'case', 'continue', 'default', 'do',
+ 'else', 'enum', 'for', 'goto', 'if', 'return',
+ 'sizeof', 'struct', 'switch', 'typedef', 'union', 'while',
+ 'restrict', # added in C99
+ ] # :nodoc:
+
+ PREDEFINED_TYPES = [
+ 'int', 'long', 'short', 'char',
+ 'signed', 'unsigned', 'float', 'double',
+ 'bool', 'complex', # added in C99
+ ] # :nodoc:
+
+ PREDEFINED_CONSTANTS = [
+ 'EOF', 'NULL',
+ 'true', 'false', # added in C99
+ ] # :nodoc:
+ DIRECTIVES = [
+ 'auto', 'extern', 'register', 'static', 'void',
+ 'const', 'volatile', # added in C89
+ 'inline', # added in C99
+ ] # :nodoc:
+
+ IDENT_KIND = WordList.new(:ident).
+ add(KEYWORDS, :keyword).
+ add(PREDEFINED_TYPES, :predefined_type).
+ add(DIRECTIVES, :directive).
+ add(PREDEFINED_CONSTANTS, :predefined_constant) # :nodoc:
+
+ ESCAPE = / [rbfntv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc:
+ UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc:
+
+ protected
+
+ state :initial do
+ check in_preproc_line? do
+ skip %r/ \s*? \n \s* /x, :space do
+ unset :in_preproc_line
+ expect :label if label_expected_before_preproc_line?
+ end
+ end
+
+ skip %r/ \s+ | \\\n /x, :space
+
+ on %r/ [-+*=<>?:;,!&^|()\[\]{}~%]+ | \/(?![\/*])=? | \.(?!\d) /x, :operator do |match, case_expected|
+ expect :label if match =~ /[;\{\}]/ || expected?(:case) && match =~ /:/
+ end
+
+ on %r/ (?: case | default ) \b /x, :keyword do
+ expect :case
+ end
+
+ check label_expected?, !in_preproc_line? do
+ on %r/ [A-Za-z_][A-Za-z_0-9]*+ :(?!:) /x, -> match {
+ kind = IDENT_KIND[match.chop]
+ kind == :ident ? :label : kind
+ } do |kind|
+ expect :label if kind == :label
+ end
+ end
+
+ on %r/ [A-Za-z_][A-Za-z_0-9]* /x, IDENT_KIND
+
+ on %r/(L)?(")/, push(:string), groups(:modifier, :delimiter)
+
+ on %r/ L?' (?: [^\'\n\\] | \\ #{ESCAPE} )? '? /x, :char
+ on %r/0[xX][0-9A-Fa-f]+/, :hex
+ on %r/(?:0[0-7]+)(?![89.eEfF])/, :octal
+ on %r/(?:\d+)(?![.eEfF])L?L?/, :integer
+ on %r/\d[fF]?|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/, :float
+
+ skip %r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .* ) !mx, :comment
+ on %r/ \# \s* if \s* 0 /x, -> (match) {
+ match << scan_until(/ ^\# (?:elif|else|endif) .*? $ | \z /mx) unless eos?
+ }, :comment
+ on %r/ \# [ \t]* include\b /x, :preprocessor, set(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected), push(:include)
+ on %r/ \# [ \t]* \w* /x, :preprocessor, set(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected)
+
+ on %r/\$/, :ident
+ end
+
+ group_state :string do
+ on %r/[^\\\n"]+/, :content
+ on %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mx, :char
+ on %r/"/, :delimiter, pop
+ on %r/ \\ /x, pop, :error
+ on %r/ $ /x, pop
+ end
+
+ state :include do
+ on %r/<[^>\n]+>?|"[^"\n\\]*(?:\\.[^"\n\\]*)*"?/, :include, pop
+ on %r/ \s*? \n \s* /x, :space, pop
+ on %r/\s+/, :space
+ otherwise pop
+ end
+
+ protected
+
+ # Initializes the flag variables referenced by the rules of this scanner.
+ def setup
+ super
+
+ # Queried in the :initial state via `check label_expected?`.
+ @label_expected = true
+ # NOTE(review): the operator rule uses expected?(:case) instead of this flag — confirm it is still needed.
+ @case_expected = false
+ # Written by the preprocessor rules, queried via label_expected_before_preproc_line?.
+ @label_expected_before_preproc_line = nil
+ # Set by the preprocessor rules, queried via in_preproc_line?.
+ @in_preproc_line = false
+ end
+
+ # Closes the :string group if the last state on the stack is :string.
+ def close_groups encoder, states
+   return unless states.last == :string
+
+   encoder.end_group :string
+ end
+
+ end
+
+end
+end
diff --git a/lib/coderay/scanners/lua2b.rb b/lib/coderay/scanners/lua2b.rb
new file mode 100644
index 0000000..9e2b1fe
--- /dev/null
+++ b/lib/coderay/scanners/lua2b.rb
@@ -0,0 +1,157 @@
+# encoding: utf-8
+
+module CodeRay
+module Scanners
+
+ # Scanner for the Lua[http://lua.org] programming language.
+ #
+ # The language’s complete syntax is defined in
+ # {the Lua manual}[http://www.lua.org/manual/5.2/manual.html],
+ # which is what this scanner tries to conform to.
+ class Lua2 < RuleBasedScanner
+
+ register_for :lua2
+ file_extension 'lua'
+ title 'Lua'
+
+ # Keywords used in Lua.
+ KEYWORDS = %w[and break do else elseif end
+ for function goto if in
+ local not or repeat return
+ then until while
+ ]
+
+ # Constants set by the Lua core.
+ PREDEFINED_CONSTANTS = %w[false true nil]
+
+ # The expressions contained in this array are parts of Lua’s `basic'
+ # library. Although it’s not entirely necessary to load that library,
+ # it is highly recommended and one would have to provide own implementations
+ # of some of these expressions if one does not do so. They however aren’t
+ # keywords, neither are they constants, but nearly predefined, so they
+ # get tagged as `predefined' rather than anything else.
+ #
+ # This list excludes values of form `_UPPERCASE' because the Lua manual
+ # requires such identifiers to be reserved by Lua anyway and they are
+ # highlighted directly accordingly, without the need for specific
+ # identifiers to be listed here.
+ PREDEFINED_EXPRESSIONS = %w[
+ assert collectgarbage dofile error getmetatable
+ ipairs load loadfile next pairs pcall print
+ rawequal rawget rawlen rawset select setmetatable
+ tonumber tostring type xpcall
+ ]
+
+ # Automatic token kind selection for normal words.
+ IDENT_KIND = CodeRay::WordList.new(:ident).
+ add(KEYWORDS, :keyword).
+ add(PREDEFINED_CONSTANTS, :predefined_constant).
+ add(PREDEFINED_EXPRESSIONS, :predefined)
+
+ # Rules for the :initial state and the :map state (declared with the :map group).
+ state :initial, :map => :map do
+   on %r/\-\-\[\=*\[/, push(:long_comment, :comment), :delimiter, #--[[ long (possibly multiline) comment ]]
+     set(:num_equals, -> (match) { match.count('=') }) # Number must match for comment end
+   on %r/--.*$/, :comment # --Lua comment
+   on %r/\[=*\[/, push(:long_string, :string), :delimiter, # [[ long (possibly multiline) string ]]
+     set(:num_equals, -> (match) { match.count('=') }) # Number must match for string end
+   # ::goto_label:: -- `*` (not `+`) so that one-character names such as ::a:: match too,
+   # consistent with the identifier pattern used after `goto`.
+   on %r/::\s*[a-zA-Z_][a-zA-Z0-9_]*\s*::/, :label
+   on %r/_[A-Z]+/, :predefined # _UPPERCASE are names reserved for Lua
+   on check_if { |brace_depth| brace_depth > 0 }, %r/([a-zA-Z_][a-zA-Z0-9_]*) (\s+)?(=)/x, groups(:key, :space, :operator)
+   on %r/[a-zA-Z_][a-zA-Z0-9_]*/, kind { |match| IDENT_KIND[match] }, push_state { |match, kind| # Normal letters (or letters followed by digits)
+     # Extra highlighting for entities following certain keywords
+     if kind == :keyword && match == 'function'
+       :function_expected
+     elsif kind == :keyword && match == 'goto'
+       :goto_label_expected
+     elsif kind == :keyword && match == 'local'
+       :local_var_expected
+     end
+   }
+
+   on %r/\{/, push(:map), kind { |brace_depth| brace_depth > 0 ? :inline_delimiter : :delimiter }, increment(:brace_depth) # Opening table brace {
+   on check_if { |brace_depth| brace_depth == 1 }, %r/\}/, :delimiter, pop, decrement(:brace_depth) # Closing table brace }
+   on check_if { |brace_depth| brace_depth == 0 }, %r/\}/, :error # Mismatched brace
+   on %r/\}/, :inline_delimiter, pop, decrement(:brace_depth)
+
+   on %r/"/, push(:double_quoted_string, :string), :delimiter # String delimiters " and '
+   on %r/'/, push(:single_quoted_string, :string), :delimiter
+   # ↓Prefix hex number ←|→ decimal number
+   on %r/-? (?:0x\h* \. \h+ (?:p[+\-]?\d+)? | \d*\.\d+ (?:e[+\-]?\d+)?)/ix, :float # hexadecimal constants have no E power, decimal ones no P power
+   # ↓Prefix hex number ←|→ decimal number
+   on %r/-? (?:0x\h+ (?:p[+\-]?\d+)? | \d+ (?:e[+\-]?\d+)?)/ix, :integer # hexadecimal constants have no E power, decimal ones no P power
+   on %r/[\+\-\*\/%^\#=~<>\(\)\[\]:;,] | \.(?!\d)/x, :operator # Operators
+   on %r/\s+/, :space # Space
+ end
+
+ state :function_expected do
+ on %r/\(.*?\)/m, :operator, pop_state # x = function() # "Anonymous" function without explicit name
+ on %r/[a-zA-Z_] (?:[a-zA-Z0-9_\.] (?!\.\d))* [\.\:]/x, :ident # function tbl.subtbl.foo() | function tbl:foo() # Colon only allowed as last separator
+ on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :function, pop_state # function foo()
+ on %r/\s+/, :space # Between the `function' keyword and the ident may be any amount of whitespace
+ end
+
+ state :goto_label_expected do
+ on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :label, pop_state
+ on %r/\s+/, :space # Between the `goto' keyword and the label may be any amount of whitespace
+ end
+
+ # Entered after the `local` keyword; highlights the declared names.
+ state :local_var_expected do
+   # `\b` keeps names that merely start with "function" (e.g. `local functions`)
+   # from being split into a keyword plus leftover text.
+   on %r/function\b/, :keyword, pop_state, push_state(:function_expected) # local function ...
+   on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :local_variable
+   on %r/,/, :operator
+   on %r/\=/, :operator, pop_state
+   on %r/\n/, :space, pop_state
+   on %r/\s+/, :space
+ end
+
+ state :long_comment => :comment do
+ on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:comment)
+ on %r/.*/m, :error, pop(:comment)
+ end
+
+ state :long_string => :string do
+ on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:string) # Long strings do not interpret any escape sequences
+ on %r/.*/m, :error, pop(:string)
+ end
+
+ # Rules inside a '...' string; the state is declared with the :string group.
+ state :single_quoted_string => :string do
+ on %r/[^\\'\n]+/, :content # Everything except \ and the start delimiter character is string content (newlines are only allowed if preceded by \ or \z)
+ on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char
+ on %r/'/, :delimiter, pop(:string)
+ on %r/\n/, :error, pop(:string) # Lua forbids unescaped newlines in normal non-long strings
+ # encoder.text_token("\\n\n", :error) # Visually appealing error indicator--otherwise users may wonder whether the highlighter cannot highlight multiline strings
+ end
+
+ # Rules inside a "..." string; the state is declared with the :string group.
+ state :double_quoted_string => :string do
+ on %r/[^\\"\n]+/, :content # Everything except \ and the start delimiter character is string content (newlines are only allowed if preceded by \ or \z)
+ on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char
+ on %r/"/, :delimiter, pop(:string)
+ on %r/\n/, :error, pop(:string) # Lua forbids unescaped newlines in normal non-long strings
+ # encoder.text_token("\\n\n", :error) # Visually appealing error indicator--otherwise users may wonder whether the highlighter cannot highlight multiline strings
+ end
+
+ protected
+
+ def setup
+ super
+
+ @brace_depth = 0
+ @num_equals = nil
+ end
+
+ # Ends, innermost first, the group belonging to every state left on the stack.
+ #
+ # encoder - receives one end_group call per open group.
+ # states  - the state stack, outermost state first.
+ def close_groups encoder, states
+   states.reverse_each do |state|
+     case state
+     when :long_string, :single_quoted_string, :double_quoted_string
+       encoder.end_group :string
+     when :long_comment
+       # Long comments are opened with push(:long_comment, :comment) and closed
+       # with pop(:comment), so the group to end here is :comment.
+       encoder.end_group :comment
+     when :map
+       encoder.end_group :map
+     end
+   end
+ end
+ end
+
+end
+end
diff --git a/lib/coderay/scanners/lua3.rb b/lib/coderay/scanners/lua3.rb
new file mode 100644
index 0000000..d2d4280
--- /dev/null
+++ b/lib/coderay/scanners/lua3.rb
@@ -0,0 +1,142 @@
+# encoding: utf-8
+# Pseudocode: states optionally define groups, comments removed, counter definition?
+
+module CodeRay
+module Scanners
+
+ # Scanner for the Lua[http://lua.org] programming language.
+ #
+ # The language’s complete syntax is defined in
+ # {the Lua manual}[http://www.lua.org/manual/5.2/manual.html],
+ # which is what this scanner tries to conform to.
+ class Lua3 < RuleBasedScannerX
+
+ register_for :lua3
+ file_extension 'lua'
+ title 'Lua'
+
+ # Keywords used in Lua.
+ KEYWORDS = %w[and break do else elseif end
+ for function goto if in
+ local not or repeat return
+ then until while
+ ]
+
+ # Constants set by the Lua core.
+ PREDEFINED_CONSTANTS = %w[false true nil]
+
+ # The expressions contained in this array are parts of Lua’s `basic'
+ # library. Although it’s not entirely necessary to load that library,
+ # it is highly recommended and one would have to provide own implementations
+ # of some of these expressions if one does not do so. They however aren’t
+ # keywords, neither are they constants, but nearly predefined, so they
+ # get tagged as `predefined' rather than anything else.
+ #
+ # This list excludes values of form `_UPPERCASE' because the Lua manual
+ # requires such identifiers to be reserved by Lua anyway and they are
+ # highlighted directly accordingly, without the need for specific
+ # identifiers to be listed here.
+ PREDEFINED_EXPRESSIONS = %w[
+ assert collectgarbage dofile error getmetatable
+ ipairs load loadfile next pairs pcall print
+ rawequal rawget rawlen rawset select setmetatable
+ tonumber tostring type xpcall
+ ]
+
+ # Automatic token kind selection for normal words.
+ IDENT_KIND = CodeRay::WordList.new(:ident).
+ add(KEYWORDS, :keyword).
+ add(PREDEFINED_CONSTANTS, :predefined_constant).
+ add(PREDEFINED_EXPRESSIONS, :predefined)
+
+ protected
+
+ # Scanner initialization.
+ def setup
+ super
+ @brace_depth = 0
+ @num_equals = nil
+ end
+
+ counter :brace_depth
+
+ # Rules for the :initial state and the :map state (declared with the :map group).
+ state :initial, :map => :map do
+   on %r/\-\-\[\=*\[/, push(:long_comment), :delimiter, set(:num_equals, -> (match) { match.count('=') })
+   on %r/--.*$/, :comment
+   on %r/\[=*\[/, push(:long_string), :delimiter, set(:num_equals, -> (match) { match.count('=') })
+   # `*` (not `+`) so that one-character label names such as ::a:: match too,
+   # consistent with the identifier pattern used after `goto`.
+   on %r/::\s*[a-zA-Z_][a-zA-Z0-9_]*\s*::/, :label
+   on %r/_[A-Z]+/, :predefined
+   on check_if(:brace_depth, :>, 0), %r/([a-zA-Z_][a-zA-Z0-9_]*) (\s+)?(=)/x, groups(:key, :space, :operator)
+   on %r/[a-zA-Z_][a-zA-Z0-9_]*/, kind { |match| IDENT_KIND[match] }, push_state { |match, kind|
+     if kind == :keyword && match == 'function'
+       :function_expected
+     elsif kind == :keyword && match == 'goto'
+       :goto_label_expected
+     elsif kind == :keyword && match == 'local'
+       :local_var_expected
+     end
+   }
+
+   on %r/\{/, push(:map), kind { |brace_depth| brace_depth > 0 ? :inline_delimiter : :delimiter }, increment(:brace_depth)
+   on check_if(:brace_depth, :==, 1), %r/\}/, :delimiter, pop, decrement(:brace_depth)
+   on check_if(:brace_depth, :==, 0), %r/\}/, :error
+   on %r/\}/, :inline_delimiter, pop, decrement(:brace_depth)
+
+   on %r/"/, push(:double_quoted_string), :delimiter
+   on %r/'/, push(:single_quoted_string), :delimiter
+
+   on %r/-? (?:0x\h* \. \h+ (?:p[+\-]?\d+)? | \d*\.\d+ (?:e[+\-]?\d+)?)/ix, :float
+
+   on %r/-? (?:0x\h+ (?:p[+\-]?\d+)? | \d+ (?:e[+\-]?\d+)?)/ix, :integer
+   on %r/[\+\-\*\/%^\#=~<>\(\)\[\]:;,] | \.(?!\d)/x, :operator
+   on %r/\s+/, :space
+ end
+
+ state :function_expected do
+ on %r/\(.*?\)/m, :operator, pop
+ on %r/[a-zA-Z_] (?:[a-zA-Z0-9_\.] (?!\.\d))* [\.\:]/x, :ident
+ on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :function, pop
+ on %r/\s+/, :space
+ end
+
+ state :goto_label_expected do
+ on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :label, pop
+ on %r/\s+/, :space
+ end
+
+ # Entered after the `local` keyword; highlights the declared names.
+ state :local_var_expected do
+   # `\b` keeps names that merely start with "function" (e.g. `local functions`)
+   # from being split into a keyword plus leftover text.
+   on %r/function\b/, :keyword, pop, push(:function_expected)
+   on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :local_variable
+   on %r/,/, :operator
+   on %r/\=/, :operator, pop
+   on %r/\n/, :space, pop
+   on %r/\s+/, :space
+ end
+
+ state :long_comment => :comment do
+ on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:comment)
+ on %r/.*/m, :error, pop(:comment)
+ end
+
+ state :long_string => :string do
+ on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:string)
+ on %r/.*/m, :error, pop(:string)
+ end
+
+ state :single_quoted_string => :string do
+ on %r/[^\\'\n]+/, :content
+ on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char
+ on %r/'/, :delimiter, pop(:string)
+ on %r/\n/, :error, pop(:string)
+ end
+
+ state :double_quoted_string => :string do
+ on %r/[^\\"\n]+/, :content
+ on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char
+ on %r/"/, :delimiter, pop(:string)
+ on %r/\n/, :error, pop(:string)
+ end
+ end
+
+end
+end
diff --git a/lib/coderay/scanners/lua4.rb b/lib/coderay/scanners/lua4.rb
new file mode 100644
index 0000000..0315d34
--- /dev/null
+++ b/lib/coderay/scanners/lua4.rb
@@ -0,0 +1,89 @@
+# encoding: utf-8
+
+module CodeRay
+module Scanners
+
+ # Scanner for the Lua[http://lua.org] programming language.
+ #
+ # The language’s complete syntax is defined in
+ # {the Lua manual}[http://www.lua.org/manual/5.2/manual.html],
+ # which is what this scanner tries to conform to.
+ class Lua4 < RuleBasedScanner
+
+ register_for :lua4
+ file_extension 'lua'
+ title 'Lua'
+
+ protected
+
+ state :initial do
+ on %r'#!(.*?)$', :doctype
+ on %r//, push_state(:base)
+ end
+
+ state :base do
+ on %r'--\[(=*)\[.*?\]\1\]'m, :comment
+ on %r'--.*$', :comment
+
+ on %r'(\d*\.\d+|\d+\.\d*)(e[+-]?\d+)?'i, :float
+ on %r'\d+e[+-]?\d+'i, :float
+ on %r'0x[0-9a-f]*'i, :hex
+ on %r'\d+', :integer
+
+ on %r'\n', :space
+ on %r'[^\S\n]', :space
+ # multiline strings
+ on %r'\[(=*)\[.*?\]\1\]'m, :string
+
+ on %r'(==|~=|<=|>=|\.\.\.|\.\.|[=+\-*/%^<>#!.\\:])', :operator
+ on %r'[\[\]{}().,:;]', :operator
+ on %r'(and|or|not)\b', :operator
+
+ on %r'(break|do|else|elseif|end|for|if|in|repeat|return|then|until|while)\b', :keyword
+ on %r'(local)\b', :keyword
+ on %r'(true|false|nil)\b', :predefined_constant
+
+ on %r'(function)\b', :keyword, push_state(:funcname)
+
+ on %r'[A-Za-z_]\w*(\.[A-Za-z_]\w*)?', :ident
+
+ # on %r"'", :string, combined(:stringescape, :sqs)
+ on %r"'", :string, push_state(:sqs)
+ # on %r'"', :string, combined(:stringescape, :dqs)
+ on %r'"', :string, push_state(:dqs)
+ end
+
+ state :funcname do
+ on %r'\s+', :space
+ on %r'(?:([A-Za-z_]\w*)(\.))?([A-Za-z_]\w*)', groups(:class, :operator, :function), pop_state
+ # inline function
+ on %r'\(', :operator, pop_state
+ end
+
+ # if I understand correctly, every character is valid in a lua string,
+ # so this state is only for later corrections
+ # state :string do
+ # on %r'.', :string
+ # end
+
+ # state :stringescape do
+ # on %r/\\([abfnrtv\\"']|\d{1,3})/, :escape
+ # end
+
+ state :sqs do
+ on %r"'", :string, pop_state
+ # include(:string)
+ on %r/\\([abfnrtv\\"']|\d{1,3})/, :escape
+ on %r'.', :string
+ end
+
+ state :dqs do
+ on %r'"', :string, pop_state
+ # include(:string)
+ on %r/\\([abfnrtv\\"']|\d{1,3})/, :escape
+ on %r'.', :string
+ end
+ end
+
+end
+end
diff --git a/lib/coderay/state_based_scanner.rb b/lib/coderay/state_based_scanner.rb
new file mode 100644
index 0000000..b196adc
--- /dev/null
+++ b/lib/coderay/state_based_scanner.rb
@@ -0,0 +1,394 @@
+require 'set'
+
+module CodeRay
+ module Scanners
+ class StateBasedScanner < Scanner
+ class State
+ attr_reader :names
+ attr_reader :rules
+ attr_reader :scanner
+
+ # Builds a state by evaluating the DSL block in the context of this object.
+ #
+ # scanner - the scanner class the state belongs to (receives variable/callback registrations).
+ # names   - the state names that share the rules defined in the block.
+ # block   - the DSL body; its on/skip/otherwise calls fill @rules.
+ def initialize scanner, names, &block
+ @scanner = scanner
+ @names = names
+
+ @rules = []
+ # Conditions of the enclosing `check` block, if any; nil outside of one.
+ @check = nil
+
+ instance_eval(&block)
+ end
+
+ # Returns the Ruby source of the `when` branch for this state: one `when`
+ # listing all state names, the rules chained as if/elsif, and a final
+ # `else` that emits the next character as an :error token.
+ def rules_code
+ <<-RUBY
+when #{names.map(&:inspect).join(', ')}
+#{rules.map.with_index { |rule, index| rule.code(first: index.zero?) }.join}
+ else
+ puts "no match for \#{state.inspect} => skip character" if $DEBUG
+ encoder.text_token getch, :error
+ end
+
+ RUBY
+ end
+
+ protected
+
+ # structure
+ # Without arguments, returns the Conditions currently in effect (nil outside
+ # of a check block). With conditions and a block, evaluates the block so that
+ # every rule defined inside it is created with those conditions.
+ #
+ # Raises RuntimeError when called inside another check block, and
+ # ArgumentError when conditions are given without a block.
+ # Returns nil after evaluating a block.
+ def check *conditions, &block
+   return @check unless conditions.any? || block
+   raise "Can't nest check yet" if @check
+   raise ArgumentError, 'check requires a block' unless block
+
+   @check = Conditions.new(conditions)
+   begin
+     instance_eval(&block)
+   ensure
+     # Leave the check scope even if the block raises, so that later rules
+     # are not silently created with stale conditions.
+     @check = nil
+   end
+
+   nil
+ end
+
+ # rules
+ # Appends a rule for the pattern with the given actions; an optional block
+ # is passed on to Rule. The rule carries the conditions of the enclosing
+ # check block, if any.
+ def on pattern, *actions, &block
+ @rules << Rule.new(self, pattern, *actions, check: @check, &block)
+ end
+
+ # Like #on, but creates the rule with skip: true.
+ def skip pattern, *actions, &block
+ @rules << Rule.new(self, pattern, *actions, check: @check, skip: true, &block)
+ end
+
+ # Appends a skip rule with the empty pattern //.
+ def otherwise *actions, &block
+ @rules << Rule.new(self, //, *actions, check: @check, skip: true, &block)
+ end
+
+ # actions
+ # Action object: push the given state.
+ def push state
+ Push.new(state)
+ end
+
+ # Action object: pop the current state.
+ def pop
+ Pop.new
+ end
+
+ # Action object selecting the token kind, either the given value or a
+ # callback registered from the block.
+ def kind token_kind = nil, &block
+ Kind.new token_kind || scanner.callback(block)
+ end
+
+ # Action object holding one token kind per capture group.
+ def groups *token_kinds
+ Groups.new(token_kinds)
+ end
+
+ # Action object that sets target to value, to the block, or to true when neither is given.
+ # NOTE(review): because of `||`, set(target, false) also stores true — confirm this is intended.
+ def set target, value = nil, &block
+ Setter.new target, value || block || true
+ end
+
+ # Registers the block as a callback on the scanner class and returns the Callback.
+ def callback block
+ scanner.callback(block)
+ end
+
+ # magic flag getters
+ # Turns any unknown method ending in `?` (e.g. `label_expected?`) into a
+ # Getter for the variable of that name, registering the variable with the
+ # scanner class. Every other unknown method is handled by super.
+ def method_missing method, *args, &block
+   method_name = method.to_s
+   if method_name.end_with?('?')
+     Getter.new(scanner.variable(method_name.chomp('?')))
+   else
+     super
+   end
+ end
+
+ # Keeps respond_to? consistent with the flag getters served by method_missing.
+ def respond_to_missing? method, include_private = false
+   method.to_s.end_with?('?') || super
+ end
+ end
+
+ class GroupState < State
+ end
+
+ # One scanning rule of a state: an optional check, a pattern and a list of
+ # actions. #code renders the rule as one if/elsif branch of generated Ruby.
+ class Rule
+ attr_reader :pattern
+ attr_reader :actions
+ attr_reader :check
+ attr_reader :state
+
+ # state   - the State the rule is defined in.
+ # pattern - the Regexp to scan for; wrapped in Skip when skip is true, in Scan otherwise.
+ # actions - Symbols, Procs, WordLists or action objects (see build_actions).
+ # check   - Conditions guarding the rule, or nil.
+ # block   - optional block, appended to the actions.
+ def initialize state, pattern, *actions, check:, skip: false, &block
+ @state = state
+ @pattern = (skip ? Skip : Scan).new(pattern)
+ @actions = *build_actions(actions, block)
+ @check = check
+
+ # NOTE(review): looks like a guard against a plain `false` being passed as check — confirm.
+ raise [pattern, *actions, check, skip, block].inspect if check == false
+ end
+
+ # Returns the source of this rule's branch; `first` selects `if` over `elsif`.
+ def code first:
+ <<-RUBI
+ #{'els' unless first}if #{condition_expression}
+#{actions_code.gsub(/^/, ' ' * 2)}
+ RUBI
+ end
+
+ # True when the rule was created with skip: true.
+ def skip?
+ @pattern.is_a?(Skip)
+ end
+
+ protected
+
+ # Joins the code of the check (if present) and of the pattern with `&&`.
+ def condition_expression
+ [check, pattern].compact.map(&:code).join(' && ')
+ end
+
+ # The code of all actions, one per line.
+ def actions_code
+ actions.map(&:code).join("\n")
+ end
+
+ # Converts the raw DSL arguments into action objects that respond to #code.
+ # Raises RuntimeError for an argument of an unsupported class.
+ def build_actions actions, block
+ actions += [block] if block
+
+ actions.map do |action|
+ case action
+ when Symbol
+ # A bare symbol is a token kind.
+ Token.new(action)
+ when Proc
+ # Procs are registered as callbacks through the state.
+ state.instance_eval do
+ callback action
+ end
+ when WordList
+ # A word list selects the kind by looking up the matched text.
+ state.instance_eval do
+ kind { |match| action[match] }
+ end
+ when Push, Pop, Groups, Kind, Setter
+ action
+ else
+ raise "Don't know how to build action for %p (%p)" % [action, action.class]
+ end
+ end
+ end
+ end
+
+ # conditions
+ # A list of conditions that must all hold for a rule to apply.
+ class Conditions < Struct.new(:conditions)
+   # The code of every condition, combined with `&&`.
+   def code
+     conditions.map(&:code).join(' && ')
+   end
+ end
+
+ # Pattern wrapper; renders as a `scan` call whose result is assigned to `match`.
+ class Scan < Struct.new(:pattern)
+ def code
+ "match = scan(#{pattern.inspect})"
+ end
+ end
+
+ # Marks the pattern of a rule created with skip: true; renders the same code as Scan.
+ class Skip < Scan
+ end
+
+ # Condition that reads a scanner variable, optionally negated.
+ #
+ # name     - the variable name.
+ # negative - truthy when the condition is the negation of the variable.
+ class Getter < Struct.new(:name, :negative)
+   # Ruby source for the condition: the variable name, prefixed with `!`
+   # only when this getter is negated.
+   def code
+     "#{'!' if negative}#{name}"
+   end
+
+   # `!getter` in the DSL returns the getter with the opposite polarity, so
+   # `!!getter` renders like the original.
+   def !@
+     negated
+   end
+
+   protected
+
+   # Memoized counterpart with inverted polarity. Deliberately not named
+   # `negative`: that would shadow the Struct member read by #code.
+   def negated
+     @negated ||= Getter.new(name, !negative)
+   end
+ end
+
+ # actions
+ # Action holding the state to push.
+ # NOTE(review): #code currently returns the bare word "push" and ignores `state` — looks like a placeholder.
+ class Push < Struct.new :state
+ def code
+ "push"
+ end
+ end
+
+ # Action for popping a state; #code returns the bare word "pop".
+ class Pop < Class.new
+ def code
+ "pop"
+ end
+ end
+
+ # Action holding the token kinds for the capture groups; #code returns the bare word "groups".
+ class Groups < Struct.new(:token_kinds)
+ def code
+ "groups"
+ end
+ end
+
+ # Action holding a variable name and the value to assign; #code returns the bare word "set".
+ class Setter < Struct.new(:name, :value)
+ def code
+ "set"
+ end
+ end
+
+
+ # Action that emits the match as a token whose kind is computed at scan time.
+ class Kind < Struct.new(:token_kind)
+ # Only Callback kinds are supported; the callback's result is also assigned
+ # to the local `kind`. Raises RuntimeError for any other token_kind.
+ def code
+ case token_kind
+ when Callback
+ "encoder.text_token match, kind = #{token_kind.code}\n"
+ else
+ raise "I don't know how to evaluate this kind: %p" % [token_kind]
+ end
+ end
+ end
+
+ # Action that emits the match as a token of a fixed kind.
+ class Token < Struct.new(:name)
+ def code
+ "encoder.text_token match, #{name.inspect}"
+ end
+ end
+
+ # A block registered as a method under `name`; renders as a call to that
+ # method, passing the locals named like the block's parameters.
+ class Callback < Struct.new(:name, :block)
+   def code
+     params = parameter_names
+     return name if params.empty?
+
+     "#{name}(#{params.join(', ')})"
+   end
+
+   protected
+
+   # Names of the block's parameters, in declaration order.
+   def parameter_names
+     block.parameters.map(&:last)
+   end
+ end
+
+ class << self
+ # Registry of the states defined on this class, keyed by state name.
+ def states
+ @states ||= {}
+ end
+
+ # NOTE(review): this is a class-level method, so `self.class` is Class, which
+ # does not define define_scan_tokens!, and the unqualified scan_tokens call
+ # below would re-enter this very method. It looks like a stray copy of the
+ # instance method of the same name — confirm and remove.
+ def scan_tokens tokens, options
+ self.class.define_scan_tokens!
+
+ scan_tokens tokens, options
+ end
+
+ # Evaluates the generated source (see scan_tokens_code) in this class,
+ # which defines the scan_tokens instance method.
+ def define_scan_tokens!
+ # With the PUTS environment variable set, print the highlighted source and the number of callbacks.
+ if ENV['PUTS']
+ puts CodeRay.scan(scan_tokens_code, :ruby).terminal
+ puts "callbacks: #{callbacks.size}"
+ end
+
+ class_eval scan_tokens_code
+ end
+
+ # Records the name (as a Symbol) in the set of variables and returns the
+ # name unchanged.
+ def variable name
+ variables << name.to_sym
+
+ name
+ end
+
+ # Defines the block as an instance method with a generated name, registers
+ # the block's parameter names as variables, and returns a Callback for it.
+ # Returns nil when no block is given.
+ def callback block
+ return unless block
+
+ callback_name = name_for_callback(block)
+ callbacks[callback_name] = define_method(callback_name, &block)
+ # Parameter names double as the names of the locals passed in the generated call.
+ block.parameters.map(&:last).each { |name| variable name }
+
+ Callback.new(callback_name, block)
+ end
+
+ protected
+
+ # Defines one state object from the block and registers it under each of
+ # the given names. Returns the new state.
+ def state *names, state_class: State, &block
+   new_state = state_class.new(self, names, &block)
+   names.each { |state_name| states[state_name] = new_state }
+   new_state
+ end
+
+ # Same as #state, but builds a GroupState.
+ def group_state *names, &block
+   state(*names, state_class: GroupState, &block)
+ end
+
+ # Registered callbacks, keyed by generated method name.
+ def callbacks
+ @callbacks ||= {}
+ end
+
+ # Set of all variable names registered through #variable.
+ def variables
+ @variables ||= Set.new
+ end
+
+ # Registered variables minus the names the generated scan_tokens method
+ # already uses for its own locals and parameters.
+ def additional_variables
+ variables - %i(encoder options state states match kind)
+ end
+
+ # Returns an unused method name for the block, derived from the line the
+ # block was defined on; a suffix (_a, _b, ...) is appended while the name
+ # is already taken.
+ def name_for_callback block
+   base_name = "__callback_line_#{block.source_location.last}"
+   return base_name unless callbacks.key?(base_name)
+
+   suffix = 'a'
+   suffix = suffix.succ while callbacks.key?("#{base_name}_#{suffix}")
+   "#{base_name}_#{suffix}"
+ end
+
+ # Returns the source of the generated scan_tokens method: it copies the
+ # registered variables from instance variables into locals, dispatches on
+ # the current state until the end of input, stores the state back when
+ # options[:keep_state] is set, closes open groups, and returns the encoder.
+ def scan_tokens_code
+ <<-"RUBY"
+ def scan_tokens encoder, options
+ state = options[:state] || @state
+
+#{ restore_local_variables_code.chomp.gsub(/^/, ' ' * 3) }
+
+ states = [state]
+
+ until eos?
+ case state
+#{ states_code.chomp.gsub(/^/, ' ' * 4) }
+ else
+ raise_inspect 'Unknown state: %p' % [state], encoder
+ end
+ end
+
+ if options[:keep_state]
+ @state = state
+ end
+
+#{ close_groups_code.chomp.gsub(/^/, ' ' * 3) }
+
+ encoder
+ end
+ RUBY
+ end
+
+ # The `when` branches of all registered states, concatenated.
+ def states_code
+ states.values.map(&:rules_code).join
+ end
+
+ # One `name = @name` assignment per additional variable, sorted by name.
+ def restore_local_variables_code
+ additional_variables.sort.map { |name| "#{name} = @#{name}" }.join("\n")
+ end
+
+ # The statement that closes open groups at the end of the generated method.
+ def close_groups_code
+ "close_groups(encoder, states)"
+ end
+ end
+
+ # Asks the scanner's class to define its generated scan_tokens method
+ # (via class_eval in define_scan_tokens!), then calls scan_tokens again
+ # with the same arguments.
+ def scan_tokens tokens, options
+ self.class.define_scan_tokens!
+
+ scan_tokens tokens, options
+ end
+
+ protected
+
+ # Sets the state to :initial and clears any pending expectation.
+ def setup
+ @state = :initial
+ reset_expectations
+ end
+
+ # Called by the generated scan_tokens method after the scan loop; does
+ # nothing here, scanner subclasses override it.
+ def close_groups encoder, states
+ # TODO
+ end
+
+ # Remembers a single expected kind, replacing any previous one.
+ def expect kind
+ @expected = kind
+ end
+
+ # True when the given kind is the one currently expected.
+ def expected? kind
+ @expected == kind
+ end
+
+ # Clears the expectation.
+ def reset_expectations
+ @expected = nil
+ end
+ end
+ end
+end