diff options
author | Kornelius Kalnbach <murphy@rubychan.de> | 2017-04-09 18:38:25 +0200 |
---|---|---|
committer | Kornelius Kalnbach <murphy@rubychan.de> | 2017-04-09 18:38:25 +0200 |
commit | 7a02cdded08dd232319eae17998abc877efb58cb (patch) | |
tree | 5f529d1052ef2b605e3096dfa9e818769ba1cb4a /lib | |
parent | 548e2d0aea6b4c18a2f3e8203241fcaedb10bc8d (diff) | |
download | coderay-7a02cdded08dd232319eae17998abc877efb58cb.tar.gz |
working towards DSL scanner
Diffstat (limited to 'lib')
-rw-r--r-- | lib/coderay.rb | 1 | ||||
-rw-r--r-- | lib/coderay/scanners/c3.rb | 112 | ||||
-rw-r--r-- | lib/coderay/scanners/c4.rb | 126 | ||||
-rw-r--r-- | lib/coderay/scanners/lua2b.rb | 157 | ||||
-rw-r--r-- | lib/coderay/scanners/lua3.rb | 142 | ||||
-rw-r--r-- | lib/coderay/scanners/lua4.rb | 89 | ||||
-rw-r--r-- | lib/coderay/state_based_scanner.rb | 394 |
7 files changed, 1021 insertions, 0 deletions
diff --git a/lib/coderay.rb b/lib/coderay.rb index 5c923f5..c1c9e34 100644 --- a/lib/coderay.rb +++ b/lib/coderay.rb @@ -155,6 +155,7 @@ module CodeRay # DSL Scanner autoload :RuleBasedScanner, coderay_path('rule_based_scanner') + autoload :StateBasedScanner, coderay_path('state_based_scanner') # convenience access and reusable Encoder/Scanner pair autoload :Duo, coderay_path('duo') diff --git a/lib/coderay/scanners/c3.rb b/lib/coderay/scanners/c3.rb new file mode 100644 index 0000000..49555ca --- /dev/null +++ b/lib/coderay/scanners/c3.rb @@ -0,0 +1,112 @@ +module CodeRay +module Scanners + + # Scanner for C. + class C3 < RuleBasedScanner + + register_for :c3 + file_extension 'c' + + KEYWORDS = [ + 'asm', 'break', 'case', 'continue', 'default', 'do', + 'else', 'enum', 'for', 'goto', 'if', 'return', + 'sizeof', 'struct', 'switch', 'typedef', 'union', 'while', + 'restrict', # added in C99 + ] # :nodoc: + + PREDEFINED_TYPES = [ + 'int', 'long', 'short', 'char', + 'signed', 'unsigned', 'float', 'double', + 'bool', 'complex', # added in C99 + ] # :nodoc: + + PREDEFINED_CONSTANTS = [ + 'EOF', 'NULL', + 'true', 'false', # added in C99 + ] # :nodoc: + DIRECTIVES = [ + 'auto', 'extern', 'register', 'static', 'void', + 'const', 'volatile', # added in C89 + 'inline', # added in C99 + ] # :nodoc: + + IDENT_KIND = WordList.new(:ident). + add(KEYWORDS, :keyword). + add(PREDEFINED_TYPES, :predefined_type). + add(DIRECTIVES, :directive). + add(PREDEFINED_CONSTANTS, :predefined_constant) # :nodoc: + + ESCAPE = / [rbfntv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc: + + protected + + state :initial do + on check_if(:in_preproc_line), %r/ \s*? \n \s* /x, :space, unset(:in_preproc_line), set(:label_expected, :label_expected_before_preproc_line) + on %r/ \s+ | \\\n /x, :space + + on %r/ [-+*=<>?:;,!&^|()\[\]{}~%]+ | \/(?![\/*])=? | \.(?!\d) /x, :operator, set(:label_expected) { |match, case_expected| + match =~ /[;\{\}]/ || case_expected && match =~ /:/ + }, unset(:case_expected) + + on %r/ (?: case | default ) \b /x, :keyword, set(:case_expected), unset(:label_expected) + on check_if(:label_expected), check_unless(:in_preproc_line), %r/ [A-Za-z_][A-Za-z_0-9]*+ :(?!:) /x, kind { |match| + kind = IDENT_KIND[match.chop] + kind == :ident ? :label : kind + }, set(:label_expected) { |kind| kind == :label } + on %r/ [A-Za-z_][A-Za-z_0-9]* /x, kind { |match| IDENT_KIND[match] }, unset(:label_expected) + + on %r/(L)?(")/, push(:string), groups(:modifier, :delimiter) + + on %r/ L?' (?: [^\'\n\\] | \\ #{ESCAPE} )? '? /x, :char, unset(:label_expected) + on %r/0[xX][0-9A-Fa-f]+/, :hex, unset(:label_expected) + on %r/(?:0[0-7]+)(?![89.eEfF])/, :octal, unset(:label_expected) + on %r/(?:\d+)(?![.eEfF])L?L?/, :integer, unset(:label_expected) + on %r/\d[fF]?|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/, :float, unset(:label_expected) + + on %r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .* ) !mx, :comment + on %r/ \# \s* if \s* 0 /x, -> (match) { + match << scan_until(/ ^\# (?:elif|else|endif) .*? $ | \z /mx) unless eos? + }, :comment + on %r/ \# [ \t]* include\b /x, :preprocessor, set(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected), push_state(:include_expected) + on %r/ \# [ \t]* \w* /x, :preprocessor, set(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected) + + on %r/\$/, :ident + end + + state :string do + on %r/[^\\\n"]+/, :content + on %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mx, :char + on %r/"/, :delimiter, pop, unset(:label_expected) + on %r/ \\ /x, pop, :error, unset(:label_expected) + on %r/ $ /x, pop, unset(:label_expected) + end + + state :include_expected do + on %r/<[^>\n]+>?|"[^"\n\\]*(?:\\.[^"\n\\]*)*"?/, :include, pop_state + on %r/ \s*? \n \s* /x, :space, pop_state + on %r/\s+/, :space + on %r//, pop_state # TODO: add otherwise method for this + end + + protected + + def setup + super + + @label_expected = true + @case_expected = false + @label_expected_before_preproc_line = nil + @in_preproc_line = false + end + + def close_groups encoder, states + if states.last == :string + encoder.end_group :string + end + end + + end + +end +end diff --git a/lib/coderay/scanners/c4.rb b/lib/coderay/scanners/c4.rb new file mode 100644 index 0000000..ff67e49 --- /dev/null +++ b/lib/coderay/scanners/c4.rb @@ -0,0 +1,126 @@ +module CodeRay +module Scanners + + # Scanner for C. + class C4 < StateBasedScanner + + register_for :c4 + file_extension 'c' + + KEYWORDS = [ + 'asm', 'break', 'case', 'continue', 'default', 'do', + 'else', 'enum', 'for', 'goto', 'if', 'return', + 'sizeof', 'struct', 'switch', 'typedef', 'union', 'while', + 'restrict', # added in C99 + ] # :nodoc: + + PREDEFINED_TYPES = [ + 'int', 'long', 'short', 'char', + 'signed', 'unsigned', 'float', 'double', + 'bool', 'complex', # added in C99 + ] # :nodoc: + + PREDEFINED_CONSTANTS = [ + 'EOF', 'NULL', + 'true', 'false', # added in C99 + ] # :nodoc: + DIRECTIVES = [ + 'auto', 'extern', 'register', 'static', 'void', + 'const', 'volatile', # added in C89 + 'inline', # added in C99 + ] # :nodoc: + + IDENT_KIND = WordList.new(:ident). + add(KEYWORDS, :keyword). + add(PREDEFINED_TYPES, :predefined_type). + add(DIRECTIVES, :directive). + add(PREDEFINED_CONSTANTS, :predefined_constant) # :nodoc: + + ESCAPE = / [rbfntv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x # :nodoc: + UNICODE_ESCAPE = / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} /x # :nodoc: + + protected + + state :initial do + check in_preproc_line? do + skip %r/ \s*? \n \s* /x, :space do + unset :in_preproc_line + expect :label if label_expected_before_preproc_line? + end + end + + skip %r/ \s+ | \\\n /x, :space + + on %r/ [-+*=<>?:;,!&^|()\[\]{}~%]+ | \/(?![\/*])=? | \.(?!\d) /x, :operator do |match, case_expected| + expect :label if match =~ /[;\{\}]/ || expected?(:case) && match =~ /:/ + end + + on %r/ (?: case | default ) \b /x, :keyword do + expect :case + end + + check label_expected?, !in_preproc_line? do + on %r/ [A-Za-z_][A-Za-z_0-9]*+ :(?!:) /x, -> match { + kind = IDENT_KIND[match.chop] + kind == :ident ? :label : kind + } do |kind| + expect :label if kind == :label + end + end + + on %r/ [A-Za-z_][A-Za-z_0-9]* /x, IDENT_KIND + + on %r/(L)?(")/, push(:string), groups(:modifier, :delimiter) + + on %r/ L?' (?: [^\'\n\\] | \\ #{ESCAPE} )? '? /x, :char + on %r/0[xX][0-9A-Fa-f]+/, :hex + on %r/(?:0[0-7]+)(?![89.eEfF])/, :octal + on %r/(?:\d+)(?![.eEfF])L?L?/, :integer + on %r/\d[fF]?|\d*\.\d+(?:[eE][+-]?\d+)?[fF]?|\d+[eE][+-]?\d+[fF]?/, :float + + skip %r! // [^\n\\]* (?: \\. [^\n\\]* )* | /\* (?: .*? \*/ | .* ) !mx, :comment + on %r/ \# \s* if \s* 0 /x, -> (match) { + match << scan_until(/ ^\# (?:elif|else|endif) .*? $ | \z /mx) unless eos? + }, :comment + on %r/ \# [ \t]* include\b /x, :preprocessor, set(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected), push(:include) + on %r/ \# [ \t]* \w* /x, :preprocessor, set(:in_preproc_line), set(:label_expected_before_preproc_line, :label_expected) + + on %r/\$/, :ident + end + + group_state :string do + on %r/[^\\\n"]+/, :content + on %r/ \\ (?: #{ESCAPE} | #{UNICODE_ESCAPE} ) /mx, :char + on %r/"/, :delimiter, pop + on %r/ \\ /x, pop, :error + on %r/ $ /x, pop + end + + state :include do + on %r/<[^>\n]+>?|"[^"\n\\]*(?:\\.[^"\n\\]*)*"?/, :include, pop + on %r/ \s*? \n \s* /x, :space, pop + on %r/\s+/, :space + otherwise pop + end + + protected + + def setup + super + + @label_expected = true + @case_expected = false + @label_expected_before_preproc_line = nil + @in_preproc_line = false + end + + def close_groups encoder, states + if states.last == :string + encoder.end_group :string + end + end + + end + +end +end diff --git a/lib/coderay/scanners/lua2b.rb b/lib/coderay/scanners/lua2b.rb new file mode 100644 index 0000000..9e2b1fe --- /dev/null +++ b/lib/coderay/scanners/lua2b.rb @@ -0,0 +1,157 @@ +# encoding: utf-8 + +module CodeRay +module Scanners + + # Scanner for the Lua[http://lua.org] programming lanuage. + # + # The language’s complete syntax is defined in + # {the Lua manual}[http://www.lua.org/manual/5.2/manual.html], + # which is what this scanner tries to conform to. + class Lua2 < RuleBasedScanner + + register_for :lua2 + file_extension 'lua' + title 'Lua' + + # Keywords used in Lua. + KEYWORDS = %w[and break do else elseif end + for function goto if in + local not or repeat return + then until while + ] + + # Constants set by the Lua core. + PREDEFINED_CONSTANTS = %w[false true nil] + + # The expressions contained in this array are parts of Lua’s `basic' + # library. Although it’s not entirely necessary to load that library, + # it is highly recommended and one would have to provide own implementations + # of some of these expressions if one does not do so. They however aren’t + # keywords, neither are they constants, but nearly predefined, so they + # get tagged as `predefined' rather than anything else. + # + # This list excludes values of form `_UPPERCASE' because the Lua manual + # requires such identifiers to be reserved by Lua anyway and they are + # highlighted directly accordingly, without the need for specific + # identifiers to be listed here. + PREDEFINED_EXPRESSIONS = %w[ + assert collectgarbage dofile error getmetatable + ipairs load loadfile next pairs pcall print + rawequal rawget rawlen rawset select setmetatable + tonumber tostring type xpcall + ] + + # Automatic token kind selection for normal words. + IDENT_KIND = CodeRay::WordList.new(:ident). + add(KEYWORDS, :keyword). + add(PREDEFINED_CONSTANTS, :predefined_constant). + add(PREDEFINED_EXPRESSIONS, :predefined) + + state :initial, :map => :map do + on %r/\-\-\[\=*\[/, push(:long_comment, :comment), :delimiter, #--[[ long (possibly multiline) comment ]] + set(:num_equals, -> (match) { match.count('=') }) # Number must match for comment end + on %r/--.*$/, :comment # --Lua comment + on %r/\[=*\[/, push(:long_string, :string), :delimiter, # [[ long (possibly multiline) string ]] + set(:num_equals, -> (match) { match.count('=') }) # Number must match for string end + on %r/::\s*[a-zA-Z_][a-zA-Z0-9_]+\s*::/, :label # ::goto_label:: + on %r/_[A-Z]+/, :predefined # _UPPERCASE are names reserved for Lua + on check_if { |brace_depth| brace_depth > 0 }, %r/([a-zA-Z_][a-zA-Z0-9_]*) (\s+)?(=)/x, groups(:key, :space, :operator) + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, kind { |match| IDENT_KIND[match] }, push_state { |match, kind| # Normal letters (or letters followed by digits) + # Extra highlighting for entities following certain keywords + if kind == :keyword && match == 'function' + :function_expected + elsif kind == :keyword && match == 'goto' + :goto_label_expected + elsif kind == :keyword && match == 'local' + :local_var_expected + end + } + + on %r/\{/, push(:map), kind { |brace_depth| brace_depth > 0 ? :inline_delimiter : :delimiter }, increment(:brace_depth) # Opening table brace { + on check_if { |brace_depth| brace_depth == 1 }, %r/\}/, :delimiter, pop, decrement(:brace_depth) # Closing table brace } + on check_if { |brace_depth| brace_depth == 0 }, %r/\}/, :error # Mismatched brace + on %r/\}/, :inline_delimiter, pop, decrement(:brace_depth) + + on %r/"/, push(:double_quoted_string, :string), :delimiter # String delimiters " and ' + on %r/'/, push(:single_quoted_string, :string), :delimiter + # ↓Prefix hex number ←|→ decimal number + on %r/-? (?:0x\h* \. \h+ (?:p[+\-]?\d+)? | \d*\.\d+ (?:e[+\-]?\d+)?)/ix, :float # hexadecimal constants have no E power, decimal ones no P power + # ↓Prefix hex number ←|→ decimal number + on %r/-? (?:0x\h+ (?:p[+\-]?\d+)? | \d+ (?:e[+\-]?\d+)?)/ix, :integer # hexadecimal constants have no E power, decimal ones no P power + on %r/[\+\-\*\/%^\#=~<>\(\)\[\]:;,] | \.(?!\d)/x, :operator # Operators + on %r/\s+/, :space # Space + end + + state :function_expected do + on %r/\(.*?\)/m, :operator, pop_state # x = function() # "Anonymous" function without explicit name + on %r/[a-zA-Z_] (?:[a-zA-Z0-9_\.] (?!\.\d))* [\.\:]/x, :ident # function tbl.subtbl.foo() | function tbl:foo() # Colon only allowed as last separator + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :function, pop_state # function foo() + on %r/\s+/, :space # Between the `function' keyword and the ident may be any amount of whitespace + end + + state :goto_label_expected do + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :label, pop_state + on %r/\s+/, :space # Between the `goto' keyword and the label may be any amount of whitespace + end + + state :local_var_expected do + on %r/function/, :keyword, pop_state, push_state(:function_expected) # local function ... + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :local_variable + on %r/,/, :operator + on %r/\=/, :operator, pop_state + on %r/\n/, :space, pop_state + on %r/\s+/, :space + end + + state :long_comment => :comment do + on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:comment) + on %r/.*/m, :error, pop(:comment) + end + + state :long_string => :string do + on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:string) # Long strings do not interpret any escape sequences + on %r/.*/m, :error, pop(:string) + end + + state :single_quoted_string => :string do + on %r/[^\\'\n]+/, :content # Everything except \ and the start delimiter character is string content (newlines are only allowed if preceeded by \ or \z) + on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char + on %r/'/, :delimiter, pop(:string) + on %r/\n/, :error, pop(:string) # Lua forbids unescaped newlines in normal non-long strings + # encoder.text_token("\\n\n", :error) # Visually appealing error indicator--otherwise users may wonder whether the highlighter cannot highlight multine strings + end + + state :double_quoted_string => :string do + on %r/[^\\"\n]+/, :content # Everything except \ and the start delimiter character is string content (newlines are only allowed if preceeded by \ or \z) + on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char + on %r/"/, :delimiter, pop(:string) + on %r/\n/, :error, pop(:string) # Lua forbids unescaped newlines in normal non-long strings + # encoder.text_token("\\n\n", :error) # Visually appealing error indicator--otherwise users may wonder whether the highlighter cannot highlight multine strings + end + + protected + + def setup + super + + @brace_depth = 0 + @num_equals = nil + end + + def close_groups encoder, states + states.reverse_each do |state| + case state + when :long_string, :single_quoted_string, :double_quoted_string + encoder.end_group :string + when :long_comment + encoder.end_group :long_comment + when :map + encoder.end_group :map + end + end + end + end + +end +end diff --git a/lib/coderay/scanners/lua3.rb b/lib/coderay/scanners/lua3.rb new file mode 100644 index 0000000..d2d4280 --- /dev/null +++ b/lib/coderay/scanners/lua3.rb @@ -0,0 +1,142 @@ +# encoding: utf-8 +# Pseudocode: states optionally define groups, comments removed, counter definition? + +module CodeRay +module Scanners + + # Scanner for the Lua[http://lua.org] programming lanuage. + # + # The language’s complete syntax is defined in + # {the Lua manual}[http://www.lua.org/manual/5.2/manual.html], + # which is what this scanner tries to conform to. + class Lua3 < RuleBasedScannerX + + register_for :lua3 + file_extension 'lua' + title 'Lua' + + # Keywords used in Lua. + KEYWORDS = %w[and break do else elseif end + for function goto if in + local not or repeat return + then until while + ] + + # Constants set by the Lua core. + PREDEFINED_CONSTANTS = %w[false true nil] + + # The expressions contained in this array are parts of Lua’s `basic' + # library. Although it’s not entirely necessary to load that library, + # it is highly recommended and one would have to provide own implementations + # of some of these expressions if one does not do so. They however aren’t + # keywords, neither are they constants, but nearly predefined, so they + # get tagged as `predefined' rather than anything else. + # + # This list excludes values of form `_UPPERCASE' because the Lua manual + # requires such identifiers to be reserved by Lua anyway and they are + # highlighted directly accordingly, without the need for specific + # identifiers to be listed here. + PREDEFINED_EXPRESSIONS = %w[ + assert collectgarbage dofile error getmetatable + ipairs load loadfile next pairs pcall print + rawequal rawget rawlen rawset select setmetatable + tonumber tostring type xpcall + ] + + # Automatic token kind selection for normal words. + IDENT_KIND = CodeRay::WordList.new(:ident). + add(KEYWORDS, :keyword). + add(PREDEFINED_CONSTANTS, :predefined_constant). + add(PREDEFINED_EXPRESSIONS, :predefined) + + protected + + # Scanner initialization. + def setup + super + @brace_depth = 0 + @num_equals = nil + end + + counter :brace_depth + + state :initial, :map => :map do + on %r/\-\-\[\=*\[/, push(:long_comment), :delimiter, set(:num_equals, -> (match) { match.count('=') }) + on %r/--.*$/, :comment + on %r/\[=*\[/, push(:long_string), :delimiter, set(:num_equals, -> (match) { match.count('=') }) + on %r/::\s*[a-zA-Z_][a-zA-Z0-9_]+\s*::/, :label + on %r/_[A-Z]+/, :predefined + on check_if(:brace_depth, :>, 0), %r/([a-zA-Z_][a-zA-Z0-9_]*) (\s+)?(=)/x, groups(:key, :space, :operator) + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, kind { |match| IDENT_KIND[match] }, push_state { |match, kind| + if kind == :keyword && match == 'function' + :function_expected + elsif kind == :keyword && match == 'goto' + :goto_label_expected + elsif kind == :keyword && match == 'local' + :local_var_expected + end + } + + on %r/\{/, push(:map), kind { |brace_depth| brace_depth > 0 ? :inline_delimiter : :delimiter }, increment(:brace_depth) + on check_if(:brace_depth, :==, 1), %r/\}/, :delimiter, pop, decrement(:brace_depth) + on check_if(:brace_depth, :==, 0), %r/\}/, :error + on %r/\}/, :inline_delimiter, pop, decrement(:brace_depth) + + on %r/"/, push(:double_quoted_string), :delimiter + on %r/'/, push(:single_quoted_string), :delimiter + + on %r/-? (?:0x\h* \. \h+ (?:p[+\-]?\d+)? | \d*\.\d+ (?:e[+\-]?\d+)?)/ix, :float + + on %r/-? (?:0x\h+ (?:p[+\-]?\d+)? | \d+ (?:e[+\-]?\d+)?)/ix, :integer + on %r/[\+\-\*\/%^\#=~<>\(\)\[\]:;,] | \.(?!\d)/x, :operator + on %r/\s+/, :space + end + + state :function_expected do + on %r/\(.*?\)/m, :operator, pop + on %r/[a-zA-Z_] (?:[a-zA-Z0-9_\.] (?!\.\d))* [\.\:]/x, :ident + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :function, pop + on %r/\s+/, :space + end + + state :goto_label_expected do + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :label, pop + on %r/\s+/, :space + end + + state :local_var_expected do + on %r/function/, :keyword, pop, push(:function_expected) + on %r/[a-zA-Z_][a-zA-Z0-9_]*/, :local_variable + on %r/,/, :operator + on %r/\=/, :operator, pop + on %r/\n/, :space, pop + on %r/\s+/, :space + end + + state :long_comment => :comment do + on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:comment) + on %r/.*/m, :error, pop(:comment) + end + + state :long_string => :string do + on pattern { |num_equals| %r/(.*?)(\]={#{num_equals}}\])/m }, groups(:content, :delimiter), pop(:string) + on %r/.*/m, :error, pop(:string) + end + + state :single_quoted_string => :string do + on %r/[^\\'\n]+/, :content + on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char + on %r/'/, :delimiter, pop(:string) + on %r/\n/, :error, pop(:string) + end + + state :double_quoted_string => :string do + on %r/[^\\"\n]+/, :content + on %r/\\(?:["'abfnrtv\\]|z\s*|x\h\h|\d{1,3}|\n)/m, :char + on %r/"/, :delimiter, pop(:string) + on %r/\n/, :error, pop(:string) + end + end + +end +end diff --git a/lib/coderay/scanners/lua4.rb b/lib/coderay/scanners/lua4.rb new file mode 100644 index 0000000..0315d34 --- /dev/null +++ b/lib/coderay/scanners/lua4.rb @@ -0,0 +1,89 @@ +# encoding: utf-8 + +module CodeRay +module Scanners + + # Scanner for the Lua[http://lua.org] programming lanuage. + # + # The language’s complete syntax is defined in + # {the Lua manual}[http://www.lua.org/manual/5.2/manual.html], + # which is what this scanner tries to conform to. + class Lua4 < RuleBasedScanner + + register_for :lua4 + file_extension 'lua' + title 'Lua' + + protected + + state :initial do + on %r'#!(.*?)$', :doctype + on %r//, push_state(:base) + end + + state :base do + on %r'--\[(=*)\[.*?\]\1\]'m, :comment + on %r'--.*$', :comment + + on %r'(\d*\.\d+|\d+\.\d*)(e[+-]?\d+)?'i, :float + on %r'\d+e[+-]?\d+'i, :float + on %r'0x[0-9a-f]*'i, :hex + on %r'\d+', :integer + + on %r'\n', :space + on %r'[^\S\n]', :space + # multiline strings + on %r'\[(=*)\[.*?\]\1\]'m, :string + + on %r'(==|~=|<=|>=|\.\.\.|\.\.|[=+\-*/%^<>#!.\\:])', :operator + on %r'[\[\]{}().,:;]', :operator + on %r'(and|or|not)\b', :operator + + on %r'(break|do|else|elseif|end|for|if|in|repeat|return|then|until|while)\b', :keyword + on %r'(local)\b', :keyword + on %r'(true|false|nil)\b', :predefined_constant + + on %r'(function)\b', :keyword, push_state(:funcname) + + on %r'[A-Za-z_]\w*(\.[A-Za-z_]\w*)?', :ident + + # on %r"'", :string, combined(:stringescape, :sqs) + on %r"'", :string, push_state(:sqs) + # on %r'"', :string, combined(:stringescape, :dqs) + on %r'"', :string, push_state(:dqs) + end + + state :funcname do + on %r'\s+', :space + on %r'(?:([A-Za-z_]\w*)(\.))?([A-Za-z_]\w*)', groups(:class, :operator, :function), pop_state + # inline function + on %r'\(', :operator, pop_state + end + + # if I understand correctly, every character is valid in a lua string, + # so this state is only for later corrections + # state :string do + # on %r'.', :string + # end + + # state :stringescape do + # on %r/\\([abfnrtv\\"']|\d{1,3})/, :escape + # end + + state :sqs do + on %r"'", :string, pop_state + # include(:string) + on %r/\\([abfnrtv\\"']|\d{1,3})/, :escape + on %r'.', :string + end + + state :dqs do + on %r'"', :string, pop_state + # include(:string) + on %r/\\([abfnrtv\\"']|\d{1,3})/, :escape + on %r'.', :string + end + end + +end +end diff --git a/lib/coderay/state_based_scanner.rb b/lib/coderay/state_based_scanner.rb new file mode 100644 index 0000000..b196adc --- /dev/null +++ b/lib/coderay/state_based_scanner.rb @@ -0,0 +1,394 @@ +require 'set' + +module CodeRay + module Scanners + class StateBasedScanner < Scanner + class State + attr_reader :names + attr_reader :rules + attr_reader :scanner + + def initialize scanner, names, &block + @scanner = scanner + @names = names + + @rules = [] + @check = nil + + instance_eval(&block) + end + + def rules_code + <<-RUBY +when #{names.map(&:inspect).join(', ')} +#{rules.map.with_index { |rule, index| rule.code(first: index.zero?) }.join} + else + puts "no match for \#{state.inspect} => skip character" if $DEBUG + encoder.text_token getch, :error + end + + RUBY + end + + protected + + # structure + def check *conditions, &block + return @check unless conditions.any? || block + raise "Can't nest check yet" if @check + + @check = Conditions.new(conditions) + instance_eval(&block) + @check = nil + end + + # rules + def on pattern, *actions, &block + @rules << Rule.new(self, pattern, *actions, check: @check, &block) + end + + def skip pattern, *actions, &block + @rules << Rule.new(self, pattern, *actions, check: @check, skip: true, &block) + end + + def otherwise *actions, &block + @rules << Rule.new(self, //, *actions, check: @check, skip: true, &block) + end + + # actions + def push state + Push.new(state) + end + + def pop + Pop.new + end + + def kind token_kind = nil, &block + Kind.new token_kind || scanner.callback(block) + end + + def groups *token_kinds + Groups.new(token_kinds) + end + + def set target, value = nil, &block + Setter.new target, value || block || true + end + + def callback block + scanner.callback(block) + end + + # magic flag getters + def method_missing method, *args, &block + method_name = method.to_s + if method_name.end_with?('?') + Getter.new(scanner.variable(method_name.chomp('?'))) + else + super + end + end + end + + class GroupState < State + end + + class Rule + attr_reader :pattern + attr_reader :actions + attr_reader :check + attr_reader :state + + def initialize state, pattern, *actions, check:, skip: false, &block + @state = state + @pattern = (skip ? Skip : Scan).new(pattern) + @actions = *build_actions(actions, block) + @check = check + + raise [pattern, *actions, check, skip, block].inspect if check == false + end + + def code first: + <<-RUBI + #{'els' unless first}if #{condition_expression} +#{actions_code.gsub(/^/, ' ' * 2)} + RUBI + end + + def skip? + @pattern.is_a?(Skip) + end + + protected + + def condition_expression + [check, pattern].compact.map(&:code).join(' && ') + end + + def actions_code + actions.map(&:code).join("\n") + end + + def build_actions actions, block + actions += [block] if block + + actions.map do |action| + case action + when Symbol + Token.new(action) + when Proc + state.instance_eval do + callback action + end + when WordList + state.instance_eval do + kind { |match| action[match] } + end + when Push, Pop, Groups, Kind, Setter + action + else + raise "Don't know how to build action for %p (%p)" % [action, action.class] + end + end + end + end + + # conditions + class Conditions < Struct.new(:conditions) + def code + "#{conditions.map(&:code).join(' && ')}" + end + end + + class Scan < Struct.new(:pattern) + def code + "match = scan(#{pattern.inspect})" + end + end + + class Skip < Scan + end + + class Getter < Struct.new(:name, :negative) + def code + "#{negative && '!'}#{name}" + end + + def !@ + negative + end + + protected + + def negative + @negative ||= Getter.new(name, :negative) + end + end + + # actions + class Push < Struct.new :state + def code + "push" + end + end + + class Pop < Class.new + def code + "pop" + end + end + + class Groups < Struct.new(:token_kinds) + def code + "groups" + end + end + + class Setter < Struct.new(:name, :value) + def code + "set" + end + end + + + class Kind < Struct.new(:token_kind) + def code + case token_kind + when Callback + "encoder.text_token match, kind = #{token_kind.code}\n" + else + raise "I don't know how to evaluate this kind: %p" % [token_kind] + end + end + end + + class Token < Struct.new(:name) + def code + "encoder.text_token match, #{name.inspect}" + end + end + + class Callback < Struct.new(:name, :block) + def code + if parameter_names.empty? + name + else + "#{name}(#{parameter_names.join(', ')})" + end + end + + protected + + def parameter_names + block.parameters.map(&:last) + end + end + + class << self + def states + @states ||= {} + end + + def scan_tokens tokens, options + self.class.define_scan_tokens! + + scan_tokens tokens, options + end + + def define_scan_tokens! + if ENV['PUTS'] + puts CodeRay.scan(scan_tokens_code, :ruby).terminal + puts "callbacks: #{callbacks.size}" + end + + class_eval scan_tokens_code + end + + def variable name + variables << name.to_sym + + name + end + + def callback block + return unless block + + callback_name = name_for_callback(block) + callbacks[callback_name] = define_method(callback_name, &block) + block.parameters.map(&:last).each { |name| variable name } + + Callback.new(callback_name, block) + end + + protected + + def state *names, state_class: State, &block + state_class.new(self, names, &block).tap do |state| + for name in names + states[name] = state + end + end + end + + def group_state *names, &block + state(*names, state_class: GroupState, &block) + end + + def callbacks + @callbacks ||= {} + end + + def variables + @variables ||= Set.new + end + + def additional_variables + variables - %i(encoder options state states match kind) + end + + def name_for_callback block + base_name = "__callback_line_#{block.source_location.last}" + callback_name = base_name + counter = 'a' + + while callbacks.key?(callback_name) + callback_name = "#{base_name}_#{counter}" + counter.succ! + end + + callback_name + end + + def scan_tokens_code + <<-"RUBY" + def scan_tokens encoder, options + state = options[:state] || @state + +#{ restore_local_variables_code.chomp.gsub(/^/, ' ' * 3) } + + states = [state] + + until eos? + case state +#{ states_code.chomp.gsub(/^/, ' ' * 4) } + else + raise_inspect 'Unknown state: %p' % [state], encoder + end + end + + if options[:keep_state] + @state = state + end + +#{ close_groups_code.chomp.gsub(/^/, ' ' * 3) } + + encoder + end + RUBY + end + + def states_code + states.values.map(&:rules_code).join + end + + def restore_local_variables_code + additional_variables.sort.map { |name| "#{name} = @#{name}" }.join("\n") + end + + def close_groups_code + "close_groups(encoder, states)" + end + end + + def scan_tokens tokens, options + self.class.define_scan_tokens! + + scan_tokens tokens, options + end + + protected + + def setup + @state = :initial + reset_expectations + end + + def close_groups encoder, states + # TODO + end + + def expect kind + @expected = kind + end + + def expected? kind + @expected == kind + end + + def reset_expectations + @expected = nil + end + end + end +end |