summaryrefslogtreecommitdiff
path: root/lib/coderay/scanners/python.rb
blob: 09c8b6e70d0d7cb35fe44fc1d3d2aba912276cfa (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
module CodeRay
module Scanners
  
  # Scanner for Python. Supports Python 3.
  # 
  # Based on pygments' PythonLexer, see
  # http://dev.pocoo.org/projects/pygments/browser/pygments/lexers/agile.py.
  class Python < Scanner
    
    register_for :python
    file_extension 'py'
    
    # Reserved words of the language. Every entry is emitted as :keyword.
    KEYWORDS = [
      'and', 'as', 'assert', 'break', 'class', 'continue', 'def',
      'del', 'elif', 'else', 'except', 'finally', 'for',
      'from', 'global', 'if', 'import', 'in', 'is', 'lambda', 'not',
      'or', 'pass', 'raise', 'return', 'try', 'while', 'with', 'yield',
      'nonlocal',  # new in Python 3
    ]  # :nodoc:
    
    # Words that were statements in Python 2 only. The scanner decides per
    # occurrence whether to show them as a keyword or as a plain identifier.
    OLD_KEYWORDS = [
      'exec', 'print',  # gone in Python 3
    ]  # :nodoc:
    
    # Built-in functions and types (Python 2 and 3 names mixed).
    PREDEFINED_METHODS_AND_TYPES = %w[
      __import__ abs all any apply basestring bin bool buffer
      bytearray bytes callable chr classmethod cmp coerce compile
      complex delattr dict dir divmod enumerate eval execfile exit
      file filter float frozenset getattr globals hasattr hash hex id
      input int intern isinstance issubclass iter len list locals
      long map max min next object oct open ord pow property range
      raw_input reduce reload repr reversed round set setattr slice
      sorted staticmethod str sum super tuple type unichr unicode
      vars xrange zip
    ]  # :nodoc:
    
    # Built-in exception and warning classes.
    #
    # NotImplemented is deliberately absent: it is a built-in constant, not
    # an exception, and is listed in PREDEFINED_VARIABLES_AND_CONSTANTS.
    # Listing it here as well made its token kind depend on the order of the
    # +add+ calls that build IDENT_KIND.
    PREDEFINED_EXCEPTIONS = %w[
      ArithmeticError AssertionError AttributeError
      BaseException DeprecationWarning EOFError EnvironmentError
      Exception FloatingPointError FutureWarning GeneratorExit IOError
      ImportError ImportWarning IndentationError IndexError KeyError
      KeyboardInterrupt LookupError MemoryError NameError
      NotImplementedError OSError OverflowError
      OverflowWarning PendingDeprecationWarning ReferenceError
      RuntimeError RuntimeWarning StandardError StopIteration
      SyntaxError SyntaxWarning SystemError SystemExit TabError
      TypeError UnboundLocalError UnicodeDecodeError
      UnicodeEncodeError UnicodeError UnicodeTranslateError
      UnicodeWarning UserWarning ValueError Warning ZeroDivisionError
    ]  # :nodoc:
    
    # Built-in constants plus the conventional +self+ receiver name.
    PREDEFINED_VARIABLES_AND_CONSTANTS = [
      'False', 'True', 'None',  # "keywords" since Python 3
      'self', 'Ellipsis', 'NotImplemented',
    ]  # :nodoc:
    
    # Lookup table from an identifier to its token kind.
    # NOTE(review): WordList is defined elsewhere; presumably the constructor
    # argument (:ident) is the kind returned for unlisted words and a later
    # +add+ overrides an earlier one -- so no word should be in two lists.
    IDENT_KIND = WordList.new(:ident).
      add(KEYWORDS, :keyword).
      add(OLD_KEYWORDS, :old_keyword).
      add(PREDEFINED_METHODS_AND_TYPES, :predefined).
      add(PREDEFINED_VARIABLES_AND_CONSTANTS, :predefined_constant).
      add(PREDEFINED_EXCEPTIONS, :exception)  # :nodoc:
    
    # An identifier: a letter or underscore followed by any number of word
    # characters.
    NAME = / [[:alpha:]_] \w* /x  # :nodoc:
    # The part of a simple escape sequence that follows the backslash: one of
    # the listed single characters (including a literal newline), x plus one
    # or two hex digits, or one to three octal digits.
    ESCAPE = / [abfnrtv\n\\'"] | x[a-fA-F0-9]{1,2} | [0-7]{1,3} /x  # :nodoc:
    # The part of a Unicode escape that follows the backslash: u plus 4 hex
    # digits, U plus 8 hex digits, or N{...} with a name made of word
    # characters, spaces and hyphens.
    UNICODE_ESCAPE =  / u[a-fA-F0-9]{4} | U[a-fA-F0-9]{8} | N\{[-\w ]+\} /x  # :nodoc:
    
    # Operators and delimiters recognised by scan_tokens in the :initial
    # state. Longer alternatives are listed before their prefixes.
    OPERATOR = /
      \.\.\. |          # ellipsis
      \.(?!\d) |        # dot but not decimal point
      [,;:()\[\]{}] |   # simple delimiters
      \/\/=? | \*\*=? | # special math
      [-+*\/%&|^]=? |   # ordinary math and binary logic
      [~`] |            # binary complement and inspection
      <<=? | >>=? | [<>=]=? | !=  # comparison and assignment
    /x  # :nodoc:
    
    # Self-filling cache: string delimiter => Regexp that matches exactly
    # that delimiter. Entries are created on first lookup.
    STRING_DELIMITER_REGEXP = Hash.new { |h, delimiter|
      h[delimiter] = Regexp.union delimiter  # :nodoc:
    }
    
    # Self-filling cache: string delimiter => Regexp for a run of plain
    # string content. The run is the shortest non-empty stretch without
    # backslash or newline that is followed by a backslash, an end of line,
    # or the delimiter itself.
    STRING_CONTENT_REGEXP = Hash.new { |h, delimiter|
      h[delimiter] = / [^\\\n]+? (?= \\ | $ | #{Regexp.escape(delimiter)} ) /x  # :nodoc:
    }
    
    # Scanner state to enter after a keyword has been seen: def, import/from
    # and class each announce a specially highlighted name.
    # NOTE(review): presumably WordList.new(:initial) makes :initial the
    # result for every other word -- confirm against WordList.
    DEF_NEW_STATE = WordList.new(:initial).
      add(%w(def), :def_expected).
      add(%w(import from), :include_expected).
      add(%w(class), :class_expected)  # :nodoc:
    
    # What may follow import/from: a dotted chain of names, or a lone
    # asterisk.
    DESCRIPTOR = /
      #{NAME}
      (?: \. #{NAME} )*
      | \*
    /x  # :nodoc:
    
    # Lookahead for a docstring: optional spaces or tabs, an optional
    # lowercase u and/or r prefix, then a triple quote.
    DOCSTRING_COMING = /
      [ \t]* u?r? ("""|''')
    /x  # :nodoc:
    
  protected
    
    # Tokenizes the Python source held by this scanner and streams the
    # result to +encoder+.
    #
    # The method is a small state machine:
    # :initial::          ordinary code
    # :string::           inside a string literal (a token group is open)
    # :def_expected::     right after +def+; the next name is a :method
    # :class_expected::   right after +class+; the next name is a :class
    # :include_expected:: inside an import statement
    #
    # encoder:: object receiving +text_token+, +begin_group+ and +end_group+
    # options:: accepted for interface compatibility; not consulted here
    #
    # Returns +encoder+.
    def scan_tokens encoder, options
      
      state = :initial
      string_delimiter = nil
      string_raw = false
      string_type = nil
      # A triple-quoted string at the very start of the input is a docstring.
      docstring_coming = match?(/#{DOCSTRING_COMING}/o)
      last_token_dot = false
      unicode = string.respond_to?(:encoding) && string.encoding.name == 'UTF-8'
      # Keywords seen so far in the current import statement
      # (:from, :import, :as), oldest first.
      from_import_state = []
      
      until eos?
        
        if state == :string
          if match = scan(STRING_DELIMITER_REGEXP[string_delimiter])
            encoder.text_token match, :delimiter
            encoder.end_group string_type
            string_type = nil
            state = :initial
            next
          elsif string_delimiter.size == 3 && match = scan(/\n/)
            # Only triple-quoted strings may span lines.
            encoder.text_token match, :content
          elsif match = scan(STRING_CONTENT_REGEXP[string_delimiter])
            encoder.text_token match, :content
          elsif !string_raw && match = scan(/ \\ #{ESCAPE} /ox)
            encoder.text_token match, :char
          elsif match = scan(/ \\ #{UNICODE_ESCAPE} /ox)
            encoder.text_token match, :char
          elsif match = scan(/ \\ . /x)
            # Unknown escape, or any escape inside a raw string.
            encoder.text_token match, :content
          elsif match = scan(/ \\ | $ /x)
            # Line ended inside a single-line string (or a lone backslash is
            # left over): close the group and resume normal scanning.
            encoder.end_group string_type
            string_type = nil
            encoder.text_token match, :error unless match.empty?
            state = :initial
          else
            raise_inspect "else case \" reached; %p not handled." % peek(1), encoder, state
          end
        
        elsif match = scan(/ [ \t]+ | \\?\n /x)
          encoder.text_token match, :space
          # A backslash-newline is a line continuation and does not end the
          # statement, hence the exact comparison.
          if match == "\n"
            if state == :include_expected
              # The import statement is over. Forget its keywords too, so
              # they neither pile up over the whole file nor leak into the
              # next import statement.
              state = :initial
              from_import_state = []
            end
            docstring_coming = true if match?(/#{DOCSTRING_COMING}/o)
          end
          next
        
        elsif match = scan(/ \# [^\n]* /mx)
          encoder.text_token match, :comment
          next
        
        elsif state == :initial
          
          if match = scan(/#{OPERATOR}/o)
            encoder.text_token match, :operator
          
          elsif match = scan(/(u?r?|b)?("""|"|'''|')/i)
            modifiers = self[1]
            string_delimiter = self[2]
            # The prefix is matched case-insensitively, so it has to be
            # inspected case-insensitively as well (R'..', B'..', Ur'..').
            prefix = modifiers.downcase
            string_type = docstring_coming ? :docstring : (prefix == 'b' ? :binary : :string)
            docstring_coming = false if docstring_coming
            encoder.begin_group string_type
            string_raw = false
            unless modifiers.empty?
              string_raw = prefix.include?('r')
              # Emit the prefix exactly as written in the source.
              encoder.text_token modifiers, :modifier
              match = string_delimiter
            end
            state = :string
            encoder.text_token match, :delimiter
          
          # TODO: backticks
          
          elsif match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o)
            kind = IDENT_KIND[match]
            # TODO: keyword arguments
            # Anything after a dot is an attribute, never a keyword/builtin.
            kind = :ident if last_token_dot
            if kind == :old_keyword
              # print(...) / exec(...) are plain function calls.
              kind = check(/\(/) ? :ident : :keyword
            elsif kind == :predefined && check(/ *=(?!=)/)
              # A builtin name that is being assigned to (or used as a
              # keyword argument) is shown as an ordinary identifier.
              # The lookahead keeps comparisons such as "len == 3" out.
              kind = :ident
            elsif kind == :keyword
              state = DEF_NEW_STATE[match]
              from_import_state << match.to_sym if state == :include_expected
            end
            encoder.text_token match, kind
          
          elsif match = scan(/@[a-zA-Z0-9_.]+[lL]?/)
            encoder.text_token match, :decorator
          
          elsif match = scan(/0[xX][0-9A-Fa-f]+[lL]?/)
            encoder.text_token match, :hex
          
          elsif match = scan(/0[bB][01]+[lL]?/)
            encoder.text_token match, :binary
          
          elsif match = scan(/(?:\d*\.\d+|\d+\.\d*)(?:[eE][+-]?\d+)?|\d+[eE][+-]?\d+/)
            if scan(/[jJ]/)
              match << matched
              encoder.text_token match, :imaginary
            else
              encoder.text_token match, :float
            end
          
          elsif match = scan(/0[oO][0-7]+|0[0-7]+(?![89.eE])[lL]?/)
            encoder.text_token match, :octal
          
          elsif match = scan(/\d+([lL])?/)
            # A long suffix (l/L) rules out the imaginary suffix.
            if self[1] == nil && scan(/[jJ]/)
              match << matched
              encoder.text_token match, :imaginary
            else
              encoder.text_token match, :integer
            end
          
          else
            encoder.text_token getch, :error
          
          end
            
        elsif state == :def_expected
          state = :initial
          if match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o)
            encoder.text_token match, :method
          else
            next
          end
        
        elsif state == :class_expected
          state = :initial
          if match = scan(unicode ? /#{NAME}/uo : /#{NAME}/o)
            encoder.text_token match, :class
          else
            next
          end
          
        elsif state == :include_expected
          if match = scan(unicode ? /#{DESCRIPTOR}/uo : /#{DESCRIPTOR}/o)
            if match == 'as'
              encoder.text_token match, :keyword
              from_import_state << :as
            elsif from_import_state.first == :from && match == 'import'
              encoder.text_token match, :keyword
              from_import_state << :import
            elsif from_import_state.last == :as
              # The alias that follows "as".
              # encoder.text_token match, match[0,1][unicode ? /[[:upper:]]/u : /[[:upper:]]/] ? :class : :method
              encoder.text_token match, :ident
              from_import_state.pop
            elsif IDENT_KIND[match] == :keyword
              # Some other keyword: put it back and let the :initial state
              # handle it.
              unscan
              match = nil
              state = :initial
              next
            else
              encoder.text_token match, :include
            end
          elsif match = scan(/,/)
            from_import_state.pop if from_import_state.last == :as
            encoder.text_token match, :operator
          else
            # Anything else ends the import statement.
            from_import_state = []
            state = :initial
            next
          end
          
        else
          raise_inspect 'Unknown state', encoder, state
          
        end
        
        last_token_dot = match == '.'
        
      end
      
      # Input ended inside a string: close the group that is still open.
      if state == :string
        encoder.end_group string_type
      end
      
      encoder
    end
    
  end
  
end
end