Patch to allow Jgments to pull out the raw regexes easier (#477)

author: Tim Hatch <tim@timhatch.com> 2010-12-21 08:07:13 -0800
committer: Tim Hatch <tim@timhatch.com> 2010-12-21 08:07:13 -0800
commit: 132b2b954b9ae62b6c895bf48109de16ce2e1b1b (patch)
tree: d0725d2a9a23c7437688e1da888cf0b97aca5cd9 /pygments/lexer.py
parent: 89c859f902df3f7f3a3a7696e78d5b5dc78b4ce9 (diff)
download: pygments-132b2b954b9ae62b6c895bf48109de16ce2e1b1b.tar.gz
1 files changed, 56 insertions, 38 deletions
diff --git a/pygments/lexer.py b/pygments/lexer.py
index fbcc39a6..a22768a3 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -349,7 +349,53 @@ class RegexLexerMeta(LexerMeta):
     self.tokens on the first instantiation.
     """
 
+    def _process_regex(cls, regex, rflags):
+        """Preprocess the regular expression component of a token definition."""
+        return re.compile(regex, rflags).match
+
+    def _process_token(cls, token):
+        """Preprocess the token component of a token definition."""
+        assert type(token) is _TokenType or callable(token), \
+               'token type must be simple type or callable, not %r' % (token,)
+        return token
+
+    def _process_new_state(cls, new_state, unprocessed, processed):
+        """Preprocess the state transition action of a token definition."""
+        if isinstance(new_state, str):
+            # an existing state
+            if new_state == '#pop':
+                return -1
+            elif new_state in unprocessed:
+                return (new_state,)
+            elif new_state == '#push':
+                return new_state
+            elif new_state[:5] == '#pop:':
+                return -int(new_state[5:])
+            else:
+                assert False, 'unknown new state %r' % new_state
+        elif isinstance(new_state, combined):
+            # combine a new state from existing ones
+            tmp_state = '_tmp_%d' % cls._tmpname
+            cls._tmpname += 1
+            itokens = []
+            for istate in new_state:
+                assert istate != new_state, 'circular state ref %r' % istate
+                itokens.extend(cls._process_state(unprocessed,
+                                                  processed, istate))
+            processed[tmp_state] = itokens
+            return (tmp_state,)
+        elif isinstance(new_state, tuple):
+            # push more than one state
+            for istate in new_state:
+                assert (istate in unprocessed or
+                        istate in ('#pop', '#push')), \
+                       'unknown new state ' + istate
+            return new_state
+        else:
+            assert False, 'unknown new state def %r' % new_state
+
     def _process_state(cls, unprocessed, processed, state):
+        """Preprocess a single state definition."""
         assert type(state) is str, "wrong state name %r" % state
         assert state[0] != '#', "invalid state name %r" % state
         if state in processed:
@@ -360,60 +406,31 @@ class RegexLexerMeta(LexerMeta):
             if isinstance(tdef, include):
                 # it's a state reference
                 assert tdef != state, "circular state reference %r" % state
-                tokens.extend(cls._process_state(unprocessed, processed, str(tdef)))
+                tokens.extend(cls._process_state(unprocessed, processed,
+                                                 str(tdef)))
                 continue
 
             assert type(tdef) is tuple, "wrong rule def %r" % tdef
 
             try:
-                rex = re.compile(tdef[0], rflags).match
+                rex = cls._process_regex(tdef[0], rflags)
             except Exception, err:
                 raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                  (tdef[0], state, cls, err))
 
-            assert type(tdef[1]) is _TokenType or callable(tdef[1]), \
-                   'token type must be simple type or callable, not %r' % (tdef[1],)
+            token = cls._process_token(tdef[1])
 
             if len(tdef) == 2:
                 new_state = None
             else:
-                tdef2 = tdef[2]
-                if isinstance(tdef2, str):
-                    # an existing state
-                    if tdef2 == '#pop':
-                        new_state = -1
-                    elif tdef2 in unprocessed:
-                        new_state = (tdef2,)
-                    elif tdef2 == '#push':
-                        new_state = tdef2
-                    elif tdef2[:5] == '#pop:':
-                        new_state = -int(tdef2[5:])
-                    else:
-                        assert False, 'unknown new state %r' % tdef2
-                elif isinstance(tdef2, combined):
-                    # combine a new state from existing ones
-                    new_state = '_tmp_%d' % cls._tmpname
-                    cls._tmpname += 1
-                    itokens = []
-                    for istate in tdef2:
-                        assert istate != state, 'circular state ref %r' % istate
-                        itokens.extend(cls._process_state(unprocessed,
-                                                          processed, istate))
-                    processed[new_state] = itokens
-                    new_state = (new_state,)
-                elif isinstance(tdef2, tuple):
-                    # push more than one state
-                    for state in tdef2:
-                        assert (state in unprocessed or
-                                state in ('#pop', '#push')), \
-                               'unknown new state ' + state
-                    new_state = tdef2
-                else:
-                    assert False, 'unknown new state def %r' % tdef2
-            tokens.append((rex, tdef[1], new_state))
+                new_state = cls._process_new_state(tdef[2],
+                                                   unprocessed, processed)
+
+            tokens.append((rex, token, new_state))
         return tokens
 
     def process_tokendef(cls, name, tokendefs=None):
+        """Preprocess a dictionary of token definitions."""
         processed = cls._all_tokens[name] = {}
         tokendefs = tokendefs or cls.tokens[name]
         for state in tokendefs.keys():
@@ -421,6 +438,7 @@ class RegexLexerMeta(LexerMeta):
         return processed
 
     def __call__(cls, *args, **kwds):
+        """Instantiate cls after preprocessing its token definitions."""
         if not hasattr(cls, '_tokens'):
             cls._all_tokens = {}
             cls._tmpname = 0
author	Tim Hatch <tim@timhatch.com>	2010-12-21 08:07:13 -0800
committer	Tim Hatch <tim@timhatch.com>	2010-12-21 08:07:13 -0800
commit	132b2b954b9ae62b6c895bf48109de16ce2e1b1b (patch)
tree	d0725d2a9a23c7437688e1da888cf0b97aca5cd9 /pygments/lexer.py
parent	89c859f902df3f7f3a3a7696e78d5b5dc78b4ce9 (diff)
download	pygments-132b2b954b9ae62b6c895bf48109de16ce2e1b1b.tar.gz