diff options
author | Tim Hatch <tim@timhatch.com> | 2010-12-21 08:07:13 -0800 |
---|---|---|
committer | Tim Hatch <tim@timhatch.com> | 2010-12-21 08:07:13 -0800 |
commit | 132b2b954b9ae62b6c895bf48109de16ce2e1b1b (patch) | |
tree | d0725d2a9a23c7437688e1da888cf0b97aca5cd9 /pygments/lexer.py | |
parent | 89c859f902df3f7f3a3a7696e78d5b5dc78b4ce9 (diff) | |
download | pygments-132b2b954b9ae62b6c895bf48109de16ce2e1b1b.tar.gz |
Patch to allow Jgments to pull out the raw regexes easier (#477)
Diffstat (limited to 'pygments/lexer.py')
-rw-r--r-- | pygments/lexer.py | 94 |
1 files changed, 56 insertions, 38 deletions
diff --git a/pygments/lexer.py b/pygments/lexer.py index fbcc39a6..a22768a3 100644 --- a/pygments/lexer.py +++ b/pygments/lexer.py @@ -349,7 +349,53 @@ class RegexLexerMeta(LexerMeta): self.tokens on the first instantiation. """ + def _process_regex(cls, regex, rflags): + """Preprocess the regular expression component of a token definition.""" + return re.compile(regex, rflags).match + + def _process_token(cls, token): + """Preprocess the token component of a token definition.""" + assert type(token) is _TokenType or callable(token), \ + 'token type must be simple type or callable, not %r' % (token,) + return token + + def _process_new_state(cls, new_state, unprocessed, processed): + """Preprocess the state transition action of a token definition.""" + if isinstance(new_state, str): + # an existing state + if new_state == '#pop': + return -1 + elif new_state in unprocessed: + return (new_state,) + elif new_state == '#push': + return new_state + elif new_state[:5] == '#pop:': + return -int(new_state[5:]) + else: + assert False, 'unknown new state %r' % new_state + elif isinstance(new_state, combined): + # combine a new state from existing ones + tmp_state = '_tmp_%d' % cls._tmpname + cls._tmpname += 1 + itokens = [] + for istate in new_state: + assert istate != new_state, 'circular state ref %r' % istate + itokens.extend(cls._process_state(unprocessed, + processed, istate)) + processed[tmp_state] = itokens + return (tmp_state,) + elif isinstance(new_state, tuple): + # push more than one state + for istate in new_state: + assert (istate in unprocessed or + istate in ('#pop', '#push')), \ + 'unknown new state ' + istate + return new_state + else: + assert False, 'unknown new state def %r' % new_state + def _process_state(cls, unprocessed, processed, state): + """Preprocess a single state definition.""" assert type(state) is str, "wrong state name %r" % state assert state[0] != '#', "invalid state name %r" % state if state in processed: @@ -360,60 +406,31 @@ class RegexLexerMeta(LexerMeta): if isinstance(tdef, include): # it's a state reference assert tdef != state, "circular state reference %r" % state - tokens.extend(cls._process_state(unprocessed, processed, str(tdef))) + tokens.extend(cls._process_state(unprocessed, processed, + str(tdef))) continue assert type(tdef) is tuple, "wrong rule def %r" % tdef try: - rex = re.compile(tdef[0], rflags).match + rex = cls._process_regex(tdef[0], rflags) except Exception, err: raise ValueError("uncompilable regex %r in state %r of %r: %s" % (tdef[0], state, cls, err)) - assert type(tdef[1]) is _TokenType or callable(tdef[1]), \ - 'token type must be simple type or callable, not %r' % (tdef[1],) + token = cls._process_token(tdef[1]) if len(tdef) == 2: new_state = None else: - tdef2 = tdef[2] - if isinstance(tdef2, str): - # an existing state - if tdef2 == '#pop': - new_state = -1 - elif tdef2 in unprocessed: - new_state = (tdef2,) - elif tdef2 == '#push': - new_state = tdef2 - elif tdef2[:5] == '#pop:': - new_state = -int(tdef2[5:]) - else: - assert False, 'unknown new state %r' % tdef2 - elif isinstance(tdef2, combined): - # combine a new state from existing ones - new_state = '_tmp_%d' % cls._tmpname - cls._tmpname += 1 - itokens = [] - for istate in tdef2: - assert istate != state, 'circular state ref %r' % istate - itokens.extend(cls._process_state(unprocessed, - processed, istate)) - processed[new_state] = itokens - new_state = (new_state,) - elif isinstance(tdef2, tuple): - # push more than one state - for state in tdef2: - assert (state in unprocessed or - state in ('#pop', '#push')), \ - 'unknown new state ' + state - new_state = tdef2 - else: - assert False, 'unknown new state def %r' % tdef2 - tokens.append((rex, tdef[1], new_state)) + new_state = cls._process_new_state(tdef[2], + unprocessed, processed) + + tokens.append((rex, token, new_state)) return tokens def process_tokendef(cls, name, tokendefs=None): + """Preprocess a dictionary of token definitions.""" processed = cls._all_tokens[name] = {} tokendefs = tokendefs or cls.tokens[name] for state in tokendefs.keys(): @@ -421,6 +438,7 @@ class RegexLexerMeta(LexerMeta): return processed def __call__(cls, *args, **kwds): + """Instantiate cls after preprocessing its token definitions.""" if not hasattr(cls, '_tokens'): cls._all_tokens = {} cls._tmpname = 0 |