From 01f0085bada41394198bd7ac753f280a516675da Mon Sep 17 00:00:00 2001 From: Andy Li Date: Sat, 8 Dec 2012 00:28:44 +0800 Subject: fixed #822 (remove BOM if present) --- pygments/lexer.py | 1 + 1 file changed, 1 insertion(+) (limited to 'pygments/lexer.py') diff --git a/pygments/lexer.py b/pygments/lexer.py index ad2c72d1..6f466a77 100644 --- a/pygments/lexer.py +++ b/pygments/lexer.py @@ -164,6 +164,7 @@ class Lexer(object): else: text = text.decode(self.encoding) # text now *is* a unicode string + text = text.lstrip(u'\xef\xbb\xbf\ufeff') # remove BOM text = text.replace('\r\n', '\n') text = text.replace('\r', '\n') if self.stripall: -- cgit v1.2.1 From a0fb320fd7264c3804fd846277f47d40b9013282 Mon Sep 17 00:00:00 2001 From: Andy Li Date: Wed, 12 Dec 2012 18:11:34 +0800 Subject: Remove BOM when the input is unicode. --- pygments/lexer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'pygments/lexer.py') diff --git a/pygments/lexer.py b/pygments/lexer.py index 6f466a77..2280a250 100644 --- a/pygments/lexer.py +++ b/pygments/lexer.py @@ -163,8 +163,11 @@ class Lexer(object): text = decoded else: text = text.decode(self.encoding) + else: + if text.startswith(u'\ufeff'): + text = text[len(u'\ufeff'):] + # text now *is* a unicode string - text = text.lstrip(u'\xef\xbb\xbf\ufeff') # remove BOM text = text.replace('\r\n', '\n') text = text.replace('\r', '\n') if self.stripall: -- cgit v1.2.1 From 3c081914779ce96aff16717742f522a50af97c66 Mon Sep 17 00:00:00 2001 From: Alastair Houghton Date: Wed, 19 Dec 2012 13:34:13 +0000 Subject: Added support for inheritance to RegexLexer, so that subclasses can selectively inherit tokendefs from their superclasses. Used this new ability to simplify and unify the C family languages in compiled.py, and to add support for Objective-C++. Also added code to support autodetection of language for .h files, with the default being C if no content is provided, since Objective-C uses the .h file extension. --- pygments/lexer.py | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 4 deletions(-) (limited to 'pygments/lexer.py') diff --git a/pygments/lexer.py b/pygments/lexer.py index ad2c72d1..8535d086 100644 --- a/pygments/lexer.py +++ b/pygments/lexer.py @@ -8,7 +8,7 @@ :copyright: Copyright 2006-2012 by the Pygments team, see AUTHORS. :license: BSD, see LICENSE for details. """ -import re +import re, itertools from pygments.filter import apply_filters, Filter from pygments.filters import get_filter_by_name @@ -18,7 +18,7 @@ from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \ __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer', - 'LexerContext', 'include', 'bygroups', 'using', 'this'] + 'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this'] _encoding_map = [('\xef\xbb\xbf', 'utf-8'), @@ -81,6 +81,9 @@ class Lexer(object): #: mime types mimetypes = [] + #: Priority, should multiple lexers match and no content is provided + priority = 0 + __metaclass__ = LexerMeta def __init__(self, **options): @@ -237,6 +240,14 @@ class include(str): """ pass +class _inherit(object): + """ + Indicates the a state should inherit from its superclass. + """ + def __repr__(self): + return 'inherit' + +inherit = _inherit() class combined(tuple): """ @@ -428,7 +439,10 @@ class RegexLexerMeta(LexerMeta): tokens.extend(cls._process_state(unprocessed, processed, str(tdef))) continue - + if isinstance(tdef, _inherit): + # processed already + continue + assert type(tdef) is tuple, "wrong rule def %r" % tdef try: @@ -456,6 +470,54 @@ class RegexLexerMeta(LexerMeta): cls._process_state(tokendefs, processed, state) return processed + def get_tokendefs(cls): + """ + Merge tokens from superclasses in MRO order, returning a single + tokendef dictionary. + + Any state that is not defined by a subclass will be inherited + automatically. States that *are* defined by subclasses will, by + default, override that state in the superclass. If a subclass + wishes to inherit definitions from a superclass, it can use the + special value "inherit", which will cause the superclass' state + definition to be included at that point in the state. + """ + tokens = {} + inheritable = {} + for c in itertools.chain((cls,), cls.__mro__): + toks = c.__dict__.get('tokens', {}) + + for state, items in toks.iteritems(): + curitems = tokens.get(state) + if curitems is None: + tokens[state] = items + + try: + inherit_ndx = items.index(inherit) + except: + inherit_ndx = -1 + + if inherit_ndx != -1: + inheritable[state] = inherit_ndx + continue + + inherit_ndx = inheritable.pop(state, None) + if inherit_ndx is None: + continue + + # Replace the "inherit" value with the items + curitems[inherit_ndx:inherit_ndx+1] = items + + try: + new_inh_ndx = items.index(inherit) + except: + new_inh_ndx = -1 + + if new_inh_ndx != -1: + inheritable[state] = inherit_ndx + new_inh_ndx + + return tokens + def __call__(cls, *args, **kwds): """Instantiate cls after preprocessing its token definitions.""" if '_tokens' not in cls.__dict__: @@ -465,7 +527,7 @@ class RegexLexerMeta(LexerMeta): # don't process yet pass else: - cls._tokens = cls.process_tokendef('', cls.tokens) + cls._tokens = cls.process_tokendef('', cls.get_tokendefs()) return type.__call__(cls, *args, **kwds) -- cgit v1.2.1 From e594d501a73f5255db2f919cf841996103b115b4 Mon Sep 17 00:00:00 2001 From: Alastair Houghton Date: Thu, 3 Jan 2013 12:10:25 +0000 Subject: Remove the unnecessary analyse_text methods, reduce the default priorities. Also remove some trailing whitespace and tighten-up a couple of except clauses. --- pygments/lexer.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'pygments/lexer.py') diff --git a/pygments/lexer.py b/pygments/lexer.py index 8535d086..b8cf69fb 100644 --- a/pygments/lexer.py +++ b/pygments/lexer.py @@ -442,7 +442,7 @@ class RegexLexerMeta(LexerMeta): if isinstance(tdef, _inherit): # processed already continue - + assert type(tdef) is tuple, "wrong rule def %r" % tdef try: @@ -494,30 +494,29 @@ class RegexLexerMeta(LexerMeta): try: inherit_ndx = items.index(inherit) - except: - inherit_ndx = -1 - - if inherit_ndx != -1: - inheritable[state] = inherit_ndx + except ValueError: + continue + + inheritable[state] = inherit_ndx continue - + inherit_ndx = inheritable.pop(state, None) if inherit_ndx is None: continue # Replace the "inherit" value with the items curitems[inherit_ndx:inherit_ndx+1] = items - + try: new_inh_ndx = items.index(inherit) - except: + except ValueError: new_inh_ndx = -1 - + if new_inh_ndx != -1: inheritable[state] = inherit_ndx + new_inh_ndx return tokens - + def __call__(cls, *args, **kwds): """Instantiate cls after preprocessing its token definitions.""" if '_tokens' not in cls.__dict__: -- cgit v1.2.1 From 317a3c772b237d98b409a35628e2c4cf5ea0a738 Mon Sep 17 00:00:00 2001 From: Georg Brandl Date: Wed, 9 Jan 2013 10:25:07 +0100 Subject: Whitespace cleanup. --- pygments/lexer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'pygments/lexer.py') diff --git a/pygments/lexer.py b/pygments/lexer.py index 2280a250..7cb212d5 100644 --- a/pygments/lexer.py +++ b/pygments/lexer.py @@ -164,9 +164,9 @@ class Lexer(object): else: text = text.decode(self.encoding) else: - if text.startswith(u'\ufeff'): + if text.startswith(u'\ufeff'): text = text[len(u'\ufeff'):] - + # text now *is* a unicode string text = text.replace('\r\n', '\n') text = text.replace('\r', '\n') @@ -698,4 +698,3 @@ def do_insertions(insertions, tokens): except StopIteration: insleft = False break # not strictly necessary - -- cgit v1.2.1 From d61e25e529a78887c8364126f7f73a2e6651a1e6 Mon Sep 17 00:00:00 2001 From: Georg Brandl Date: Wed, 9 Jan 2013 12:46:32 +0100 Subject: Happy new year 2013. --- pygments/lexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pygments/lexer.py') diff --git a/pygments/lexer.py b/pygments/lexer.py index 7cb212d5..1de2d065 100644 --- a/pygments/lexer.py +++ b/pygments/lexer.py @@ -5,7 +5,7 @@ Base lexer classes. - :copyright: Copyright 2006-2012 by the Pygments team, see AUTHORS. + :copyright: Copyright 2006-2013 by the Pygments team, see AUTHORS. :license: BSD, see LICENSE for details. """ import re -- cgit v1.2.1