Remove BOM when the input is unicode.

author: Andy Li <andy@onthewings.net> 2012-12-12 18:11:34 +0800
committer: Andy Li <andy@onthewings.net> 2012-12-12 18:11:34 +0800
commit: a0fb320fd7264c3804fd846277f47d40b9013282 (patch)
tree: 72f3db02e15c2e5dbc3f80469f2351134c9ddfb8
parent: d523209a898076e118144e6260de6bdb1778c1a4 (diff)
download: pygments-a0fb320fd7264c3804fd846277f47d40b9013282.tar.gz
2 files changed, 6 insertions, 2 deletions
diff --git a/pygments/lexer.py b/pygments/lexer.py
index 6f466a77..2280a250 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -163,8 +163,11 @@ class Lexer(object):
                 text = decoded
             else:
                 text = text.decode(self.encoding)
+        else:
+            if text.startswith(u'\ufeff'): 
+                text = text[len(u'\ufeff'):]
+        
         # text now *is* a unicode string
-        text = text.lstrip(u'\xef\xbb\xbf\ufeff') # remove BOM
         text = text.replace('\r\n', '\n')
         text = text.replace('\r', '\n')
         if self.stripall:
diff --git a/tests/test_examplefiles.py b/tests/test_examplefiles.py
index 1d3515df..a938ebaa 100644
--- a/tests/test_examplefiles.py
+++ b/tests/test_examplefiles.py
@@ -54,11 +54,12 @@ def check_lexer(lx, absfn, outfn):
         text = fp.read()
     finally:
         fp.close()
-    text = text.lstrip(u'\xef\xbb\xbf\ufeff') #remove BOM
     text = text.replace(b('\r\n'), b('\n'))
     text = text.strip(b('\n')) + b('\n')
     try:
         text = text.decode('utf-8')
+        if text.startswith(u'\ufeff'):
+            text = text[len(u'\ufeff'):]
     except UnicodeError:
         text = text.decode('latin1')
     ntext = []
author	Andy Li <andy@onthewings.net>	2012-12-12 18:11:34 +0800
committer	Andy Li <andy@onthewings.net>	2012-12-12 18:11:34 +0800
commit	a0fb320fd7264c3804fd846277f47d40b9013282 (patch)
tree	72f3db02e15c2e5dbc3f80469f2351134c9ddfb8
parent	d523209a898076e118144e6260de6bdb1778c1a4 (diff)
download	pygments-a0fb320fd7264c3804fd846277f47d40b9013282.tar.gz