summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--builder.py50
-rw-r--r--test/test_encoding.py62
2 files changed, 108 insertions, 4 deletions
diff --git a/builder.py b/builder.py
index f838ba80..5c6e5710 100644
--- a/builder.py
+++ b/builder.py
@@ -27,7 +27,7 @@ at the same time.
__docformat__ = "restructuredtext en"
-import sys
+import sys, re
from os.path import splitext, basename, dirname, exists, abspath
from logilab.common.modutils import modpath_from_file
@@ -42,6 +42,42 @@ from _ast import PyCF_ONLY_AST
def parse(string):
return compile(string, "<string>", 'exec', PyCF_ONLY_AST)
+_ENCODING_RGX = re.compile("[^#]*#*.*coding[:=]\s*([^\s]+)")
+
+def _guess_encoding(string):
+ """get encoding from a python file as string or return None if not found
+ """
+ # check for UTF-8 byte-order mark
+ if string.startswith('\xef\xbb\xbf'):
+ return 'UTF-8'
+ for line in string.split('\n', 2)[:2]:
+ # check for encoding declaration
+ match = _ENCODING_RGX.match(line)
+ if match is not None:
+ return match.group(1)
+
def get_data(filename):
    """get data for parsing a file

    The file is opened in universal newline mode and read entirely.

    :param filename: path of the python source file to read
    :return:
      a (stream, encoding, data) tuple: the file object, left open, the
      encoding guessed from the content (None if not found) and the content
      itself
    """
    source_file = open(filename, 'U')
    content = source_file.read()
    # the stream is handed back to the caller, hence not closed here
    return source_file, _guess_encoding(content), content
+
if sys.version_info >= (3, 0):
    from tokenize import detect_encoding

    def get_data(filename):
        """get data for parsing a file (python 3 version)

        The encoding is detected on the raw bytes with
        `tokenize.detect_encoding`, then the file is read again as text
        using that encoding.

        :param filename: path of the python source file to read
        :return:
          a (stream, encoding, data) tuple: the text file object, left
          open, the detected encoding and the decoded content
        :raise ASTNGBuildingException:
          if the content can not be decoded using the detected encoding
        """
        # the binary stream is only needed for detection: close it right
        # away instead of leaking the file descriptor
        byte_stream = open(filename, 'rb')
        try:
            encoding = detect_encoding(byte_stream.readline)[0]
        finally:
            byte_stream.close()
        # text mode already translates newlines; the 'U' flag is deprecated
        # in python 3 and rejected by recent versions
        stream = open(filename, 'r', encoding=encoding)
        try:
            data = stream.read()
        except UnicodeError: # wrong encoding
            # nothing will be returned to the caller, don't leak the stream
            stream.close()
            # detect_encoding returns utf-8 if no encoding specified
            msg = 'Wrong (%s) or no encoding specified' % encoding
            raise ASTNGBuildingException(msg)
        return stream, encoding, data
+
# ast NG builder ##############################################################
MANAGER = ASTNGManager()
@@ -75,10 +111,14 @@ class ASTNGBuilder(InspectBuilder):
path is expected to be a python source file
"""
try:
- data = open(path, 'U').read()
- except IOError, ex:
- msg = 'Unable to load file %r (%s)' % (path, ex)
+ file_stream, encoding, data = get_data(path)
+ except IOError, exc:
+ msg = 'Unable to load file %r (%s)' % (path, exc)
raise ASTNGBuildingException(msg)
+ except SyntaxError, exc: # py3k encoding specification error
+ raise ASTNGBuildingException(exc)
+ except LookupError, l_ex: # unknown encoding
+ raise ASTNGBuildingException(l_ex)
# get module name if necessary, *before modifying sys.path*
if modname is None:
try:
@@ -91,6 +131,8 @@ class ASTNGBuilder(InspectBuilder):
node = self.string_build(data, modname, path)
finally:
sys.path.pop(0)
+ node.file_encoding = encoding
+ node.file_stream = file_stream
return node
def string_build(self, data, modname='', path=None):
diff --git a/test/test_encoding.py b/test/test_encoding.py
new file mode 100644
index 00000000..1ce710b9
--- /dev/null
+++ b/test/test_encoding.py
@@ -0,0 +1,62 @@
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later
+# version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+""" Copyright (c) 2003-2005 LOGILAB S.A. (Paris, FRANCE).
+ http://www.logilab.fr/ -- mailto:contact@logilab.fr
+
+Check source code is ascii only or has an encoding declaration (PEP 263)
+"""
+
+__revision__ = '$Id: test_encoding.py,v 1.6 2005-11-02 09:22:04 syt Exp $'
+
+from logilab.common.testlib import TestCase, unittest_main
+import sys
+from logilab.astng.builder import _guess_encoding as guess_encoding
+
class TestGuessEncoding(TestCase):
    """Check the encoding guessed from the beginning of a python source"""

    def _check(self, expectations):
        # compare the guessed encoding of each source with the expected one,
        # in order, stopping at the first mismatch
        for source, expected in expectations:
            self.failUnlessEqual(guess_encoding(source), expected)

    def testEmacs(self):
        # emacs style declaration: found on the first or second line,
        # ignored on the third one
        self._check((
            ('# -*- coding: UTF-8 -*-', 'UTF-8'),
            ('# -*- coding:UTF-8 -*-', 'UTF-8'),
            ('\n ### -*- coding: ISO-8859-1 -*-\n ', 'ISO-8859-1'),
            ('\n\n ### -*- coding: ISO-8859-1 -*-\n ', None),
        ))

    def testVim(self):
        # vim style declaration: found on the first or second line,
        # ignored on the third one
        self._check((
            ('# vim:fileencoding=UTF-8', 'UTF-8'),
            ('\n ### vim:fileencoding=ISO-8859-1\n ', 'ISO-8859-1'),
            ('\n\n ### vim:fileencoding= ISO-8859-1\n ', None),
        ))

    def testUTF8(self):
        # the UTF-8 byte-order mark only counts at the very beginning
        self._check((
            ('\xef\xbb\xbf any UTF-8 data', 'UTF-8'),
            (' any UTF-8 data \xef\xbb\xbf', None),
        ))
+
+if __name__ == '__main__':
+ unittest_main()