diff options
-rw-r--r-- | builder.py | 50 | ||||
-rw-r--r-- | test/test_encoding.py | 62 |
2 files changed, 108 insertions, 4 deletions
@@ -27,7 +27,7 @@ at the same time. __docformat__ = "restructuredtext en" -import sys +import sys, re from os.path import splitext, basename, dirname, exists, abspath from logilab.common.modutils import modpath_from_file @@ -42,6 +42,42 @@ from _ast import PyCF_ONLY_AST def parse(string): return compile(string, "<string>", 'exec', PyCF_ONLY_AST) +_ENCODING_RGX = re.compile("[^#]*#*.*coding[:=]\s*([^\s]+)") + +def _guess_encoding(string): + """get encoding from a python file as string or return None if not found + """ + # check for UTF-8 byte-order mark + if string.startswith('\xef\xbb\xbf'): + return 'UTF-8' + for line in string.split('\n', 2)[:2]: + # check for encoding declaration + match = _ENCODING_RGX.match(line) + if match is not None: + return match.group(1) + +def get_data(filename): + """get data for parsing a file""" + stream = open(filename, 'U') + data = stream.read() + encoding = _guess_encoding(data) + return stream, encoding, data + +if sys.version_info >= (3, 0): + from tokenize import detect_encoding + + def get_data(filename): + byte_stream = open(filename, 'bU') + encoding = detect_encoding(byte_stream.readline)[0] + stream = open(filename, 'U', encoding=encoding) + try: + data = stream.read() + except UnicodeError: # wrong encoding + # detect_encoding returns utf-8 if no encoding specified + msg = 'Wrong (%s) or no encoding specified' % encoding + raise ASTNGBuildingException(msg) + return stream, encoding, data + # ast NG builder ############################################################## MANAGER = ASTNGManager() @@ -75,10 +111,14 @@ class ASTNGBuilder(InspectBuilder): path is expected to be a python source file """ try: - data = open(path, 'U').read() - except IOError, ex: - msg = 'Unable to load file %r (%s)' % (path, ex) + file_stream, encoding, data = get_data(path) + except IOError, exc: + msg = 'Unable to load file %r (%s)' % (path, exc) raise ASTNGBuildingException(msg) + except SyntaxError, exc: # py3k encoding specification error + raise ASTNGBuildingException(exc) + except LookupError, l_ex: # unknown encoding + raise ASTNGBuildingException(l_ex) # get module name if necessary, *before modifying sys.path* if modname is None: try: @@ -91,6 +131,8 @@ class ASTNGBuilder(InspectBuilder): node = self.string_build(data, modname, path) finally: sys.path.pop(0) + node.file_encoding = encoding + node.file_stream = file_stream return node def string_build(self, data, modname='', path=None): diff --git a/test/test_encoding.py b/test/test_encoding.py new file mode 100644 index 00000000..1ce710b9 --- /dev/null +++ b/test/test_encoding.py @@ -0,0 +1,62 @@ +# This program is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any later +# version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along with +# this program; if not, write to the Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +""" Copyright (c) 2003-2005 LOGILAB S.A. (Paris, FRANCE). + http://www.logilab.fr/ -- mailto:contact@logilab.fr + +Check source code is ascii only or has an encoding declaration (PEP 263) +""" + +__revision__ = '$Id: test_encoding.py,v 1.6 2005-11-02 09:22:04 syt Exp $' + +from logilab.common.testlib import TestCase, unittest_main +import sys +from logilab.astng.builder import _guess_encoding as guess_encoding + +class TestGuessEncoding(TestCase): + + def testEmacs(self): + e = guess_encoding('# -*- coding: UTF-8 -*-') + self.failUnlessEqual(e, 'UTF-8') + e = guess_encoding('# -*- coding:UTF-8 -*-') + self.failUnlessEqual(e, 'UTF-8') + e = guess_encoding(''' + ### -*- coding: ISO-8859-1 -*- + ''') + self.failUnlessEqual(e, 'ISO-8859-1') + e = guess_encoding(''' + + ### -*- coding: ISO-8859-1 -*- + ''') + self.failUnlessEqual(e, None) + + def testVim(self): + e = guess_encoding('# vim:fileencoding=UTF-8') + self.failUnlessEqual(e, 'UTF-8') + e = guess_encoding(''' + ### vim:fileencoding=ISO-8859-1 + ''') + self.failUnlessEqual(e, 'ISO-8859-1') + e = guess_encoding(''' + + ### vim:fileencoding= ISO-8859-1 + ''') + self.failUnlessEqual(e, None) + + def testUTF8(self): + e = guess_encoding('\xef\xbb\xbf any UTF-8 data') + self.failUnlessEqual(e, 'UTF-8') + e = guess_encoding(' any UTF-8 data \xef\xbb\xbf') + self.failUnlessEqual(e, None) + +if __name__ == '__main__': + unittest_main() |