py3k: handle file encoding and stream definition in builder

In Py3k, we can no longer do "data = open(path, 'U').read()", since we need to know the encoding in order to read the file (otherwise we get a UnicodeError); instead, we first have to guess the encoding by reading the file as a byte stream. We then set "node.file_encoding = encoding" and "node.file_stream = file_stream" on the built node, to be used by the Pylint checkers.
author: Emile Anclin <emile.anclin@logilab.fr> 2010-11-22 15:56:26 +0100
committer: Emile Anclin <emile.anclin@logilab.fr> 2010-11-22 15:56:26 +0100
commit: fb09403ab49b207da3aa9a9b3f738af0085c031e (patch)
tree: f21ec6acfc8dd1e765650495145aca1c4a7690d1 /builder.py
parent: b8f49d042a6c3a5cda196dc76e04fd9f479957fd (diff)
download: astroid-git-fb09403ab49b207da3aa9a9b3f738af0085c031e.tar.gz
1 files changed, 46 insertions, 4 deletions
diff --git a/builder.py b/builder.py
index f838ba80..5c6e5710 100644
--- a/builder.py
+++ b/builder.py
@@ -27,7 +27,7 @@ at the same time.
 
 __docformat__ = "restructuredtext en"
 
-import sys
+import sys, re
 from os.path import splitext, basename, dirname, exists, abspath
 
 from logilab.common.modutils import modpath_from_file
@@ -42,6 +42,42 @@ from _ast import PyCF_ONLY_AST
 def parse(string):
     return compile(string, "<string>", 'exec', PyCF_ONLY_AST)
 
+_ENCODING_RGX = re.compile("[^#]*#*.*coding[:=]\s*([^\s]+)")
+
+def _guess_encoding(string):
+    """get encoding from a python file as string or return None if not found
+    """
+    # check for UTF-8 byte-order mark
+    if string.startswith('\xef\xbb\xbf'):
+        return 'UTF-8'
+    for line in string.split('\n', 2)[:2]:
+        # check for encoding declaration
+        match = _ENCODING_RGX.match(line)
+        if match is not None:
+            return match.group(1)
+
+def get_data(filename):
+    """get data for parsing a file"""
+    stream = open(filename, 'U')
+    data = stream.read()
+    encoding = _guess_encoding(data)
+    return stream, encoding, data
+
+if sys.version_info >= (3, 0):
+    from tokenize import detect_encoding
+
+    def get_data(filename):
+        byte_stream = open(filename, 'bU')
+        encoding = detect_encoding(byte_stream.readline)[0]
+        stream = open(filename, 'U', encoding=encoding)
+        try:
+            data = stream.read()
+        except UnicodeError: # wrong encoding
+            # detect_encoding returns utf-8 if no encoding specified
+            msg = 'Wrong (%s) or no encoding specified' % encoding
+            raise ASTNGBuildingException(msg)
+        return stream, encoding, data
+
 # ast NG builder ##############################################################
 
 MANAGER = ASTNGManager()
@@ -75,10 +111,14 @@ class ASTNGBuilder(InspectBuilder):
         path is expected to be a python source file
         """
         try:
-            data = open(path, 'U').read()
-        except IOError, ex:
-            msg = 'Unable to load file %r (%s)' % (path, ex)
+            file_stream, encoding, data = get_data(path)
+        except IOError, exc:
+            msg = 'Unable to load file %r (%s)' % (path, exc)
             raise ASTNGBuildingException(msg)
+        except SyntaxError, exc: # py3k encoding specification error
+            raise ASTNGBuildingException(exc)
+        except LookupError, l_ex: # unknown encoding
+            raise ASTNGBuildingException(l_ex)
         # get module name if necessary, *before modifying sys.path*
         if modname is None:
             try:
@@ -91,6 +131,8 @@ class ASTNGBuilder(InspectBuilder):
             node = self.string_build(data, modname, path)
         finally:
             sys.path.pop(0)
+        node.file_encoding = encoding
+        node.file_stream = file_stream
         return node
 
     def string_build(self, data, modname='', path=None):
author	Emile Anclin <emile.anclin@logilab.fr>	2010-11-22 15:56:26 +0100
committer	Emile Anclin <emile.anclin@logilab.fr>	2010-11-22 15:56:26 +0100
commit	fb09403ab49b207da3aa9a9b3f738af0085c031e (patch)
tree	f21ec6acfc8dd1e765650495145aca1c4a7690d1 /builder.py
parent	b8f49d042a6c3a5cda196dc76e04fd9f479957fd (diff)
download	astroid-git-fb09403ab49b207da3aa9a9b3f738af0085c031e.tar.gz