summaryrefslogtreecommitdiff
path: root/builder.py
diff options
context:
space:
mode:
author: Emile Anclin <emile.anclin@logilab.fr> 2010-11-22 15:56:26 +0100
committer: Emile Anclin <emile.anclin@logilab.fr> 2010-11-22 15:56:26 +0100
commit: 0bd84938bb037b70cee901157880df4bf9dd7f87 (patch)
tree: afa1fa5ac8116ed3d5996fe5329061cffdc9452c /builder.py
parent: 5d3f2a5255224c15c316d0854fd9db2b41a201a0 (diff)
download: astroid-git-0bd84938bb037b70cee901157880df4bf9dd7f87.tar.gz
py3k: handle file encoding and stream definition in builder
In Py3k, we can't do "data = open(path, 'U').read()" anymore, since we need to know the encoding to read the file (otherwise we get a UnicodeError); instead, we first have to guess the encoding by reading the file as a byte stream. We then define "node.file_encoding = encoding" and "node.file_stream = file_stream", to be used by the Pylint checkers.
Diffstat (limited to 'builder.py')
-rw-r--r--builder.py53
1 files changed, 49 insertions, 4 deletions
diff --git a/builder.py b/builder.py
index f744d607..1b850d8d 100644
--- a/builder.py
+++ b/builder.py
@@ -47,6 +47,45 @@ from _ast import PyCF_ONLY_AST
def parse(string):
return compile(string, "<string>", 'exec', PyCF_ONLY_AST)
+if sys.version_info >= (3, 0):
+ from tokenize import detect_encoding
+
+ def open_source_file(filename):
+ byte_stream = open(filename, 'bU')
+ encoding = detect_encoding(byte_stream.readline)[0]
+ stream = open(filename, 'U', encoding=encoding)
+ try:
+ data = stream.read()
+ except UnicodeError, uex: # wrong encodingg
+ # detect_encoding returns utf-8 if no encoding specified
+ msg = 'Wrong (%s) or no encoding specified' % encoding
+ raise ASTNGBuildingException(msg)
+ return stream, encoding, data
+
+else:
+ import re
+
+ _ENCODING_RGX = re.compile("[^#]*#*.*coding[:=]\s*([^\s]+)")
+
+ def _guess_encoding(string):
+ """get encoding from a python file as string or return None if not found
+ """
+ # check for UTF-8 byte-order mark
+ if string.startswith('\xef\xbb\xbf'):
+ return 'UTF-8'
+ for line in string.split('\n', 2)[:2]:
+ # check for encoding declaration
+ match = _ENCODING_RGX.match(line)
+ if match is not None:
+ return match.group(1)
+
+ def open_source_file(filename):
+ """get data for parsing a file"""
+ stream = open(filename, 'U')
+ data = stream.read()
+ encoding = _guess_encoding(data)
+ return stream, encoding, data
+
# ast NG builder ##############################################################
MANAGER = ASTNGManager()
@@ -101,10 +140,14 @@ class ASTNGBuilder:
path is expected to be a python source file
"""
try:
- data = open(path, 'U').read()
- except IOError, ex:
- msg = 'Unable to load file %r (%s)' % (path, ex)
+ stream, encoding, data = open_source_file(path)
+ except IOError, exc:
+ msg = 'Unable to load file %r (%s)' % (path, exc)
raise ASTNGBuildingException(msg)
+ except SyntaxError, exc: # py3k encoding specification error
+ raise ASTNGBuildingException(exc)
+ except LookupError, exc: # unknown encoding
+ raise ASTNGBuildingException(exc)
# get module name if necessary, *before modifying sys.path*
if modname is None:
try:
@@ -113,10 +156,12 @@ class ASTNGBuilder:
modname = splitext(basename(path))[0]
# build astng representation
try:
- sys.path.insert(0, dirname(path))
+ sys.path.insert(0, dirname(path)) # XXX (syt) iirk
node = self.string_build(data, modname, path)
finally:
sys.path.pop(0)
+ node.file_encoding = encoding
+ node.file_stream = stream
return node
def string_build(self, data, modname='', path=None):