Sorted out string/unicode problems, updated the unicode test, and renamed the assertEquals method calls to assertEqual (assertEquals is deprecated in py3)

author: Félix Mattrat <mattr.felix@gmail.com> 2016-04-15 12:06:53 +0200
committer: Félix Mattrat <mattr.felix@gmail.com> 2016-04-15 12:06:53 +0200
commit: 9adaa031887b5511508fd2ee2b1f60680a788a90 (patch)
tree: f58c9bf5ef8e23f1f784226662d30482754fcf81 /fastimport
parent: ee90b2a6afeebe24da17f5545eda7f0676867cf4 (diff)
download: python-fastimport-git-9adaa031887b5511508fd2ee2b1f60680a788a90.tar.gz
7 files changed, 75 insertions, 36 deletions
diff --git a/fastimport/commands.py b/fastimport/commands.py
index 575c304..e4f2bf3 100644
--- a/fastimport/commands.py
+++ b/fastimport/commands.py
@@ -19,13 +19,19 @@ These objects are used by the parser to represent the content of
 a fast-import stream.
 """
 from __future__ import division
+
 from past.utils import old_div
 from past.builtins import basestring
+from future.utils import PY2
+
 from builtins import object
+from builtins import str as _text
 
 
+import sys
 import stat
 
+
 # There is a bug in git 1.5.4.3 and older by which unquoting a string consumes
 # one extra character. Set this variable to True to work-around it. It only
 # happens when renaming a file whose name contains spaces and/or quotes, and
@@ -175,7 +181,9 @@ class CommitCommand(ImportCommand):
             if use_features and self.more_authors:
                 for author in self.more_authors:
                     author_section += "\nauthor %s" % format_who_when(author)
+
         committer = "committer %s" % format_who_when(self.committer)
+
         if self.message is None:
             msg_section = ""
         else:
@@ -421,7 +429,6 @@ def check_path(path):
     if path is None or path == '' or path[0] == "/":
         raise ValueError("illegal path '%s'" % path)
     if not isinstance(path, basestring):
-        import ipdb;ipdb.set_trace()
         raise TypeError("illegale type for path '%r'" % path)
     return path
 
@@ -452,24 +459,36 @@ def format_who_when(fields):
     offset_minutes = old_div(offset, 60) - offset_hours * 60
     offset_str = "%s%02d%02d" % (offset_sign, offset_hours, offset_minutes)
     name = fields[0]
+
     if name == '':
         sep = ''
     else:
         sep = ' '
-    if isinstance(name, basestring):
+
+    if isinstance(name, basestring) and PY2:
         name = name.encode('utf8')
+
     email = fields[1]
-    if isinstance(email, basestring):
+
+    if isinstance(email, basestring) and PY2:
         email = email.encode('utf8')
+
     result = "%s%s<%s> %d %s" % (name, sep, email, fields[2], offset_str)
+
     return result
 
 
 def format_property(name, value):
     """Format the name and value (both unicode) of a property as a string."""
-    utf8_name = name.encode('utf8')
+    utf8_name = name
+
+    if PY2:
+        utf8_name = name.encode('utf8')
+
     if value is not None:
-        utf8_value = value.encode('utf8')
+        utf8_value = value
+        if PY2:
+            utf8_value = value.encode('utf8')
         result = "property %s %d %s" % (utf8_name, len(utf8_value), utf8_value)
     else:
         result = "property %s" % (utf8_name,)
diff --git a/fastimport/parser.py b/fastimport/parser.py
index f44c963..1d8cbe6 100644
--- a/fastimport/parser.py
+++ b/fastimport/parser.py
@@ -158,17 +158,17 @@ The grammar is:
   not_lf  ::= # Any byte that is not ASCII newline (LF);
 """
 from __future__ import print_function
-from __future__ import unicode_literals
 from future import standard_library
 standard_library.install_aliases()
 from builtins import map
 from builtins import object
-from builtins import str
+from builtins import str as _text
 
 
 import collections
 import re
 import sys
+import codecs
 
 from fastimport import (
     commands,
@@ -574,7 +574,7 @@ class ImportParser(LineBasedParser):
             if still_to_read > 0:
                 read_bytes = self.read_bytes(still_to_read)
                 value += "\n" + read_bytes[:still_to_read - 1]
-            value = value.decode('utf8')
+            value = _text(value)
         return (name, value)
 
     def _path(self, s):
@@ -621,11 +621,26 @@ class ImportParser(LineBasedParser):
             self.abort(errors.BadFormat, 'filemodify', 'mode', s)
 
 
+ESCAPE_SEQUENCE_RE = re.compile(r'''
+    ( \\U........      # 8-digit hex escapes
+    | \\u....          # 4-digit hex escapes
+    | \\x..            # 2-digit hex escapes
+    | \\[0-7]{1,3}     # Octal escapes
+    | \\N\{[^}]+\}     # Unicode characters by name
+    | \\[\\'"abfnrtv]  # Single-character escapes
+    )''', re.UNICODE | re.VERBOSE
+)
+
 def _unquote_c_string(s):
     """replace C-style escape sequences (\n, \", etc.) with real chars."""
-    # HACK: Python strings are close enough
-    #s = str(s)
-    #import ipdb;ipdb.set_trace()
-    return s.decode('string_escape', 'replace')
+
+    # doing a s.encode('utf-8').decode('unicode_escape') can return an
+    # incorrect output with unicode string (both in py2 and py3) the safest way
+    # is to match the escape sequences and decoding them alone.
+    def decode_match(match):
+        return codecs.decode(match.group(0), 'unicode-escape')
+
+    return ESCAPE_SEQUENCE_RE.sub(decode_match, s)
+
 
 Authorship = collections.namedtuple('Authorship', 'name email timestamp timezone')
diff --git a/fastimport/processor.py b/fastimport/processor.py
index 3b601f5..c2b9374 100644
--- a/fastimport/processor.py
+++ b/fastimport/processor.py
@@ -30,12 +30,13 @@ See git-fast-import.1 for the meaning of each command and the
 processors package for examples.
 """
 from __future__ import absolute_import
+
 from builtins import object
 
 import sys
 import time
 
-from . import errors
+from fastimport import errors
 
 
 class ImportProcessor(object):
@@ -151,7 +152,7 @@ class ImportProcessor(object):
 
 class CommitHandler(object):
     """Base class for commit handling.
-    
+
     Subclasses should override the pre_*, post_* and *_handler
     methods as appropriate.
     """
diff --git a/fastimport/tests/test_commands.py b/fastimport/tests/test_commands.py
index 2139f8c..1037c22 100644
--- a/fastimport/tests/test_commands.py
+++ b/fastimport/tests/test_commands.py
@@ -16,6 +16,7 @@
 """Test how Commands are displayed"""
 from future import standard_library
 standard_library.install_aliases()
+from future.utils import PY2
 from builtins import map
 from unittest import TestCase
 
@@ -61,21 +62,24 @@ class TestCommitDisplay(TestCase):
     def test_commit_unicode_committer(self):
         # user tuple is (name, email, secs-since-epoch, secs-offset-from-utc)
         name = u'\u013d\xf3r\xe9m \xcdp\u0161\xfam'
-        name_utf8 = name.encode('utf8')
+
+        commit_utf8 = (
+            u"commit refs/heads/master\n"
+            u"mark :bbb\n"
+            u"committer %s <test@example.com> 1234567890 -0600\n"
+            u"data 12\n"
+            u"release v1.0\n"
+            u"from :aaa" % (name,)
+        )
+
+        if PY2:
+            commit_utf8 = commit_utf8.encode('utf8')
+
         committer = (name, 'test@example.com', 1234567890, -6 * 3600)
         c = commands.CommitCommand("refs/heads/master", "bbb", None, committer,
             "release v1.0", ":aaa", None, None)
-        try:
-            self.assertEqual(
-                "commit refs/heads/master\n"
-                "mark :bbb\n"
-                "committer %s <test@example.com> 1234567890 -0600\n"
-                "data 12\n"
-                "release v1.0\n"
-                "from :aaa" % (name_utf8,),
-                repr(c))
-        except UnicodeEncodeError:
-            import ipdb;ipdb.set_trace()
+
+        self.assertEqual(commit_utf8, repr(c))
 
     def test_commit_no_mark(self):
         # user tuple is (name, email, secs-since-epoch, secs-offset-from-utc)
diff --git a/fastimport/tests/test_dates.py b/fastimport/tests/test_dates.py
index f893da9..5209540 100644
--- a/fastimport/tests/test_dates.py
+++ b/fastimport/tests/test_dates.py
@@ -24,11 +24,11 @@ from fastimport import (
 class ParseTzTests(TestCase):
 
     def test_parse_tz_utc(self):
-        self.assertEquals(0, dates.parse_tz("+0000"))
-        self.assertEquals(0, dates.parse_tz("-0000"))
+        self.assertEqual(0, dates.parse_tz("+0000"))
+        self.assertEqual(0, dates.parse_tz("-0000"))
 
     def test_parse_tz_cet(self):
-        self.assertEquals(3600, dates.parse_tz("+0100"))
+        self.assertEqual(3600, dates.parse_tz("+0100"))
 
     def test_parse_tz_odd(self):
-        self.assertEquals(1864800, dates.parse_tz("+51800"))
+        self.assertEqual(1864800, dates.parse_tz("+51800"))
diff --git a/fastimport/tests/test_filter_processor.py b/fastimport/tests/test_filter_processor.py
index 153c05f..742d15a 100644
--- a/fastimport/tests/test_filter_processor.py
+++ b/fastimport/tests/test_filter_processor.py
@@ -117,7 +117,7 @@ class TestCaseWithFiltering(TestCase):
         p = parser.ImportParser(s)
         proc.process(p.iter_commands)
         out = outf.getvalue()
-        self.assertEquals(expected, out)
+        self.assertEqual(expected, out)
 
 class TestNoFiltering(TestCaseWithFiltering):
 
diff --git a/fastimport/tests/test_parser.py b/fastimport/tests/test_parser.py
index 73bfeb0..8204346 100644
--- a/fastimport/tests/test_parser.py
+++ b/fastimport/tests/test_parser.py
@@ -304,7 +304,7 @@ class TestStringParsing(unittest.TestCase):
 
     def test_unquote(self):
         s = r'hello \"sweet\" wo\\r\tld'
-        self.assertEquals(r'hello "sweet" wo\r' + "\tld",
+        self.assertEqual(r'hello "sweet" wo\r' + "\tld",
             parser._unquote_c_string(s))
 
 
@@ -330,9 +330,9 @@ class TestTagParsing(unittest.TestCase):
             u"data 11\n"
             u"create v1.0"))
         cmds = list(p.iter_commands())
-        self.assertEquals(1, len(cmds))
+        self.assertEqual(1, len(cmds))
         self.assertTrue(isinstance(cmds[0], commands.TagCommand))
-        self.assertEquals(cmds[0].tagger,
+        self.assertEqual(cmds[0].tagger,
             ('Joe Wong', 'joe@example.com', 1234567890.0, -21600))
 
     def test_tagger_no_email_strict(self):
@@ -352,6 +352,6 @@ class TestTagParsing(unittest.TestCase):
             u"data 11\n"
             u"create v1.0"), strict=False)
         cmds = list(p.iter_commands())
-        self.assertEquals(1, len(cmds))
+        self.assertEqual(1, len(cmds))
         self.assertTrue(isinstance(cmds[0], commands.TagCommand))
-        self.assertEquals(cmds[0].tagger[:2], ('Joe Wong', None))
+        self.assertEqual(cmds[0].tagger[:2], ('Joe Wong', None))
author	Félix Mattrat <mattr.felix@gmail.com>	2016-04-15 12:06:53 +0200
committer	Félix Mattrat <mattr.felix@gmail.com>	2016-04-15 12:06:53 +0200
commit	9adaa031887b5511508fd2ee2b1f60680a788a90 (patch)
tree	f58c9bf5ef8e23f1f784226662d30482754fcf81 /fastimport
parent	ee90b2a6afeebe24da17f5545eda7f0676867cf4 (diff)
download	python-fastimport-git-9adaa031887b5511508fd2ee2b1f60680a788a90.tar.gz