summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRoman Imankulov <roman.imankulov@gmail.com>2015-10-13 11:44:18 +0000
committerRoman Imankulov <roman.imankulov@gmail.com>2015-10-13 16:26:07 +0000
commitd838e4496bb573c252253722401e4f05f55db0e6 (patch)
tree2c2e6b5cd25542b3160a20a9ac9c926f1a30dcf6
parent72e95faf46665753247796df7403c3b49bfe092d (diff)
downloadmako-d838e4496bb573c252253722401e4f05f55db0e6.tar.gz
Ensure babel i18n extractor works properly with non-ascii input
If mako templates contain something like "_('Köln')", babel extractor converts it to pure ASCII so that the resulting .po file would contain "K\xf6ln". Not all translation tools and translators are ready for such escape sequences. Babel allows message ids to be non-ascii; the plugin just has to return Unicode objects instead of ASCII strings (and that's exactly how Babel's built-in Python and JavaScript extractors work). This fix ensures the mako extractor doesn't escape non-ascii symbols, works well both for Unicode and non-Unicode input (there is a test for cp1251 encoding), and also provides a workaround for the babel charset detector python-babel/babel#274.
-rw-r--r--mako/ext/extract.py12
-rw-r--r--test/ext/test_babelplugin.py13
-rw-r--r--test/templates/gettext_cp1251.mako1
-rw-r--r--test/templates/gettext_utf8.mako1
4 files changed, 24 insertions, 3 deletions
diff --git a/mako/ext/extract.py b/mako/ext/extract.py
index 313c088..8dd2e96 100644
--- a/mako/ext/extract.py
+++ b/mako/ext/extract.py
@@ -16,6 +16,7 @@ class MessageExtractor(object):
def extract_nodes(self, nodes):
translator_comments = []
in_translator_comments = False
+ input_encoding = self.config['encoding'] or 'ascii'
comment_tags = list(
filter(None, re.split(r'\s+', self.config['comment-tags'])))
@@ -76,13 +77,18 @@ class MessageExtractor(object):
comment[1] for comment in translator_comments]
if isinstance(code, compat.text_type):
- code = code.encode('ascii', 'backslashreplace')
+ code = code.encode(input_encoding, 'backslashreplace')
used_translator_comments = False
- code = compat.byte_buffer(code)
+ # We add extra newline to work around a pybabel bug
+ # (see python-babel/babel#274, parse_encoding dies if the first
+ # input string of the input is non-ascii)
+ # Also, because we added it, we have to subtract one from
+ # node.lineno
+ code = compat.byte_buffer(compat.b('\n') + code)
for message in self.process_python(
- code, node.lineno, translator_strings):
+ code, node.lineno - 1, translator_strings):
yield message
used_translator_comments = True
diff --git a/test/ext/test_babelplugin.py b/test/ext/test_babelplugin.py
index 3789b58..abce70a 100644
--- a/test/ext/test_babelplugin.py
+++ b/test/ext/test_babelplugin.py
@@ -78,3 +78,16 @@ class ExtractMakoTestCase(TemplateTest):
(99, '_', 'No action at a distance.', []),
]
self.assertEqual(expected, messages)
+
+ @skip()
+ def test_extract_utf8(self):
+ mako_tmpl = open(os.path.join(template_base, 'gettext_utf8.mako'), 'rb')
+ message = next(extract(mako_tmpl, {'_', None}, [], {'encoding': 'utf-8'}))
+ assert message == (1, '_', u'K\xf6ln', [])
+
+ @skip()
+ def test_extract_cp1251(self):
+ mako_tmpl = open(os.path.join(template_base, 'gettext_cp1251.mako'), 'rb')
+ message = next(extract(mako_tmpl, {'_', None}, [], {'encoding': 'cp1251'}))
+    # "test" in Russian. File encoding is cp1251 (aka "windows-1251")
+ assert message == (1, '_', u'\u0442\u0435\u0441\u0442', [])
diff --git a/test/templates/gettext_cp1251.mako b/test/templates/gettext_cp1251.mako
new file mode 100644
index 0000000..9341d93
--- /dev/null
+++ b/test/templates/gettext_cp1251.mako
@@ -0,0 +1 @@
+${_("òåñò")}
diff --git a/test/templates/gettext_utf8.mako b/test/templates/gettext_utf8.mako
new file mode 100644
index 0000000..761f946
--- /dev/null
+++ b/test/templates/gettext_utf8.mako
@@ -0,0 +1 @@
+${_("Köln")}