From 7b67e09e06d8d9c96b9ff36934ca7427036c3a74 Mon Sep 17 00:00:00 2001 From: Adrien Di Mascio Date: Wed, 2 Mar 2011 09:13:40 +0100 Subject: [textutils] fix unormalize implementation Give NFKD decomposition a chance even if ord(c) > 2**8; this will work in a few more cases. Add an optional substitute parameter to provide a replacement char if decomposition fails --- test/unittest_textutils.py | 12 ++++++++++-- textutils.py | 20 +++++++++++++++----- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/test/unittest_textutils.py b/test/unittest_textutils.py index 75b9cbb..d72a4a1 100644 --- a/test/unittest_textutils.py +++ b/test/unittest_textutils.py @@ -228,7 +228,7 @@ class ColorizeAnsiTC(TestCase): class UnormalizeTC(TestCase): - def test_unormalize(self): + def test_unormalize_no_substitute(self): data = [(u'\u0153nologie', u'oenologie'), (u'\u0152nologie', u'OEnologie'), (u'l\xf8to', u'loto'), @@ -236,11 +236,19 @@ class UnormalizeTC(TestCase): (u'àèùéïîôêç', u'aeueiioec'), (u'ÀÈÙÉÏÎÔÊÇ', u'AEUEIIOEC'), (u'\xa0', u' '), # NO-BREAK SPACE managed by NFKD decomposition + (u'\u0154', u'R'), ] for input, output in data: yield self.assertEqual, tu.unormalize(input), output - self.assertRaises(ValueError, tu.unormalize, u"non ascii char is \u0154", + + def test_unormalize_substitute(self): + self.assertEqual(tu.unormalize(u'ab \u8000 cd', substitute='_'), + 'ab _ cd') + + def test_unormalize_backward_compat(self): + self.assertRaises(ValueError, tu.unormalize, u"\u8000", ignorenonascii=False) + self.assertEqual(tu.unormalize(u"\u8000", ignorenonascii=True), u'') class ModuleDocTest(DocTest): diff --git a/textutils.py b/textutils.py index 4e98e93..64d70d5 100644 --- a/textutils.py +++ b/textutils.py @@ -46,6 +46,7 @@ __docformat__ = "restructuredtext en" import sys import re import os.path as osp +from warnings import warn from unicodedata import normalize as _uninormalize try: from os import linesep @@ -71,7 +72,7 @@ MANUAL_UNICODE_MAP = { u'\xdf': u'ss', 
# LATIN SMALL LETTER SHARP S } -def unormalize(ustring, ignorenonascii=False): +def unormalize(ustring, ignorenonascii=None, substitute=None): """replace diacritical characters with their corresponding ascii characters Convert the unicode string to its long normalized form (unicode character @@ -79,19 +80,28 @@ def unormalize(ustring, ignorenonascii=False): The normal form KD (NFKD) will apply the compatibility decomposition, i.e. replace all compatibility characters with their equivalents. + :type substitute: str + :param substitute: replacement character to use if decomposition fails + :see: Another project about ASCII transliterations of Unicode text http://pypi.python.org/pypi/Unidecode """ + # backward compatibility, ignorenonascii was a boolean + if ignorenonascii is not None: + warn("ignorenonascii is deprecated, use substitute named parameter instead", + DeprecationWarning, stacklevel=2) + if ignorenonascii: + substitute = '' res = [] for letter in ustring[:]: try: replacement = MANUAL_UNICODE_MAP[letter] except KeyError: - if ord(letter) >= 2**8: - if ignorenonascii: - continue - raise ValueError("can't deal with non-ascii based characters") replacement = _uninormalize('NFKD', letter)[0] + if ord(replacement) >= 2 ** 7: + if substitute is None: + raise ValueError("can't deal with non-ascii based characters") + replacement = substitute res.append(replacement) return u''.join(res) -- cgit v1.2.1