summary | refs | log | tree | commit | diff
path: root/textutils.py
diff options
context:
space:
mode:
authorAdrien Di Mascio <Adrien.DiMascio@logilab.fr>2011-03-02 09:13:40 +0100
committerAdrien Di Mascio <Adrien.DiMascio@logilab.fr>2011-03-02 09:13:40 +0100
commit7b67e09e06d8d9c96b9ff36934ca7427036c3a74 (patch)
treef8b1454445ccba94629c6297b2e0a60483b6b408 /textutils.py
parent4484aa1875d2cddb9afea97bd4d7069aec066825 (diff)
downloadlogilab-common-7b67e09e06d8d9c96b9ff36934ca7427036c3a74.tar.gz
[textutils] fix unormalize implementation
Give NFKD decomposition a chance even when ord(c) >= 2**8; this handles a few more cases. Add an optional `substitute` parameter that provides a replacement character when decomposition fails.
Diffstat (limited to 'textutils.py')
-rw-r--r--textutils.py20
1 files changed, 15 insertions, 5 deletions
diff --git a/textutils.py b/textutils.py
index 4e98e93..64d70d5 100644
--- a/textutils.py
+++ b/textutils.py
@@ -46,6 +46,7 @@ __docformat__ = "restructuredtext en"
import sys
import re
import os.path as osp
+from warnings import warn
from unicodedata import normalize as _uninormalize
try:
from os import linesep
@@ -71,7 +72,7 @@ MANUAL_UNICODE_MAP = {
u'\xdf': u'ss', # LATIN SMALL LETTER SHARP S
}
-def unormalize(ustring, ignorenonascii=False):
+def unormalize(ustring, ignorenonascii=None, substitute=None):
"""replace diacritical characters with their corresponding ascii characters
Convert the unicode string to its long normalized form (unicode character
@@ -79,19 +80,28 @@ def unormalize(ustring, ignorenonascii=False):
The normal form KD (NFKD) will apply the compatibility decomposition, i.e.
replace all compatibility characters with their equivalents.
+ :type substitute: str
+ :param substitute: replacement character to use if decomposition fails
+
:see: Another project about ASCII transliterations of Unicode text
http://pypi.python.org/pypi/Unidecode
"""
+ # backward compatibility, ignorenonascii was a boolean
+ if ignorenonascii is not None:
+ warn("ignorenonascii is deprecated, use substitute named parameter instead",
+ DeprecationWarning, stacklevel=2)
+ if ignorenonascii:
+ substitute = ''
res = []
for letter in ustring[:]:
try:
replacement = MANUAL_UNICODE_MAP[letter]
except KeyError:
- if ord(letter) >= 2**8:
- if ignorenonascii:
- continue
- raise ValueError("can't deal with non-ascii based characters")
replacement = _uninormalize('NFKD', letter)[0]
+ if ord(replacement) >= 2 ** 7:
+ if substitute is None:
+ raise ValueError("can't deal with non-ascii based characters")
+ replacement = substitute
res.append(replacement)
return u''.join(res)