summaryrefslogtreecommitdiff
path: root/textutils.py
diff options
context:
space:
mode:
author Julien Jehannet <julien.jehannet@logilab.fr> 2010-09-23 14:52:49 +0200
committer Julien Jehannet <julien.jehannet@logilab.fr> 2010-09-23 14:52:49 +0200
commit 7e8c50863b305a6cb01f715d1f8f41042d919ccf (patch)
tree 95a500796ddec9b7d5c74ed9213cc967f330881e /textutils.py
parent a4d541d67062fe234be812c7ee83a8f48440916b (diff)
download logilab-common-7e8c50863b305a6cb01f715d1f8f41042d919ccf.tar.gz
[textutils] use NFKD decomposition in unormalize()
The normal form KD (NFKD) will apply the compatibility decomposition, i.e. replace all compatibility characters with their equivalents.
Diffstat (limited to 'textutils.py')
-rw-r--r-- textutils.py 10
1 files changed, 9 insertions, 1 deletions
diff --git a/textutils.py b/textutils.py
index db69d3b..cf4deb2 100644
--- a/textutils.py
+++ b/textutils.py
@@ -73,6 +73,14 @@ MANUAL_UNICODE_MAP = {
def unormalize(ustring, ignorenonascii=False):
"""replace diacritical characters with their corresponding ascii characters
+
Convert the unicode string to its long normalized form (a unicode character
may be transformed into several characters) and keep only the first one.
+ The normal form KD (NFKD) will apply the compatibility decomposition, i.e.
+ replace all compatibility characters with their equivalents.
+
+ :see: Another project about ASCII transliterations of Unicode text
+ http://pypi.python.org/pypi/Unidecode
"""
res = []
for letter in ustring[:]:
@@ -83,7 +91,7 @@ def unormalize(ustring, ignorenonascii=False):
if ignorenonascii:
continue
raise ValueError("can't deal with non-ascii based characters")
- replacement = _uninormalize('NFD', letter)[0]
+ replacement = _uninormalize('NFKD', letter)[0]
res.append(replacement)
return u''.join(res)