diff options
author | Julien Jehannet <julien.jehannet@logilab.fr> | 2010-09-23 14:52:49 +0200 |
---|---|---|
committer | Julien Jehannet <julien.jehannet@logilab.fr> | 2010-09-23 14:52:49 +0200 |
commit | 7e8c50863b305a6cb01f715d1f8f41042d919ccf (patch) | |
tree | 95a500796ddec9b7d5c74ed9213cc967f330881e /textutils.py | |
parent | a4d541d67062fe234be812c7ee83a8f48440916b (diff) | |
download | logilab-common-7e8c50863b305a6cb01f715d1f8f41042d919ccf.tar.gz |
[textutils] use NFKD decomposition in unormalize()

Normal form KD (NFKD) applies the compatibility decomposition, i.e. it
replaces all compatibility characters with their equivalents.
Diffstat (limited to 'textutils.py')
-rw-r--r-- | textutils.py | 10 |
1 file changed, 9 insertions, 1 deletion
diff --git a/textutils.py b/textutils.py index db69d3b..cf4deb2 100644 --- a/textutils.py +++ b/textutils.py @@ -73,6 +73,14 @@ MANUAL_UNICODE_MAP = { def unormalize(ustring, ignorenonascii=False): """replace diacritical characters with their corresponding ascii characters + + Convert the unicode string to its long normalized form (unicode character + will be transform into several characters) and keep the first one only. + The normal form KD (NFKD) will apply the compatibility decomposition, i.e. + replace all compatibility characters with their equivalents. + + :see: Another project about ASCII transliterations of Unicode text + http://pypi.python.org/pypi/Unidecode """ res = [] for letter in ustring[:]: @@ -83,7 +91,7 @@ def unormalize(ustring, ignorenonascii=False): if ignorenonascii: continue raise ValueError("can't deal with non-ascii based characters") - replacement = _uninormalize('NFD', letter)[0] + replacement = _uninormalize('NFKD', letter)[0] res.append(replacement) return u''.join(res) |