summary | refs | log | tree | commit | diff
path: root/textutils.py
diff options
context:
space:
mode:
author: Sylvain <syt@logilab.fr> 2007-09-28 16:09:49 +0200
committer: Sylvain <syt@logilab.fr> 2007-09-28 16:09:49 +0200
commit: 1f4776d70ecd6ac849e8e13c7ee9d8724d6e8787 (patch)
tree: 0b805b41ac8a13ce87760846ad22b7fb5a9ec610 /textutils.py
parent: a7bbed8e220645e7713f770c1fd9d3da51f14376 (diff)
download: logilab-common-1f4776d70ecd6ac849e8e13c7ee9d8724d6e8787.tar.gz
backport changes to unormalize from ginco
Diffstat (limited to 'textutils.py')
-rw-r--r-- textutils.py 35
1 files changed, 27 insertions, 8 deletions
diff --git a/textutils.py b/textutils.py
index 5af9ed7..bc22ad5 100644
--- a/textutils.py
+++ b/textutils.py
@@ -51,18 +51,37 @@ import re
from unicodedata import normalize as _uninormalize
from os import linesep
-def unormalize(ustring, killchars='', ignorenonascii=False):
+
+MANUAL_UNICODE_MAP = {
+ u'\xa1': u'!', # INVERTED EXCLAMATION MARK
+ u'\u0142': u'l', # LATIN SMALL LETTER L WITH STROKE
+ u'\u2044': u'/', # FRACTION SLASH
+ u'\xc6': u'AE', # LATIN CAPITAL LETTER AE
+ u'\xa9': u'(c)', # COPYRIGHT SIGN
+ u'\xab': u'"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
+ u'\xe6': u'ae', # LATIN SMALL LETTER AE
+ u'\xae': u'(r)', # REGISTERED SIGN
+ u'\u0153': u'oe', # LATIN SMALL LIGATURE OE
+ u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE
+ u'\xd8': u'O', # LATIN CAPITAL LETTER O WITH STROKE
+ u'\xf8': u'o', # LATIN SMALL LETTER O WITH STROKE
+ u'\xbb': u'"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+ u'\xdf': u'ss', # LATIN SMALL LETTER SHARP S
+ }
+
+def unormalize(ustring, ignorenonascii=False):
"""replace diacritical characters with their corresponding ascii characters
"""
res = []
for letter in ustring[:]:
- if ord(letter) >= 2**8:
- if ignorenonascii:
- continue
- raise ValueError("can't deal with non-ascii based characters")
- replacement = _uninormalize('NFD', letter)[0]
- if replacement in killchars:
- continue
+ try:
+ replacement = MANUAL_UNICODE_MAP[letter]
+ except KeyError:
+ if ord(letter) >= 2**8:
+ if ignorenonascii:
+ continue
+ raise ValueError("can't deal with non-ascii based characters")
+ replacement = _uninormalize('NFD', letter)[0]
res.append(replacement)
return u''.join(res)