diff options
author | Sylvain <syt@logilab.fr> | 2007-09-28 16:09:49 +0200 |
---|---|---|
committer | Sylvain <syt@logilab.fr> | 2007-09-28 16:09:49 +0200 |
commit | 1f4776d70ecd6ac849e8e13c7ee9d8724d6e8787 (patch) | |
tree | 0b805b41ac8a13ce87760846ad22b7fb5a9ec610 | |
parent | a7bbed8e220645e7713f770c1fd9d3da51f14376 (diff) | |
download | logilab-common-1f4776d70ecd6ac849e8e13c7ee9d8724d6e8787.tar.gz |
backport changes to unormalize from ginco
-rw-r--r-- | debian/changelog | 2 | ||||
-rw-r--r-- | test/unittest_textutils.py | 11 | ||||
-rw-r--r-- | textutils.py | 35 |
3 files changed, 39 insertions, 9 deletions
diff --git a/debian/changelog b/debian/changelog
index fb01c5f..a346054 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,4 +1,4 @@
-logilab-common (0.22.1-2) unstable; urgency=low
+logilab-common (0.22.2-1) unstable; urgency=low
 
   * new upstream release
 
diff --git a/test/unittest_textutils.py b/test/unittest_textutils.py
index 66df390..2043e84 100644
--- a/test/unittest_textutils.py
+++ b/test/unittest_textutils.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """
 unit tests for module textutils
 squeleton generated by /home/syt/cvs_work/logilab/pyreverse/py2tests.py on Sep 08 at 09:1:31
@@ -130,6 +131,16 @@ class ColorizeAnsiTC(TestCase):
         self.assertRaises(KeyError, tu.colorize_ansi, 'hello', None, 'italique')
 
 
+class UnormalizeTC(TestCase):
+    def test_unormalize(self):
+        data = [(u'\u0153nologie', u'oenologie'),
+                (u'\u0152nologie', u'OEnologie'),
+                (u'l\xf8to', u'loto'),
+                (u'été', u'ete'),
+                ]
+        for input, output in data:
+            yield self.assertEquals, tu.unormalize(input), output
+
 class ModuleDocTest(DocTest):
     """test doc test in this module"""
     module = tu
diff --git a/textutils.py b/textutils.py
index 5af9ed7..bc22ad5 100644
--- a/textutils.py
+++ b/textutils.py
@@ -51,18 +51,37 @@ import re
 from unicodedata import normalize as _uninormalize
 from os import linesep
 
-def unormalize(ustring, killchars='', ignorenonascii=False):
+
+MANUAL_UNICODE_MAP = {
+    u'\xa1': u'!',    # INVERTED EXCLAMATION MARK
+    u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE
+    u'\u2044': u'/',  # FRACTION SLASH
+    u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE
+    u'\xa9': u'(c)',  # COPYRIGHT SIGN
+    u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
+    u'\xe6': u'ae',   # LATIN SMALL LETTER AE
+    u'\xae': u'(r)',  # REGISTERED SIGN
+    u'\u0153': u'oe', # LATIN SMALL LIGATURE OE
+    u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE
+    u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE
+    u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE
+    u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+    u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S
+    }
+
+def unormalize(ustring, ignorenonascii=False):
     """replace diacritical characters with their corresponding ascii characters
     """
     res = []
     for letter in ustring[:]:
-        if ord(letter) >= 2**8:
-            if ignorenonascii:
-                continue
-            raise ValueError("can't deal with non-ascii based characters")
-        replacement = _uninormalize('NFD', letter)[0]
-        if replacement in killchars:
-            continue
+        try:
+            replacement = MANUAL_UNICODE_MAP[letter]
+        except KeyError:
+            if ord(letter) >= 2**8:
+                if ignorenonascii:
+                    continue
+                raise ValueError("can't deal with non-ascii based characters")
+            replacement = _uninormalize('NFD', letter)[0]
         res.append(replacement)
     return u''.join(res)
 