diff options
author | Seth M Morton <seth.m.morton@gmail.com> | 2016-05-05 22:08:46 -0700 |
---|---|---|
committer | Seth M Morton <seth.m.morton@gmail.com> | 2016-05-05 22:08:46 -0700 |
commit | d0ec398f30351516d46cdf3e4ba1d30a3bb55046 (patch) | |
tree | 915666884172f5d1c7401cdd87b45c767efc90f2 | |
parent | 3df99572f21d82975ea4dff60ad2695b9c0e1504 (diff) | |
download | natsort-d0ec398f30351516d46cdf3e4ba1d30a3bb55046.tar.gz |
Added unit tests for LOCALENUM functionality.
LOCALENUM is also now dependent on FLOAT.
Additionally, it was found that the broken locale problem extends
to the thousands separator character. A lookup table has been
implemented for locales with known problems.
This closes issue #36.
-rw-r--r-- | natsort/compat/locale.py | 59 | ||||
-rw-r--r-- | natsort/utils.py | 28 | ||||
-rw-r--r-- | test_natsort/test_natsorted.py | 3 | ||||
-rw-r--r-- | test_natsort/test_pre_split_function.py | 94 |
4 files changed, 167 insertions, 17 deletions
diff --git a/natsort/compat/locale.py b/natsort/compat/locale.py index e199aaf..a2371ef 100644 --- a/natsort/compat/locale.py +++ b/natsort/compat/locale.py @@ -18,6 +18,9 @@ try: null_string = b'' + def dumb_sort(): + return False + # If using PyICU, get the locale from the current global locale, def get_icu_locale(): try: @@ -36,9 +39,6 @@ try: sep = PyICU.DecimalFormatSymbols.kDecimalSeparatorSymbol return PyICU.DecimalFormatSymbols(get_icu_locale()).getSymbol(sep) - def dumb_sort(): - return False - except ImportError: import locale if PY_VERSION < 3: @@ -49,16 +49,57 @@ except ImportError: from locale import strxfrm null_string = '' + # On some systems, locale is broken and does not sort in the expected + # order. We will try to detect this and compensate. + def dumb_sort(): + return strxfrm('A') < strxfrm('a') + def get_strxfrm(): return strxfrm def get_thousands_sep(): - return locale.localeconv()['thousands_sep'] + sep = locale.localeconv()['thousands_sep'] + # If this locale library is broken, some of the thousands separator + # characters are incorrectly blank. Here is a lookup table of the + # corrections I am aware of. 
+ if dumb_sort(): + loc = '.'.join(locale.getlocale()) + return {'de_DE.ISO8859-15': '.', + 'es_ES.ISO8859-1': '.', + 'de_AT.ISO8859-1': '.', + 'de_at': '\xa0', + 'nl_NL.UTF-8': '.', + 'es_es': '.', + 'fr_CH.ISO8859-15': '\xa0', + 'fr_CA.ISO8859-1': '\xa0', + 'de_CH.ISO8859-1': '.', + 'fr_FR.ISO8859-15': '\xa0', + 'nl_NL.ISO8859-1': '.', + 'ca_ES.UTF-8': '.', + 'nl_NL.ISO8859-15': '.', + 'de_ch': "'", + 'ca_es': '.', + 'de_AT.ISO8859-15': '.', + 'ca_ES.ISO8859-1': '.', + 'de_AT.UTF-8': '.', + 'es_ES.UTF-8': '.', + 'fr_fr': '\xa0', + 'es_ES.ISO8859-15': '.', + 'de_DE.ISO8859-1': '.', + 'nl_nl': '.', + 'fr_ch': '\xa0', + 'fr_ca': '\xa0', + 'de_DE.UTF-8': '.', + 'ca_ES.ISO8859-15': '.', + 'de_CH.ISO8859-15': '.', + 'fr_FR.ISO8859-1': '\xa0', + 'fr_CH.ISO8859-1': '\xa0', + 'de_de': '.', + 'fr_FR.UTF-8': '\xa0', + 'fr_CA.ISO8859-15': '\xa0', + }.get(loc, sep) + else: + return sep def get_decimal_point(): return locale.localeconv()['decimal_point'] - - # On some systems, locale is broken and does not sort in the expected - # order. We will try to detect this and compensate. - def dumb_sort(): - return strxfrm('A') < strxfrm('a') diff --git a/natsort/utils.py b/natsort/utils.py index 23f83b2..2bf1c5d 100644 --- a/natsort/utils.py +++ b/natsort/utils.py @@ -228,17 +228,33 @@ def _pre_split_function(alg): if alg & ns.LOCALENUM: # Create a regular expression that will remove thousands seprarators. 
- thousands = get_thousands_sep() - strip_thousands = (r'(?<![0-9]{{4}})(?<=[0-9]{{1}})' - r'{thousands}(?=[0-9]{{3}}([^0-9]|$))') - strip_thousands = strip_thousands.format(thousands=thousands) - strip_thousands = re.compile(strip_thousands) + strip_thousands = r''' + (?<=[0-9]{{1}}) # At least 1 number + (?<![0-9]{{4}}) # No more than 3 numbers + {nodecimal} # Cannot follow decimal + {thou} # The thousands separator + (?=[0-9]{{3}} # Three numbers must follow + ([^0-9]|$) # But a non-number after that + ) + ''' + nodecimal = r'' + if alg & ns.FLOAT: + # Make a regular expression component that will ensure no + # separators are removed after a decimal point. + d = get_decimal_point() + d = r'\.' if d == r'.' else d + nodecimal += r'(?<!' + d + r'[0-9])' + nodecimal += r'(?<!' + d + r'[0-9]{2})' + nodecimal += r'(?<!' + d + r'[0-9]{3})' + strip_thousands = strip_thousands.format(thou=get_thousands_sep(), + nodecimal=nodecimal) + strip_thousands = re.compile(strip_thousands, flags=re.VERBOSE) function_chain.append(partial(strip_thousands.sub, '')) # Create a regular expression that will change the decimal point to # a period if not already a period. 
 decimal = get_decimal_point() - if decimal != '.': + if alg & ns.FLOAT and decimal != '.': switch_decimal = r'(?<=[0-9]){decimal}|{decimal}(?=[0-9])' switch_decimal = switch_decimal.format(decimal=decimal) switch_decimal = re.compile(switch_decimal) diff --git a/test_natsort/test_natsorted.py b/test_natsort/test_natsorted.py index 8802825..146997a 100644 --- a/test_natsort/test_natsorted.py +++ b/test_natsort/test_natsorted.py @@ -13,7 +13,6 @@ from natsort import ( natsorted, ns, ) -from natsort.compat.locale import dumb_sort from compat.locale import ( load_locale, has_locale_de_DE, @@ -230,7 +229,7 @@ def test_natsorted_with_LOCALE_and_en_setting_returns_results_sorted_by_en_langu locale.setlocale(locale.LC_ALL, str('')) -@pytest.mark.skipif(not has_locale_de_DE or dumb_sort(), reason='requires de_DE locale and working locale') +@pytest.mark.skipif(not has_locale_de_DE, reason='requires de_DE locale and working locale') def test_natsorted_with_LOCALE_and_de_setting_returns_results_sorted_by_de_language(): load_locale('de_DE') a = ['c', 'a5.467,86', 'ä', 'b', 'a5367.86', 'a5,6', 'a5,50'] diff --git a/test_natsort/test_pre_split_function.py b/test_natsort/test_pre_split_function.py index 58cc108..c9db671 100644 --- a/test_natsort/test_pre_split_function.py +++ b/test_natsort/test_pre_split_function.py @@ -3,12 +3,20 @@ from __future__ import unicode_literals import pytest +import locale +from operator import methodcaller from natsort.ns_enum import ns from natsort.utils import _pre_split_function from natsort.compat.py23 import NEWPY +from compat.locale import ( + load_locale, + has_locale_de_DE, +) from compat.hypothesis import ( given, text, + integers, + lists, use_hypothesis, ) @@ -94,3 +102,89 @@ def test_pre_split_function_performs_swapcase_and_casefold_both_LOWERCASEFIRST_A assert _pre_split_function(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().casefold() else: assert _pre_split_function(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == 
x.swapcase().lower() + + +def test_pre_split_function_removes_thousands_separator_with_LOCALE_example(): + load_locale('en_US') + x = '12,543,642,642.534,534,980' # Without FLOAT it does not account for decimal. + assert _pre_split_function(ns.LOCALE)(x) == '12543642642.534534980' + x = '12,543,642,642.534,534,980' # LOCALEALPHA doesn't do anything... need LOCALENUM + assert _pre_split_function(ns.LOCALEALPHA)(x) == '12,543,642,642.534,534,980' + locale.setlocale(locale.LC_ALL, str('')) + + +@pytest.mark.skipif(not use_hypothesis, reason='requires python2.7 or greater') +@given(lists(elements=integers(), min_size=4, max_size=20)) +def test_pre_split_function_removes_thousands_separator_with_LOCALE(x): + load_locale('en_US') + t = ''.join(map(methodcaller('rstrip', 'lL'), map(str, map(abs, x)))) # Remove negative signs trailing L + s = '' + for i, y in enumerate(reversed(t), 1): + s = y + s + if i % 3 == 0 and i != len(t): + s = ',' + s + assert _pre_split_function(ns.LOCALE)(s) == t + locale.setlocale(locale.LC_ALL, str('')) + + +def test_pre_split_function_removes_thousands_separator_and_is_float_aware_with_LOCALE_and_FLOAT_example(): + x = '12,543,642,642.534,534,980' + assert _pre_split_function(ns.LOCALE | ns.FLOAT)(x) == '12543642642.534,534980' + + +@pytest.mark.skipif(not use_hypothesis, reason='requires python2.7 or greater') +@given(lists(elements=integers(), min_size=4, max_size=20), lists(elements=integers(), min_size=4, max_size=20)) +def test_pre_split_function_removes_thousands_separator_and_is_float_aware_with_LOCALE_and_FLOAT(x, y): + load_locale('en_US') + t = ''.join(map(methodcaller('rstrip', 'lL'), map(str, map(abs, x)))) # Remove negative signs trailing L + s = '' + for i, z in enumerate(reversed(t), 1): + s = z + s + if i % 3 == 0 and i != len(t): + s = ',' + s + u = ''.join(map(methodcaller('rstrip', 'lL'), map(str, map(abs, y)))) # Remove negative signs trailing L + v = '' + for i, z in enumerate(reversed(u), 1): + v = z + v + if i % 3 == 0 
and i != len(u): + v = ',' + v + # Remove all but first comma. + a = v.split(',', 1) + p = a[0] + ',' + a[1].replace(',', '') + assert _pre_split_function(ns.LOCALE)('.'.join([s, v])) == '.'.join([t, u]) + assert _pre_split_function(ns.LOCALE | ns.FLOAT)('.'.join([s, v])) == '.'.join([t, p]) + locale.setlocale(locale.LC_ALL, str('')) + + +# These might be too much to test with hypothesis. + + +def test_pre_split_function_leaves_invalid_thousands_separator_with_LOCALE_example(): + load_locale('en_US') + x = '12,543,642642.5345,34980' + assert _pre_split_function(ns.LOCALE)(x) == '12543,642642.5345,34980' + x = '12,59443,642,642.53,4534980' + assert _pre_split_function(ns.LOCALE)(x) == '12,59443,642642.53,4534980' + x = '12543,642,642.5,34534980' + assert _pre_split_function(ns.LOCALE)(x) == '12543,642642.5,34534980' + locale.setlocale(locale.LC_ALL, str('')) + + +# @pytest.mark.skipif(not has_locale_de_DE or dumb_sort(), reason='requires de_DE locale and working locale') +@pytest.mark.skipif(not has_locale_de_DE, reason='requires de_DE locale and working locale') +def test_pre_split_function_replaces_decimal_separator_with_LOCALE_example(): + load_locale('de_DE') + x = '1543,753' + assert _pre_split_function(ns.LOCALE)(x) == '1543,753' # Does nothing without FLOAT + assert _pre_split_function(ns.LOCALE | ns.FLOAT)(x) == '1543.753' + assert _pre_split_function(ns.LOCALEALPHA)(x) == '1543,753' # LOCALEALPHA doesn't do anything... need LOCALENUM + locale.setlocale(locale.LC_ALL, str('')) + + +# @pytest.mark.skipif(not has_locale_de_DE or dumb_sort(), reason='requires de_DE locale and working locale') +@pytest.mark.skipif(not has_locale_de_DE, reason='requires de_DE locale and working locale') +def test_pre_split_function_does_not_replace_invalid_decimal_separator_with_LOCALE_example(): + load_locale('de_DE') + x = '154s,t53' + assert _pre_split_function(ns.LOCALE | ns.FLOAT)(x) == '154s,t53' + locale.setlocale(locale.LC_ALL, str('')) |