Added unit tests for LOCALENUM functionality.

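LOCALENUM is also now dependent on FLOAT: the decimal-point handling (leaving a separator that directly follows the decimal point in place, and converting a non-period decimal point to a period) is applied only when FLOAT is also set. Additionally, it was found that the broken locale problem extends to the thousands separator character. A lookup table has been implemented for locales with known problems. This closes issue #36.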
author: Seth M Morton <seth.m.morton@gmail.com> 2016-05-05 22:08:46 -0700
committer: Seth M Morton <seth.m.morton@gmail.com> 2016-05-05 22:08:46 -0700
commit: d0ec398f30351516d46cdf3e4ba1d30a3bb55046 (patch)
tree: 915666884172f5d1c7401cdd87b45c767efc90f2
parent: 3df99572f21d82975ea4dff60ad2695b9c0e1504 (diff)
download: natsort-d0ec398f30351516d46cdf3e4ba1d30a3bb55046.tar.gz
4 files changed, 167 insertions, 17 deletions
diff --git a/natsort/compat/locale.py b/natsort/compat/locale.py
index e199aaf..a2371ef 100644
--- a/natsort/compat/locale.py
+++ b/natsort/compat/locale.py
@@ -18,6 +18,9 @@ try:
 
     null_string = b''
 
+    def dumb_sort():
+        return False
+
     # If using PyICU, get the locale from the current global locale,
     def get_icu_locale():
         try:
@@ -36,9 +39,6 @@ try:
         sep = PyICU.DecimalFormatSymbols.kDecimalSeparatorSymbol
         return PyICU.DecimalFormatSymbols(get_icu_locale()).getSymbol(sep)
 
-    def dumb_sort():
-        return False
-
 except ImportError:
     import locale
     if PY_VERSION < 3:
@@ -49,16 +49,57 @@ except ImportError:
         from locale import strxfrm
         null_string = ''
 
+    # On some systems, locale is broken and does not sort in the expected
+    # order. We will try to detect this and compensate.
+    def dumb_sort():
+        return strxfrm('A') < strxfrm('a')
+
     def get_strxfrm():
         return strxfrm
 
     def get_thousands_sep():
-        return locale.localeconv()['thousands_sep']
+        sep = locale.localeconv()['thousands_sep']
+        # If this locale library is broken, some of the thousands separator
+        # characters are incorrectly blank. Here is a lookup table of the
+        # corrections I am aware of.
+        if dumb_sort():
+            loc = '.'.join(locale.getlocale())
+            return {'de_DE.ISO8859-15': '.',
+                    'es_ES.ISO8859-1': '.',
+                    'de_AT.ISO8859-1': '.',
+                    'de_at': '\xa0',
+                    'nl_NL.UTF-8': '.',
+                    'es_es': '.',
+                    'fr_CH.ISO8859-15': '\xa0',
+                    'fr_CA.ISO8859-1': '\xa0',
+                    'de_CH.ISO8859-1': '.',
+                    'fr_FR.ISO8859-15': '\xa0',
+                    'nl_NL.ISO8859-1': '.',
+                    'ca_ES.UTF-8': '.',
+                    'nl_NL.ISO8859-15': '.',
+                    'de_ch': "'",
+                    'ca_es': '.',
+                    'de_AT.ISO8859-15': '.',
+                    'ca_ES.ISO8859-1': '.',
+                    'de_AT.UTF-8': '.',
+                    'es_ES.UTF-8': '.',
+                    'fr_fr': '\xa0',
+                    'es_ES.ISO8859-15': '.',
+                    'de_DE.ISO8859-1': '.',
+                    'nl_nl': '.',
+                    'fr_ch': '\xa0',
+                    'fr_ca': '\xa0',
+                    'de_DE.UTF-8': '.',
+                    'ca_ES.ISO8859-15': '.',
+                    'de_CH.ISO8859-15': '.',
+                    'fr_FR.ISO8859-1': '\xa0',
+                    'fr_CH.ISO8859-1': '\xa0',
+                    'de_de': '.',
+                    'fr_FR.UTF-8': '\xa0',
+                    'fr_CA.ISO8859-15': '\xa0',
+                    }.get(loc, sep)
+        else:
+            return sep
 
     def get_decimal_point():
         return locale.localeconv()['decimal_point']
-
-    # On some systems, locale is broken and does not sort in the expected
-    # order. We will try to detect this and compensate.
-    def dumb_sort():
-        return strxfrm('A') < strxfrm('a')
diff --git a/natsort/utils.py b/natsort/utils.py
index 23f83b2..2bf1c5d 100644
--- a/natsort/utils.py
+++ b/natsort/utils.py
@@ -228,17 +228,33 @@ def _pre_split_function(alg):
 
     if alg & ns.LOCALENUM:
         # Create a regular expression that will remove thousands seprarators.
-        thousands = get_thousands_sep()
-        strip_thousands = (r'(?<![0-9]{{4}})(?<=[0-9]{{1}})'
-                           r'{thousands}(?=[0-9]{{3}}([^0-9]|$))')
-        strip_thousands = strip_thousands.format(thousands=thousands)
-        strip_thousands = re.compile(strip_thousands)
+        strip_thousands = r'''
+            (?<=[0-9]{{1}})  # At least 1 number
+            (?<![0-9]{{4}})  # No more than 3 numbers
+            {nodecimal}      # Cannot follow decimal
+            {thou}           # The thousands separator
+            (?=[0-9]{{3}}    # Three numbers must follow
+             ([^0-9]|$)      # But a non-number after that
+            )
+        '''
+        nodecimal = r''
+        if alg & ns.FLOAT:
+            # Make a regular expression component that will ensure no
+            # separators are removed after a decimal point.
+            d = get_decimal_point()
+            d = r'\.' if d == r'.' else d
+            nodecimal += r'(?<!' + d + r'[0-9])'
+            nodecimal += r'(?<!' + d + r'[0-9]{2})'
+            nodecimal += r'(?<!' + d + r'[0-9]{3})'
+        strip_thousands = strip_thousands.format(thou=get_thousands_sep(),
+                                                 nodecimal=nodecimal)
+        strip_thousands = re.compile(strip_thousands, flags=re.VERBOSE)
         function_chain.append(partial(strip_thousands.sub, ''))
 
         # Create a regular expression that will change the decimal point to
         # a period if not already a period.
         decimal = get_decimal_point()
-        if decimal != '.':
+        if alg & ns.FLOAT and decimal != '.':
             switch_decimal = r'(?<=[0-9]){decimal}|{decimal}(?=[0-9])'
             switch_decimal = switch_decimal.format(decimal=decimal)
             switch_decimal = re.compile(switch_decimal)
diff --git a/test_natsort/test_natsorted.py b/test_natsort/test_natsorted.py
index 8802825..146997a 100644
--- a/test_natsort/test_natsorted.py
+++ b/test_natsort/test_natsorted.py
@@ -13,7 +13,6 @@ from natsort import (
     natsorted,
     ns,
 )
-from natsort.compat.locale import dumb_sort
 from compat.locale import (
     load_locale,
     has_locale_de_DE,
@@ -230,7 +229,7 @@ def test_natsorted_with_LOCALE_and_en_setting_returns_results_sorted_by_en_langu
     locale.setlocale(locale.LC_ALL, str(''))
 
 
-@pytest.mark.skipif(not has_locale_de_DE or dumb_sort(), reason='requires de_DE locale and working locale')
+@pytest.mark.skipif(not has_locale_de_DE, reason='requires de_DE locale and working locale')
 def test_natsorted_with_LOCALE_and_de_setting_returns_results_sorted_by_de_language():
     load_locale('de_DE')
     a = ['c', 'a5.467,86', 'ä', 'b', 'a5367.86', 'a5,6', 'a5,50']
diff --git a/test_natsort/test_pre_split_function.py b/test_natsort/test_pre_split_function.py
index 58cc108..c9db671 100644
--- a/test_natsort/test_pre_split_function.py
+++ b/test_natsort/test_pre_split_function.py
@@ -3,12 +3,20 @@
 from __future__ import unicode_literals
 
 import pytest
+import locale
+from operator import methodcaller
 from natsort.ns_enum import ns
 from natsort.utils import _pre_split_function
 from natsort.compat.py23 import NEWPY
+from compat.locale import (
+    load_locale,
+    has_locale_de_DE,
+)
 from compat.hypothesis import (
     given,
     text,
+    integers,
+    lists,
     use_hypothesis,
 )
 
@@ -94,3 +102,89 @@ def test_pre_split_function_performs_swapcase_and_casefold_both_LOWERCASEFIRST_A
         assert _pre_split_function(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().casefold()
     else:
         assert _pre_split_function(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().lower()
+
+
+def test_pre_split_function_removes_thousands_separator_with_LOCALE_example():
+    load_locale('en_US')
+    x = '12,543,642,642.534,534,980'  # Without FLOAT it does not account for decimal.
+    assert _pre_split_function(ns.LOCALE)(x) == '12543642642.534534980'
+    x = '12,543,642,642.534,534,980'  # LOCALEALPHA doesn't do anything... need LOCALENUM
+    assert _pre_split_function(ns.LOCALEALPHA)(x) == '12,543,642,642.534,534,980'
+    locale.setlocale(locale.LC_ALL, str(''))
+
+
+@pytest.mark.skipif(not use_hypothesis, reason='requires python2.7 or greater')
+@given(lists(elements=integers(), min_size=4, max_size=20))
+def test_pre_split_function_removes_thousands_separator_with_LOCALE(x):
+    load_locale('en_US')
+    t = ''.join(map(methodcaller('rstrip', 'lL'), map(str, map(abs, x))))  # Remove negative signs and trailing L
+    s = ''
+    for i, y in enumerate(reversed(t), 1):
+        s = y + s
+        if i % 3 == 0 and i != len(t):
+            s = ',' + s
+    assert _pre_split_function(ns.LOCALE)(s) == t
+    locale.setlocale(locale.LC_ALL, str(''))
+
+
+def test_pre_split_function_removes_thousands_separator_and_is_float_aware_with_LOCALE_and_FLOAT_example():
+    x = '12,543,642,642.534,534,980'
+    assert _pre_split_function(ns.LOCALE | ns.FLOAT)(x) == '12543642642.534,534980'
+
+
+@pytest.mark.skipif(not use_hypothesis, reason='requires python2.7 or greater')
+@given(lists(elements=integers(), min_size=4, max_size=20), lists(elements=integers(), min_size=4, max_size=20))
+def test_pre_split_function_removes_thousands_separator_and_is_float_aware_with_LOCALE_and_FLOAT(x, y):
+    load_locale('en_US')
+    t = ''.join(map(methodcaller('rstrip', 'lL'), map(str, map(abs, x))))  # Remove negative signs and trailing L
+    s = ''
+    for i, z in enumerate(reversed(t), 1):
+        s = z + s
+        if i % 3 == 0 and i != len(t):
+            s = ',' + s
+    u = ''.join(map(methodcaller('rstrip', 'lL'), map(str, map(abs, y))))  # Remove negative signs and trailing L
+    v = ''
+    for i, z in enumerate(reversed(u), 1):
+        v = z + v
+        if i % 3 == 0 and i != len(u):
+            v = ',' + v
+    # Remove all but first comma.
+    a = v.split(',', 1)
+    p = a[0] + ',' + a[1].replace(',', '')
+    assert _pre_split_function(ns.LOCALE)('.'.join([s, v])) == '.'.join([t, u])
+    assert _pre_split_function(ns.LOCALE | ns.FLOAT)('.'.join([s, v])) == '.'.join([t, p])
+    locale.setlocale(locale.LC_ALL, str(''))
+
+
+# These might be too much to test with hypothesis.
+
+
+def test_pre_split_function_leaves_invalid_thousands_separator_with_LOCALE_example():
+    load_locale('en_US')
+    x = '12,543,642642.5345,34980'
+    assert _pre_split_function(ns.LOCALE)(x) == '12543,642642.5345,34980'
+    x = '12,59443,642,642.53,4534980'
+    assert _pre_split_function(ns.LOCALE)(x) == '12,59443,642642.53,4534980'
+    x = '12543,642,642.5,34534980'
+    assert _pre_split_function(ns.LOCALE)(x) == '12543,642642.5,34534980'
+    locale.setlocale(locale.LC_ALL, str(''))
+
+
+# @pytest.mark.skipif(not has_locale_de_DE or dumb_sort(), reason='requires de_DE locale and working locale')
+@pytest.mark.skipif(not has_locale_de_DE, reason='requires de_DE locale and working locale')
+def test_pre_split_function_replaces_decimal_separator_with_LOCALE_example():
+    load_locale('de_DE')
+    x = '1543,753'
+    assert _pre_split_function(ns.LOCALE)(x) == '1543,753'  # Does nothing without FLOAT
+    assert _pre_split_function(ns.LOCALE | ns.FLOAT)(x) == '1543.753'
+    assert _pre_split_function(ns.LOCALEALPHA)(x) == '1543,753'  # LOCALEALPHA doesn't do anything... need LOCALENUM
+    locale.setlocale(locale.LC_ALL, str(''))
+
+
+# @pytest.mark.skipif(not has_locale_de_DE or dumb_sort(), reason='requires de_DE locale and working locale')
+@pytest.mark.skipif(not has_locale_de_DE, reason='requires de_DE locale and working locale')
+def test_pre_split_function_does_not_replace_invalid_decimal_separator_with_LOCALE_example():
+    load_locale('de_DE')
+    x = '154s,t53'
+    assert _pre_split_function(ns.LOCALE | ns.FLOAT)(x) == '154s,t53'
+    locale.setlocale(locale.LC_ALL, str(''))
author	Seth M Morton <seth.m.morton@gmail.com>	2016-05-05 22:08:46 -0700
committer	Seth M Morton <seth.m.morton@gmail.com>	2016-05-05 22:08:46 -0700
commit	d0ec398f30351516d46cdf3e4ba1d30a3bb55046 (patch)
tree	915666884172f5d1c7401cdd87b45c767efc90f2
parent	3df99572f21d82975ea4dff60ad2695b9c0e1504 (diff)
download	natsort-d0ec398f30351516d46cdf3e4ba1d30a3bb55046.tar.gz