summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSeth M Morton <seth.m.morton@gmail.com>2016-05-05 22:08:46 -0700
committerSeth M Morton <seth.m.morton@gmail.com>2016-05-05 22:08:46 -0700
commitd0ec398f30351516d46cdf3e4ba1d30a3bb55046 (patch)
tree915666884172f5d1c7401cdd87b45c767efc90f2
parent3df99572f21d82975ea4dff60ad2695b9c0e1504 (diff)
downloadnatsort-d0ec398f30351516d46cdf3e4ba1d30a3bb55046.tar.gz
Added unit tests for LOCALENUM functionality.
LOCALENUM is also now dependent on FLOAT. Additionally, it was found that the broken locale problem extends to the thousands separator character. A lookup table has been implemented for locales with known problems. This closes issue #36.
-rw-r--r--natsort/compat/locale.py59
-rw-r--r--natsort/utils.py28
-rw-r--r--test_natsort/test_natsorted.py3
-rw-r--r--test_natsort/test_pre_split_function.py94
4 files changed, 167 insertions, 17 deletions
diff --git a/natsort/compat/locale.py b/natsort/compat/locale.py
index e199aaf..a2371ef 100644
--- a/natsort/compat/locale.py
+++ b/natsort/compat/locale.py
@@ -18,6 +18,9 @@ try:
null_string = b''
+ def dumb_sort():
+ return False
+
# If using PyICU, get the locale from the current global locale,
def get_icu_locale():
try:
@@ -36,9 +39,6 @@ try:
sep = PyICU.DecimalFormatSymbols.kDecimalSeparatorSymbol
return PyICU.DecimalFormatSymbols(get_icu_locale()).getSymbol(sep)
- def dumb_sort():
- return False
-
except ImportError:
import locale
if PY_VERSION < 3:
@@ -49,16 +49,57 @@ except ImportError:
from locale import strxfrm
null_string = ''
+ # On some systems, locale is broken and does not sort in the expected
+ # order. We will try to detect this and compensate.
+ def dumb_sort():
+ return strxfrm('A') < strxfrm('a')
+
def get_strxfrm():
return strxfrm
def get_thousands_sep():
- return locale.localeconv()['thousands_sep']
+ sep = locale.localeconv()['thousands_sep']
+ # If this locale library is broken, some of the thousands separator
+ # characters are incorrectly blank. Here is a lookup table of the
+ # corrections I am aware of.
+ if dumb_sort():
+ loc = '.'.join(locale.getlocale())
+ return {'de_DE.ISO8859-15': '.',
+ 'es_ES.ISO8859-1': '.',
+ 'de_AT.ISO8859-1': '.',
+ 'de_at': '\xa0',
+ 'nl_NL.UTF-8': '.',
+ 'es_es': '.',
+ 'fr_CH.ISO8859-15': '\xa0',
+ 'fr_CA.ISO8859-1': '\xa0',
+ 'de_CH.ISO8859-1': '.',
+ 'fr_FR.ISO8859-15': '\xa0',
+ 'nl_NL.ISO8859-1': '.',
+ 'ca_ES.UTF-8': '.',
+ 'nl_NL.ISO8859-15': '.',
+ 'de_ch': "'",
+ 'ca_es': '.',
+ 'de_AT.ISO8859-15': '.',
+ 'ca_ES.ISO8859-1': '.',
+ 'de_AT.UTF-8': '.',
+ 'es_ES.UTF-8': '.',
+ 'fr_fr': '\xa0',
+ 'es_ES.ISO8859-15': '.',
+ 'de_DE.ISO8859-1': '.',
+ 'nl_nl': '.',
+ 'fr_ch': '\xa0',
+ 'fr_ca': '\xa0',
+ 'de_DE.UTF-8': '.',
+ 'ca_ES.ISO8859-15': '.',
+ 'de_CH.ISO8859-15': '.',
+ 'fr_FR.ISO8859-1': '\xa0',
+ 'fr_CH.ISO8859-1': '\xa0',
+ 'de_de': '.',
+ 'fr_FR.UTF-8': '\xa0',
+ 'fr_CA.ISO8859-15': '\xa0',
+ }.get(loc, sep)
+ else:
+ return sep
def get_decimal_point():
return locale.localeconv()['decimal_point']
-
- # On some systems, locale is broken and does not sort in the expected
- # order. We will try to detect this and compensate.
- def dumb_sort():
- return strxfrm('A') < strxfrm('a')
diff --git a/natsort/utils.py b/natsort/utils.py
index 23f83b2..2bf1c5d 100644
--- a/natsort/utils.py
+++ b/natsort/utils.py
@@ -228,17 +228,33 @@ def _pre_split_function(alg):
if alg & ns.LOCALENUM:
# Create a regular expression that will remove thousands separators.
- thousands = get_thousands_sep()
- strip_thousands = (r'(?<![0-9]{{4}})(?<=[0-9]{{1}})'
- r'{thousands}(?=[0-9]{{3}}([^0-9]|$))')
- strip_thousands = strip_thousands.format(thousands=thousands)
- strip_thousands = re.compile(strip_thousands)
+ strip_thousands = r'''
+ (?<=[0-9]{{1}}) # At least 1 number
+ (?<![0-9]{{4}}) # No more than 3 numbers
+ {nodecimal} # Cannot follow decimal
+ {thou} # The thousands separator
+ (?=[0-9]{{3}} # Three numbers must follow
+ ([^0-9]|$) # But a non-number after that
+ )
+ '''
+ nodecimal = r''
+ if alg & ns.FLOAT:
+ # Make a regular expression component that will ensure no
+ # separators are removed after a decimal point.
+ d = get_decimal_point()
+ d = r'\.' if d == r'.' else d
+ nodecimal += r'(?<!' + d + r'[0-9])'
+ nodecimal += r'(?<!' + d + r'[0-9]{2})'
+ nodecimal += r'(?<!' + d + r'[0-9]{3})'
+ strip_thousands = strip_thousands.format(thou=get_thousands_sep(),
+ nodecimal=nodecimal)
+ strip_thousands = re.compile(strip_thousands, flags=re.VERBOSE)
function_chain.append(partial(strip_thousands.sub, ''))
# Create a regular expression that will change the decimal point to
# a period if not already a period.
decimal = get_decimal_point()
- if decimal != '.':
+ if alg & ns.FLOAT and decimal != '.':
switch_decimal = r'(?<=[0-9]){decimal}|{decimal}(?=[0-9])'
switch_decimal = switch_decimal.format(decimal=decimal)
switch_decimal = re.compile(switch_decimal)
diff --git a/test_natsort/test_natsorted.py b/test_natsort/test_natsorted.py
index 8802825..146997a 100644
--- a/test_natsort/test_natsorted.py
+++ b/test_natsort/test_natsorted.py
@@ -13,7 +13,6 @@ from natsort import (
natsorted,
ns,
)
-from natsort.compat.locale import dumb_sort
from compat.locale import (
load_locale,
has_locale_de_DE,
@@ -230,7 +229,7 @@ def test_natsorted_with_LOCALE_and_en_setting_returns_results_sorted_by_en_langu
locale.setlocale(locale.LC_ALL, str(''))
-@pytest.mark.skipif(not has_locale_de_DE or dumb_sort(), reason='requires de_DE locale and working locale')
+@pytest.mark.skipif(not has_locale_de_DE, reason='requires de_DE locale and working locale')
def test_natsorted_with_LOCALE_and_de_setting_returns_results_sorted_by_de_language():
load_locale('de_DE')
a = ['c', 'a5.467,86', 'ä', 'b', 'a5367.86', 'a5,6', 'a5,50']
diff --git a/test_natsort/test_pre_split_function.py b/test_natsort/test_pre_split_function.py
index 58cc108..c9db671 100644
--- a/test_natsort/test_pre_split_function.py
+++ b/test_natsort/test_pre_split_function.py
@@ -3,12 +3,20 @@
from __future__ import unicode_literals
import pytest
+import locale
+from operator import methodcaller
from natsort.ns_enum import ns
from natsort.utils import _pre_split_function
from natsort.compat.py23 import NEWPY
+from compat.locale import (
+ load_locale,
+ has_locale_de_DE,
+)
from compat.hypothesis import (
given,
text,
+ integers,
+ lists,
use_hypothesis,
)
@@ -94,3 +102,89 @@ def test_pre_split_function_performs_swapcase_and_casefold_both_LOWERCASEFIRST_A
assert _pre_split_function(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().casefold()
else:
assert _pre_split_function(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().lower()
+
+
+def test_pre_split_function_removes_thousands_separator_with_LOCALE_example():
+ load_locale('en_US')
+ x = '12,543,642,642.534,534,980' # Without FLOAT it does not account for decimal.
+ assert _pre_split_function(ns.LOCALE)(x) == '12543642642.534534980'
+ x = '12,543,642,642.534,534,980' # LOCALEALPHA doesn't do anything... need LOCALENUM
+ assert _pre_split_function(ns.LOCALEALPHA)(x) == '12,543,642,642.534,534,980'
+ locale.setlocale(locale.LC_ALL, str(''))
+
+
+@pytest.mark.skipif(not use_hypothesis, reason='requires python2.7 or greater')
+@given(lists(elements=integers(), min_size=4, max_size=20))
+def test_pre_split_function_removes_thousands_separator_with_LOCALE(x):
+ load_locale('en_US')
+ t = ''.join(map(methodcaller('rstrip', 'lL'), map(str, map(abs, x)))) # Remove negative signs trailing L
+ s = ''
+ for i, y in enumerate(reversed(t), 1):
+ s = y + s
+ if i % 3 == 0 and i != len(t):
+ s = ',' + s
+ assert _pre_split_function(ns.LOCALE)(s) == t
+ locale.setlocale(locale.LC_ALL, str(''))
+
+
+def test_pre_split_function_removes_thousands_separator_and_is_float_aware_with_LOCALE_and_FLOAT_example():
+ x = '12,543,642,642.534,534,980'
+ assert _pre_split_function(ns.LOCALE | ns.FLOAT)(x) == '12543642642.534,534980'
+
+
+@pytest.mark.skipif(not use_hypothesis, reason='requires python2.7 or greater')
+@given(lists(elements=integers(), min_size=4, max_size=20), lists(elements=integers(), min_size=4, max_size=20))
+def test_pre_split_function_removes_thousands_separator_and_is_float_aware_with_LOCALE_and_FLOAT(x, y):
+ load_locale('en_US')
+ t = ''.join(map(methodcaller('rstrip', 'lL'), map(str, map(abs, x)))) # Remove negative signs trailing L
+ s = ''
+ for i, z in enumerate(reversed(t), 1):
+ s = z + s
+ if i % 3 == 0 and i != len(t):
+ s = ',' + s
+ u = ''.join(map(methodcaller('rstrip', 'lL'), map(str, map(abs, y)))) # Remove negative signs trailing L
+ v = ''
+ for i, z in enumerate(reversed(u), 1):
+ v = z + v
+ if i % 3 == 0 and i != len(u):
+ v = ',' + v
+ # Remove all but first comma.
+ a = v.split(',', 1)
+ p = a[0] + ',' + a[1].replace(',', '')
+ assert _pre_split_function(ns.LOCALE)('.'.join([s, v])) == '.'.join([t, u])
+ assert _pre_split_function(ns.LOCALE | ns.FLOAT)('.'.join([s, v])) == '.'.join([t, p])
+ locale.setlocale(locale.LC_ALL, str(''))
+
+
+# These might be too much to test with hypothesis.
+
+
+def test_pre_split_function_leaves_invalid_thousands_separator_with_LOCALE_example():
+ load_locale('en_US')
+ x = '12,543,642642.5345,34980'
+ assert _pre_split_function(ns.LOCALE)(x) == '12543,642642.5345,34980'
+ x = '12,59443,642,642.53,4534980'
+ assert _pre_split_function(ns.LOCALE)(x) == '12,59443,642642.53,4534980'
+ x = '12543,642,642.5,34534980'
+ assert _pre_split_function(ns.LOCALE)(x) == '12543,642642.5,34534980'
+ locale.setlocale(locale.LC_ALL, str(''))
+
+
+# @pytest.mark.skipif(not has_locale_de_DE or dumb_sort(), reason='requires de_DE locale and working locale')
+@pytest.mark.skipif(not has_locale_de_DE, reason='requires de_DE locale and working locale')
+def test_pre_split_function_replaces_decimal_separator_with_LOCALE_example():
+ load_locale('de_DE')
+ x = '1543,753'
+ assert _pre_split_function(ns.LOCALE)(x) == '1543,753' # Does nothing without FLOAT
+ assert _pre_split_function(ns.LOCALE | ns.FLOAT)(x) == '1543.753'
+ assert _pre_split_function(ns.LOCALEALPHA)(x) == '1543,753' # LOCALEALPHA doesn't do anything... need LOCALENUM
+ locale.setlocale(locale.LC_ALL, str(''))
+
+
+# @pytest.mark.skipif(not has_locale_de_DE or dumb_sort(), reason='requires de_DE locale and working locale')
+@pytest.mark.skipif(not has_locale_de_DE, reason='requires de_DE locale and working locale')
+def test_pre_split_function_does_not_replace_invalid_decimal_separator_with_LOCALE_example():
+ load_locale('de_DE')
+ x = '154s,t53'
+ assert _pre_split_function(ns.LOCALE | ns.FLOAT)(x) == '154s,t53'
+ locale.setlocale(locale.LC_ALL, str(''))