Merge pull request #54 from SethMMorton/52-sorting-arabic-farsi-or-hebrew-numbers-is-not-natural

Add support for ints and floats composed of Unicode decimal characters.
author: Seth Morton <seth.m.morton@gmail.com> 2018-04-20 20:10:16 -0700
committer: GitHub <noreply@github.com> 2018-04-20 20:10:16 -0700
commit: 68ed80722c16253c3981a66432325bff9bdfc45d (patch)
tree: 4af4ac76e65eefd25b6eda39888a67ea080b753d
parent: 9a79db89a66fbf93a09e397039396bbbadf6ddd0 (diff)
parent: 5f0abd85fa1e8aa53a1bc1645f009f740ce7f04f (diff)
download: natsort-68ed80722c16253c3981a66432325bff9bdfc45d.tar.gz
8 files changed, 73 insertions, 25 deletions
diff --git a/natsort/compat/fake_fastnumbers.py b/natsort/compat/fake_fastnumbers.py
index 0af60b5..6eee532 100644
--- a/natsort/compat/fake_fastnumbers.py
+++ b/natsort/compat/fake_fastnumbers.py
@@ -13,6 +13,7 @@ from __future__ import (
 
 # Std. lib imports.
 import unicodedata
+from natsort.unicode_numbers import decimal_chars
 from natsort.compat.py23 import PY_VERSION
 if PY_VERSION >= 3:
     long = int
@@ -22,16 +23,18 @@ NAN_INF = ['INF', 'INf', 'Inf', 'inF', 'iNF', 'InF', 'inf', 'iNf',
            'NAN', 'nan', 'NaN', 'nAn', 'naN', 'NAn', 'nAN', 'Nan']
 NAN_INF.extend(['+'+x[:2] for x in NAN_INF] + ['-'+x[:2] for x in NAN_INF])
 NAN_INF = frozenset(NAN_INF)
+ASCII_NUMS = '0123456789+-'
 
 
 def fast_float(x, key=lambda x: x, nan=None,
-               uni=unicodedata.numeric, nan_inf=NAN_INF):
+               uni=unicodedata.numeric, nan_inf=NAN_INF,
+               _first_char=frozenset(decimal_chars + list(ASCII_NUMS + '.'))):
     """\
     Convert a string to a float quickly, return input as-is if not possible.
     We don't need to accept all input that the real fast_int accepts because
     the input will be controlled by the splitting algorithm.
     """
-    if x[0] in '0123456789+-.' or x.lstrip()[:3] in nan_inf:
+    if x[0] in _first_char or x.lstrip()[:3] in nan_inf:
         try:
             x = float(x)
             return nan if nan is not None and x != x else x
@@ -47,13 +50,14 @@ def fast_float(x, key=lambda x: x, nan=None,
             return key(x)
 
 
-def fast_int(x, key=lambda x: x, nan=None, uni=unicodedata.digit):
+def fast_int(x, key=lambda x: x, nan=None, uni=unicodedata.digit,
+             _first_char=frozenset(decimal_chars + list(ASCII_NUMS))):
     """\
     Convert a string to a int quickly, return input as-is if not possible.
     We don't need to accept all input that the real fast_int accepts because
     the input will be controlled by the splitting algorithm.
     """
-    if x[0] in '0123456789+-':
+    if x[0] in _first_char:
         try:
             return long(x)
         except ValueError:
diff --git a/natsort/unicode_numbers.py b/natsort/unicode_numbers.py
index 921c5c1..e87654f 100644
--- a/natsort/unicode_numbers.py
+++ b/natsort/unicode_numbers.py
@@ -280,10 +280,17 @@ for a in numeric_hex:
 digit_chars = [a for a in numeric_chars
                if unicodedata.digit(a, None) is not None]
 
+# The decimal characters are a subset of the numerals
+# (probably of the digits, but let's be safe).
+decimal_chars = [a for a in numeric_chars
+                 if unicodedata.decimal(a, None) is not None]
+
 # Create a single string with the above data.
+decimals = ''.join(decimal_chars)
 digits = ''.join(digit_chars)
 numeric = ''.join(numeric_chars)
-
+digits_no_decimals = ''.join([x for x in digits if x not in decimals])
+numeric_no_decimals = ''.join([x for x in numeric if x not in decimals])
 
 # Some code that can be used to create the above list of hex numbers.
 if __name__ == '__main__':
diff --git a/natsort/utils.py b/natsort/utils.py
index b6484b0..306762a 100644
--- a/natsort/utils.py
+++ b/natsort/utils.py
@@ -58,7 +58,7 @@ from unicodedata import normalize
 
 # Local imports.
 from natsort.ns_enum import ns
-from natsort.unicode_numbers import digits, numeric
+from natsort.unicode_numbers import numeric_no_decimals, digits_no_decimals
 from natsort.compat.pathlib import PurePath, has_pathlib
 from natsort.compat.locale import (
     get_strxfrm,
@@ -80,25 +80,26 @@ if PY_VERSION >= 3:
     long = int
 
 # The regex that locates floats - include Unicode numerals.
-_exp = r'(?:[eE][-+]?[0-9]+)?'
-_num = r'(?:[0-9]+\.?[0-9]*|\.[0-9]+)'
+_nnd = numeric_no_decimals
+_exp = r'(?:[eE][-+]?\d+)?'
+_num = r'(?:\d+\.?\d*|\.\d+)'
 _float_sign_exp_re = r'([-+]?{0}{1}|[{2}])'
-_float_sign_exp_re = _float_sign_exp_re.format(_num, _exp, numeric)
+_float_sign_exp_re = _float_sign_exp_re.format(_num, _exp, _nnd)
 _float_sign_exp_re = re.compile(_float_sign_exp_re, flags=re.U)
 _float_nosign_exp_re = r'({0}{1}|[{2}])'
-_float_nosign_exp_re = _float_nosign_exp_re.format(_num, _exp, numeric)
+_float_nosign_exp_re = _float_nosign_exp_re.format(_num, _exp, _nnd)
 _float_nosign_exp_re = re.compile(_float_nosign_exp_re, flags=re.U)
 _float_sign_noexp_re = r'([-+]?{0}|[{1}])'
-_float_sign_noexp_re = _float_sign_noexp_re.format(_num, numeric)
+_float_sign_noexp_re = _float_sign_noexp_re.format(_num, _nnd)
 _float_sign_noexp_re = re.compile(_float_sign_noexp_re, flags=re.U)
 _float_nosign_noexp_re = r'({0}|[{1}])'
-_float_nosign_noexp_re = _float_nosign_noexp_re.format(_num, numeric)
+_float_nosign_noexp_re = _float_nosign_noexp_re.format(_num, _nnd)
 _float_nosign_noexp_re = re.compile(_float_nosign_noexp_re, flags=re.U)
 
 # Integer regexes - include Unicode digits.
-_int_nosign_re = r'([0-9]+|[{0}])'.format(digits)
+_int_nosign_re = r'(\d+|[{0}])'.format(digits_no_decimals)
 _int_nosign_re = re.compile(_int_nosign_re, flags=re.U)
-_int_sign_re = r'([-+]?[0-9]+|[{0}])'.format(digits)
+_int_sign_re = r'([-+]?\d+|[{0}])'.format(digits_no_decimals)
 _int_sign_re = re.compile(_int_sign_re, flags=re.U)
 
 # This dict will help select the correct regex and number conversion function.
diff --git a/test_natsort/slow_splitters.py b/test_natsort/slow_splitters.py
index aef329e..f3cd76e 100644
--- a/test_natsort/slow_splitters.py
+++ b/test_natsort/slow_splitters.py
@@ -6,6 +6,7 @@ import unicodedata
 import collections
 import itertools
 import functools
+from natsort.unicode_numbers import decimals
 from natsort.compat.py23 import PY_VERSION, py23_zip
 
 if PY_VERSION >= 3.0:
@@ -20,9 +21,9 @@ SplitElement = collections.namedtuple('SplitElement',
 def int_splitter(iterable, signed, sep):
     """Alternate (slow) method to split a string into numbers."""
     iterable = unicodedata.normalize('NFD', iterable)
-    split_by_digits = itertools.groupby(iterable, lambda a: a.isdigit())
-    split_by_digits = refine_split_grouping(split_by_digits)
-    split = int_splitter_iter(split_by_digits, signed)
+    split_by_decimal = itertools.groupby(iterable, lambda a: a.isdigit())
+    split_by_decimal = refine_split_grouping(split_by_decimal)
+    split = int_splitter_iter(split_by_decimal, signed)
     split = sep_inserter(split, sep)
     return tuple(add_leading_space_if_first_is_num(split, sep))
 
@@ -31,12 +32,12 @@ def float_splitter(iterable, signed, exp, sep):
     """Alternate (slow) method to split a string into numbers."""
 
     def number_tester(x):
-        return x.isdigit() or unicodedata.numeric(x, None) is not None
+        return x.isdecimal() or unicodedata.numeric(x, None) is not None
 
     iterable = unicodedata.normalize('NFD', iterable)
-    split_by_digits = itertools.groupby(iterable, number_tester)
-    split_by_digits = peekable(refine_split_grouping(split_by_digits))
-    split = float_splitter_iter(split_by_digits, signed, exp)
+    split_by_decimal = itertools.groupby(iterable, number_tester)
+    split_by_decimal = peekable(refine_split_grouping(split_by_decimal))
+    split = float_splitter_iter(split_by_decimal, signed, exp)
     split = sep_inserter(split, sep)
     return tuple(add_leading_space_if_first_is_num(split, sep))
 
@@ -64,8 +65,9 @@ def refine_split_grouping(iterable):
             yield SplitElement(False, val, False)
 
 
-def group_unicode_and_ascii_numbers(iterable,
-                                    ascii_digits=frozenset('0123456789')):
+def group_unicode_and_ascii_numbers(
+        iterable, ascii_digits=frozenset(decimals + '0123456789')
+):
     """
     Use groupby to group ASCII and unicode numeric characters.
     Assumes all input is already all numeric characters.
diff --git a/test_natsort/test_fake_fastnumbers.py b/test_natsort/test_fake_fastnumbers.py
index c9103c2..52bbb8c 100644
--- a/test_natsort/test_fake_fastnumbers.py
+++ b/test_natsort/test_fake_fastnumbers.py
@@ -78,6 +78,8 @@ def test_fast_float_converts_float_string_to_float_example():
     assert isnan(fast_float('nan'))
     assert isnan(fast_float('+nan'))
     assert isnan(fast_float('-NaN'))
+    assert fast_float('۱۲.۱۲') == 12.12
+    assert fast_float('-۱۲.۱۲') == -12.12
 
 
 @given(floats(allow_nan=False))
@@ -117,6 +119,8 @@ def test_fast_int_leaves_float_string_as_is(x):
 def test_fast_int_converts_int_string_to_int_example():
     assert fast_int('-45') == -45
     assert fast_int('+45') == 45
+    assert fast_int('۱۲') == 12
+    assert fast_int('-۱۲') == -12
 
 
 @given(integers())
diff --git a/test_natsort/test_natsorted.py b/test_natsort/test_natsorted.py
index 71ffbf6..ce6b879 100644
--- a/test_natsort/test_natsorted.py
+++ b/test_natsort/test_natsorted.py
@@ -280,3 +280,13 @@ def test_natsorted_sorts_an_odd_collection_of_string():
                             'apple', 'banana', 'corn', '~~~~~~']
     assert natsorted(a, alg=ns.NUMAFTER) == ['Apple', 'Banana', 'Corn',
                                              'apple', 'banana', 'corn', '~~~~~~', '73', '5039']
+
+
+def test_natsorted_sorts_mixed_ascii_and_non_ascii_numbers():
+    a = ['1st street', '10th street', '2nd street', '2 street', '1 street', '1street',
+         '11 street', 'street 2', 'street 1', 'Street 11', '۲ street', '۱ street', '۱street',
+         '۱۲street', '۱۱ street', 'street ۲', 'street ۱', 'street ۱', 'street ۱۲', 'street ۱۱']
+    expected = ['1 street', '۱ street', '1st street', '1street', '۱street', '2 street', '۲ street',
+                '2nd street', '10th street', '11 street', '۱۱ street', '۱۲street', 'street 1',
+                'street ۱', 'street ۱', 'street 2', 'street ۲', 'Street 11', 'street ۱۱', 'street ۱۲']
+    assert natsorted(a, alg=ns.IGNORECASE) == expected
diff --git a/test_natsort/test_unicode_numbers.py b/test_natsort/test_unicode_numbers.py
index 1b897eb..e257914 100644
--- a/test_natsort/test_unicode_numbers.py
+++ b/test_natsort/test_unicode_numbers.py
@@ -11,6 +11,10 @@ from natsort.unicode_numbers import (
     numeric,
     digit_chars,
     digits,
+    decimal_chars,
+    decimals,
+    digits_no_decimals,
+    numeric_no_decimals,
 )
 
 
@@ -24,10 +28,16 @@ def test_digit_chars_contains_only_valid_unicode_digit_characters():
         assert unicodedata.digit(a, None) is not None
 
 
+def test_decimal_chars_contains_only_valid_unicode_decimal_characters():
+    for a in decimal_chars:
+        assert unicodedata.decimal(a, None) is not None
+
+
 def test_numeric_chars_contains_all_valid_unicode_numeric_and_digit_characters():
     set_numeric_hex = set(numeric_hex)
     set_numeric_chars = set(numeric_chars)
     set_digit_chars = set(digit_chars)
+    set_decimal_chars = set(decimal_chars)
     for i in py23_range(0X110000):
         try:
             a = py23_unichr(i)
@@ -41,8 +51,18 @@ def test_numeric_chars_contains_all_valid_unicode_numeric_and_digit_characters()
         if unicodedata.digit(a, None) is not None:
             assert i in set_numeric_hex
             assert a in set_digit_chars
+        if unicodedata.decimal(a, None) is not None:
+            assert i in set_numeric_hex
+            assert a in set_decimal_chars
+
+    assert set_decimal_chars.isdisjoint(digits_no_decimals)
+    assert set_digit_chars.issuperset(digits_no_decimals)
+
+    assert set_decimal_chars.isdisjoint(numeric_no_decimals)
+    assert set_numeric_chars.issuperset(numeric_no_decimals)
 
 
 def test_combined_string_contains_all_characters_in_list():
     assert numeric == ''.join(numeric_chars)
     assert digits == ''.join(digit_chars)
+    assert decimals == ''.join(decimal_chars)
diff --git a/tox.ini b/tox.ini
index 8423b3e..31bdc1f 100644
--- a/tox.ini
+++ b/tox.ini
@@ -20,7 +20,7 @@ deps =
 commands =
     {envpython} -m pytest --doctest-modules natsort
     {envpython} -m pytest README.rst docs/source/intro.rst docs/source/examples.rst docs/source/howitworks.rst
-    {envpython} -m pytest --flakes --pep8 --cov natsort --cov-report term-missing
+    {envpython} -m pytest --flakes --pep8 --cov natsort --cov-report term-missing --tb=short
 
 [testenv:py26]
 commands =
@@ -30,7 +30,7 @@ commands =
 commands =
     {envpython} -m pytest --doctest-modules natsort
     {envpython} -m pytest README.rst docs/source/intro.rst docs/source/examples.rst
-    {envpython} -m pytest --flakes --pep8 --cov natsort --cov-report term-missing
+    {envpython} -m pytest --flakes --pep8 --cov natsort --cov-report term-missing --tb=short
 
 [testenv:pypy]
 deps =
author	Seth Morton <seth.m.morton@gmail.com>	2018-04-20 20:10:16 -0700
committer	GitHub <noreply@github.com>	2018-04-20 20:10:16 -0700
commit	68ed80722c16253c3981a66432325bff9bdfc45d (patch)
tree	4af4ac76e65eefd25b6eda39888a67ea080b753d
parent	9a79db89a66fbf93a09e397039396bbbadf6ddd0 (diff)
parent	5f0abd85fa1e8aa53a1bc1645f009f740ce7f04f (diff)
download	natsort-68ed80722c16253c3981a66432325bff9bdfc45d.tar.gz