diff options
author | Seth Morton <seth.m.morton@gmail.com> | 2018-04-20 20:10:16 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-04-20 20:10:16 -0700 |
commit | 68ed80722c16253c3981a66432325bff9bdfc45d (patch) | |
tree | 4af4ac76e65eefd25b6eda39888a67ea080b753d | |
parent | 9a79db89a66fbf93a09e397039396bbbadf6ddd0 (diff) | |
parent | 5f0abd85fa1e8aa53a1bc1645f009f740ce7f04f (diff) | |
download | natsort-68ed80722c16253c3981a66432325bff9bdfc45d.tar.gz |
Merge pull request #54 from SethMMorton/52-sorting-arabic-farsi-or-hebrew-numbers-is-not-natural
Add support for ints and floats comprised of unicode decimal strings.
-rw-r--r-- | natsort/compat/fake_fastnumbers.py | 12 | ||||
-rw-r--r-- | natsort/unicode_numbers.py | 9 | ||||
-rw-r--r-- | natsort/utils.py | 19 | ||||
-rw-r--r-- | test_natsort/slow_splitters.py | 20 | ||||
-rw-r--r-- | test_natsort/test_fake_fastnumbers.py | 4 | ||||
-rw-r--r-- | test_natsort/test_natsorted.py | 10 | ||||
-rw-r--r-- | test_natsort/test_unicode_numbers.py | 20 | ||||
-rw-r--r-- | tox.ini | 4 |
8 files changed, 73 insertions, 25 deletions
diff --git a/natsort/compat/fake_fastnumbers.py b/natsort/compat/fake_fastnumbers.py index 0af60b5..6eee532 100644 --- a/natsort/compat/fake_fastnumbers.py +++ b/natsort/compat/fake_fastnumbers.py @@ -13,6 +13,7 @@ from __future__ import ( # Std. lib imports. import unicodedata +from natsort.unicode_numbers import decimal_chars from natsort.compat.py23 import PY_VERSION if PY_VERSION >= 3: long = int @@ -22,16 +23,18 @@ NAN_INF = ['INF', 'INf', 'Inf', 'inF', 'iNF', 'InF', 'inf', 'iNf', 'NAN', 'nan', 'NaN', 'nAn', 'naN', 'NAn', 'nAN', 'Nan'] NAN_INF.extend(['+'+x[:2] for x in NAN_INF] + ['-'+x[:2] for x in NAN_INF]) NAN_INF = frozenset(NAN_INF) +ASCII_NUMS = '0123456789+-' def fast_float(x, key=lambda x: x, nan=None, - uni=unicodedata.numeric, nan_inf=NAN_INF): + uni=unicodedata.numeric, nan_inf=NAN_INF, + _first_char=frozenset(decimal_chars + list(ASCII_NUMS + '.'))): """\ Convert a string to a float quickly, return input as-is if not possible. We don't need to accept all input that the real fast_int accepts because the input will be controlled by the splitting algorithm. """ - if x[0] in '0123456789+-.' or x.lstrip()[:3] in nan_inf: + if x[0] in _first_char or x.lstrip()[:3] in nan_inf: try: x = float(x) return nan if nan is not None and x != x else x @@ -47,13 +50,14 @@ def fast_float(x, key=lambda x: x, nan=None, return key(x) -def fast_int(x, key=lambda x: x, nan=None, uni=unicodedata.digit): +def fast_int(x, key=lambda x: x, nan=None, uni=unicodedata.digit, + _first_char=frozenset(decimal_chars + list(ASCII_NUMS))): """\ Convert a string to a int quickly, return input as-is if not possible. We don't need to accept all input that the real fast_int accepts because the input will be controlled by the splitting algorithm. """ - if x[0] in '0123456789+-': + if x[0] in _first_char: try: return long(x) except ValueError: diff --git a/natsort/unicode_numbers.py b/natsort/unicode_numbers.py index 921c5c1..e87654f 100644 --- a/natsort/unicode_numbers.py +++ b/natsort/unicode_numbers.py @@ -280,10 +280,17 @@ for a in numeric_hex: digit_chars = [a for a in numeric_chars if unicodedata.digit(a, None) is not None] +# The decimal characters are a subset of the numberals +# (probably of the digits, but let's be safe). +decimal_chars = [a for a in numeric_chars + if unicodedata.decimal(a, None) is not None] + # Create a single string with the above data. +decimals = ''.join(decimal_chars) digits = ''.join(digit_chars) numeric = ''.join(numeric_chars) - +digits_no_decimals = ''.join([x for x in digits if x not in decimals]) +numeric_no_decimals = ''.join([x for x in numeric if x not in decimals]) # Some code that can be used to create the above list of hex numbers. if __name__ == '__main__': diff --git a/natsort/utils.py b/natsort/utils.py index b6484b0..306762a 100644 --- a/natsort/utils.py +++ b/natsort/utils.py @@ -58,7 +58,7 @@ from unicodedata import normalize # Local imports. from natsort.ns_enum import ns -from natsort.unicode_numbers import digits, numeric +from natsort.unicode_numbers import numeric_no_decimals, digits_no_decimals from natsort.compat.pathlib import PurePath, has_pathlib from natsort.compat.locale import ( get_strxfrm, @@ -80,25 +80,26 @@ if PY_VERSION >= 3: long = int # The regex that locates floats - include Unicode numerals. -_exp = r'(?:[eE][-+]?[0-9]+)?' -_num = r'(?:[0-9]+\.?[0-9]*|\.[0-9]+)' +_nnd = numeric_no_decimals +_exp = r'(?:[eE][-+]?\d+)?' +_num = r'(?:\d+\.?\d*|\.\d+)' _float_sign_exp_re = r'([-+]?{0}{1}|[{2}])' -_float_sign_exp_re = _float_sign_exp_re.format(_num, _exp, numeric) +_float_sign_exp_re = _float_sign_exp_re.format(_num, _exp, _nnd) _float_sign_exp_re = re.compile(_float_sign_exp_re, flags=re.U) _float_nosign_exp_re = r'({0}{1}|[{2}])' -_float_nosign_exp_re = _float_nosign_exp_re.format(_num, _exp, numeric) +_float_nosign_exp_re = _float_nosign_exp_re.format(_num, _exp, _nnd) _float_nosign_exp_re = re.compile(_float_nosign_exp_re, flags=re.U) _float_sign_noexp_re = r'([-+]?{0}|[{1}])' -_float_sign_noexp_re = _float_sign_noexp_re.format(_num, numeric) +_float_sign_noexp_re = _float_sign_noexp_re.format(_num, _nnd) _float_sign_noexp_re = re.compile(_float_sign_noexp_re, flags=re.U) _float_nosign_noexp_re = r'({0}|[{1}])' -_float_nosign_noexp_re = _float_nosign_noexp_re.format(_num, numeric) +_float_nosign_noexp_re = _float_nosign_noexp_re.format(_num, _nnd) _float_nosign_noexp_re = re.compile(_float_nosign_noexp_re, flags=re.U) # Integer regexes - include Unicode digits. -_int_nosign_re = r'([0-9]+|[{0}])'.format(digits) +_int_nosign_re = r'(\d+|[{0}])'.format(digits_no_decimals) _int_nosign_re = re.compile(_int_nosign_re, flags=re.U) -_int_sign_re = r'([-+]?[0-9]+|[{0}])'.format(digits) +_int_sign_re = r'([-+]?\d+|[{0}])'.format(digits_no_decimals) _int_sign_re = re.compile(_int_sign_re, flags=re.U) # This dict will help select the correct regex and number conversion function. diff --git a/test_natsort/slow_splitters.py b/test_natsort/slow_splitters.py index aef329e..f3cd76e 100644 --- a/test_natsort/slow_splitters.py +++ b/test_natsort/slow_splitters.py @@ -6,6 +6,7 @@ import unicodedata import collections import itertools import functools +from natsort.unicode_numbers import decimals from natsort.compat.py23 import PY_VERSION, py23_zip if PY_VERSION >= 3.0: @@ -20,9 +21,9 @@ SplitElement = collections.namedtuple('SplitElement', def int_splitter(iterable, signed, sep): """Alternate (slow) method to split a string into numbers.""" iterable = unicodedata.normalize('NFD', iterable) - split_by_digits = itertools.groupby(iterable, lambda a: a.isdigit()) - split_by_digits = refine_split_grouping(split_by_digits) - split = int_splitter_iter(split_by_digits, signed) + split_by_decimal = itertools.groupby(iterable, lambda a: a.isdigit()) + split_by_decimal = refine_split_grouping(split_by_decimal) + split = int_splitter_iter(split_by_decimal, signed) split = sep_inserter(split, sep) return tuple(add_leading_space_if_first_is_num(split, sep)) @@ -31,12 +32,12 @@ def float_splitter(iterable, signed, exp, sep): """Alternate (slow) method to split a string into numbers.""" def number_tester(x): - return x.isdigit() or unicodedata.numeric(x, None) is not None + return x.isdecimal() or unicodedata.numeric(x, None) is not None iterable = unicodedata.normalize('NFD', iterable) - split_by_digits = itertools.groupby(iterable, number_tester) - split_by_digits = peekable(refine_split_grouping(split_by_digits)) - split = float_splitter_iter(split_by_digits, signed, exp) + split_by_decimal = itertools.groupby(iterable, number_tester) + split_by_decimal = peekable(refine_split_grouping(split_by_decimal)) + split = float_splitter_iter(split_by_decimal, signed, exp) split = sep_inserter(split, sep) return tuple(add_leading_space_if_first_is_num(split, sep)) @@ -64,8 +65,9 @@ def refine_split_grouping(iterable): yield SplitElement(False, val, False) -def group_unicode_and_ascii_numbers(iterable, - ascii_digits=frozenset('0123456789')): +def group_unicode_and_ascii_numbers( + iterable, ascii_digits=frozenset(decimals + '0123456789') +): """ Use groupby to group ASCII and unicode numeric characters. Assumes all input is already all numeric characters. diff --git a/test_natsort/test_fake_fastnumbers.py b/test_natsort/test_fake_fastnumbers.py index c9103c2..52bbb8c 100644 --- a/test_natsort/test_fake_fastnumbers.py +++ b/test_natsort/test_fake_fastnumbers.py @@ -78,6 +78,8 @@ def test_fast_float_converts_float_string_to_float_example(): assert isnan(fast_float('nan')) assert isnan(fast_float('+nan')) assert isnan(fast_float('-NaN')) + assert fast_float('۱۲.۱۲') == 12.12 + assert fast_float('-۱۲.۱۲') == -12.12 @given(floats(allow_nan=False)) @@ -117,6 +119,8 @@ def test_fast_int_leaves_float_string_as_is(x): def test_fast_int_converts_int_string_to_int_example(): assert fast_int('-45') == -45 assert fast_int('+45') == 45 + assert fast_int('۱۲') == 12 + assert fast_int('-۱۲') == -12 @given(integers()) diff --git a/test_natsort/test_natsorted.py b/test_natsort/test_natsorted.py index 71ffbf6..ce6b879 100644 --- a/test_natsort/test_natsorted.py +++ b/test_natsort/test_natsorted.py @@ -280,3 +280,13 @@ def test_natsorted_sorts_an_odd_collection_of_string(): 'apple', 'banana', 'corn', '~~~~~~'] assert natsorted(a, alg=ns.NUMAFTER) == ['Apple', 'Banana', 'Corn', 'apple', 'banana', 'corn', '~~~~~~', '73', '5039'] + + +def test_natsorted_sorts_mixed_ascii_and_non_ascii_numbers(): + a = ['1st street', '10th street', '2nd street', '2 street', '1 street', '1street', + '11 street', 'street 2', 'street 1', 'Street 11', '۲ street', '۱ street', '۱street', + '۱۲street', '۱۱ street', 'street ۲', 'street ۱', 'street ۱', 'street ۱۲', 'street ۱۱'] + expected = ['1 street', '۱ street', '1st street', '1street', '۱street', '2 street', '۲ street', + '2nd street', '10th street', '11 street', '۱۱ street', '۱۲street', 'street 1', + 'street ۱', 'street ۱', 'street 2', 'street ۲', 'Street 11', 'street ۱۱', 'street ۱۲'] + assert natsorted(a, alg=ns.IGNORECASE) == expected diff --git a/test_natsort/test_unicode_numbers.py b/test_natsort/test_unicode_numbers.py index 1b897eb..e257914 100644 --- a/test_natsort/test_unicode_numbers.py +++ b/test_natsort/test_unicode_numbers.py @@ -11,6 +11,10 @@ from natsort.unicode_numbers import ( numeric, digit_chars, digits, + decimal_chars, + decimals, + digits_no_decimals, + numeric_no_decimals, ) @@ -24,10 +28,16 @@ def test_digit_chars_contains_only_valid_unicode_digit_characters(): assert unicodedata.digit(a, None) is not None +def test_decimal_chars_contains_only_valid_unicode_decimal_characters(): + for a in decimal_chars: + assert unicodedata.decimal(a, None) is not None + + def test_numeric_chars_contains_all_valid_unicode_numeric_and_digit_characters(): set_numeric_hex = set(numeric_hex) set_numeric_chars = set(numeric_chars) set_digit_chars = set(digit_chars) + set_decimal_chars = set(decimal_chars) for i in py23_range(0X110000): try: a = py23_unichr(i) @@ -41,8 +51,18 @@ def test_numeric_chars_contains_all_valid_unicode_numeric_and_digit_characters() if unicodedata.digit(a, None) is not None: assert i in set_numeric_hex assert a in set_digit_chars + if unicodedata.decimal(a, None) is not None: + assert i in set_numeric_hex + assert a in set_decimal_chars + + assert set_decimal_chars.isdisjoint(digits_no_decimals) + assert set_digit_chars.issuperset(digits_no_decimals) + + assert set_decimal_chars.isdisjoint(numeric_no_decimals) + assert set_numeric_chars.issuperset(numeric_no_decimals) def test_combined_string_contains_all_characters_in_list(): assert numeric == ''.join(numeric_chars) assert digits == ''.join(digit_chars) + assert decimals == ''.join(decimal_chars) @@ -20,7 +20,7 @@ deps = commands = {envpython} -m pytest --doctest-modules natsort {envpython} -m pytest README.rst docs/source/intro.rst docs/source/examples.rst docs/source/howitworks.rst - {envpython} -m pytest --flakes --pep8 --cov natsort --cov-report term-missing + {envpython} -m pytest --flakes --pep8 --cov natsort --cov-report term-missing --tb=short [testenv:py26] commands = @@ -30,7 +30,7 @@ commands = commands = {envpython} -m pytest --doctest-modules natsort {envpython} -m pytest README.rst docs/source/intro.rst docs/source/examples.rst - {envpython} -m pytest --flakes --pep8 --cov natsort --cov-report term-missing + {envpython} -m pytest --flakes --pep8 --cov natsort --cov-report term-missing --tb=short [testenv:pypy] deps = |