summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSeth Morton <seth.m.morton@gmail.com>2018-04-20 20:10:16 -0700
committerGitHub <noreply@github.com>2018-04-20 20:10:16 -0700
commit68ed80722c16253c3981a66432325bff9bdfc45d (patch)
tree4af4ac76e65eefd25b6eda39888a67ea080b753d
parent9a79db89a66fbf93a09e397039396bbbadf6ddd0 (diff)
parent5f0abd85fa1e8aa53a1bc1645f009f740ce7f04f (diff)
downloadnatsort-68ed80722c16253c3981a66432325bff9bdfc45d.tar.gz
Merge pull request #54 from SethMMorton/52-sorting-arabic-farsi-or-hebrew-numbers-is-not-natural
Add support for ints and floats comprised of unicode decimal strings.
-rw-r--r--natsort/compat/fake_fastnumbers.py12
-rw-r--r--natsort/unicode_numbers.py9
-rw-r--r--natsort/utils.py19
-rw-r--r--test_natsort/slow_splitters.py20
-rw-r--r--test_natsort/test_fake_fastnumbers.py4
-rw-r--r--test_natsort/test_natsorted.py10
-rw-r--r--test_natsort/test_unicode_numbers.py20
-rw-r--r--tox.ini4
8 files changed, 73 insertions, 25 deletions
diff --git a/natsort/compat/fake_fastnumbers.py b/natsort/compat/fake_fastnumbers.py
index 0af60b5..6eee532 100644
--- a/natsort/compat/fake_fastnumbers.py
+++ b/natsort/compat/fake_fastnumbers.py
@@ -13,6 +13,7 @@ from __future__ import (
# Std. lib imports.
import unicodedata
+from natsort.unicode_numbers import decimal_chars
from natsort.compat.py23 import PY_VERSION
if PY_VERSION >= 3:
long = int
@@ -22,16 +23,18 @@ NAN_INF = ['INF', 'INf', 'Inf', 'inF', 'iNF', 'InF', 'inf', 'iNf',
'NAN', 'nan', 'NaN', 'nAn', 'naN', 'NAn', 'nAN', 'Nan']
NAN_INF.extend(['+'+x[:2] for x in NAN_INF] + ['-'+x[:2] for x in NAN_INF])
NAN_INF = frozenset(NAN_INF)
+ASCII_NUMS = '0123456789+-'
def fast_float(x, key=lambda x: x, nan=None,
- uni=unicodedata.numeric, nan_inf=NAN_INF):
+ uni=unicodedata.numeric, nan_inf=NAN_INF,
+ _first_char=frozenset(decimal_chars + list(ASCII_NUMS + '.'))):
"""\
Convert a string to a float quickly, return input as-is if not possible.
We don't need to accept all input that the real fast_float accepts because
the input will be controlled by the splitting algorithm.
"""
- if x[0] in '0123456789+-.' or x.lstrip()[:3] in nan_inf:
+ if x[0] in _first_char or x.lstrip()[:3] in nan_inf:
try:
x = float(x)
return nan if nan is not None and x != x else x
@@ -47,13 +50,14 @@ def fast_float(x, key=lambda x: x, nan=None,
return key(x)
-def fast_int(x, key=lambda x: x, nan=None, uni=unicodedata.digit):
+def fast_int(x, key=lambda x: x, nan=None, uni=unicodedata.digit,
+ _first_char=frozenset(decimal_chars + list(ASCII_NUMS))):
"""\
Convert a string to an int quickly, return input as-is if not possible.
We don't need to accept all input that the real fast_int accepts because
the input will be controlled by the splitting algorithm.
"""
- if x[0] in '0123456789+-':
+ if x[0] in _first_char:
try:
return long(x)
except ValueError:
diff --git a/natsort/unicode_numbers.py b/natsort/unicode_numbers.py
index 921c5c1..e87654f 100644
--- a/natsort/unicode_numbers.py
+++ b/natsort/unicode_numbers.py
@@ -280,10 +280,17 @@ for a in numeric_hex:
digit_chars = [a for a in numeric_chars
if unicodedata.digit(a, None) is not None]
+# The decimal characters are a subset of the numerals
+# (probably of the digits, but let's be safe).
+decimal_chars = [a for a in numeric_chars
+ if unicodedata.decimal(a, None) is not None]
+
# Create a single string with the above data.
+decimals = ''.join(decimal_chars)
digits = ''.join(digit_chars)
numeric = ''.join(numeric_chars)
-
+digits_no_decimals = ''.join([x for x in digits if x not in decimals])
+numeric_no_decimals = ''.join([x for x in numeric if x not in decimals])
# Some code that can be used to create the above list of hex numbers.
if __name__ == '__main__':
diff --git a/natsort/utils.py b/natsort/utils.py
index b6484b0..306762a 100644
--- a/natsort/utils.py
+++ b/natsort/utils.py
@@ -58,7 +58,7 @@ from unicodedata import normalize
# Local imports.
from natsort.ns_enum import ns
-from natsort.unicode_numbers import digits, numeric
+from natsort.unicode_numbers import numeric_no_decimals, digits_no_decimals
from natsort.compat.pathlib import PurePath, has_pathlib
from natsort.compat.locale import (
get_strxfrm,
@@ -80,25 +80,26 @@ if PY_VERSION >= 3:
long = int
# The regex that locates floats - include Unicode numerals.
-_exp = r'(?:[eE][-+]?[0-9]+)?'
-_num = r'(?:[0-9]+\.?[0-9]*|\.[0-9]+)'
+_nnd = numeric_no_decimals
+_exp = r'(?:[eE][-+]?\d+)?'
+_num = r'(?:\d+\.?\d*|\.\d+)'
_float_sign_exp_re = r'([-+]?{0}{1}|[{2}])'
-_float_sign_exp_re = _float_sign_exp_re.format(_num, _exp, numeric)
+_float_sign_exp_re = _float_sign_exp_re.format(_num, _exp, _nnd)
_float_sign_exp_re = re.compile(_float_sign_exp_re, flags=re.U)
_float_nosign_exp_re = r'({0}{1}|[{2}])'
-_float_nosign_exp_re = _float_nosign_exp_re.format(_num, _exp, numeric)
+_float_nosign_exp_re = _float_nosign_exp_re.format(_num, _exp, _nnd)
_float_nosign_exp_re = re.compile(_float_nosign_exp_re, flags=re.U)
_float_sign_noexp_re = r'([-+]?{0}|[{1}])'
-_float_sign_noexp_re = _float_sign_noexp_re.format(_num, numeric)
+_float_sign_noexp_re = _float_sign_noexp_re.format(_num, _nnd)
_float_sign_noexp_re = re.compile(_float_sign_noexp_re, flags=re.U)
_float_nosign_noexp_re = r'({0}|[{1}])'
-_float_nosign_noexp_re = _float_nosign_noexp_re.format(_num, numeric)
+_float_nosign_noexp_re = _float_nosign_noexp_re.format(_num, _nnd)
_float_nosign_noexp_re = re.compile(_float_nosign_noexp_re, flags=re.U)
# Integer regexes - include Unicode digits.
-_int_nosign_re = r'([0-9]+|[{0}])'.format(digits)
+_int_nosign_re = r'(\d+|[{0}])'.format(digits_no_decimals)
_int_nosign_re = re.compile(_int_nosign_re, flags=re.U)
-_int_sign_re = r'([-+]?[0-9]+|[{0}])'.format(digits)
+_int_sign_re = r'([-+]?\d+|[{0}])'.format(digits_no_decimals)
_int_sign_re = re.compile(_int_sign_re, flags=re.U)
# This dict will help select the correct regex and number conversion function.
diff --git a/test_natsort/slow_splitters.py b/test_natsort/slow_splitters.py
index aef329e..f3cd76e 100644
--- a/test_natsort/slow_splitters.py
+++ b/test_natsort/slow_splitters.py
@@ -6,6 +6,7 @@ import unicodedata
import collections
import itertools
import functools
+from natsort.unicode_numbers import decimals
from natsort.compat.py23 import PY_VERSION, py23_zip
if PY_VERSION >= 3.0:
@@ -20,9 +21,9 @@ SplitElement = collections.namedtuple('SplitElement',
def int_splitter(iterable, signed, sep):
"""Alternate (slow) method to split a string into numbers."""
iterable = unicodedata.normalize('NFD', iterable)
- split_by_digits = itertools.groupby(iterable, lambda a: a.isdigit())
- split_by_digits = refine_split_grouping(split_by_digits)
- split = int_splitter_iter(split_by_digits, signed)
+ split_by_decimal = itertools.groupby(iterable, lambda a: a.isdigit())
+ split_by_decimal = refine_split_grouping(split_by_decimal)
+ split = int_splitter_iter(split_by_decimal, signed)
split = sep_inserter(split, sep)
return tuple(add_leading_space_if_first_is_num(split, sep))
@@ -31,12 +32,12 @@ def float_splitter(iterable, signed, exp, sep):
"""Alternate (slow) method to split a string into numbers."""
def number_tester(x):
- return x.isdigit() or unicodedata.numeric(x, None) is not None
+ return x.isdecimal() or unicodedata.numeric(x, None) is not None
iterable = unicodedata.normalize('NFD', iterable)
- split_by_digits = itertools.groupby(iterable, number_tester)
- split_by_digits = peekable(refine_split_grouping(split_by_digits))
- split = float_splitter_iter(split_by_digits, signed, exp)
+ split_by_decimal = itertools.groupby(iterable, number_tester)
+ split_by_decimal = peekable(refine_split_grouping(split_by_decimal))
+ split = float_splitter_iter(split_by_decimal, signed, exp)
split = sep_inserter(split, sep)
return tuple(add_leading_space_if_first_is_num(split, sep))
@@ -64,8 +65,9 @@ def refine_split_grouping(iterable):
yield SplitElement(False, val, False)
-def group_unicode_and_ascii_numbers(iterable,
- ascii_digits=frozenset('0123456789')):
+def group_unicode_and_ascii_numbers(
+ iterable, ascii_digits=frozenset(decimals + '0123456789')
+):
"""
Use groupby to group ASCII and unicode numeric characters.
Assumes all input is already all numeric characters.
diff --git a/test_natsort/test_fake_fastnumbers.py b/test_natsort/test_fake_fastnumbers.py
index c9103c2..52bbb8c 100644
--- a/test_natsort/test_fake_fastnumbers.py
+++ b/test_natsort/test_fake_fastnumbers.py
@@ -78,6 +78,8 @@ def test_fast_float_converts_float_string_to_float_example():
assert isnan(fast_float('nan'))
assert isnan(fast_float('+nan'))
assert isnan(fast_float('-NaN'))
+ assert fast_float('۱۲.۱۲') == 12.12
+ assert fast_float('-۱۲.۱۲') == -12.12
@given(floats(allow_nan=False))
@@ -117,6 +119,8 @@ def test_fast_int_leaves_float_string_as_is(x):
def test_fast_int_converts_int_string_to_int_example():
assert fast_int('-45') == -45
assert fast_int('+45') == 45
+ assert fast_int('۱۲') == 12
+ assert fast_int('-۱۲') == -12
@given(integers())
diff --git a/test_natsort/test_natsorted.py b/test_natsort/test_natsorted.py
index 71ffbf6..ce6b879 100644
--- a/test_natsort/test_natsorted.py
+++ b/test_natsort/test_natsorted.py
@@ -280,3 +280,13 @@ def test_natsorted_sorts_an_odd_collection_of_string():
'apple', 'banana', 'corn', '~~~~~~']
assert natsorted(a, alg=ns.NUMAFTER) == ['Apple', 'Banana', 'Corn',
'apple', 'banana', 'corn', '~~~~~~', '73', '5039']
+
+
+def test_natsorted_sorts_mixed_ascii_and_non_ascii_numbers():
+ a = ['1st street', '10th street', '2nd street', '2 street', '1 street', '1street',
+ '11 street', 'street 2', 'street 1', 'Street 11', '۲ street', '۱ street', '۱street',
+ '۱۲street', '۱۱ street', 'street ۲', 'street ۱', 'street ۱', 'street ۱۲', 'street ۱۱']
+ expected = ['1 street', '۱ street', '1st street', '1street', '۱street', '2 street', '۲ street',
+ '2nd street', '10th street', '11 street', '۱۱ street', '۱۲street', 'street 1',
+ 'street ۱', 'street ۱', 'street 2', 'street ۲', 'Street 11', 'street ۱۱', 'street ۱۲']
+ assert natsorted(a, alg=ns.IGNORECASE) == expected
diff --git a/test_natsort/test_unicode_numbers.py b/test_natsort/test_unicode_numbers.py
index 1b897eb..e257914 100644
--- a/test_natsort/test_unicode_numbers.py
+++ b/test_natsort/test_unicode_numbers.py
@@ -11,6 +11,10 @@ from natsort.unicode_numbers import (
numeric,
digit_chars,
digits,
+ decimal_chars,
+ decimals,
+ digits_no_decimals,
+ numeric_no_decimals,
)
@@ -24,10 +28,16 @@ def test_digit_chars_contains_only_valid_unicode_digit_characters():
assert unicodedata.digit(a, None) is not None
+def test_decimal_chars_contains_only_valid_unicode_decimal_characters():
+ for a in decimal_chars:
+ assert unicodedata.decimal(a, None) is not None
+
+
def test_numeric_chars_contains_all_valid_unicode_numeric_and_digit_characters():
set_numeric_hex = set(numeric_hex)
set_numeric_chars = set(numeric_chars)
set_digit_chars = set(digit_chars)
+ set_decimal_chars = set(decimal_chars)
for i in py23_range(0X110000):
try:
a = py23_unichr(i)
@@ -41,8 +51,18 @@ def test_numeric_chars_contains_all_valid_unicode_numeric_and_digit_characters()
if unicodedata.digit(a, None) is not None:
assert i in set_numeric_hex
assert a in set_digit_chars
+ if unicodedata.decimal(a, None) is not None:
+ assert i in set_numeric_hex
+ assert a in set_decimal_chars
+
+ assert set_decimal_chars.isdisjoint(digits_no_decimals)
+ assert set_digit_chars.issuperset(digits_no_decimals)
+
+ assert set_decimal_chars.isdisjoint(numeric_no_decimals)
+ assert set_numeric_chars.issuperset(numeric_no_decimals)
def test_combined_string_contains_all_characters_in_list():
assert numeric == ''.join(numeric_chars)
assert digits == ''.join(digit_chars)
+ assert decimals == ''.join(decimal_chars)
diff --git a/tox.ini b/tox.ini
index 8423b3e..31bdc1f 100644
--- a/tox.ini
+++ b/tox.ini
@@ -20,7 +20,7 @@ deps =
commands =
{envpython} -m pytest --doctest-modules natsort
{envpython} -m pytest README.rst docs/source/intro.rst docs/source/examples.rst docs/source/howitworks.rst
- {envpython} -m pytest --flakes --pep8 --cov natsort --cov-report term-missing
+ {envpython} -m pytest --flakes --pep8 --cov natsort --cov-report term-missing --tb=short
[testenv:py26]
commands =
@@ -30,7 +30,7 @@ commands =
commands =
{envpython} -m pytest --doctest-modules natsort
{envpython} -m pytest README.rst docs/source/intro.rst docs/source/examples.rst
- {envpython} -m pytest --flakes --pep8 --cov natsort --cov-report term-missing
+ {envpython} -m pytest --flakes --pep8 --cov natsort --cov-report term-missing --tb=short
[testenv:pypy]
deps =