From cf2a55daf7e74d177c95149da623172b1b6d93ae Mon Sep 17 00:00:00 2001 From: Seth Morton Date: Sun, 26 Feb 2023 23:21:37 -0800 Subject: Ensure None, NaN, and Infinity are sorted consistently Internally, these may be translated to the same value, so they will be output in the same order they were input, which could lead to suprise. This commit ensures the order is always consistent. --- natsort/utils.py | 12 +++++++++++- tests/test_natsorted.py | 29 ++++++++++++++++++++--------- tests/test_parse_number_function.py | 20 +++++++++++++++----- 3 files changed, 46 insertions(+), 15 deletions(-) diff --git a/natsort/utils.py b/natsort/utils.py index b86225e..2a3745f 100644 --- a/natsort/utils.py +++ b/natsort/utils.py @@ -420,7 +420,17 @@ def parse_number_or_none_factory( val: Any, _nan_replace: float = nan_replace, _sep: StrOrBytes = sep ) -> BasicTuple: """Given a number, place it in a tuple with a leading null string.""" - return _sep, (_nan_replace if val != val or val is None else val) + # Add a trailing string numbers equaling _nan_replace. This will make + # the ordering between None NaN, and the NaN replacement value... + # None comes first, then NaN, then the replacement value. + if val is None: + return _sep, _nan_replace, "1" + elif val != val: + return _sep, _nan_replace, "2" + elif val == _nan_replace: + return _sep, _nan_replace, "3" + else: + return _sep, val # Return the function, possibly wrapping in tuple if PATH is selected. if alg & ns.PATH and alg & ns.UNGROUPLETTERS and alg & ns.LOCALEALPHA: diff --git a/tests/test_natsorted.py b/tests/test_natsorted.py index eccb9d2..3d6375c 100644 --- a/tests/test_natsorted.py +++ b/tests/test_natsorted.py @@ -4,6 +4,7 @@ Here are a collection of examples of how this module can be used. See the README or the natsort homepage for more details. """ +import math from operator import itemgetter from pathlib import PurePosixPath from typing import List, Tuple, Union @@ -110,19 +111,29 @@ def test_natsorted_handles_mixed_types( @pytest.mark.parametrize( - "alg, expected, slc", + "alg, expected", [ - (ns.DEFAULT, [float("nan"), 5, "25", 1e40], slice(1, None)), - (ns.NANLAST, [5, "25", 1e40, float("nan")], slice(None, 3)), + (ns.DEFAULT, [None, float("nan"), float("-inf"), 5, "25", 1e40, float("inf")]), + (ns.NANLAST, [float("-inf"), 5, "25", 1e40, None, float("nan"), float("inf")]), ], ) -def test_natsorted_handles_nan( - alg: NSType, expected: List[Union[str, float, int]], slc: slice +def test_natsorted_consistent_ordering_with_nan_and_friends( + alg: NSType, expected: List[Union[str, float, None, int]] ) -> None: - given: List[Union[str, float, int]] = ["25", 5, float("nan"), 1e40] - # The slice is because NaN != NaN - # noinspection PyUnresolvedReferences - assert natsorted(given, alg=alg)[slc] == expected[slc] + sentinel = math.pi + expected = [sentinel if x != x else x for x in expected] + given: List[Union[str, float, None, int]] = [ + float("inf"), + float("-inf"), + "25", + 5, + float("nan"), + 1e40, + None, + ] + result = natsorted(given, alg=alg) + result = [sentinel if x != x else x for x in result] + assert result == expected def test_natsorted_with_mixed_bytes_and_str_input_raises_type_error() -> None: diff --git a/tests/test_parse_number_function.py b/tests/test_parse_number_function.py index 85d6b96..5ac5700 100644 --- a/tests/test_parse_number_function.py +++ b/tests/test_parse_number_function.py @@ -20,7 +20,7 @@ from natsort.utils import NumTransformer, parse_number_or_none_factory (ns.PATH | ns.UNGROUPLETTERS | ns.LOCALE, lambda x: ((("xx",), ("", x)),)), ], ) -@given(x=floats(allow_nan=False) | integers()) +@given(x=floats(allow_nan=False, allow_infinity=False) | integers()) def test_parse_number_factory_makes_function_that_returns_tuple( x: Union[float, int], alg: NSType, example_func: NumTransformer ) -> None: @@ -32,10 +32,20 @@ def test_parse_number_factory_makes_function_that_returns_tuple( "alg, x, result", [ (ns.DEFAULT, 57, ("", 57)), - (ns.DEFAULT, float("nan"), ("", float("-inf"))), # NaN transformed to -infinity - (ns.NANLAST, float("nan"), ("", float("+inf"))), # NANLAST makes it +infinity - (ns.DEFAULT, None, ("", float("-inf"))), # None transformed to -infinity - (ns.NANLAST, None, ("", float("+inf"))), # NANLAST makes it +infinity + ( + ns.DEFAULT, + float("nan"), + ("", float("-inf"), "2"), + ), # NaN transformed to -infinity + ( + ns.NANLAST, + float("nan"), + ("", float("+inf"), "2"), + ), # NANLAST makes it +infinity + (ns.DEFAULT, None, ("", float("-inf"), "1")), # None transformed to -infinity + (ns.NANLAST, None, ("", float("+inf"), "1")), # NANLAST makes it +infinity + (ns.DEFAULT, float("-inf"), ("", float("-inf"), "3")), + (ns.NANLAST, float("+inf"), ("", float("+inf"), "3")), ], ) def test_parse_number_factory_treats_nan_and_none_special( -- cgit v1.2.1 From 50389e16d3aba5139890e14d57257320a9bc7e11 Mon Sep 17 00:00:00 2001 From: Seth Morton Date: Mon, 27 Feb 2023 00:22:59 -0800 Subject: Add presort to natsorted and friends This will sort the collection as strings before sorting with the natsort algorithm. This ensures that strings that are different but represent the same numerical value get sorted independent of input order. --- natsort/natsort.py | 13 ++++++++++++- natsort/ns_enum.py | 9 +++++++++ tests/test_natsorted.py | 29 +++++++++++++++++++++++++++++ tests/test_natsorted_convenience.py | 7 +++++++ tests/test_ns_enum.py | 2 ++ tests/test_os_sorted.py | 7 +++++++ 6 files changed, 66 insertions(+), 1 deletion(-) diff --git a/natsort/natsort.py b/natsort/natsort.py index ea83e48..2325443 100644 --- a/natsort/natsort.py +++ b/natsort/natsort.py @@ -288,6 +288,8 @@ def natsorted( ['num2', 'num3', 'num5'] """ + if alg & ns.PRESORT: + seq = sorted(seq, reverse=reverse, key=str) return sorted(seq, reverse=reverse, key=natsort_keygen(key, alg)) @@ -477,6 +479,8 @@ def index_natsorted( # Pair the index and sequence together, then sort by element index_seq_pair = [(x, y) for x, y in enumerate(seq)] + if alg & ns.PRESORT: + index_seq_pair.sort(reverse=reverse, key=lambda x: str(itemgetter(1)(x))) index_seq_pair.sort(reverse=reverse, key=natsort_keygen(newkey, alg)) return [x for x, _ in index_seq_pair] @@ -768,6 +772,7 @@ def os_sorted( seq: Iterable[T], key: Optional[Callable[[T], NatsortInType]] = None, reverse: bool = False, + presort: bool = False, ) -> List[T]: """ Sort elements in the same order as your operating system's file browser @@ -810,6 +815,10 @@ def os_sorted( Return the list in reversed sorted order. The default is `False`. + presort : {{True, False}}, optional + Equivalent to adding ``ns.PRESORT``, see :class:`ns` for + documentation. The default is `False`. + Returns ------- out : list @@ -825,4 +834,6 @@ def os_sorted( This will implicitly coerce all inputs to str before collating. """ - return sorted(seq, key=os_sort_keygen(key), reverse=reverse) + if presort: + seq = sorted(seq, reverse=reverse, key=str) + return sorted(seq, reverse=reverse, key=os_sort_keygen(key)) diff --git a/natsort/ns_enum.py b/natsort/ns_enum.py index c147909..02f970f 100644 --- a/natsort/ns_enum.py +++ b/natsort/ns_enum.py @@ -114,6 +114,14 @@ class ns(enum.IntEnum): # noqa: N801 treat these as +Infinity and place them after all the other numbers. By default, an NaN be treated as -Infinity and be placed first. Note that this ``None`` is treated like NaN internally. + PRESORT, PS + Sort the input as strings before sorting with the `nasort` + algorithm. This can help eliminate inconsistent sorting in cases + where two different strings represent the same number. For example, + "a1" and "a01" both are internally represented as ("a", "1), so + without `PRESORT` the order of these two values would depend on + the order they appeared in the input (because Python's `sorted` + is a stable sorting algorithm). Notes ----- @@ -143,6 +151,7 @@ class ns(enum.IntEnum): # noqa: N801 NANLAST = NL = 1 << next(_counter) COMPATIBILITYNORMALIZE = CN = 1 << next(_counter) NUMAFTER = NA = 1 << next(_counter) + PRESORT = PS = 1 << next(_counter) # Following were previously options but are now defaults. DEFAULT = 0 diff --git a/tests/test_natsorted.py b/tests/test_natsorted.py index 3d6375c..e4a4788 100644 --- a/tests/test_natsorted.py +++ b/tests/test_natsorted.py @@ -378,3 +378,32 @@ def test_natsorted_sorts_mixed_ascii_and_non_ascii_numbers() -> None: "street ۱۲", ] assert natsorted(given, alg=ns.IGNORECASE) == expected + + +def test_natsort_sorts_consistently_with_presort() -> None: + # Demonstrate the problem: + # Sorting is order-dependent for values that have different + # string representations are equiavlent numerically. + given = ["a01", "a1.4500", "a1", "a1.45"] + expected = ["a01", "a1", "a1.4500", "a1.45"] + result = natsorted(given, alg=ns.FLOAT) + assert result == expected + + given = ["a1", "a1.45", "a01", "a1.4500"] + expected = ["a1", "a01", "a1.45", "a1.4500"] + result = natsorted(given, alg=ns.FLOAT) + assert result == expected + + # The solution - use "presort" which will sort the + # input by its string representation before sorting + # with natsorted, which gives consitent results even + # if the numeric representation is identical + expected = ["a01", "a1", "a1.45", "a1.4500"] + + given = ["a01", "a1.4500", "a1", "a1.45"] + result = natsorted(given, alg=ns.FLOAT | ns.PRESORT) + assert result == expected + + given = ["a1", "a1.45", "a01", "a1.4500"] + result = natsorted(given, alg=ns.FLOAT | ns.PRESORT) + assert result == expected diff --git a/tests/test_natsorted_convenience.py b/tests/test_natsorted_convenience.py index 0b2cd75..81bdf5c 100644 --- a/tests/test_natsorted_convenience.py +++ b/tests/test_natsorted_convenience.py @@ -88,6 +88,13 @@ def test_index_natsorted_applies_key_function_before_sorting() -> None: assert index_natsorted(given, key=itemgetter(1)) == expected +def test_index_natsorted_can_presort() -> None: + expected = [2, 0, 3, 1] + given = ["a1", "a1.4500", "a01", "a1.45"] + result = index_natsorted(given, alg=ns.FLOAT | ns.PRESORT) + assert result == expected + + def test_index_realsorted_is_identical_to_index_natsorted_with_real_alg( float_list: List[str], ) -> None: diff --git a/tests/test_ns_enum.py b/tests/test_ns_enum.py index 7a30718..c950812 100644 --- a/tests/test_ns_enum.py +++ b/tests/test_ns_enum.py @@ -18,6 +18,7 @@ from natsort import ns ("NANLAST", 0x0400), ("COMPATIBILITYNORMALIZE", 0x0800), ("NUMAFTER", 0x1000), + ("PRESORT", 0x2000), ("DEFAULT", 0x0000), ("INT", 0x0000), ("UNSIGNED", 0x0000), @@ -42,6 +43,7 @@ from natsort import ns ("NL", 0x0400), ("CN", 0x0800), ("NA", 0x1000), + ("PS", 0x2000), ], ) def test_ns_enum(given: str, expected: int) -> None: diff --git a/tests/test_os_sorted.py b/tests/test_os_sorted.py index f714437..c29c110 100644 --- a/tests/test_os_sorted.py +++ b/tests/test_os_sorted.py @@ -47,6 +47,13 @@ def test_os_sorted_key() -> None: assert result == expected +def test_os_sorted_can_presort() -> None: + given = ["a1", "a01"] + expected = ["a01", "a1"] + result = natsort.os_sorted(given, presort=True) + assert result == expected + + # The following is a master list of things that might give trouble # when sorting like the file explorer. given_characters = [ -- cgit v1.2.1 From 92969665342450fb450904776a59bd8406dfe65a Mon Sep 17 00:00:00 2001 From: Seth Morton Date: Mon, 27 Feb 2023 00:35:16 -0800 Subject: Update changelog This closes #149 --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fdfe57..904a1b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,13 @@ Unreleased --- +### Added +- The `presort` option to `natsorted` and friends to attain consistent + sort order in certain corner cases (Issue + [#149](https://github.com/SethMMorton/natsort/issues/149)) +- Logic to ensure `None` and NaN are sorted in a consistent order + (Issue [#149](https://github.com/SethMMorton/natsort/issues/149)) + ### Changed - Only convert to `str` if necessary in `os_sorted` ([@Dobatymo](https://github.com/Dobatymo), issues -- cgit v1.2.1