From e986a05f61f39fcac70782c7e6428e95e2bad6e9 Mon Sep 17 00:00:00 2001 From: Seth Morton Date: Thu, 9 Dec 2021 21:34:23 -0800 Subject: Add tests for the cs_CZ regression This ensures that going forward this won't sort incorrectly. --- tests/conftest.py | 16 ++++++++++++++++ tests/test_natsorted.py | 8 ++++++++ 2 files changed, 24 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index c63e149..6cd922b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -48,3 +48,19 @@ def with_locale_de_de() -> Iterator[None]: yield finally: locale.setlocale(locale.LC_ALL, orig) + + +@pytest.fixture() +def with_locale_cs_cz() -> Iterator[None]: + """ + Convenience to load the cs_CZ locale - reset when complete - skip if missing. + """ + orig = locale.getlocale() + try: + load_locale("cs_CZ") + except locale.Error: + pytest.skip("requires cs_CZ locale to be installed") + else: + yield + finally: + locale.setlocale(locale.LC_ALL, orig) diff --git a/tests/test_natsorted.py b/tests/test_natsorted.py index d043ab4..4a64a27 100644 --- a/tests/test_natsorted.py +++ b/tests/test_natsorted.py @@ -251,6 +251,14 @@ def test_natsorted_locale_bug_regression_test_109() -> None: assert natsorted(given, alg=ns.LOCALE) == expected +@pytest.mark.usefixtures("with_locale_cs_cz") +def test_natsorted_locale_bug_regression_test_140() -> None: + # https://github.com/SethMMorton/natsort/issues/140 + given = ["Aš", "Cheb", "Česko", "Cibulov", "Znojmo", "Žilina"] + expected = ["Aš", "Cibulov", "Česko", "Cheb", "Znojmo", "Žilina"] + assert natsorted(given, alg=ns.LOCALE) == expected + + @pytest.mark.parametrize( "alg, expected", [ -- cgit v1.2.1 From 2bfe1223b523e8c57544efb54bab71b24334fd07 Mon Sep 17 00:00:00 2001 From: Seth Morton Date: Thu, 9 Dec 2021 22:15:46 -0800 Subject: Combine unicode normalization for LOCALE For some locales, the unicode cannot be decomposed, otherwise the ordering is incorrect. 
--- natsort/utils.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/natsort/utils.py b/natsort/utils.py index 7102f41..c9448b4 100644 --- a/natsort/utils.py +++ b/natsort/utils.py @@ -235,6 +235,25 @@ def _normalize_input_factory(alg: NSType) -> StrToStr: return partial(normalize, normalization_form) +def _compose_input_factory(alg: NSType) -> StrToStr: + """ + Create a function that will compose unicode input data. + + Parameters + ---------- + alg : ns enum + Used to indicate how to compose unicode. + + Returns + ------- + func : callable + A function that accepts string (unicode) input and returns the + input normalized with the desired composition scheme. + """ + normalization_form = "NFKC" if alg & ns.COMPATIBILITYNORMALIZE else "NFC" + return partial(normalize, normalization_form) + + @overload def natsort_key( val: NatsortInType, @@ -472,6 +491,7 @@ def parse_string_factory( orig_after_xfrm = not (alg & NS_DUMB and alg & ns.LOCALEALPHA) original_func = input_transform if orig_after_xfrm else _no_op normalize_input = _normalize_input_factory(alg) + compose_input = _compose_input_factory(alg) if alg & ns.LOCALEALPHA else _no_op def func(x: str) -> FinalTransform: # Apply string input transformation function and return to x. @@ -479,11 +499,12 @@ def parse_string_factory( # to also be the transformation function. a = normalize_input(x) b, original = input_transform(a), original_func(a) - c = splitter(b) # Split string into components. - d = filter(None, c) # Remove empty strings. - e = map(component_transform, d) # Apply transform on components. - f = sep_inserter(e, sep) # Insert '' between numbers. - return final_transform(f, original) # Apply the final transform. + c = compose_input(b) # Compose unicode if using LOCALE + d = splitter(c) # Split string into components. + e = filter(None, d) # Remove empty strings. + f = map(component_transform, e) # Apply transform on components. 
+ g = sep_inserter(f, sep) # Insert '' between numbers. + return final_transform(g, original) # Apply the final transform. return func -- cgit v1.2.1 From a911826be7d545a254f56e20409927fb9bfd5a2c Mon Sep 17 00:00:00 2001 From: Seth Morton Date: Thu, 9 Dec 2021 22:18:59 -0800 Subject: Update changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 263c5f4..df15326 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ Unreleased --- +### Fixed + +- Compose unicode characters when using locale to ensure sorting is correct + across all locales. + [8.0.0] - 2021-11-03 --- -- cgit v1.2.1 From f9d8cc595ef134feac11573ef94e4b8172987a7a Mon Sep 17 00:00:00 2001 From: Seth Morton Date: Fri, 10 Dec 2021 07:49:44 -0800 Subject: Black --- natsort/compat/locale.py | 1 - natsort/natsort.py | 1 - 2 files changed, 2 deletions(-) diff --git a/natsort/compat/locale.py b/natsort/compat/locale.py index 9af5e7a..b4c5356 100644 --- a/natsort/compat/locale.py +++ b/natsort/compat/locale.py @@ -54,7 +54,6 @@ try: # noqa: C901 sep = icu.DecimalFormatSymbols.kDecimalSeparatorSymbol return cast(str, icu.DecimalFormatSymbols(get_icu_locale()).getSymbol(sep)) - except ImportError: import locale from locale import strxfrm diff --git a/natsort/natsort.py b/natsort/natsort.py index a95f9a9..9f34bc1 100644 --- a/natsort/natsort.py +++ b/natsort/natsort.py @@ -786,7 +786,6 @@ if platform.system() == "Windows": OSSortKeyType, lambda x: tuple(map(_winsort_key, _split_apply(x, key))) ) - else: # For UNIX-based platforms, ICU performs MUCH better than locale -- cgit v1.2.1 From 96d46e5a57fc6281f9c4b2cc06f8736560cfbe8a Mon Sep 17 00:00:00 2001 From: Seth Morton Date: Fri, 10 Dec 2021 08:08:06 -0800 Subject: Add new cs_CZ locale into the CI environment --- .github/workflows/tests.yml | 2 +- tests/conftest.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 
8dfb799..d78fdf9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -36,7 +36,7 @@ jobs: if: matrix.os == 'ubuntu-latest' run: | sudo apt-get update - sudo apt-get install language-pack-de language-pack-en + sudo apt-get install language-pack-de language-pack-en language-pack-cs - name: Install ICU if: matrix.extras diff --git a/tests/conftest.py b/tests/conftest.py index 6cd922b..cda2aaf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,6 +7,7 @@ from typing import Iterator import hypothesis import pytest +from natsort.compat.locale import dumb_sort # This disables the "too slow" hypothesis heath check globally. @@ -58,6 +59,8 @@ def with_locale_cs_cz() -> Iterator[None]: orig = locale.getlocale() try: load_locale("cs_CZ") + if dumb_sort(): + pytest.skip("requires a functioning locale library to run") except locale.Error: pytest.skip("requires cs_CZ locale to be installed") else: -- cgit v1.2.1