diff options
-rw-r--r-- | .github/workflows/tests.yml | 2 | ||||
-rw-r--r-- | CHANGELOG.md | 5 | ||||
-rw-r--r-- | natsort/compat/locale.py | 1 | ||||
-rw-r--r-- | natsort/natsort.py | 1 | ||||
-rw-r--r-- | natsort/utils.py | 31 | ||||
-rw-r--r-- | tests/conftest.py | 19 | ||||
-rw-r--r-- | tests/test_natsorted.py | 8 |
7 files changed, 59 insertions, 8 deletions
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 8dfb799..d78fdf9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -36,7 +36,7 @@ jobs: if: matrix.os == 'ubuntu-latest' run: | sudo apt-get update - sudo apt-get install language-pack-de language-pack-en + sudo apt-get install language-pack-de language-pack-en language-pack-cs - name: Install ICU if: matrix.extras diff --git a/CHANGELOG.md b/CHANGELOG.md index 263c5f4..df15326 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ Unreleased --- +### Fixed + +- Compose unicode characters when using locale to ensure sorting is correct + across all locales. + [8.0.0] - 2021-11-03 --- diff --git a/natsort/compat/locale.py b/natsort/compat/locale.py index 9af5e7a..b4c5356 100644 --- a/natsort/compat/locale.py +++ b/natsort/compat/locale.py @@ -54,7 +54,6 @@ try: # noqa: C901 sep = icu.DecimalFormatSymbols.kDecimalSeparatorSymbol return cast(str, icu.DecimalFormatSymbols(get_icu_locale()).getSymbol(sep)) - except ImportError: import locale from locale import strxfrm diff --git a/natsort/natsort.py b/natsort/natsort.py index a95f9a9..9f34bc1 100644 --- a/natsort/natsort.py +++ b/natsort/natsort.py @@ -786,7 +786,6 @@ if platform.system() == "Windows": OSSortKeyType, lambda x: tuple(map(_winsort_key, _split_apply(x, key))) ) - else: # For UNIX-based platforms, ICU performs MUCH better than locale diff --git a/natsort/utils.py b/natsort/utils.py index 7102f41..c9448b4 100644 --- a/natsort/utils.py +++ b/natsort/utils.py @@ -235,6 +235,25 @@ def _normalize_input_factory(alg: NSType) -> StrToStr: return partial(normalize, normalization_form) +def _compose_input_factory(alg: NSType) -> StrToStr: + """ + Create a function that will compose unicode input data. + + Parameters + ---------- + alg : ns enum + Used to indicate how to compose unicode. 
+ + Returns + ------- + func : callable + A function that accepts string (unicode) input and returns + the input normalized with the desired composition scheme. + """ + normalization_form = "NFKC" if alg & ns.COMPATIBILITYNORMALIZE else "NFC" + return partial(normalize, normalization_form) + + @overload def natsort_key( val: NatsortInType, @@ -472,6 +491,7 @@ def parse_string_factory( orig_after_xfrm = not (alg & NS_DUMB and alg & ns.LOCALEALPHA) original_func = input_transform if orig_after_xfrm else _no_op normalize_input = _normalize_input_factory(alg) + compose_input = _compose_input_factory(alg) if alg & ns.LOCALEALPHA else _no_op def func(x: str) -> FinalTransform: # Apply string input transformation function and return to x. @@ -479,11 +499,12 @@ def parse_string_factory( # to also be the transformation function. a = normalize_input(x) b, original = input_transform(a), original_func(a) - c = splitter(b) # Split string into components. - d = filter(None, c) # Remove empty strings. - e = map(component_transform, d) # Apply transform on components. - f = sep_inserter(e, sep) # Insert '' between numbers. - return final_transform(f, original) # Apply the final transform. + c = compose_input(b) # Compose unicode if using LOCALE + d = splitter(c) # Split string into components. + e = filter(None, d) # Remove empty strings. + f = map(component_transform, e) # Apply transform on components. + g = sep_inserter(f, sep) # Insert '' between numbers. + return final_transform(g, original) # Apply the final transform. return func diff --git a/tests/conftest.py b/tests/conftest.py index c63e149..cda2aaf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,6 +7,7 @@ from typing import Iterator import hypothesis import pytest +from natsort.compat.locale import dumb_sort # This disables the "too slow" hypothesis heath check globally. 
@@ -48,3 +49,21 @@ def with_locale_de_de() -> Iterator[None]: yield finally: locale.setlocale(locale.LC_ALL, orig) + + +@pytest.fixture() +def with_locale_cs_cz() -> Iterator[None]: + """ + Convenience to load the cs_CZ locale - reset when complete - skip if missing. + """ + orig = locale.getlocale() + try: + load_locale("cs_CZ") + if dumb_sort(): + pytest.skip("requires a functioning locale library to run") + except locale.Error: + pytest.skip("requires cs_CZ locale to be installed") + else: + yield + finally: + locale.setlocale(locale.LC_ALL, orig) diff --git a/tests/test_natsorted.py b/tests/test_natsorted.py index d043ab4..4a64a27 100644 --- a/tests/test_natsorted.py +++ b/tests/test_natsorted.py @@ -251,6 +251,14 @@ def test_natsorted_locale_bug_regression_test_109() -> None: assert natsorted(given, alg=ns.LOCALE) == expected +@pytest.mark.usefixtures("with_locale_cs_cz") +def test_natsorted_locale_bug_regression_test_140() -> None: + # https://github.com/SethMMorton/natsort/issues/140 + given = ["Aš", "Cheb", "Česko", "Cibulov", "Znojmo", "Žilina"] + expected = ["Aš", "Cibulov", "Česko", "Cheb", "Znojmo", "Žilina"] + assert natsorted(given, alg=ns.LOCALE) == expected + + @pytest.mark.parametrize( "alg, expected", [ |