summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Seth Morton <seth.m.morton@gmail.com> 2021-12-10 21:13:27 -0800
committer GitHub <noreply@github.com> 2021-12-10 21:13:27 -0800
commit 16933694aa6dc70cea5a765bc85641bbc5592bdb (patch)
tree a184eeaaf5a7cb333b56fb0f6d73fa0957074f40
parent 40a3f6c625e5e58d4c303c4366e7df2448852943 (diff)
parent 96d46e5a57fc6281f9c4b2cc06f8736560cfbe8a (diff)
download natsort-16933694aa6dc70cea5a765bc85641bbc5592bdb.tar.gz
Merge pull request #141 from SethMMorton/fix-sorting-in-ce-locale
Fix sorting in ce locale
-rw-r--r-- .github/workflows/tests.yml 2
-rw-r--r-- CHANGELOG.md 5
-rw-r--r-- natsort/compat/locale.py 1
-rw-r--r-- natsort/natsort.py 1
-rw-r--r-- natsort/utils.py 31
-rw-r--r-- tests/conftest.py 19
-rw-r--r-- tests/test_natsorted.py 8
7 files changed, 59 insertions, 8 deletions
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 8dfb799..d78fdf9 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -36,7 +36,7 @@ jobs:
if: matrix.os == 'ubuntu-latest'
run: |
sudo apt-get update
- sudo apt-get install language-pack-de language-pack-en
+ sudo apt-get install language-pack-de language-pack-en language-pack-cs
- name: Install ICU
if: matrix.extras
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 263c5f4..df15326 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,11 @@
Unreleased
---
+### Fixed
+
+- Compose unicode characters when using locale to ensure sorting is correct
+ across all locales.
+
[8.0.0] - 2021-11-03
---
diff --git a/natsort/compat/locale.py b/natsort/compat/locale.py
index 9af5e7a..b4c5356 100644
--- a/natsort/compat/locale.py
+++ b/natsort/compat/locale.py
@@ -54,7 +54,6 @@ try: # noqa: C901
sep = icu.DecimalFormatSymbols.kDecimalSeparatorSymbol
return cast(str, icu.DecimalFormatSymbols(get_icu_locale()).getSymbol(sep))
-
except ImportError:
import locale
from locale import strxfrm
diff --git a/natsort/natsort.py b/natsort/natsort.py
index a95f9a9..9f34bc1 100644
--- a/natsort/natsort.py
+++ b/natsort/natsort.py
@@ -786,7 +786,6 @@ if platform.system() == "Windows":
OSSortKeyType, lambda x: tuple(map(_winsort_key, _split_apply(x, key)))
)
-
else:
# For UNIX-based platforms, ICU performs MUCH better than locale
diff --git a/natsort/utils.py b/natsort/utils.py
index 7102f41..c9448b4 100644
--- a/natsort/utils.py
+++ b/natsort/utils.py
@@ -235,6 +235,25 @@ def _normalize_input_factory(alg: NSType) -> StrToStr:
return partial(normalize, normalization_form)
+def _compose_input_factory(alg: NSType) -> StrToStr:
+ """
+ Create a function that will compose unicode input data.
+
+ Parameters
+ ----------
+ alg : ns enum
+ Used to indicate how to compose unicode.
+
+ Returns
+ -------
+ func : callable
+ A function that accepts string (unicode) input and returns the
+ input normalized with the desired composition scheme.
+ """
+ normalization_form = "NFKC" if alg & ns.COMPATIBILITYNORMALIZE else "NFC"
+ return partial(normalize, normalization_form)
+
+
@overload
def natsort_key(
val: NatsortInType,
@@ -472,6 +491,7 @@ def parse_string_factory(
orig_after_xfrm = not (alg & NS_DUMB and alg & ns.LOCALEALPHA)
original_func = input_transform if orig_after_xfrm else _no_op
normalize_input = _normalize_input_factory(alg)
+ compose_input = _compose_input_factory(alg) if alg & ns.LOCALEALPHA else _no_op
def func(x: str) -> FinalTransform:
# Apply string input transformation function and return to x.
@@ -479,11 +499,12 @@ def parse_string_factory(
# to also be the transformation function.
a = normalize_input(x)
b, original = input_transform(a), original_func(a)
- c = splitter(b) # Split string into components.
- d = filter(None, c) # Remove empty strings.
- e = map(component_transform, d) # Apply transform on components.
- f = sep_inserter(e, sep) # Insert '' between numbers.
- return final_transform(f, original) # Apply the final transform.
+ c = compose_input(b) # Compose unicode if using LOCALE
+ d = splitter(c) # Split string into components.
+ e = filter(None, d) # Remove empty strings.
+ f = map(component_transform, e) # Apply transform on components.
+ g = sep_inserter(f, sep) # Insert '' between numbers.
+ return final_transform(g, original) # Apply the final transform.
return func
diff --git a/tests/conftest.py b/tests/conftest.py
index c63e149..cda2aaf 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -7,6 +7,7 @@ from typing import Iterator
import hypothesis
import pytest
+from natsort.compat.locale import dumb_sort
# This disables the "too slow" hypothesis health check globally.
@@ -48,3 +49,21 @@ def with_locale_de_de() -> Iterator[None]:
yield
finally:
locale.setlocale(locale.LC_ALL, orig)
+
+
+@pytest.fixture()
+def with_locale_cs_cz() -> Iterator[None]:
+ """
+ Convenience to load the cs_CZ locale - reset when complete - skip if missing.
+ """
+ orig = locale.getlocale()
+ try:
+ load_locale("cs_CZ")
+ if dumb_sort():
+ pytest.skip("requires a functioning locale library to run")
+ except locale.Error:
+ pytest.skip("requires cs_CZ locale to be installed")
+ else:
+ yield
+ finally:
+ locale.setlocale(locale.LC_ALL, orig)
diff --git a/tests/test_natsorted.py b/tests/test_natsorted.py
index d043ab4..4a64a27 100644
--- a/tests/test_natsorted.py
+++ b/tests/test_natsorted.py
@@ -251,6 +251,14 @@ def test_natsorted_locale_bug_regression_test_109() -> None:
assert natsorted(given, alg=ns.LOCALE) == expected
+@pytest.mark.usefixtures("with_locale_cs_cz")
+def test_natsorted_locale_bug_regression_test_140() -> None:
+ # https://github.com/SethMMorton/natsort/issues/140
+ given = ["Aš", "Cheb", "Česko", "Cibulov", "Znojmo", "Žilina"]
+ expected = ["Aš", "Cibulov", "Česko", "Cheb", "Znojmo", "Žilina"]
+ assert natsorted(given, alg=ns.LOCALE) == expected
+
+
@pytest.mark.parametrize(
"alg, expected",
[