summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSeth M Morton <seth.m.morton@gmail.com>2016-05-04 22:39:44 -0700
committerSeth M Morton <seth.m.morton@gmail.com>2016-05-04 22:39:44 -0700
commitc3c11fe4d3c3d486be10e5775d699f15547c973c (patch)
tree6fdcef46e692e37ac0b9a92c622f38b7869ea09d
parentcc340d4223ee2931e7a7408cbbb65f5320b1e0aa (diff)
downloadnatsort-c3c11fe4d3c3d486be10e5775d699f15547c973c.tar.gz
Added thousands and decimal separator replacement.
This is turned on with LOCALE. Unfortunately, the thousands separators are not reliable with the OSX implementation of locale (of course), but PyICU is still good. More tests and documentation still need to be added.
-rw-r--r--natsort/compat/locale.py37
-rw-r--r--natsort/locale_help.py4
-rw-r--r--natsort/utils.py23
-rw-r--r--test_natsort/compat/locale.py3
-rw-r--r--test_natsort/test_natsorted.py21
5 files changed, 63 insertions, 25 deletions
diff --git a/natsort/compat/locale.py b/natsort/compat/locale.py
index 1e1d596..5412cf5 100644
--- a/natsort/compat/locale.py
+++ b/natsort/compat/locale.py
@@ -19,23 +19,32 @@ try:
import PyICU
from locale import getlocale
- # If using PyICU, get the locale from the current global locale,
- # then create a sort key from that
- def get_pyicu_transform(l, _d={}):
- if l not in _d:
- if l == (None, None): # pragma: no cover
- c = PyICU.Collator.createInstance(PyICU.Locale())
- else:
- loc = '.'.join(l)
- c = PyICU.Collator.createInstance(PyICU.Locale(loc))
- _d[l] = c.getSortKey
- return _d[l]
use_pyicu = True
null_string = b''
+ # If using PyICU, get the locale from the current global locale,
+ def get_icu_locale():
+ try:
+ return PyICU.Locale('.'.join(getlocale()))
+ except TypeError: # pragma: no cover
+ return PyICU.Locale()
+
+ def get_pyicu_transform():
+ return PyICU.Collator.createInstance(get_icu_locale()).getSortKey
+
+ def get_thousands_sep():
+ sep = PyICU.DecimalFormatSymbols.kGroupingSeparatorSymbol
+ return PyICU.DecimalFormatSymbols(get_icu_locale()).getSymbol(sep)
+
+ def get_decimal_point():
+ sep = PyICU.DecimalFormatSymbols.kDecimalSeparatorSymbol
+ return PyICU.DecimalFormatSymbols(get_icu_locale()).getSymbol(sep)
+
def dumb_sort():
return False
+
except ImportError:
+ import locale
if sys.version[0] == '2':
from locale import strcoll
strxfrm = cmp_to_key(strcoll)
@@ -45,6 +54,12 @@ except ImportError:
null_string = ''
use_pyicu = False
+ def get_thousands_sep():
+ return locale.localeconv()['thousands_sep']
+
+ def get_decimal_point():
+ return locale.localeconv()['decimal_point']
+
# On some systems, locale is broken and does not sort in the expected
# order. We will try to detect this and compensate.
def dumb_sort():
diff --git a/natsort/locale_help.py b/natsort/locale_help.py
index c905f96..0f76d2c 100644
--- a/natsort/locale_help.py
+++ b/natsort/locale_help.py
@@ -17,7 +17,7 @@ from itertools import chain
# Local imports.
from natsort.compat.locale import use_pyicu, _low
if use_pyicu:
- from natsort.compat.locale import get_pyicu_transform, getlocale
+ from natsort.compat.locale import get_pyicu_transform
else:
from natsort.compat.locale import strxfrm
@@ -32,6 +32,6 @@ def locale_convert_function():
Return a function that will use the appropriate locale tranformation.
"""
if use_pyicu:
- return get_pyicu_transform(getlocale())
+ return get_pyicu_transform()
else:
return strxfrm
diff --git a/natsort/utils.py b/natsort/utils.py
index 9def7f3..985adb3 100644
--- a/natsort/utils.py
+++ b/natsort/utils.py
@@ -26,6 +26,10 @@ from natsort.ns_enum import ns
from natsort.unicode_numbers import digits, numeric
from natsort.locale_help import locale_convert_function, groupletters
from natsort.compat.pathlib import PurePath, has_pathlib
+from natsort.compat.locale import (
+ get_thousands_sep,
+ get_decimal_point,
+)
from natsort.compat.py23 import (
py23_str,
py23_range,
@@ -212,6 +216,21 @@ def _pre_split_function(alg):
lowfirst = alg & ns.LOWERCASEFIRST
dumb = alg & ns._DUMB
+ # Create a regular expression that will change the decimal point to
+ # a period if not already a period.
+ decimal = get_decimal_point()
+
+ switch_decimal = r'(?<=[0-9]){decimal}|{decimal}(?=[0-9])'
+ switch_decimal = switch_decimal.format(decimal=decimal)
+ switch_decimal = re.compile(switch_decimal)
+
+ # Create a regular expression that will remove thousands separators.
+ thousands = get_thousands_sep()
+ strip_thousands = (r'(?<![0-9]{{4}})(?<=[0-9]{{1}})'
+ r'{thousands}(?=[0-9]{{3}}([^0-9]|$))')
+ strip_thousands = strip_thousands.format(thousands=thousands)
+ strip_thousands = re.compile(strip_thousands)
+
# Build the chain of functions to execute in order.
function_chain = []
if (dumb and not lowfirst) or (lowfirst and not dumb):
@@ -221,6 +240,10 @@ def _pre_split_function(alg):
function_chain.append(methodcaller('casefold'))
else:
function_chain.append(methodcaller('lower'))
+ if alg & ns.LOCALE:
+ function_chain.append(partial(strip_thousands.sub, ''))
+ if decimal != '.':
+ function_chain.append(partial(switch_decimal.sub, '.'))
# Return the chained functions.
return chain_functions(function_chain)
diff --git a/test_natsort/compat/locale.py b/test_natsort/compat/locale.py
index a95c888..d1b07a2 100644
--- a/test_natsort/compat/locale.py
+++ b/test_natsort/compat/locale.py
@@ -32,10 +32,9 @@ except locale.Error:
# strxfrm for the current locale.
if use_pyicu:
from natsort.locale_help import get_pyicu_transform
- from locale import getlocale
def get_strxfrm():
- return get_pyicu_transform(getlocale())
+ return get_pyicu_transform()
else:
from natsort.locale_help import strxfrm
diff --git a/test_natsort/test_natsorted.py b/test_natsort/test_natsorted.py
index 921aa60..b749b0d 100644
--- a/test_natsort/test_natsorted.py
+++ b/test_natsort/test_natsorted.py
@@ -4,7 +4,7 @@ Here are a collection of examples of how this module can be used.
See the README or the natsort homepage for more details.
"""
from __future__ import unicode_literals, print_function
-# import pytest
+import pytest
import sys
import locale
from operator import itemgetter
@@ -13,9 +13,10 @@ from natsort import (
natsorted,
ns,
)
+from natsort.compat.locale import dumb_sort
from compat.locale import (
load_locale,
- # has_locale_de_DE,
+ has_locale_de_DE,
)
@@ -224,17 +225,17 @@ def test_natsorted_with_LOCALE_and_CAPITALFIRST_and_LOWERCASE_returns_results_so
def test_natsorted_with_LOCALE_and_en_setting_returns_results_sorted_by_en_language():
load_locale('en_US')
- a = ['c', 'ä', 'b', 'a5,6', 'a5,50']
- assert natsorted(a, alg=ns.LOCALE | ns.F) == ['a5,6', 'a5,50', 'ä', 'b', 'c']
+ a = ['c', 'a5,467.86', 'ä', 'b', 'a5367.86', 'a5,6', 'a5,50']
+ assert natsorted(a, alg=ns.LOCALE | ns.F) == ['a5,6', 'a5,50', 'a5367.86', 'a5,467.86', 'ä', 'b', 'c']
locale.setlocale(locale.LC_ALL, str(''))
-# @pytest.mark.skipif(not has_locale_de_DE, reason='requires de_DE locale')
-# def test_natsorted_with_LOCALE_and_de_setting_returns_results_sorted_by_de_language():
-# load_locale('de_DE')
-# a = ['c', 'ä', 'b', 'a5,6', 'a5,50']
-# assert natsorted(a, alg=ns.LOCALE | ns.F) == ['a5,50', 'a5,6', 'ä', 'b', 'c']
-# locale.setlocale(locale.LC_ALL, str(''))
+@pytest.mark.skipif(not has_locale_de_DE or dumb_sort(), reason='requires de_DE locale and working locale')
+def test_natsorted_with_LOCALE_and_de_setting_returns_results_sorted_by_de_language():
+ load_locale('de_DE')
+ a = ['c', 'a5.467,86', 'ä', 'b', 'a5367.86', 'a5,6', 'a5,50']
+ assert natsorted(a, alg=ns.LOCALE | ns.F) == ['a5,50', 'a5,6', 'a5367.86', 'a5.467,86', 'ä', 'b', 'c']
+ locale.setlocale(locale.LC_ALL, str(''))
def test_natsorted_with_LOCALE_and_mixed_input_returns_sorted_results_without_error():