diff options
author | Seth M Morton <seth.m.morton@gmail.com> | 2015-04-06 23:33:27 -0700 |
---|---|---|
committer | Seth M Morton <seth.m.morton@gmail.com> | 2015-04-06 23:33:27 -0700 |
commit | 72867093bce4c2cabe2ea53415fabfb6238ae7ea (patch) | |
tree | 67e5322ee99b68c3c8b457b58b58e83388f2cc6c | |
parent | 4854fc999a1d3a9a6a7a1185153737d87b63be32 (diff) | |
parent | c1b6ef887f4595f32a04678c316d2c9c68517cb6 (diff) | |
download | natsort-3.5.6.tar.gz |
natsort release version 3.5.6
- Added 'UNGROUPLETTERS' algorithm to get the case-grouping behavior
of an ordinal sort when using 'LOCALE'.
- Added convenience functions 'decoder', 'as_ascii', and 'as_utf8' for
dealing with bytes types.
-rw-r--r-- | README.rst | 42 | ||||
-rw-r--r-- | docs/source/api.rst | 1 | ||||
-rw-r--r-- | docs/source/bytes.rst | 20 | ||||
-rw-r--r-- | docs/source/changelog.rst | 10 | ||||
-rw-r--r-- | docs/source/examples.rst | 45 | ||||
-rw-r--r-- | docs/source/intro.rst | 15 | ||||
-rw-r--r-- | natsort/__init__.py | 6 | ||||
-rw-r--r-- | natsort/_version.py | 2 | ||||
-rw-r--r-- | natsort/natsort.py | 94 | ||||
-rw-r--r-- | natsort/ns_enum.py | 33 | ||||
-rw-r--r-- | natsort/utils.py | 16 | ||||
-rw-r--r-- | test_natsort/test_natsort.py | 23 | ||||
-rw-r--r-- | test_natsort/test_utils.py | 44 |
13 files changed, 313 insertions, 38 deletions
@@ -92,7 +92,22 @@ when you sort: >>> # On Python 2, sorted(a) would return [2.0, 6, '4.5', '5', 'a'] >>> # On Python 3, sorted(a) would raise an "unorderable types" TypeError -You cannot mix and match ``str`` and ``bytes`` objects on Python 3. +``natsort`` does not officially support the ``bytes`` type on Python 3, but +convenience functions are provided that help you decode to ``str`` first: + +.. code-block:: python + + >>> from natsort import as_utf8 + >>> a = [b'a', 14.0, 'b'] + >>> # On Python 2, natsorted(a) would would work as expected. + >>> # On Python 3, natsorted(a) would raise a TypeError (bytes() < str()) + >>> natsorted(a, key=as_utf8) == [14.0, b'a', 'b'] + True + >>> a = [b'a56', b'a5', b'a6', b'a40'] + >>> # On Python 2, natsorted(a) would would work as expected. + >>> # On Python 3, natsorted(a) would return the same results as sorted(a) + >>> natsorted(a, key=as_utf8) == [b'a5', b'a6', b'a40', b'a56'] + True The natsort algorithm does other fancy things like @@ -179,7 +194,15 @@ History These are the last three entries of the changelog. See the package documentation for the complete `changelog <http://pythonhosted.org//natsort/changelog.html>`_. -04-04-2015 v. 3.5.4 +04-06-2015 v. 3.5.6 +''''''''''''''''''' + + - Added 'UNGROUPLETTERS' algorithm to get the case-grouping behavior of + an ordinal sort when using 'LOCALE'. + - Added convenience functions 'decoder', 'as_ascii', and 'as_utf8' for + dealing with bytes types. + +04-04-2015 v. 3.5.5 ''''''''''''''''''' - Added 'realsorted' and 'index_realsorted' functions for @@ -191,18 +214,3 @@ for the complete `changelog <http://pythonhosted.org//natsort/changelog.html>`_. - Fixed bug where a 'TypeError' was raised if a string containing a leading number was sorted with alpha-only strings when 'LOCALE' is used. - -03-26-2015 v. 3.5.3 -''''''''''''''''''' - - - Fixed bug where '--reverse-filter; option in shell script was not - getting checked for correctness. 
- - Documentation updates to better describe locale bug, and illustrate - upcoming default behavior change. - - Internal improvements, including making test suite more granular. - -01-13-2015 v. 3.5.2 -''''''''''''''''''' - - - Enhancement that will convert a 'pathlib.Path' object to a 'str' if - 'ns.PATH' is enabled. diff --git a/docs/source/api.rst b/docs/source/api.rst index 8919542..4084720 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -21,3 +21,4 @@ natsort API index_realsorted.rst order_by_index.rst ns_class.rst + bytes.rst diff --git a/docs/source/bytes.rst b/docs/source/bytes.rst new file mode 100644 index 0000000..c59d4ad --- /dev/null +++ b/docs/source/bytes.rst @@ -0,0 +1,20 @@ +.. default-domain:: py +.. currentmodule:: natsort + +.. _bytes_help: + +Help With Bytes On Python 3 +=========================== + +The official stance of :mod:`natsort` is to not support `bytes` for +sorting; there is just too much that can go wrong when trying to automate +conversion between `bytes` and `str`. But rather than completely give up +on `bytes`, :mod:`natsort` provides three functions that make it easy to +quickly decode `bytes` to `str` so that sorting is possible. + +.. autofunction:: decoder + +.. autofunction:: as_ascii + +.. autofunction:: as_utf8 + diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 86abfb7..2803377 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -3,7 +3,15 @@ Changelog --------- -04-04-2015 v. 3.5.4 +04-06-2015 v. 3.5.6 +''''''''''''''''''' + + - Added 'UNGROUPLETTERS' algorithm to get the case-grouping behavior of + an ordinal sort when using 'LOCALE'. + - Added convenience functions 'decoder', 'as_ascii', and 'as_utf8' for + dealing with bytes types. + +04-04-2015 v. 
3.5.5 ''''''''''''''''''' - Added 'realsorted' and 'index_realsorted' functions for diff --git a/docs/source/examples.rst b/docs/source/examples.rst index 1f795e4..a995bb4 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -252,3 +252,48 @@ Just like the :func:`sorted` built-in function, you can supply the >>> a = ['a2', 'a9', 'a1', 'a4', 'a10'] >>> natsorted(a, reverse=True) ['a10', 'a9', 'a4', 'a2', 'a1'] + +Sorting Bytes on Python 3 +------------------------- + +Python 3 is rather strict about comparing strings and bytes, and this +can make it difficult to deal with collections of both. Because of the +challenge of guessing which encoding should be used to decode a bytes +array to a string, :mod:`natsort` does *not* try to guess and automatically +convert for you; in fact, the official stance of :mod:`natsort` is to +not support sorting bytes. Instead, some decoding convenience functions +have been provided to you (see :ref:`bytes_help`) that allow you to +provide a codec for decoding bytes through the ``key`` argument that +will allow :mod:`natsort` to convert byte arrays to strings for sorting; +these functions know not to raise an error if the input is not a byte +array, so you can use the key on any arbitrary collection of data. + +:: + + >>> from natsort import as_ascii + >>> a = [b'a', 14.0, 'b'] + >>> # On Python 2, natsorted(a) would would work as expected. + >>> # On Python 3, natsorted(a) would raise a TypeError (bytes() < str()) + >>> natsorted(a, key=as_ascii) == [14.0, b'a', 'b'] + True + +Additionally, regular expressions cannot be run on byte arrays, making it +so that :mod:`natsort` cannot parse them for numbers. As a result, if you +run :mod:`natsort` on a list of bytes, you will get results that are like +Python's default sorting behavior. 
Of course, you can use the decoding +functions to solve this:: + + >>> from natsort import as_utf8 + >>> a = [b'a56', b'a5', b'a6', b'a40'] + >>> natsorted(a) # doctest: +SKIP + [b'a40', b'a5', b'a56', b'a6'] + >>> natsorted(a, key=as_utf8) == [b'a5', b'a6', b'a40', b'a56'] + True + +If you need a codec different from ASCII or UTF-8, you can use +:func:`decoder` to generate a custom key:: + + >>> from natsort import decoder + >>> a = [b'a56', b'a5', b'a6', b'a40'] + >>> natsorted(a, key=decoder('latin1')) == [b'a5', b'a6', b'a40', b'a56'] + True diff --git a/docs/source/intro.rst b/docs/source/intro.rst index 9ae466b..b79aec9 100644 --- a/docs/source/intro.rst +++ b/docs/source/intro.rst @@ -87,7 +87,20 @@ when you sort:: >>> # On Python 2, sorted(a) would return [2.0, 6, '4.5', '5', 'a'] >>> # On Python 3, sorted(a) would raise an "unorderable types" TypeError -You cannot mix and match ``str`` and ``bytes`` objects on Python 3. +:mod:`natsort` does not officially support the `bytes` type on Python 3, but +convenience functions are provided that help you decode to `str` first:: + + >>> from natsort import as_utf8 + >>> a = [b'a', 14.0, 'b'] + >>> # On Python 2, natsorted(a) would would work as expected. + >>> # On Python 3, natsorted(a) would raise a TypeError (bytes() < str()) + >>> natsorted(a, key=as_utf8) == [14.0, b'a', 'b'] + True + >>> a = [b'a56', b'a5', b'a6', b'a40'] + >>> # On Python 2, natsorted(a) would would work as expected. 
+ >>> # On Python 3, natsorted(a) would return the same results as sorted(a) + >>> natsorted(a, key=as_utf8) == [b'a5', b'a6', b'a40', b'a56'] + True The natsort algorithm does other fancy things like diff --git a/natsort/__init__.py b/natsort/__init__.py index d6f7467..4e6cc82 100644 --- a/natsort/__init__.py +++ b/natsort/__init__.py @@ -7,7 +7,8 @@ from natsort.natsort import (natsort_key, natsort_keygen, ns, natsorted, humansorted, versorted, realsorted, index_realsorted, index_natsorted, index_versorted, - index_humansorted, order_by_index) + index_humansorted, order_by_index, + decoder, as_ascii, as_utf8) from natsort._version import __version__ __all__ = [ @@ -22,5 +23,8 @@ __all__ = [ 'index_humansorted', 'index_realsorted', 'order_by_index', + 'decoder', + 'as_ascii', + 'as_utf8', 'ns', ] diff --git a/natsort/_version.py b/natsort/_version.py index 4134a94..eea91d6 100644 --- a/natsort/_version.py +++ b/natsort/_version.py @@ -2,4 +2,4 @@ from __future__ import (print_function, division, unicode_literals, absolute_import) -__version__ = '3.5.5' +__version__ = '3.5.6' diff --git a/natsort/natsort.py b/natsort/natsort.py index c94e137..8fb6754 100644 --- a/natsort/natsort.py +++ b/natsort/natsort.py @@ -22,7 +22,7 @@ from functools import partial from warnings import warn # Local imports. -from natsort.utils import _natsort_key, _args_to_enum +from natsort.utils import _natsort_key, _args_to_enum, _do_decoding from natsort.ns_enum import ns from natsort.py23compat import u_format @@ -31,6 +31,98 @@ __doc__ = u_format(__doc__) @u_format +def decoder(encoding): + """ + Return a function that can be used to decode bytes to unicode. + + Parameters + ---------- + encoding: str + The codec to use for decoding. This must be a valid unicode codec. + + Returns + ------- + decode_function: + A function that takes a single argument and attempts to decode + it using the supplied codec. Any `UnicodeErrors` are raised. 
+ If the argument was not of `bytes` type, it is simply returned + as-is. + + See Also + -------- + as_ascii + as_utf8 + + Examples + -------- + + >>> f = decoder('utf8') + >>> f(b'bytes') == 'bytes' + True + >>> f(12345) == 12345 + True + >>> natsorted([b'a10', b'a2'], key=decoder('utf8')) == [b'a2', b'a10'] + True + >>> # On Python 3, without decoder this would return [b'a10', b'a2'] + >>> natsorted([b'a10', 'a2'], key=decoder('utf8')) == ['a2', b'a10'] + True + >>> # On Python 3, without decoder this would raise a TypeError. + + """ + return partial(_do_decoding, encoding=encoding) + + +@u_format +def as_ascii(s): + """ + Function to decode an input with the ASCII codec, or return as-is. + + Parameters + ---------- + s: + Any object. + + Returns + ------- + output: + If the input was of type `bytes`, the return value is a `str` decoded + with the ASCII codec. Otherwise, the return value is identically the + input. + + See Also + -------- + decoder + + """ + return _do_decoding(s, 'ascii') + + +@u_format +def as_utf8(s): + """ + Function to decode an input with the UTF-8 codec, or return as-is. + + Parameters + ---------- + s: + Any object. + + Returns + ------- + output: + If the input was of type `bytes`, the return value is a `str` decoded + with the UTF-8 codec. Otherwise, the return value is identically the + input. + + See Also + -------- + decoder + + """ + return _do_decoding(s, 'utf-8') + + +@u_format def natsort_key(val, key=None, number_type=float, signed=None, exp=None, as_path=None, py3_safe=None, alg=0): """\ diff --git a/natsort/ns_enum.py b/natsort/ns_enum.py index c5e0afc..f568382 100644 --- a/natsort/ns_enum.py +++ b/natsort/ns_enum.py @@ -80,6 +80,13 @@ class ns(object): ``['Apple', 'apple', 'Banana', 'banana']``. Useless when used with `IGNORECASE`; use with `LOWERCASEFIRST` to reverse the order of upper and lower case. + CAPITALFIRST, C + Only used when `LOCALE` is enabled. 
Tell `natsort` to put all + capitalized words before non-capitalized words. This is essentially + the inverse of `GROUPLETTERS`, and is the default Python sorting + behavior without `LOCALE`. + UNGROUPLETTERS, UG + An alias for `CAPITALFIRST`. TYPESAFE, T Try hard to avoid "unorderable types" error on Python 3. It is the same as setting the old `py3_safe` option to `True`. @@ -113,18 +120,20 @@ class ns(object): # Sort algorithm "enum" values. -_ns = {'FLOAT': 0, 'F': 0, - 'INT': 1, 'I': 1, - 'UNSIGNED': 2, 'U': 2, - 'VERSION': 3, 'V': 3, # Shortcut for INT | UNSIGNED - 'DIGIT': 3, 'D': 3, # Shortcut for INT | UNSIGNED - 'NOEXP': 4, 'N': 4, - 'PATH': 8, 'P': 8, - 'LOCALE': 16, 'L': 16, - 'IGNORECASE': 32, 'IC': 32, - 'LOWERCASEFIRST': 64, 'LF': 64, - 'GROUPLETTERS': 128, 'G': 128, - 'TYPESAFE': 1024, 'T': 1024, +_ns = {'FLOAT': 0, 'F': 0, + 'INT': 1, 'I': 1, + 'UNSIGNED': 2, 'U': 2, + 'VERSION': 3, 'V': 3, # Shortcut for INT | UNSIGNED + 'DIGIT': 3, 'D': 3, # Shortcut for INT | UNSIGNED + 'NOEXP': 4, 'N': 4, + 'PATH': 8, 'P': 8, + 'LOCALE': 16, 'L': 16, + 'IGNORECASE': 32, 'IC': 32, + 'LOWERCASEFIRST': 64, 'LF': 64, + 'GROUPLETTERS': 128, 'G': 128, + 'UNGROUPLETTERS': 256, 'UG': 256, + 'CAPITALFIRST': 256, 'C': 256, + 'TYPESAFE': 1024, 'T': 1024, } # Populate the ns class with the _ns values. 
for x, y in _ns.items(): diff --git a/natsort/utils.py b/natsort/utils.py index 71292c5..3e756b7 100644 --- a/natsort/utils.py +++ b/natsort/utils.py @@ -41,7 +41,7 @@ else: # Group algorithm types for easy extraction _NUMBER_ALGORITHMS = ns.FLOAT | ns.INT | ns.UNSIGNED | ns.NOEXP _ALL_BUT_PATH = (ns.F | ns.I | ns.U | ns.N | ns.L | - ns.IC | ns.LF | ns.G | ns.TYPESAFE) + ns.IC | ns.LF | ns.G | ns.UG | ns.TYPESAFE) # The regex that locates floats _float_sign_exp_re = re.compile(r'([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)', re.U) @@ -78,6 +78,16 @@ _regex_and_num_function_chooser = { } +def _do_decoding(s, encoding): + """A function to decode a bytes string, or return the object as-is.""" + try: + return s.decode(encoding) + except UnicodeError: + raise + except (AttributeError, TypeError): + return s + + def _args_to_enum(number_type, signed, exp, as_path, py3_safe): """A function to convert input booleans to an enum-type argument.""" alg = 0 @@ -270,6 +280,8 @@ def _natsort_key(val, key, alg): val = val.swapcase() if alg & _ns['IGNORECASE']: val = val.lower() + if use_locale and alg & _ns['UNGROUPLETTERS'] and val[0].isupper(): + val = ' ' + val return tuple(_number_extracter(val, regex, num_function, @@ -279,7 +291,7 @@ def _natsort_key(val, key, alg): except (TypeError, AttributeError): # Check if it is a bytes type, and if so return as a # one element tuple. - if isinstance(val, bytes): + if type(val) in (bytes,): return (val,) # If not strings, assume it is an iterable that must # be parsed recursively. Do not apply the key recursively. 
diff --git a/test_natsort/test_natsort.py b/test_natsort/test_natsort.py index 6d9c853..78a3eaa 100644 --- a/test_natsort/test_natsort.py +++ b/test_natsort/test_natsort.py @@ -11,10 +11,31 @@ from operator import itemgetter from pytest import raises from natsort import natsorted, index_natsorted, natsort_key, versorted, index_versorted from natsort import humansorted, index_humansorted, natsort_keygen, order_by_index, ns -from natsort import realsorted, index_realsorted +from natsort import realsorted, index_realsorted, decoder, as_ascii, as_utf8 from natsort.utils import _natsort_key +def test_decoder_returns_function_that_can_decode_bytes_but_return_non_bytes_as_is(): + f = decoder('latin1') + a = 'bytes' + b = 14 + assert f(b'bytes') == a + assert f(b) is b # returns as-is, same object ID + if sys.version[0] == '3': + assert f(a) is a # same object returned on Python3 b/c only bytes has decode + else: + assert f(a) is not a + assert f(a) == a # not same object on Python2 because str can decode + + +def test_as_ascii_returns_bytes_as_ascii(): + assert decoder('ascii')(b'bytes') == as_ascii(b'bytes') + + +def test_as_utf8_returns_bytes_as_utf8(): + assert decoder('utf8')(b'bytes') == as_utf8(b'bytes') + + def test_natsort_key_public_raises_DeprecationWarning_when_called(): # Identical to _natsort_key # But it raises a deprecation warning diff --git a/test_natsort/test_utils.py b/test_natsort/test_utils.py index 9f7c930..01fe6a3 100644 --- a/test_natsort/test_utils.py +++ b/test_natsort/test_utils.py @@ -1,14 +1,17 @@ # -*- coding: utf-8 -*- """These test the utils.py functions.""" +from __future__ import unicode_literals +import sys import locale from operator import itemgetter from pytest import raises from natsort.ns_enum import ns from natsort.utils import _number_extracter, _py3_safe, _natsort_key, _args_to_enum from natsort.utils import _float_sign_exp_re, _float_nosign_exp_re, _float_sign_noexp_re -from natsort.utils import _float_nosign_noexp_re, 
_int_nosign_re, _int_sign_re +from natsort.utils import _float_nosign_noexp_re, _int_nosign_re, _int_sign_re, _do_decoding from natsort.locale_help import use_pyicu, null_string +from natsort.py23compat import py23_str try: from fastnumbers import fast_float, fast_int @@ -23,6 +26,12 @@ else: has_pathlib = True +def test_do_decoding_decodes_bytes_string_to_unicode(): + assert type(_do_decoding(b'bytes', 'ascii')) is py23_str + assert _do_decoding(b'bytes', 'ascii') == 'bytes' + assert _do_decoding(b'bytes', 'ascii') == b'bytes'.decode('ascii') + + def test_args_to_enum_converts_signed_exp_float_to_ns_F(): # number_type, signed, exp, as_path, py3_safe assert _args_to_enum(float, True, True, False, False) == ns.F @@ -299,6 +308,16 @@ def test__natsort_key_with_GROUPLETTERS_and_LOWERCASEFIRST_inverts_text_first_th assert _natsort_key('Apple56', None, ns.G | ns.LF) == ('aapPpPlLeE', 56.0) +def test__natsort_key_with_bytes_input_only_applies_LOWERCASEFIRST_or_IGNORECASE_and_returns_in_tuple(): + if sys.version[0] == '3': + assert _natsort_key(b'Apple56', None, ns.I) == (b'Apple56',) + assert _natsort_key(b'Apple56', None, ns.LF) == (b'aPPLE56',) + assert _natsort_key(b'Apple56', None, ns.IC) == (b'apple56',) + assert _natsort_key(b'Apple56', None, ns.G) == (b'Apple56',) + else: + assert True + + def test__natsort_key_with_LOCALE_transforms_floats_according_to_the_current_locale_and_strxfrms_strings(): # Locale aware sorting locale.setlocale(locale.LC_NUMERIC, str('en_US.UTF-8')) @@ -317,3 +336,26 @@ def test__natsort_key_with_LOCALE_transforms_floats_according_to_the_current_loc assert _natsort_key('Apple56.5', None, ns.LOCALE) == (strxfrm('Apple'), 56.5) assert _natsort_key('Apple56,5', None, ns.LOCALE) == (strxfrm('Apple'), 56.5) locale.setlocale(locale.LC_NUMERIC, str('')) + + +def test__natsort_key_with_LOCALE_and_UNGROUPLETTERS_places_space_before_string_with_capital_first_letter(): + # Locale aware sorting + locale.setlocale(locale.LC_NUMERIC, str('en_US.UTF-8')) 
+ if use_pyicu: + from natsort.locale_help import get_pyicu_transform + from locale import getlocale + strxfrm = get_pyicu_transform(getlocale()) + else: + from natsort.locale_help import strxfrm + assert _natsort_key('Apple56.5', None, ns.LOCALE | ns.UNGROUPLETTERS | ns.F) == (strxfrm(' Apple'), 56.5) + assert _natsort_key('apple56.5', None, ns.LOCALE | ns.UNGROUPLETTERS | ns.F) == (strxfrm('apple'), 56.5) + assert _natsort_key('12Apple56.5', None, ns.LOCALE | ns.UNGROUPLETTERS | ns.F) == (null_string, 12.0, strxfrm('Apple'), 56.5) + # The below are all aliases for UNGROUPLETTERS + assert ns.UNGROUPLETTERS == ns.UG + assert ns.UNGROUPLETTERS == ns.CAPITALFIRST + assert ns.UNGROUPLETTERS == ns.C + locale.setlocale(locale.LC_NUMERIC, str('')) + + +def test__natsort_key_with_UNGROUPLETTERS_does_nothing_without_LOCALE(): + assert _natsort_key('Apple56.5', None, ns.UG | ns.I) == _natsort_key('Apple56.5', None, ns.I) |