summary refs log tree commit diff
diff options
context:
space:
mode:
author Seth M Morton <seth.m.morton@gmail.com> 2015-04-06 23:33:27 -0700
committer Seth M Morton <seth.m.morton@gmail.com> 2015-04-06 23:33:27 -0700
commit 72867093bce4c2cabe2ea53415fabfb6238ae7ea (patch)
tree 67e5322ee99b68c3c8b457b58b58e83388f2cc6c
parent 4854fc999a1d3a9a6a7a1185153737d87b63be32 (diff)
parent c1b6ef887f4595f32a04678c316d2c9c68517cb6 (diff)
download natsort-3.5.6.tar.gz
natsort release version 3.5.6 3.5.6
- Added 'UNGROUPLETTERS' algorithm to get the case-grouping behavior of an ordinal sort when using 'LOCALE'.
- Added convenience functions 'decoder', 'as_ascii', and 'as_utf8' for dealing with bytes types.
-rw-r--r-- README.rst 42
-rw-r--r-- docs/source/api.rst 1
-rw-r--r-- docs/source/bytes.rst 20
-rw-r--r-- docs/source/changelog.rst 10
-rw-r--r-- docs/source/examples.rst 45
-rw-r--r-- docs/source/intro.rst 15
-rw-r--r-- natsort/__init__.py 6
-rw-r--r-- natsort/_version.py 2
-rw-r--r-- natsort/natsort.py 94
-rw-r--r-- natsort/ns_enum.py 33
-rw-r--r-- natsort/utils.py 16
-rw-r--r-- test_natsort/test_natsort.py 23
-rw-r--r-- test_natsort/test_utils.py 44
13 files changed, 313 insertions, 38 deletions
diff --git a/README.rst b/README.rst
index c508f4b..c268032 100644
--- a/README.rst
+++ b/README.rst
@@ -92,7 +92,22 @@ when you sort:
>>> # On Python 2, sorted(a) would return [2.0, 6, '4.5', '5', 'a']
>>> # On Python 3, sorted(a) would raise an "unorderable types" TypeError
-You cannot mix and match ``str`` and ``bytes`` objects on Python 3.
+``natsort`` does not officially support the ``bytes`` type on Python 3, but
+convenience functions are provided that help you decode to ``str`` first:
+
+.. code-block:: python
+
+ >>> from natsort import as_utf8
+ >>> a = [b'a', 14.0, 'b']
+ >>> # On Python 2, natsorted(a) would work as expected.
+ >>> # On Python 3, natsorted(a) would raise a TypeError (bytes() < str())
+ >>> natsorted(a, key=as_utf8) == [14.0, b'a', 'b']
+ True
+ >>> a = [b'a56', b'a5', b'a6', b'a40']
+ >>> # On Python 2, natsorted(a) would work as expected.
+ >>> # On Python 3, natsorted(a) would return the same results as sorted(a)
+ >>> natsorted(a, key=as_utf8) == [b'a5', b'a6', b'a40', b'a56']
+ True
The natsort algorithm does other fancy things like
@@ -179,7 +194,15 @@ History
These are the last three entries of the changelog. See the package documentation
for the complete `changelog <http://pythonhosted.org//natsort/changelog.html>`_.
-04-04-2015 v. 3.5.4
+04-06-2015 v. 3.5.6
+'''''''''''''''''''
+
+ - Added 'UNGROUPLETTERS' algorithm to get the case-grouping behavior of
+ an ordinal sort when using 'LOCALE'.
+ - Added convenience functions 'decoder', 'as_ascii', and 'as_utf8' for
+ dealing with bytes types.
+
+04-04-2015 v. 3.5.5
'''''''''''''''''''
- Added 'realsorted' and 'index_realsorted' functions for
@@ -191,18 +214,3 @@ for the complete `changelog <http://pythonhosted.org//natsort/changelog.html>`_.
- Fixed bug where a 'TypeError' was raised if a string containing a leading
number was sorted with alpha-only strings when 'LOCALE' is used.
-
-03-26-2015 v. 3.5.3
-'''''''''''''''''''
-
- - Fixed bug where '--reverse-filter' option in shell script was not
- getting checked for correctness.
- - Documentation updates to better describe locale bug, and illustrate
- upcoming default behavior change.
- - Internal improvements, including making test suite more granular.
-
-01-13-2015 v. 3.5.2
-'''''''''''''''''''
-
- - Enhancement that will convert a 'pathlib.Path' object to a 'str' if
- 'ns.PATH' is enabled.
diff --git a/docs/source/api.rst b/docs/source/api.rst
index 8919542..4084720 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -21,3 +21,4 @@ natsort API
index_realsorted.rst
order_by_index.rst
ns_class.rst
+ bytes.rst
diff --git a/docs/source/bytes.rst b/docs/source/bytes.rst
new file mode 100644
index 0000000..c59d4ad
--- /dev/null
+++ b/docs/source/bytes.rst
@@ -0,0 +1,20 @@
+.. default-domain:: py
+.. currentmodule:: natsort
+
+.. _bytes_help:
+
+Help With Bytes On Python 3
+===========================
+
+The official stance of :mod:`natsort` is to not support `bytes` for
+sorting; there is just too much that can go wrong when trying to automate
+conversion between `bytes` and `str`. But rather than completely give up
+on `bytes`, :mod:`natsort` provides three functions that make it easy to
+quickly decode `bytes` to `str` so that sorting is possible.
+
+.. autofunction:: decoder
+
+.. autofunction:: as_ascii
+
+.. autofunction:: as_utf8
+
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 86abfb7..2803377 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -3,7 +3,15 @@
Changelog
---------
-04-04-2015 v. 3.5.4
+04-06-2015 v. 3.5.6
+'''''''''''''''''''
+
+ - Added 'UNGROUPLETTERS' algorithm to get the case-grouping behavior of
+ an ordinal sort when using 'LOCALE'.
+ - Added convenience functions 'decoder', 'as_ascii', and 'as_utf8' for
+ dealing with bytes types.
+
+04-04-2015 v. 3.5.5
'''''''''''''''''''
- Added 'realsorted' and 'index_realsorted' functions for
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
index 1f795e4..a995bb4 100644
--- a/docs/source/examples.rst
+++ b/docs/source/examples.rst
@@ -252,3 +252,48 @@ Just like the :func:`sorted` built-in function, you can supply the
>>> a = ['a2', 'a9', 'a1', 'a4', 'a10']
>>> natsorted(a, reverse=True)
['a10', 'a9', 'a4', 'a2', 'a1']
+
+Sorting Bytes on Python 3
+-------------------------
+
+Python 3 is rather strict about comparing strings and bytes, and this
+can make it difficult to deal with collections of both. Because of the
+challenge of guessing which encoding should be used to decode a bytes
+array to a string, :mod:`natsort` does *not* try to guess and automatically
+convert for you; in fact, the official stance of :mod:`natsort` is to
+not support sorting bytes. Instead, some decoding convenience functions
+have been provided to you (see :ref:`bytes_help`) that allow you to
+provide a codec for decoding bytes through the ``key`` argument that
+will allow :mod:`natsort` to convert byte arrays to strings for sorting;
+these functions know not to raise an error if the input is not a byte
+array, so you can use the key on any arbitrary collection of data.
+
+::
+
+ >>> from natsort import as_ascii
+ >>> a = [b'a', 14.0, 'b']
+ >>> # On Python 2, natsorted(a) would work as expected.
+ >>> # On Python 3, natsorted(a) would raise a TypeError (bytes() < str())
+ >>> natsorted(a, key=as_ascii) == [14.0, b'a', 'b']
+ True
+
+Additionally, the ``str`` regular expressions that :mod:`natsort` uses
+cannot be run on byte arrays, so :mod:`natsort` cannot parse them for
+numbers. As a result, if you run :mod:`natsort` on a list of bytes, you
+will get results that are like Python's default sorting behavior. Of
+course, you can use the decoding functions to solve this::
+
+ >>> from natsort import as_utf8
+ >>> a = [b'a56', b'a5', b'a6', b'a40']
+ >>> natsorted(a) # doctest: +SKIP
+ [b'a40', b'a5', b'a56', b'a6']
+ >>> natsorted(a, key=as_utf8) == [b'a5', b'a6', b'a40', b'a56']
+ True
+
+If you need a codec different from ASCII or UTF-8, you can use
+:func:`decoder` to generate a custom key::
+
+ >>> from natsort import decoder
+ >>> a = [b'a56', b'a5', b'a6', b'a40']
+ >>> natsorted(a, key=decoder('latin1')) == [b'a5', b'a6', b'a40', b'a56']
+ True
diff --git a/docs/source/intro.rst b/docs/source/intro.rst
index 9ae466b..b79aec9 100644
--- a/docs/source/intro.rst
+++ b/docs/source/intro.rst
@@ -87,7 +87,20 @@ when you sort::
>>> # On Python 2, sorted(a) would return [2.0, 6, '4.5', '5', 'a']
>>> # On Python 3, sorted(a) would raise an "unorderable types" TypeError
-You cannot mix and match ``str`` and ``bytes`` objects on Python 3.
+:mod:`natsort` does not officially support the `bytes` type on Python 3, but
+convenience functions are provided that help you decode to `str` first::
+
+ >>> from natsort import as_utf8
+ >>> a = [b'a', 14.0, 'b']
+ >>> # On Python 2, natsorted(a) would work as expected.
+ >>> # On Python 3, natsorted(a) would raise a TypeError (bytes() < str())
+ >>> natsorted(a, key=as_utf8) == [14.0, b'a', 'b']
+ True
+ >>> a = [b'a56', b'a5', b'a6', b'a40']
+ >>> # On Python 2, natsorted(a) would work as expected.
+ >>> # On Python 3, natsorted(a) would return the same results as sorted(a)
+ >>> natsorted(a, key=as_utf8) == [b'a5', b'a6', b'a40', b'a56']
+ True
The natsort algorithm does other fancy things like
diff --git a/natsort/__init__.py b/natsort/__init__.py
index d6f7467..4e6cc82 100644
--- a/natsort/__init__.py
+++ b/natsort/__init__.py
@@ -7,7 +7,8 @@ from natsort.natsort import (natsort_key, natsort_keygen, ns,
natsorted, humansorted, versorted,
realsorted, index_realsorted,
index_natsorted, index_versorted,
- index_humansorted, order_by_index)
+ index_humansorted, order_by_index,
+ decoder, as_ascii, as_utf8)
from natsort._version import __version__
__all__ = [
@@ -22,5 +23,8 @@ __all__ = [
'index_humansorted',
'index_realsorted',
'order_by_index',
+ 'decoder',
+ 'as_ascii',
+ 'as_utf8',
'ns',
]
diff --git a/natsort/_version.py b/natsort/_version.py
index 4134a94..eea91d6 100644
--- a/natsort/_version.py
+++ b/natsort/_version.py
@@ -2,4 +2,4 @@
from __future__ import (print_function, division,
unicode_literals, absolute_import)
-__version__ = '3.5.5'
+__version__ = '3.5.6'
diff --git a/natsort/natsort.py b/natsort/natsort.py
index c94e137..8fb6754 100644
--- a/natsort/natsort.py
+++ b/natsort/natsort.py
@@ -22,7 +22,7 @@ from functools import partial
from warnings import warn
# Local imports.
-from natsort.utils import _natsort_key, _args_to_enum
+from natsort.utils import _natsort_key, _args_to_enum, _do_decoding
from natsort.ns_enum import ns
from natsort.py23compat import u_format
@@ -31,6 +31,98 @@ __doc__ = u_format(__doc__)
@u_format
+def decoder(encoding):
+ """
+ Return a function that can be used to decode bytes to unicode.
+
+ Parameters
+ ----------
+ encoding: str
+ The codec to use for decoding. This must be a valid unicode codec.
+
+ Returns
+ -------
+ decode_function:
+ A function that takes a single argument and attempts to decode
+ it using the supplied codec. Any `UnicodeErrors` are raised.
+ If the argument was not of `bytes` type, it is simply returned
+ as-is.
+
+ See Also
+ --------
+ as_ascii
+ as_utf8
+
+ Examples
+ --------
+
+ >>> f = decoder('utf8')
+ >>> f(b'bytes') == 'bytes'
+ True
+ >>> f(12345) == 12345
+ True
+ >>> natsorted([b'a10', b'a2'], key=decoder('utf8')) == [b'a2', b'a10']
+ True
+ >>> # On Python 3, without decoder this would return [b'a10', b'a2']
+ >>> natsorted([b'a10', 'a2'], key=decoder('utf8')) == ['a2', b'a10']
+ True
+ >>> # On Python 3, without decoder this would raise a TypeError.
+
+ """
+ return partial(_do_decoding, encoding=encoding)
+
+
+@u_format
+def as_ascii(s):
+ """
+ Function to decode an input with the ASCII codec, or return as-is.
+
+ Parameters
+ ----------
+ s:
+ Any object.
+
+ Returns
+ -------
+ output:
+ If the input was of type `bytes`, the return value is a `str` decoded
+ with the ASCII codec. Otherwise, the return value is identically the
+ input.
+
+ See Also
+ --------
+ decoder
+
+ """
+ return _do_decoding(s, 'ascii')
+
+
+@u_format
+def as_utf8(s):
+ """
+ Function to decode an input with the UTF-8 codec, or return as-is.
+
+ Parameters
+ ----------
+ s:
+ Any object.
+
+ Returns
+ -------
+ output:
+ If the input was of type `bytes`, the return value is a `str` decoded
+ with the UTF-8 codec. Otherwise, the return value is identically the
+ input.
+
+ See Also
+ --------
+ decoder
+
+ """
+ return _do_decoding(s, 'utf-8')
+
+
+@u_format
def natsort_key(val, key=None, number_type=float, signed=None, exp=None,
as_path=None, py3_safe=None, alg=0):
"""\
diff --git a/natsort/ns_enum.py b/natsort/ns_enum.py
index c5e0afc..f568382 100644
--- a/natsort/ns_enum.py
+++ b/natsort/ns_enum.py
@@ -80,6 +80,13 @@ class ns(object):
``['Apple', 'apple', 'Banana', 'banana']``.
Useless when used with `IGNORECASE`; use with `LOWERCASEFIRST`
to reverse the order of upper and lower case.
+ CAPITALFIRST, C
+ Only used when `LOCALE` is enabled. Tell `natsort` to put all
+ capitalized words before non-capitalized words. This is essentially
+ the inverse of `GROUPLETTERS`, and is the default Python sorting
+ behavior without `LOCALE`.
+ UNGROUPLETTERS, UG
+ An alias for `CAPITALFIRST`.
TYPESAFE, T
Try hard to avoid "unorderable types" error on Python 3. It
is the same as setting the old `py3_safe` option to `True`.
@@ -113,18 +120,20 @@ class ns(object):
# Sort algorithm "enum" values.
-_ns = {'FLOAT': 0, 'F': 0,
- 'INT': 1, 'I': 1,
- 'UNSIGNED': 2, 'U': 2,
- 'VERSION': 3, 'V': 3, # Shortcut for INT | UNSIGNED
- 'DIGIT': 3, 'D': 3, # Shortcut for INT | UNSIGNED
- 'NOEXP': 4, 'N': 4,
- 'PATH': 8, 'P': 8,
- 'LOCALE': 16, 'L': 16,
- 'IGNORECASE': 32, 'IC': 32,
- 'LOWERCASEFIRST': 64, 'LF': 64,
- 'GROUPLETTERS': 128, 'G': 128,
- 'TYPESAFE': 1024, 'T': 1024,
+_ns = {'FLOAT': 0, 'F': 0,
+ 'INT': 1, 'I': 1,
+ 'UNSIGNED': 2, 'U': 2,
+ 'VERSION': 3, 'V': 3, # Shortcut for INT | UNSIGNED
+ 'DIGIT': 3, 'D': 3, # Shortcut for INT | UNSIGNED
+ 'NOEXP': 4, 'N': 4,
+ 'PATH': 8, 'P': 8,
+ 'LOCALE': 16, 'L': 16,
+ 'IGNORECASE': 32, 'IC': 32,
+ 'LOWERCASEFIRST': 64, 'LF': 64,
+ 'GROUPLETTERS': 128, 'G': 128,
+ 'UNGROUPLETTERS': 256, 'UG': 256,
+ 'CAPITALFIRST': 256, 'C': 256,
+ 'TYPESAFE': 1024, 'T': 1024,
}
# Populate the ns class with the _ns values.
for x, y in _ns.items():
diff --git a/natsort/utils.py b/natsort/utils.py
index 71292c5..3e756b7 100644
--- a/natsort/utils.py
+++ b/natsort/utils.py
@@ -41,7 +41,7 @@ else:
# Group algorithm types for easy extraction
_NUMBER_ALGORITHMS = ns.FLOAT | ns.INT | ns.UNSIGNED | ns.NOEXP
_ALL_BUT_PATH = (ns.F | ns.I | ns.U | ns.N | ns.L |
- ns.IC | ns.LF | ns.G | ns.TYPESAFE)
+ ns.IC | ns.LF | ns.G | ns.UG | ns.TYPESAFE)
# The regex that locates floats
_float_sign_exp_re = re.compile(r'([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)', re.U)
@@ -78,6 +78,16 @@ _regex_and_num_function_chooser = {
}
+def _do_decoding(s, encoding):
+ """A function to decode a bytes string, or return the object as-is."""
+ try:
+ return s.decode(encoding)
+ except UnicodeError:
+ raise
+ except (AttributeError, TypeError):
+ return s
+
+
def _args_to_enum(number_type, signed, exp, as_path, py3_safe):
"""A function to convert input booleans to an enum-type argument."""
alg = 0
@@ -270,6 +280,8 @@ def _natsort_key(val, key, alg):
val = val.swapcase()
if alg & _ns['IGNORECASE']:
val = val.lower()
+ if use_locale and alg & _ns['UNGROUPLETTERS'] and val[0].isupper():
+ val = ' ' + val
return tuple(_number_extracter(val,
regex,
num_function,
@@ -279,7 +291,7 @@ def _natsort_key(val, key, alg):
except (TypeError, AttributeError):
# Check if it is a bytes type, and if so return as a
# one element tuple.
- if isinstance(val, bytes):
+ if type(val) in (bytes,):
return (val,)
# If not strings, assume it is an iterable that must
# be parsed recursively. Do not apply the key recursively.
diff --git a/test_natsort/test_natsort.py b/test_natsort/test_natsort.py
index 6d9c853..78a3eaa 100644
--- a/test_natsort/test_natsort.py
+++ b/test_natsort/test_natsort.py
@@ -11,10 +11,31 @@ from operator import itemgetter
from pytest import raises
from natsort import natsorted, index_natsorted, natsort_key, versorted, index_versorted
from natsort import humansorted, index_humansorted, natsort_keygen, order_by_index, ns
-from natsort import realsorted, index_realsorted
+from natsort import realsorted, index_realsorted, decoder, as_ascii, as_utf8
from natsort.utils import _natsort_key
+def test_decoder_returns_function_that_can_decode_bytes_but_return_non_bytes_as_is():
+ f = decoder('latin1')
+ a = 'bytes'
+ b = 14
+ assert f(b'bytes') == a
+ assert f(b) is b # returns as-is, same object ID
+ if sys.version[0] == '3':
+ assert f(a) is a # same object returned on Python3 b/c only bytes has decode
+ else:
+ assert f(a) is not a
+ assert f(a) == a # not same object on Python2 because str can decode
+
+
+def test_as_ascii_returns_bytes_as_ascii():
+ assert decoder('ascii')(b'bytes') == as_ascii(b'bytes')
+
+
+def test_as_utf8_returns_bytes_as_utf8():
+ assert decoder('utf8')(b'bytes') == as_utf8(b'bytes')
+
+
def test_natsort_key_public_raises_DeprecationWarning_when_called():
# Identical to _natsort_key
# But it raises a deprecation warning
diff --git a/test_natsort/test_utils.py b/test_natsort/test_utils.py
index 9f7c930..01fe6a3 100644
--- a/test_natsort/test_utils.py
+++ b/test_natsort/test_utils.py
@@ -1,14 +1,17 @@
# -*- coding: utf-8 -*-
"""These test the utils.py functions."""
+from __future__ import unicode_literals
+import sys
import locale
from operator import itemgetter
from pytest import raises
from natsort.ns_enum import ns
from natsort.utils import _number_extracter, _py3_safe, _natsort_key, _args_to_enum
from natsort.utils import _float_sign_exp_re, _float_nosign_exp_re, _float_sign_noexp_re
-from natsort.utils import _float_nosign_noexp_re, _int_nosign_re, _int_sign_re
+from natsort.utils import _float_nosign_noexp_re, _int_nosign_re, _int_sign_re, _do_decoding
from natsort.locale_help import use_pyicu, null_string
+from natsort.py23compat import py23_str
try:
from fastnumbers import fast_float, fast_int
@@ -23,6 +26,12 @@ else:
has_pathlib = True
+def test_do_decoding_decodes_bytes_string_to_unicode():
+ assert type(_do_decoding(b'bytes', 'ascii')) is py23_str
+ assert _do_decoding(b'bytes', 'ascii') == 'bytes'
+ assert _do_decoding(b'bytes', 'ascii') == b'bytes'.decode('ascii')
+
+
def test_args_to_enum_converts_signed_exp_float_to_ns_F():
# number_type, signed, exp, as_path, py3_safe
assert _args_to_enum(float, True, True, False, False) == ns.F
@@ -299,6 +308,16 @@ def test__natsort_key_with_GROUPLETTERS_and_LOWERCASEFIRST_inverts_text_first_th
assert _natsort_key('Apple56', None, ns.G | ns.LF) == ('aapPpPlLeE', 56.0)
+def test__natsort_key_with_bytes_input_only_applies_LOWERCASEFIRST_or_IGNORECASE_and_returns_in_tuple():
+ if sys.version[0] == '3':
+ assert _natsort_key(b'Apple56', None, ns.I) == (b'Apple56',)
+ assert _natsort_key(b'Apple56', None, ns.LF) == (b'aPPLE56',)
+ assert _natsort_key(b'Apple56', None, ns.IC) == (b'apple56',)
+ assert _natsort_key(b'Apple56', None, ns.G) == (b'Apple56',)
+ else:
+ assert True
+
+
def test__natsort_key_with_LOCALE_transforms_floats_according_to_the_current_locale_and_strxfrms_strings():
# Locale aware sorting
locale.setlocale(locale.LC_NUMERIC, str('en_US.UTF-8'))
@@ -317,3 +336,26 @@ def test__natsort_key_with_LOCALE_transforms_floats_according_to_the_current_loc
assert _natsort_key('Apple56.5', None, ns.LOCALE) == (strxfrm('Apple'), 56.5)
assert _natsort_key('Apple56,5', None, ns.LOCALE) == (strxfrm('Apple'), 56.5)
locale.setlocale(locale.LC_NUMERIC, str(''))
+
+
+def test__natsort_key_with_LOCALE_and_UNGROUPLETTERS_places_space_before_string_with_capital_first_letter():
+ # Locale aware sorting
+ locale.setlocale(locale.LC_NUMERIC, str('en_US.UTF-8'))
+ if use_pyicu:
+ from natsort.locale_help import get_pyicu_transform
+ from locale import getlocale
+ strxfrm = get_pyicu_transform(getlocale())
+ else:
+ from natsort.locale_help import strxfrm
+ assert _natsort_key('Apple56.5', None, ns.LOCALE | ns.UNGROUPLETTERS | ns.F) == (strxfrm(' Apple'), 56.5)
+ assert _natsort_key('apple56.5', None, ns.LOCALE | ns.UNGROUPLETTERS | ns.F) == (strxfrm('apple'), 56.5)
+ assert _natsort_key('12Apple56.5', None, ns.LOCALE | ns.UNGROUPLETTERS | ns.F) == (null_string, 12.0, strxfrm('Apple'), 56.5)
+ # The below are all aliases for UNGROUPLETTERS
+ assert ns.UNGROUPLETTERS == ns.UG
+ assert ns.UNGROUPLETTERS == ns.CAPITALFIRST
+ assert ns.UNGROUPLETTERS == ns.C
+ locale.setlocale(locale.LC_NUMERIC, str(''))
+
+
+def test__natsort_key_with_UNGROUPLETTERS_does_nothing_without_LOCALE():
+ assert _natsort_key('Apple56.5', None, ns.UG | ns.I) == _natsort_key('Apple56.5', None, ns.I)