Added new string and path splitting function generators.

These complete the set of components needed to restructure natsort's key generation in a more functional (function-composing) style.
author: Seth M Morton <seth.m.morton@gmail.com> 2016-05-01 16:41:31 -0700
committer: Seth M Morton <seth.m.morton@gmail.com> 2016-05-01 16:41:31 -0700
commit: d741bc05b820443e8076d2b63d64ed98ba860306 (patch)
tree: d9b45153c3a0acdbbdfd7800e280a2b9b94d688a
parent: df00419f4e1a1c4ec469b4b3ae76f98cb7c12c3a (diff)
download: natsort-d741bc05b820443e8076d2b63d64ed98ba860306.tar.gz
3 files changed, 98 insertions, 124 deletions
diff --git a/natsort/utils.py b/natsort/utils.py
index 6c2362c..6bffdf9 100644
--- a/natsort/utils.py
+++ b/natsort/utils.py
@@ -146,39 +146,21 @@ def _natsort_key(val, key, alg):
         if key is not None:
             val = key(val)
 
-        # If this is a path, convert it.
-        # An AttrubuteError is raised if not a string.
-        split_as_path = False
-        if alg & ns.PATH:
-            try:
-                val = _path_splitter(val)
-            except AttributeError:
-                pass
-            else:
-                # Record that this string was split as a path so that
-                # we don't set PATH in the recursive call.
-                split_as_path = True
-
         # Assume the input are strings, which is the most common case.
-        # Apply the string modification if needed.
-        orig_val = val
         try:
             if use_locale and dumb_sort():
                 alg |= ns._DUMB
-            lowfirst = alg & ns.LOWERCASEFIRST
-            dumb = alg & ns._DUMB
-            val = _pre_split_function(alg)(val)
-            gl = alg & ns.GROUPLETTERS
-            ret = tuple(_number_extracter(val,
-                                          regex,
-                                          num_function,
-                                          use_locale,
-                                          gl or (use_locale and dumb)))
-            # Handle NaN.
-            if any(x != x for x in ret):
-                ret = _fix_nan(ret, alg)
-            val = orig_val if (alg & ns._DUMB) else val
-            return _post_string_parse_function(alg, null_string)(ret, val)
+            split = _parse_string_function(
+                alg,
+                null_string if use_locale else '',
+                regex.split,
+                _pre_split_function(alg),
+                _post_split_function(alg),
+                _post_string_parse_function(alg, null_string)
+            )
+            if alg & ns.PATH:
+                split = _parse_path_function(split)
+            return split(val)
         except (TypeError, AttributeError):
             # Check if it is a bytes type, and if so return as a
             # one element tuple.
@@ -186,12 +168,8 @@ def _natsort_key(val, key, alg):
                 return _parse_bytes_function(alg)(val)
             # If not strings, assume it is an iterable that must
             # be parsed recursively. Do not apply the key recursively.
-            # If this string was split as a path, turn off 'PATH'.
             try:
-                was_path = alg & ns.PATH
-                newalg = alg & ns._ALL_BUT_PATH
-                newalg |= (was_path * (not split_as_path))
-                return tuple([_natsort_key(x, None, newalg) for x in val])
+                return tuple([_natsort_key(x, None, alg) for x in val])
             # If there is still an error, it must be a number.
             # Return as-is, with a leading empty string.
             except TypeError:
@@ -225,7 +203,7 @@ def _number_extracter(s, regex, numconv, use_locale, group_letters):
 
 
 def _parse_bytes_function(alg):
-    """Create a function that will properly format a bytes string in a tuple."""
+    """Create a function that will format a bytes string in a tuple."""
     if alg & ns.PATH and alg & ns.IGNORECASE:
         return lambda x: ((x.lower(),),)
     elif alg & ns.PATH:
@@ -248,6 +226,26 @@ def _parse_number_function(alg, sep):
     return (lambda x: (func(x),)) if alg & ns.PATH else func
 
 
+def _parse_string_function(alg, sep, splitter, pre, post, after):
+    """Create a function that will properly split and format a string."""
+    def func(x, not_dumb=not (alg & ns._DUMB and alg & ns.LOCALE)):
+        original = x
+        x = pre(x)                 # Apply pre-splitting function
+        if not_dumb:
+            original = x
+        x = splitter(x)            # Split the string on numbers
+        x = py23_filter(None, x)   # Remove empty strings.
+        x = py23_map(post, x)      # Apply post-splitting function
+        x = _sep_inserter(x, sep)  # Insert empty strings between numbers
+        return after(x, original)  # Apply final manipulation
+    return func
+
+
+def _parse_path_function(str_split):
+    """Create a function that will properly split and format a path."""
+    return lambda x: tuple(py23_map(str_split, _path_splitter(x)))
+
+
 def _sep_inserter(iterable, sep):
     """Insert '' between numbers."""
 
@@ -329,6 +327,7 @@ def _post_string_parse_function(alg, sep):
     """
     if alg & ns.UNGROUPLETTERS and alg & ns.LOCALE:
         swap = alg & ns._DUMB and alg & ns.LOWERCASEFIRST
+
         def func(split_val,
                  val,
                  f=(lambda x: x.swapcase()) if swap else lambda x: x):
@@ -411,7 +410,7 @@ def _path_splitter(s, _d_match=re.compile(r'\.\d').match):
     b_appendleft(base)
 
     # Return the split parent paths and then the split basename.
-    return tuple(ichain(path_parts, base_parts))
+    return ichain(path_parts, base_parts)
 
 
 def _args_to_enum(**kwargs):
diff --git a/test_natsort/slow_splitters.py b/test_natsort/slow_splitters.py
index b03808e..2cb0ee9 100644
--- a/test_natsort/slow_splitters.py
+++ b/test_natsort/slow_splitters.py
@@ -24,7 +24,7 @@ def int_splitter(iterable, signed, sep):
     split_by_digits = refine_split_grouping(split_by_digits)
     split = int_splitter_iter(split_by_digits, signed)
     split = sep_inserter(split, sep)
-    return list(add_leading_space_if_first_is_num(split, sep))
+    return tuple(add_leading_space_if_first_is_num(split, sep))
 
 
 def float_splitter(iterable, signed, exp, sep):
@@ -37,7 +37,7 @@ def float_splitter(iterable, signed, exp, sep):
     split_by_digits = peekable(refine_split_grouping(split_by_digits))
     split = float_splitter_iter(split_by_digits, signed, exp)
     split = sep_inserter(split, sep)
-    return list(add_leading_space_if_first_is_num(split, sep))
+    return tuple(add_leading_space_if_first_is_num(split, sep))
 
 
 def refine_split_grouping(iterable):
diff --git a/test_natsort/test_utils.py b/test_natsort/test_utils.py
index 06a87a6..e6c21c9 100644
--- a/test_natsort/test_utils.py
+++ b/test_natsort/test_utils.py
@@ -26,6 +26,8 @@ from natsort.utils import (
     _do_decoding,
     _path_splitter,
     chain_functions,
+    _parse_string_function,
+    _parse_path_function,
     _parse_number_function,
     _parse_bytes_function,
     _pre_split_function,
@@ -453,9 +455,9 @@ def test_sep_inserter_inserts_separator_between_two_numbers(x):
 
 def test_path_splitter_splits_path_string_by_separator_example():
     z = '/this/is/a/path'
-    assert _path_splitter(z) == tuple(pathlib.Path(z).parts)
+    assert tuple(_path_splitter(z)) == tuple(pathlib.Path(z).parts)
     z = pathlib.Path('/this/is/a/path')
-    assert _path_splitter(z) == tuple(pathlib.Path(z).parts)
+    assert tuple(_path_splitter(z)) == tuple(pathlib.Path(z).parts)
 
 
 @pytest.mark.skipif(not use_hypothesis, reason='requires python2.7 or greater')
@@ -463,13 +465,13 @@ def test_path_splitter_splits_path_string_by_separator_example():
 def test_path_splitter_splits_path_string_by_separator(x):
     assume(all(x))
     z = py23_str(pathlib.Path(*x))
-    assert _path_splitter(z) == tuple(pathlib.Path(z).parts)
+    assert tuple(_path_splitter(z)) == tuple(pathlib.Path(z).parts)
 
 
 def test_path_splitter_splits_path_string_by_separator_and_removes_extension_example():
     z = '/this/is/a/path/file.exe'
     y = tuple(pathlib.Path(z).parts)
-    assert _path_splitter(z) == y[:-1] + (pathlib.Path(z).stem, pathlib.Path(z).suffix)
+    assert tuple(_path_splitter(z)) == y[:-1] + (pathlib.Path(z).stem, pathlib.Path(z).suffix)
 
 
 @pytest.mark.skipif(not use_hypothesis, reason='requires python2.7 or greater')
@@ -478,145 +480,118 @@ def test_path_splitter_splits_path_string_by_separator_and_removes_extension(x):
     assume(all(x))
     z = py23_str(pathlib.Path(*x[:-2])) + '.' + x[-1]
     y = tuple(pathlib.Path(z).parts)
-    assert _path_splitter(z) == y[:-1] + (pathlib.Path(z).stem, pathlib.Path(z).suffix)
+    assert tuple(_path_splitter(z)) == y[:-1] + (pathlib.Path(z).stem, pathlib.Path(z).suffix)
 
 
-def test_number_extracter_raises_TypeError_if_given_a_number_example():
-    with raises(TypeError):
-        assert _number_extracter(50.0, _float_sign_exp_re, *float_nolocale_nogroup)
+def no_op(x):
+    """A function that does nothing."""
+    return x
 
 
-@pytest.mark.skipif(not use_hypothesis, reason='requires python2.7 or greater')
-@given(floats())
-def test_number_extracter_raises_TypeError_if_given_a_number(x):
-    with raises(TypeError):
-        assert _number_extracter(x, _float_sign_exp_re, *float_nolocale_nogroup)
+def tuple2(x, dummy):
+    """Make the input a tuple."""
+    return tuple(x)
 
 
-def test_number_extracter_includes_plus_sign_and_exponent_in_float_definition_for_signed_exp_floats_example():
-    assert _number_extracter('a5+5.034e-1', _float_sign_exp_re, *float_nolocale_nogroup) == ['a', 5.0, '', 0.5034]
+def test_parse_string_function_raises_TypeError_if_given_a_number_example():
+    with raises(TypeError):
+        assert _parse_string_function(0, '', _float_sign_exp_re.split, no_op, fast_float, tuple2)(50.0)
 
 
 @pytest.mark.skipif(not use_hypothesis, reason='requires python2.7 or greater')
-@given(lists(elements=floats() | text() | integers(), min_size=1, max_size=10))
-def test_number_extracter_includes_plus_sign_and_exponent_in_float_definition_for_signed_exp_floats(x):
-    assume(not any(type(y) == float and isnan(y) for y in x))
-    s = ''.join(repr(y) if type(y) in (float, long, int) else y for y in x)
-    assert _number_extracter(s, _float_sign_exp_re, *float_nolocale_nogroup) == float_splitter(s, True, True, '')
+@given(floats())
+def test_parse_string_function_raises_TypeError_if_given_a_number(x):
+    with raises(TypeError):
+        assert _parse_string_function(0, '', _float_sign_exp_re.split, no_op, fast_float, tuple2)(x)
 
 
-def test_number_extracter_excludes_plus_sign_in_float_definition_but_includes_exponent_for_unsigned_exp_floats_example():
-    assert _number_extracter('a5+5.034e-1', _float_nosign_exp_re, *float_nolocale_nogroup) == ['a', 5.0, '+', 0.5034]
+def test_parse_string_function_only_parses_digits_with_nosign_int_example():
+    assert _parse_string_function(0, '', _int_nosign_re.split, no_op, fast_int, tuple2)('a5+5.034e-1') == ('a', 5, '+', 5, '.', 34, 'e-', 1)
 
 
 @pytest.mark.skipif(not use_hypothesis, reason='requires python2.7 or greater')
 @given(lists(elements=floats() | text() | integers(), min_size=1, max_size=10))
-def test_number_extracter_excludes_plus_sign_in_float_definition_but_includes_exponent_for_unsigned_exp_floats(x):
-    assume(not any(type(y) == float and isnan(y) for y in x))
+@example([10000000000000000000000000000000000000000000000000000000000000000000000000,
+          100000000000000000000000000000000000000000000000000000000000000000000000000,
+          100000000000000000000000000000000000000000000000000000000000000000000000000])
+def test_parse_string_function_only_parses_digits_with_nosign_int(x):
     s = ''.join(repr(y) if type(y) in (float, long, int) else y for y in x)
-    assert _number_extracter(s, _float_nosign_exp_re, *float_nolocale_nogroup) == float_splitter(s, False, True, '')
+    assert _parse_string_function(0, '', _int_nosign_re.split, no_op, fast_int, tuple2)(s) == int_splitter(s, False, '')
 
 
-def test_number_extracter_includes_plus_and_minus_sign_in_float_definition_but_excludes_exponent_for_signed_noexp_floats_example():
-    assert _number_extracter('a5+5.034e-1', _float_sign_noexp_re, *float_nolocale_nogroup) == ['a', 5.0, '', 5.034, 'e', -1.0]
+def test_parse_string_function_parses_digit_with_sign_with_signed_int_example():
+    assert _parse_string_function(0, '', _int_sign_re.split, no_op, fast_int, tuple2)('a5+5.034e-1') == ('a', 5, '', 5, '.', 34, 'e', -1)
 
 
 @pytest.mark.skipif(not use_hypothesis, reason='requires python2.7 or greater')
 @given(lists(elements=floats() | text() | integers(), min_size=1, max_size=10))
-def test_number_extracter_includes_plus_and_minus_sign_in_float_definition_but_excludes_exponent_for_signed_noexp_floats(x):
-    assume(not any(type(y) == float and isnan(y) for y in x))
+def test_parse_string_function_parses_digit_with_sign_with_signed_int(x):
     s = ''.join(repr(y) if type(y) in (float, long, int) else y for y in x)
-    assert _number_extracter(s, _float_sign_noexp_re, *float_nolocale_nogroup) == float_splitter(s, True, False, '')
+    assert _parse_string_function(0, '', _int_sign_re.split, no_op, fast_int, tuple2)(s) == int_splitter(s, True, '')
 
 
-def test_number_extracter_excludes_plus_sign_and_exponent_in_float_definition_for_unsigned_noexp_floats_example():
-    assert _number_extracter('a5+5.034e-1', _float_nosign_noexp_re, *float_nolocale_nogroup) == ['a', 5.0, '+', 5.034, 'e-', 1.0]
+def test_parse_string_function_only_parses_float_with_nosign_noexp_float_example():
+    assert _parse_string_function(0, '', _float_nosign_noexp_re.split, no_op, fast_float, tuple2)('a5+5.034e-1') == ('a', 5.0, '+', 5.034, 'e-', 1.0)
 
 
 @pytest.mark.skipif(not use_hypothesis, reason='requires python2.7 or greater')
 @given(lists(elements=floats() | text() | integers(), min_size=1, max_size=10))
-def test_number_extracter_excludes_plus_sign_and_exponent_in_float_definition_for_unsigned_noexp_floats(x):
+def test_parse_string_function_only_parses_float_with_nosign_noexp_float(x):
     assume(not any(type(y) == float and isnan(y) for y in x))
     s = ''.join(repr(y) if type(y) in (float, long, int) else y for y in x)
-    assert _number_extracter(s, _float_nosign_noexp_re, *float_nolocale_nogroup) == float_splitter(s, False, False, '')
+    assert _parse_string_function(0, '', _float_nosign_noexp_re.split, no_op, fast_float, tuple2)(s) == float_splitter(s, False, False, '')
 
 
-def test_number_extracter_excludes_plus_and_minus_sign_in_int_definition_for_unsigned_ints_example():
-    assert _number_extracter('a5+5.034e-1', _int_nosign_re, *int_nolocale_nogroup) == ['a', 5, '+', 5, '.', 34, 'e-', 1]
+def test_parse_string_function_only_parses_float_with_exponent_with_nosign_exp_float_example():
+    assert _parse_string_function(0, '', _float_nosign_exp_re.split, no_op, fast_float, tuple2)('a5+5.034e-1') == ('a', 5.0, '+', 0.5034)
 
 
 @pytest.mark.skipif(not use_hypothesis, reason='requires python2.7 or greater')
 @given(lists(elements=floats() | text() | integers(), min_size=1, max_size=10))
-@example([10000000000000000000000000000000000000000000000000000000000000000000000000,
-          100000000000000000000000000000000000000000000000000000000000000000000000000,
-          100000000000000000000000000000000000000000000000000000000000000000000000000])
-def test_number_extracter_excludes_plus_and_minus_sign_in_int_definition_for_unsigned_ints(x):
-    s = ''.join(repr(y) if type(y) in (float, long, int) else y for y in x)
-    assert _number_extracter(s, _int_nosign_re, *int_nolocale_nogroup) == int_splitter(s, False, '')
-
-
-def test_number_extracter_includes_plus_and_minus_sign_in_int_definition_for_signed_ints_example():
-    assert _number_extracter('a5+5.034e-1', _int_sign_re, *int_nolocale_nogroup) == ['a', 5, '', 5, '.', 34, 'e', -1]
-
-
-@pytest.mark.skipif(not use_hypothesis, reason='requires python2.7 or greater')
-@given(lists(elements=floats() | text() | integers(), min_size=1, max_size=10))
-def test_number_extracter_includes_plus_and_minus_sign_in_int_definition_for_signed_ints(x):
+def test_parse_string_function_only_parses_float_with_exponent_with_nosign_exp_float(x):
+    assume(not any(type(y) == float and isnan(y) for y in x))
     s = ''.join(repr(y) if type(y) in (float, long, int) else y for y in x)
-    assert _number_extracter(s, _int_sign_re, *int_nolocale_nogroup) == int_splitter(s, True, '')
-
-
-def test_number_extracter_adds_leading_empty_string_if_input_begins_with_a_number_example():
-    assert _number_extracter('6a5+5.034e-1', _float_sign_exp_re, *float_nolocale_nogroup) == ['', 6.0, 'a', 5.0, '', 0.5034]
+    assert _parse_string_function(0, '', _float_nosign_exp_re.split, no_op, fast_float, tuple2)(s) == float_splitter(s, False, True, '')
 
 
-def test_number_extracter_doubles_letters_with_lowercase_version_with_groupletters_for_float_example():
-    assert _number_extracter('A5+5.034E-1', _float_sign_exp_re, *float_nolocale_group) == ['aA', 5.0, '', 0.5034]
+def test_parse_string_function_only_parses_float_with_sign_with_sign_noexp_float_example():
+    assert _parse_string_function(0, '', _float_sign_noexp_re.split, no_op, fast_float, tuple2)('a5+5.034e-1') == ('a', 5.0, '', 5.034, 'e', -1.0)
 
 
 @pytest.mark.skipif(not use_hypothesis, reason='requires python2.7 or greater')
 @given(lists(elements=floats() | text() | integers(), min_size=1, max_size=10))
-def test_number_extracter_doubles_letters_with_lowercase_version_with_groupletters_for_float(x):
+def test_parse_string_function_only_parses_float_with_sign_with_sign_noexp_float(x):
     assume(not any(type(y) == float and isnan(y) for y in x))
     s = ''.join(repr(y) if type(y) in (float, long, int) else y for y in x)
-    t = float_splitter(s, True, True, '')
-    t = [''.join([low(z) + z for z in y]) if type(y) != float else y for y in t]
-    assert _number_extracter(s, _float_sign_exp_re, *float_nolocale_group) == t
+    assert _parse_string_function(0, '', _float_sign_noexp_re.split, no_op, fast_float, tuple2)(s) == float_splitter(s, True, False, '')
 
 
-def test_number_extracter_doubles_letters_with_lowercase_version_with_groupletters_for_int_example():
-    assert _number_extracter('A5+5.034E-1', _int_nosign_re, *int_nolocale_group) == ['aA', 5, '++', 5, '..', 34, 'eE--', 1]
+def test_parse_string_function_parses_float_with_sign_exp_float_example():
+    assert _parse_string_function(0, '', _float_sign_exp_re.split, no_op, fast_float, tuple2)('a5+5.034e-1') == ('a', 5.0, '', 0.5034)
+    assert _parse_string_function(0, '', _float_sign_exp_re.split, no_op, fast_float, tuple2)('6a5+5.034e-1') == ('', 6.0, 'a', 5.0, '', 0.5034)
 
 
 @pytest.mark.skipif(not use_hypothesis, reason='requires python2.7 or greater')
 @given(lists(elements=floats() | text() | integers(), min_size=1, max_size=10))
-def test_number_extracter_doubles_letters_with_lowercase_version_with_groupletters_for_int(x):
+def test_parse_string_function_parses_float_with_sign_exp_float(x):
+    assume(not any(type(y) == float and isnan(y) for y in x))
     s = ''.join(repr(y) if type(y) in (float, long, int) else y for y in x)
-    t = int_splitter(s, False, '')
-    t = [''.join([low(z) + z for z in y]) if type(y) not in (int, long) else y for y in t]
-    assert _number_extracter(s, _int_nosign_re, *int_nolocale_group) == t
+    assert _parse_string_function(0, '', _float_sign_exp_re.split, no_op, fast_float, tuple2)(s) == float_splitter(s, True, True, '')
 
 
-def test_number_extracter_extracts_numbers_and_strxfrms_strings_with_use_locale_example():
-    load_locale('en_US')
-    strxfrm = get_strxfrm()
-    assert _number_extracter('A5+5.034E-1', _int_nosign_re, *int_locale_nogroup) == [strxfrm('A'), 5, strxfrm('+'), 5, strxfrm('.'), 34, strxfrm('E-'), 1]
-    locale.setlocale(locale.LC_NUMERIC, str(''))
+def test_parse_string_function_selects_pre_function_value_if_not_dumb():
+    def tuple2(x, orig):
+        """Make the input a tuple."""
+        return (orig[0], tuple(x))
+    assert _parse_string_function(0, '', _int_nosign_re.split, str.upper, fast_float, tuple2)('a5+5.034e-1') == ('A', ('A', 5, '+', 5, '.', 34, 'E-', 1))
+    assert _parse_string_function(ns._DUMB, '', _int_nosign_re.split, str.upper, fast_float, tuple2)('a5+5.034e-1') == ('A', ('A', 5, '+', 5, '.', 34, 'E-', 1))
+    assert _parse_string_function(ns.LOCALE, '', _int_nosign_re.split, str.upper, fast_float, tuple2)('a5+5.034e-1') == ('A', ('A', 5, '+', 5, '.', 34, 'E-', 1))
+    assert _parse_string_function(ns.LOCALE | ns._DUMB, '', _int_nosign_re.split, str.upper, fast_float, tuple2)('a5+5.034e-1') == ('a', ('A', 5, '+', 5, '.', 34, 'E-', 1))
 
 
-@pytest.mark.skipif(not use_hypothesis, reason='requires python2.7 or greater')
-@given(lists(elements=floats() | text() | integers(), min_size=1, max_size=10))
-def test_number_extracter_extracts_numbers_and_strxfrms_strings_with_use_locale(x):
-    load_locale('en_US')
-    assume(not any(any(i in bad_uni_chars for i in y) for y in x if isinstance(y, py23_str)))
-    s = ''.join(repr(y) if type(y) in (float, long, int) else y for y in x)
-    t = int_splitter(s, False, null_string)
-    try:  # Account for locale bug on Python 3.2
-        t = [y if i == 0 and y is null_string else locale_convert(y) if not isinstance(y, (float, long, int)) else y for i, y in enumerate(t)]
-        assert _number_extracter(s, _int_nosign_re, *int_locale_nogroup) == t
-    except OverflowError:
-        pass
-    locale.setlocale(locale.LC_NUMERIC, str(''))
+def test_parse_path_function_parses_string_as_path_then_as_string():
+    splt = _parse_string_function(0, '', _float_sign_exp_re.split, no_op, fast_float, tuple2)
+    assert _parse_path_function(splt)('/p/Folder (10)/file34.5nm (2).tar.gz') == (('/',), ('p', ), ('Folder (', 10.0, ')',), ('file', 34.5, 'nm (', 2.0, ')'), ('.tar',), ('.gz',))
 
 
 def test__natsort_key_with_nan_input_transforms_nan_to_negative_inf():
author	Seth M Morton <seth.m.morton@gmail.com>	2016-05-01 16:41:31 -0700
committer	Seth M Morton <seth.m.morton@gmail.com>	2016-05-01 16:41:31 -0700
commit	d741bc05b820443e8076d2b63d64ed98ba860306 (patch)
tree	d9b45153c3a0acdbbdfd7800e280a2b9b94d688a
parent	df00419f4e1a1c4ec469b4b3ae76f98cb7c12c3a (diff)
download	natsort-d741bc05b820443e8076d2b63d64ed98ba860306.tar.gz