bugfix: bcrypt os_crypt backend: fix some more crypt.crypt() utf8 encoding issues

which were causing it to fail to generate a hash. bcrypt ------ * _norm_digest(): fixed some PasslibValueError()s being thrown by os_crypt backend during UT fuzz verifier. These were due to non-UTF8 input being provided to crypt.crypt()... even though secret itself was UTF8 safe! This was because secret was being truncated/repeated as part of various backend bug workarounds; and the truncate/repeat operations weren't being done in manner that respected UTF8 character boundaries. This has now been fixed via _require_valid_utf8_bytes flag (which has been set for os_crypt backend), that enables utf8-safe mode of operation. utils ----- * added utf8_truncate() and utf8_repeat_string() helpers, for bcrypt fixes above. * simplified repeat_string() internals
author: Eli Collins <elic@assurancetechnologies.com> 2020-05-12 12:11:57 -0400
committer: Eli Collins <elic@assurancetechnologies.com> 2020-05-12 12:11:57 -0400
commit: c8d8ad8e90dac480b8c5ab1019266b34026b9b30 (patch)
tree: eeb7d97b312e6a69f662a6aa833c3c91ec36c01c
parent: cfdce7ee2911d90c2b4204d67e741ad9dd6b4e69 (diff)
download: passlib-c8d8ad8e90dac480b8c5ab1019266b34026b9b30.tar.gz
4 files changed, 237 insertions, 11 deletions
diff --git a/docs/history/1.7.rst b/docs/history/1.7.rst
index 12eb363..6497550 100644
--- a/docs/history/1.7.rst
+++ b/docs/history/1.7.rst
@@ -62,6 +62,8 @@ Other Changes
   These can't be passed through, due to limitation in stdlib's :func:`!crypt.crypt`.
   (Prior to this release, it would it incorrectly raise :exc:`~passlib.exc.MissingBackendError` instead).
 
+  Also improved legacy bcrypt format workarounds, to support a few more UTF8 edge cases than before.
+
 * Modified some internals to help run on FIPS systems (:issue:`116`):
 
   In particular, when MD5 hash is not available, :class:`~passlib.hash.hex_md5`
diff --git a/passlib/handlers/bcrypt.py b/passlib/handlers/bcrypt.py
index 5e5838a..323f2a0 100644
--- a/passlib/handlers/bcrypt.py
+++ b/passlib/handlers/bcrypt.py
@@ -27,7 +27,8 @@ _builtin_bcrypt = None  # dynamically imported by _load_backend_builtin()
 from passlib.crypto.digest import compile_hmac
 from passlib.exc import PasslibHashWarning, PasslibSecurityWarning, PasslibSecurityError
 from passlib.utils import safe_crypt, repeat_string, to_bytes, parse_version, \
-                          rng, getrandstr, test_crypt, to_unicode
+                          rng, getrandstr, test_crypt, to_unicode, \
+                          utf8_truncate, utf8_repeat_string
 from passlib.utils.binary import bcrypt64
 from passlib.utils.compat import get_unbound_method_function
 from passlib.utils.compat import u, uascii_to_str, unicode, str_to_uascii, PY3, error_from
@@ -158,6 +159,7 @@ class _BcryptCommon(uh.SubclassBackendMixin, uh.TruncateMixin, uh.HasManyIdents,
     _lacks_2y_support = False
     _lacks_2b_support = False
     _fallback_ident = IDENT_2A
+    _require_valid_utf8_bytes = False
 
     #===================================================================
     # formatting
@@ -486,8 +488,18 @@ class _BcryptCommon(uh.SubclassBackendMixin, uh.TruncateMixin, uh.HasManyIdents,
     @classmethod
     def _norm_digest_args(cls, secret, ident, new=False):
         # make sure secret is unicode
+        require_valid_utf8_bytes = cls._require_valid_utf8_bytes
         if isinstance(secret, unicode):
             secret = secret.encode("utf-8")
+        elif require_valid_utf8_bytes:
+            # if backend requires utf8 bytes (os_crypt);
+            # make sure input actually is utf8, or don't bother enabling utf-8 specific helpers.
+            try:
+                secret.decode("utf-8")
+            except UnicodeDecodeError:
+                # XXX: could just throw PasswordValueError here, backend will just do that
+                #      when _calc_digest() is actually called.
+                require_valid_utf8_bytes = False
 
         # check max secret size
         uh.validate_secret(secret)
@@ -508,7 +520,15 @@ class _BcryptCommon(uh.SubclassBackendMixin, uh.TruncateMixin, uh.HasManyIdents,
         # bcrypt only uses first 72 bytes anyways.
         # NOTE: not needed for 2y/2b, but might use 2a as fallback for them.
         if cls._has_2a_wraparound_bug and len(secret) >= 255:
-            secret = secret[:72]
+            if require_valid_utf8_bytes:
+                # backend requires valid utf8 bytes, so truncate secret to nearest valid segment.
+                # want to do this in constant time to not give away info about secret.
+                # NOTE: this only works because bcrypt will ignore everything past
+                #       secret[71], so padding to include a full utf8 sequence
+                #       won't break anything about the final output.
+                secret = utf8_truncate(secret, 72)
+            else:
+                secret = secret[:72]
 
         # special case handling for variants (ordered most common first)
         if ident == IDENT_2A:
@@ -535,7 +555,13 @@ class _BcryptCommon(uh.SubclassBackendMixin, uh.TruncateMixin, uh.HasManyIdents,
                 # we can fake $2$ behavior using the 2A/2Y/2B algorithm
                 # by repeating the password until it's at least 72 chars in length.
                 if secret:
-                    secret = repeat_string(secret, 72)
+                    if require_valid_utf8_bytes:
+                        # NOTE: this only works because bcrypt will ignore everything past
+                        #       secret[71], so padding to include a full utf8 sequence
+                        #       won't break anything about the final output.
+                        secret = utf8_repeat_string(secret, 72)
+                    else:
+                        secret = repeat_string(secret, 72)
                 ident = cls._fallback_ident
 
         elif ident == IDENT_2X:
@@ -745,6 +771,10 @@ class _OsCryptBackend(_BcryptCommon):
     backend which uses :func:`crypt.crypt`
     """
 
+    #: set flag to ensure _prepare_digest_args() doesn't create invalid utf8 string
+    #: when truncating bytes.
+    _require_valid_utf8_bytes = PY3
+
     @classmethod
     def _load_backend_mixin(mixin_cls, name, dryrun):
         if not test_crypt("test", TEST_HASH_2A):
diff --git a/passlib/tests/test_utils.py b/passlib/tests/test_utils.py
index 3fd0001..f3a275b 100644
--- a/passlib/tests/test_utils.py
+++ b/passlib/tests/test_utils.py
@@ -413,6 +413,125 @@ class MiscTest(TestCase):
         self.assertEqual(splitcomma(" a , b"), ['a', 'b'])
         self.assertEqual(splitcomma(" a, b, "), ['a', 'b'])
 
+    def test_utf8_truncate(self):
+        """
+        utf8_truncate()
+        """
+        from passlib.utils import utf8_truncate
+
+        #
+        # run through a bunch of reference strings,
+        # and make sure they truncate properly across all possible indexes
+        #
+        for source in [
+            # empty string
+            b"",
+            # strings w/ only single-byte chars
+            b"1",
+            b"123",
+            b'\x1a',
+            b'\x1a' * 10,
+            b'\x7f',
+            b'\x7f' * 10,
+            # strings w/ properly formed UTF8 continuation sequences
+            b'a\xc2\xa0\xc3\xbe\xc3\xbe',
+            b'abcdefghjusdfaoiu\xc2\xa0\xc3\xbe\xc3\xbedsfioauweoiruer',
+        ]:
+            source.decode("utf-8") # sanity check - should always be valid UTF8
+
+            end = len(source)
+            for idx in range(end + 16):
+                prefix = "source=%r index=%r: " % (source, idx)
+
+                result = utf8_truncate(source, idx)
+
+                # result should always be valid utf-8
+                result.decode("utf-8")
+
+                # result should never be larger than source
+                self.assertLessEqual(len(result), end, msg=prefix)
+
+                # result should always be in range(idx, idx+4)
+                self.assertGreaterEqual(len(result), min(idx, end), msg=prefix)
+                self.assertLess(len(result), min(idx + 4, end + 1), msg=prefix)
+
+                # should be strict prefix of source
+                self.assertEqual(result, source[:len(result)], msg=prefix)
+
+        #
+        # malformed utf8 --
+        # strings w/ only initial chars (should cut just like single-byte chars)
+        #
+        for source in [
+            b'\xca',
+            b'\xca' * 10,
+            # also test null bytes (not valid utf8, but this func should treat them like ascii)
+            b'\x00',
+            b'\x00' * 10,
+        ]:
+            end = len(source)
+            for idx in range(end + 16):
+                prefix = "source=%r index=%r: " % (source, idx)
+                result = utf8_truncate(source, idx)
+                self.assertEqual(result, source[:idx], msg=prefix)
+
+        #
+        # malformed utf8 --
+        # strings w/ only continuation chars (should cut at index+3)
+        #
+        for source in [
+            b'\xaa',
+            b'\xaa' * 10,
+        ]:
+            end = len(source)
+            for idx in range(end + 16):
+                prefix = "source=%r index=%r: " % (source, idx)
+                result = utf8_truncate(source, idx)
+                self.assertEqual(result, source[:idx+3], msg=prefix)
+
+        #
+        # string w/ some invalid utf8 --
+        # * \xaa byte is too many continuation byte after \xff start byte
+        # * \xab byte doesn't have preceding start byte
+        # XXX: could also test continuation bytes w/o start byte, WITHIN the string.
+        #      but think this covers edges well enough...
+        #
+        source = b'MN\xff\xa0\xa1\xa2\xaaOP\xab'
+
+        self.assertEqual(utf8_truncate(source, 0), b'')  # index="M", stops there
+
+        self.assertEqual(utf8_truncate(source, 1), b'M')  # index="N", stops there
+
+        self.assertEqual(utf8_truncate(source, 2), b'MN')  # index="\xff", stops there
+
+        self.assertEqual(utf8_truncate(source, 3),
+                         b'MN\xff\xa0\xa1\xa2')  # index="\xa0", runs out after index+3="\xa2"
+
+        self.assertEqual(utf8_truncate(source, 4),
+                         b'MN\xff\xa0\xa1\xa2\xaa')  # index="\xa1", runs out after index+3="\xaa"
+
+        self.assertEqual(utf8_truncate(source, 5),
+                         b'MN\xff\xa0\xa1\xa2\xaa')  # index="\xa2", stops before "O"
+
+        self.assertEqual(utf8_truncate(source, 6),
+                         b'MN\xff\xa0\xa1\xa2\xaa')  # index="\xaa", stops before "O"
+
+        self.assertEqual(utf8_truncate(source, 7),
+                         b'MN\xff\xa0\xa1\xa2\xaa')  # index="O", stops there
+
+        self.assertEqual(utf8_truncate(source, 8),
+                         b'MN\xff\xa0\xa1\xa2\xaaO')  # index="P", stops there
+
+        self.assertEqual(utf8_truncate(source, 9),
+                         b'MN\xff\xa0\xa1\xa2\xaaOP\xab')  # index="\xab", runs out at end
+
+        self.assertEqual(utf8_truncate(source, 10),
+                         b'MN\xff\xa0\xa1\xa2\xaaOP\xab')  # index=end
+
+        self.assertEqual(utf8_truncate(source, 11),
+                         b'MN\xff\xa0\xa1\xa2\xaaOP\xab')  # index=end+1
+
+
 #=============================================================================
 # byte/unicode helpers
 #=============================================================================
diff --git a/passlib/utils/__init__.py b/passlib/utils/__init__.py
index cb8872c..386ef65 100644
--- a/passlib/utils/__init__.py
+++ b/passlib/utils/__init__.py
@@ -55,7 +55,7 @@ from passlib.utils.decor import (
     classproperty,
     hybrid_method,
 )
-from passlib.exc import ExpectedStringError
+from passlib.exc import ExpectedStringError, ExpectedTypeError
 from passlib.utils.compat import (add_doc, join_bytes, join_byte_values,
                                   join_byte_elems, irange, imap, PY3, u,
                                   join_unicode, unicode, byte_elem_value, nextgetter,
@@ -574,13 +574,20 @@ def xor_bytes(left, right):
     return int_to_bytes(bytes_to_int(left) ^ bytes_to_int(right), len(left))
 
 def repeat_string(source, size):
-    """repeat or truncate <source> string, so it has length <size>"""
-    cur = len(source)
-    if size > cur:
-        mult = (size+cur-1)//cur
-        return (source*mult)[:size]
-    else:
-        return source[:size]
+    """
+    repeat or truncate <source> string, so it has length <size>
+    """
+    mult = 1 + (size - 1) // len(source)
+    return (source * mult)[:size]
+
+
+def utf8_repeat_string(source, size):
+    """
+    variant of repeat_string() which truncates to nearest UTF8 boundary.
+    """
+    mult = 1 + (size - 1) // len(source)
+    return utf8_truncate(source * mult, size)
+
 
 _BNULL = b"\x00"
 _UNULL = u("\x00")
@@ -595,6 +602,74 @@ def right_pad_string(source, size, pad=None):
     else:
         return source[:size]
 
+
+def utf8_truncate(source, index):
+    """
+    helper to truncate UTF8 byte string to nearest character boundary ON OR AFTER <index>.
+    returned prefix will always have length of at least <index>, and will stop on the
+    first byte that's not a UTF8 continuation byte (128 - 191 inclusive).
+    since utf8 should never take more than 4 bytes to encode known unicode values,
+    we can stop after ``index+3`` is reached.
+
+    :param bytes source:
+    :param int index:
+    :rtype: bytes
+    """
+    # general approach:
+    #
+    # * UTF8 bytes will have high two bits (0xC0) as one of:
+    #   00 -- ascii char
+    #   01 -- ascii char
+    #   10 -- continuation of multibyte char
+    #   11 -- start of multibyte char.
+    #   thus we can cut on anything where high bits aren't "10" (0x80; continuation byte)
+    #
+    # * UTF8 characters SHOULD always be 1 to 4 bytes, though they may be unbounded.
+    #   so we just keep going until first non-continuation byte is encountered, or end of str.
+    #   this should work predictably even for malformed/non UTF8 inputs.
+
+    if not isinstance(source, bytes):
+        raise ExpectedTypeError(source, bytes, "source")
+
+    # validate index
+    end = len(source)
+    if index < 0:
+        index = max(0, index + end)
+    if index >= end:
+        return source
+
+    # can stop search after 4 bytes, won't ever have longer utf8 sequence.
+    end = min(index + 3, end)
+
+    # loop until we find non-continuation byte
+    while index < end:
+        if byte_elem_value(source[index]) & 0xC0 != 0x80:
+            # found single-char byte, or start-char byte.
+            break
+        # else: found continuation byte.
+        index += 1
+    else:
+        assert index == end
+
+    # truncate at final index
+    result = source[:index]
+
+    def sanity_check():
+        # try to decode source
+        try:
+            text = source.decode("utf-8")
+        except UnicodeDecodeError:
+            # if source isn't valid utf8, byte level match is enough
+            return True
+
+        # validate that result was cut on character boundary
+        assert text.startswith(result.decode("utf-8"))
+        return True
+
+    assert sanity_check()
+
+    return result
+
 #=============================================================================
 # encoding helpers
 #=============================================================================
author	Eli Collins <elic@assurancetechnologies.com>	2020-05-12 12:11:57 -0400
committer	Eli Collins <elic@assurancetechnologies.com>	2020-05-12 12:11:57 -0400
commit	c8d8ad8e90dac480b8c5ab1019266b34026b9b30 (patch)
tree	eeb7d97b312e6a69f662a6aa833c3c91ec36c01c
parent	cfdce7ee2911d90c2b4204d67e741ad9dd6b4e69 (diff)
download	passlib-c8d8ad8e90dac480b8c5ab1019266b34026b9b30.tar.gz