4 files changed, 97 insertions, 217 deletions
diff --git a/embed.fnc b/embed.fnc
index 7d851bae01..8dba57ce12 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1122,11 +1122,6 @@ p	|void	|init_uniprops
 #ifdef PERL_IN_UTF8_C
 STR	|U8	|to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp  \
 		|const char dummy
-#  ifndef UV_IS_QUAD
-STR	|int	|is_utf8_cp_above_31_bits|NN const U8 * const s		    \
-					 |NN const U8 * const e		    \
-					 |const bool consider_overlongs
-#  endif
 #endif
 #if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
 p	|UV	|_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_or_s
diff --git a/embed.h b/embed.h
index e26c96149c..cece9a798c 100644
--- a/embed.h
+++ b/embed.h
@@ -1559,11 +1559,6 @@
 #define utf16_textfilter(a,b,c)	S_utf16_textfilter(aTHX_ a,b,c)
 #    endif
 #  endif
-#  if !defined(UV_IS_QUAD)
-#    if defined(PERL_IN_UTF8_C)
-#define is_utf8_cp_above_31_bits	S_is_utf8_cp_above_31_bits
-#    endif
-#  endif
 #  if !defined(WIN32)
 #define do_exec3(a,b,c)		Perl_do_exec3(aTHX_ a,b,c)
 #  endif
diff --git a/proto.h b/proto.h
index 94fdfecbb3..7083f13135 100644
--- a/proto.h
+++ b/proto.h
@@ -4564,15 +4564,6 @@ STATIC void	S_validate_suid(pTHX_ PerlIO *rsfp);
 /* PERL_CALLCONV void	CopFILEGV_set(pTHX_ COP * c, GV * gv); */
 #define PERL_ARGS_ASSERT_COPFILEGV_SET
 #endif
-#if !defined(UV_IS_QUAD)
-#  if defined(PERL_IN_UTF8_C)
-STATIC int	S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e, const bool consider_overlongs)
-			__attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS	\
-	assert(s); assert(e)
-
-#  endif
-#endif
 #if !defined(WIN32)
 PERL_CALLCONV bool	Perl_do_exec3(pTHX_ const char *incmd, int fd, int do_report);
 #define PERL_ARGS_ASSERT_DO_EXEC3	\
diff --git a/utf8.c b/utf8.c
index f4edc05025..5877add9f2 100644
--- a/utf8.c
+++ b/utf8.c
@@ -534,168 +534,6 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
     return uvchr_to_utf8_flags(d, uv, flags);
 }
 
-#ifndef UV_IS_QUAD
-
-STATIC int
-S_is_utf8_cp_above_31_bits(const U8 * const s,
-                           const U8 * const e,
-                           const bool consider_overlongs)
-{
-    /* Returns TRUE if the first code point represented by the Perl-extended-
-     * UTF-8-encoded string starting at 's', and looking no further than 'e -
-     * 1' doesn't fit into 31 bytes.  That is, that if it is >= 2**31.
-     *
-     * The function handles the case where the input bytes do not include all
-     * the ones necessary to represent a full character.  That is, they may be
-     * the intial bytes of the representation of a code point, but possibly
-     * the final ones necessary for the complete representation may be beyond
-     * 'e - 1'.
-     *
-     * The function also can handle the case where the input is an overlong
-     * sequence.  If 'consider_overlongs' is 0, the function assumes the
-     * input is not overlong, without checking, and will return based on that
-     * assumption.  If this parameter is 1, the function will go to the trouble
-     * of figuring out if it actually evaluates to above or below 31 bits.
-     *
-     * The sequence is otherwise assumed to be well-formed, without checking.
-     */
-
-    const STRLEN len = e - s;
-    int is_overlong;
-
-    PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
-
-    assert(! UTF8_IS_INVARIANT(*s) && e > s);
-
-#ifdef EBCDIC
-
-    PERL_UNUSED_ARG(consider_overlongs);
-
-    /* On the EBCDIC code pages we handle, only the native start byte 0xFE can
-     * mean a 32-bit or larger code point (0xFF is an invariant).  0xFE can
-     * also be the start byte for a 31-bit code point; we need at least 2
-     * bytes, and maybe up through 8 bytes, to determine that.  (It can also be
-     * the start byte for an overlong sequence, but for 30-bit or smaller code
-     * points, so we don't have to worry about overlongs on EBCDIC.) */
-    if (*s != 0xFE) {
-        return 0;
-    }
-
-    if (len == 1) {
-        return -1;
-    }
-
-#else
-
-    /* On ASCII, FE and FF are the only start bytes that can evaluate to
-     * needing more than 31 bits. */
-    if (LIKELY(*s < 0xFE)) {
-        return 0;
-    }
-
-    /* What we have left are FE and FF.  Both of these require more than 31
-     * bits unless they are for overlongs. */
-    if (! consider_overlongs) {
-        return 1;
-    }
-
-    /* Here, we have FE or FF.  If the input isn't overlong, it evaluates to
-     * above 31 bits.  But we need more than one byte to discern this, so if
-     * passed just the start byte, it could be an overlong evaluating to
-     * smaller */
-    if (len == 1) {
-        return -1;
-    }
-
-    /* Having excluded len==1, and knowing that FE and FF are both valid start
-     * bytes, we can call the function below to see if the sequence is
-     * overlong.  (We don't need the full generality of the called function,
-     * but for these huge code points, speed shouldn't be a consideration, and
-     * the compiler does have enough information, since it's static to this
-     * file, to optimize to just the needed parts.) */
-    is_overlong = is_utf8_overlong(s, len);
-
-    /* If it isn't overlong, more than 31 bits are required. */
-    if (is_overlong == 0) {
-        return 1;
-    }
-
-    /* If it is indeterminate if it is overlong, return that */
-    if (is_overlong < 0) {
-        return -1;
-    }
-
-    /* Here is overlong.  Such a sequence starting with FE is below 31 bits, as
-     * the max it can be is 2**31 - 1 */
-    if (*s == 0xFE) {
-        return 0;
-    }
-
-#endif
-
-    /* Here, ASCII and EBCDIC rejoin:
-    *  On ASCII:   We have an overlong sequence starting with FF
-    *  On EBCDIC:  We have a sequence starting with FE. */
-
-    {   /* For C89, use a block so the declaration can be close to its use */
-
-#ifdef EBCDIC
-
-        /* U+7FFFFFFF (2 ** 31 - 1)
-         *              [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10  11  12  13
-         *   IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
-         *    IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
-         *   POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
-         *         I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
-         * U+80000000 (2 ** 31):
-         *   IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
-         *    IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
-         *   POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
-         *         I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
-         *
-         * and since we know that *s = \xfe, any continuation sequcence
-         * following it that is gt the below is above 31 bits
-                                                [0] [1] [2] [3] [4] [5] [6] */
-        const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42";
-
-#else
-
-        /* FF overlong for U+7FFFFFFF (2 ** 31 - 1)
-         *      ASCII: \xFF\x80\x80\x80\x80\x80\x80\x81\xBF\xBF\xBF\xBF\xBF
-         * FF overlong for U+80000000 (2 ** 31):
-         *      ASCII: \xFF\x80\x80\x80\x80\x80\x80\x82\x80\x80\x80\x80\x80
-         * and since we know that *s = \xff, any continuation sequcence
-         * following it that is gt the below is above 30 bits
-                                                [0] [1] [2] [3] [4] [5] [6] */
-        const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81";
-
-
-#endif
-        const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1;
-        const STRLEN cmp_len = MIN(conts_len, len - 1);
-
-        /* Now compare the continuation bytes in s with the ones we have
-         * compiled in that are for the largest 30 bit code point.  If we have
-         * enough bytes available to determine the answer, or the bytes we do
-         * have differ from them, we can compare the two to get a definitive
-         * answer (Note that in UTF-EBCDIC, the two lowest possible
-         * continuation bytes are \x41 and \x42.) */
-        if (cmp_len >= conts_len || memNE(s + 1,
-                                          conts_for_highest_30_bit,
-                                          cmp_len))
-        {
-            return cBOOL(memGT(s + 1, conts_for_highest_30_bit, cmp_len));
-        }
-
-        /* Here, all the bytes we have are the same as the highest 30-bit code
-         * point, but we are missing so many bytes that we can't make the
-         * determination */
-        return -1;
-    }
-}
-
-#endif
-
 PERL_STATIC_INLINE int
 S_is_utf8_overlong(const U8 * const s, const STRLEN len)
 {
@@ -843,57 +681,118 @@ S_does_utf8_overflow(const U8 * const s,
      * convert each byte to I8, but it's very rare input indeed that would
      * approach overflow, so the loop below will likely only get executed once.)
      *
-     * 'e' - 1 must not be beyond a full character. */
-
+     */
+    const STRLEN len = e - s;
+    const U8 *x;
+    const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF;
+    int is_overlong = 0;
 
     PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
-    assert(s <= e && s + UTF8SKIP(s) >= e);
 
-#if ! defined(UV_IS_QUAD)
+    for (x = s; x < e; x++, y++) {
 
-    return is_utf8_cp_above_31_bits(s, e, consider_overlongs);
+        /* 'y' is set up to not include the trailing bytes that are all the
+         * maximum possible continuation byte.  So when we reach the end of 'y'
+         * (known to be NUL terminated), it is impossible for 'x' to contain
+         * bytes larger than those omitted bytes, and therefore 'x' can't
+         * overflow */
+        if (*y == '\0') {
+            return 0;
+        }
 
-#else
+        /* If this byte is less than the corresponding highest non-overflowing
+         * UTF-8, the sequence doesn't overflow */
+        if (NATIVE_UTF8_TO_I8(*x) < *y) {
+            return 0;
+        }
 
-    PERL_UNUSED_ARG(consider_overlongs);
+        if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
+            goto overflows_if_not_overlong;
+        }
+    }
 
-    {
-        const STRLEN len = e - s;
-        const U8 *x;
-        const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF;
-
-        for (x = s; x < e; x++, y++) {
-
-            /* 'y' is set up to not include the trailing bytes that are all the
-             * maximum possible continuation byte.  So when we reach the end of
-             * 'y' (known to be NUL terminated), it is impossible for 'x' to
-             * contain bytes larger than those omitted bytes, and therefore 'x'
-             * can't overflow */
-            if (*y == '\0') {
-                return 0;
-            }
+    /* Got to the end, and all bytes are the same.  If the input is a whole
+     * character, it doesn't overflow.  And if it is a partial character,
+     * there's not enough information to tell */
+    return (len >= STRLENs(HIGHEST_REPRESENTABLE_UTF)) ? 0 : -1;
 
-            if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
-                continue;
-            }
+  overflows_if_not_overlong:
 
-            /* If this byte is larger than the corresponding highest UTF-8
-             * byte, the sequence overflow; otherwise the byte is less than,
-             * and so the sequence doesn't overflow */
-            return NATIVE_UTF8_TO_I8(*x) > *y;
+    /* Here, a well-formed sequence overflows.  If we are assuming
+     * well-formedness, return that it overflows. */
+    if (! consider_overlongs) {
+        return 1;
+    }
 
-        }
+    /* Here, it could be the overlong malformation, and might not actuallly
+     * overflow if you were to calculate it out.
+     *
+     * See if it actually is overlong */
+    is_overlong = is_utf8_overlong(s, len);
 
-        /* Got to the end and all bytes are the same.  If the input is a whole
-         * character, it doesn't overflow.  And if it is a partial character,
-         * there's not enough information to tell */
-        if (len < STRLENs(HIGHEST_REPRESENTABLE_UTF)) {
-            return -1;
-        }
+    /* If it isn't overlong, is well-formed, so overflows */
+    if (is_overlong == 0) {
+        return 1;
+    }
 
+    /* Not long enough to determine */
+    if (is_overlong < 0) {
+        return -1;
+    }
+
+    /* Here, it appears to overflow, but it is also overlong */
+
+#if 6 * UTF_CONTINUATION_BYTE_INFO_BITS <= IVSIZE * CHARBITS
+
+    /* On many platforms, it is impossible for an overlong to overflow.  For
+     * these, no further work is necessary: we can return immediately that this
+     * overlong that is an apparent overflow actually isn't
+     *
+     * To see why, note that a length_N sequence can represent as overlongs all
+     * the code points representable by shorter length sequences, but no
+     * higher.  If it could represent a higher code point without being an
+     * overlong, we wouldn't have had to increase the sequence length!
+     *
+     * The highest possible start byte is FF; the next highest is FE.  The
+     * highest code point representable as an overlong on the platform is thus
+     * the highest code point representable by a non-overlong sequence whose
+     * start byte is FE.  If that value doesn't overflow the platform's word
+     * size, overlongs can't overflow.
+     *
+     * FE consists of 7 bytes total; the FE start byte contributes 0 bits of
+     * information (the high 7 bits, all ones, say that the sequence is 7 bytes
+     * long, and the bottom, zero, bit is s placeholder. That leaves the 6
+     * continuation bytes to contribute UTF_CONTINUATION_BYTE_INFO_BITS each.
+      If that number of bits doesn't exceed the word size, it can't overflow. */
+
+    return 0;
+
+#else
+
+    /* In practice, only a 32-bit ASCII box gets here.  The FE start byte can
+     * represent, as an overlong, the highest code point representable by an FD
+     * start byte, which is 5*6 continuation bytes of info plus one bit from
+     * the start byte, or 31 bits.  That doesn't overflow.  More explicitly:
+     * \xFD\xBF\xBF\xBF\xBF\xBF evaluates to 0x7FFFFFFF = 2*31 - 1.
+     *
+     * That means only the FF start byte can have an overflowing overlong. */
+    if (*s < 0xFF) {
         return 0;
     }
 
+    /* The sequence \xff\x80\x80\x80\x80\x80\x80\x82 is an overlong that
+     * evaluates to 2**31, so overflows an IV.  For a UV it's
+     *              \xff\x80\x80\x80\x80\x80\x80\x83 = 2**32 */
+#  define OVERFLOWS  "\xff\x80\x80\x80\x80\x80\x80\x82"
+
+    if (e - s < (Ptrdiff_t) STRLENs(OVERFLOWS)) {   /* Not enough info */
+         return -1;
+    }
+
+#  define strnGE(s1,s2,l) (strncmp(s1,s2,l) >= 0)
+
+    return strnGE((const char *) s, OVERFLOWS, STRLENs(OVERFLOWS));
+
 #endif
 
 }