utf8.c: Improve algorithm for detecting overflow

The code has hard-coded into it the UTF-8 for the highest representable code point for various platforms and word sizes. The algorithm is to compare the input sequence to verify it is <= the highest. But the tail of each of them has some number of the highest possible continuation byte. We need not look at the tail, as the input cannot be above the highest possible. This commit shortens the highest string constants and exits the loop when we get to where the tail used to be. This change allows for the complete removal of the code that is #ifdef'd out that would be used when we allow core to use code points up to UV_MAX.
author: Karl Williamson <khw@cpan.org> 2021-06-27 03:20:07 -0600
committer: Karl Williamson <khw@cpan.org> 2021-08-07 05:14:44 -0600
commit: 86fb75ad16dc25aa240885d6da2eb32c9a3685a1 (patch)
tree: cfb7768401492eb2d01f0f052f85ac1c3b708dbf /utf8.c
parent: c5b2813428f68013e30d129e5eac9481bb0b105f (diff)
download: perl-86fb75ad16dc25aa240885d6da2eb32c9a3685a1.tar.gz
1 files changed, 25 insertions, 61 deletions
diff --git a/utf8.c b/utf8.c
index e5ee90927e..7063afd5f1 100644
--- a/utf8.c
+++ b/utf8.c
@@ -800,13 +800,23 @@ S_isFF_overlong(const U8 * const s, const STRLEN len)
     return -1;
 }
 
-#if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1 */
-#  ifdef EBCDIC     /* Actually is I8 */
-#   define HIGHEST_REPRESENTABLE_UTF8                                       \
-                "\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
+/* At some point we may want to allow core to use up to UV_MAX */
+
+#ifdef EBCDIC     /* Actually is I8 */
+#  if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1, UV_MAX 2**64-1 */
+#    define HIGHEST_REPRESENTABLE_UTF8  "\xFF\xA7"
+                              /* UV_MAX "\xFF\xAF" */
+#  else      /* These assume IV_MAX is 2**31-1, UV_MAX 2**32-1 */
+#    define HIGHEST_REPRESENTABLE_UTF8  "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1"
+                              /* UV_MAX "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3" */
+#  endif
+#else
+#  if defined(UV_IS_QUAD)
+#    define HIGHEST_REPRESENTABLE_UTF8  "\xFF\x80\x87"
+                              /* UV_MAX "\xFF\x80" */
 #  else
-#   define HIGHEST_REPRESENTABLE_UTF8                                       \
-                "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
+#    define HIGHEST_REPRESENTABLE_UTF8  "\xFD"
+                              /* UV_MAX "\xFE\x83" */
 #  endif
 #endif
 
@@ -854,6 +864,15 @@ S_does_utf8_overflow(const U8 * const s,
 
         for (x = s; x < e; x++, y++) {
 
+            /* 'y' is set up to not include the trailing bytes that are all the
+             * maximum possible continuation byte.  So when we reach the end of
+             * 'y' (known to be NUL terminated), it is impossible for 'x' to
+             * contain bytes larger than those omitted bytes, and therefore 'x'
+             * can't overflow */
+            if (*y == '\0') {
+                return 0;
+            }
+
             if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
                 continue;
             }
@@ -879,61 +898,6 @@ S_does_utf8_overflow(const U8 * const s,
 
 }
 
-#if 0
-
-/* This is the portions of the above function that deal with UV_MAX instead of
- * IV_MAX.  They are left here in case we want to combine them so that internal
- * uses can have larger code points.  The only logic difference is that the
- * 32-bit EBCDIC platform is treate like the 64-bit, and the 32-bit ASCII has
- * different logic.
- */
-
-/* Anything larger than this will overflow the word if it were converted into a UV */
-#if defined(UV_IS_QUAD)
-#  ifdef EBCDIC     /* Actually is I8 */
-#   define HIGHEST_REPRESENTABLE_UTF8                                       \
-                "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
-#  else
-#   define HIGHEST_REPRESENTABLE_UTF8                                       \
-                "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
-#  endif
-#else   /* 32-bit */
-#  ifdef EBCDIC
-#   define HIGHEST_REPRESENTABLE_UTF8                                       \
-                "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
-#  else
-#   define HIGHEST_REPRESENTABLE_UTF8  "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
-#  endif
-#endif
-
-#ifndef HAS_EXTRA_LONG_UTF8
-
-    /* On 32 bit ASCII machines, many overlongs that start with FF don't
-     * overflow */
-    if (consider_overlongs && isFF_OVERLONG(s, len) > 0) {
-
-        /* To be such an overlong, the first bytes of 's' must match
-         * FF_OVERLONG_PREFIX, which is "\xff\x80\x80\x80\x80\x80\x80".  If we
-         * don't have any additional bytes available, the sequence, when
-         * completed might or might not fit in 32 bits.  But if we have that
-         * next byte, we can tell for sure.  If it is <= 0x83, then it does
-         * fit. */
-        if (len <= STRLENs(FF_OVERLONG_PREFIX)) {
-            return -1;
-        }
-
-        return s[STRLENs(FF_OVERLONG_PREFIX)] > 0x83;
-    }
-
-/* Starting with the #else, the rest of the function is identical except
- *      1.  we need to move the 'len' declaration to be global to the function
- *      2.  the endif move to just after the UNUSED_ARG.
- * An empty endif is given just below to satisfy the preprocessor
- */
-#endif
-
-#endif
-
 #undef FF_OVERLONG_PREFIX
 
 STRLEN
author	Karl Williamson <khw@cpan.org>	2021-06-27 03:20:07 -0600
committer	Karl Williamson <khw@cpan.org>	2021-08-07 05:14:44 -0600
commit	86fb75ad16dc25aa240885d6da2eb32c9a3685a1 (patch)
tree	cfb7768401492eb2d01f0f052f85ac1c3b708dbf /utf8.c
parent	c5b2813428f68013e30d129e5eac9481bb0b105f (diff)
download	perl-86fb75ad16dc25aa240885d6da2eb32c9a3685a1.tar.gz