From af13dd8ae148d022e85f4fdcf737e07416145e28 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Sat, 10 Sep 2016 21:15:04 -0600
Subject: Revamp overlong handling in is_utf8_char_slow, fixing a bug

This combines the EBCDIC and ASCII branches as much as possible, and
fixes a bug in the check for the highest overlong (the one whose start
byte is FF), where it could erroneously conclude that a sequence was an
overlong.  The bug showed up only on EBCDIC platforms and on 64-bit
ASCII ones.

Tests are coming in a future commit.
---
 utf8.c | 102 ++++++++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 59 insertions(+), 43 deletions(-)

diff --git a/utf8.c b/utf8.c
index 34c3df5e7d..2b9ea5b2d2 100644
--- a/utf8.c
+++ b/utf8.c
@@ -460,61 +460,77 @@ Perl__is_utf8_char_slow(const U8 * const s, const STRLEN len)
         }
     }
 
-#ifndef EBCDIC
-
-    /* Here is syntactically valid.  Make sure this isn't the start of an
-     * overlong.  These values were found by manually inspecting the UTF-8
-     * patterns.  See the tables in utf8.h and utfebcdic.h */
-
-    /* This is not needed on modern perls where C0 and C1 are not considered
-     * start bytes. */
-#if 0
-    if (UNLIKELY(*s < 0xC2)) {
-        return 0;
-    }
-#endif
+    /* Here is syntactically valid.  Next, make sure this isn't the start of an
+     * overlong.  Overlongs can occur whenever the number of continuation bytes
+     * changes.  That means whenever the number of leading 1 bits in a start
+     * byte increases from the next lower start byte.  That happens for start
+     * bytes C0, E0, F0, F8, FC, FE, and FF.  On modern perls, the following
+     * illegal start bytes have already been excluded, so don't need to be
+     * tested here:
+     *   ASCII platforms:  C0, C1
+     *   EBCDIC platforms: C0, C1, C2, C3, C4, E0
+     *
+     * At least a second byte is required to determine if other sequences will
+     * be an overlong. */
 
     if (len > 1) {
-        if (   (*s == 0xE0 && UNLIKELY(s[1] < 0xA0))
-            || (*s == 0xF0 && UNLIKELY(s[1] < 0x90))
-            || (*s == 0xF8 && UNLIKELY(s[1] < 0x88))
-            || (*s == 0xFC && UNLIKELY(s[1] < 0x84))
-            || (*s == 0xFE && UNLIKELY(s[1] < 0x82)))
-        {
-            return 0;
-        }
-        if ((len > 6 && UNLIKELY(*s == 0xFF) && UNLIKELY(s[6] < 0x81))) {
-            return 0;
+        const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
+        const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
+
+        /* Each platform has overlongs after the start bytes given above
+         * (expressed in I8 for EBCDIC).  What constitutes an overlong varies
+         * by platform, but the logic is the same, except the E0 overlong has
+         * already been excluded on EBCDIC platforms.  The values below were
+         * found by manually inspecting the UTF-8 patterns.  See the tables in
+         * utf8.h and utfebcdic.h */
+
+#       ifdef EBCDIC
+#           define F0_ABOVE_OVERLONG 0xB0
+#           define F8_ABOVE_OVERLONG 0xA8
+#           define FC_ABOVE_OVERLONG 0xA4
+#           define FE_ABOVE_OVERLONG 0xA2
+#           define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
+                                      /* I8(0xfe) is FF */
+#       else
+
+        if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
+            return 0;       /* Overlong */
         }
-    }
 
-#else   /* For EBCDIC, we use I8, which is the same on all code pages */
-    {
-        const U8 s0 = NATIVE_UTF8_TO_I8(*s);
+#           define F0_ABOVE_OVERLONG 0x90
+#           define F8_ABOVE_OVERLONG 0x88
+#           define FC_ABOVE_OVERLONG 0x84
+#           define FE_ABOVE_OVERLONG 0x82
+#           define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
+#       endif
 
-        /* On modern perls C0-C4 aren't considered start bytes */
-        if ( /* s0 < 0xC5 || */ s0 == 0xE0) {
-            return 0;
+
+        if (   (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
+            || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
+            || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
+            || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
+        {
+            return 0;       /* Overlong */
         }
 
-        if (len >= 1) {
-            const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
+#   if defined(UV_IS_QUAD) || defined(EBCDIC)
 
-            if (   (s0 == 0xF0 && UNLIKELY(s1 < 0xB0))
-                || (s0 == 0xF8 && UNLIKELY(s1 < 0xA8))
-                || (s0 == 0xFC && UNLIKELY(s1 < 0xA4))
-                || (s0 == 0xFE && UNLIKELY(s1 < 0x82)))
-            {
-                return 0;
-            }
-            if ((len > 7 && UNLIKELY(s0 == 0xFF) && UNLIKELY(s[7] < 0xA1))) {
-                return 0;
-            }
+        /* Check for the FF overlong.  This happens only if all these bytes
+         * match; what comes after them doesn't matter.  See tables in utf8.h,
+         * utfebcdic.h.  (Can't happen on 32-bit ASCII platforms, as such a
+         * sequence overflows instead.) */
+
+        if (   len >= sizeof(FF_OVERLONG_PREFIX) - 1
+            && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
+                                               sizeof(FF_OVERLONG_PREFIX) - 1)))
+        {
+            return 0;       /* Overlong */
         }
-    }
 
 #endif
 
+    }
+
     /* Finally, see if this would overflow a UV on this platform.  See if the
      * UTF8 for this code point is larger than that for the highest
      * representable code point.  (For ASCII platforms, we could use memcmp()
-- 
cgit v1.2.1