Refactor uvoffuni_to_utf8_flags_msgs

Having a fast UVOFFUNISKIP() allows this function to be refactored to simplify it. This commit continues to shortchange large code points and EBCDIC by a little. For example, it checks if a 4-byte character is above Unicode, but no 4-byte characters fit that description in UTF-EBCDIC. This will be fixed in the next commit, which will prepare for further enhancements.
author: Karl Williamson <khw@cpan.org> 2021-06-05 16:31:11 -0600
committer: Karl Williamson <khw@cpan.org> 2021-08-07 05:14:44 -0600
commit: e932a5edb50acc7eadabb8b398bd071556b21ded (patch)
tree: 8066dac4c06c7be19f0d4acf80e6a4bcd93a8921 /utf8.c
parent: 3e3eb1f61aa7ee77634882c846122860a4c8ef42 (diff)
download: perl-e932a5edb50acc7eadabb8b398bd071556b21ded.tar.gz
1 files changed, 73 insertions, 119 deletions
diff --git a/utf8.c b/utf8.c
index 7e4d95b92e..cd9f4481e6 100644
--- a/utf8.c
+++ b/utf8.c
@@ -262,149 +262,103 @@ The caller, of course, is responsible for freeing any returned HV.
 U8 *
 Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV input_uv, const UV flags, HV** msgs)
 {
+    U8 *p;
+    UV shifted_uv = input_uv;
+    STRLEN utf8_skip = OFFUNISKIP(input_uv);
+
     PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
 
     if (msgs) {
         *msgs = NULL;
     }
 
-    if (OFFUNI_IS_INVARIANT(input_uv)) {
+    switch (utf8_skip) {
+      case 1:
         *d++ = LATIN1_TO_NATIVE(input_uv);
         return d;
-    }
 
-    if (input_uv <= MAX_UTF8_TWO_BYTE) {
-        *d++ = I8_TO_NATIVE_UTF8(( input_uv >> SHIFT) | UTF_START_MARK(2));
-        *d++ = I8_TO_NATIVE_UTF8(( input_uv           & MASK) |   MARK);
-        return d;
-    }
+      default:
+        if (   UNLIKELY(input_uv > MAX_LEGAL_CP
+            && UNLIKELY(! (flags & UNICODE_ALLOW_ABOVE_IV_MAX))))
+        {
+            Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, input_uv));
+        }
 
-    /* Not 2-byte; test for and handle 3-byte result.   In the test immediately
-     * below, the 16 is for start bytes E0-EF (which are all the possible ones
-     * for 3 byte characters).  The 2 is for 2 continuation bytes; these each
-     * contribute SHIFT bits.  This yields 0x4000 on EBCDIC platforms, 0x1_0000
-     * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
-     * 0x800-0xFFFF on ASCII */
-    if (input_uv < (16 * (1U << (2 * SHIFT)))) {
-        *d++ = I8_TO_NATIVE_UTF8(( input_uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
-        *d++ = I8_TO_NATIVE_UTF8(((input_uv >> ((2 - 1) * SHIFT)) & MASK) |   MARK);
-        *d++ = I8_TO_NATIVE_UTF8(( input_uv  /* (1 - 1) */        & MASK) |   MARK);
-
-#ifndef EBCDIC  /* These problematic code points are 4 bytes on EBCDIC, so
-                   aren't tested here */
-        /* The most likely code points in this range are below the surrogates.
-         * Do an extra test to quickly exclude those. */
-        if (UNLIKELY(input_uv >= UNICODE_SURROGATE_FIRST)) {
-            if (UNLIKELY(   UNICODE_IS_32_CONTIGUOUS_NONCHARS(input_uv)
-                         || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(input_uv)))
-            {
-                HANDLE_UNICODE_NONCHAR(input_uv, flags, msgs);
-            }
-            else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) {
-                HANDLE_UNICODE_SURROGATE(input_uv, flags, msgs);
-            }
+        p = d + utf8_skip - 1;
+        while (p >= d + 4) {
+            *p-- = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
+            shifted_uv >>= SHIFT;
         }
-#endif
-        return d;
-    }
 
-    /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
-     * platforms, and 0x4000 on EBCDIC.  There are problematic cases that can
-     * happen starting with 4-byte characters on ASCII platforms.  We unify the
-     * code for these with EBCDIC, even though some of them require 5-bytes on
-     * those, because khw believes the code saving is worth the very slight
-     * performance hit on these high EBCDIC code points. */
+        /* FALLTHROUGH */
 
-    if (UNLIKELY(UNICODE_IS_SUPER(input_uv))) {
-        if (UNLIKELY(      input_uv > MAX_LEGAL_CP
-                     && ! (flags & UNICODE_ALLOW_ABOVE_IV_MAX)))
-        {
-            Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, input_uv));
-        }
-        if (       (flags & UNICODE_WARN_SUPER)
-            || (   (flags & UNICODE_WARN_PERL_EXTENDED)
-                && UNICODE_IS_PERL_EXTENDED(input_uv)))
-        {
-            const char * format = super_cp_format;
-            U32 category = packWARN(WARN_NON_UNICODE);
-            U32 flag = UNICODE_GOT_SUPER;
-
-            /* Choose the more dire applicable warning */
-            if (UNICODE_IS_PERL_EXTENDED(input_uv)) {
-                format = PL_extended_cp_format;
-                category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
-                if (flags & (UNICODE_WARN_PERL_EXTENDED
-                            |UNICODE_DISALLOW_PERL_EXTENDED))
-                {
-                    flag = UNICODE_GOT_PERL_EXTENDED;
+      case 4:
+        if (UNLIKELY(UNICODE_IS_SUPER(input_uv))) {
+            if (       (flags & UNICODE_WARN_SUPER)
+                || (   (flags & UNICODE_WARN_PERL_EXTENDED)
+                    && UNICODE_IS_PERL_EXTENDED(input_uv)))
+            {
+                const char * format = super_cp_format;
+                U32 category = packWARN(WARN_NON_UNICODE);
+                U32 flag = UNICODE_GOT_SUPER;
+
+                /* Choose the more dire applicable warning */
+                if (UNICODE_IS_PERL_EXTENDED(input_uv)) {
+                    format = PL_extended_cp_format;
+                    category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
+                    if (flags & (UNICODE_WARN_PERL_EXTENDED
+                                |UNICODE_DISALLOW_PERL_EXTENDED))
+                    {
+                        flag = UNICODE_GOT_PERL_EXTENDED;
+                    }
                 }
-            }
 
-            if (msgs) {
-                *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
-                                   category, flag);
+                if (msgs) {
+                    *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
+                                    category, flag);
+                }
+                else if (    ckWARN_d(WARN_NON_UNICODE)
+                        || (   (flag & UNICODE_GOT_PERL_EXTENDED)
+                            && ckWARN(WARN_PORTABLE)))
+                {
+                    Perl_warner(aTHX_ category, format, input_uv);
+                }
             }
-            else if (    ckWARN_d(WARN_NON_UNICODE)
-                     || (   (flag & UNICODE_GOT_PERL_EXTENDED)
-                         && ckWARN(WARN_PORTABLE)))
+            if (       (flags & UNICODE_DISALLOW_SUPER)
+                || (   (flags & UNICODE_DISALLOW_PERL_EXTENDED)
+                    &&  UNICODE_IS_PERL_EXTENDED(input_uv)))
             {
-                Perl_warner(aTHX_ category, format, input_uv);
+                return NULL;
             }
         }
-        if (       (flags & UNICODE_DISALLOW_SUPER)
-            || (   (flags & UNICODE_DISALLOW_PERL_EXTENDED)
-                &&  UNICODE_IS_PERL_EXTENDED(input_uv)))
-        {
-            return NULL;
-        }
-    }
-    else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(input_uv))) {
-        HANDLE_UNICODE_NONCHAR(input_uv, flags, msgs);
-    }
 
-    /* Test for and handle 4-byte result.   In the test immediately below, the
-     * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
-     * characters).  The 3 is for 3 continuation bytes; these each contribute
-     * SHIFT bits.  This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
-     * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
-     * 0x1_0000-0x1F_FFFF on ASCII */
-    if (input_uv < (8 * (1U << (3 * SHIFT)))) {
-        *d++ = I8_TO_NATIVE_UTF8(( input_uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
-        *d++ = I8_TO_NATIVE_UTF8(((input_uv >> ((3 - 1) * SHIFT)) & MASK) |   MARK);
-        *d++ = I8_TO_NATIVE_UTF8(((input_uv >> ((2 - 1) * SHIFT)) & MASK) |   MARK);
-        *d++ = I8_TO_NATIVE_UTF8(( input_uv  /* (1 - 1) */        & MASK) |   MARK);
-
-#ifdef EBCDIC   /* These were handled on ASCII platforms in the code for 3-byte
-                   characters.  The end-plane non-characters for EBCDIC were
-                   handled just above */
-        if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(input_uv))) {
-            HANDLE_UNICODE_NONCHAR(input_uv, flags, msgs);
-        }
-        else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) {
-            HANDLE_UNICODE_SURROGATE(input_uv, flags, msgs);
-        }
-#endif
+        d[3] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
+        shifted_uv >>= SHIFT;
+        /* FALLTHROUGH */
 
-        return d;
-    }
+      case 3:
+        if (input_uv >= UNICODE_SURROGATE_FIRST) {
+            if (UNLIKELY(UNICODE_IS_NONCHAR(input_uv))) {
+                HANDLE_UNICODE_NONCHAR(input_uv, flags, msgs);
+            }
+            else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) {
+                HANDLE_UNICODE_SURROGATE(input_uv, flags, msgs);
+            }
+        }
 
-    /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
-     * platforms, and 0x4000 on EBCDIC.  At this point we switch to a loop
-     * format.  The unrolled version above turns out to not save all that much
-     * time, and at these high code points (well above the legal Unicode range
-     * on ASCII platforms, and well above anything in common use in EBCDIC),
-     * khw believes that less code outweighs slight performance gains. */
+        d[2] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
+        shifted_uv >>= SHIFT;
+        /* FALLTHROUGH */
 
-    {
-        STRLEN len  = OFFUNISKIP(input_uv);
-        U8 *p = d+len-1;
-        while (p > d) {
-            *p-- = I8_TO_NATIVE_UTF8((input_uv & MASK) | MARK);
-            input_uv >>= SHIFT;
-        }
-        *p = I8_TO_NATIVE_UTF8((input_uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
-        return d+len;
+      case 2:
+        d[1] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
+        shifted_uv >>= SHIFT;
+        d[0] = I8_TO_NATIVE_UTF8((shifted_uv & UTF_START_MASK(utf8_skip))
+                                             | UTF_START_MARK(utf8_skip));
+        break;
     }
+
+    return d + utf8_skip;
 }
 
 /*
author	Karl Williamson <khw@cpan.org>	2021-06-05 16:31:11 -0600
committer	Karl Williamson <khw@cpan.org>	2021-08-07 05:14:44 -0600
commit	e932a5edb50acc7eadabb8b398bd071556b21ded (patch)
tree	8066dac4c06c7be19f0d4acf80e6a4bcd93a8921 /utf8.c
parent	3e3eb1f61aa7ee77634882c846122860a4c8ef42 (diff)
download	perl-e932a5edb50acc7eadabb8b398bd071556b21ded.tar.gz