2 files changed, 17 insertions, 78 deletions
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
index f8e88e29cd..50d1a4e0fd 100644
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -28,9 +28,6 @@ my $word_length = defined $Config{quadkind} ? 8 : 4;
 # individually, so we don't have to test the various combinations of partially
 # filled words.
 
-my $str = chr(0x7FFF_FFFF) x 1000;
-my $result = test_utf8_hop_safe($str, 61, -27);
-
 my $offset = 1;  # Start 1 byte past word boundary.
 
 # We choose an invariant and a variant that are at the boundaries between
diff --git a/inline.h b/inline.h
index d81188eb7f..212e42aa59 100644
--- a/inline.h
+++ b/inline.h
@@ -525,7 +525,7 @@ Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
 
 /* This looks like 0x808080... */
 #  define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
-#  define PERL_WORDSIZE            ((SSize_t) sizeof(PERL_UINTMAX_T))
+#  define PERL_WORDSIZE            sizeof(PERL_UINTMAX_T)
 #  define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
 
 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
@@ -1111,6 +1111,13 @@ S_variant_under_utf8_count(const U8* const s, const U8* const e)
 
 #endif
 
+#ifndef PERL_IN_REGEXEC_C   /* Keep  these around for that file */
+#  undef PERL_WORDSIZE
+#  undef PERL_COUNT_MULTIPLIER
+#  undef PERL_WORD_BOUNDARY_MASK
+#  undef PERL_VARIANTS_WORD_MASK
+#endif
+
 /*
 =for apidoc is_utf8_string
 
@@ -2076,82 +2083,24 @@ Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
 {
     PERL_ARGS_ASSERT_UTF8_HOP_BACK;
 
-    /* XXX Note: cannot use UTF8_IS_...() too eagerly here since e.g
+    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
      * the bitops (especially ~) can create illegal UTF-8.
      * In other words: in Perl UTF-8 is not just for Unicode. */
 
     assert(start <= s);
     assert(off <= 0);
 
-#ifndef EBCDIC
-
-    /* Anything but a continuation byte counts as a character: it is either a
-     * start byte or a single-byte character.
-     *
-     * If we are counting back a sufficient number of characters so that in the
-     * worst case, it will be more than 3 full words, go into full word count
-     * mode.  XXX '3' is arbitray for purposes of testing, and should be
-     * adjusted based on real measurements.
-     *
-     * The worst case is if every byte is a character. */
-    if (off < -3 * PERL_WORDSIZE && (s - start) > 3) {
-
-        /* Back up to a full word boundary. */
-        while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
-            if (! UTF8_IS_CONTINUATION(*s--)) {
-                off++;
-            }
-        }
-
-        /* Now look at per-word, stopping before it becomes possible that the
-         * next word could end the search */
-        while (off < -PERL_WORDSIZE && s > start + PERL_WORDSIZE) {
-            s -= PERL_WORDSIZE;
-
-            /* The number of characters in this word is the word size minus the
-             * number of continuation bytes; the latter being easier to figure
-             * out.
-             *
-             * A continuation byte has the upper 2 bits be '10', and the rest
-             * dont-cares.  Follow this procedure:
-             *                        Continuation    Start     Invariant
-             * Original                 10xxxxxx    11xxxxxx    0xxxxxxx
-             * Take the complement      01xxxxxx    00xxxxxx    1xxxxxxx
-             * Left shift by 1          10xxxxx0    0xxxxxx0    xxxxxxx0
-             * AND with the original    10xxxxxx    0xxxxxx0    0xxxxxx0
-             * AND with 10000000        10000000    00000000    00000000
-             * (which is PERL_VARIANTS_WORD_MASK)
-             * Right shift  by 7        00000001    00000000    00000000
-             *
-             * This results in every bit in the word being zero except the
-             * lowest in each continuation byte.
-             *
-             * Multiplying by PERL_COUNT_MULTIPLIER and right shifting yields
-             * the count of those 1 bits (as explained in commit
-             * 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1).  Subtracting that
-             * from the number of bytes in the word gives the number of
-             * non-continuations, i.e. the number of characters. */
-            off += PERL_WORDSIZE - ((((  ( (((~ (* (PERL_UINTMAX_T *) s))) << 1)
-                                         & (   (* (PERL_UINTMAX_T *) s)))
-                                       & PERL_VARIANTS_WORD_MASK)
-                                       >> (CHARBITS - 1))
-                                    * PERL_COUNT_MULTIPLIER)
-                                  >> ((PERL_WORDSIZE - 1) * CHARBITS));
-        }
-
-        s--;
-    }
-
-#endif
-
-    while (s > start) {
+    /* Note: if we know that the input is well-formed, we can do per-word
+     * hop-back.  Commit d6ad3b72778369a84a215b498d8d60d5b03aa1af implemented
+     * that.  But it was reverted because doing per-word has some
+     * start-up/tear-down overhead, so only makes sense if the distance to be
+     * moved is large, and core perl doesn't currently move more than a few
+     * characters at a time.  You can reinstate it if it does become
+     * advantageous. */
+    while (off++ && s > start) {
         do {
             s--;
         } while (UTF8_IS_CONTINUATION(*s) && s > start);
-
-        if (++off >= 0) {
-            break;
-        }
     }
 
     GCC_DIAG_IGNORE(-Wcast-qual)
@@ -2159,13 +2108,6 @@ Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
     GCC_DIAG_RESTORE
 }
 
-#ifndef PERL_IN_REGEXEC_C   /* Keep  these around for that file */
-#  undef PERL_WORDSIZE
-#  undef PERL_COUNT_MULTIPLIER
-#  undef PERL_WORD_BOUNDARY_MASK
-#  undef PERL_VARIANTS_WORD_MASK
-#endif
-
 /*
 =for apidoc utf8_hop_safe