summaryrefslogtreecommitdiff
path: root/inline.h
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2022-06-14 10:35:51 -0600
committerKarl Williamson <khw@cpan.org>2022-06-14 10:50:33 -0600
commit9f4248c91978a6485a7856bc094d868d40054553 (patch)
treec07c65b28093f6cc0ef9e9ee411847caf940e57c /inline.h
parentd6ad3b72778369a84a215b498d8d60d5b03aa1af (diff)
downloadperl-9f4248c91978a6485a7856bc094d868d40054553.tar.gz
Revert "Do per-word hop back"
This reverts commit d6ad3b72778369a84a215b498d8d60d5b03aa1af and adds comments that it existed. This is so that this work which isn't really needed based on current usage in core isn't lost, and can be used in the future.
Diffstat (limited to 'inline.h')
-rw-r--r--inline.h92
1 files changed, 17 insertions, 75 deletions
diff --git a/inline.h b/inline.h
index d81188eb7f..212e42aa59 100644
--- a/inline.h
+++ b/inline.h
@@ -525,7 +525,7 @@ Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
/* This looks like 0x808080... */
# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
-# define PERL_WORDSIZE ((SSize_t) sizeof(PERL_UINTMAX_T))
+# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
@@ -1111,6 +1111,13 @@ S_variant_under_utf8_count(const U8* const s, const U8* const e)
#endif
+#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
+# undef PERL_WORDSIZE
+# undef PERL_COUNT_MULTIPLIER
+# undef PERL_WORD_BOUNDARY_MASK
+# undef PERL_VARIANTS_WORD_MASK
+#endif
+
/*
=for apidoc is_utf8_string
@@ -2076,82 +2083,24 @@ Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
{
PERL_ARGS_ASSERT_UTF8_HOP_BACK;
- /* XXX Note: cannot use UTF8_IS_...() too eagerly here since e.g
+ /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
* the bitops (especially ~) can create illegal UTF-8.
* In other words: in Perl UTF-8 is not just for Unicode. */
assert(start <= s);
assert(off <= 0);
-#ifndef EBCDIC
-
- /* Anything but a continuation byte counts as a character: it is either a
- * start byte or a single-byte character.
- *
- * If we are counting back a sufficient number of characters so that in the
- * worst case, it will be more than 3 full words, go into full word count
- * mode. XXX '3' is arbitray for purposes of testing, and should be
- * adjusted based on real measurements.
- *
- * The worst case is if every byte is a character. */
- if (off < -3 * PERL_WORDSIZE && (s - start) > 3) {
-
- /* Back up to a full word boundary. */
- while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
- if (! UTF8_IS_CONTINUATION(*s--)) {
- off++;
- }
- }
-
- /* Now look at per-word, stopping before it becomes possible that the
- * next word could end the search */
- while (off < -PERL_WORDSIZE && s > start + PERL_WORDSIZE) {
- s -= PERL_WORDSIZE;
-
- /* The number of characters in this word is the word size minus the
- * number of continuation bytes; the latter being easier to figure
- * out.
- *
- * A continuation byte has the upper 2 bits be '10', and the rest
- * dont-cares. Follow this procedure:
- * Continuation Start Invariant
- * Original 10xxxxxx 11xxxxxx 0xxxxxxx
- * Take the complement 01xxxxxx 00xxxxxx 1xxxxxxx
- * Left shift by 1 10xxxxx0 0xxxxxx0 xxxxxxx0
- * AND with the original 10xxxxxx 0xxxxxx0 0xxxxxx0
- * AND with 10000000 10000000 00000000 00000000
- * (which is PERL_VARIANTS_WORD_MASK)
- * Right shift by 7 00000001 00000000 00000000
- *
- * This results in every bit in the word being zero except the
- * lowest in each continuation byte.
- *
- * Multiplying by PERL_COUNT_MULTIPLIER and right shifting yields
- * the count of those 1 bits (as explained in commit
- * 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1). Subtracting that
- * from the number of bytes in the word gives the number of
- * non-continuations, i.e. the number of characters. */
- off += PERL_WORDSIZE - (((( ( (((~ (* (PERL_UINTMAX_T *) s))) << 1)
- & ( (* (PERL_UINTMAX_T *) s)))
- & PERL_VARIANTS_WORD_MASK)
- >> (CHARBITS - 1))
- * PERL_COUNT_MULTIPLIER)
- >> ((PERL_WORDSIZE - 1) * CHARBITS));
- }
-
- s--;
- }
-
-#endif
-
- while (s > start) {
+ /* Note: if we know that the input is well-formed, we can do per-word
+ * hop-back. Commit d6ad3b72778369a84a215b498d8d60d5b03aa1af implemented
+ * that. But it was reverted because doing per-word has some
+ * start-up/tear-down overhead, so only makes sense if the distance to be
+ * moved is large, and core perl doesn't currently move more than a few
+ * characters at a time. You can reinstate it if it does become
+ * advantageous. */
+ while (off++ && s > start) {
do {
s--;
} while (UTF8_IS_CONTINUATION(*s) && s > start);
-
- if (++off >= 0) {
- break;
- }
}
GCC_DIAG_IGNORE(-Wcast-qual)
@@ -2159,13 +2108,6 @@ Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
GCC_DIAG_RESTORE
}
-#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
-# undef PERL_WORDSIZE
-# undef PERL_COUNT_MULTIPLIER
-# undef PERL_WORD_BOUNDARY_MASK
-# undef PERL_VARIANTS_WORD_MASK
-#endif
-
/*
=for apidoc utf8_hop_safe