Search for UTF-8 invariants by word

The functions is_utf8_invariant_string() and is_utf8_invariant_string_loc() are used in several places in the core and are part of the public API. This commit speeds them up significantly on ASCII (not EBCDIC) platforms, by changing to use word-at-a-time parsing instead of per-byte. (Per-byte is retained for any initial bytes to reach the next word boundary, and any final bytes that don't fill an entire word.)

The following results were obtained parsing a long string on a 64-bit word machine:

             byte    word
           ------  ------
    Ir     100.00  665.35
    Dr     100.00  797.03
    Dw     100.00  102.12
    COND   100.00  799.27
    IND    100.00   97.56
    COND_m 100.00  144.83
    IND_m  100.00   75.00
    Ir_m1  100.00  100.00
    Dr_m1  100.00  100.02
    Dw_m1  100.00  104.12
    Ir_mm  100.00  100.00
    Dr_mm  100.00  100.00
    Dw_mm  100.00  100.00

100% is baseline; numbers larger than that are improvements. The COND measurement indicates, for example, that there are 1/8 as many conditional branches in the word-at-a-time version.
author: Karl Williamson <khw@cpan.org> 2017-11-15 10:19:33 -0700
committer: Karl Williamson <khw@cpan.org> 2017-11-23 14:18:51 -0700
commit: e17544a60909ed9555c0dad7cd24afc40eb736e7 (patch)
tree: 3e49108314dd819ad6880ebaeb4640c0e8b3494d /inline.h
parent: 46a08a6f3bc2ec1482773059c74749f47b161b01 (diff)
download: perl-e17544a60909ed9555c0dad7cd24afc40eb736e7.tar.gz
1 files changed, 70 insertions, 11 deletions
diff --git a/inline.h b/inline.h
index 2f67af8833..ddafde9650 100644
--- a/inline.h
+++ b/inline.h
@@ -370,29 +370,88 @@ UTF-8 invariant, this function does not change the contents of C<*ep>.
 
 =cut
 
-XXX On ASCII machines this could be sped up by doing word-at-a-time operations
-
 */
 
 PERL_STATIC_INLINE bool
-S_is_utf8_invariant_string_loc(const U8* const s, const STRLEN len, const U8 ** ep)
+S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
 {
-    const U8* const send = s + (len ? len : strlen((const char *)s));
+    const U8* send;
     const U8* x = s;
 
     PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
 
-    while (x < send) {
-	if (UTF8_IS_INVARIANT(*x)) {
-            x++;
-            continue;
+    if (len == 0) {
+        len = strlen((const char *)s);
+    }
+
+    send = s + len;
+
+#ifndef EBCDIC
+    /* Try to get the widest word on this platform */
+#  ifdef HAS_LONG_LONG
+#    define PERL_WORDCAST unsigned long long
+#    define PERL_WORDSIZE LONGLONGSIZE
+#  else
+#    define PERL_WORDCAST UV
+#    define PERL_WORDSIZE UVSIZE
+#  endif
+
+#  if PERL_WORDSIZE == 4
+#    define PERL_VARIANTS_WORD_MASK 0x80808080
+#    define PERL_WORD_BOUNDARY_MASK 0x3
+#  elif PERL_WORDSIZE == 8
+#    define PERL_VARIANTS_WORD_MASK 0x8080808080808080
+#    define PERL_WORD_BOUNDARY_MASK 0x7
+#  else
+#    error Unexpected word size
+#  endif
+
+    /* Process per-byte until reach word boundary.  XXX This loop could be
+     * eliminated if we knew that this platform had fast unaligned reads */
+    while (x < send && (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) {
+        if (! UTF8_IS_INVARIANT(*x)) {
+            if (ep) {
+                *ep = x;
+            }
+
+            return FALSE;
         }
+        x++;
+    }
+
+    /* Process per-word as long as we have at least a full word left */
+    while (x + PERL_WORDSIZE <= send) {
+        if ((* (PERL_WORDCAST *) x) & PERL_VARIANTS_WORD_MASK)  {
+
+            /* Found a variant.  Just return if caller doesn't want its exact
+             * position */
+            if (! ep) {
+                return FALSE;
+            }
 
-        if (ep) {
-            *ep = x;
+            /* Otherwise fall into final loop to find which byte it is */
+            break;
         }
+        x += PERL_WORDSIZE;
+    }
 
-        return FALSE;
+#  undef PERL_WORDCAST
+#  undef PERL_WORDSIZE
+#  undef PERL_WORD_BOUNDARY_MASK
+#  undef PERL_VARIANTS_WORD_MASK
+#endif
+
+    /* Process per-byte */
+    while (x < send) {
+	if (! UTF8_IS_INVARIANT(*x)) {
+            if (ep) {
+                *ep = x;
+            }
+
+            return FALSE;
+        }
+
+        x++;
     }
 
     return TRUE;
author	Karl Williamson <khw@cpan.org>	2017-11-15 10:19:33 -0700
committer	Karl Williamson <khw@cpan.org>	2017-11-23 14:18:51 -0700
commit	e17544a60909ed9555c0dad7cd24afc40eb736e7 (patch)
tree	3e49108314dd819ad6880ebaeb4640c0e8b3494d /inline.h
parent	46a08a6f3bc2ec1482773059c74749f47b161b01 (diff)
download	perl-e17544a60909ed9555c0dad7cd24afc40eb736e7.tar.gz