Use is_utf8_invariant_string() more

Now that this function was changed to do word-at-a-time searching in commit e17544a60909ed9555c0dad7cd24afc40eb736e7, we can more quickly find the first variant byte in a string, if any. Given that a lot of usage of Perl is on ASCII data, it makes sense to try this first before any byte-at-a-time processing. Since Perl can be used on things that are mostly non-ASCII, we give up on the fast search at the first variant byte found, and process the rest of the string byte-by-byte. Otherwise we could have a pipeline of finding the next variant quickly, but this would only be faster if variants were rare, which I don't feel we can be confident about, after finding at least one.
author: Karl Williamson <khw@cpan.org> 2017-11-26 17:06:44 -0700
committer: Karl Williamson <khw@cpan.org> 2017-11-27 14:48:45 -0700
commit: 33756530b5c7b031069d47839f8132f4574d2f50 (patch)
tree: 3a30b3b8f31aacdbd2a9c6cb03ffac91c315e835 /inline.h
parent: b2e7ed74dcabdba63e3e8e2ff1980e1cd109b869 (diff)
download: perl-33756530b5c7b031069d47839f8132f4574d2f50.tar.gz
1 files changed, 94 insertions, 20 deletions
diff --git a/inline.h b/inline.h
index 309d74f435..4d96305e5b 100644
--- a/inline.h
+++ b/inline.h
@@ -652,8 +652,7 @@ C<L</is_c9strict_utf8_string_loclen>>.
 PERL_STATIC_INLINE bool
 S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
 {
-    const U8* send;
-    const U8* x = s;
+    const U8 * first_variant;
 
     PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
@@ -679,7 +678,10 @@ S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
         return is_c9strict_utf8_string(s, len);
     }
 
-    send = s + len;
+    if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
+        const U8* const send = s + len;
+        const U8* x = first_variant;
+
     while (x < send) {
         STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
         if (UNLIKELY(! cur_len)) {
@@ -687,6 +689,7 @@ S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
         }
         x += cur_len;
     }
+    }
 
     return TRUE;
 }
@@ -721,14 +724,32 @@ See also C<L</is_utf8_string_loc>>.
 */
 
 PERL_STATIC_INLINE bool
-Perl_is_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
+Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 {
-    const U8* const send = s + (len ? len : strlen((const char *)s));
-    const U8* x = s;
-    STRLEN outlen = 0;
+    const U8 * first_variant;
 
     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
 
+    if (len == 0) {
+        len = strlen((const char *) s);
+    }
+
+    if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+        if (el)
+            *el = len;
+
+        if (ep) {
+            *ep = s + len;
+        }
+
+        return TRUE;
+    }
+
+    {
+        const U8* const send = s + len;
+        const U8* x = first_variant;
+        STRLEN outlen = first_variant - s;
+
     while (x < send) {
         const STRLEN cur_len = isUTF8_CHAR(x, send);
         if (UNLIKELY(! cur_len)) {
@@ -746,6 +767,7 @@ Perl_is_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN
     }
 
     return (x == send);
+    }
 }
 
 /*
@@ -779,14 +801,32 @@ See also C<L</is_strict_utf8_string_loc>>.
 */
 
 PERL_STATIC_INLINE bool
-S_is_strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
+S_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 {
-    const U8* const send = s + (len ? len : strlen((const char *)s));
-    const U8* x = s;
-    STRLEN outlen = 0;
+    const U8 * first_variant;
 
     PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
 
+    if (len == 0) {
+        len = strlen((const char *) s);
+    }
+
+    if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+        if (el)
+            *el = len;
+
+        if (ep) {
+            *ep = s + len;
+        }
+
+        return TRUE;
+    }
+
+    {
+        const U8* const send = s + len;
+        const U8* x = first_variant;
+        STRLEN outlen = first_variant - s;
+
     while (x < send) {
         const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
         if (UNLIKELY(! cur_len)) {
@@ -804,6 +844,7 @@ S_is_strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STR
     }
 
     return (x == send);
+    }
 }
 
 /*
@@ -837,14 +878,32 @@ See also C<L</is_c9strict_utf8_string_loc>>.
 */
 
 PERL_STATIC_INLINE bool
-S_is_c9strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
+S_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 {
-    const U8* const send = s + (len ? len : strlen((const char *)s));
-    const U8* x = s;
-    STRLEN outlen = 0;
+    const U8 * first_variant;
 
     PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
 
+    if (len == 0) {
+        len = strlen((const char *) s);
+    }
+
+    if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+        if (el)
+            *el = len;
+
+        if (ep) {
+            *ep = s + len;
+        }
+
+        return TRUE;
+    }
+
+    {
+        const U8* const send = s + len;
+        const U8* x = first_variant;
+        STRLEN outlen = first_variant - s;
+
     while (x < send) {
         const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
         if (UNLIKELY(! cur_len)) {
@@ -862,6 +921,7 @@ S_is_c9strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, S
     }
 
     return (x == send);
+    }
 }
 
 /*
@@ -902,9 +962,7 @@ See also C<L</is_utf8_string_loc_flags>>.
 PERL_STATIC_INLINE bool
 S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
 {
-    const U8* send;
-    const U8* x = s;
-    STRLEN outlen = 0;
+    const U8 * first_variant;
 
     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
@@ -930,7 +988,22 @@ S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el
         return is_c9strict_utf8_string_loclen(s, len, ep, el);
     }
 
-    send = s + len;
+    if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+        if (el)
+            *el = len;
+
+        if (ep) {
+            *ep = s + len;
+        }
+
+        return TRUE;
+    }
+
+    {
+        const U8* send = s + len;
+        const U8* x = first_variant;
+        STRLEN outlen = first_variant - s;
+
     while (x < send) {
         const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
         if (UNLIKELY(! cur_len)) {
@@ -948,6 +1021,7 @@ S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el
     }
 
     return (x == send);
+    }
 }
 
 /*
@@ -1245,7 +1319,7 @@ complete, valid characters found in the C<el> pointer.
 
 PERL_STATIC_INLINE bool
 S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
-                                       const STRLEN len,
+                                       STRLEN len,
                                        const U8 **ep,
                                        STRLEN *el,
                                        const U32 flags)
author	Karl Williamson <khw@cpan.org>	2017-11-26 17:06:44 -0700
committer	Karl Williamson <khw@cpan.org>	2017-11-27 14:48:45 -0700
commit	33756530b5c7b031069d47839f8132f4574d2f50 (patch)
tree	3a30b3b8f31aacdbd2a9c6cb03ffac91c315e835 /inline.h
parent	b2e7ed74dcabdba63e3e8e2ff1980e1cd109b869 (diff)
download	perl-33756530b5c7b031069d47839f8132f4574d2f50.tar.gz