diff options
author | Karl Williamson <khw@cpan.org> | 2017-11-26 17:06:44 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2017-11-27 14:48:45 -0700 |
commit | 33756530b5c7b031069d47839f8132f4574d2f50 (patch) | |
tree | 3a30b3b8f31aacdbd2a9c6cb03ffac91c315e835 /inline.h | |
parent | b2e7ed74dcabdba63e3e8e2ff1980e1cd109b869 (diff) | |
download | perl-33756530b5c7b031069d47839f8132f4574d2f50.tar.gz |
Use is_utf8_invariant_string() more
Now that this function has been changed to do word-at-a-time searching in
commit e17544a60909ed9555c0dad7cd24afc40eb736e7, we can more quickly
find the first variant byte in a string, if any. Given that a lot of
usage of Perl is on ASCII data, it makes sense to try this first before
any byte-at-a-time processing.
Since Perl can also be used on data that is mostly non-ASCII, we give up
the fast search at the first variant byte found, and process the rest of the string byte-by-byte.
Otherwise we could have a pipeline of finding the next variant quickly,
but this would only be faster if variants were rare, which I don't feel
we can be confident about, after finding at least one.
Diffstat (limited to 'inline.h')
-rw-r--r-- | inline.h | 114 |
1 files changed, 94 insertions, 20 deletions
@@ -652,8 +652,7 @@ C<L</is_c9strict_utf8_string_loclen>>. PERL_STATIC_INLINE bool S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags) { - const U8* send; - const U8* x = s; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS; assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE @@ -679,7 +678,10 @@ S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags) return is_c9strict_utf8_string(s, len); } - send = s + len; + if (! is_utf8_invariant_string_loc(s, len, &first_variant)) { + const U8* const send = s + len; + const U8* x = first_variant; + while (x < send) { STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); if (UNLIKELY(! cur_len)) { @@ -687,6 +689,7 @@ S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags) } x += cur_len; } + } return TRUE; } @@ -721,14 +724,32 @@ See also C<L</is_utf8_string_loc>>. */ PERL_STATIC_INLINE bool -Perl_is_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el) +Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { - const U8* const send = s + (len ? len : strlen((const char *)s)); - const U8* x = s; - STRLEN outlen = 0; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; + if (len == 0) { + len = strlen((const char *) s); + } + + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; + + if (ep) { + *ep = s + len; + } + + return TRUE; + } + + { + const U8* const send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + while (x < send) { const STRLEN cur_len = isUTF8_CHAR(x, send); if (UNLIKELY(! cur_len)) { @@ -746,6 +767,7 @@ Perl_is_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN } return (x == send); + } } /* @@ -779,14 +801,32 @@ See also C<L</is_strict_utf8_string_loc>>. 
*/ PERL_STATIC_INLINE bool -S_is_strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el) +S_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { - const U8* const send = s + (len ? len : strlen((const char *)s)); - const U8* x = s; - STRLEN outlen = 0; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN; + if (len == 0) { + len = strlen((const char *) s); + } + + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; + + if (ep) { + *ep = s + len; + } + + return TRUE; + } + + { + const U8* const send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + while (x < send) { const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send); if (UNLIKELY(! cur_len)) { @@ -804,6 +844,7 @@ S_is_strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STR } return (x == send); + } } /* @@ -837,14 +878,32 @@ See also C<L</is_c9strict_utf8_string_loc>>. */ PERL_STATIC_INLINE bool -S_is_c9strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el) +S_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { - const U8* const send = s + (len ? len : strlen((const char *)s)); - const U8* x = s; - STRLEN outlen = 0; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN; + if (len == 0) { + len = strlen((const char *) s); + } + + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; + + if (ep) { + *ep = s + len; + } + + return TRUE; + } + + { + const U8* const send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + while (x < send) { const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send); if (UNLIKELY(! cur_len)) { @@ -862,6 +921,7 @@ S_is_c9strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, S } return (x == send); + } } /* @@ -902,9 +962,7 @@ See also C<L</is_utf8_string_loc_flags>>. 
PERL_STATIC_INLINE bool S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags) { - const U8* send; - const U8* x = s; - STRLEN outlen = 0; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS; assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE @@ -930,7 +988,22 @@ S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el return is_c9strict_utf8_string_loclen(s, len, ep, el); } - send = s + len; + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; + + if (ep) { + *ep = s + len; + } + + return TRUE; + } + + { + const U8* send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + while (x < send) { const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); if (UNLIKELY(! cur_len)) { @@ -948,6 +1021,7 @@ S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el } return (x == send); + } } /* @@ -1245,7 +1319,7 @@ complete, valid characters found in the C<el> pointer. PERL_STATIC_INLINE bool S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, - const STRLEN len, + STRLEN len, const U8 **ep, STRLEN *el, const U32 flags) |