summaryrefslogtreecommitdiff
path: root/inline.h
diff options
context:
space:
mode:
author: Karl Williamson <khw@cpan.org> 2017-11-26 17:06:44 -0700
committer: Karl Williamson <khw@cpan.org> 2017-11-27 14:48:45 -0700
commit: 33756530b5c7b031069d47839f8132f4574d2f50 (patch)
tree: 3a30b3b8f31aacdbd2a9c6cb03ffac91c315e835 /inline.h
parent: b2e7ed74dcabdba63e3e8e2ff1980e1cd109b869 (diff)
download: perl-33756530b5c7b031069d47839f8132f4574d2f50.tar.gz
Use is_utf8_invariant_string() more
Now that this function was changed to do word-at-a-time searching in commit e17544a60909ed9555c0dad7cd24afc40eb736e7, we can more quickly find the first variant byte in a string, if any. Given that a lot of Perl usage is on ASCII data, it makes sense to try this first, before any byte-at-a-time processing. Since Perl can also be used on data that is mostly non-ASCII, we give up the fast search at the first variant byte found, and process the rest of the string byte-by-byte. Alternatively, we could build a pipeline that repeatedly finds the next variant quickly, but this would only be faster if variants were rare, which I don't feel we can be confident about after finding at least one.
Diffstat (limited to 'inline.h')
-rw-r--r-- inline.h 114
1 files changed, 94 insertions, 20 deletions
diff --git a/inline.h b/inline.h
index 309d74f435..4d96305e5b 100644
--- a/inline.h
+++ b/inline.h
@@ -652,8 +652,7 @@ C<L</is_c9strict_utf8_string_loclen>>.
PERL_STATIC_INLINE bool
S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
{
- const U8* send;
- const U8* x = s;
+ const U8 * first_variant;
PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
@@ -679,7 +678,10 @@ S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
return is_c9strict_utf8_string(s, len);
}
- send = s + len;
+ if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
+ const U8* const send = s + len;
+ const U8* x = first_variant;
+
while (x < send) {
STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
if (UNLIKELY(! cur_len)) {
@@ -687,6 +689,7 @@ S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
}
x += cur_len;
}
+ }
return TRUE;
}
@@ -721,14 +724,32 @@ See also C<L</is_utf8_string_loc>>.
*/
PERL_STATIC_INLINE bool
-Perl_is_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
+Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
{
- const U8* const send = s + (len ? len : strlen((const char *)s));
- const U8* x = s;
- STRLEN outlen = 0;
+ const U8 * first_variant;
PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
+ if (len == 0) {
+ len = strlen((const char *) s);
+ }
+
+ if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+ if (el)
+ *el = len;
+
+ if (ep) {
+ *ep = s + len;
+ }
+
+ return TRUE;
+ }
+
+ {
+ const U8* const send = s + len;
+ const U8* x = first_variant;
+ STRLEN outlen = first_variant - s;
+
while (x < send) {
const STRLEN cur_len = isUTF8_CHAR(x, send);
if (UNLIKELY(! cur_len)) {
@@ -746,6 +767,7 @@ Perl_is_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN
}
return (x == send);
+ }
}
/*
@@ -779,14 +801,32 @@ See also C<L</is_strict_utf8_string_loc>>.
*/
PERL_STATIC_INLINE bool
-S_is_strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
+S_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
{
- const U8* const send = s + (len ? len : strlen((const char *)s));
- const U8* x = s;
- STRLEN outlen = 0;
+ const U8 * first_variant;
PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
+ if (len == 0) {
+ len = strlen((const char *) s);
+ }
+
+ if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+ if (el)
+ *el = len;
+
+ if (ep) {
+ *ep = s + len;
+ }
+
+ return TRUE;
+ }
+
+ {
+ const U8* const send = s + len;
+ const U8* x = first_variant;
+ STRLEN outlen = first_variant - s;
+
while (x < send) {
const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
if (UNLIKELY(! cur_len)) {
@@ -804,6 +844,7 @@ S_is_strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STR
}
return (x == send);
+ }
}
/*
@@ -837,14 +878,32 @@ See also C<L</is_c9strict_utf8_string_loc>>.
*/
PERL_STATIC_INLINE bool
-S_is_c9strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
+S_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
{
- const U8* const send = s + (len ? len : strlen((const char *)s));
- const U8* x = s;
- STRLEN outlen = 0;
+ const U8 * first_variant;
PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
+ if (len == 0) {
+ len = strlen((const char *) s);
+ }
+
+ if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+ if (el)
+ *el = len;
+
+ if (ep) {
+ *ep = s + len;
+ }
+
+ return TRUE;
+ }
+
+ {
+ const U8* const send = s + len;
+ const U8* x = first_variant;
+ STRLEN outlen = first_variant - s;
+
while (x < send) {
const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
if (UNLIKELY(! cur_len)) {
@@ -862,6 +921,7 @@ S_is_c9strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, S
}
return (x == send);
+ }
}
/*
@@ -902,9 +962,7 @@ See also C<L</is_utf8_string_loc_flags>>.
PERL_STATIC_INLINE bool
S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
{
- const U8* send;
- const U8* x = s;
- STRLEN outlen = 0;
+ const U8 * first_variant;
PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
@@ -930,7 +988,22 @@ S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el
return is_c9strict_utf8_string_loclen(s, len, ep, el);
}
- send = s + len;
+ if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+ if (el)
+ *el = len;
+
+ if (ep) {
+ *ep = s + len;
+ }
+
+ return TRUE;
+ }
+
+ {
+ const U8* send = s + len;
+ const U8* x = first_variant;
+ STRLEN outlen = first_variant - s;
+
while (x < send) {
const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
if (UNLIKELY(! cur_len)) {
@@ -948,6 +1021,7 @@ S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el
}
return (x == send);
+ }
}
/*
@@ -1245,7 +1319,7 @@ complete, valid characters found in the C<el> pointer.
PERL_STATIC_INLINE bool
S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
- const STRLEN len,
+ STRLEN len,
const U8 **ep,
STRLEN *el,
const U32 flags)