summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--embed.fnc4
-rw-r--r--ext/XS-APItest/APItest.pm2
-rw-r--r--ext/XS-APItest/APItest.xs13
-rw-r--r--ext/XS-APItest/t/utf8.t24
-rw-r--r--inline.h81
-rw-r--r--proto.h4
6 files changed, 112 insertions, 16 deletions
diff --git a/embed.fnc b/embed.fnc
index c33833a53a..39060f0bcf 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -778,9 +778,9 @@ ADMpR |bool |is_uni_print_lc|UV c
ADMpR |bool |is_uni_punct_lc|UV c
ADMpPR |bool |is_uni_xdigit_lc|UV c
AndmoR |bool |is_utf8_invariant_string|NN const U8* const s \
- |STRLEN const len
+ |STRLEN len
AnidR |bool |is_utf8_invariant_string_loc|NN const U8* const s \
- |STRLEN const len \
+ |STRLEN len \
|NULLOK const U8 ** ep
AmnpdRP |bool |is_ascii_string|NN const U8* const s|const STRLEN len
AmnpdRP |bool |is_invariant_string|NN const U8* const s|const STRLEN len
diff --git a/ext/XS-APItest/APItest.pm b/ext/XS-APItest/APItest.pm
index cdc7e5c380..64022244d5 100644
--- a/ext/XS-APItest/APItest.pm
+++ b/ext/XS-APItest/APItest.pm
@@ -5,7 +5,7 @@ use strict;
use warnings;
use Carp;
-our $VERSION = '0.93';
+our $VERSION = '0.94';
require XSLoader;
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
index 8bf1545b63..ea793ba39e 100644
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -6005,6 +6005,19 @@ test_is_utf8_string(char *s, STRLEN len)
RETVAL
AV *
+test_is_utf8_invariant_string_loc(char *s, STRLEN offset, STRLEN len)
+ PREINIT:
+ AV *av;
+ const U8 * ep;
+ CODE:
+ av = newAV();
+ av_push(av, newSViv(is_utf8_invariant_string_loc((U8 *) s + offset, len, &ep)));
+ av_push(av, newSViv(ep - ((U8 *) s + offset)));
+ RETVAL = av;
+ OUTPUT:
+ RETVAL
+
+AV *
test_is_utf8_string_loc(char *s, STRLEN len)
PREINIT:
AV *av;
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
index a463d1b6eb..123820869d 100644
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -15,6 +15,30 @@ $|=1;
use XS::APItest;
+
+my $s = "A" x 100 ;
+my $ret_ref = test_is_utf8_invariant_string_loc($s, 0, length $s);
+is($ret_ref->[0], 1, "is_utf8_invariant_string_loc returns TRUE for invariant");
+
+my $above_word_length = 9;
+for my $initial (0 .. $above_word_length) {
+ for my $offset (0 .. $above_word_length) {
+ for my $trailing (0 .. $above_word_length) {
+ if ($initial >= $offset) {
+ my $variant_pos = $initial - $offset;
+ $s = "A" x $initial . "\x80" . "A" x $trailing;
+ my $ret_ref = test_is_utf8_invariant_string_loc($s, $offset,
+ length $s);
+ is($ret_ref->[0], 0, "is_utf8_invariant_string_loc returns"
+ . " FALSE for variant at $variant_pos,"
+ . " first $offset ignored)");
+ is($ret_ref->[1], $variant_pos,
+ " And returns the correct position");
+ }
+ }
+ }
+}
+
my $pound_sign = chr utf8::unicode_to_native(163);
# This test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
diff --git a/inline.h b/inline.h
index 2f67af8833..ddafde9650 100644
--- a/inline.h
+++ b/inline.h
@@ -370,29 +370,88 @@ UTF-8 invariant, this function does not change the contents of C<*ep>.
=cut
-XXX On ASCII machines this could be sped up by doing word-at-a-time operations
-
*/
PERL_STATIC_INLINE bool
-S_is_utf8_invariant_string_loc(const U8* const s, const STRLEN len, const U8 ** ep)
+S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
{
- const U8* const send = s + (len ? len : strlen((const char *)s));
+ const U8* send;
const U8* x = s;
PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
- while (x < send) {
- if (UTF8_IS_INVARIANT(*x)) {
- x++;
- continue;
+ if (len == 0) {
+ len = strlen((const char *)s);
+ }
+
+ send = s + len;
+
+#ifndef EBCDIC
+ /* Try to get the widest word on this platform */
+# ifdef HAS_LONG_LONG
+# define PERL_WORDCAST unsigned long long
+# define PERL_WORDSIZE LONGLONGSIZE
+# else
+# define PERL_WORDCAST UV
+# define PERL_WORDSIZE UVSIZE
+# endif
+
+# if PERL_WORDSIZE == 4
+# define PERL_VARIANTS_WORD_MASK 0x80808080
+# define PERL_WORD_BOUNDARY_MASK 0x3
+# elif PERL_WORDSIZE == 8
+# define PERL_VARIANTS_WORD_MASK 0x8080808080808080
+# define PERL_WORD_BOUNDARY_MASK 0x7
+# else
+# error Unexpected word size
+# endif
+
+ /* Process per-byte until reach word boundary. XXX This loop could be
+ * eliminated if we knew that this platform had fast unaligned reads */
+ while (x < send && (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) {
+ if (! UTF8_IS_INVARIANT(*x)) {
+ if (ep) {
+ *ep = x;
+ }
+
+ return FALSE;
}
+ x++;
+ }
+
+ /* Process per-word as long as we have at least a full word left */
+ while (x + PERL_WORDSIZE <= send) {
+ if ((* (PERL_WORDCAST *) x) & PERL_VARIANTS_WORD_MASK) {
+
+ /* Found a variant. Just return if caller doesn't want its exact
+ * position */
+ if (! ep) {
+ return FALSE;
+ }
- if (ep) {
- *ep = x;
+ /* Otherwise fall into final loop to find which byte it is */
+ break;
}
+ x += PERL_WORDSIZE;
+ }
- return FALSE;
+# undef PERL_WORDCAST
+# undef PERL_WORDSIZE
+# undef PERL_WORD_BOUNDARY_MASK
+# undef PERL_VARIANTS_WORD_MASK
+#endif
+
+ /* Process per-byte */
+ while (x < send) {
+ if (! UTF8_IS_INVARIANT(*x)) {
+ if (ep) {
+ *ep = x;
+ }
+
+ return FALSE;
+ }
+
+ x++;
}
return TRUE;
diff --git a/proto.h b/proto.h
index 94009acac6..598e7c1c04 100644
--- a/proto.h
+++ b/proto.h
@@ -1626,11 +1626,11 @@ PERL_CALLCONV bool Perl_is_utf8_idfirst(pTHX_ const U8 *p)
#define PERL_ARGS_ASSERT_IS_UTF8_IDFIRST \
assert(p)
-/* PERL_CALLCONV bool is_utf8_invariant_string(const U8* const s, STRLEN const len)
+/* PERL_CALLCONV bool is_utf8_invariant_string(const U8* const s, STRLEN len)
__attribute__warn_unused_result__; */
#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE bool S_is_utf8_invariant_string_loc(const U8* const s, STRLEN const len, const U8 ** ep)
+PERL_STATIC_INLINE bool S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC \
assert(s)