diff options
-rw-r--r-- | embed.fnc | 4 | ||||
-rw-r--r-- | ext/XS-APItest/APItest.pm | 2 | ||||
-rw-r--r-- | ext/XS-APItest/APItest.xs | 13 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf8.t | 24 | ||||
-rw-r--r-- | inline.h | 81 | ||||
-rw-r--r-- | proto.h | 4 |
6 files changed, 112 insertions, 16 deletions
@@ -778,9 +778,9 @@ ADMpR |bool |is_uni_print_lc|UV c ADMpR |bool |is_uni_punct_lc|UV c ADMpPR |bool |is_uni_xdigit_lc|UV c AndmoR |bool |is_utf8_invariant_string|NN const U8* const s \ - |STRLEN const len + |STRLEN len AnidR |bool |is_utf8_invariant_string_loc|NN const U8* const s \ - |STRLEN const len \ + |STRLEN len \ |NULLOK const U8 ** ep AmnpdRP |bool |is_ascii_string|NN const U8* const s|const STRLEN len AmnpdRP |bool |is_invariant_string|NN const U8* const s|const STRLEN len diff --git a/ext/XS-APItest/APItest.pm b/ext/XS-APItest/APItest.pm index cdc7e5c380..64022244d5 100644 --- a/ext/XS-APItest/APItest.pm +++ b/ext/XS-APItest/APItest.pm @@ -5,7 +5,7 @@ use strict; use warnings; use Carp; -our $VERSION = '0.93'; +our $VERSION = '0.94'; require XSLoader; diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs index 8bf1545b63..ea793ba39e 100644 --- a/ext/XS-APItest/APItest.xs +++ b/ext/XS-APItest/APItest.xs @@ -6005,6 +6005,19 @@ test_is_utf8_string(char *s, STRLEN len) RETVAL AV * +test_is_utf8_invariant_string_loc(char *s, STRLEN offset, STRLEN len) + PREINIT: + AV *av; + const U8 * ep; + CODE: + av = newAV(); + av_push(av, newSViv(is_utf8_invariant_string_loc((U8 *) s + offset, len, &ep))); + av_push(av, newSViv(ep - ((U8 *) s + offset))); + RETVAL = av; + OUTPUT: + RETVAL + +AV * test_is_utf8_string_loc(char *s, STRLEN len) PREINIT: AV *av; diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t index a463d1b6eb..123820869d 100644 --- a/ext/XS-APItest/t/utf8.t +++ b/ext/XS-APItest/t/utf8.t @@ -15,6 +15,30 @@ $|=1; use XS::APItest; + +my $s = "A" x 100 ; +my $ret_ref = test_is_utf8_invariant_string_loc($s, 0, length $s); +is($ret_ref->[0], 1, "is_utf8_invariant_string_loc returns TRUE for invariant"); + +my $above_word_length = 9; +for my $initial (0 .. $above_word_length) { + for my $offset (0 .. $above_word_length) { + for my $trailing (0 .. $above_word_length) { + if ($initial >= $offset) { + my $variant_pos = $initial - $offset; + $s = "A" x $initial . "\x80" . "A" x $trailing; + my $ret_ref = test_is_utf8_invariant_string_loc($s, $offset, + length $s); + is($ret_ref->[0], 0, "is_utf8_invariant_string_loc returns" + . " FALSE for variant at $variant_pos," + . " first $offset ignored)"); + is($ret_ref->[1], $variant_pos, + " And returns the correct position"); + } + } + } +} + my $pound_sign = chr utf8::unicode_to_native(163); # This test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl @@ -370,29 +370,88 @@ UTF-8 invariant, this function does not change the contents of C<*ep>. =cut -XXX On ASCII machines this could be sped up by doing word-at-a-time operations - */ PERL_STATIC_INLINE bool -S_is_utf8_invariant_string_loc(const U8* const s, const STRLEN len, const U8 ** ep) +S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) { - const U8* const send = s + (len ? len : strlen((const char *)s)); + const U8* send; const U8* x = s; PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC; - while (x < send) { - if (UTF8_IS_INVARIANT(*x)) { - x++; - continue; + if (len == 0) { + len = strlen((const char *)s); + } + + send = s + len; + +#ifndef EBCDIC + /* Try to get the widest word on this platform */ +# ifdef HAS_LONG_LONG +# define PERL_WORDCAST unsigned long long +# define PERL_WORDSIZE LONGLONGSIZE +# else +# define PERL_WORDCAST UV +# define PERL_WORDSIZE UVSIZE +# endif + +# if PERL_WORDSIZE == 4 +# define PERL_VARIANTS_WORD_MASK 0x80808080 +# define PERL_WORD_BOUNDARY_MASK 0x3 +# elif PERL_WORDSIZE == 8 +# define PERL_VARIANTS_WORD_MASK 0x8080808080808080 +# define PERL_WORD_BOUNDARY_MASK 0x7 +# else +# error Unexpected word size +# endif + + /* Process per-byte until reach word boundary. XXX This loop could be + * eliminated if we knew that this platform had fast unaligned reads */ + while (x < send && (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) { + if (! UTF8_IS_INVARIANT(*x)) { + if (ep) { + *ep = x; + } + + return FALSE; } + x++; + } + + /* Process per-word as long as we have at least a full word left */ + while (x + PERL_WORDSIZE <= send) { + if ((* (PERL_WORDCAST *) x) & PERL_VARIANTS_WORD_MASK) { + + /* Found a variant. Just return if caller doesn't want its exact + * position */ + if (! ep) { + return FALSE; + } - if (ep) { - *ep = x; + /* Otherwise fall into final loop to find which byte it is */ + break; } + x += PERL_WORDSIZE; + } - return FALSE; +# undef PERL_WORDCAST +# undef PERL_WORDSIZE +# undef PERL_WORD_BOUNDARY_MASK +# undef PERL_VARIANTS_WORD_MASK +#endif + + /* Process per-byte */ + while (x < send) { + if (! UTF8_IS_INVARIANT(*x)) { + if (ep) { + *ep = x; + } + + return FALSE; + } + + x++; } return TRUE; @@ -1626,11 +1626,11 @@ PERL_CALLCONV bool Perl_is_utf8_idfirst(pTHX_ const U8 *p) #define PERL_ARGS_ASSERT_IS_UTF8_IDFIRST \ assert(p) -/* PERL_CALLCONV bool is_utf8_invariant_string(const U8* const s, STRLEN const len) +/* PERL_CALLCONV bool is_utf8_invariant_string(const U8* const s, STRLEN len) __attribute__warn_unused_result__; */ #ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE bool S_is_utf8_invariant_string_loc(const U8* const s, STRLEN const len, const U8 ** ep) +PERL_STATIC_INLINE bool S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC \ assert(s) |