diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2005-06-05 21:06:12 +0300 |
---|---|---|
committer | Rafael Garcia-Suarez <rgarciasuarez@gmail.com> | 2005-06-07 14:35:25 +0000 |
commit | 768c67ee6ae82fc2dd99a9dd06708f0a0cc097cb (patch) | |
tree | 2c7376ce53618bad206d31f088eae62bba0e59c2 | |
parent | e07360faf0e39eb99a2f8007e291aab60c228731 (diff) | |
download | perl-768c67ee6ae82fc2dd99a9dd06708f0a0cc097cb.tar.gz |
one more round of is_utf8_foo tuneup
Message-ID: <42A314E4.8060608@gmail.com>
p4raw-id: //depot/perl@24730
-rw-r--r-- | embed.fnc | 3 | ||||
-rw-r--r-- | embed.h | 2 | ||||
-rw-r--r-- | global.sym | 1 | ||||
-rw-r--r-- | mg.c | 6 | ||||
-rw-r--r-- | pod/perlapi.pod | 23 | ||||
-rw-r--r-- | proto.h | 6 | ||||
-rw-r--r-- | utf8.c | 99 | ||||
-rw-r--r-- | utf8.h | 2 |
8 files changed, 101 insertions, 41 deletions
@@ -346,7 +346,8 @@ ApPR |bool |is_uni_punct_lc|UV c ApPR |bool |is_uni_xdigit_lc|UV c Apd |STRLEN |is_utf8_char |NN const U8 *p Apd |bool |is_utf8_string |NN const U8 *s|STRLEN len -Apd |bool |is_utf8_string_loc|NN const U8 *s|STRLEN len|NN const U8 **p +Apd |bool |is_utf8_string_loc|NN const U8 *s|STRLEN len|const U8 **p +Apd |bool |is_utf8_string_loclen|NN const U8 *s|STRLEN len|const U8 **ep|STRLEN *el ApR |bool |is_utf8_alnum |NN const U8 *p ApR |bool |is_utf8_alnumc |NN const U8 *p ApR |bool |is_utf8_idfirst|NN const U8 *p @@ -345,6 +345,7 @@ #define is_utf8_char Perl_is_utf8_char #define is_utf8_string Perl_is_utf8_string #define is_utf8_string_loc Perl_is_utf8_string_loc +#define is_utf8_string_loclen Perl_is_utf8_string_loclen #define is_utf8_alnum Perl_is_utf8_alnum #define is_utf8_alnumc Perl_is_utf8_alnumc #define is_utf8_idfirst Perl_is_utf8_idfirst @@ -2344,6 +2345,7 @@ #define is_utf8_char(a) Perl_is_utf8_char(aTHX_ a) #define is_utf8_string(a,b) Perl_is_utf8_string(aTHX_ a,b) #define is_utf8_string_loc(a,b,c) Perl_is_utf8_string_loc(aTHX_ a,b,c) +#define is_utf8_string_loclen(a,b,c,d) Perl_is_utf8_string_loclen(aTHX_ a,b,c,d) #define is_utf8_alnum(a) Perl_is_utf8_alnum(aTHX_ a) #define is_utf8_alnumc(a) Perl_is_utf8_alnumc(aTHX_ a) #define is_utf8_idfirst(a) Perl_is_utf8_idfirst(aTHX_ a) diff --git a/global.sym b/global.sym index b4c745c130..208bd2d072 100644 --- a/global.sym +++ b/global.sym @@ -207,6 +207,7 @@ Perl_is_uni_xdigit_lc Perl_is_utf8_char Perl_is_utf8_string Perl_is_utf8_string_loc +Perl_is_utf8_string_loclen Perl_is_utf8_alnum Perl_is_utf8_alnumc Perl_is_utf8_idfirst @@ -495,10 +495,12 @@ Perl_magic_len(pTHX_ SV *sv, MAGIC *mg) if (i > 0 && RX_MATCH_UTF8(rx)) { char *s = rx->subbeg + s1; char *send = rx->subbeg + t1; + const U8 *ep; + STRLEN el; i = t1 - s1; - if (is_utf8_string((U8*)s, i)) - i = Perl_utf8_length(aTHX_ (U8*)s, (U8*)send); + if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) + i = el; } if (i < 0) Perl_croak(aTHX_ "panic: magic_len: %"IVdf, (IV)i); diff --git a/pod/perlapi.pod b/pod/perlapi.pod index aadf23a5af..86d97ff99f 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -5325,6 +5325,8 @@ UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does not mean 'a string that contains code points above 0x7F encoded in UTF-8' because a valid ASCII string is a valid UTF-8 string. +See also is_utf8_string_loclen() and is_utf8_string_loc(). + bool is_utf8_string(const U8 *s, STRLEN len) =for hackers @@ -5332,14 +5334,31 @@ Found in file utf8.c =item is_utf8_string_loc -Like is_ut8_string but store the location of the failure in -the last argument. +Like is_ut8_string() but stores the location of the failure (in the +case of "utf8ness failure") or the location s+len (in the case of +"utf8ness success") in the C<ep>. + +See also is_utf8_string_loclen() and is_utf8_string(). bool is_utf8_string_loc(const U8 *s, STRLEN len, const U8 **p) =for hackers Found in file utf8.c +=item is_utf8_string_loclen + +Like is_ut8_string() but stores the location of the failure (in the +case of "utf8ness failure") or the location s+len (in the case of +"utf8ness success") in the C<ep>, and the number of UTF-8 +encoded characters in the C<el>. + +See also is_utf8_string_loc() and is_utf8_string(). + + bool is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) + +=for hackers +Found in file utf8.c + =item pv_uni_display Build to the scalar dsv a displayable version of the string spv, @@ -620,8 +620,10 @@ PERL_CALLCONV bool Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len) __attribute__nonnull__(pTHX_1); PERL_CALLCONV bool Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **p) - __attribute__nonnull__(pTHX_1) - __attribute__nonnull__(pTHX_3); + __attribute__nonnull__(pTHX_1); + +PERL_CALLCONV bool Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) + __attribute__nonnull__(pTHX_1); PERL_CALLCONV bool Perl_is_utf8_alnum(pTHX_ const U8 *p) __attribute__warn_unused_result__ @@ -240,10 +240,9 @@ will be returned if it is valid, otherwise 0. STRLEN Perl_is_utf8_char(pTHX_ const U8 *s) { - STRLEN len; + STRLEN len = UTF8SKIP(s); #ifdef IS_UTF8_CHAR - len = UTF8SKIP(s); - if (len <= 4) + if (IS_UTF8_CHAR_FAST(len)) return IS_UTF8_CHAR(s, len) ? len : 0; #endif /* #ifdef IS_UTF8_CHAR */ return is_utf8_char_slow(s, len); @@ -257,6 +256,8 @@ UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does not mean 'a string that contains code points above 0x7F encoded in UTF-8' because a valid ASCII string is a valid UTF-8 string. +See also is_utf8_string_loclen() and is_utf8_string_loc(). + =cut */ @@ -276,26 +277,26 @@ Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len) if (UTF8_IS_INVARIANT(*x)) c = 1; else if (!UTF8_IS_START(*x)) - return FALSE; + goto out; else { /* ... and call is_utf8_char() only if really needed. */ #ifdef IS_UTF8_CHAR c = UTF8SKIP(x); - if (c <= 4) { - if (!IS_UTF8_CHAR(x, c)) - return FALSE; - } else { - if (!is_utf8_char_slow(x, c)) - return FALSE; - } + if (IS_UTF8_CHAR_FAST(c)) { + if (!IS_UTF8_CHAR(x, c)) + goto out; + } else if (!is_utf8_char_slow(x, c)) + goto out; #else c = is_utf8_char(x); #endif /* #ifdef IS_UTF8_CHAR */ if (!c) - return FALSE; + goto out; } x += c; } + + out: if (x != send) return FALSE; @@ -303,16 +304,20 @@ Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len) } /* -=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **p +=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el -Like is_ut8_string but store the location of the failure in -the last argument. +Like is_ut8_string() but stores the location of the failure (in the +case of "utf8ness failure") or the location s+len (in the case of +"utf8ness success") in the C<ep>, and the number of UTF-8 +encoded characters in the C<el>. + +See also is_utf8_string_loc() and is_utf8_string(). =cut */ bool -Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **p) +Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { const U8* x = s; const U8* send; @@ -321,37 +326,63 @@ Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **p) if (!len && s) len = strlen((const char *)s); send = s + len; + if (el) + *el = 0; while (x < send) { /* Inline the easy bits of is_utf8_char() here for speed... */ if (UTF8_IS_INVARIANT(*x)) - c = 1; - else if (!UTF8_IS_START(*x)) { - if (p) - *p = x; - return FALSE; - } + c = 1; + else if (!UTF8_IS_START(*x)) + goto out; else { - /* ... and call is_utf8_char() only if really needed. */ - c = is_utf8_char(x); - if (!c) { - if (p) - *p = x; - return FALSE; - } + /* ... and call is_utf8_char() only if really needed. */ +#ifdef IS_UTF8_CHAR + c = UTF8SKIP(x); + if (IS_UTF8_CHAR_FAST(c)) { + if (!IS_UTF8_CHAR(x, c)) + c = 0; + } else + c = is_utf8_char_slow(x, c); +#else + c = is_utf8_char(x); +#endif /* #ifdef IS_UTF8_CHAR */ + if (!c) + goto out; } - x += c; + x += c; + if (el) + (*el)++; } - if (x != send) { - if (p) - *p = x; + + out: + if (ep) + *ep = x; + if (x != send) return FALSE; - } return TRUE; } /* +=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el + +Like is_ut8_string() but stores the location of the failure (in the +case of "utf8ness failure") or the location s+len (in the case of +"utf8ness success") in the C<ep>. + +See also is_utf8_string_loclen() and is_utf8_string(). + +=cut +*/ + +bool +Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **ep) +{ + return is_utf8_string_loclen(s, len, ep, 0); +} + +/* =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags Bottom level UTF-8 decode routine. @@ -327,3 +327,5 @@ encoded character. (n) == 3 ? IS_UTF8_CHAR_3(p) : \ (n) == 4 ? IS_UTF8_CHAR_4(p) : 0) +#define IS_UTF8_CHAR_FAST(n) ((n) <= 4) + |