diff options
-rw-r--r-- | embed.fnc | 1 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | proto.h | 6 | ||||
-rw-r--r-- | utf8.c | 54 |
4 files changed, 54 insertions, 8 deletions
@@ -627,6 +627,7 @@ ApPR |bool |is_uni_punct_lc|UV c ApPR |bool |is_uni_xdigit_lc|UV c Anpd |bool |is_ascii_string|NN const U8 *s|STRLEN len Anpd |STRLEN |is_utf8_char |NN const U8 *s +Anpd |STRLEN |is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end Anpd |bool |is_utf8_string |NN const U8 *s|STRLEN len Anpdmb |bool |is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **p Anpd |bool |is_utf8_string_loclen|NN const U8 *s|STRLEN len|NULLOK const U8 **ep|NULLOK STRLEN *el @@ -250,6 +250,7 @@ #define is_utf8_alpha(a) Perl_is_utf8_alpha(aTHX_ a) #define is_utf8_ascii(a) Perl_is_utf8_ascii(aTHX_ a) #define is_utf8_char Perl_is_utf8_char +#define is_utf8_char_buf Perl_is_utf8_char_buf #define is_utf8_cntrl(a) Perl_is_utf8_cntrl(aTHX_ a) #define is_utf8_digit(a) Perl_is_utf8_digit(aTHX_ a) #define is_utf8_graph(a) Perl_is_utf8_graph(aTHX_ a) @@ -1823,6 +1823,12 @@ PERL_CALLCONV STRLEN Perl_is_utf8_char(const U8 *s) #define PERL_ARGS_ASSERT_IS_UTF8_CHAR \ assert(s) +PERL_CALLCONV STRLEN Perl_is_utf8_char_buf(const U8 *buf, const U8 *buf_end) + __attribute__nonnull__(1) + __attribute__nonnull__(2); +#define PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF \ + assert(buf); assert(buf_end) + PERL_CALLCONV bool Perl_is_utf8_cntrl(pTHX_ const U8 *p) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1); @@ -316,6 +316,43 @@ S_is_utf8_char_slow(const U8 *s, const STRLEN len) } /* +=for apidoc is_utf8_char_buf + +Returns the number of bytes that comprise the first UTF-8 encoded character in +buffer C<buf>. C<buf_end> should point to one position beyond the end of the +buffer. 0 is returned if C<buf> does not point to a complete, valid UTF-8 +encoded character. + +Note that an INVARIANT character (i.e. ASCII on non-EBCDIC +machines) is a valid UTF-8 character. + +=cut */ + +STRLEN +Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end) +{ + + STRLEN len; + + PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF; + + if (buf_end <= buf) { + return 0; + } + + len = buf_end - buf; + if (len > UTF8SKIP(buf)) { + len = UTF8SKIP(buf); + } + +#ifdef IS_UTF8_CHAR + if (IS_UTF8_CHAR_FAST(len)) + return IS_UTF8_CHAR(buf, len) ? len : 0; +#endif /* #ifdef IS_UTF8_CHAR */ + return is_utf8_char_slow(buf, len); +} + +/* =for apidoc is_utf8_char Tests if some arbitrary number of bytes begins in a valid UTF-8 @@ -330,14 +367,10 @@ UTF8SKIP(s) bytes. STRLEN Perl_is_utf8_char(const U8 *s) { - const STRLEN len = UTF8SKIP(s); - PERL_ARGS_ASSERT_IS_UTF8_CHAR; -#ifdef IS_UTF8_CHAR - if (IS_UTF8_CHAR_FAST(len)) - return IS_UTF8_CHAR(s, len) ? len : 0; -#endif /* #ifdef IS_UTF8_CHAR */ - return is_utf8_char_slow(s, len); + + /* Assumes we have enough space */ + return is_utf8_char_buf(s, s + UTF8SKIP(s)); } @@ -1645,7 +1678,12 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, PERL_ARGS_ASSERT_IS_UTF8_COMMON; - if (!is_utf8_char(p)) + /* The API should have included a length for the UTF-8 character in <p>, + * but it doesn't. We therefor assume that p has been validated at least + * as far as there being enough bytes available in it to accommodate the + * character without reading beyond the end, and pass that number on to the + * validating routine */ + if (!is_utf8_char_buf(p, p + UTF8SKIP(p))) return FALSE; if (!*swash) *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0); |