From 492a624f4a0c250e011c6b74a3403bfc885ec961 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sat, 11 Feb 2012 14:04:39 -0700 Subject: Add is_utf8_char_buf() This function is to replace is_utf8_char(), and requires an extra parameter to ensure that it doesn't read beyond the end of the buffer. Convert is_utf8_char() and the only place in the Perl core to use the new one, assuming in each that there is enough space. Thanks to Jarkko Hietaniemi for suggesting this function name --- embed.fnc | 1 + embed.h | 1 + proto.h | 6 ++++++ utf8.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++-------- 4 files changed, 54 insertions(+), 8 deletions(-) diff --git a/embed.fnc b/embed.fnc index bb403933d7..892a7190ac 100644 --- a/embed.fnc +++ b/embed.fnc @@ -627,6 +627,7 @@ ApPR |bool |is_uni_punct_lc|UV c ApPR |bool |is_uni_xdigit_lc|UV c Anpd |bool |is_ascii_string|NN const U8 *s|STRLEN len Anpd |STRLEN |is_utf8_char |NN const U8 *s +Anpd |STRLEN |is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end Anpd |bool |is_utf8_string |NN const U8 *s|STRLEN len Anpdmb |bool |is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **p Anpd |bool |is_utf8_string_loclen|NN const U8 *s|STRLEN len|NULLOK const U8 **ep|NULLOK STRLEN *el diff --git a/embed.h b/embed.h index 2f9d47dd43..d429c8d0ea 100644 --- a/embed.h +++ b/embed.h @@ -250,6 +250,7 @@ #define is_utf8_alpha(a) Perl_is_utf8_alpha(aTHX_ a) #define is_utf8_ascii(a) Perl_is_utf8_ascii(aTHX_ a) #define is_utf8_char Perl_is_utf8_char +#define is_utf8_char_buf Perl_is_utf8_char_buf #define is_utf8_cntrl(a) Perl_is_utf8_cntrl(aTHX_ a) #define is_utf8_digit(a) Perl_is_utf8_digit(aTHX_ a) #define is_utf8_graph(a) Perl_is_utf8_graph(aTHX_ a) diff --git a/proto.h b/proto.h index d2022d549d..dde1a43815 100644 --- a/proto.h +++ b/proto.h @@ -1823,6 +1823,12 @@ PERL_CALLCONV STRLEN Perl_is_utf8_char(const U8 *s) #define PERL_ARGS_ASSERT_IS_UTF8_CHAR \ assert(s) +PERL_CALLCONV STRLEN Perl_is_utf8_char_buf(const U8 *buf, const U8 
*buf_end) + __attribute__nonnull__(1) + __attribute__nonnull__(2); +#define PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF \ + assert(buf); assert(buf_end) + PERL_CALLCONV bool Perl_is_utf8_cntrl(pTHX_ const U8 *p) __attribute__warn_unused_result__ __attribute__nonnull__(pTHX_1); diff --git a/utf8.c b/utf8.c index bfcc40cfd0..2e0429e476 100644 --- a/utf8.c +++ b/utf8.c @@ -315,6 +315,43 @@ S_is_utf8_char_slow(const U8 *s, const STRLEN len) return len; } +/* +=for apidoc is_utf8_char_buf + +Returns the number of bytes that comprise the first UTF-8 encoded character in +buffer C<buf>. C<buf_end> should point to one position beyond the end of the +buffer. 0 is returned if C<buf> does not point to a complete, valid UTF-8 +encoded character. + +Note that an INVARIANT character (i.e. ASCII on non-EBCDIC +machines) is a valid UTF-8 character. + +=cut */ + +STRLEN +Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end) +{ + + STRLEN len; + + PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF; + + if (buf_end <= buf) { + return 0; + } + + len = buf_end - buf; + if (len > UTF8SKIP(buf)) { + len = UTF8SKIP(buf); + } + +#ifdef IS_UTF8_CHAR + if (IS_UTF8_CHAR_FAST(len)) + return IS_UTF8_CHAR(buf, len) ? len : 0; +#endif /* #ifdef IS_UTF8_CHAR */ + return is_utf8_char_slow(buf, len); +} + /* =for apidoc is_utf8_char @@ -330,14 +367,10 @@ UTF8SKIP(s) bytes. STRLEN Perl_is_utf8_char(const U8 *s) { - const STRLEN len = UTF8SKIP(s); - PERL_ARGS_ASSERT_IS_UTF8_CHAR; -#ifdef IS_UTF8_CHAR - if (IS_UTF8_CHAR_FAST(len)) - return IS_UTF8_CHAR(s, len) ? len : 0; -#endif /* #ifdef IS_UTF8_CHAR */ - return is_utf8_char_slow(s, len); + + /* Assumes we have enough space */ + return is_utf8_char_buf(s, s + UTF8SKIP(s)); } @@ -1645,7 +1678,12 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, PERL_ARGS_ASSERT_IS_UTF8_COMMON; - if (!is_utf8_char(p)) + /* The API should have included a length for the UTF-8 character in <p>, + * but it doesn't. We therefore assume that p has been validated at least + * as far as there being enough bytes available in it to accommodate the + * character without reading beyond the end, and pass that number on to the + * validating routine */ + if (!is_utf8_char_buf(p, p + UTF8SKIP(p))) return FALSE; if (!*swash) *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0); -- cgit v1.2.1