diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-02-11 14:04:39 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-02-11 14:35:46 -0700 |
commit | 492a624f4a0c250e011c6b74a3403bfc885ec961 (patch) | |
tree | 8961dec5b0e8d1fc0a90c8696bc15f40e8e3616e /utf8.c | |
parent | b9411ff2d79526114566a6486a0c0238691cec2d (diff) | |
download | perl-492a624f4a0c250e011c6b74a3403bfc885ec961.tar.gz |
Add is_utf8_char_buf()
This function is to replace is_utf8_char(), and requires an extra
parameter to ensure that it doesn't read beyond the end of the buffer.
Convert is_utf8_char() and the only place in the Perl core to use the
new one, assuming in each that there is enough space.
Thanks to Jarkko Hietaniemi for suggesting this function name
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 54 |
1 files changed, 46 insertions, 8 deletions
@@ -316,6 +316,43 @@ S_is_utf8_char_slow(const U8 *s, const STRLEN len) } /* +=for apidoc is_utf8_char_buf + +Returns the number of bytes that comprise the first UTF-8 encoded character in +buffer C<buf>. C<buf_end> should point to one position beyond the end of the +buffer. 0 is returned if C<buf> does not point to a complete, valid UTF-8 +encoded character. + +Note that an INVARIANT character (i.e. ASCII on non-EBCDIC +machines) is a valid UTF-8 character. + +=cut */ + +STRLEN +Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end) +{ + + STRLEN len; + + PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF; + + if (buf_end <= buf) { + return 0; + } + + len = buf_end - buf; + if (len > UTF8SKIP(buf)) { + len = UTF8SKIP(buf); + } + +#ifdef IS_UTF8_CHAR + if (IS_UTF8_CHAR_FAST(len)) + return IS_UTF8_CHAR(buf, len) ? len : 0; +#endif /* #ifdef IS_UTF8_CHAR */ + return is_utf8_char_slow(buf, len); +} + +/* =for apidoc is_utf8_char Tests if some arbitrary number of bytes begins in a valid UTF-8 @@ -330,14 +367,10 @@ UTF8SKIP(s) bytes. STRLEN Perl_is_utf8_char(const U8 *s) { - const STRLEN len = UTF8SKIP(s); - PERL_ARGS_ASSERT_IS_UTF8_CHAR; -#ifdef IS_UTF8_CHAR - if (IS_UTF8_CHAR_FAST(len)) - return IS_UTF8_CHAR(s, len) ? len : 0; -#endif /* #ifdef IS_UTF8_CHAR */ - return is_utf8_char_slow(s, len); + + /* Assumes we have enough space */ + return is_utf8_char_buf(s, s + UTF8SKIP(s)); } @@ -1645,7 +1678,12 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, PERL_ARGS_ASSERT_IS_UTF8_COMMON; - if (!is_utf8_char(p)) + /* The API should have included a length for the UTF-8 character in <p>, + * but it doesn't. We therefor assume that p has been validated at least + * as far as there being enough bytes available in it to accommodate the + * character without reading beyond the end, and pass that number on to the + * validating routine */ + if (!is_utf8_char_buf(p, p + UTF8SKIP(p))) return FALSE; if (!*swash) *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0); |