summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-02-11 14:04:39 -0700
committerKarl Williamson <public@khwilliamson.com>2012-02-11 14:35:46 -0700
commit492a624f4a0c250e011c6b74a3403bfc885ec961 (patch)
tree8961dec5b0e8d1fc0a90c8696bc15f40e8e3616e
parentb9411ff2d79526114566a6486a0c0238691cec2d (diff)
downloadperl-492a624f4a0c250e011c6b74a3403bfc885ec961.tar.gz
Add is_utf8_char_buf()
This function is to replace is_utf8_char(), and requires an extra parameter to ensure that it doesn't read beyond the end of the buffer. Convert is_utf8_char() and the only place in the Perl core to use the new one, assuming in each that there is enough space. Thanks to Jarkko Hietaniemi for suggesting this function name
-rw-r--r--embed.fnc1
-rw-r--r--embed.h1
-rw-r--r--proto.h6
-rw-r--r--utf8.c54
4 files changed, 54 insertions, 8 deletions
diff --git a/embed.fnc b/embed.fnc
index bb403933d7..892a7190ac 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -627,6 +627,7 @@ ApPR |bool |is_uni_punct_lc|UV c
ApPR |bool |is_uni_xdigit_lc|UV c
Anpd |bool |is_ascii_string|NN const U8 *s|STRLEN len
Anpd |STRLEN |is_utf8_char |NN const U8 *s
+Anpd |STRLEN |is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end
Anpd |bool |is_utf8_string |NN const U8 *s|STRLEN len
Anpdmb |bool |is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **p
Anpd |bool |is_utf8_string_loclen|NN const U8 *s|STRLEN len|NULLOK const U8 **ep|NULLOK STRLEN *el
diff --git a/embed.h b/embed.h
index 2f9d47dd43..d429c8d0ea 100644
--- a/embed.h
+++ b/embed.h
@@ -250,6 +250,7 @@
#define is_utf8_alpha(a) Perl_is_utf8_alpha(aTHX_ a)
#define is_utf8_ascii(a) Perl_is_utf8_ascii(aTHX_ a)
#define is_utf8_char Perl_is_utf8_char
+#define is_utf8_char_buf Perl_is_utf8_char_buf
#define is_utf8_cntrl(a) Perl_is_utf8_cntrl(aTHX_ a)
#define is_utf8_digit(a) Perl_is_utf8_digit(aTHX_ a)
#define is_utf8_graph(a) Perl_is_utf8_graph(aTHX_ a)
diff --git a/proto.h b/proto.h
index d2022d549d..dde1a43815 100644
--- a/proto.h
+++ b/proto.h
@@ -1823,6 +1823,12 @@ PERL_CALLCONV STRLEN Perl_is_utf8_char(const U8 *s)
#define PERL_ARGS_ASSERT_IS_UTF8_CHAR \
assert(s)
+PERL_CALLCONV STRLEN Perl_is_utf8_char_buf(const U8 *buf, const U8 *buf_end)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+#define PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF \
+ assert(buf); assert(buf_end)
+
PERL_CALLCONV bool Perl_is_utf8_cntrl(pTHX_ const U8 *p)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1);
diff --git a/utf8.c b/utf8.c
index bfcc40cfd0..2e0429e476 100644
--- a/utf8.c
+++ b/utf8.c
@@ -316,6 +316,43 @@ S_is_utf8_char_slow(const U8 *s, const STRLEN len)
}
/*
+=for apidoc is_utf8_char_buf
+
+Returns the number of bytes that comprise the first UTF-8 encoded character in
+buffer C<buf>. C<buf_end> should point to one position beyond the end of the
+buffer. 0 is returned if C<buf> does not point to a complete, valid UTF-8
+encoded character.
+
+Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
+machines) is a valid UTF-8 character.
+
+=cut */
+
+STRLEN
+Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
+{
+
+ STRLEN len;
+
+ PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
+
+ if (buf_end <= buf) {
+ return 0;
+ }
+
+ len = buf_end - buf;
+ if (len > UTF8SKIP(buf)) {
+ len = UTF8SKIP(buf);
+ }
+
+#ifdef IS_UTF8_CHAR
+ if (IS_UTF8_CHAR_FAST(len))
+ return IS_UTF8_CHAR(buf, len) ? len : 0;
+#endif /* #ifdef IS_UTF8_CHAR */
+ return is_utf8_char_slow(buf, len);
+}
+
+/*
=for apidoc is_utf8_char
Tests if some arbitrary number of bytes begins in a valid UTF-8
@@ -330,14 +367,10 @@ UTF8SKIP(s) bytes.
STRLEN
Perl_is_utf8_char(const U8 *s)
{
- const STRLEN len = UTF8SKIP(s);
-
PERL_ARGS_ASSERT_IS_UTF8_CHAR;
-#ifdef IS_UTF8_CHAR
- if (IS_UTF8_CHAR_FAST(len))
- return IS_UTF8_CHAR(s, len) ? len : 0;
-#endif /* #ifdef IS_UTF8_CHAR */
- return is_utf8_char_slow(s, len);
+
+ /* Assumes we have enough space */
+ return is_utf8_char_buf(s, s + UTF8SKIP(s));
}
@@ -1645,7 +1678,12 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
PERL_ARGS_ASSERT_IS_UTF8_COMMON;
- if (!is_utf8_char(p))
+ /* The API should have included a length for the UTF-8 character in <p>,
+ * but it doesn't. We therefor assume that p has been validated at least
+ * as far as there being enough bytes available in it to accommodate the
+ * character without reading beyond the end, and pass that number on to the
+ * validating routine */
+ if (!is_utf8_char_buf(p, p + UTF8SKIP(p)))
return FALSE;
if (!*swash)
*swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);