summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--embed.fnc1
-rw-r--r--embed.h1
-rw-r--r--proto.h6
-rw-r--r--utf8.c54
4 files changed, 54 insertions, 8 deletions
diff --git a/embed.fnc b/embed.fnc
index bb403933d7..892a7190ac 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -627,6 +627,7 @@ ApPR |bool |is_uni_punct_lc|UV c
ApPR |bool |is_uni_xdigit_lc|UV c
Anpd |bool |is_ascii_string|NN const U8 *s|STRLEN len
Anpd |STRLEN |is_utf8_char |NN const U8 *s
+Anpd |STRLEN |is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end
Anpd |bool |is_utf8_string |NN const U8 *s|STRLEN len
Anpdmb |bool |is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **p
Anpd |bool |is_utf8_string_loclen|NN const U8 *s|STRLEN len|NULLOK const U8 **ep|NULLOK STRLEN *el
diff --git a/embed.h b/embed.h
index 2f9d47dd43..d429c8d0ea 100644
--- a/embed.h
+++ b/embed.h
@@ -250,6 +250,7 @@
#define is_utf8_alpha(a) Perl_is_utf8_alpha(aTHX_ a)
#define is_utf8_ascii(a) Perl_is_utf8_ascii(aTHX_ a)
#define is_utf8_char Perl_is_utf8_char
+#define is_utf8_char_buf Perl_is_utf8_char_buf
#define is_utf8_cntrl(a) Perl_is_utf8_cntrl(aTHX_ a)
#define is_utf8_digit(a) Perl_is_utf8_digit(aTHX_ a)
#define is_utf8_graph(a) Perl_is_utf8_graph(aTHX_ a)
diff --git a/proto.h b/proto.h
index d2022d549d..dde1a43815 100644
--- a/proto.h
+++ b/proto.h
@@ -1823,6 +1823,12 @@ PERL_CALLCONV STRLEN Perl_is_utf8_char(const U8 *s)
#define PERL_ARGS_ASSERT_IS_UTF8_CHAR \
assert(s)
+PERL_CALLCONV STRLEN Perl_is_utf8_char_buf(const U8 *buf, const U8 *buf_end)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+#define PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF \
+ assert(buf); assert(buf_end)
+
PERL_CALLCONV bool Perl_is_utf8_cntrl(pTHX_ const U8 *p)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1);
diff --git a/utf8.c b/utf8.c
index bfcc40cfd0..2e0429e476 100644
--- a/utf8.c
+++ b/utf8.c
@@ -316,6 +316,43 @@ S_is_utf8_char_slow(const U8 *s, const STRLEN len)
}
/*
+=for apidoc is_utf8_char_buf
+
+Returns the number of bytes that comprise the first UTF-8 encoded character in
+buffer C<buf>. C<buf_end> should point to one position beyond the end of the
+buffer. 0 is returned if C<buf> does not point to a complete, valid UTF-8
+encoded character.
+
+Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
+machines) is a valid UTF-8 character.
+
+=cut */
+
+STRLEN
+Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
+{
+
+ STRLEN len;
+
+ PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
+
+ if (buf_end <= buf) {
+ return 0;
+ }
+
+ len = buf_end - buf;
+ if (len > UTF8SKIP(buf)) {
+ len = UTF8SKIP(buf);
+ }
+
+#ifdef IS_UTF8_CHAR
+ if (IS_UTF8_CHAR_FAST(len))
+ return IS_UTF8_CHAR(buf, len) ? len : 0;
+#endif /* #ifdef IS_UTF8_CHAR */
+ return is_utf8_char_slow(buf, len);
+}
+
+/*
=for apidoc is_utf8_char
Tests if some arbitrary number of bytes begins in a valid UTF-8
@@ -330,14 +367,10 @@ UTF8SKIP(s) bytes.
STRLEN
Perl_is_utf8_char(const U8 *s)
{
- const STRLEN len = UTF8SKIP(s);
-
PERL_ARGS_ASSERT_IS_UTF8_CHAR;
-#ifdef IS_UTF8_CHAR
- if (IS_UTF8_CHAR_FAST(len))
- return IS_UTF8_CHAR(s, len) ? len : 0;
-#endif /* #ifdef IS_UTF8_CHAR */
- return is_utf8_char_slow(s, len);
+
+ /* Assumes we have enough space */
+ return is_utf8_char_buf(s, s + UTF8SKIP(s));
}
@@ -1645,7 +1678,12 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
PERL_ARGS_ASSERT_IS_UTF8_COMMON;
- if (!is_utf8_char(p))
+ /* The API should have included a length for the UTF-8 character in <p>,
+ * but it doesn't. We therefor assume that p has been validated at least
+ * as far as there being enough bytes available in it to accommodate the
+ * character without reading beyond the end, and pass that number on to the
+ * validating routine */
+ if (!is_utf8_char_buf(p, p + UTF8SKIP(p)))
return FALSE;
if (!*swash)
*swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);