From 492a624f4a0c250e011c6b74a3403bfc885ec961 Mon Sep 17 00:00:00 2001
From: Karl Williamson <public@khwilliamson.com>
Date: Sat, 11 Feb 2012 14:04:39 -0700
Subject: Add is_utf8_char_buf()

This function is to replace is_utf8_char(), and requires an extra
parameter to ensure that it doesn't read beyond the end of the buffer.

Convert is_utf8_char() and the only place in the Perl core to use the
new one, assuming in each that there is enough space.

Thanks to Jarkko Hietaniemi for suggesting this function name
---
 embed.fnc |  1 +
 embed.h   |  1 +
 proto.h   |  6 ++++++
 utf8.c    | 54 ++++++++++++++++++++++++++++++++++++++++++++++--------
 4 files changed, 54 insertions(+), 8 deletions(-)
diff --git a/embed.fnc b/embed.fnc
index bb403933d7..892a7190ac 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -627,6 +627,7 @@ ApPR	|bool	|is_uni_punct_lc|UV c
 ApPR	|bool	|is_uni_xdigit_lc|UV c
 Anpd	|bool	|is_ascii_string|NN const U8 *s|STRLEN len
 Anpd	|STRLEN	|is_utf8_char	|NN const U8 *s
+Anpd	|STRLEN	|is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end
 Anpd	|bool	|is_utf8_string	|NN const U8 *s|STRLEN len
 Anpdmb	|bool	|is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **p
 Anpd	|bool	|is_utf8_string_loclen|NN const U8 *s|STRLEN len|NULLOK const U8 **ep|NULLOK STRLEN *el
diff --git a/embed.h b/embed.h
index 2f9d47dd43..d429c8d0ea 100644
--- a/embed.h
+++ b/embed.h
@@ -250,6 +250,7 @@
 #define is_utf8_alpha(a)	Perl_is_utf8_alpha(aTHX_ a)
 #define is_utf8_ascii(a)	Perl_is_utf8_ascii(aTHX_ a)
 #define is_utf8_char		Perl_is_utf8_char
+#define is_utf8_char_buf	Perl_is_utf8_char_buf
 #define is_utf8_cntrl(a)	Perl_is_utf8_cntrl(aTHX_ a)
 #define is_utf8_digit(a)	Perl_is_utf8_digit(aTHX_ a)
 #define is_utf8_graph(a)	Perl_is_utf8_graph(aTHX_ a)
diff --git a/proto.h b/proto.h
index d2022d549d..dde1a43815 100644
--- a/proto.h
+++ b/proto.h
@@ -1823,6 +1823,12 @@ PERL_CALLCONV STRLEN	Perl_is_utf8_char(const U8 *s)
 #define PERL_ARGS_ASSERT_IS_UTF8_CHAR	\
 	assert(s)
 
+PERL_CALLCONV STRLEN	Perl_is_utf8_char_buf(const U8 *buf, const U8 *buf_end)
+			__attribute__nonnull__(1)
+			__attribute__nonnull__(2);
+#define PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF	\
+	assert(buf); assert(buf_end)
+
 PERL_CALLCONV bool	Perl_is_utf8_cntrl(pTHX_ const U8 *p)
 			__attribute__warn_unused_result__
 			__attribute__nonnull__(pTHX_1);
diff --git a/utf8.c b/utf8.c
index bfcc40cfd0..2e0429e476 100644
--- a/utf8.c
+++ b/utf8.c
@@ -315,6 +315,43 @@ S_is_utf8_char_slow(const U8 *s, const STRLEN len)
     return len;
 }
 
+/*
+=for apidoc is_utf8_char_buf
+
+Returns the number of bytes that comprise the first UTF-8 encoded character in
+buffer C<buf>.  C<buf_end> should point to one position beyond the end of the
+buffer.  0 is returned if C<buf> does not point to a complete, valid UTF-8
+encoded character.
+
+Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
+machines) is a valid UTF-8 character.
+
+=cut */
+
+STRLEN
+Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
+{
+
+    STRLEN len;
+
+    PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
+
+    if (buf_end <= buf) {
+	return 0;
+    }
+
+    len = buf_end - buf;
+    if (len > UTF8SKIP(buf)) {
+	len = UTF8SKIP(buf);
+    }
+
+#ifdef IS_UTF8_CHAR
+    if (IS_UTF8_CHAR_FAST(len))
+        return IS_UTF8_CHAR(buf, len) ? len : 0;
+#endif /* #ifdef IS_UTF8_CHAR */
+    return is_utf8_char_slow(buf, len);
+}
+
 /*
 =for apidoc is_utf8_char
 
@@ -330,14 +367,10 @@ UTF8SKIP(s) bytes.
 STRLEN
 Perl_is_utf8_char(const U8 *s)
 {
-    const STRLEN len = UTF8SKIP(s);
-
     PERL_ARGS_ASSERT_IS_UTF8_CHAR;
-#ifdef IS_UTF8_CHAR
-    if (IS_UTF8_CHAR_FAST(len))
-        return IS_UTF8_CHAR(s, len) ? len : 0;
-#endif /* #ifdef IS_UTF8_CHAR */
-    return is_utf8_char_slow(s, len);
+
+    /* Assumes we have enough space */
+    return is_utf8_char_buf(s, s + UTF8SKIP(s));
 }
 
 
@@ -1645,7 +1678,12 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
 
     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
 
-    if (!is_utf8_char(p))
+    /* The API should have included a length for the UTF-8 character in <p>,
+     * but it doesn't.  We therefor assume that p has been validated at least
+     * as far as there being enough bytes available in it to accommodate the
+     * character without reading beyond the end, and pass that number on to the
+     * validating routine */
+    if (!is_utf8_char_buf(p, p + UTF8SKIP(p)))
 	return FALSE;
     if (!*swash)
 	*swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
-- 
cgit v1.2.1