Add is_utf8_char_buf()

This function is to replace is_utf8_char(), and requires an extra parameter to ensure that it doesn't read beyond the end of the buffer. Convert is_utf8_char() and the only place in the Perl core to use the new one, assuming in each that there is enough space. Thanks to Jarkko Hietaniemi for suggesting this function name
author: Karl Williamson <public@khwilliamson.com> 2012-02-11 14:04:39 -0700
committer: Karl Williamson <public@khwilliamson.com> 2012-02-11 14:35:46 -0700
commit: 492a624f4a0c250e011c6b74a3403bfc885ec961 (patch)
tree: 8961dec5b0e8d1fc0a90c8696bc15f40e8e3616e
parent: b9411ff2d79526114566a6486a0c0238691cec2d (diff)
download: perl-492a624f4a0c250e011c6b74a3403bfc885ec961.tar.gz
4 files changed, 54 insertions, 8 deletions
diff --git a/embed.fnc b/embed.fnc
index bb403933d7..892a7190ac 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -627,6 +627,7 @@ ApPR	|bool	|is_uni_punct_lc|UV c
 ApPR	|bool	|is_uni_xdigit_lc|UV c
 Anpd	|bool	|is_ascii_string|NN const U8 *s|STRLEN len
 Anpd	|STRLEN	|is_utf8_char	|NN const U8 *s
+Anpd	|STRLEN	|is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end
 Anpd	|bool	|is_utf8_string	|NN const U8 *s|STRLEN len
 Anpdmb	|bool	|is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **p
 Anpd	|bool	|is_utf8_string_loclen|NN const U8 *s|STRLEN len|NULLOK const U8 **ep|NULLOK STRLEN *el
diff --git a/embed.h b/embed.h
index 2f9d47dd43..d429c8d0ea 100644
--- a/embed.h
+++ b/embed.h
@@ -250,6 +250,7 @@
 #define is_utf8_alpha(a)	Perl_is_utf8_alpha(aTHX_ a)
 #define is_utf8_ascii(a)	Perl_is_utf8_ascii(aTHX_ a)
 #define is_utf8_char		Perl_is_utf8_char
+#define is_utf8_char_buf	Perl_is_utf8_char_buf
 #define is_utf8_cntrl(a)	Perl_is_utf8_cntrl(aTHX_ a)
 #define is_utf8_digit(a)	Perl_is_utf8_digit(aTHX_ a)
 #define is_utf8_graph(a)	Perl_is_utf8_graph(aTHX_ a)
diff --git a/proto.h b/proto.h
index d2022d549d..dde1a43815 100644
--- a/proto.h
+++ b/proto.h
@@ -1823,6 +1823,12 @@ PERL_CALLCONV STRLEN	Perl_is_utf8_char(const U8 *s)
 #define PERL_ARGS_ASSERT_IS_UTF8_CHAR	\
 	assert(s)
 
+PERL_CALLCONV STRLEN	Perl_is_utf8_char_buf(const U8 *buf, const U8 *buf_end)
+			__attribute__nonnull__(1)
+			__attribute__nonnull__(2);
+#define PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF	\
+	assert(buf); assert(buf_end)
+
 PERL_CALLCONV bool	Perl_is_utf8_cntrl(pTHX_ const U8 *p)
 			__attribute__warn_unused_result__
 			__attribute__nonnull__(pTHX_1);
diff --git a/utf8.c b/utf8.c
index bfcc40cfd0..2e0429e476 100644
--- a/utf8.c
+++ b/utf8.c
@@ -316,6 +316,43 @@ S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 }
 
 /*
+=for apidoc is_utf8_char_buf
+
+Returns the number of bytes that comprise the first UTF-8 encoded character in
+buffer C<buf>.  C<buf_end> should point to one position beyond the end of the
+buffer.  0 is returned if C<buf> does not point to a complete, valid UTF-8
+encoded character.
+
+Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
+machines) is a valid UTF-8 character.
+
+=cut */
+
+STRLEN
+Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
+{
+
+    STRLEN len;
+
+    PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
+
+    if (buf_end <= buf) {
+	return 0;
+    }
+
+    len = buf_end - buf;
+    if (len > UTF8SKIP(buf)) {
+	len = UTF8SKIP(buf);
+    }
+
+#ifdef IS_UTF8_CHAR
+    if (IS_UTF8_CHAR_FAST(len))
+        return IS_UTF8_CHAR(buf, len) ? len : 0;
+#endif /* #ifdef IS_UTF8_CHAR */
+    return is_utf8_char_slow(buf, len);
+}
+
+/*
 =for apidoc is_utf8_char
 
 Tests if some arbitrary number of bytes begins in a valid UTF-8
@@ -330,14 +367,10 @@ UTF8SKIP(s) bytes.
 STRLEN
 Perl_is_utf8_char(const U8 *s)
 {
-    const STRLEN len = UTF8SKIP(s);
-
     PERL_ARGS_ASSERT_IS_UTF8_CHAR;
-#ifdef IS_UTF8_CHAR
-    if (IS_UTF8_CHAR_FAST(len))
-        return IS_UTF8_CHAR(s, len) ? len : 0;
-#endif /* #ifdef IS_UTF8_CHAR */
-    return is_utf8_char_slow(s, len);
+
+    /* Assumes we have enough space */
+    return is_utf8_char_buf(s, s + UTF8SKIP(s));
 }
 
 
@@ -1645,7 +1678,12 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
 
     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
 
-    if (!is_utf8_char(p))
+    /* The API should have included a length for the UTF-8 character in <p>,
+     * but it doesn't.  We therefor assume that p has been validated at least
+     * as far as there being enough bytes available in it to accommodate the
+     * character without reading beyond the end, and pass that number on to the
+     * validating routine */
+    if (!is_utf8_char_buf(p, p + UTF8SKIP(p)))
 	return FALSE;
     if (!*swash)
 	*swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
author	Karl Williamson <public@khwilliamson.com>	2012-02-11 14:04:39 -0700
committer	Karl Williamson <public@khwilliamson.com>	2012-02-11 14:35:46 -0700
commit	492a624f4a0c250e011c6b74a3403bfc885ec961 (patch)
tree	8961dec5b0e8d1fc0a90c8696bc15f40e8e3616e
parent	b9411ff2d79526114566a6486a0c0238691cec2d (diff)
download	perl-492a624f4a0c250e011c6b74a3403bfc885ec961.tar.gz