Add is_utf8_valid_partial_char_flags()

This is a generalization of is_utf8_valid_partial_char to allow the caller to automatically exclude things such as surrogates.
author: Karl Williamson <khw@cpan.org> 2016-09-11 22:18:57 -0600
committer: Karl Williamson <khw@cpan.org> 2016-09-17 21:10:50 -0600
commit: f1c999a79ad93bb81cbb7b1bec96e06c33773b81 (patch)
tree: 40cd551dc7ef325bc99caf71ce437d9232f5e3f7
parent: 6cbb924831d50981620d4c51f8b12da5f269e569 (diff)
download: perl-f1c999a79ad93bb81cbb7b1bec96e06c33773b81.tar.gz
4 files changed, 38 insertions, 7 deletions
diff --git a/embed.fnc b/embed.fnc
index 450a4868d7..3bdc4266b7 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -744,7 +744,10 @@ Abmnpd	|STRLEN	|is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end
 AnipdP	|bool	|is_utf8_string	|NN const U8 *s|STRLEN len
 Anpdmb	|bool	|is_utf8_string_loc|NN const U8 *s|STRLEN len|NN const U8 **ep
 Anipd	|bool	|is_utf8_string_loclen|NN const U8 *s|STRLEN len|NULLOK const U8 **ep|NULLOK STRLEN *el
-AnidP	|bool	|is_utf8_valid_partial_char|NN const U8 * const s|NN const U8 * const e
+AmndP	|bool	|is_utf8_valid_partial_char				    \
+		|NN const U8 * const s|NN const U8 * const e
+AnidP	|bool	|is_utf8_valid_partial_char_flags			    \
+		|NN const U8 * const s|NN const U8 * const e|const U32 flags
 AMpR	|bool	|_is_uni_FOO|const U8 classnum|const UV c
 AMpR	|bool	|_is_utf8_FOO|const U8 classnum|NN const U8 *p
 ADMpR	|bool	|is_utf8_alnum	|NN const U8 *p
diff --git a/embed.h b/embed.h
index 8ff1c93794..50a19a41ba 100644
--- a/embed.h
+++ b/embed.h
@@ -296,7 +296,7 @@
 #define is_utf8_string		Perl_is_utf8_string
 #define is_utf8_string_loclen	Perl_is_utf8_string_loclen
 #define is_utf8_upper(a)	Perl_is_utf8_upper(aTHX_ a)
-#define is_utf8_valid_partial_char	S_is_utf8_valid_partial_char
+#define is_utf8_valid_partial_char_flags	S_is_utf8_valid_partial_char_flags
 #define is_utf8_xdigit(a)	Perl_is_utf8_xdigit(aTHX_ a)
 #define is_utf8_xidcont(a)	Perl_is_utf8_xidcont(aTHX_ a)
 #define is_utf8_xidfirst(a)	Perl_is_utf8_xidfirst(aTHX_ a)
diff --git a/inline.h b/inline.h
index 41d0a9cdce..44fb484d48 100644
--- a/inline.h
+++ b/inline.h
@@ -526,17 +526,42 @@ failure can be signalled without having to wait for the next read.
 
 =cut
 */
+#define is_utf8_valid_partial_char(s, e) is_utf8_valid_partial_char_flags(s, e, 0)
+
+/*
+
+=for apidoc is_utf8_valid_partial_char_flags
+
+Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
+or not the input is a valid UTF-8 encoded partial character, but it takes an
+extra parameter, C<flags>, which can further restrict which code points are
+considered valid.
+
+If C<flags> is 0, this behaves identically to
+C<L</is_utf8_valid_partial_char>>.  Otherwise C<flags> can be any combination
+of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>.  If
+there is any sequence of bytes that can complete the input partial character in
+such a way that a non-prohibited character is formed, the function returns
+TRUE; otherwise FALSE.  Non characters cannot be determined based on partial
+character input.  But many  of the other possible excluded types can be
+determined from just the first one or two bytes.
+
+=cut
+ */
+
 PERL_STATIC_INLINE bool
-S_is_utf8_valid_partial_char(const U8 * const s, const U8 * const e)
+S_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
 {
+    PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
 
-    PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR;
+    assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+                          |UTF8_DISALLOW_ABOVE_31_BIT)));
 
     if (s >= e || s + UTF8SKIP(s) <= e) {
         return FALSE;
     }
 
-    return cBOOL(_is_utf8_char_helper(s, e, 0));
+    return cBOOL(_is_utf8_char_helper(s, e, flags));
 }
 
 /* ------------------------------- perl.h ----------------------------- */
diff --git a/proto.h b/proto.h
index c20986b627..b13b42e1e1 100644
--- a/proto.h
+++ b/proto.h
@@ -1628,9 +1628,12 @@ PERL_CALLCONV bool	Perl_is_utf8_upper(pTHX_ const U8 *p)
 #define PERL_ARGS_ASSERT_IS_UTF8_UPPER	\
 	assert(p)
 
-PERL_STATIC_INLINE bool	S_is_utf8_valid_partial_char(const U8 * const s, const U8 * const e)
+/* PERL_CALLCONV bool	is_utf8_valid_partial_char(const U8 * const s, const U8 * const e)
+			__attribute__pure__; */
+
+PERL_STATIC_INLINE bool	S_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
 			__attribute__pure__;
-#define PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR	\
+#define PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS	\
 	assert(s); assert(e)
 
 PERL_CALLCONV bool	Perl_is_utf8_xdigit(pTHX_ const U8 *p)
author	Karl Williamson <khw@cpan.org>	2016-09-11 22:18:57 -0600
committer	Karl Williamson <khw@cpan.org>	2016-09-17 21:10:50 -0600
commit	f1c999a79ad93bb81cbb7b1bec96e06c33773b81 (patch)
tree	40cd551dc7ef325bc99caf71ce437d9232f5e3f7
parent	6cbb924831d50981620d4c51f8b12da5f269e569 (diff)
download	perl-f1c999a79ad93bb81cbb7b1bec96e06c33773b81.tar.gz