Add is_utf8_fixed_width_buf_flags() and use it

This encodes a simple pattern that may not be immediately obvious to someone needing it. If you have a fixed-size buffer that is full of purportedly UTF-8 bytes, is it valid or not? It's easy to do, as shown in this commit. The file test operators -T and -B can be simplified by using this function.
author: Karl Williamson <khw@cpan.org> 2016-09-25 22:04:08 -0600
committer: Karl Williamson <khw@cpan.org> 2016-09-25 22:24:20 -0600
commit: 8bc127bf58304a1e46a3e33d30b0b8b6f21abb07 (patch)
tree: e19a856cb909b92fa7bd5a7c68dd1bc2fce270b0 /inline.h
parent: 9f2abfdef8903cce0a7b12ce12788ce7e9f72ed1 (diff)
download: perl-8bc127bf58304a1e46a3e33d30b0b8b6f21abb07.tar.gz
1 files changed, 89 insertions, 0 deletions
diff --git a/inline.h b/inline.h
index 1fc9065b0d..66ba348714 100644
--- a/inline.h
+++ b/inline.h
@@ -341,6 +341,9 @@ C<L</is_utf8_string_loc>>,
 C<L</is_utf8_string_loc_flags>>,
 C<L</is_utf8_string_loclen>>,
 C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 C<L</is_strict_utf8_string>>,
 C<L</is_strict_utf8_string_loc>>,
 C<L</is_strict_utf8_string_loclen>>,
@@ -387,6 +390,9 @@ See also
 C<L</is_utf8_invariant_string>>,
 C<L</is_utf8_string_loc>>,
 C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 
 =cut
 */
@@ -435,6 +441,9 @@ C<L</is_utf8_string_loc>>,
 C<L</is_utf8_string_loc_flags>>,
 C<L</is_utf8_string_loclen>>,
 C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 C<L</is_strict_utf8_string_loc>>,
 C<L</is_strict_utf8_string_loclen>>,
 C<L</is_c9strict_utf8_string>>,
@@ -488,6 +497,9 @@ C<L</is_utf8_string_loc>>,
 C<L</is_utf8_string_loc_flags>>,
 C<L</is_utf8_string_loclen>>,
 C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 C<L</is_strict_utf8_string>>,
 C<L</is_strict_utf8_string_loc>>,
 C<L</is_strict_utf8_string_loclen>>,
@@ -546,6 +558,9 @@ C<L</is_utf8_string_loc>>,
 C<L</is_utf8_string_loc_flags>>,
 C<L</is_utf8_string_loclen>>,
 C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 C<L</is_strict_utf8_string>>,
 C<L</is_strict_utf8_string_loc>>,
 C<L</is_strict_utf8_string_loclen>>,
@@ -968,6 +983,80 @@ S_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const
     return cBOOL(_is_utf8_char_helper(s, e, flags));
 }
 
+/*
+
+=for apidoc is_utf8_fixed_width_buf_flags
+
+Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
+is entirely valid UTF-8, subject to the restrictions given by C<flags>;
+otherwise it returns FALSE.
+
+If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
+without restriction.  If the final few bytes of the buffer do not form a
+complete code point, this will return TRUE anyway, provided that
+C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
+
+If C<flags> is non-zero, it can be any combination of the
+C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
+same meanings.
+
+This function differs from C<L</is_utf8_string_flags>> only in that the latter
+returns FALSE if the final few bytes of the string don't form a complete code
+point.
+
+=cut
+ */
+#define is_utf8_fixed_width_buf_flags(s, len, flags)                        \
+                is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
+
+/*
+
+=for apidoc is_utf8_fixed_width_buf_loc_flags
+
+Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
+failure in the C<ep> pointer.  If the function returns TRUE, C<*ep> will point
+to the beginning of any partial character at the end of the buffer; if there is
+no partial character C<*ep> will contain C<s>+C<len>.
+
+See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
+
+=cut
+*/
+
+#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags)               \
+                is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
+
+/*
+
+=for apidoc is_utf8_fixed_width_buf_loclen_flags
+
+Like C<L</is_utf8_fixed_width_buf_loc_flags>> but also stores in C<*el> the
+number of complete, valid characters found.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
+                                       const STRLEN len,
+                                       const U8 **ep,
+                                       STRLEN *el,
+                                       const U32 flags)
+{
+    const U8 * maybe_partial;
+
+    PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
+
+    if (! ep) {
+        ep  = &maybe_partial;
+    }
+
+    /* If it's entirely valid, return that; otherwise see if the only error is
+     * that the final few bytes are for a partial character */
+    return    is_utf8_string_loclen_flags(s, len, ep, el, flags)
+           || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
+}
+
 /* ------------------------------- perl.h ----------------------------- */
 
 /*
author	Karl Williamson <khw@cpan.org>	2016-09-25 22:04:08 -0600
committer	Karl Williamson <khw@cpan.org>	2016-09-25 22:24:20 -0600
commit	8bc127bf58304a1e46a3e33d30b0b8b6f21abb07 (patch)
tree	e19a856cb909b92fa7bd5a7c68dd1bc2fce270b0 /inline.h
parent	9f2abfdef8903cce0a7b12ce12788ce7e9f72ed1 (diff)
download	perl-8bc127bf58304a1e46a3e33d30b0b8b6f21abb07.tar.gz