summaryrefslogtreecommitdiff
path: root/inline.h
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2016-09-25 22:04:08 -0600
committerKarl Williamson <khw@cpan.org>2016-09-25 22:24:20 -0600
commit8bc127bf58304a1e46a3e33d30b0b8b6f21abb07 (patch)
treee19a856cb909b92fa7bd5a7c68dd1bc2fce270b0 /inline.h
parent9f2abfdef8903cce0a7b12ce12788ce7e9f72ed1 (diff)
downloadperl-8bc127bf58304a1e46a3e33d30b0b8b6f21abb07.tar.gz
Add is_utf8_fixed_width_buf_flags() and use it
This encodes a simple pattern that may not be immediately obvious to someone needing it. If you have a fixed-size buffer that is full of purportedly UTF-8 bytes, is it valid or not? It's easy to do, as shown in this commit. The file test operators -T and -B can be simpified by using this function.
Diffstat (limited to 'inline.h')
-rw-r--r--inline.h89
1 files changed, 89 insertions, 0 deletions
diff --git a/inline.h b/inline.h
index 1fc9065b0d..66ba348714 100644
--- a/inline.h
+++ b/inline.h
@@ -341,6 +341,9 @@ C<L</is_utf8_string_loc>>,
C<L</is_utf8_string_loc_flags>>,
C<L</is_utf8_string_loclen>>,
C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
C<L</is_strict_utf8_string>>,
C<L</is_strict_utf8_string_loc>>,
C<L</is_strict_utf8_string_loclen>>,
@@ -387,6 +390,9 @@ See also
C<L</is_utf8_invariant_string>>,
C<L</is_utf8_string_loc>>,
C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
=cut
*/
@@ -435,6 +441,9 @@ C<L</is_utf8_string_loc>>,
C<L</is_utf8_string_loc_flags>>,
C<L</is_utf8_string_loclen>>,
C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
C<L</is_strict_utf8_string_loc>>,
C<L</is_strict_utf8_string_loclen>>,
C<L</is_c9strict_utf8_string>>,
@@ -488,6 +497,9 @@ C<L</is_utf8_string_loc>>,
C<L</is_utf8_string_loc_flags>>,
C<L</is_utf8_string_loclen>>,
C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
C<L</is_strict_utf8_string>>,
C<L</is_strict_utf8_string_loc>>,
C<L</is_strict_utf8_string_loclen>>,
@@ -546,6 +558,9 @@ C<L</is_utf8_string_loc>>,
C<L</is_utf8_string_loc_flags>>,
C<L</is_utf8_string_loclen>>,
C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
C<L</is_strict_utf8_string>>,
C<L</is_strict_utf8_string_loc>>,
C<L</is_strict_utf8_string_loclen>>,
@@ -968,6 +983,80 @@ S_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const
return cBOOL(_is_utf8_char_helper(s, e, flags));
}
+/*
+
+=for apidoc is_utf8_fixed_width_buf_flags
+
+Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
+is entirely valid UTF-8, subject to the restrictions given by C<flags>;
+otherwise it returns FALSE.
+
+If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
+without restriction. If the final few bytes of the buffer do not form a
+complete code point, this will return TRUE anyway, provided that
+C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
+
+If C<flags> in non-zero, it can be any combination of the
+C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
+same meanings.
+
+This function differs from C<L</is_utf8_string_flags>> only in that the latter
+returns FALSE if the final few bytes of the string don't form a complete code
+point.
+
+=cut
+ */
+#define is_utf8_fixed_width_buf_flags(s, len, flags) \
+ is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
+
+/*
+
+=for apidoc is_utf8_fixed_width_buf_loc_flags
+
+Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
+failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
+to the beginning of any partial character at the end of the buffer; if there is
+no partial character C<*ep> will contain C<s>+C<len>.
+
+See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
+
+=cut
+*/
+
+#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
+ is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
+
+/*
+
+=for apidoc is_utf8_fixed_width_buf_loclen_flags
+
+Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
+complete, valid characters found in the C<el> pointer.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
+ const STRLEN len,
+ const U8 **ep,
+ STRLEN *el,
+ const U32 flags)
+{
+ const U8 * maybe_partial;
+
+ PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
+
+ if (! ep) {
+ ep = &maybe_partial;
+ }
+
+ /* If it's entirely valid, return that; otherwise see if the only error is
+ * that the final few bytes are for a partial character */
+ return is_utf8_string_loclen_flags(s, len, ep, el, flags)
+ || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
+}
+
/* ------------------------------- perl.h ----------------------------- */
/*