summaryrefslogtreecommitdiff
path: root/utf8.h
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2016-09-14 18:54:23 -0600
committerKarl Williamson <khw@cpan.org>2016-09-17 21:10:50 -0600
commit25e3a4e08a8b645de44458470ff4f139baf56682 (patch)
treec1e0fc724804ede3de8493a757c375538d042f9c /utf8.h
parenta82be82b512232b63f28c5865113f7990fb59a3a (diff)
downloadperl-25e3a4e08a8b645de44458470ff4f139baf56682.tar.gz
Add isUTF8_CHAR_flags() macro
This is like the previous 2 commits, but the macro takes a flags parameter so that any combination of the disallowed flags may be used. The other macros, along with the original isUTF8_CHAR(), cover the most commonly desired strictures, and are implemented with a (hopefully inlined) trie for speed. This one is for generality, and the major portion of its implementation isn't inlined.
Diffstat (limited to 'utf8.h')
-rw-r--r--utf8.h34
1 files changed, 34 insertions, 0 deletions
diff --git a/utf8.h b/utf8.h
index 4278ebbf2a..392a86a560 100644
--- a/utf8.h
+++ b/utf8.h
@@ -1056,6 +1056,40 @@ L<perlunicode/Noncharacter code points>.
? 0 \
: is_C9_STRICT_UTF8_CHAR_utf8_no_length_checks(s))
+/*
+
+=for apidoc Am|STRLEN|isUTF8_CHAR_flags|const U8 *s|const U8 *e| const U32 flags
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
+that represents some code point, subject to the restrictions given by C<flags>;
+otherwise it evaluates to 0. If non-zero, the value gives how many bytes
+starting at C<s> comprise the code point's representation.
+
+If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
+if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
+as C<L</isSTRICT_UTF8_CHAR>>;
+and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
+the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
+Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
+understood by C<L</utf8n_to_uvchr>>, with the same meanings.
+
+The three alternative macros are for the most commonly needed validations; they
+are likely to run somewhat faster than this more general one, as they can be
+inlined into your code.
+
+=cut
+*/
+
+#define isUTF8_CHAR_flags(s, e, flags) \
+ (UNLIKELY((e) <= (s)) /* no bytes available before e */ \
+ ? 0 \
+ : (UTF8_IS_INVARIANT(*(s))) /* invariant first byte: length is 1 */ \
+ ? 1 \
+ : UNLIKELY(((e) - (s)) < UTF8SKIP(s)) /* fewer bytes than UTF8SKIP(s) */ \
+ ? 0 \
+ : _is_utf8_char_helper(s, e, flags))
+
/* Do not use; should be deprecated. Use isUTF8_CHAR() instead; this is
* retained solely for backwards compatibility. Evaluates p and n twice. */
#define IS_UTF8_CHAR(p, n) (isUTF8_CHAR(p, (p) + (n)) == (n))