diff options
author | Karl Williamson <khw@cpan.org> | 2016-09-14 18:54:23 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2016-09-17 21:10:50 -0600 |
commit | 25e3a4e08a8b645de44458470ff4f139baf56682 (patch) | |
tree | c1e0fc724804ede3de8493a757c375538d042f9c /utf8.h | |
parent | a82be82b512232b63f28c5865113f7990fb59a3a (diff) | |
download | perl-25e3a4e08a8b645de44458470ff4f139baf56682.tar.gz |
Add isUTF8_CHAR_flags() macro
This is like the previous 2 commits, but the macro takes a flags
parameter so any combination of the disallowed flags may be used. The
others, along with the original isUTF8_CHAR(), are the most commonly
desired strictures, and use an implementation of a, hopefully, inlined
trie for speed. This new macro is for generality, and the major portion of its
implementation isn't inlined.
Diffstat (limited to 'utf8.h')
-rw-r--r-- | utf8.h | 34 |
1 files changed, 34 insertions, 0 deletions
```diff
@@ -1056,6 +1056,40 @@ L<perlunicode/Noncharacter code points>.
 ? 0 \
 : is_C9_STRICT_UTF8_CHAR_utf8_no_length_checks(s))
 
+/*
+
+=for apidoc Am|STRLEN|isUTF8_CHAR_flags|const U8 *s|const U8 *e| const U32 flags
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
+that represents some code point, subject to the restrictions given by C<flags>;
+otherwise it evaluates to 0. If non-zero, the value gives how many bytes
+starting at C<s> comprise the code point's representation.
+
+If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
+if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
+as C<L</isSTRICT_UTF8_CHAR>>;
+and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
+the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
+Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
+understood by C<L</utf8n_to_uvchr>>, with the same meanings.
+
+The three alternative macros are for the most commonly needed validations; they
+are likely to run somewhat faster than this more general one, as they can be
+inlined into your code.
+
+=cut
+*/
+
+#define isUTF8_CHAR_flags(s, e, flags) \
+ (UNLIKELY((e) <= (s)) \
+ ? 0 \
+ : (UTF8_IS_INVARIANT(*s)) \
+ ? 1 \
+ : UNLIKELY(((e) - (s)) < UTF8SKIP(s)) \
+ ? 0 \
+ : _is_utf8_char_helper(s, e, flags))
+
 /* Do not use; should be deprecated. Use isUTF8_CHAR() instead; this is
 * retained solely for backwards compatibility */
 #define IS_UTF8_CHAR(p, n) (isUTF8_CHAR(p, (p) + (n)) == n)
```