Add isUTF8_CHAR_flags() macro

This is like the previous 2 commits, but the macro takes a flags parameter so any combination of the disallowed flags may be used. The others, along with the original isUTF8_CHAR(), are the most commonly desired strictures, and use an implementation of a, hopefully, inlined trie for speed. This is for generality and the major portion of its implementation isn't inlined.
author: Karl Williamson <khw@cpan.org> 2016-09-14 18:54:23 -0600
committer: Karl Williamson <khw@cpan.org> 2016-09-17 21:10:50 -0600
commit: 25e3a4e08a8b645de44458470ff4f139baf56682 (patch)
tree: c1e0fc724804ede3de8493a757c375538d042f9c
parent: a82be82b512232b63f28c5865113f7990fb59a3a (diff)
download: perl-25e3a4e08a8b645de44458470ff4f139baf56682.tar.gz
3 files changed, 116 insertions, 3 deletions
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
index b7b7dbd739..954bb60b30 100644
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -5328,6 +5328,13 @@ test_isUTF8_CHAR(char *s, STRLEN len)
         RETVAL
 
 STRLEN
+test_isUTF8_CHAR_flags(char *s, STRLEN len, U32 flags)
+    CODE:
+        RETVAL = isUTF8_CHAR_flags((U8 *) s, (U8 *) s + len, flags);
+    OUTPUT:
+        RETVAL
+
+STRLEN
 test_isSTRICT_UTF8_CHAR(char *s, STRLEN len)
     CODE:
         RETVAL = isSTRICT_UTF8_CHAR((U8 *) s, (U8 *) s + len);
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
index 6619ecadce..8122534571 100644
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -70,6 +70,10 @@ my $UTF8_WARN_SUPER             = 0x0400;
 my $UTF8_DISALLOW_ABOVE_31_BIT  = 0x0800;
 my $UTF8_WARN_ABOVE_31_BIT      = 0x1000;
 my $UTF8_CHECK_ONLY             = 0x2000;
+my $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
+                             = $UTF8_DISALLOW_SUPER|$UTF8_DISALLOW_SURROGATE;
+my $UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+              = $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE|$UTF8_DISALLOW_NONCHAR;
 
 # Test uvchr_to_utf8().
 my $UNICODE_WARN_SURROGATE        = 0x0001;
@@ -472,6 +476,28 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
 
     undef @warnings;
 
+    $ret = test_isUTF8_CHAR_flags($bytes, $len, 0);
+    is($ret, $len, "Verify isUTF8_CHAR_flags($display_bytes, 0) returns expected length: $len");
+
+    unless (is(scalar @warnings, 0,
+               "Verify isUTF8_CHAR_flags() for $hex_n generated no warnings"))
+    {
+        diag "The warnings were: " . join(", ", @warnings);
+    }
+
+    undef @warnings;
+
+    $ret = test_isUTF8_CHAR_flags($bytes, $len - 1, 0);
+    is($ret, 0, "Verify isUTF8_CHAR_flags() with too short length parameter returns 0");
+
+    unless (is(scalar @warnings, 0,
+               "Verify isUTF8_CHAR_flags() generated no warnings"))
+    {
+        diag "The warnings were: " . join(", ", @warnings);
+    }
+
+    undef @warnings;
+
     $ret = test_isSTRICT_UTF8_CHAR($bytes, $len);
     my $expected_len = ($valid_under_strict) ? $len : 0;
     is($ret, $expected_len, "Verify isSTRICT_UTF8_CHAR($display_bytes) returns expected length: $expected_len");
@@ -493,6 +519,19 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
         diag "The warnings were: " . join(", ", @warnings);
     }
 
+    undef @warnings;
+
+    $ret = test_isUTF8_CHAR_flags($bytes, $len, $UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
+    is($ret, $expected_len, "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE') acts like isSTRICT_UTF8_CHAR");
+
+    unless (is(scalar @warnings, 0,
+               "Verify isUTF8_CHAR() for $hex_n generated no warnings"))
+    {
+        diag "The warnings were: " . join(", ", @warnings);
+    }
+
+    undef @warnings;
+
     $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len);
     $expected_len = ($valid_under_c9strict) ? $len : 0;
     is($ret, $expected_len, "Verify isC9_STRICT_UTF8_CHAR($display_bytes) returns expected length: $len");
@@ -516,6 +555,17 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
 
     undef @warnings;
 
+    $ret = test_isUTF8_CHAR_flags($bytes, $len, $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
+    is($ret, $expected_len, "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like isC9_STRICT_UTF8_CHAR");
+
+    unless (is(scalar @warnings, 0,
+               "Verify isUTF8_CHAR() for $hex_n generated no warnings"))
+    {
+        diag "The warnings were: " . join(", ", @warnings);
+    }
+
+    undef @warnings;
+
     $ret_ref = test_valid_utf8_to_uvchr($bytes);
     is($ret_ref->[0], $n, "Verify valid_utf8_to_uvchr($display_bytes) returns $hex_n");
     is($ret_ref->[1], $len, "Verify valid_utf8_to_uvchr() for $hex_n returns expected length: $len");
@@ -784,6 +834,16 @@ foreach my $test (@malformations) {
         diag "The warnings were: " . join(", ", @warnings);
     }
 
+    undef @warnings;
+
+    $ret = test_isUTF8_CHAR_flags($bytes, $length, 0);
+    is($ret, 0, "$testname: isUTF8_CHAR_flags returns 0");
+    unless (is(scalar @warnings, 0,
+               "$testname: isUTF8_CHAR() generated no warnings"))
+    {
+        diag "The warnings were: " . join(", ", @warnings);
+    }
+
     $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
     is($ret, 0, "$testname: isSTRICT_UTF8_CHAR returns 0");
     unless (is(scalar @warnings, 0,
@@ -1293,15 +1353,19 @@ foreach my $test (@tests) {
         use warnings;
         undef @warnings;
         my $ret = test_isUTF8_CHAR($bytes, $length);
+        my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
         if ($will_overflow) {
             is($ret, 0, "isUTF8_CHAR() $testname: returns 0");
+            is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0");
         }
         else {
             is($ret, $length,
                "isUTF8_CHAR() $testname: returns expected length: $length");
+            is($ret_flags, $length,
+               "isUTF8_CHAR_flags(...,0) $testname: returns expected length: $length");
         }
         unless (is(scalar @warnings, 0,
-                "isUTF8_CHAR() $testname: generated no warnings"))
+                "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated no warnings"))
         {
             diag "The warnings were: " . join(", ", @warnings);
         }
@@ -1318,9 +1382,13 @@ foreach my $test (@tests) {
                                : $length;
             is($ret, $expected_ret,
                "isSTRICT_UTF8_CHAR() $testname: returns expected length: $expected_ret");
+            $ret = test_isUTF8_CHAR_flags($bytes, $length,
+                                          $UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
+            is($ret, $expected_ret,
+               "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE') acts like isSTRICT_UTF8_CHAR");
         }
         unless (is(scalar @warnings, 0,
-                "isSTRICT_UTF8_CHAR() $testname: generated no warnings"))
+                "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname: generated no warnings"))
         {
             diag "The warnings were: " . join(", ", @warnings);
         }
@@ -1337,9 +1405,13 @@ foreach my $test (@tests) {
                                : $length;
             is($ret, $expected_ret,
                "isC9_STRICT_UTF8_CHAR() $testname: returns expected length: $expected_ret");
+            $ret = test_isUTF8_CHAR_flags($bytes, $length,
+                                          $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
+            is($ret, $expected_ret,
+               "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like isC9_STRICT_UTF8_CHAR");
         }
         unless (is(scalar @warnings, 0,
-                "isC9_STRICT_UTF8_CHAR() $testname: generated no warnings"))
+                "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname: generated no warnings"))
         {
             diag "The warnings were: " . join(", ", @warnings);
         }
diff --git a/utf8.h b/utf8.h
index 4278ebbf2a..392a86a560 100644
--- a/utf8.h
+++ b/utf8.h
@@ -1056,6 +1056,40 @@ L<perlunicode/Noncharacter code points>.
         ? 0                                                                 \
         : is_C9_STRICT_UTF8_CHAR_utf8_no_length_checks(s))
 
+/*
+
+=for apidoc Am|STRLEN|isUTF8_CHAR_flags|const U8 *s|const U8 *e| const U32 flags
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
+that represents some code point, subject to the restrictions given by C<flags>;
+otherwise it evaluates to 0.  If non-zero, the value gives how many bytes
+starting at C<s> comprise the code point's representation.
+
+If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
+if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
+as C<L</isSTRICT_UTF8_CHAR>>;
+and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
+the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
+Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
+understood by C<L</utf8n_to_uvchr>>, with the same meanings.
+
+The three alternative macros are for the most commonly needed validations; they
+are likely to run somewhat faster than this more general one, as they can be
+inlined into your code.
+
+=cut
+*/
+
+#define isUTF8_CHAR_flags(s, e, flags)                                      \
+    (UNLIKELY((e) <= (s))                                                   \
+    ? 0                                                                     \
+    : (UTF8_IS_INVARIANT(*s))                                               \
+      ? 1                                                                   \
+      : UNLIKELY(((e) - (s)) < UTF8SKIP(s))                                 \
+        ? 0                                                                 \
+        : _is_utf8_char_helper(s, e, flags))
+
 /* Do not use; should be deprecated.  Use isUTF8_CHAR() instead; this is
  * retained solely for backwards compatibility */
 #define IS_UTF8_CHAR(p, n)      (isUTF8_CHAR(p, (p) + (n)) == n)
author	Karl Williamson <khw@cpan.org>	2016-09-14 18:54:23 -0600
committer	Karl Williamson <khw@cpan.org>	2016-09-17 21:10:50 -0600
commit	25e3a4e08a8b645de44458470ff4f139baf56682 (patch)
tree	c1e0fc724804ede3de8493a757c375538d042f9c
parent	a82be82b512232b63f28c5865113f7990fb59a3a (diff)
download	perl-25e3a4e08a8b645de44458470ff4f139baf56682.tar.gz