Add macro for Unicode Corregindum #9 strict

This macro follows Unicode Corrigendum #9 to allow non-character code points. These are still discouraged but not completely forbidden. It's best for code that isn't intended to operate on arbitrary other code text to use the original definition, but code that does things, such as source code control, should change to use this definition if it wants to be Unicode-strict. Perl can't adopt C9 wholesale, as it might create security holes in existing applications that rely on Perl keeping non-chars out.
author: Karl Williamson <khw@cpan.org> 2016-09-12 16:52:41 -0600
committer: Karl Williamson <khw@cpan.org> 2016-09-17 21:10:50 -0600
commit: a82be82b512232b63f28c5865113f7990fb59a3a (patch)
tree: 63e6434ecae9cfe89149e4f043dfdcac39a10434
parent: e23e8bc1957a5981b8a507b62471ae38ec06c661 (diff)
download: perl-a82be82b512232b63f28c5865113f7990fb59a3a.tar.gz
6 files changed, 164 insertions, 2 deletions
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
index c226defc85..b7b7dbd739 100644
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -5334,6 +5334,13 @@ test_isSTRICT_UTF8_CHAR(char *s, STRLEN len)
     OUTPUT:
         RETVAL
 
+STRLEN
+test_isC9_STRICT_UTF8_CHAR(char *s, STRLEN len)
+    CODE:
+        RETVAL = isC9_STRICT_UTF8_CHAR((U8 *) s, (U8 *) s + len);
+    OUTPUT:
+        RETVAL
+
 IV
 test_is_utf8_valid_partial_char_flags(char *s, STRLEN len, U32 flags)
     CODE:
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
index acba778067..6619ecadce 100644
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -423,9 +423,11 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
     }
 
     my $valid_under_strict = 1;
+    my $valid_under_c9strict = 1;
     if ($n > 0x10FFFF) {
         $this_utf8_flags &= ~($UTF8_DISALLOW_SUPER|$UTF8_WARN_SUPER);
         $valid_under_strict = 0;
+        $valid_under_c9strict = 0;
     }
     elsif (($n & 0xFFFE) == 0xFFFE) {
         $this_utf8_flags &= ~($UTF8_DISALLOW_NONCHAR|$UTF8_WARN_NONCHAR);
@@ -491,6 +493,27 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
         diag "The warnings were: " . join(", ", @warnings);
     }
 
+    $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len);
+    $expected_len = ($valid_under_c9strict) ? $len : 0;
+    is($ret, $expected_len, "Verify isC9_STRICT_UTF8_CHAR($display_bytes) returns expected length: $len");
+
+    unless (is(scalar @warnings, 0,
+               "Verify isC9_STRICT_UTF8_CHAR() for $hex_n generated no warnings"))
+    {
+        diag "The warnings were: " . join(", ", @warnings);
+    }
+
+    undef @warnings;
+
+    $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len - 1);
+    is($ret, 0, "Verify isC9_STRICT_UTF8_CHAR() with too short length parameter returns 0");
+
+    unless (is(scalar @warnings, 0,
+               "Verify isC9_STRICT_UTF8_CHAR() generated no warnings"))
+    {
+        diag "The warnings were: " . join(", ", @warnings);
+    }
+
     undef @warnings;
 
     $ret_ref = test_valid_utf8_to_uvchr($bytes);
@@ -769,6 +792,14 @@ foreach my $test (@malformations) {
         diag "The warnings were: " . join(", ", @warnings);
     }
 
+    $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
+    is($ret, 0, "$testname: isC9_STRICT_UTF8_CHAR returns 0");
+    unless (is(scalar @warnings, 0,
+               "$testname: isC9_STRICT_UTF8_CHAR() generated no warnings"))
+    {
+        diag "The warnings were: " . join(", ", @warnings);
+    }
+
     for my $j (1 .. $length - 1) {
         my $partial = substr($bytes, 0, $j);
 
@@ -1294,6 +1325,25 @@ foreach my $test (@tests) {
             diag "The warnings were: " . join(", ", @warnings);
         }
 
+        undef @warnings;
+        $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
+        if ($will_overflow) {
+            is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0");
+        }
+        else {
+            my $expected_ret = (   $testname =~ /surrogate/
+                                || $allowed_uv > 0x10FFFF)
+                               ? 0
+                               : $length;
+            is($ret, $expected_ret,
+               "isC9_STRICT_UTF8_CHAR() $testname: returns expected length: $expected_ret");
+        }
+        unless (is(scalar @warnings, 0,
+                "isC9_STRICT_UTF8_CHAR() $testname: generated no warnings"))
+        {
+            diag "The warnings were: " . join(", ", @warnings);
+        }
+
         # Test partial character handling, for each byte not a full character
         for my $j (1.. $length - 1) {
 
diff --git a/regcharclass.h b/regcharclass.h
index 2daedcd3ac..6f5d14b72c 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -1876,6 +1876,6 @@
  * 5c7eb94310e2aaa15702fd6bed24ff0e7ab5448f9a8231d8c49ca96c9e941089 lib/unicore/mktables
  * cdecb300baad839a6f62791229f551a4fa33f3cbdca08e378dc976466354e778 lib/unicore/version
  * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl
- * e3dc81163da3e92f7be01e9b953f6edb548eba93f1abb3d334e3b0469573c46d regen/regcharclass.pl
+ * 66e20f857451956f9fc7ad7432de972e84fb857885009838878bcf6f91ffbeef regen/regcharclass.pl
  * 393f8d882713a3ba227351ad0f00ea4839fda74fcf77dcd1cdf31519925adba5 regen/regcharclass_multi_char_folds.pl
  * ex: set ro: */
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index abc4942354..f3f8b997f3 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -1704,6 +1704,16 @@ SURROGATE: Surrogate code points
 #0xF0000 - 0xFFFFD
 #0x100000 - 0x10FFFD
 
+#C9_STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, no surrogates
+#=> UTF8 :no_length_checks only_ascii_platform
+#0x0080 - 0xD7FF
+#0xE000 - 0x10FFFF
+#
+#C9_STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points including non-character code points, no surrogates
+#=> UTF8 :no_length_checks only_ebcdic_platform
+#0x00A0 - 0xD7FF
+#0xE000 - 0x10FFFF
+
 QUOTEMETA: Meta-characters that \Q should quote
 => high :fast
 \p{_Perl_Quotemeta}
diff --git a/utf8.h b/utf8.h
index dba65d39dc..4278ebbf2a 100644
--- a/utf8.h
+++ b/utf8.h
@@ -383,6 +383,28 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
     : 0 )                                                                   \
 : 0 )
 
+/*  Similarly,
+        C9_STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code
+                                     points, no surrogates
+	0x0080 - 0xD7FF
+	0xE000 - 0x10FFFF
+*/
+/*** GENERATED CODE ***/
+#define is_C9_STRICT_UTF8_CHAR_utf8_no_length_checks(s)                     \
+( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ?                          \
+    ( LIKELY( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 )                    \
+: ( 0xE0 == ((U8*)s)[0] ) ?                                                 \
+    ( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEC ) || ( ((U8*)s)[0] & 0xFE ) == 0xEE ) ?\
+    ( LIKELY( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xED == ((U8*)s)[0] ) ?                                                 \
+    ( LIKELY( ( ( ((U8*)s)[1] & 0xE0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xF0 == ((U8*)s)[0] ) ?                                                 \
+    ( LIKELY( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+: ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF3 ) ?                          \
+    ( LIKELY( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+: LIKELY( ( ( ( 0xF4 == ((U8*)s)[0] ) && ( ( ((U8*)s)[1] & 0xF0 ) == 0x80 ) ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )
+
 #endif /* EBCDIC vs ASCII */
 
 /* 2**UTF_ACCUMULATION_SHIFT - 1 */
@@ -989,7 +1011,8 @@ be a surrogate nor a non-character code point.  Thus this excludes any code
 point from Perl's extended UTF-8.
 
 This is used to efficiently decide if the next few bytes in C<s> is
-legal Unicode-acceptable UTF-8 for a single character.
+legal Unicode-acceptable UTF-8 for a single character.  Use
+C<L</isC9_STRICT_UTF8_CHAR>> to also accept non-character code points.
 
 =cut
 */
@@ -1003,6 +1026,36 @@ legal Unicode-acceptable UTF-8 for a single character.
         ? 0                                                                 \
         : is_STRICT_UTF8_CHAR_utf8_no_length_checks(s))
 
+/*
+
+=for apidoc Am|STRLEN|isC9_STRICT_UTF8_CHAR|const U8 *s|const U8 *e
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
+Unicode non-surrogate code point; otherwise it evaluates to 0.  If non-zero,
+the value gives how many bytes starting at C<s> comprise the code point's
+representation.
+
+The largest acceptable code point is the Unicode maximum 0x10FFFF.  This
+differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
+code points.  This corresponds to
+L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+which said that non-character code points are merely discouraged rather than
+completely forbidden in open interchange.  See
+L<perlunicode/Noncharacter code points>.
+
+=cut
+*/
+
+#define isC9_STRICT_UTF8_CHAR(s, e)                                         \
+    (UNLIKELY((e) <= (s))                                                   \
+    ? 0                                                                     \
+    : (UTF8_IS_INVARIANT(*s))                                               \
+      ? 1                                                                   \
+      : UNLIKELY(((e) - (s)) < UTF8SKIP(s))                                 \
+        ? 0                                                                 \
+        : is_C9_STRICT_UTF8_CHAR_utf8_no_length_checks(s))
+
 /* Do not use; should be deprecated.  Use isUTF8_CHAR() instead; this is
  * retained solely for backwards compatibility */
 #define IS_UTF8_CHAR(p, n)      (isUTF8_CHAR(p, (p) + (n)) == n)
diff --git a/utfebcdic.h b/utfebcdic.h
index b0e62ca11a..fd247af157 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -371,6 +371,27 @@ explicitly forbidden, and the shortest possible encoding should always be used
     ( LIKELY( ( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( ((U8*)s)[2] & 0xFC ) == 0x70 ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( ((U8*)s)[3] & 0xFC ) == 0x70 ) ) ? 4 : 0 )\
 : ( 0xDF == ((U8*)s)[0] || 0xEA == ((U8*)s)[0] || 0xEC == ((U8*)s)[0] ) ? is_STRICT_UTF8_CHAR_utf8_no_length_checks_part0(s) : is_STRICT_UTF8_CHAR_utf8_no_length_checks_part1(s) )
 
+/*      C9_STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points
+                             including non-character code points, no surrogates
+	0x00A0 - 0xD7FF
+	0xE000 - 0x10FFFF
+*/
+/*** GENERATED CODE ***/
+#define is_C9_STRICT_UTF8_CHAR_utf8_no_length_checks(s)             \
+( ( 0x80 == ((U8*)s)[0] || ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA0 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xAC ) || ( 0xAE <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xB6 ) ) ?\
+    ( LIKELY( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) ? 2 : 0 )\
+: ( ( ( ((U8*)s)[0] & 0xFC ) == 0xB8 ) || ((U8*)s)[0] == 0xBC || ( ( ((U8*)s)[0] & 0xFE ) == 0xBE ) || ( ( ((U8*)s)[0] & 0xEE ) == 0xCA ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xCC ) ) ?\
+    ( LIKELY( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( ((U8*)s)[2] & 0xFC ) == 0x70 ) ) ? 3 : 0 )\
+: ( 0xDC == ((U8*)s)[0] ) ?                                                 \
+    ( LIKELY( ( ( ( 0x57 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( ((U8*)s)[2] & 0xFC ) == 0x70 ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( ((U8*)s)[3] & 0xFC ) == 0x70 ) ) ? 4 : 0 )\
+: ( 0xDD == ((U8*)s)[0] ) ?                                                 \
+    ( LIKELY( ( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x64 ) || ( 0x67 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( ((U8*)s)[2] & 0xFC ) == 0x70 ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( ((U8*)s)[3] & 0xFC ) == 0x70 ) ) ? 4 : 0 )\
+: ( ( ((U8*)s)[0] & 0xFE ) == 0xDE || 0xE1 == ((U8*)s)[0] || ( 0xEA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEC ) ) ?\
+    ( LIKELY( ( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( ((U8*)s)[2] & 0xFC ) == 0x70 ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( ((U8*)s)[3] & 0xFC ) == 0x70 ) ) ? 4 : 0 )\
+: ( 0xED == ((U8*)s)[0] ) ?                                                 \
+    ( LIKELY( ( ( ( ( 0x49 == ((U8*)s)[1] || 0x4A == ((U8*)s)[1] ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( ((U8*)s)[2] & 0xFC ) == 0x70 ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( ((U8*)s)[3] & 0xFC ) == 0x70 ) ) && ( ( 0x41 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x4A ) || ( 0x51 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x59 ) || ( 0x62 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x6A ) || ( ((U8*)s)[4] & 0xFC ) == 0x70 ) ) ? 5 : 0 )\
+: LIKELY( ( ( ( ( 0xEE == ((U8*)s)[0] ) && ( 0x41 == ((U8*)s)[1] || 0x42 == ((U8*)s)[1] ) ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( ((U8*)s)[2] & 0xFC ) == 0x70 ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( ((U8*)s)[3] & 0xFC ) == 0x70 ) ) && ( ( 0x41 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x4A ) || ( 0x51 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x59 ) || ( 0x62 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x6A ) || ( ((U8*)s)[4] & 0xFC ) == 0x70 ) ) ? 5 : 0 )
+
 #endif
 
 #if '^' == 176 /* CP 037 */
@@ -453,6 +474,27 @@ explicitly forbidden, and the shortest possible encoding should always be used
     ( LIKELY( ( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x72 ) ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || 0x5F == ((U8*)s)[2] || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( 0x70 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x72 ) ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || 0x5F == ((U8*)s)[3] || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( 0x70 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x72 ) ) ) ? 4 : 0 )\
 : ( 0xDF == ((U8*)s)[0] || 0xEA == ((U8*)s)[0] || 0xEC == ((U8*)s)[0] ) ? is_STRICT_UTF8_CHAR_utf8_no_length_checks_part0(s) : is_STRICT_UTF8_CHAR_utf8_no_length_checks_part1(s) )
 
+/*      C9_STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points
+                             including non-character code points, no surrogates
+	0x00A0 - 0xD7FF
+	0xE000 - 0x10FFFF
+*/
+/*** GENERATED CODE ***/
+#define is_C9_STRICT_UTF8_CHAR_utf8_no_length_checks(s)             \
+( ( 0x78 == ((U8*)s)[0] || 0x80 == ((U8*)s)[0] || ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA0 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xAF ) || ( 0xB1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xB5 ) ) ?\
+    ( LIKELY( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x72 ) ) ? 2 : 0 )\
+: ( ((U8*)s)[0] == 0xB7 || ( ( ((U8*)s)[0] & 0xFE ) == 0xB8 ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xBC ) || ( ( ((U8*)s)[0] & 0xEE ) == 0xCA ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xCC ) ) ?\
+    ( LIKELY( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x72 ) ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || 0x5F == ((U8*)s)[2] || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( 0x70 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x72 ) ) ) ? 3 : 0 )\
+: ( 0xDC == ((U8*)s)[0] ) ?                                                 \
+    ( LIKELY( ( ( ( 0x57 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x72 ) ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || 0x5F == ((U8*)s)[2] || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( 0x70 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x72 ) ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || 0x5F == ((U8*)s)[3] || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( 0x70 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x72 ) ) ) ? 4 : 0 )\
+: ( 0xDD == ((U8*)s)[0] ) ?                                                 \
+    ( LIKELY( ( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( ((U8*)s)[1] & 0xFE ) == 0x62 || ( 0x66 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x72 ) ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || 0x5F == ((U8*)s)[2] || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( 0x70 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x72 ) ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || 0x5F == ((U8*)s)[3] || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( 0x70 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x72 ) ) ) ? 4 : 0 )\
+: ( ( ((U8*)s)[0] & 0xFE ) == 0xDE || 0xE1 == ((U8*)s)[0] || ( 0xEA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEC ) ) ?\
+    ( LIKELY( ( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x72 ) ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || 0x5F == ((U8*)s)[2] || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( 0x70 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x72 ) ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || 0x5F == ((U8*)s)[3] || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( 0x70 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x72 ) ) ) ? 4 : 0 )\
+: ( 0xED == ((U8*)s)[0] ) ?                                                 \
+    ( LIKELY( ( ( ( ( 0x49 == ((U8*)s)[1] || 0x4A == ((U8*)s)[1] ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x72 ) ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || 0x5F == ((U8*)s)[2] || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( 0x70 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x72 ) ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || 0x5F == ((U8*)s)[3] || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( 0x70 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x72 ) ) ) && ( ( 0x41 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x4A ) || ( 0x51 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x59 ) || 0x5F == ((U8*)s)[4] || ( 0x62 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x6A ) || ( 0x70 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x72 ) ) ) ? 5 : 0 )\
+: LIKELY( ( ( ( ( 0xEE == ((U8*)s)[0] ) && ( 0x41 == ((U8*)s)[1] || 0x42 == ((U8*)s)[1] ) ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || 0x5F == ((U8*)s)[2] || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( 0x70 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x72 ) ) ) && ( ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x4A ) || ( 0x51 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x59 ) || 0x5F == ((U8*)s)[3] || ( 0x62 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x6A ) || ( 0x70 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x72 ) ) ) && ( ( 0x41 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x4A ) || ( 0x51 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x59 ) || 0x5F == ((U8*)s)[4] || ( 0x62 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x6A ) || ( 0x70 <= ((U8*)s)[4] && ((U8*)s)[4] <= 0x72 ) ) ) ? 5 : 0 )
+
 #endif
 
 /* is_UTF8_CHAR_utf8_no_length_checks() in both code pages handles UTF-8 that
author	Karl Williamson <khw@cpan.org>	2016-09-12 16:52:41 -0600
committer	Karl Williamson <khw@cpan.org>	2016-09-17 21:10:50 -0600
commit	a82be82b512232b63f28c5865113f7990fb59a3a (patch)
tree	63e6434ecae9cfe89149e4f043dfdcac39a10434
parent	e23e8bc1957a5981b8a507b62471ae38ec06c661 (diff)
download	perl-a82be82b512232b63f28c5865113f7990fb59a3a.tar.gz