diff options
author | Karl Williamson <khw@cpan.org> | 2016-09-14 18:54:23 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2016-09-17 21:10:50 -0600 |
commit | 25e3a4e08a8b645de44458470ff4f139baf56682 (patch) | |
tree | c1e0fc724804ede3de8493a757c375538d042f9c | |
parent | a82be82b512232b63f28c5865113f7990fb59a3a (diff) | |
download | perl-25e3a4e08a8b645de44458470ff4f139baf56682.tar.gz |
Add isUTF8_CHAR_flags() macro
This is like the previous 2 commits, but the macro takes a flags
parameter so any combination of the disallowed flags may be used. The
others, along with the original isUTF8_CHAR(), are the most commonly
desired strictures, and use an implementation of a, hopefully, inlined
trie for speed. This is for generality and the major portion of its
implementation isn't inlined.
-rw-r--r-- | ext/XS-APItest/APItest.xs | 7 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf8.t | 78 | ||||
-rw-r--r-- | utf8.h | 34 |
3 files changed, 116 insertions, 3 deletions
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs index b7b7dbd739..954bb60b30 100644 --- a/ext/XS-APItest/APItest.xs +++ b/ext/XS-APItest/APItest.xs @@ -5328,6 +5328,13 @@ test_isUTF8_CHAR(char *s, STRLEN len) RETVAL STRLEN +test_isUTF8_CHAR_flags(char *s, STRLEN len, U32 flags) + CODE: + RETVAL = isUTF8_CHAR_flags((U8 *) s, (U8 *) s + len, flags); + OUTPUT: + RETVAL + +STRLEN test_isSTRICT_UTF8_CHAR(char *s, STRLEN len) CODE: RETVAL = isSTRICT_UTF8_CHAR((U8 *) s, (U8 *) s + len); diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t index 6619ecadce..8122534571 100644 --- a/ext/XS-APItest/t/utf8.t +++ b/ext/XS-APItest/t/utf8.t @@ -70,6 +70,10 @@ my $UTF8_WARN_SUPER = 0x0400; my $UTF8_DISALLOW_ABOVE_31_BIT = 0x0800; my $UTF8_WARN_ABOVE_31_BIT = 0x1000; my $UTF8_CHECK_ONLY = 0x2000; +my $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE + = $UTF8_DISALLOW_SUPER|$UTF8_DISALLOW_SURROGATE; +my $UTF8_DISALLOW_ILLEGAL_INTERCHANGE + = $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE|$UTF8_DISALLOW_NONCHAR; # Test uvchr_to_utf8(). my $UNICODE_WARN_SURROGATE = 0x0001; @@ -472,6 +476,28 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) } undef @warnings; + $ret = test_isUTF8_CHAR_flags($bytes, $len, 0); + is($ret, $len, "Verify isUTF8_CHAR_flags($display_bytes, 0) returns expected length: $len"); + + unless (is(scalar @warnings, 0, + "Verify isUTF8_CHAR_flags() for $hex_n generated no warnings")) + { + diag "The warnings were: " . join(", ", @warnings); + } + + undef @warnings; + + $ret = test_isUTF8_CHAR_flags($bytes, $len - 1, 0); + is($ret, 0, "Verify isUTF8_CHAR_flags() with too short length parameter returns 0"); + + unless (is(scalar @warnings, 0, + "Verify isUTF8_CHAR_flags() generated no warnings")) + { + diag "The warnings were: " . join(", ", @warnings); + } + + undef @warnings; + $ret = test_isSTRICT_UTF8_CHAR($bytes, $len); my $expected_len = ($valid_under_strict) ? $len : 0; is($ret, $expected_len, "Verify isSTRICT_UTF8_CHAR($display_bytes) returns expected length: $expected_len"); @@ -493,6 +519,19 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) } diag "The warnings were: " . join(", ", @warnings); } + undef @warnings; + + $ret = test_isUTF8_CHAR_flags($bytes, $len, $UTF8_DISALLOW_ILLEGAL_INTERCHANGE); + is($ret, $expected_len, "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE') acts like isSTRICT_UTF8_CHAR"); + + unless (is(scalar @warnings, 0, + "Verify isUTF8_CHAR() for $hex_n generated no warnings")) + { + diag "The warnings were: " . join(", ", @warnings); + } + + undef @warnings; + $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len); $expected_len = ($valid_under_c9strict) ? $len : 0; is($ret, $expected_len, "Verify isC9_STRICT_UTF8_CHAR($display_bytes) returns expected length: $len"); @@ -516,6 +555,17 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) } undef @warnings; + $ret = test_isUTF8_CHAR_flags($bytes, $len, $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); + is($ret, $expected_len, "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like isC9_STRICT_UTF8_CHAR"); + + unless (is(scalar @warnings, 0, + "Verify isUTF8_CHAR() for $hex_n generated no warnings")) + { + diag "The warnings were: " . join(", ", @warnings); + } + + undef @warnings; + $ret_ref = test_valid_utf8_to_uvchr($bytes); is($ret_ref->[0], $n, "Verify valid_utf8_to_uvchr($display_bytes) returns $hex_n"); is($ret_ref->[1], $len, "Verify valid_utf8_to_uvchr() for $hex_n returns expected length: $len"); @@ -784,6 +834,16 @@ foreach my $test (@malformations) { diag "The warnings were: " . join(", ", @warnings); } + undef @warnings; + + $ret = test_isUTF8_CHAR_flags($bytes, $length, 0); + is($ret, 0, "$testname: isUTF8_CHAR_flags returns 0"); + unless (is(scalar @warnings, 0, + "$testname: isUTF8_CHAR() generated no warnings")) + { + diag "The warnings were: " . join(", ", @warnings); + } + $ret = test_isSTRICT_UTF8_CHAR($bytes, $length); is($ret, 0, "$testname: isSTRICT_UTF8_CHAR returns 0"); unless (is(scalar @warnings, 0, @@ -1293,15 +1353,19 @@ foreach my $test (@tests) { use warnings; undef @warnings; my $ret = test_isUTF8_CHAR($bytes, $length); + my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0); if ($will_overflow) { is($ret, 0, "isUTF8_CHAR() $testname: returns 0"); + is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0"); } else { is($ret, $length, "isUTF8_CHAR() $testname: returns expected length: $length"); + is($ret_flags, $length, + "isUTF8_CHAR_flags(...,0) $testname: returns expected length: $length"); } unless (is(scalar @warnings, 0, - "isUTF8_CHAR() $testname: generated no warnings")) + "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated no warnings")) { diag "The warnings were: " . join(", ", @warnings); } @@ -1318,9 +1382,13 @@ foreach my $test (@tests) { : $length; is($ret, $expected_ret, "isSTRICT_UTF8_CHAR() $testname: returns expected length: $expected_ret"); + $ret = test_isUTF8_CHAR_flags($bytes, $length, + $UTF8_DISALLOW_ILLEGAL_INTERCHANGE); + is($ret, $expected_ret, + "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE') acts like isSTRICT_UTF8_CHAR"); } unless (is(scalar @warnings, 0, - "isSTRICT_UTF8_CHAR() $testname: generated no warnings")) + "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname: generated no warnings")) { diag "The warnings were: " . join(", ", @warnings); } @@ -1337,9 +1405,13 @@ foreach my $test (@tests) { : $length; is($ret, $expected_ret, "isC9_STRICT_UTF8_CHAR() $testname: returns expected length: $expected_ret"); + $ret = test_isUTF8_CHAR_flags($bytes, $length, + $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); + is($ret, $expected_ret, + "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like isC9_STRICT_UTF8_CHAR"); } unless (is(scalar @warnings, 0, - "isC9_STRICT_UTF8_CHAR() $testname: generated no warnings")) + "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname: generated no warnings")) { diag "The warnings were: " . join(", ", @warnings); } @@ -1056,6 +1056,40 @@ L<perlunicode/Noncharacter code points>. ? 0 \ : is_C9_STRICT_UTF8_CHAR_utf8_no_length_checks(s)) +/* + +=for apidoc Am|STRLEN|isUTF8_CHAR_flags|const U8 *s|const U8 *e| const U32 flags + +Evaluates to non-zero if the first few bytes of the string starting at C<s> and +looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl, +that represents some code point, subject to the restrictions given by C<flags>; +otherwise it evaluates to 0. If non-zero, the value gives how many bytes +starting at C<s> comprise the code point's representation. + +If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>; +if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results +as C<L</isSTRICT_UTF8_CHAR>>; +and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives +the same results as C<L</isC9_STRICT_UTF8_CHAR>>. +Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags +understood by C<L</utf8n_to_uvchr>>, with the same meanings. + +The three alternative macros are for the most commonly needed validations; they +are likely to run somewhat faster than this more general one, as they can be +inlined into your code. + +=cut +*/ + +#define isUTF8_CHAR_flags(s, e, flags) \ + (UNLIKELY((e) <= (s)) \ + ? 0 \ + : (UTF8_IS_INVARIANT(*s)) \ + ? 1 \ + : UNLIKELY(((e) - (s)) < UTF8SKIP(s)) \ + ? 0 \ + : _is_utf8_char_helper(s, e, flags)) + /* Do not use; should be deprecated. Use isUTF8_CHAR() instead; this is * retained solely for backwards compatibility */ #define IS_UTF8_CHAR(p, n) (isUTF8_CHAR(p, (p) + (n)) == n) |