summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2016-09-14 18:54:23 -0600
committerKarl Williamson <khw@cpan.org>2016-09-17 21:10:50 -0600
commit25e3a4e08a8b645de44458470ff4f139baf56682 (patch)
treec1e0fc724804ede3de8493a757c375538d042f9c
parenta82be82b512232b63f28c5865113f7990fb59a3a (diff)
downloadperl-25e3a4e08a8b645de44458470ff4f139baf56682.tar.gz
Add isUTF8_CHAR_flags() macro
This is like the previous 2 commits, but the macro takes a flags parameter so any combination of the disallowed flags may be used. The others, along with the original isUTF8_CHAR(), are the most commonly desired strictures, and use an implementation of a, hopefully, inlined trie for speed. This is for generality and the major portion of its implementation isn't inlined.
-rw-r--r--ext/XS-APItest/APItest.xs7
-rw-r--r--ext/XS-APItest/t/utf8.t78
-rw-r--r--utf8.h34
3 files changed, 116 insertions, 3 deletions
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
index b7b7dbd739..954bb60b30 100644
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -5328,6 +5328,13 @@ test_isUTF8_CHAR(char *s, STRLEN len)
RETVAL
STRLEN
+test_isUTF8_CHAR_flags(char *s, STRLEN len, U32 flags)
+ CODE:
+ RETVAL = isUTF8_CHAR_flags((U8 *) s, (U8 *) s + len, flags);
+ OUTPUT:
+ RETVAL
+
+STRLEN
test_isSTRICT_UTF8_CHAR(char *s, STRLEN len)
CODE:
RETVAL = isSTRICT_UTF8_CHAR((U8 *) s, (U8 *) s + len);
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
index 6619ecadce..8122534571 100644
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -70,6 +70,10 @@ my $UTF8_WARN_SUPER = 0x0400;
my $UTF8_DISALLOW_ABOVE_31_BIT = 0x0800;
my $UTF8_WARN_ABOVE_31_BIT = 0x1000;
my $UTF8_CHECK_ONLY = 0x2000;
+my $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
+ = $UTF8_DISALLOW_SUPER|$UTF8_DISALLOW_SURROGATE;
+my $UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+ = $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE|$UTF8_DISALLOW_NONCHAR;
# Test uvchr_to_utf8().
my $UNICODE_WARN_SURROGATE = 0x0001;
@@ -472,6 +476,28 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
undef @warnings;
+ $ret = test_isUTF8_CHAR_flags($bytes, $len, 0);
+ is($ret, $len, "Verify isUTF8_CHAR_flags($display_bytes, 0) returns expected length: $len");
+
+ unless (is(scalar @warnings, 0,
+ "Verify isUTF8_CHAR_flags() for $hex_n generated no warnings"))
+ {
+ diag "The warnings were: " . join(", ", @warnings);
+ }
+
+ undef @warnings;
+
+ $ret = test_isUTF8_CHAR_flags($bytes, $len - 1, 0);
+ is($ret, 0, "Verify isUTF8_CHAR_flags() with too short length parameter returns 0");
+
+ unless (is(scalar @warnings, 0,
+ "Verify isUTF8_CHAR_flags() generated no warnings"))
+ {
+ diag "The warnings were: " . join(", ", @warnings);
+ }
+
+ undef @warnings;
+
$ret = test_isSTRICT_UTF8_CHAR($bytes, $len);
my $expected_len = ($valid_under_strict) ? $len : 0;
is($ret, $expected_len, "Verify isSTRICT_UTF8_CHAR($display_bytes) returns expected length: $expected_len");
@@ -493,6 +519,19 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
diag "The warnings were: " . join(", ", @warnings);
}
+ undef @warnings;
+
+ $ret = test_isUTF8_CHAR_flags($bytes, $len, $UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
+ is($ret, $expected_len, "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE') acts like isSTRICT_UTF8_CHAR");
+
+ unless (is(scalar @warnings, 0,
+ "Verify isUTF8_CHAR() for $hex_n generated no warnings"))
+ {
+ diag "The warnings were: " . join(", ", @warnings);
+ }
+
+ undef @warnings;
+
$ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len);
$expected_len = ($valid_under_c9strict) ? $len : 0;
is($ret, $expected_len, "Verify isC9_STRICT_UTF8_CHAR($display_bytes) returns expected length: $len");
@@ -516,6 +555,17 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
undef @warnings;
+ $ret = test_isUTF8_CHAR_flags($bytes, $len, $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
+ is($ret, $expected_len, "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like isC9_STRICT_UTF8_CHAR");
+
+ unless (is(scalar @warnings, 0,
+ "Verify isUTF8_CHAR() for $hex_n generated no warnings"))
+ {
+ diag "The warnings were: " . join(", ", @warnings);
+ }
+
+ undef @warnings;
+
$ret_ref = test_valid_utf8_to_uvchr($bytes);
is($ret_ref->[0], $n, "Verify valid_utf8_to_uvchr($display_bytes) returns $hex_n");
is($ret_ref->[1], $len, "Verify valid_utf8_to_uvchr() for $hex_n returns expected length: $len");
@@ -784,6 +834,16 @@ foreach my $test (@malformations) {
diag "The warnings were: " . join(", ", @warnings);
}
+ undef @warnings;
+
+ $ret = test_isUTF8_CHAR_flags($bytes, $length, 0);
+ is($ret, 0, "$testname: isUTF8_CHAR_flags returns 0");
+ unless (is(scalar @warnings, 0,
+ "$testname: isUTF8_CHAR() generated no warnings"))
+ {
+ diag "The warnings were: " . join(", ", @warnings);
+ }
+
$ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
is($ret, 0, "$testname: isSTRICT_UTF8_CHAR returns 0");
unless (is(scalar @warnings, 0,
@@ -1293,15 +1353,19 @@ foreach my $test (@tests) {
use warnings;
undef @warnings;
my $ret = test_isUTF8_CHAR($bytes, $length);
+ my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
if ($will_overflow) {
is($ret, 0, "isUTF8_CHAR() $testname: returns 0");
+ is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0");
}
else {
is($ret, $length,
"isUTF8_CHAR() $testname: returns expected length: $length");
+ is($ret_flags, $length,
+ "isUTF8_CHAR_flags(...,0) $testname: returns expected length: $length");
}
unless (is(scalar @warnings, 0,
- "isUTF8_CHAR() $testname: generated no warnings"))
+ "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated no warnings"))
{
diag "The warnings were: " . join(", ", @warnings);
}
@@ -1318,9 +1382,13 @@ foreach my $test (@tests) {
: $length;
is($ret, $expected_ret,
"isSTRICT_UTF8_CHAR() $testname: returns expected length: $expected_ret");
+ $ret = test_isUTF8_CHAR_flags($bytes, $length,
+ $UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
+ is($ret, $expected_ret,
+ "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE') acts like isSTRICT_UTF8_CHAR");
}
unless (is(scalar @warnings, 0,
- "isSTRICT_UTF8_CHAR() $testname: generated no warnings"))
+ "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname: generated no warnings"))
{
diag "The warnings were: " . join(", ", @warnings);
}
@@ -1337,9 +1405,13 @@ foreach my $test (@tests) {
: $length;
is($ret, $expected_ret,
"isC9_STRICT_UTF8_CHAR() $testname: returns expected length: $expected_ret");
+ $ret = test_isUTF8_CHAR_flags($bytes, $length,
+ $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
+ is($ret, $expected_ret,
+ "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like isC9_STRICT_UTF8_CHAR");
}
unless (is(scalar @warnings, 0,
- "isC9_STRICT_UTF8_CHAR() $testname: generated no warnings"))
+ "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname: generated no warnings"))
{
diag "The warnings were: " . join(", ", @warnings);
}
diff --git a/utf8.h b/utf8.h
index 4278ebbf2a..392a86a560 100644
--- a/utf8.h
+++ b/utf8.h
@@ -1056,6 +1056,40 @@ L<perlunicode/Noncharacter code points>.
? 0 \
: is_C9_STRICT_UTF8_CHAR_utf8_no_length_checks(s))
+/*
+
+=for apidoc Am|STRLEN|isUTF8_CHAR_flags|const U8 *s|const U8 *e| const U32 flags
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
+that represents some code point, subject to the restrictions given by C<flags>;
+otherwise it evaluates to 0. If non-zero, the value gives how many bytes
+starting at C<s> comprise the code point's representation.
+
+If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
+if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
+as C<L</isSTRICT_UTF8_CHAR>>;
+and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
+the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
+Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
+understood by C<L</utf8n_to_uvchr>>, with the same meanings.
+
+The three alternative macros are for the most commonly needed validations; they
+are likely to run somewhat faster than this more general one, as they can be
+inlined into your code.
+
+=cut
+*/
+
+#define isUTF8_CHAR_flags(s, e, flags) \
+ (UNLIKELY((e) <= (s)) \
+ ? 0 \
+ : (UTF8_IS_INVARIANT(*s)) \
+ ? 1 \
+ : UNLIKELY(((e) - (s)) < UTF8SKIP(s)) \
+ ? 0 \
+ : _is_utf8_char_helper(s, e, flags))
+
/* Do not use; should be deprecated. Use isUTF8_CHAR() instead; this is
* retained solely for backwards compatibility */
#define IS_UTF8_CHAR(p, n) (isUTF8_CHAR(p, (p) + (n)) == n)