utf8.c: refactor utf8n_to_uvuni()

The prior version had a number of issues, some of which have been taken care of in previous commits. The goal when presented with malformed input is to consume as few bytes as possible, so as to position the input for the next try to the first possible byte that could be the beginning of a character. We don't want to consume too few bytes, so that the next call has us thinking that what is the middle of a character is really the beginning; nor do we want to consume too many, so as to skip valid input characters. (This is forbidden by the Unicode standard because of security considerations.) The previous code could do both of these under various circumstances. In some cases it took as a given that the first byte in a character is correct, and skipped looking at the rest of the bytes in the sequence. This is wrong when just that first byte is garbled. We have to look at all bytes in the expected sequence to make sure it hasn't been prematurely terminated from what we were led to expect by that first byte. Likewise when we get an overflow: we have to keep looking at each byte in the sequence. It may be that the initial byte was garbled, so that it appeared that there was going to be overflow, but in reality, the input was supposed to be a shorter sequence that doesn't overflow. We want to have an error on that shorter sequence, and advance the pointer to just beyond it, which is the first position where a valid character could start. This fixes a long-standing TODO from an externally supplied utf8 decode test suite. And, the old algorithm for finding overflow failed to detect it on some inputs. This was spotted by Hugo van der Sanden, who suggested the new algorithm that this commit uses, and which should work in all instances. For example, on a 32-bit machine, any string beginning with "\xFE" and having the next byte be either "\x86" or "\x87" overflows, but this was missed by the old algorithm.
Another bug was that the code was careless about what happens when a malformation occurs that the input flags allow. For example, a sequence should not start with a continuation byte. If that malformation was allowed, the code pretended it was a start byte and extracted the "length" of the sequence from it. But pretending it is a start byte is not the same thing as it actually being a start byte, and so there is no extractable length in it, so the number that this code thought was "length" was bogus. Yet another bug fixed is that if only the warning subcategories of the utf8 category were turned on, and not the entire utf8 category itself, warnings were not raised that should have been. And yet another change is that given malformed input with warnings turned off, this function used to return whatever it had computed so far, which is incomplete or erroneous garbage. This commit changes to return the REPLACEMENT CHARACTER instead. Thanks to Hugo van der Sanden for reviewing and finding problems with an earlier version of these commits.
author: Karl Williamson <public@khwilliamson.com> 2012-04-18 17:36:01 -0600
committer: Karl Williamson <public@khwilliamson.com> 2012-04-26 11:58:57 -0600
commit: eb83ed87110e41de6a4cd4463f75df60798a9243 (patch)
tree: aa376aad6c2d6923ae6ac97e44feae5b71e47cce /ext
parent: 0b8d30e8ba4bed9219a0a08549fd9d07661587ee (diff)
download: perl-eb83ed87110e41de6a4cd4463f75df60798a9243.tar.gz
2 files changed, 258 insertions, 0 deletions
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
index 34fbfdeb1a..e2d34d92bb 100644
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -1127,6 +1127,41 @@ bytes_cmp_utf8(bytes, utf8)
     OUTPUT:
 	RETVAL
 
+AV *
+test_utf8n_to_uvuni(s, len, flags)
+
+        SV *s
+        SV *len
+        SV *flags
+    PREINIT:
+        STRLEN retlen;
+        UV ret;
+        STRLEN slen;
+
+    CODE:
+        /* Call utf8n_to_uvuni() on the string in <s>, passing <flags> through.
+         * It always asks for the actual length to be returned (in <retlen>).
+         *
+         * <len> is the length to assume <s> is; it is not checked against the
+         * real length of <s> (slen), so could have buffer overflow */
+        RETVAL = newAV();
+        sv_2mortal((SV*)RETVAL);
+
+        ret
+         = utf8n_to_uvuni((U8*) SvPV(s, slen), SvUV(len), &retlen, SvUV(flags));
+
+        /* Returns the return value in [0]; <retlen> in [1], as IV -1 if it is (STRLEN) -1 */
+        av_push(RETVAL, newSVuv(ret));
+        if (retlen == (STRLEN) -1) {
+            av_push(RETVAL, newSViv(-1));
+        }
+        else {
+            av_push(RETVAL, newSVuv(retlen));
+        }
+
+    OUTPUT:
+        RETVAL
+
 MODULE = XS::APItest:Overload	PACKAGE = XS::APItest::Overload
 
 void
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
index 9ad99f27f1..b59fb69212 100644
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -24,4 +24,227 @@ foreach ([0, '', '', 'empty'],
     is(bytes_cmp_utf8($right, $left), -$expect, "$desc reversed");
 }
 
+# Test utf8n_to_uvuni().  These provide essentially complete code coverage.
+
+# Flag bits, copied from utf8.h
+my $UTF8_ALLOW_EMPTY            = 0x0001;
+my $UTF8_ALLOW_CONTINUATION     = 0x0002;
+my $UTF8_ALLOW_NON_CONTINUATION = 0x0004;
+my $UTF8_ALLOW_SHORT            = 0x0008;
+my $UTF8_ALLOW_LONG             = 0x0010;
+my $UTF8_DISALLOW_SURROGATE     = 0x0020;
+my $UTF8_WARN_SURROGATE         = 0x0040;
+my $UTF8_DISALLOW_NONCHAR       = 0x0080;
+my $UTF8_WARN_NONCHAR           = 0x0100;
+my $UTF8_DISALLOW_SUPER         = 0x0200;
+my $UTF8_WARN_SUPER             = 0x0400;
+my $UTF8_DISALLOW_FE_FF         = 0x0800;
+my $UTF8_WARN_FE_FF             = 0x1000;
+my $UTF8_CHECK_ONLY             = 0x2000;
+
+my $REPLACEMENT = 0xFFFD;   # U+FFFD REPLACEMENT CHARACTER
+
+my @warnings;   # Every warning raised is pushed here by the handler below
+
+use warnings 'utf8';
+local $SIG{__WARN__} = sub { push @warnings, @_ };
+
+# First test the malformations.  All these raise category utf8 warnings.
+foreach my $test (
+    [ "zero length string malformation", "", 0,
+        $UTF8_ALLOW_EMPTY, 0, 0,
+        qr/empty string/
+    ],
+    [ "orphan continuation byte malformation", "\x80a", 2,
+        $UTF8_ALLOW_CONTINUATION, $REPLACEMENT, 1,
+        qr/unexpected continuation byte/
+    ],
+    [ "premature next character malformation (immediate)", "\xc2a", 2,
+        $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 1,
+        qr/unexpected non-continuation byte.*immediately after start byte/
+    ],
+    [ "premature next character malformation (non-immediate)", "\xf0\x80a", 3,
+        $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 2,
+        qr/unexpected non-continuation byte .* 2 bytes after start byte/
+    ],
+    [ "too short malformation", "\xf0\x80a", 2,
+        # Having the 'a' after this, but saying there are only 2 bytes also
+        # tests that we pay attention to the passed in length
+        $UTF8_ALLOW_SHORT, $REPLACEMENT, 2,
+        qr/2 bytes, need 4/
+    ],
+    [ "overlong malformation", "\xc1\xaf", 2,
+        $UTF8_ALLOW_LONG, ord('o'), 2,
+        qr/2 bytes, need 1/
+    ],
+    [ "overflow malformation", "\xff\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf", 13,
+        0,  # There is no way to allow this malformation
+        $REPLACEMENT, 13,
+        qr/overflow/
+    ],
+) {
+    my ($testname, $bytes, $length, $allow_flags, $allowed_uv, $expected_len, $message ) = @$test;
+
+    next if ! ok(length($bytes) >= $length, "$testname: Make sure won't read beyond buffer: " . length($bytes) . " >= $length");
+
+    # Test when this malformation is not allowed: returns 0, with one warning
+    undef @warnings;
+    my $ret_ref = test_utf8n_to_uvuni($bytes, $length, 0);
+    is($ret_ref->[0], 0, "$testname: disallowed: Returns 0");
+    is($ret_ref->[1], $expected_len, "$testname: disallowed: Returns expected length");
+    if (is(scalar @warnings, 1, "$testname: disallowed: Got a single warning ")) {
+        like($warnings[0], $message, "$testname: disallowed: Got expected warning");
+    }
+    else {
+        if (scalar @warnings) {
+            note "The warnings were: " . join(", ", @warnings);
+        }
+    }
+
+    {   # Next test when disallowed, and warnings are off: same returns, silent
+        undef @warnings;
+        no warnings 'utf8';
+        my $ret_ref = test_utf8n_to_uvuni($bytes, $length, 0);
+        is($ret_ref->[0], 0, "$testname: disallowed: no warnings 'utf8': Returns 0");
+        is($ret_ref->[1], $expected_len, "$testname: disallowed: no warnings 'utf8': Returns expected length");
+        if (!is(scalar @warnings, 0, "$testname: disallowed: no warnings 'utf8': no warnings generated")) {
+            note "The warnings were: " . join(", ", @warnings);
+        }
+    }
+
+    # Test with CHECK_ONLY: returns 0, a length of -1, and no warnings
+    undef @warnings;
+    $ret_ref = test_utf8n_to_uvuni($bytes, $length, $UTF8_CHECK_ONLY);
+    is($ret_ref->[0], 0, "$testname: CHECK_ONLY: Returns 0");
+    is($ret_ref->[1], -1, "$testname: CHECK_ONLY: returns expected length");
+    if (! is(scalar @warnings, 0, "$testname: CHECK_ONLY: no warnings generated")) {
+        note "The warnings were: " . join(", ", @warnings);
+    }
+
+    next if $allow_flags == 0;    # Skip if can't allow this malformation
+
+    # Test when the malformation is allowed: returns $allowed_uv, no warnings
+    undef @warnings;
+    $ret_ref = test_utf8n_to_uvuni($bytes, $length, $allow_flags);
+    is($ret_ref->[0], $allowed_uv, "$testname: allowed: Returns expected uv");
+    is($ret_ref->[1], $expected_len, "$testname: allowed: Returns expected length");
+    if (!is(scalar @warnings, 0, "$testname: allowed: no warnings generated"))
+    {
+        note "The warnings were: " . join(", ", @warnings);
+    }
+}
+
+my $FF_ret;    # uv expected from the "begins with FF" test below
+
+use Unicode::UCD;
+my $has_quad = ($Unicode::UCD::MAX_CP > 0xFFFF_FFFF);  # presumably: UVs wider than 32 bits
+if ($has_quad) {
+    no warnings qw{portable overflow};  # The literal below needs more than 32 bits
+    $FF_ret = 0x1000000000;
+}
+else {  # The above overflows unless a quad platform
+    $FF_ret = 0;
+}
+
+# Now test the cases where a legal code point is generated, but may or may not
+# be allowed/warned on.
+foreach my $test (
+    [ "surrogate", "\xed\xa4\x8d",
+        $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, 'surrogate', 0xD90D, 3,
+        qr/surrogate/
+    ],
+    [ "non_unicode", "\xf4\x90\x80\x80",
+        $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, 'non_unicode', 0x110000, 4,
+        qr/not Unicode/
+    ],
+    [ "non-character code point", "\xEF\xB7\x90",
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, 'nonchar', 0xFDD0, 3,
+        qr/Unicode non-character.*is illegal for open interchange/
+    ],
+    [ "begins with FE", "\xfe\x82\x80\x80\x80\x80\x80",
+
+        # This code point is chosen so that it is representable in a UV on
+        # 32-bit machines, otherwise we would have to handle it like the FF
+        # ones
+        $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', 0x80000000, 7,
+        qr/Code point beginning with byte .* is not Unicode, and not portable/
+    ],
+    [ "begins with FF", "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
+        $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', $FF_ret, 13,
+        qr/Code point beginning with byte .* is not Unicode, and not portable/
+    ],
+) {
+    my ($testname, $bytes, $warn_flags, $disallow_flags, $category, $allowed_uv, $expected_len, $message ) = @$test;
+
+    my $length = length $bytes;
+
+    # This is more complicated than the malformations tested earlier, as there
+    # are several orthogonal variables involved.  We test all the subclasses
+    # of utf8 warnings to verify they work with and without the utf8 class,
+    # and don't have effects on other subclass warnings
+    foreach my $warning (0, 'utf8', 'surrogate', 'nonchar', 'non_unicode') {
+        foreach my $warn_flag (0, $warn_flags) {
+            foreach my $disallow_flag (0, $disallow_flags) {
+
+                # On 32-bit machines, anything beginning with \xff is not
+                # representable, and would overflow even if we were to allow
+                # them in this test.
+                next if ! $has_quad
+                        && ! $disallow_flag
+                        && substr($bytes, 0, 1) eq "\xff";
+
+                no warnings 'utf8';
+                my $eval_warn = $warning eq 0 ? "no warnings" : "use warnings '$warning'";
+                my $this_name = "$testname: " . (($disallow_flag) ? 'disallowed' : 'allowed');
+                $this_name .= ", $eval_warn";
+                $this_name .= ", " . (($warn_flag) ? 'with warning flag' : 'no warning flag');
+
+                undef @warnings;
+                my $ret_ref;
+                #note __LINE__ . ": $eval_warn; \$ret_ref = test_utf8n_to_uvuni('$bytes', $length, $warn_flag|$disallow_flag)";
+                my $eval_text = "$eval_warn; \$ret_ref = test_utf8n_to_uvuni('$bytes', $length, $warn_flag|$disallow_flag)";
+                eval "$eval_text";  # String eval, so the pragma in $eval_warn governs the call
+                if (! ok ($@ eq '', "$this_name: eval succeeded")) {
+                    note "\$\@='$@'; eval'd=\"$eval_text\"";
+                    next;
+                }
+                if ($disallow_flag) {
+                    is($ret_ref->[0], 0, "$this_name: Returns 0");
+                }
+                else {
+                    is($ret_ref->[0], $allowed_uv, "$this_name: Returns expected uv");
+                }
+                is($ret_ref->[1], $expected_len, "$this_name: Returns expected length");
+
+                if ($warn_flag && ($warning eq 'utf8' || $warning eq $category)) {
+                    if (is(scalar @warnings, 1, "$this_name: Got a single warning ")) {
+                        like($warnings[0], $message, "$this_name: Got expected warning");
+                    }
+                    else {
+                        if (scalar @warnings) {
+                            note "The warnings were: " . join(", ", @warnings);
+                        }
+                    }
+                }
+                else {
+                    if (!is(scalar @warnings, 0, "$this_name: No warnings generated"))
+                    {
+                        note "The warnings were: " . join(", ", @warnings);
+                    }
+                }
+
+                if ($disallow_flag) {
+                    undef @warnings;
+                    $ret_ref = test_utf8n_to_uvuni($bytes, $length, $disallow_flag|$UTF8_CHECK_ONLY);
+                    is($ret_ref->[0], 0, "$this_name, CHECK_ONLY: Returns 0");
+                    is($ret_ref->[1], -1, "$this_name: CHECK_ONLY: returns expected length");
+                    if (! is(scalar @warnings, 0, "$this_name, CHECK_ONLY: no warnings generated")) {
+                        note "The warnings were: " . join(", ", @warnings);
+                    }
+                }
+            }
+        }
+    }
+}
+
 done_testing;
author	Karl Williamson <public@khwilliamson.com>	2012-04-18 17:36:01 -0600
committer	Karl Williamson <public@khwilliamson.com>	2012-04-26 11:58:57 -0600
commit	eb83ed87110e41de6a4cd4463f75df60798a9243 (patch)
tree	aa376aad6c2d6923ae6ac97e44feae5b71e47cce /ext
parent	0b8d30e8ba4bed9219a0a08549fd9d07661587ee (diff)
download	perl-eb83ed87110e41de6a4cd4463f75df60798a9243.tar.gz