diff options
author | Nicholas Clark <nick@ccl4.org> | 2010-11-11 16:08:43 +0000 |
---|---|---|
committer | Nicholas Clark <nick@ccl4.org> | 2010-11-11 16:08:43 +0000 |
commit | fed3ba5d6b9222e6e73844680734b059e616c86b (patch) | |
tree | c8a449308b28520170011d015883c39c887fb9e8 | |
parent | 08a6f934b8306af074a22b05f6de14f564a9da18 (diff) | |
download | perl-fed3ba5d6b9222e6e73844680734b059e616c86b.tar.gz |
Add Perl_bytes_cmp_utf8() to compare character sequences in different encodings
Convert sv_eq_flags() and sv_cmp_flags() to use it.
Previously, to compare two strings of characters, where was was in UTF-8, and
one was not, you had to either:
1: Upgrade the second to UTF-8
2: Compare the resulting octet sequence
3: Free the temporary UTF-8 string
or:
1: Attempt to downgrade the first to bytes. If it can't be, they aren't equal
2: Else compare the resulting octet sequence
3: Free the temporary byte string
Which for the general case involves a malloc()/free() and at least two O(n)
scans per comparison.
Whereas this approach has no allocation, a single O(n) scan, which terminates
as early as the best case for the second approach.
-rw-r--r-- | MANIFEST | 1 | ||||
-rw-r--r-- | embed.fnc | 2 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | ext/XS-APItest/APItest.xs | 18 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf8.t | 27 | ||||
-rw-r--r-- | global.sym | 1 | ||||
-rw-r--r-- | proto.h | 6 | ||||
-rw-r--r-- | sv.c | 33 | ||||
-rw-r--r-- | t/porting/diag.t | 5 | ||||
-rw-r--r-- | utf8.c | 69 |
10 files changed, 142 insertions, 21 deletions
@@ -3461,6 +3461,7 @@ ext/XS-APItest/t/swaplabel.t test recursive descent label parsing ext/XS-APItest/t/swaptwostmts.t test recursive descent statement parsing ext/XS-APItest/t/temp_lv_sub.t XS::APItest: tests for lvalue subs returning temps ext/XS-APItest/t/utf16_to_utf8.t Test behaviour of utf16_to_utf8{,reversed} +ext/XS-APItest/t/utf8.t Tests for code in utf8.c ext/XS-APItest/t/xs_special_subs_require.t for require too ext/XS-APItest/t/xs_special_subs.t Test that XS BEGIN/CHECK/INIT/END work ext/XS-APItest/t/xsub_h.t Tests for XSUB.h @@ -1292,6 +1292,8 @@ AdpPR |STRLEN |utf8_length |NN const U8* s|NN const U8 *e ApdPR |IV |utf8_distance |NN const U8 *a|NN const U8 *b ApdPR |U8* |utf8_hop |NN const U8 *s|I32 off ApMd |U8* |utf8_to_bytes |NN U8 *s|NN STRLEN *len +Apd |int |bytes_cmp_utf8 |NN const U8 *b|STRLEN blen|NN const U8 *u \ + |STRLEN ulen ApMd |U8* |bytes_from_utf8|NN const U8 *s|NN STRLEN *len|NULLOK bool *is_utf8 ApMd |U8* |bytes_to_utf8 |NN const U8 *s|NN STRLEN *len Apd |UV |utf8_to_uvchr |NN const U8 *s|NULLOK STRLEN *retlen @@ -47,6 +47,7 @@ #define av_undef(a) Perl_av_undef(aTHX_ a) #define av_unshift(a,b) Perl_av_unshift(aTHX_ a,b) #define block_gimme() Perl_block_gimme(aTHX) +#define bytes_cmp_utf8(a,b,c,d) Perl_bytes_cmp_utf8(aTHX_ a,b,c,d) #define bytes_from_utf8(a,b,c) Perl_bytes_from_utf8(aTHX_ a,b,c) #define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b) #define call_argv(a,b,c) Perl_call_argv(aTHX_ a,b,c) diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs index 23dd9637c6..285fedffe5 100644 --- a/ext/XS-APItest/APItest.xs +++ b/ext/XS-APItest/APItest.xs @@ -913,6 +913,24 @@ INCLUDE: const-xs.inc INCLUDE: numeric.xs +MODULE = XS::APItest::utf8 PACKAGE = XS::APItest::utf8 + +int +bytes_cmp_utf8(bytes, utf8) + SV *bytes + SV *utf8 + PREINIT: + const U8 *b; + STRLEN blen; + const U8 *u; + STRLEN ulen; + CODE: + b = (const U8 *)SvPVbyte(bytes, blen); + u = (const U8 *)SvPVbyte(utf8, ulen); + RETVAL = bytes_cmp_utf8(b, blen, u, ulen); + OUTPUT: + RETVAL + MODULE = XS::APItest:Overload PACKAGE = XS::APItest::Overload SV * diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t new file mode 100644 index 0000000000..9ad99f27f1 --- /dev/null +++ b/ext/XS-APItest/t/utf8.t @@ -0,0 +1,27 @@ +#!perl -w + +use strict; +use Test::More; + +use XS::APItest; + +foreach ([0, '', '', 'empty'], + [0, 'N', 'N', '1 char'], + [1, 'NN', 'N', '1 char substring'], + [-2, 'Perl', 'Rules', 'different'], + [0, chr 163, chr 163, 'pound sign'], + [1, chr (163) . 10, chr (163) . 1, '10 pounds is more than 1 pound'], + [1, chr(163) . chr(163), chr 163, '2 pound signs are more than 1'], + [-2, ' $!', " \x{1F42B}!", 'Camels are worth more than 1 dollar'], + [-1, '!', "!\x{1F42A}", 'Initial substrings match'], + ) { + my ($expect, $left, $right, $desc) = @$_; + my $copy = $right; + utf8::encode($copy); + is(bytes_cmp_utf8($left, $copy), $expect, $desc); + next if $right =~ tr/\0-\377//c; + utf8::encode($left); + is(bytes_cmp_utf8($right, $left), -$expect, "$desc reversed"); +} + +done_testing; diff --git a/global.sym b/global.sym index 95a232b362..fbfa98bd1e 100644 --- a/global.sym +++ b/global.sym @@ -42,6 +42,7 @@ Perl_av_undef Perl_av_unshift Perl_block_gimme Perl_blockhook_register +Perl_bytes_cmp_utf8 Perl_bytes_from_utf8 Perl_bytes_to_utf8 Perl_call_argv @@ -187,6 +187,12 @@ PERL_CALLCONV void Perl_blockhook_register(pTHX_ BHK *hk) PERL_CALLCONV void Perl_boot_core_PerlIO(pTHX); PERL_CALLCONV void Perl_boot_core_UNIVERSAL(pTHX); PERL_CALLCONV void Perl_boot_core_mro(pTHX); +PERL_CALLCONV int Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen) + __attribute__nonnull__(pTHX_1) + __attribute__nonnull__(pTHX_3); +#define PERL_ARGS_ASSERT_BYTES_CMP_UTF8 \ + assert(b); assert(u) + PERL_CALLCONV U8* Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2); @@ -7044,28 +7044,15 @@ Perl_sv_eq_flags(pTHX_ register SV *sv1, register SV *sv2, const U32 flags) } } else { - bool is_utf8 = TRUE; - if (SvUTF8(sv1)) { - /* sv1 is the UTF-8 one, - * if is equal it must be downgrade-able */ - char * const pv = (char*)bytes_from_utf8((const U8*)pv1, - &cur1, &is_utf8); - if (pv != pv1) - pv1 = tpv = pv; + /* sv1 is the UTF-8 one */ + return bytes_cmp_utf8((const U8*)pv2, cur2, + (const U8*)pv1, cur1) == 0; } else { - /* sv2 is the UTF-8 one, - * if is equal it must be downgrade-able */ - char * const pv = (char *)bytes_from_utf8((const U8*)pv2, - &cur2, &is_utf8); - if (pv != pv2) - pv2 = tpv = pv; - } - if (is_utf8) { - /* Downgrade not possible - cannot be eq */ - assert (tpv == 0); - return FALSE; + /* sv2 is the UTF-8 one */ + return bytes_cmp_utf8((const U8*)pv1, cur1, + (const U8*)pv2, cur2) == 0; } } } @@ -7140,7 +7127,9 @@ Perl_sv_cmp_flags(pTHX_ register SV *const sv1, register SV *const sv2, pv2 = SvPV_const(svrecode, cur2); } else { - pv2 = tpv = (char*)bytes_to_utf8((const U8*)pv2, &cur2); + const int retval = -bytes_cmp_utf8((const U8*)pv2, cur2, + (const U8*)pv1, cur1); + return retval ? retval < 0 ? -1 : +1 : 0; } } else { @@ -7150,7 +7139,9 @@ Perl_sv_cmp_flags(pTHX_ register SV *const sv1, register SV *const sv2, pv1 = SvPV_const(svrecode, cur1); } else { - pv1 = tpv = (char*)bytes_to_utf8((const U8*)pv1, &cur1); + const int retval = bytes_cmp_utf8((const U8*)pv1, cur1, + (const U8*)pv2, cur2); + return retval ? retval < 0 ? -1 : +1 : 0; } } } diff --git a/t/porting/diag.t b/t/porting/diag.t index 2978a5b9a0..073156a317 100644 --- a/t/porting/diag.t +++ b/t/porting/diag.t @@ -275,7 +275,12 @@ sub check_file { # # PLEASE DO NOT ADD TO THIS LIST. Instead, write an entry in # pod/perldiag.pod for your new (warning|error). + +# Also FIXME this test, as the first entry in TODO *is* covered by the +# description: Malformed UTF-8 character (%s) __DATA__ +Malformed UTF-8 character (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x) + %s (%d) does not match %s (%d), %s (%d) smaller than %s (%d), Argument "%s" isn't numeric @@ -805,6 +805,75 @@ Perl_utf8_hop(pTHX_ const U8 *s, I32 off) } /* +=for apidoc bytes_cmp_utf8 + +Compares the sequence of characters (stored as octets) in b, blen with the +sequence of characters (stored as UTF-8) in u, ulen. Returns 0 if they are +equal, -1 or -2 if the first string is less than the second string, +1 or +2 +if the first string is greater than the second string. + +-1 or +1 is returned if the shorter string was identical to the start of the +longer string. -2 or +2 is returned if the was a difference between characters +within the strings. + +=cut +*/ + +int +Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen) +{ + const U8 *const bend = b + blen; + const U8 *const uend = u + ulen; + + PERL_ARGS_ASSERT_BYTES_CMP_UTF8; + + PERL_UNUSED_CONTEXT; + + while (b < bend && u < uend) { + U8 c = *u++; + if (!UTF8_IS_INVARIANT(c)) { + if (UTF8_IS_DOWNGRADEABLE_START(c)) { + if (u < uend) { + U8 c1 = *u++; + if (UTF8_IS_CONTINUATION(c1)) { + c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), c1); + c = ASCII_TO_NATIVE(c); + } else { + Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), + "Malformed UTF-8 character " + "(unexpected non-continuation byte 0x%02x" + ", immediately after start byte 0x%02x)" + /* Dear diag.t, it's in the pod. */ + "%s%s", c1, c, + PL_op ? " in " : "", + PL_op ? OP_DESC(PL_op) : ""); + return -2; + } + } else { + if (PL_op) + Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), + "%s in %s", unees, OP_DESC(PL_op)); + else + Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees); + return -2; /* Really want to return undef :-) */ + } + } else { + return -2; + } + } + if (*b != c) { + return *b < c ? -2 : +2; + } + ++b; + } + + if (b == bend && u == uend) + return 0; + + return b < bend ? +1 : -1; +} + +/* =for apidoc utf8_to_bytes Converts a string C<s> of length C<len> from UTF-8 into native byte encoding. |