From fed3ba5d6b9222e6e73844680734b059e616c86b Mon Sep 17 00:00:00 2001 From: Nicholas Clark Date: Thu, 11 Nov 2010 16:08:43 +0000 Subject: Add Perl_bytes_cmp_utf8() to compare character sequences in different encodings Convert sv_eq_flags() and sv_cmp_flags() to use it. Previously, to compare two strings of characters, where was was in UTF-8, and one was not, you had to either: 1: Upgrade the second to UTF-8 2: Compare the resulting octet sequence 3: Free the temporary UTF-8 string or: 1: Attempt to downgrade the first to bytes. If it can't be, they aren't equal 2: Else compare the resulting octet sequence 3: Free the temporary byte string Which for the general case involves a malloc()/free() and at least two O(n) scans per comparison. Whereas this approach has no allocation, a single O(n) scan, which terminates as early as the best case for the second approach. --- ext/XS-APItest/APItest.xs | 18 ++++++++++++++++++ ext/XS-APItest/t/utf8.t | 27 +++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 ext/XS-APItest/t/utf8.t (limited to 'ext') diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs index 23dd9637c6..285fedffe5 100644 --- a/ext/XS-APItest/APItest.xs +++ b/ext/XS-APItest/APItest.xs @@ -913,6 +913,24 @@ INCLUDE: const-xs.inc INCLUDE: numeric.xs +MODULE = XS::APItest::utf8 PACKAGE = XS::APItest::utf8 + +int +bytes_cmp_utf8(bytes, utf8) + SV *bytes + SV *utf8 + PREINIT: + const U8 *b; + STRLEN blen; + const U8 *u; + STRLEN ulen; + CODE: + b = (const U8 *)SvPVbyte(bytes, blen); + u = (const U8 *)SvPVbyte(utf8, ulen); + RETVAL = bytes_cmp_utf8(b, blen, u, ulen); + OUTPUT: + RETVAL + MODULE = XS::APItest:Overload PACKAGE = XS::APItest::Overload SV * diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t new file mode 100644 index 0000000000..9ad99f27f1 --- /dev/null +++ b/ext/XS-APItest/t/utf8.t @@ -0,0 +1,27 @@ +#!perl -w + +use strict; +use Test::More; + +use XS::APItest; + +foreach ([0, '', '', 'empty'], + [0, 'N', 'N', '1 char'], + [1, 'NN', 'N', '1 char substring'], + [-2, 'Perl', 'Rules', 'different'], + [0, chr 163, chr 163, 'pound sign'], + [1, chr (163) . 10, chr (163) . 1, '10 pounds is more than 1 pound'], + [1, chr(163) . chr(163), chr 163, '2 pound signs are more than 1'], + [-2, ' $!', " \x{1F42B}!", 'Camels are worth more than 1 dollar'], + [-1, '!', "!\x{1F42A}", 'Initial substrings match'], + ) { + my ($expect, $left, $right, $desc) = @$_; + my $copy = $right; + utf8::encode($copy); + is(bytes_cmp_utf8($left, $copy), $expect, $desc); + next if $right =~ tr/\0-\377//c; + utf8::encode($left); + is(bytes_cmp_utf8($right, $left), -$expect, "$desc reversed"); +} + +done_testing; -- cgit v1.2.1