From fed3ba5d6b9222e6e73844680734b059e616c86b Mon Sep 17 00:00:00 2001
From: Nicholas Clark <nick@ccl4.org>
Date: Thu, 11 Nov 2010 16:08:43 +0000
Subject: Add Perl_bytes_cmp_utf8() to compare character sequences in different
 encodings

Convert sv_eq_flags() and sv_cmp_flags() to use it.

Previously, to compare two strings of characters, where was was in UTF-8, and
one was not, you had to either:

1: Upgrade the second to UTF-8
2: Compare the resulting octet sequence
3: Free the temporary UTF-8 string

or:

1: Attempt to downgrade the first to bytes. If it can't be, they aren't equal
2: Else compare the resulting octet sequence
3: Free the temporary byte string

Which for the general case involves a malloc()/free() and at least two O(n)
scans per comparison.

Whereas this approach has no allocation, a single O(n) scan, which terminates
as early as the best case for the second approach.
---
 ext/XS-APItest/APItest.xs | 18 ++++++++++++++++++
 ext/XS-APItest/t/utf8.t   | 27 +++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)
 create mode 100644 ext/XS-APItest/t/utf8.t

(limited to 'ext')

diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
index 23dd9637c6..285fedffe5 100644
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -913,6 +913,24 @@ INCLUDE: const-xs.inc
 
 INCLUDE: numeric.xs
 
+MODULE = XS::APItest::utf8	PACKAGE = XS::APItest::utf8
+
+int
+bytes_cmp_utf8(bytes, utf8)
+	SV *bytes
+	SV *utf8
+    PREINIT:
+	const U8 *b;
+	STRLEN blen;
+	const U8 *u;
+	STRLEN ulen;
+    CODE:
+	b = (const U8 *)SvPVbyte(bytes, blen);
+	u = (const U8 *)SvPVbyte(utf8, ulen);
+	RETVAL = bytes_cmp_utf8(b, blen, u, ulen);
+    OUTPUT:
+	RETVAL
+
 MODULE = XS::APItest:Overload	PACKAGE = XS::APItest::Overload
 
 SV *
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
new file mode 100644
index 0000000000..9ad99f27f1
--- /dev/null
+++ b/ext/XS-APItest/t/utf8.t
@@ -0,0 +1,27 @@
+#!perl -w
+
+use strict;
+use Test::More;
+
+use XS::APItest;
+
+foreach ([0, '', '', 'empty'],
+	 [0, 'N', 'N', '1 char'],
+	 [1, 'NN', 'N', '1 char substring'],
+	 [-2, 'Perl', 'Rules', 'different'],
+	 [0, chr 163, chr 163, 'pound sign'],
+	 [1, chr (163) . 10, chr (163) . 1, '10 pounds is more than 1 pound'],
+	 [1, chr(163) . chr(163), chr 163, '2 pound signs are more than 1'],
+	 [-2, ' $!', " \x{1F42B}!", 'Camels are worth more than 1 dollar'],
+	 [-1, '!', "!\x{1F42A}", 'Initial substrings match'],
+	) {
+    my ($expect, $left, $right, $desc) = @$_;
+    my $copy = $right;
+    utf8::encode($copy);
+    is(bytes_cmp_utf8($left, $copy), $expect, $desc);
+    next if $right =~ tr/\0-\377//c;
+    utf8::encode($left);
+    is(bytes_cmp_utf8($right, $left), -$expect, "$desc reversed");
+}
+
+done_testing;
-- 
cgit v1.2.1