Add Perl_bytes_cmp_utf8() to compare character sequences in different encodings

Convert sv_eq_flags() and sv_cmp_flags() to use it. Previously, to compare two strings of characters, where was was in UTF-8, and one was not, you had to either: 1: Upgrade the second to UTF-8 2: Compare the resulting octet sequence 3: Free the temporary UTF-8 string or: 1: Attempt to downgrade the first to bytes. If it can't be, they aren't equal 2: Else compare the resulting octet sequence 3: Free the temporary byte string Which for the general case involves a malloc()/free() and at least two O(n) scans per comparison. Whereas this approach has no allocation, a single O(n) scan, which terminates as early as the best case for the second approach.
author: Nicholas Clark <nick@ccl4.org> 2010-11-11 16:08:43 +0000
committer: Nicholas Clark <nick@ccl4.org> 2010-11-11 16:08:43 +0000
commit: fed3ba5d6b9222e6e73844680734b059e616c86b (patch)
tree: c8a449308b28520170011d015883c39c887fb9e8
parent: 08a6f934b8306af074a22b05f6de14f564a9da18 (diff)
download: perl-fed3ba5d6b9222e6e73844680734b059e616c86b.tar.gz
10 files changed, 142 insertions, 21 deletions
diff --git a/MANIFEST b/MANIFEST
index 9c4fe13eba..06ae5e4ff1 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -3461,6 +3461,7 @@ ext/XS-APItest/t/swaplabel.t	test recursive descent label parsing
 ext/XS-APItest/t/swaptwostmts.t	test recursive descent statement parsing
 ext/XS-APItest/t/temp_lv_sub.t	XS::APItest: tests for lvalue subs returning temps
 ext/XS-APItest/t/utf16_to_utf8.t	Test behaviour of utf16_to_utf8{,reversed}
+ext/XS-APItest/t/utf8.t		Tests for code in utf8.c
 ext/XS-APItest/t/xs_special_subs_require.t	for require too
 ext/XS-APItest/t/xs_special_subs.t	Test that XS BEGIN/CHECK/INIT/END work
 ext/XS-APItest/t/xsub_h.t	Tests for XSUB.h
diff --git a/embed.fnc b/embed.fnc
index 3baf32ef8b..6e3434bdbf 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1292,6 +1292,8 @@ AdpPR	|STRLEN	|utf8_length	|NN const U8* s|NN const U8 *e
 ApdPR	|IV	|utf8_distance	|NN const U8 *a|NN const U8 *b
 ApdPR	|U8*	|utf8_hop	|NN const U8 *s|I32 off
 ApMd	|U8*	|utf8_to_bytes	|NN U8 *s|NN STRLEN *len
+Apd	|int	|bytes_cmp_utf8	|NN const U8 *b|STRLEN blen|NN const U8 *u \
+				|STRLEN ulen
 ApMd	|U8*	|bytes_from_utf8|NN const U8 *s|NN STRLEN *len|NULLOK bool *is_utf8
 ApMd	|U8*	|bytes_to_utf8	|NN const U8 *s|NN STRLEN *len
 Apd	|UV	|utf8_to_uvchr	|NN const U8 *s|NULLOK STRLEN *retlen
diff --git a/embed.h b/embed.h
index b743761f7a..0d70e87f9c 100644
--- a/embed.h
+++ b/embed.h
@@ -47,6 +47,7 @@
 #define av_undef(a)		Perl_av_undef(aTHX_ a)
 #define av_unshift(a,b)		Perl_av_unshift(aTHX_ a,b)
 #define block_gimme()		Perl_block_gimme(aTHX)
+#define bytes_cmp_utf8(a,b,c,d)	Perl_bytes_cmp_utf8(aTHX_ a,b,c,d)
 #define bytes_from_utf8(a,b,c)	Perl_bytes_from_utf8(aTHX_ a,b,c)
 #define bytes_to_utf8(a,b)	Perl_bytes_to_utf8(aTHX_ a,b)
 #define call_argv(a,b,c)	Perl_call_argv(aTHX_ a,b,c)
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
index 23dd9637c6..285fedffe5 100644
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -913,6 +913,24 @@ INCLUDE: const-xs.inc
 
 INCLUDE: numeric.xs
 
+MODULE = XS::APItest::utf8	PACKAGE = XS::APItest::utf8
+
+int
+bytes_cmp_utf8(bytes, utf8)
+	SV *bytes
+	SV *utf8
+    PREINIT:
+	const U8 *b;
+	STRLEN blen;
+	const U8 *u;
+	STRLEN ulen;
+    CODE:
+	b = (const U8 *)SvPVbyte(bytes, blen);
+	u = (const U8 *)SvPVbyte(utf8, ulen);
+	RETVAL = bytes_cmp_utf8(b, blen, u, ulen);
+    OUTPUT:
+	RETVAL
+
 MODULE = XS::APItest:Overload	PACKAGE = XS::APItest::Overload
 
 SV *
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
new file mode 100644
index 0000000000..9ad99f27f1
--- /dev/null
+++ b/ext/XS-APItest/t/utf8.t
@@ -0,0 +1,27 @@
+#!perl -w
+
+use strict;
+use Test::More;
+
+use XS::APItest;
+
+foreach ([0, '', '', 'empty'],
+	 [0, 'N', 'N', '1 char'],
+	 [1, 'NN', 'N', '1 char substring'],
+	 [-2, 'Perl', 'Rules', 'different'],
+	 [0, chr 163, chr 163, 'pound sign'],
+	 [1, chr (163) . 10, chr (163) . 1, '10 pounds is more than 1 pound'],
+	 [1, chr(163) . chr(163), chr 163, '2 pound signs are more than 1'],
+	 [-2, ' $!', " \x{1F42B}!", 'Camels are worth more than 1 dollar'],
+	 [-1, '!', "!\x{1F42A}", 'Initial substrings match'],
+	) {
+    my ($expect, $left, $right, $desc) = @$_;
+    my $copy = $right;
+    utf8::encode($copy);
+    is(bytes_cmp_utf8($left, $copy), $expect, $desc);
+    next if $right =~ tr/\0-\377//c;
+    utf8::encode($left);
+    is(bytes_cmp_utf8($right, $left), -$expect, "$desc reversed");
+}
+
+done_testing;
diff --git a/global.sym b/global.sym
index 95a232b362..fbfa98bd1e 100644
--- a/global.sym
+++ b/global.sym
@@ -42,6 +42,7 @@ Perl_av_undef
 Perl_av_unshift
 Perl_block_gimme
 Perl_blockhook_register
+Perl_bytes_cmp_utf8
 Perl_bytes_from_utf8
 Perl_bytes_to_utf8
 Perl_call_argv
diff --git a/proto.h b/proto.h
index 2b0905a09c..186c2a67c3 100644
--- a/proto.h
+++ b/proto.h
@@ -187,6 +187,12 @@ PERL_CALLCONV void	Perl_blockhook_register(pTHX_ BHK *hk)
 PERL_CALLCONV void	Perl_boot_core_PerlIO(pTHX);
 PERL_CALLCONV void	Perl_boot_core_UNIVERSAL(pTHX);
 PERL_CALLCONV void	Perl_boot_core_mro(pTHX);
+PERL_CALLCONV int	Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
+			__attribute__nonnull__(pTHX_1)
+			__attribute__nonnull__(pTHX_3);
+#define PERL_ARGS_ASSERT_BYTES_CMP_UTF8	\
+	assert(b); assert(u)
+
 PERL_CALLCONV U8*	Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 			__attribute__nonnull__(pTHX_1)
 			__attribute__nonnull__(pTHX_2);
diff --git a/sv.c b/sv.c
index 27b4bd6fa8..a1ca186b6f 100644
--- a/sv.c
+++ b/sv.c
@@ -7044,28 +7044,15 @@ Perl_sv_eq_flags(pTHX_ register SV *sv1, register SV *sv2, const U32 flags)
 	      }
 	 }
 	 else {
-	      bool is_utf8 = TRUE;
-
 	      if (SvUTF8(sv1)) {
-		   /* sv1 is the UTF-8 one,
-		    * if is equal it must be downgrade-able */
-		   char * const pv = (char*)bytes_from_utf8((const U8*)pv1,
-						     &cur1, &is_utf8);
-		   if (pv != pv1)
-			pv1 = tpv = pv;
+		  /* sv1 is the UTF-8 one  */
+		  return bytes_cmp_utf8((const U8*)pv2, cur2,
+					(const U8*)pv1, cur1) == 0;
 	      }
 	      else {
-		   /* sv2 is the UTF-8 one,
-		    * if is equal it must be downgrade-able */
-		   char * const pv = (char *)bytes_from_utf8((const U8*)pv2,
-						      &cur2, &is_utf8);
-		   if (pv != pv2)
-			pv2 = tpv = pv;
-	      }
-	      if (is_utf8) {
-		   /* Downgrade not possible - cannot be eq */
-		   assert (tpv == 0);
-		   return FALSE;
+		  /* sv2 is the UTF-8 one  */
+		  return bytes_cmp_utf8((const U8*)pv1, cur1,
+					(const U8*)pv2, cur2) == 0;
 	      }
 	 }
     }
@@ -7140,7 +7127,9 @@ Perl_sv_cmp_flags(pTHX_ register SV *const sv1, register SV *const sv2,
 		 pv2 = SvPV_const(svrecode, cur2);
 	    }
 	    else {
-		 pv2 = tpv = (char*)bytes_to_utf8((const U8*)pv2, &cur2);
+		const int retval = -bytes_cmp_utf8((const U8*)pv2, cur2,
+						   (const U8*)pv1, cur1);
+		return retval ? retval < 0 ? -1 : +1 : 0;
 	    }
 	}
 	else {
@@ -7150,7 +7139,9 @@ Perl_sv_cmp_flags(pTHX_ register SV *const sv1, register SV *const sv2,
 		 pv1 = SvPV_const(svrecode, cur1);
 	    }
 	    else {
-		 pv1 = tpv = (char*)bytes_to_utf8((const U8*)pv1, &cur1);
+		const int retval = bytes_cmp_utf8((const U8*)pv1, cur1,
+						  (const U8*)pv2, cur2);
+		return retval ? retval < 0 ? -1 : +1 : 0;
 	    }
 	}
     }
diff --git a/t/porting/diag.t b/t/porting/diag.t
index 2978a5b9a0..073156a317 100644
--- a/t/porting/diag.t
+++ b/t/porting/diag.t
@@ -275,7 +275,12 @@ sub check_file {
 # 
 # PLEASE DO NOT ADD TO THIS LIST.  Instead, write an entry in
 # pod/perldiag.pod for your new (warning|error).
+
+# Also FIXME this test, as the first entry in TODO *is* covered by the
+# description: Malformed UTF-8 character (%s)
 __DATA__
+Malformed UTF-8 character (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)
+
 %s (%d) does not match %s (%d),
 %s (%d) smaller than %s (%d),
 Argument "%s" isn't numeric
diff --git a/utf8.c b/utf8.c
index 818af02904..019d49fbac 100644
--- a/utf8.c
+++ b/utf8.c
@@ -805,6 +805,75 @@ Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 }
 
 /*
+=for apidoc bytes_cmp_utf8
+
+Compares the sequence of characters (stored as octets) in b, blen with the
+sequence of characters (stored as UTF-8) in u, ulen. Returns 0 if they are
+equal, -1 or -2 if the first string is less than the second string, +1 or +2
+if the first string is greater than the second string.
+
+-1 or +1 is returned if the shorter string was identical to the start of the
+longer string. -2 or +2 is returned if the was a difference between characters
+within the strings.
+
+=cut
+*/
+
+int
+Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
+{
+    const U8 *const bend = b + blen;
+    const U8 *const uend = u + ulen;
+
+    PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
+
+    PERL_UNUSED_CONTEXT;
+
+    while (b < bend && u < uend) {
+        U8 c = *u++;
+	if (!UTF8_IS_INVARIANT(c)) {
+	    if (UTF8_IS_DOWNGRADEABLE_START(c)) {
+		if (u < uend) {
+		    U8 c1 = *u++;
+		    if (UTF8_IS_CONTINUATION(c1)) {
+			c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), c1);
+			c = ASCII_TO_NATIVE(c);
+		    } else {
+			Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+					 "Malformed UTF-8 character "
+					 "(unexpected non-continuation byte 0x%02x"
+					 ", immediately after start byte 0x%02x)"
+					 /* Dear diag.t, it's in the pod.  */
+					 "%s%s", c1, c,
+					 PL_op ? " in " : "",
+					 PL_op ? OP_DESC(PL_op) : "");
+			return -2;
+		    }
+		} else {
+		    if (PL_op)
+			Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+					 "%s in %s", unees, OP_DESC(PL_op));
+		    else
+			Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
+		    return -2; /* Really want to return undef :-)  */
+		}
+	    } else {
+		return -2;
+	    }
+	}
+	if (*b != c) {
+	    return *b < c ? -2 : +2;
+	}
+	++b;
+    }
+
+    if (b == bend && u == uend)
+	return 0;
+
+    return b < bend ? +1 : -1;
+}
+
+/*
 =for apidoc utf8_to_bytes
 
 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
author	Nicholas Clark <nick@ccl4.org>	2010-11-11 16:08:43 +0000
committer	Nicholas Clark <nick@ccl4.org>	2010-11-11 16:08:43 +0000
commit	fed3ba5d6b9222e6e73844680734b059e616c86b (patch)
tree	c8a449308b28520170011d015883c39c887fb9e8
parent	08a6f934b8306af074a22b05f6de14f564a9da18 (diff)
download	perl-fed3ba5d6b9222e6e73844680734b059e616c86b.tar.gz