summaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
authorNicholas Clark <nick@ccl4.org>2010-11-11 16:08:43 +0000
committerNicholas Clark <nick@ccl4.org>2010-11-11 16:08:43 +0000
commitfed3ba5d6b9222e6e73844680734b059e616c86b (patch)
treec8a449308b28520170011d015883c39c887fb9e8 /utf8.c
parent08a6f934b8306af074a22b05f6de14f564a9da18 (diff)
downloadperl-fed3ba5d6b9222e6e73844680734b059e616c86b.tar.gz
Add Perl_bytes_cmp_utf8() to compare character sequences in different encodings
Convert sv_eq_flags() and sv_cmp_flags() to use it. Previously, to compare two strings of characters, where was was in UTF-8, and one was not, you had to either: 1: Upgrade the second to UTF-8 2: Compare the resulting octet sequence 3: Free the temporary UTF-8 string or: 1: Attempt to downgrade the first to bytes. If it can't be, they aren't equal 2: Else compare the resulting octet sequence 3: Free the temporary byte string Which for the general case involves a malloc()/free() and at least two O(n) scans per comparison. Whereas this approach has no allocation, a single O(n) scan, which terminates as early as the best case for the second approach.
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c69
1 files changed, 69 insertions, 0 deletions
diff --git a/utf8.c b/utf8.c
index 818af02904..019d49fbac 100644
--- a/utf8.c
+++ b/utf8.c
@@ -805,6 +805,75 @@ Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
}
/*
+=for apidoc bytes_cmp_utf8
+
+Compares the sequence of characters (stored as octets) in b, blen with the
+sequence of characters (stored as UTF-8) in u, ulen. Returns 0 if they are
+equal, -1 or -2 if the first string is less than the second string, +1 or +2
+if the first string is greater than the second string.
+
+-1 or +1 is returned if the shorter string was identical to the start of the
+longer string. -2 or +2 is returned if the was a difference between characters
+within the strings.
+
+=cut
+*/
+
+int
+Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
+{
+ const U8 *const bend = b + blen;
+ const U8 *const uend = u + ulen;
+
+ PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
+
+ PERL_UNUSED_CONTEXT;
+
+ while (b < bend && u < uend) {
+ U8 c = *u++;
+ if (!UTF8_IS_INVARIANT(c)) {
+ if (UTF8_IS_DOWNGRADEABLE_START(c)) {
+ if (u < uend) {
+ U8 c1 = *u++;
+ if (UTF8_IS_CONTINUATION(c1)) {
+ c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), c1);
+ c = ASCII_TO_NATIVE(c);
+ } else {
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+ "Malformed UTF-8 character "
+ "(unexpected non-continuation byte 0x%02x"
+ ", immediately after start byte 0x%02x)"
+ /* Dear diag.t, it's in the pod. */
+ "%s%s", c1, c,
+ PL_op ? " in " : "",
+ PL_op ? OP_DESC(PL_op) : "");
+ return -2;
+ }
+ } else {
+ if (PL_op)
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+ "%s in %s", unees, OP_DESC(PL_op));
+ else
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
+ return -2; /* Really want to return undef :-) */
+ }
+ } else {
+ return -2;
+ }
+ }
+ if (*b != c) {
+ return *b < c ? -2 : +2;
+ }
+ ++b;
+ }
+
+ if (b == bend && u == uend)
+ return 0;
+
+ return b < bend ? +1 : -1;
+}
+
+/*
=for apidoc utf8_to_bytes
Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.