diff options
author | Nicholas Clark <nick@ccl4.org> | 2010-11-11 16:08:43 +0000 |
---|---|---|
committer | Nicholas Clark <nick@ccl4.org> | 2010-11-11 16:08:43 +0000 |
commit | fed3ba5d6b9222e6e73844680734b059e616c86b (patch) | |
tree | c8a449308b28520170011d015883c39c887fb9e8 /utf8.c | |
parent | 08a6f934b8306af074a22b05f6de14f564a9da18 (diff) | |
download | perl-fed3ba5d6b9222e6e73844680734b059e616c86b.tar.gz |
Add Perl_bytes_cmp_utf8() to compare character sequences in different encodings
Convert sv_eq_flags() and sv_cmp_flags() to use it.
Previously, to compare two strings of characters, where one was in UTF-8, and
one was not, you had to either:
1: Upgrade the second to UTF-8
2: Compare the resulting octet sequence
3: Free the temporary UTF-8 string
or:
1: Attempt to downgrade the first to bytes. If it can't be, they aren't equal
2: Else compare the resulting octet sequence
3: Free the temporary byte string
Which for the general case involves a malloc()/free() and at least two O(n)
scans per comparison.
Whereas this approach needs no allocation and only a single O(n) scan, which
terminates as early as the best case for the second approach.
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 69 |
1 files changed, 69 insertions, 0 deletions
@@ -805,6 +805,75 @@ Perl_utf8_hop(pTHX_ const U8 *s, I32 off) } /* +=for apidoc bytes_cmp_utf8 + +Compares the sequence of characters (stored as octets) in b, blen with the +sequence of characters (stored as UTF-8) in u, ulen. Returns 0 if they are +equal, -1 or -2 if the first string is less than the second string, +1 or +2 +if the first string is greater than the second string. + +-1 or +1 is returned if the shorter string was identical to the start of the +longer string. -2 or +2 is returned if there was a difference between characters +within the strings. + +=cut +*/ + +int +Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen) +{ + const U8 *const bend = b + blen; + const U8 *const uend = u + ulen; + + PERL_ARGS_ASSERT_BYTES_CMP_UTF8; + + PERL_UNUSED_CONTEXT; + + while (b < bend && u < uend) { + U8 c = *u++; + if (!UTF8_IS_INVARIANT(c)) { + if (UTF8_IS_DOWNGRADEABLE_START(c)) { + if (u < uend) { + U8 c1 = *u++; + if (UTF8_IS_CONTINUATION(c1)) { + c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), c1); + c = ASCII_TO_NATIVE(c); + } else { + Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), + "Malformed UTF-8 character " + "(unexpected non-continuation byte 0x%02x" + ", immediately after start byte 0x%02x)" + /* Dear diag.t, it's in the pod. */ + "%s%s", c1, c, + PL_op ? " in " : "", + PL_op ? OP_DESC(PL_op) : ""); + return -2; + } + } else { + if (PL_op) + Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), + "%s in %s", unees, OP_DESC(PL_op)); + else + Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees); + return -2; /* Really want to return undef :-) */ + } + } else { + return -2; + } + } + if (*b != c) { + return *b < c ? -2 : +2; + } + ++b; + } + + if (b == bend && u == uend) + return 0; + + return b < bend ? +1 : -1; +} + +/* =for apidoc utf8_to_bytes Converts a string C<s> of length C<len> from UTF-8 into native byte encoding. |