summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- utf8.c 260
1 files changed, 152 insertions, 108 deletions
diff --git a/utf8.c b/utf8.c
index 6c522d033a..8fd5db9938 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2504,22 +2504,33 @@ Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
/*
=for apidoc ibcmp_utf8
-Return true if the strings s1 and s2 differ case-insensitively, false
-if not (if they are equal case-insensitively). If u1 is true, the
-string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
-the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
-are false, the respective string is assumed to be in native 8-bit
-encoding.
-
-If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
-in there (they will point at the beginning of the I<next> character).
-If the pointers behind pe1 or pe2 are non-NULL, they are the end
-pointers beyond which scanning will not continue under any
-circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
-s2+l2 will be used as goal end pointers that will also stop the scan,
-and which qualify towards defining a successful match: all the scans
-that define an explicit length must reach their goal pointers for
-a match to succeed).
+Returns true if the strings s1 and s2 differ case-insensitively, false
+if they are equal case-insensitively. Note that this is the complement of what
+you might expect (perhaps it would have been better to name it C<ibncmp_utf8>).
+
+If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode;
+otherwise it is assumed to be in native 8-bit encoding. Correspondingly for u2
+with respect to s2.
+
+If the byte length l1 is non-zero, s1+l1 will be used as a goal to reach. The
+scan will not be considered to be a match unless the goal is reached, and
+scanning won't continue past that goal. Correspondingly for l2 with respect to
+s2.
+
+If pe1 is non-NULL and the pointer it points to is not NULL, that pointer is
+considered an end pointer beyond which scanning of s1 will not continue under
+any circumstances. This means that if both l1 and pe1 are specified, and *pe1
+is less than s1+l1, the match will never be successful because it can never
+get as far as its goal. Correspondingly for pe2 with respect to s2.
+
+At least one of s1 and s2 must have a goal, and if both do, both have to be
+reached for a successful match. Also, if the fold of a character is multiple
+characters, all of them must be matched (see tr21 reference below for
+'folding').
+
+Upon a successful match (when the routine returns false), if pe1 is non-NULL,
+*pe1 will be set to point to the beginning of the I<next> character of s1 beyond
+what was matched. Correspondingly for pe2 and s2.
For case-insensitiveness, the "casefolding" of Unicode is used
instead of upper/lowercasing both the characters, see
@@ -2529,98 +2540,131 @@ http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
I32
Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
{
- dVAR;
- register const U8 *p1 = (const U8*)s1;
- register const U8 *p2 = (const U8*)s2;
- register const U8 *f1 = NULL;
- register const U8 *f2 = NULL;
- register U8 *e1 = NULL;
- register U8 *q1 = NULL;
- register U8 *e2 = NULL;
- register U8 *q2 = NULL;
- STRLEN n1 = 0, n2 = 0;
- U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
- U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
- U8 natbuf[1+1];
- STRLEN foldlen1, foldlen2;
- bool match;
-
- PERL_ARGS_ASSERT_IBCMP_UTF8;
-
- if (pe1)
- e1 = *(U8**)pe1;
- /* assert(e1 || l1); */
- if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
- f1 = (const U8*)s1 + l1;
- if (pe2)
- e2 = *(U8**)pe2;
- /* assert(e2 || l2); */
- if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
- f2 = (const U8*)s2 + l2;
-
- /* This shouldn't happen. However, putting an assert() there makes some
- * tests fail. */
- /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
- if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
- return 1; /* mismatch; possible infinite loop or false positive */
-
- if (!u1 || !u2)
- natbuf[1] = 0; /* Need to terminate the buffer. */
-
- while ((e1 == 0 || p1 < e1) &&
- (f1 == 0 || p1 < f1) &&
- (e2 == 0 || p2 < e2) &&
- (f2 == 0 || p2 < f2)) {
- if (n1 == 0) {
- if (u1)
- to_utf8_fold(p1, foldbuf1, &foldlen1);
- else {
- uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
- to_utf8_fold(natbuf, foldbuf1, &foldlen1);
- }
- q1 = foldbuf1;
- n1 = foldlen1;
- }
- if (n2 == 0) {
- if (u2)
- to_utf8_fold(p2, foldbuf2, &foldlen2);
- else {
- uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
- to_utf8_fold(natbuf, foldbuf2, &foldlen2);
- }
- q2 = foldbuf2;
- n2 = foldlen2;
- }
- while (n1 && n2) {
- if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
- (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
- memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
- return 1; /* mismatch */
- n1 -= UTF8SKIP(q1);
- q1 += UTF8SKIP(q1);
- n2 -= UTF8SKIP(q2);
- q2 += UTF8SKIP(q2);
- }
- if (n1 == 0)
- p1 += u1 ? UTF8SKIP(p1) : 1;
- if (n2 == 0)
- p2 += u2 ? UTF8SKIP(p2) : 1;
-
- }
-
- /* A match is defined by all the scans that specified
- * an explicit length reaching their final goals. */
- match = (n1 == 0 && n2 == 0 /* Must not match partial char; Bug #72998 */
- && (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2));
-
- if (match) {
- if (pe1)
- *pe1 = (char*)p1;
- if (pe2)
- *pe2 = (char*)p2;
- }
-
- return match ? 0 : 1; /* 0 match, 1 mismatch */
+ dVAR;
+ register const U8 *p1 = (const U8*)s1; /* Point to current char */
+ register const U8 *p2 = (const U8*)s2;
+ register const U8 *g1 = NULL; /* goal for s1 */
+ register const U8 *g2 = NULL;
+ register const U8 *e1 = NULL; /* Don't scan s1 past this */
+ register U8 *f1 = NULL; /* Point to current folded */
+ register const U8 *e2 = NULL;
+ register U8 *f2 = NULL;
+ STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
+ U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
+ U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
+ U8 natbuf[2]; /* Holds native 8-bit char converted to utf8;
+ these always fit in 2 bytes */
+
+ PERL_ARGS_ASSERT_IBCMP_UTF8;
+
+ if (pe1) {
+ e1 = *(U8**)pe1;
+ }
+
+ if (l1) {
+ g1 = (const U8*)s1 + l1;
+ }
+
+ if (pe2) {
+ e2 = *(U8**)pe2;
+ }
+
+ if (l2) {
+ g2 = (const U8*)s2 + l2;
+ }
+
+ /* Must have at least one goal */
+ assert(g1 || g2);
+
+ if (g1) {
+
+ /* Will never match if goal is out-of-bounds */
+ assert(! e1 || e1 >= g1);
+
+ /* Here, there isn't an end pointer, or it is beyond the goal. We
+ * only go as far as the goal */
+ e1 = g1;
+ }
+ else assert(e1); /* Must have an end for looking at s1 */
+
+ /* Same for goal for s2 */
+ if (g2) {
+ assert(! e2 || e2 >= g2);
+ e2 = g2;
+ }
+ else assert(e2);
+
+ /* Look through both strings, a character at a time */
+ while (p1 < e1 && p2 < e2) {
+
+ /* If at the beginning of a new character in s1, get its fold to use */
+ if (n1 == 0) {
+ if (u1) {
+ to_utf8_fold(p1, foldbuf1, &n1);
+ }
+ else { /* Not utf8, convert to it first and then get fold */
+ uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
+ to_utf8_fold(natbuf, foldbuf1, &n1);
+ }
+ f1 = foldbuf1;
+ }
+
+ if (n2 == 0) { /* Same for s2 */
+ if (u2) {
+ to_utf8_fold(p2, foldbuf2, &n2);
+ }
+ else {
+ uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
+ to_utf8_fold(natbuf, foldbuf2, &n2);
+ }
+ f2 = foldbuf2;
+ }
+
+ /* While there is more to look for in both folds, see if they
+ * continue to match */
+ while (n1 && n2) {
+ U8 fold_length = UTF8SKIP(f1);
+ if (fold_length != UTF8SKIP(f2)
+ || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
+ function call for single
+ character */
+ || memNE((char*)f1, (char*)f2, fold_length))
+ {
+ return 1; /* mismatch */
+ }
+
+ /* Here, they matched, advance past them */
+ n1 -= fold_length;
+ f1 += fold_length;
+ n2 -= fold_length;
+ f2 += fold_length;
+ }
+
+ /* When reach the end of any fold, advance the input past it */
+ if (n1 == 0) {
+ p1 += u1 ? UTF8SKIP(p1) : 1;
+ }
+ if (n2 == 0) {
+ p2 += u2 ? UTF8SKIP(p2) : 1;
+ }
+ } /* End of loop through both strings */
+
+ /* A match is defined by each scan that specified an explicit length
+ * reaching its final goal, and the other not having matched a partial
+ * character (which can happen when the fold of a character is more than one
+ * character). */
+ if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
+ return 1;
+ }
+
+ /* Successful match. Set output pointers */
+ if (pe1) {
+ *pe1 = (char*)p1;
+ }
+ if (pe2) {
+ *pe2 = (char*)p2;
+ }
+ return 0;
}
/*