1 files changed, 152 insertions, 108 deletions
diff --git a/utf8.c b/utf8.c
index 6c522d033a..8fd5db9938 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2504,22 +2504,33 @@ Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
 /*
 =for apidoc ibcmp_utf8
 
-Return true if the strings s1 and s2 differ case-insensitively, false
-if not (if they are equal case-insensitively).  If u1 is true, the
-string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
-the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
-are false, the respective string is assumed to be in native 8-bit
-encoding.
-
-If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
-in there (they will point at the beginning of the I<next> character).
-If the pointers behind pe1 or pe2 are non-NULL, they are the end
-pointers beyond which scanning will not continue under any
-circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
-s2+l2 will be used as goal end pointers that will also stop the scan,
-and which qualify towards defining a successful match: all the scans
-that define an explicit length must reach their goal pointers for
-a match to succeed).
+Returns true if the strings s1 and s2 differ case-insensitively, false
+if they are equal case-insensitively.  Note that this is the complement of what
+you might expect (perhaps it would have been better to name it C<ibncmp_utf8>).
+
+If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode;
+otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for u2
+with respect to s2.
+
+If the byte length l1 is non-zero, s1+l1 will be used as a goal to reach.  The
+scan will not be considered to be a match unless the goal is reached, and
+scanning won't continue past that goal.  Correspondingly for l2 with respect to
+s2.
+
+If pe1 is non-NULL and the pointer it points to is not NULL, that pointer is
+considered an end pointer beyond which scanning of s1 will not continue under
+any circumstances.  This means that if both l1 and pe1 are specified, and pe1
+is less than s1+l1, the match will never be successful because it can never
+get as far as its goal.  Correspondingly for pe2 with respect to s2.
+
+At least one of s1 and s2 must have a goal, and if both do, both have to be
+reached for a successful match.   Also, if the fold of a character is multiple
+characters, all of them must be matched (see tr21 reference below for
+'folding').
+
+Upon a successful match (when the routine returns false), if pe1 is non-NULL,
+it will be set to point to the beginning of the I<next> character of s1 beyond
+what was matched.  Correspondingly for pe2 and s2.
 
 For case-insensitiveness, the "casefolding" of Unicode is used
 instead of upper/lowercasing both the characters, see
@@ -2529,98 +2540,131 @@ http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
 I32
 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
 {
-     dVAR;
-     register const U8 *p1  = (const U8*)s1;
-     register const U8 *p2  = (const U8*)s2;
-     register const U8 *f1 = NULL;
-     register const U8 *f2 = NULL;
-     register U8 *e1 = NULL;
-     register U8 *q1 = NULL;
-     register U8 *e2 = NULL;
-     register U8 *q2 = NULL;
-     STRLEN n1 = 0, n2 = 0;
-     U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
-     U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
-     U8 natbuf[1+1];
-     STRLEN foldlen1, foldlen2;
-     bool match;
-
-     PERL_ARGS_ASSERT_IBCMP_UTF8;
-     
-     if (pe1)
-	  e1 = *(U8**)pe1;
-     /* assert(e1 || l1); */
-     if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
-	  f1 = (const U8*)s1 + l1;
-     if (pe2)
-	  e2 = *(U8**)pe2;
-     /* assert(e2 || l2); */
-     if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
-	  f2 = (const U8*)s2 + l2;
-
-     /* This shouldn't happen. However, putting an assert() there makes some
-      * tests fail. */
-     /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
-     if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
-	  return 1; /* mismatch; possible infinite loop or false positive */
-
-     if (!u1 || !u2)
-	  natbuf[1] = 0; /* Need to terminate the buffer. */
-
-     while ((e1 == 0 || p1 < e1) &&
-	    (f1 == 0 || p1 < f1) &&
-	    (e2 == 0 || p2 < e2) &&
-	    (f2 == 0 || p2 < f2)) {
-	  if (n1 == 0) {
-	       if (u1)
-		    to_utf8_fold(p1, foldbuf1, &foldlen1);
-	       else {
-		    uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
-		    to_utf8_fold(natbuf, foldbuf1, &foldlen1);
-	       }
-	       q1 = foldbuf1;
-	       n1 = foldlen1;
-	  }
-	  if (n2 == 0) {
-	       if (u2)
-		    to_utf8_fold(p2, foldbuf2, &foldlen2);
-	       else {
-		    uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
-		    to_utf8_fold(natbuf, foldbuf2, &foldlen2);
-	       }
-	       q2 = foldbuf2;
-	       n2 = foldlen2;
-	  }
-	  while (n1 && n2) {
-	       if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
-		   (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
-		    memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
-		   return 1; /* mismatch */
-	       n1 -= UTF8SKIP(q1);
-	       q1 += UTF8SKIP(q1);
-	       n2 -= UTF8SKIP(q2);
-	       q2 += UTF8SKIP(q2);
-	  }
-	  if (n1 == 0)
-	       p1 += u1 ? UTF8SKIP(p1) : 1;
-	  if (n2 == 0)
-	       p2 += u2 ? UTF8SKIP(p2) : 1;
-
-     }
-
-     /* A match is defined by all the scans that specified
-      * an explicit length reaching their final goals. */
-     match = (n1 == 0 && n2 == 0    /* Must not match partial char; Bug #72998 */
-	     && (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2));
-
-     if (match) {
-	  if (pe1)
-	       *pe1 = (char*)p1;
-	  if (pe2)
-	       *pe2 = (char*)p2;
-     }
-
-     return match ? 0 : 1; /* 0 match, 1 mismatch */
+    dVAR;
+    register const U8 *p1  = (const U8*)s1; /* Point to current char */
+    register const U8 *p2  = (const U8*)s2;
+    register const U8 *g1 = NULL;	/* goal for s1 */
+    register const U8 *g2 = NULL;
+    register const U8 *e1 = NULL;	/* Don't scan s1 past this */
+    register U8 *f1 = NULL;		/* Point to current folded */
+    register const U8 *e2 = NULL;
+    register U8 *f2 = NULL;
+    STRLEN n1 = 0, n2 = 0;		/* Number of bytes in current char */
+    U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
+    U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
+    U8 natbuf[2];		/* Holds native 8-bit char converted to utf8;
+				   these always fit in 2 bytes */
+
+    PERL_ARGS_ASSERT_IBCMP_UTF8;
+
+    if (pe1) {
+	e1 = *(U8**)pe1;
+    }
+
+    if (l1) {
+	g1 = (const U8*)s1 + l1;
+    }
+
+    if (pe2) {
+	e2 = *(U8**)pe2;
+    }
+
+    if (l2) {
+	g2 = (const U8*)s2 + l2;
+    }
+
+    /* Must have at least one goal */
+    assert(g1 || g2);
+
+    if (g1) {
+
+	/* Will never match if goal is out-of-bounds */
+	assert(! e1  || e1 >= g1);
+
+	/* Here, there isn't an end pointer, or it is beyond the goal.  We
+	* only go as far as the goal */
+	e1 = g1;
+    }
+    else assert(e1);	/* Must have an end for looking at s1 */
+
+    /* Same for goal for s2 */
+    if (g2) {
+	assert(! e2  || e2 >= g2);
+	e2 = g2;
+    }
+    else assert(e2);
+
+    /* Look through both strings, a character at a time */
+    while (p1 < e1 && p2 < e2) {
+
+	/* If at the beginning of a new character in s1, get its fold to use */
+	if (n1 == 0) {
+	    if (u1) {
+		to_utf8_fold(p1, foldbuf1, &n1);
+	    }
+	    else {  /* Not utf8, convert to it first and then get fold */
+		uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
+		to_utf8_fold(natbuf, foldbuf1, &n1);
+	    }
+	    f1 = foldbuf1;
+	}
+
+	if (n2 == 0) {    /* Same for s2 */
+	    if (u2) {
+		to_utf8_fold(p2, foldbuf2, &n2);
+	    }
+	    else {
+		uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
+		to_utf8_fold(natbuf, foldbuf2, &n2);
+	    }
+	    f2 = foldbuf2;
+	}
+
+	/* While there is more to look for in both folds, see if they
+	* continue to match */
+	while (n1 && n2) {
+	    U8 fold_length = UTF8SKIP(f1);
+	    if (fold_length != UTF8SKIP(f2)
+		|| (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
+						       function call for single
+						       character */
+		|| memNE((char*)f1, (char*)f2, fold_length))
+	    {
+		return 1; /* mismatch */
+	    }
+
+	    /* Here, they matched, advance past them */
+	    n1 -= fold_length;
+	    f1 += fold_length;
+	    n2 -= fold_length;
+	    f2 += fold_length;
+	}
+
+	/* When reach the end of any fold, advance the input past it */
+	if (n1 == 0) {
+	    p1 += u1 ? UTF8SKIP(p1) : 1;
+	}
+	if (n2 == 0) {
+	    p2 += u2 ? UTF8SKIP(p2) : 1;
+	}
+    } /* End of loop through both strings */
+
+    /* A match is defined by each scan that specified an explicit length
+    * reaching its final goal, and the other not having matched a partial
+    * character (which can happen when the fold of a character is more than one
+    * character). */
+    if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
+	return 1;
+    }
+
+    /* Successful match.  Set output pointers */
+    if (pe1) {
+	*pe1 = (char*)p1;
+    }
+    if (pe2) {
+	*pe2 = (char*)p2;
+    }
+    return 0;
 }
 
 /*