summary | refs | log | tree | commit | diff
path: root/utf8.c
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2011-10-16 12:38:58 -0600
committer: Karl Williamson <public@khwilliamson.com> 2011-10-17 21:52:16 -0600
commit: 18f762c3fc05b01f683e8eefc830106deeb0df8f (patch)
tree: 2e2822fb5fd51da51523bb54ba4b53f250ae52c1 /utf8.c
parent: 85514a3470928b4c673195c2db9e5f567341b656 (diff)
download: perl-18f762c3fc05b01f683e8eefc830106deeb0df8f.tar.gz
utf8.c: Add 'input pre-folded' flags to foldEQ_utf8_flags
This adds flags so that if one of the input strings is known to already have been folded, this routine can skip the (redundant) folding step.
Diffstat (limited to 'utf8.c')
-rw-r--r-- utf8.c 24
1 files changed, 24 insertions, 0 deletions
diff --git a/utf8.c b/utf8.c
index c8bdc7a308..700104140b 100644
--- a/utf8.c
+++ b/utf8.c
@@ -3354,6 +3354,9 @@ http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
* points below 256; unicode rules for above 255; and
* folds that cross those boundaries are disallowed,
* like the NOMIX_ASCII option
+ * FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
+ * routine. This allows that step to be skipped.
+ * FOLDEQ_S2_ALREADY_FOLDED Similarly.
*/
I32
Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2, U32 flags)
@@ -3375,6 +3378,11 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1
PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
+ /* The algorithm requires that input with the flags on the first line of
+ * the assert not be pre-folded. */
+ assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_UTF8_LOCALE))
+ && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED))));
+
if (pe1) {
e1 = *(U8**)pe1;
}
@@ -3416,6 +3424,10 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1
assert(e2);
}
+ /* If both operands are already folded, we could just do a memEQ on the
+ * whole strings at once, but it would be better if the caller realized
+ * this and didn't even call us */
+
/* Look through both strings, a character at a time */
while (p1 < e1 && p2 < e2) {
@@ -3423,10 +3435,15 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1
* and the length of the fold. (exception: locale rules just get the
* character to a single byte) */
if (n1 == 0) {
+ if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
+ f1 = (U8 *) p1;
+ n1 = UTF8SKIP(f1);
/* If in locale matching, we use two sets of rules, depending on if
* the code point is above or below 255. Here, we test for and
* handle locale rules */
+ }
+ else {
if ((flags & FOLDEQ_UTF8_LOCALE)
&& (! u1 || UTF8_IS_INVARIANT(*p1) || UTF8_IS_DOWNGRADEABLE_START(*p1)))
{
@@ -3466,9 +3483,15 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1
to_utf8_fold(natbuf, foldbuf1, &n1);
}
f1 = foldbuf1;
+ }
}
if (n2 == 0) { /* Same for s2 */
+ if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
+ f2 = (U8 *) p2;
+ n2 = UTF8SKIP(f2);
+ }
+ else {
if ((flags & FOLDEQ_UTF8_LOCALE)
&& (! u2 || UTF8_IS_INVARIANT(*p2) || UTF8_IS_DOWNGRADEABLE_START(*p2)))
{
@@ -3508,6 +3531,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1
to_utf8_fold(natbuf, foldbuf2, &n2);
}
f2 = foldbuf2;
+ }
}
/* Here f1 and f2 point to the beginning of the strings to compare.