summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jarkko Hietaniemi <jhi@iki.fi> 2002-01-12 18:06:49 +0000
committer Jarkko Hietaniemi <jhi@iki.fi> 2002-01-12 18:06:49 +0000
commit 61a36c016d02518f6bc19ca7f7d1b5a51cc47d49 (patch)
tree 9a6389564428b888e502d8aea0ff37f234ce2a5e
parent 141d3726fc72209aa58460ec315973400348d8f5 (diff)
download perl-61a36c016d02518f6bc19ca7f7d1b5a51cc47d49.tar.gz
Special treatment for U+03B0 and U+0390 in //i;
this means that we can remove the minlen pessimisations introduced by #14096.
p4raw-id: //depot/perl@14219
-rw-r--r-- pp_hot.c 4
-rw-r--r-- regcomp.c 44
-rw-r--r-- regexec.c 4
3 files changed, 46 insertions, 6 deletions
diff --git a/pp_hot.c b/pp_hot.c
index df52bb19f9..b759153270 100644
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -1235,9 +1235,7 @@ PP(pp_match)
pm = PL_curpm;
rx = PM_GETRE(pm);
}
- if (rx->minlen > len &&
- !PL_reg_match_utf8 /* ANYOFs can balloon to EXACTFs */
- )
+ if (rx->minlen > len)
goto failure;
truebase = t = s;
diff --git a/regcomp.c b/regcomp.c
index 65b0a5f6e3..bf1b42f06e 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -736,6 +736,50 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
n = nnext;
}
}
+
+ if (UTF && OP(scan) == EXACTF) {
+/*
+ Two problematic code points in Unicode casefolding of EXACT nodes:
+
+ U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+ U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+
+ which casefold to
+
+ Unicode UTF-8
+
+ U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
+ U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
+
+ This means that in case-insensitive matching (or "loose matching",
+ as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
+ length of the above casefolded versions) can match a target string
+ of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
+ This would rather mess up the minimum length computation.
+
+ What we'll do is to look for the tail four bytes, and then peek
+ at the preceding two bytes to see whether we need to decrease
+ the minimum length by four (six minus two).
+
+ Thanks to the design of UTF-8, there cannot be false matches:
+ A sequence of valid UTF-8 bytes cannot be a subsequence of
+ another valid sequence of UTF-8 bytes.
+
+*/
+ char *s0 = STRING(scan), *s, *t;
+ char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
+ char *t0 = "\xcc\x88\xcc\x81";
+ char *t1 = t0 + 3;
+
+ for (s = s0 + 2;
+ s < s2 && (t = ninstr(s, s1, t0, t1));
+ s = t + 4) {
+ if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
+ ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
+ min -= 4;
+ }
+ }
+
#ifdef DEBUGGING
/* Allow dumping */
n = scan + NODE_SZ_STR(scan);
diff --git a/regexec.c b/regexec.c
index 0ceff78a7f..78c4e24fc1 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1554,9 +1554,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
}
minlen = prog->minlen;
- if (strend - startpos < minlen &&
- !PL_reg_match_utf8 /* ANYOFs can balloon to EXACTFs */
- ) {
+ if (strend - startpos < minlen) {
DEBUG_r(PerlIO_printf(Perl_debug_log,
"String too short [regexec_flags]...\n"));
goto phooey;