summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-10-16 10:56:28 -0600
committerKarl Williamson <public@khwilliamson.com>2012-10-16 21:48:37 -0600
commit613a425dda0dc9e3f838151a5d796c902cfd922e (patch)
treee1bfc7e5bdad859a2c63c3316496ac3e8738431b
parent79a2a0e89816b80870df1f9b9e7bb5fb1edcd556 (diff)
downloadperl-613a425dda0dc9e3f838151a5d796c902cfd922e.tar.gz
regexec.c: Tighten loops in regrepeat()
regrepeat() is used to match some simple thing repeatedly in a row. In the case of EXACTFish nodes, it will repeat a single character (and its fold). Prior to this commit, it was using the full generality of foldEQ_utf8() whenever the target was encoded in UTF-8. This full generality requires quite a bit of processing. However, most Unicode folds are of the simple variety containing just a character and its upper- or lower-cased equivalent, and so the full generality of foldEQ_utf8() is needed only comparatively infrequently. This commit takes advantage of the newly added and enhanced S_setup_EXACTISH_ST_c1_c2() to look at the character being repeated and decide what level of generality is needed. regrepeat() then uses a loop that is only as complicated as needed. This also adds some asserts that the nodes contain exactly 1 character.
-rw-r--r--regexec.c78
1 files changed, 42 insertions, 36 deletions
diff --git a/regexec.c b/regexec.c
index 73160b76c3..e631f834da 100644
--- a/regexec.c
+++ b/regexec.c
@@ -6579,6 +6579,8 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
scan = loceol;
break;
case EXACT:
+ assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
+
c = (U8)*STRING(p);
/* Can use a simple loop if the pattern char to match on is invariant
@@ -6656,13 +6658,14 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
case EXACTFU:
utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
- do_exactf:
- c = (U8)*STRING(p);
+ do_exactf: {
+ int c1, c2;
+ U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
- if (utf8_target
- || OP(p) == EXACTFU_SS
- || (UTF_PATTERN && ! UTF8_IS_INVARIANT(c)))
- {
+ assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
+
+ if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
+ if (c1 == CHRTEST_VOID) {
/* Use full Unicode fold matching */
char *tmpeol = loceol;
STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
@@ -6674,38 +6677,41 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
tmpeol = loceol;
hardcount++;
}
-
- /* XXX Note that the above handles properly the German sharp s in
- * the pattern matching ss in the string. But it doesn't handle
- * properly cases where the string contains say 'LIGATURE ff' and
- * the pattern is 'f+'. This would require, say, a new function or
- * revised interface to foldEQ_utf8(), in which the maximum number
- * of characters to match could be passed and it would return how
- * many actually did. This is just one of many cases where
- * multi-char folds don't work properly, and so the fix is being
- * deferred */
- }
- else {
- U8 folded;
-
- /* Here, the string isn't utf8; and either the pattern isn't utf8
- * or c is an invariant, so its utf8ness doesn't affect c. Can
- * just do simple comparisons for exact or fold matching. */
- switch (OP(p)) {
- case EXACTF: folded = PL_fold[c]; break;
- case EXACTFA:
- case EXACTFU_TRICKYFOLD:
- case EXACTFU: folded = PL_fold_latin1[c]; break;
- case EXACTFL: folded = PL_fold_locale[c]; break;
- default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
- }
- while (scan < loceol &&
- (UCHARAT(scan) == c || UCHARAT(scan) == folded))
- {
- scan++;
- }
+ }
+ else if (utf8_target) {
+ if (c1 == c2) {
+ while (hardcount < max
+ && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ }
+ else {
+ while (hardcount < max
+ && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
+ || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ }
+ }
+ else if (c1 == c2) {
+ while (scan < loceol && UCHARAT(scan) == c1) {
+ scan++;
+ }
+ }
+ else {
+ while (scan < loceol &&
+ (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
+ {
+ scan++;
+ }
+ }
}
break;
+ }
case ANYOF:
if (utf8_target) {
STRLEN inclasslen;