diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-10-16 10:56:28 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-10-16 21:48:37 -0600 |
commit | 613a425dda0dc9e3f838151a5d796c902cfd922e (patch) | |
tree | e1bfc7e5bdad859a2c63c3316496ac3e8738431b | |
parent | 79a2a0e89816b80870df1f9b9e7bb5fb1edcd556 (diff) | |
download | perl-613a425dda0dc9e3f838151a5d796c902cfd922e.tar.gz |
regexec.c: Tighten loops in regrepeat()
regrepeat() is used to match some simple thing repeatedly in a row. In
the case of EXACTFish nodes, it will repeat a single character (and its
fold). Prior to this commit, it was using the full generality of
foldEQ_utf8() whenever the target was encoded in UTF-8. This full
generality requires quite a bit of processing. However, most
Unicode folds are of the simple variety containing just a character and
its upper- or lower-cased equivalent, and so the full generality of
foldEQ_utf8() is needed only comparatively infrequently.
This commit takes advantage of the newly added and enhanced
S_setup_EXACTISH_ST_c1_c2() to look at the character being repeated and
decide what level of generality is needed. regrepeat() then uses a loop
that is only as complicated as needed.
This also adds some asserts that the nodes contain exactly 1 character.
-rw-r--r-- | regexec.c | 78 |
1 files changed, 42 insertions, 36 deletions
@@ -6579,6 +6579,8 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma scan = loceol; break; case EXACT: + assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1); + c = (U8)*STRING(p); /* Can use a simple loop if the pattern char to match on is invariant @@ -6656,13 +6658,14 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case EXACTFU: utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0; - do_exactf: - c = (U8)*STRING(p); + do_exactf: { + int c1, c2; + U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1]; - if (utf8_target - || OP(p) == EXACTFU_SS - || (UTF_PATTERN && ! UTF8_IS_INVARIANT(c))) - { + assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1); + + if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) { + if (c1 == CHRTEST_VOID) { /* Use full Unicode fold matching */ char *tmpeol = loceol; STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1; @@ -6674,38 +6677,41 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma tmpeol = loceol; hardcount++; } - - /* XXX Note that the above handles properly the German sharp s in - * the pattern matching ss in the string. But it doesn't handle - * properly cases where the string contains say 'LIGATURE ff' and - * the pattern is 'f+'. This would require, say, a new function or - * revised interface to foldEQ_utf8(), in which the maximum number - * of characters to match could be passed and it would return how - * many actually did. This is just one of many cases where - * multi-char folds don't work properly, and so the fix is being - * deferred */ - } - else { - U8 folded; - - /* Here, the string isn't utf8; and either the pattern isn't utf8 - * or c is an invariant, so its utf8ness doesn't affect c. Can - * just do simple comparisons for exact or fold matching. 
*/ - switch (OP(p)) { - case EXACTF: folded = PL_fold[c]; break; - case EXACTFA: - case EXACTFU_TRICKYFOLD: - case EXACTFU: folded = PL_fold_latin1[c]; break; - case EXACTFL: folded = PL_fold_locale[c]; break; - default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p)); - } - while (scan < loceol && - (UCHARAT(scan) == c || UCHARAT(scan) == folded)) - { - scan++; - } + } + else if (utf8_target) { + if (c1 == c2) { + while (hardcount < max + && memEQ(scan, c1_utf8, UTF8SKIP(scan))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + else { + while (hardcount < max + && (memEQ(scan, c1_utf8, UTF8SKIP(scan)) + || memEQ(scan, c2_utf8, UTF8SKIP(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + } + else if (c1 == c2) { + while (scan < loceol && UCHARAT(scan) == c1) { + scan++; + } + } + else { + while (scan < loceol && + (UCHARAT(scan) == c1 || UCHARAT(scan) == c2)) + { + scan++; + } + } } break; + } case ANYOF: if (utf8_target) { STRLEN inclasslen; |