diff options
author | Yves Orton <demerphq@gmail.com> | 2010-08-23 14:36:37 +0200 |
---|---|---|
committer | Yves Orton <demerphq@gmail.com> | 2010-08-23 14:42:20 +0200 |
commit | 92f3d4829170316374b610b3fc665389803d93f8 (patch) | |
tree | 75ec84dc6954d249d557a2b23893c9a87f910edf /regexec.c | |
parent | d67eb5f4852e0f62e6cd359eeceb88a45e6164ea (diff) | |
download | perl-92f3d4829170316374b610b3fc665389803d93f8.tar.gz |
fix rt75680 - when working with utf8 strings one must always use s+=UTF8SKIP(s) to move to the next char
Most of the places in the regex code where we do the two types of increments are wrapped up in macros.
Unfortunately these macros aren't suitable in this case because we use goto to jump
inside the loop under some situations, and since this is a one-off case I figured the
modest C&P associated was better than creating a new macro just for this case.
There is still a possible bug here marked by an XXX, which will need to be fixed
once I find out the correct way to simulate strptr--. Additionally I haven't found
a test case that actually exposes this form of the bug.
Moral of the story, utf8 makes string scanning awkward... And slow...
Diffstat (limited to 'regexec.c')
-rw-r--r-- | regexec.c | 77 |
1 files changed, 56 insertions, 21 deletions
@@ -2018,33 +2018,68 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre end = HOP3c(strend, -dontbother, strbeg) - 1; /* for multiline we only have to try after newlines */ if (prog->check_substr || prog->check_utf8) { - if (s == startpos) - goto after_try; - while (1) { - if (regtry(&reginfo, &s)) - goto got_it; - after_try: - if (s > end) - goto phooey; - if (prog->extflags & RXf_USE_INTUIT) { - s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL); - if (!s) - goto phooey; - } - else - s++; - } - } else { - if (s > startpos) + /* because of the goto we can not easily reuse the macros for bifurcating the + unicode/non-unicode match modes here like we do elsewhere - demerphq */ + if (utf8_target) { + if (s == startpos) + goto after_try_utf8; + while (1) { + if (regtry(&reginfo, &s)) { + goto got_it; + } + after_try_utf8: + if (s > end) { + goto phooey; + } + if (prog->extflags & RXf_USE_INTUIT) { + s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL); + if (!s) { + goto phooey; + } + } + else { + s += UTF8SKIP(s); + } + } + } /* end search for check string in unicode */ + else { + if (s == startpos) { + goto after_try_latin; + } + while (1) { + if (regtry(&reginfo, &s)) { + goto got_it; + } + after_try_latin: + if (s > end) { + goto phooey; + } + if (prog->extflags & RXf_USE_INTUIT) { + s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL); + if (!s) { + goto phooey; + } + } + else { + s++; + } + } + } /* end search for check string in latin*/ + } /* end search for check string */ + else { /* search for newline */ + if (s > startpos) { + /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/ s--; + } + /* We can use a more efficient search as newlines are the same in unicode as they are in latin */ while (s < end) { if (*s++ == '\n') { /* don't need PL_utf8skip here */ if (regtry(&reginfo, &s)) goto got_it; } - } - } - } + } + } /* end search for newline */ + } /* end anchored/multiline check string search 
*/ goto phooey; } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK)) { |