diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-10-11 14:56:27 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-10-11 20:37:50 -0600 |
commit | 5e4a1da18f8fd71f2e5f0b98b0d41e3da257281a (patch) | |
tree | 8384a55d211e51dffe7b0c2da0ada570f826df52 /regexec.c | |
parent | def6ed225c88257d6c50813a8212783f0d267e0e (diff) | |
download | perl-5e4a1da18f8fd71f2e5f0b98b0d41e3da257281a.tar.gz |
regexec.c: Fix EXACT node handling in regrepeat()
Commit b40a2c17551b484a78122be98db5dc06bb4614d5 introduced a bug in
handling EXACT nodes when the pattern is in UTF-8. This cleans that up.
Diffstat (limited to 'regexec.c')
-rw-r--r-- | regexec.c | 52 |
1 files changed, 35 insertions, 17 deletions
@@ -6488,31 +6488,48 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case EXACT: c = (U8)*STRING(p); - if (! utf8_target || UNI_IS_INVARIANT(c)) { + /* Can use a simple loop if the pattern char to match on is invariant + * under UTF-8, or both target and pattern aren't UTF-8. Note that we + * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's + * true iff it doesn't matter if the argument is in UTF-8 or not */ + if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) { while (scan < loceol && UCHARAT(scan) == c) { scan++; } } else if (UTF_PATTERN) { - STRLEN scan_char_len; - - loceol = PL_regeol; + if (utf8_target) { + STRLEN scan_char_len; + loceol = PL_regeol; + + /* When both target and pattern are UTF-8, we have to do a + * string EQ */ + while (hardcount < max + && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol + && scan_char_len <= STR_LEN(p) + && memEQ(scan, STRING(p), scan_char_len)) + { + scan += scan_char_len; + hardcount++; + } + } + else if (! UTF8_IS_ABOVE_LATIN1(c)) { - while (hardcount < max - && scan + (scan_char_len = UTF8SKIP(scan)) < loceol - && scan_char_len <= STR_LEN(p) - && memEQ(scan, STRING(p), scan_char_len)) - { - scan += scan_char_len; - hardcount++; - } + /* Target isn't utf8; convert the character in the UTF-8 + * pattern to non-UTF8, and do a simple loop */ + c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1)); + while (scan < loceol && UCHARAT(scan) == c) { + scan++; + } + } /* else pattern char is above Latin1, can't possibly match the + non-UTF-8 target */ } - else { + else { - /* Here, the string is utf8, the pattern isn't, but <c> is different - * in utf8 than not, so can't compare them directly. Outside the - * loop, find the two utf8 bytes that represent c, and then - * look for those in sequence in the utf8 string */ + /* Here, the string must be utf8; pattern isn't, and <c> is + * different in utf8 than not, so can't compare them directly. 
+ * Outside the loop, find the two utf8 bytes that represent c, and + * then look for those in sequence in the utf8 string */ U8 high = UTF8_TWO_BYTE_HI(c); U8 low = UTF8_TWO_BYTE_LO(c); loceol = PL_regeol; @@ -6527,6 +6544,7 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma } } break; + case EXACTFA: utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; |