diff options
author | Karl Williamson <public@khwilliamson.com> | 2010-10-20 10:20:29 -0600 |
---|---|---|
committer | Father Chrysostomos <sprout@cpan.org> | 2010-10-21 05:56:30 -0700 |
commit | 634c83a2672252257e360eb1939b7ec762ef6308 (patch) | |
tree | cb4ff7fdfa156a475b0cd9b5b9ace912cbc84318 /regexec.c | |
parent | d53d27f973b3f4329ad8aa1e1a11554c8e19c3e3 (diff) | |
download | perl-634c83a2672252257e360eb1939b7ec762ef6308.tar.gz |
regexec.c: utf8 doesn't match non-utf8 self
Some regex patterns fail to match a character against itself when the
target string is in utf8, the pattern is not, and the character is variant
under utf8 (i.e., its utf8 encoding differs from its single-byte encoding).
This means only Latin1-range characters in the pattern are affected.
The solution is to test for this case and use the utf8 representation of
the pattern character for the comparison.
Diffstat (limited to 'regexec.c')
-rw-r--r-- | regexec.c | 40 |
1 files changed, 37 insertions, 3 deletions
@@ -5750,10 +5750,44 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) case CANY: scan = loceol; break; - case EXACT: /* length of string is 1 */ + case EXACT: + /* To get here, EXACT nodes must have *byte* length == 1. That means + * they match only characters in the string that can be expressed as a + * single byte. For non-utf8 strings, that means a simple match. For + * utf8 strings, the character matched must be an invariant, or + * downgradable to a single byte. The pattern's utf8ness is + * irrelevant, as it must be a single byte, so either it isn't utf8, or + * if it is it's an invariant */ + c = (U8)*STRING(p); - while (scan < loceol && UCHARAT(scan) == c) - scan++; + assert(! UTF_PATTERN || UNI_IS_INVARIANT(c)); + if ((! utf8_target) || UNI_IS_INVARIANT(c)) { + + /* Here, the string isn't utf8, or the character in the EXACT + * node is the same in utf8 as not, so can just do equality. + * Each matching char must be 1 byte long */ + while (scan < loceol && UCHARAT(scan) == c) { + scan++; + } + } + else { + + /* Here, the string is utf8, and the char to match is different + * in utf8 than not. Fastest to find the two utf8 bytes that + * represent c, and then look for those in sequence in the utf8 + * string */ + U8 high = UTF8_TWO_BYTE_HI(c); + U8 low = UTF8_TWO_BYTE_LO(c); + loceol = PL_regeol; + while (hardcount < max + && scan + 1 < loceol + && UCHARAT(scan) == high + && UCHARAT(scan + 1) == low) + { + scan += 2; + hardcount++; + } + } break; case EXACTF: /* length of string is 1 */ c = (U8)*STRING(p); |