summary | refs | log | tree | commit | diff
path: root/regexec.c
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2010-10-20 10:20:29 -0600
committer: Father Chrysostomos <sprout@cpan.org> 2010-10-21 05:56:30 -0700
commit: 634c83a2672252257e360eb1939b7ec762ef6308 (patch)
tree: cb4ff7fdfa156a475b0cd9b5b9ace912cbc84318 /regexec.c
parent: d53d27f973b3f4329ad8aa1e1a11554c8e19c3e3 (diff)
download: perl-634c83a2672252257e360eb1939b7ec762ef6308.tar.gz
regexec.c: utf8 doesn't match non-utf8 self
Some regex patterns fail to match a character against itself when the target string is in utf8, the pattern isn't, and the character is variant under utf8 (that is, its utf8 representation differs from its single-byte representation). (This means only Latin1-range characters in the pattern are affected.) The solution is to test for this case and use the utf8 representation of the pattern character for the comparison.
Diffstat (limited to 'regexec.c')
-rw-r--r-- regexec.c 40
1 files changed, 37 insertions, 3 deletions
diff --git a/regexec.c b/regexec.c
index 901703fd3f..f87c2fa93c 100644
--- a/regexec.c
+++ b/regexec.c
@@ -5750,10 +5750,44 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
case CANY:
scan = loceol;
break;
- case EXACT: /* length of string is 1 */
+ case EXACT:
+ /* To get here, EXACT nodes must have *byte* length == 1. That means
+ * they match only characters in the string that can be expressed as a
+ * single byte. For non-utf8 strings, that means a simple match. For
+ * utf8 strings, the character matched must be an invariant, or
+ * downgradable to a single byte. The pattern's utf8ness is
+ * irrelevant, as it must be a single byte, so either it isn't utf8, or
+ * if it is it's an invariant */
+
c = (U8)*STRING(p);
- while (scan < loceol && UCHARAT(scan) == c)
- scan++;
+ assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
+ if ((! utf8_target) || UNI_IS_INVARIANT(c)) {
+
+ /* Here, the string isn't utf8, or the character in the EXACT
+ * node is the same in utf8 as not, so can just do equality.
+ * Each matching char must be 1 byte long */
+ while (scan < loceol && UCHARAT(scan) == c) {
+ scan++;
+ }
+ }
+ else {
+
+ /* Here, the string is utf8, and the char to match is different
+ * in utf8 than not. Fastest to find the two utf8 bytes that
+ * represent c, and then look for those in sequence in the utf8
+ * string */
+ U8 high = UTF8_TWO_BYTE_HI(c);
+ U8 low = UTF8_TWO_BYTE_LO(c);
+ loceol = PL_regeol;
+ while (hardcount < max
+ && scan + 1 < loceol
+ && UCHARAT(scan) == high
+ && UCHARAT(scan + 1) == low)
+ {
+ scan += 2;
+ hardcount++;
+ }
+ }
break;
case EXACTF: /* length of string is 1 */
c = (U8)*STRING(p);