diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2008-07-07 16:30:33 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2008-07-07 16:30:33 +0000 |
commit | 798472a8be18bffaa324fa1c6cc052895a063e47 (patch) | |
tree | 705a42ce28f65a5b1fc978bccad3831fdf1a9bae | |
parent | b17a62a5663fbb1eda289040153ed59188551e15 (diff) | |
download | pcre-798472a8be18bffaa324fa1c6cc052895a063e47.tar.gz |
Fix caseless backreferences for non-ASCII characters.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@354 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 4 | ||||
-rw-r--r-- | pcre_exec.c | 30 | ||||
-rw-r--r-- | testdata/testinput6 | 18 | ||||
-rw-r--r-- | testdata/testoutput6 | 42 |
4 files changed, 92 insertions, 2 deletions
@@ -17,6 +17,10 @@ Version 8.0 02 Jul-08 3. Change 12 for 7.7 introduced a bug in pcre_study() when a pattern contained a group with a zero qualifier. The result of the study could be incorrect, or the function might crash, depending on the pattern. + +4. Caseless matching was not working for non-ASCII characters in back + references. For example, /(\x{de})\1/8i was not matching \x{de}\x{fe}. + It now works when Unicode Property Support is available. Version 7.7 07-May-08 diff --git a/pcre_exec.c b/pcre_exec.c index 428916b..008b950 100644 --- a/pcre_exec.c +++ b/pcre_exec.c @@ -158,13 +158,39 @@ printf("\n"); if (length > md->end_subject - eptr) return FALSE; -/* Separate the caselesss case for speed */ +/* Separate the caseless case for speed. In UTF-8 mode we can only do this +properly if Unicode properties are supported. Otherwise, we can check only +ASCII characters. */ if ((ims & PCRE_CASELESS) != 0) { +#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UCP + if (md->utf8) + { + USPTR endptr = eptr + length; + while (eptr < endptr) + { + int c, d; + GETCHARINC(c, eptr); + GETCHARINC(d, p); + if (c != d && c != UCD_OTHERCASE(d)) return FALSE; + } + } + else +#endif +#endif + + /* The same code works when not in UTF-8 mode and in UTF-8 mode when there + is no UCP support. */ + while (length-- > 0) - if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; + { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; } } + +/* In the caseful case, we can just compare the bytes, whether or not we +are in UTF-8 mode. */ + else { while (length-- > 0) if (*p++ != *eptr++) return FALSE; } diff --git a/testdata/testinput6 b/testdata/testinput6 index a386e9a..a8640f9 100644 --- a/testdata/testinput6 +++ b/testdata/testinput6 @@ -925,4 +925,22 @@ was broken in all cases./ ** Failers \x{1d79}\x{a77d} +/(A)\1/8i + AA + Aa + aa + aA + +/(\x{de})\1/8i + \x{de}\x{de} + \x{de}\x{fe} + \x{fe}\x{fe} + \x{fe}\x{de} + +/(\x{10a})\1/8i + \x{10a}\x{10a} + \x{10a}\x{10b} + \x{10b}\x{10b} + \x{10b}\x{10a} + / End of testinput6 / diff --git a/testdata/testoutput6 b/testdata/testoutput6 index c3b4c81..caba466 100644 --- a/testdata/testoutput6 +++ b/testdata/testoutput6 @@ -1705,4 +1705,46 @@ No match \x{1d79}\x{a77d} No match +/(A)\1/8i + AA + 0: AA + 1: A + Aa + 0: Aa + 1: A + aa + 0: aa + 1: a + aA + 0: aA + 1: a + +/(\x{de})\1/8i + \x{de}\x{de} + 0: \x{de}\x{de} + 1: \x{de} + \x{de}\x{fe} + 0: \x{de}\x{fe} + 1: \x{de} + \x{fe}\x{fe} + 0: \x{fe}\x{fe} + 1: \x{fe} + \x{fe}\x{de} + 0: \x{fe}\x{de} + 1: \x{fe} + +/(\x{10a})\1/8i + \x{10a}\x{10a} + 0: \x{10a}\x{10a} + 1: \x{10a} + \x{10a}\x{10b} + 0: \x{10a}\x{10b} + 1: \x{10a} + \x{10b}\x{10b} + 0: \x{10b}\x{10b} + 1: \x{10b} + \x{10b}\x{10a} + 0: \x{10b}\x{10a} + 1: \x{10b} + / End of testinput6 / |