summary refs log tree commit diff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2012-04-20 17:28:23 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2012-04-20 17:28:23 +0000
commit 0ccf556bc0830fdc09d32d52ae317a95fe631ca9 (patch)
tree dea25f018b65a3c9c8ae3cb8e6eff14d1f8ae57e
parent 7a2c66fafd71e7d5cd43ba0cc5648531d3696227 (diff)
download pcre-0ccf556bc0830fdc09d32d52ae317a95fe631ca9.tar.gz
Fix auto-possessifying bugs when PCRE_UCP is not set but the character tables
specify that characters in the range 128-255 are letters, spaces, etc.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@962 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 35
-rw-r--r-- pcre_compile.c 24
-rw-r--r-- testdata/testinput15 36
-rw-r--r-- testdata/testinput18 36
-rw-r--r-- testdata/testoutput15 136
-rw-r--r-- testdata/testoutput18 136
6 files changed, 376 insertions, 27 deletions
diff --git a/ChangeLog b/ChangeLog
index cc900d9..11d0026 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -9,7 +9,7 @@ Version 8.31
2. Removed a bashism from the RunTest script.
3. Add a cast to pcre_exec.c to fix the warning "unary minus operator applied
- to unsigned type, result still unsigned" that was given by an MS compiler
+ to unsigned type, result still unsigned" that was given by an MS compiler
on encountering the code "-sizeof(xxx)".
4. Partial matching support is added to the JIT compiler.
@@ -18,13 +18,13 @@ Version 8.31
of more than one character:
(a) /^(..)\1/ did not partially match "aba" because checking references was
- done on an "all or nothing" basis. This also applied to repeated
+ done on an "all or nothing" basis. This also applied to repeated
references.
(b) \R did not give a hard partial match if \r was found at the end of the
subject.
- (c) \X did not give a hard partial match after matching one or more
+ (c) \X did not give a hard partial match after matching one or more
characters at the end of the subject.
(d) When newline was set to CRLF, a pattern such as /a$/ did not recognize
@@ -33,11 +33,11 @@ Version 8.31
(e) When newline was set to CRLF, the metacharacter "." did not recognize
a partial match for a CR character at the end of the subject string.
-6. If JIT is requested using /S++ or -s++ (instead of just /S+ or -s+) when
- running pcretest, the text "(JIT)" added to the output whenever JIT is
+6. If JIT is requested using /S++ or -s++ (instead of just /S+ or -s+) when
+ running pcretest, the text "(JIT)" is added to the output whenever JIT is
actually used to run the match.
-7. Individual JIT compile options can be set in pcretest by following -s+[+]
+7. Individual JIT compile options can be set in pcretest by following -s+[+]
or /S+[+] with a digit between 1 and 7.
8. OP_NOT now supports any UTF character not just single-byte ones.
@@ -46,21 +46,21 @@ Version 8.31
10. The command "./RunTest list" lists the available tests without actually
running any of them. (Because I keep forgetting what they all are.)
-
-11. Add PCRE_INFO_MAXLOOKBEHIND.
+
+11. Add PCRE_INFO_MAXLOOKBEHIND.
12. Applied a (slightly modified) user-supplied patch that improves performance
when the heap is used for recursion (compiled with --disable-stack-for-
- recursion). Instead of malloc and free for each heap frame each time a
- logical recursion happens, frames are retained on a chain and re-used where
- possible. This sometimes gives as much as 30% improvement.
-
+ recursion). Instead of malloc and free for each heap frame each time a
+ logical recursion happens, frames are retained on a chain and re-used where
+ possible. This sometimes gives as much as 30% improvement.
+
13. As documented, (*COMMIT) is now confined to within a recursive subpattern
call.
14. As documented, (*COMMIT) is now confined to within a positive assertion.
-15. It is now possible to link pcretest with libedit as an alternative to
+15. It is now possible to link pcretest with libedit as an alternative to
libreadline.
16. (*COMMIT) control verb is now supported by the JIT compiler.
@@ -86,10 +86,15 @@ Version 8.31
matches in certain environments (the workspace was not being correctly
retained). Also added to pcre_dfa_exec() a simple plausibility check on
some of the workspace data at the beginning of a restart.
-
+
25. \s*\R was auto-possessifying the \s* when it should not, whereas \S*\R
was not doing so when it should - probably a typo introduced by SVN 528
- (change 8.10/14).
+ (change 8.10/14).
+
+26. When PCRE_UCP was not set, \w+\x{c4} was incorrectly auto-possessifying the
+ \w+ when the character tables indicated that \x{c4} was a word character.
+ There were several related cases, all because the tests for doing a table
+ lookup were testing for characters not greater than 127 instead of 255.
Version 8.30 04-February-2012
diff --git a/pcre_compile.c b/pcre_compile.c
index b267f31..07b8a00 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -3132,22 +3132,22 @@ if (next >= 0) switch(op_code)
When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
case OP_DIGIT:
- return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
+ return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
case OP_NOT_DIGIT:
- return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
+ return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
case OP_WHITESPACE:
- return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
+ return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
case OP_NOT_WHITESPACE:
- return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
+ return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
case OP_WORDCHAR:
- return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
+ return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
case OP_NOT_WORDCHAR:
- return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
+ return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
case OP_HSPACE:
case OP_NOT_HSPACE:
@@ -3225,22 +3225,22 @@ switch(op_code)
switch(-next)
{
case ESC_d:
- return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
+ return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;
case ESC_D:
- return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
+ return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;
case ESC_s:
- return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
+ return c > 255 || (cd->ctypes[c] & ctype_space) == 0;
case ESC_S:
- return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
+ return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;
case ESC_w:
- return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
+ return c > 255 || (cd->ctypes[c] & ctype_word) == 0;
case ESC_W:
- return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
+ return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;
case ESC_h:
case ESC_H:
diff --git a/testdata/testinput15 b/testdata/testinput15
index f995646..48e1b03 100644
--- a/testdata/testinput15
+++ b/testdata/testinput15
@@ -277,4 +277,40 @@ correctly, but that messes up comparisons). --/
/\777/8DZ
+/\w+\x{C4}/8BZ
+ a\x{C4}\x{C4}
+
+/\w+\x{C4}/8BZT1
+ a\x{C4}\x{C4}
+
+/\W+\x{C4}/8BZ
+ !\x{C4}
+
+/\W+\x{C4}/8BZT1
+ !\x{C4}
+
+/\W+\x{A1}/8BZ
+ !\x{A1}
+
+/\W+\x{A1}/8BZT1
+ !\x{A1}
+
+/X\s+\x{A0}/8BZ
+ X\x20\x{A0}\x{A0}
+
+/X\s+\x{A0}/8BZT1
+ X\x20\x{A0}\x{A0}
+
+/\S+\x{A0}/8BZ
+ X\x{A0}\x{A0}
+
+/\S+\x{A0}/8BZT1
+ X\x{A0}\x{A0}
+
+/\x{a0}+\s!/8BZ
+ \x{a0}\x20!
+
+/\x{a0}+\s!/8BZT1
+ \x{a0}\x20!
+
/-- End of testinput15 --/
diff --git a/testdata/testinput18 b/testdata/testinput18
index e6c77a8..d0e46dd 100644
--- a/testdata/testinput18
+++ b/testdata/testinput18
@@ -240,4 +240,40 @@ correctly, but that messes up comparisons). --/
/νΌ€/8
+/\w+\x{C4}/8BZ
+ a\x{C4}\x{C4}
+
+/\w+\x{C4}/8BZT1
+ a\x{C4}\x{C4}
+
+/\W+\x{C4}/8BZ
+ !\x{C4}
+
+/\W+\x{C4}/8BZT1
+ !\x{C4}
+
+/\W+\x{A1}/8BZ
+ !\x{A1}
+
+/\W+\x{A1}/8BZT1
+ !\x{A1}
+
+/X\s+\x{A0}/8BZ
+ X\x20\x{A0}\x{A0}
+
+/X\s+\x{A0}/8BZT1
+ X\x20\x{A0}\x{A0}
+
+/\S+\x{A0}/8BZ
+ X\x{A0}\x{A0}
+
+/\S+\x{A0}/8BZT1
+ X\x{A0}\x{A0}
+
+/\x{a0}+\s!/8BZ
+ \x{a0}\x20!
+
+/\x{a0}+\s!/8BZT1
+ \x{a0}\x20!
+
/-- End of testinput18 --/
diff --git a/testdata/testoutput15 b/testdata/testoutput15
index c965939..b145ea6 100644
--- a/testdata/testoutput15
+++ b/testdata/testoutput15
@@ -910,4 +910,140 @@ Options: utf
First char = \x{c7}
Need char = \x{bf}
+/\w+\x{C4}/8BZ
+------------------------------------------------------------------
+ Bra
+ \w++
+ \x{c4}
+ Ket
+ End
+------------------------------------------------------------------
+ a\x{C4}\x{C4}
+ 0: a\x{c4}
+
+/\w+\x{C4}/8BZT1
+------------------------------------------------------------------
+ Bra
+ \w+
+ \x{c4}
+ Ket
+ End
+------------------------------------------------------------------
+ a\x{C4}\x{C4}
+ 0: a\x{c4}\x{c4}
+
+/\W+\x{C4}/8BZ
+------------------------------------------------------------------
+ Bra
+ \W+
+ \x{c4}
+ Ket
+ End
+------------------------------------------------------------------
+ !\x{C4}
+ 0: !\x{c4}
+
+/\W+\x{C4}/8BZT1
+------------------------------------------------------------------
+ Bra
+ \W++
+ \x{c4}
+ Ket
+ End
+------------------------------------------------------------------
+ !\x{C4}
+ 0: !\x{c4}
+
+/\W+\x{A1}/8BZ
+------------------------------------------------------------------
+ Bra
+ \W+
+ \x{a1}
+ Ket
+ End
+------------------------------------------------------------------
+ !\x{A1}
+ 0: !\x{a1}
+
+/\W+\x{A1}/8BZT1
+------------------------------------------------------------------
+ Bra
+ \W+
+ \x{a1}
+ Ket
+ End
+------------------------------------------------------------------
+ !\x{A1}
+ 0: !\x{a1}
+
+/X\s+\x{A0}/8BZ
+------------------------------------------------------------------
+ Bra
+ X
+ \s++
+ \x{a0}
+ Ket
+ End
+------------------------------------------------------------------
+ X\x20\x{A0}\x{A0}
+ 0: X \x{a0}
+
+/X\s+\x{A0}/8BZT1
+------------------------------------------------------------------
+ Bra
+ X
+ \s+
+ \x{a0}
+ Ket
+ End
+------------------------------------------------------------------
+ X\x20\x{A0}\x{A0}
+ 0: X \x{a0}\x{a0}
+
+/\S+\x{A0}/8BZ
+------------------------------------------------------------------
+ Bra
+ \S+
+ \x{a0}
+ Ket
+ End
+------------------------------------------------------------------
+ X\x{A0}\x{A0}
+ 0: X\x{a0}\x{a0}
+
+/\S+\x{A0}/8BZT1
+------------------------------------------------------------------
+ Bra
+ \S++
+ \x{a0}
+ Ket
+ End
+------------------------------------------------------------------
+ X\x{A0}\x{A0}
+ 0: X\x{a0}
+
+/\x{a0}+\s!/8BZ
+------------------------------------------------------------------
+ Bra
+ \x{a0}++
+ \s
+ !
+ Ket
+ End
+------------------------------------------------------------------
+ \x{a0}\x20!
+ 0: \x{a0} !
+
+/\x{a0}+\s!/8BZT1
+------------------------------------------------------------------
+ Bra
+ \x{a0}+
+ \s
+ !
+ Ket
+ End
+------------------------------------------------------------------
+ \x{a0}\x20!
+ 0: \x{a0} !
+
/-- End of testinput15 --/
diff --git a/testdata/testoutput18 b/testdata/testoutput18
index a129854..adc3ab4 100644
--- a/testdata/testoutput18
+++ b/testdata/testoutput18
@@ -845,4 +845,140 @@ Error -24 (bad offset value)
/νΌ€/8
Failed: invalid UTF-16 string at offset 0
+/\w+\x{C4}/8BZ
+------------------------------------------------------------------
+ Bra
+ \w++
+ \xc4
+ Ket
+ End
+------------------------------------------------------------------
+ a\x{C4}\x{C4}
+ 0: a\x{c4}
+
+/\w+\x{C4}/8BZT1
+------------------------------------------------------------------
+ Bra
+ \w+
+ \xc4
+ Ket
+ End
+------------------------------------------------------------------
+ a\x{C4}\x{C4}
+ 0: a\x{c4}\x{c4}
+
+/\W+\x{C4}/8BZ
+------------------------------------------------------------------
+ Bra
+ \W+
+ \xc4
+ Ket
+ End
+------------------------------------------------------------------
+ !\x{C4}
+ 0: !\x{c4}
+
+/\W+\x{C4}/8BZT1
+------------------------------------------------------------------
+ Bra
+ \W++
+ \xc4
+ Ket
+ End
+------------------------------------------------------------------
+ !\x{C4}
+ 0: !\x{c4}
+
+/\W+\x{A1}/8BZ
+------------------------------------------------------------------
+ Bra
+ \W+
+ \xa1
+ Ket
+ End
+------------------------------------------------------------------
+ !\x{A1}
+ 0: !\x{a1}
+
+/\W+\x{A1}/8BZT1
+------------------------------------------------------------------
+ Bra
+ \W+
+ \xa1
+ Ket
+ End
+------------------------------------------------------------------
+ !\x{A1}
+ 0: !\x{a1}
+
+/X\s+\x{A0}/8BZ
+------------------------------------------------------------------
+ Bra
+ X
+ \s++
+ \xa0
+ Ket
+ End
+------------------------------------------------------------------
+ X\x20\x{A0}\x{A0}
+ 0: X \x{a0}
+
+/X\s+\x{A0}/8BZT1
+------------------------------------------------------------------
+ Bra
+ X
+ \s+
+ \xa0
+ Ket
+ End
+------------------------------------------------------------------
+ X\x20\x{A0}\x{A0}
+ 0: X \x{a0}\x{a0}
+
+/\S+\x{A0}/8BZ
+------------------------------------------------------------------
+ Bra
+ \S+
+ \xa0
+ Ket
+ End
+------------------------------------------------------------------
+ X\x{A0}\x{A0}
+ 0: X\x{a0}\x{a0}
+
+/\S+\x{A0}/8BZT1
+------------------------------------------------------------------
+ Bra
+ \S++
+ \xa0
+ Ket
+ End
+------------------------------------------------------------------
+ X\x{A0}\x{A0}
+ 0: X\x{a0}
+
+/\x{a0}+\s!/8BZ
+------------------------------------------------------------------
+ Bra
+ \xa0++
+ \s
+ !
+ Ket
+ End
+------------------------------------------------------------------
+ \x{a0}\x20!
+ 0: \x{a0} !
+
+/\x{a0}+\s!/8BZT1
+------------------------------------------------------------------
+ Bra
+ \xa0+
+ \s
+ !
+ Ket
+ End
+------------------------------------------------------------------
+ \x{a0}\x20!
+ 0: \x{a0} !
+
/-- End of testinput18 --/