Load pcre-2.05 into code/trunk.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@33 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2007-02-24 21:39:01 +0000
committer: nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2007-02-24 21:39:01 +0000
commit: 4864ac99ba4c4395fd8dc157ec734e228c780eb4 (patch)
tree: 05365588d734737b615b02336dc978642ce55783
parent: 685a411841b5e69517e02508446b317764cd6d70 (diff)
download: pcre-4864ac99ba4c4395fd8dc157ec734e228c780eb4.tar.gz
12 files changed, 403 insertions, 46 deletions
diff --git a/ChangeLog b/ChangeLog
index 56f17fa..2259f87 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -2,6 +2,20 @@ ChangeLog for PCRE
 ------------------
 
 
+Version 2.05 21-Apr-99
+----------------------
+
+1. Changed the type of magic_number from unsigned int to unsigned long int so
+that it works properly on 16-bit systems.
+
+2. Fixed a bug which caused patterns starting with .* not to work correctly
+when the subject string contained newline characters. PCRE was assuming
+anchoring for such patterns in all cases, which is not correct because .* will
+not pass a newline unless PCRE_DOTALL is set. It now assumes anchoring only if
+DOTALL is set at top level; otherwise it knows that patterns starting with .*
+must be retried after every newline in the subject.
+
+
 Version 2.04 18-Feb-99
 ----------------------
 
diff --git a/README b/README
index 02803f6..2db0070 100644
--- a/README
+++ b/README
@@ -41,11 +41,11 @@ The distribution should contain the following files:
   pgrep.1           man page for pgrep
   pgrep.c           source of a grep utility that uses PCRE
   perltest          Perl test program
-  testinput         test data, compatible with Perl 5.004 and 5.005
+  testinput1        test data, compatible with Perl 5.004 and 5.005
   testinput2        test data for error messages and non-Perl things
   testinput3        test data, compatible with Perl 5.005
   testinput4        test data for locale-specific tests
-  testoutput        test results corresponding to testinput
+  testoutput1       test results corresponding to testinput1
   testoutput2       test results corresponding to testinput2
   testoutput3       test results corresponding to testinput3
   testoutput4       test results corresponding to testinput4
@@ -314,7 +314,7 @@ The perltest program
 The perltest program tests Perl's regular expressions; it has the same
 specification as pcretest, and so can be given identical input, except that
 input patterns can be followed only by Perl's lower case options. The contents
-of testinput and testinput3 meet this condition.
+of testinput1 and testinput3 meet this condition.
 
 The data lines are processed as Perl strings, so if they contain $ or @
 characters, these have to be escaped. For this reason, all such characters in
@@ -330,4 +330,4 @@ contains malformed regular expressions, in order to check that PCRE diagnoses
 them correctly.
 
 Philip Hazel <ph10@cam.ac.uk>
-February 1999
+April 1999
diff --git a/RunTest b/RunTest
index 385be54..a23c511 100755
--- a/RunTest
+++ b/RunTest
@@ -33,9 +33,9 @@ fi
 
 if [ $do1 = yes ] ; then
   echo "Testing main functionality (Perl compatible)"
-  ./pcretest testinput testtry
+  ./pcretest testinput1 testtry
   if [ $? = 0 ] ; then
-    $cf testtry testoutput
+    $cf testtry testoutput1
     if [ $? != 0 ] ; then exit 1; fi
   else exit 1
   fi
diff --git a/internal.h b/internal.h
index a955393..2b28ac1 100644
--- a/internal.h
+++ b/internal.h
@@ -3,7 +3,7 @@
 *************************************************/
 
 
-#define PCRE_VERSION       "2.04 19-Feb-1999"
+#define PCRE_VERSION       "2.05 21-Apr-1999"
 
 
 /* This is a library of functions to support regular expressions whose syntax
@@ -92,7 +92,7 @@ time, run time or study time, respectively. */
 
 /* Magic number to provide a small check against being handed junk. */
 
-#define MAGIC_NUMBER  0x50435245   /* 'PCRE' */
+#define MAGIC_NUMBER  0x50435245UL   /* 'PCRE' */
 
 /* Miscellaneous definitions */
 
@@ -262,7 +262,7 @@ typedef unsigned char uschar;
 runs on as long as necessary after the end. */
 
 typedef struct real_pcre {
-  unsigned int  magic_number;
+  unsigned long int magic_number;
   const unsigned char *tables;
   unsigned short int options;
   unsigned char top_bracket;
diff --git a/pcre.3 b/pcre.3
index 927a6ad..ec356e1 100644
--- a/pcre.3
+++ b/pcre.3
@@ -264,14 +264,25 @@ negative numbers:
   PCRE_ERROR_BADMAGIC   the "magic number" was not found
 
 If the \fIoptptr\fR argument is not NULL, a copy of the options with which the
-pattern was compiled is placed in the integer it points to.
-
-If the \fIfirstcharptr\fR argument is not NULL, is is used to pass back
-information about the first character of any matched string. If there is a
-fixed first character, e.g. from a pattern such as (cat|cow|coyote), then it is
-returned in the integer pointed to by \fIfirstcharptr\fR. Otherwise, if the
-pattern was compiled with the PCRE_MULTILINE option, and every branch started
-with "^", then -1 is returned, indicating that the pattern will match at the
+pattern was compiled is placed in the integer it points to. These option bits
+are those specified in the call to \fBpcre_compile()\fR, modified by any
+top-level option settings within the pattern itself, and with the PCRE_ANCHORED
+bit set if the form of the pattern implies that it can match only at the start
+of a subject string.
+
+If the pattern is not anchored and the \fIfirstcharptr\fR argument is not NULL,
+it is used to pass back information about the first character of any matched
+string. If there is a fixed first character, e.g. from a pattern such as
+(cat|cow|coyote), then it is returned in the integer pointed to by
+\fIfirstcharptr\fR. Otherwise, if either
+
+  (a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
+      starts with "^", or
+
+  (b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set
+      (if it were set, the pattern would be anchored),
+
+then -1 is returned, indicating that the pattern matches only at the
 start of a subject string or after any "\\n" within the string. Otherwise -2 is
 returned.
 
@@ -1050,9 +1061,15 @@ When a parenthesized subpattern is quantified with a minimum repeat count that
 is greater than 1 or with a limited maximum, more store is required for the
 compiled pattern, in proportion to the size of the minimum or maximum.
 
-If a pattern starts with .* then it is implicitly anchored, since whatever
-follows will be tried against every character position in the subject string.
-PCRE treats this as though it were preceded by \\A.
+If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equivalent
+to Perl's /s) is set, thus allowing the . to match newlines, then the pattern
+is implicitly anchored, because whatever follows will be tried against every
+character position in the subject string, so there is no point in retrying the
+overall match at any position after the first. PCRE treats such a pattern as
+though it were preceded by \\A. In cases where it is known that the subject
+string contains no newlines, it is worth setting PCRE_DOTALL when the pattern
+begins with .* in order to obtain this optimization, or alternatively using ^
+to indicate anchoring explicitly.
 
 When a capturing subpattern is repeated, the value captured is the substring
 that matched the final iteration. For example, after
@@ -1262,7 +1279,7 @@ proceeds from left to right, PCRE will look for each "a" in the subject and
 then see if what follows matches the rest of the pattern. If the pattern is
 specified as
 
-  .*abcd$
+  ^.*abcd$
 
 then the initial .* matches the entire string at first, but when this fails, it
 backtracks to match all but the last character, then all but the last two
@@ -1270,7 +1287,7 @@ characters, and so on. Once again the search for "a" covers the entire string,
 from right to left, so we are no better off. However, if the pattern is written
 as
 
-  (?>.*)(?<=abcd)
+  ^(?>.*)(?<=abcd)
 
 then there can be no backtracking for the .* item; it can match only the entire
 string. The subsequent lookbehind assertion does a single test on the last four
@@ -1344,6 +1361,23 @@ required behaviour is usually the most efficient. Jeffrey Friedl's book
 contains a lot of discussion about optimizing regular expressions for efficient
 performance.
 
+When a pattern begins with .* and the PCRE_DOTALL option is set, the pattern is
+implicitly anchored by PCRE, since it can match only at the start of a subject
+string. However, if PCRE_DOTALL is not set, PCRE cannot make this optimization,
+because the . metacharacter does not then match a newline, and if the subject
+string contains newlines, the pattern may match from the character immediately
+following one of them instead of from the very start. For example, the pattern
+
+   (.*) second
+
+matches the subject "first\\nand second" (where \\n stands for a newline
+character) with the first captured substring being "and". In order to do this,
+PCRE has to retry the match starting after every newline in the subject.
+
+If you are using such a pattern with subject strings that do not contain
+newlines, the best performance is obtained by setting PCRE_DOTALL, or starting
+the pattern with ^.* to indicate explicit anchoring. That saves PCRE from
+having to scan along the subject looking for a newline to restart at.
 
 .SH AUTHOR
 Philip Hazel <ph10@cam.ac.uk>
diff --git a/pcre.c b/pcre.c
index d4af5f8..dd5852d 100644
--- a/pcre.c
+++ b/pcre.c
@@ -1817,9 +1817,9 @@ all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
 it's anchored. However, if this is a multiline pattern, then only OP_SOD
 counts, since OP_CIRC can match in the middle.
 
-A branch is also implicitly anchored if it starts with .* because that will try
-the rest of the pattern at all possible matching points, so there is no point
-trying them again.
+A branch is also implicitly anchored if it starts with .* and DOTALL is set,
+because that will try the rest of the pattern at all possible matching points,
+so there is no point trying them again.
 
 Arguments:
   code       points to start of expression (the bracket)
@@ -1837,7 +1837,8 @@ do {
    register int op = *scode;
    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
      { if (!is_anchored(scode, options)) return FALSE; }
-   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
+   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
+            (*options & PCRE_DOTALL) != 0)
      { if (scode[1] != OP_ANY) return FALSE; }
    else if (op != OP_SOD &&
            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
@@ -1851,11 +1852,13 @@ return TRUE;
 
 
 /*************************************************
-*     Check for start with \n line expression    *
+*         Check for starting with ^ or .*        *
 *************************************************/
 
-/* This is called for multiline expressions to try to find out if every branch
-starts with ^ so that "first char" processing can be done to speed things up.
+/* This is called to find out if every branch starts with ^ or .* so that
+"first char" processing can be done to speed things up in multiline
+matching and for non-DOTALL patterns that start with .* (which must start at
+the beginning or after \n).
 
 Argument:  points to start of expression (the bracket)
 Returns:   TRUE or FALSE
@@ -1869,6 +1872,8 @@ do {
    register int op = *scode;
    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
      { if (!is_startline(scode)) return FALSE; }
+   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
+     { if (scode[1] != OP_ANY) return FALSE; }
    else if (op != OP_CIRC) return FALSE;
    code += (code[1] << 8) + code[2];
    }
@@ -2546,11 +2551,15 @@ if (*errorptr != NULL)
   return NULL;
   }
 
-/* If the anchored option was not passed, set flag if we can determine that it
-is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if
-we can determine what the first character has to be, because that speeds up
-unanchored matches no end. In the case of multiline matches, an alternative is
-to set the PCRE_STARTLINE flag if all branches start with ^. */
+/* If the anchored option was not passed, set flag if we can determine that the
+pattern is anchored by virtue of ^ characters or \A or anything else (such as
+starting with .* when DOTALL is set).
+
+Otherwise, see if we can determine what the first character has to be, because
+that speeds up unanchored matches no end. If not, see if we can set the
+PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
+start with ^, and also when all branches start with .* for non-DOTALL matches.
+*/
 
 if ((options & PCRE_ANCHORED) == 0)
   {
diff --git a/testinput b/testinput1
index d5d7eb3..2d0116c 100644
--- a/testinput
+++ b/testinput1
@@ -1740,4 +1740,77 @@
     aaa
     aaaaaaaa    
 
+/.*\.gif/
+    borfle\nbib.gif\nno
+
+/.{0,}\.gif/
+    borfle\nbib.gif\nno
+
+/.*\.gif/m
+    borfle\nbib.gif\nno
+
+/.*\.gif/s
+    borfle\nbib.gif\nno
+
+/.*\.gif/ms
+    borfle\nbib.gif\nno
+    
+/.*$/
+    borfle\nbib.gif\nno
+
+/.*$/m
+    borfle\nbib.gif\nno
+
+/.*$/s
+    borfle\nbib.gif\nno
+
+/.*$/ms
+    borfle\nbib.gif\nno
+    
+/.*$/
+    borfle\nbib.gif\nno\n
+
+/.*$/m
+    borfle\nbib.gif\nno\n
+
+/.*$/s
+    borfle\nbib.gif\nno\n
+
+/.*$/ms
+    borfle\nbib.gif\nno\n
+    
+/(.*X|^B)/
+    abcde\n1234Xyz
+    BarFoo 
+    *** Failers
+    abcde\nBar  
+
+/(.*X|^B)/m
+    abcde\n1234Xyz
+    BarFoo 
+    abcde\nBar  
+
+/(.*X|^B)/s
+    abcde\n1234Xyz
+    BarFoo 
+    *** Failers
+    abcde\nBar  
+
+/(.*X|^B)/ms
+    abcde\n1234Xyz
+    BarFoo 
+    abcde\nBar  
+
+/(?s)(.*X|^B)/
+    abcde\n1234Xyz
+    BarFoo 
+    *** Failers 
+    abcde\nBar  
+
+/(?s:.*X|^B)/
+    abcde\n1234Xyz
+    BarFoo 
+    *** Failers 
+    abcde\nBar  
+
 / End of test input /
diff --git a/testinput2 b/testinput2
index e8ca8f8..39a7560 100644
--- a/testinput2
+++ b/testinput2
@@ -430,4 +430,16 @@
 )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ 
 )?)?)?)?)?)?)?)?)?otherword/M
 
+/.*X/D
+
+/.*X/Ds
+
+/(.*X|^B)/D
+
+/(.*X|^B)/Ds
+    
+/(?s)(.*X|^B)/D
+
+/(?s:.*X|^B)/D
+
 / End of test input /
diff --git a/testoutput b/testoutput1
index 3f677c0..bfe8862 100644
--- a/testoutput
+++ b/testoutput1
@@ -1,4 +1,4 @@
-PCRE version 2.04 19-Feb-1999
+PCRE version 2.05 21-Apr-1999
 
 /the quick brown fox/
     the quick brown fox
@@ -2647,5 +2647,125 @@ No match
  0: aaaaaaaa
  1: a
 
+/.*\.gif/
+    borfle\nbib.gif\nno
+ 0: bib.gif
+
+/.{0,}\.gif/
+    borfle\nbib.gif\nno
+ 0: bib.gif
+
+/.*\.gif/m
+    borfle\nbib.gif\nno
+ 0: bib.gif
+
+/.*\.gif/s
+    borfle\nbib.gif\nno
+ 0: borfle\x0abib.gif
+
+/.*\.gif/ms
+    borfle\nbib.gif\nno
+ 0: borfle\x0abib.gif
+    
+/.*$/
+    borfle\nbib.gif\nno
+ 0: no
+
+/.*$/m
+    borfle\nbib.gif\nno
+ 0: borfle
+
+/.*$/s
+    borfle\nbib.gif\nno
+ 0: borfle\x0abib.gif\x0ano
+
+/.*$/ms
+    borfle\nbib.gif\nno
+ 0: borfle\x0abib.gif\x0ano
+    
+/.*$/
+    borfle\nbib.gif\nno\n
+ 0: no
+
+/.*$/m
+    borfle\nbib.gif\nno\n
+ 0: borfle
+
+/.*$/s
+    borfle\nbib.gif\nno\n
+ 0: borfle\x0abib.gif\x0ano\x0a
+
+/.*$/ms
+    borfle\nbib.gif\nno\n
+ 0: borfle\x0abib.gif\x0ano\x0a
+    
+/(.*X|^B)/
+    abcde\n1234Xyz
+ 0: 1234X
+ 1: 1234X
+    BarFoo 
+ 0: B
+ 1: B
+    *** Failers
+No match
+    abcde\nBar  
+No match
+
+/(.*X|^B)/m
+    abcde\n1234Xyz
+ 0: 1234X
+ 1: 1234X
+    BarFoo 
+ 0: B
+ 1: B
+    abcde\nBar  
+ 0: B
+ 1: B
+
+/(.*X|^B)/s
+    abcde\n1234Xyz
+ 0: abcde\x0a1234X
+ 1: abcde\x0a1234X
+    BarFoo 
+ 0: B
+ 1: B
+    *** Failers
+No match
+    abcde\nBar  
+No match
+
+/(.*X|^B)/ms
+    abcde\n1234Xyz
+ 0: abcde\x0a1234X
+ 1: abcde\x0a1234X
+    BarFoo 
+ 0: B
+ 1: B
+    abcde\nBar  
+ 0: B
+ 1: B
+
+/(?s)(.*X|^B)/
+    abcde\n1234Xyz
+ 0: abcde\x0a1234X
+ 1: abcde\x0a1234X
+    BarFoo 
+ 0: B
+ 1: B
+    *** Failers 
+No match
+    abcde\nBar  
+No match
+
+/(?s:.*X|^B)/
+    abcde\n1234Xyz
+ 0: abcde\x0a1234X
+    BarFoo 
+ 0: B
+    *** Failers 
+No match
+    abcde\nBar  
+No match
+
 / End of test input /
 
diff --git a/testoutput2 b/testoutput2
index af2309e..09148ff 100644
--- a/testoutput2
+++ b/testoutput2
@@ -1,4 +1,4 @@
-PCRE version 2.04 19-Feb-1999
+PCRE version 2.05 21-Apr-1999
 
 /(a)b|/
 Identifying subpattern count = 1
@@ -106,13 +106,13 @@ Failed: unrecognized character after (? at offset 2
 
 /.*b/
 Identifying subpattern count = 0
-Options: anchored
-No first char
+No options
+First char at start or follows \n
 
 /.*?b/
 Identifying subpattern count = 0
-Options: anchored
-No first char
+No options
+First char at start or follows \n
 
 /cat|dog|elephant/
 Identifying subpattern count = 0
@@ -299,8 +299,8 @@ No first char
 
 /.*((abc)$|(def))/
 Identifying subpattern count = 3
-Options: anchored
-No first char
+No options
+First char at start or follows \n
     defabc
  0: defabc
  1: abc
@@ -679,8 +679,8 @@ No first char
 
 /(?>.*)(?<=(abcd)|(xyz))/
 Identifying subpattern count = 2
-Options: anchored
-No first char
+No options
+First char at start or follows \n
     alphabetabcd
  0: alphabetabcd
  1: abcd
@@ -986,6 +986,101 @@ Identifying subpattern count = 8
 No options
 First char = 'w'
 
+/.*X/D
+------------------------------------------------------------------
+  0   8 Bra 0
+  3     Any*
+  5   1 X
+  8   8 Ket
+ 11     End
+------------------------------------------------------------------
+Identifying subpattern count = 0
+No options
+First char at start or follows \n
+
+/.*X/Ds
+------------------------------------------------------------------
+  0   8 Bra 0
+  3     Any*
+  5   1 X
+  8   8 Ket
+ 11     End
+------------------------------------------------------------------
+Identifying subpattern count = 0
+Options: anchored dotall
+No first char
+
+/(.*X|^B)/D
+------------------------------------------------------------------
+  0  21 Bra 0
+  3   8 Bra 1
+  6     Any*
+  8   1 X
+ 11   7 Alt
+ 14     ^
+ 15   1 B
+ 18  15 Ket
+ 21  21 Ket
+ 24     End
+------------------------------------------------------------------
+Identifying subpattern count = 1
+No options
+First char at start or follows \n
+
+/(.*X|^B)/Ds
+------------------------------------------------------------------
+  0  21 Bra 0
+  3   8 Bra 1
+  6     Any*
+  8   1 X
+ 11   7 Alt
+ 14     ^
+ 15   1 B
+ 18  15 Ket
+ 21  21 Ket
+ 24     End
+------------------------------------------------------------------
+Identifying subpattern count = 1
+Options: anchored dotall
+No first char
+    
+/(?s)(.*X|^B)/D
+------------------------------------------------------------------
+  0  21 Bra 0
+  3   8 Bra 1
+  6     Any*
+  8   1 X
+ 11   7 Alt
+ 14     ^
+ 15   1 B
+ 18  15 Ket
+ 21  21 Ket
+ 24     End
+------------------------------------------------------------------
+Identifying subpattern count = 1
+Options: anchored dotall
+No first char
+
+/(?s:.*X|^B)/D
+------------------------------------------------------------------
+  0  27 Bra 0
+  3  10 Bra 0
+  6  04 Opt
+  8     Any*
+ 10   1 X
+ 13   9 Alt
+ 16  04 Opt
+ 18     ^
+ 19   1 B
+ 22  19 Ket
+ 25  00 Opt
+ 27  27 Ket
+ 30     End
+------------------------------------------------------------------
+Identifying subpattern count = 0
+No options
+First char at start or follows \n
+
 / End of test input /
 Identifying subpattern count = 0
 No options
diff --git a/testoutput3 b/testoutput3
index ffff396..6d597cd 100644
--- a/testoutput3
+++ b/testoutput3
@@ -1,4 +1,4 @@
-PCRE version 2.04 19-Feb-1999
+PCRE version 2.05 21-Apr-1999
 
 /(?<!bar)foo/
     foo
diff --git a/testoutput4 b/testoutput4
index d301506..0e156c4 100644
--- a/testoutput4
+++ b/testoutput4
@@ -1,4 +1,4 @@
-PCRE version 2.04 19-Feb-1999
+PCRE version 2.05 21-Apr-1999
 
 /^[\w]+/
     *** Failers
author	nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2007-02-24 21:39:01 +0000
committer	nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2007-02-24 21:39:01 +0000
commit	4864ac99ba4c4395fd8dc157ec734e228c780eb4 (patch)
tree	05365588d734737b615b02336dc978642ce55783
parent	685a411841b5e69517e02508446b317764cd6d70 (diff)
download	pcre-4864ac99ba4c4395fd8dc157ec734e228c780eb4.tar.gz