summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authornigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15>2007-02-24 21:40:18 +0000
committernigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15>2007-02-24 21:40:18 +0000
commitf08d5b6354f668c0047281d81eda8d0fd2a9e82d (patch)
tree55286e49e0fdc9b8ffa6fc2e769efe100defa3e4
parent19ccd22bd088b5a640d244365ab9025509e678c0 (diff)
downloadpcre-f08d5b6354f668c0047281d81eda8d0fd2a9e82d.tar.gz
Load pcre-4.3 into code/trunk.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@69 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--ChangeLog48
-rw-r--r--Makefile.in4
-rwxr-xr-xconfigure17
-rw-r--r--configure.in4
-rw-r--r--doc/html/pcreposix.html4
-rw-r--r--doc/pcre.txt5
-rw-r--r--doc/pcreposix.34
-rw-r--r--internal.h7
-rw-r--r--pcre.c169
-rw-r--r--pcreposix.c4
-rw-r--r--pcreposix.h2
-rw-r--r--pcretest.c32
-rw-r--r--testdata/testinput13
-rw-r--r--testdata/testinput415
-rw-r--r--testdata/testoutput16
-rw-r--r--testdata/testoutput22
-rw-r--r--testdata/testoutput32
-rw-r--r--testdata/testoutput422
-rw-r--r--testdata/testoutput52
19 files changed, 260 insertions, 92 deletions
diff --git a/ChangeLog b/ChangeLog
index 7258ff6..b912314 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,54 @@
ChangeLog for PCRE
------------------
+Version 4.3 21-May-03
+---------------------
+
+1. Two instances of @WIN_PREFIX@ omitted from the Windows targets in the
+ Makefile.
+
+2. Some refactoring to improve the quality of the code:
+
+ (i) The utf8_table... variables are now declared "const".
+
+ (ii) The code for \cx, which used the "case flipping" table to upper case
+ lower case letters, now just subtracts 32. This is ASCII-specific,
+ but the whole concept of \cx is ASCII-specific, so it seems
+ reasonable.
+
+ (iii) PCRE was using its character types table to recognize decimal and
+ hexadecimal digits in the pattern. This is silly, because it handles
+ only 0-9, a-f, and A-F, but the character types table is locale-
+ specific, which means strange things might happen. A private
+ table is now used for this - though it costs 256 bytes, a table is
+ much faster than multiple explicit tests. Of course, the standard
+ character types table is still used for matching digits in subject
+ strings against \d.
+
+ (iv) Strictly, the identifier ESC_t is reserved by POSIX (all identifiers
+ ending in _t are). So I've renamed it as ESC_tee.
+
+3. The first argument for regexec() in the POSIX wrapper should have been
+ defined as "const".
+
+4. Changed pcretest to use malloc() for its buffers so that they can be
+ Electric Fenced for debugging.
+
+5. There were several places in the code where, in UTF-8 mode, PCRE would try
+ to read one or more bytes before the start of the subject string. Often this
+ had no effect on PCRE's behaviour, but in some circumstances it could
+ provoke a segmentation fault.
+
+6. A lookbehind at the start of a pattern in UTF-8 mode could also cause PCRE
+ to try to read one or more bytes before the start of the subject string.
+
+7. A lookbehind in a pattern matched in non-UTF-8 mode on a PCRE compiled with
+ UTF-8 support could misbehave in various ways if the subject string
+ contained bytes with the 0x80 bit set and the 0x40 bit unset in a lookbehind
+ area. (PCRE was not checking for the UTF-8 mode flag, and trying to move
+ back over UTF-8 characters.)
+
+
Version 4.2 14-Apr-03
---------------------
diff --git a/Makefile.in b/Makefile.in
index ee9ca7c..ecdd6ef 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -194,8 +194,8 @@ wininstall : winshared
$(mkinstalldirs) $(DESTDIR)$(BINDIR)
$(INSTALL) .libs/@WIN_PREFIX@pcre.dll $(DESTDIR)$(BINDIR)/@WIN_PREFIX@pcre.dll
$(INSTALL) .libs/@WIN_PREFIX@pcreposix.dll $(DESTDIR)$(BINDIR)/@WIN_PREFIX@pcreposix.dll
- $(INSTALL) .libs/libpcreposix.dll.a $(DESTDIR)$(LIBDIR)/libpcreposix.dll.a
- $(INSTALL) .libs/libpcre.dll.a $(DESTDIR)$(LIBDIR)/libpcre.dll.a
+ $(INSTALL) .libs/@WIN_PREFIX@libpcreposix.dll.a $(DESTDIR)$(LIBDIR)/@WIN_PREFIX@libpcreposix.dll.a
+ $(INSTALL) .libs/@WIN_PREFIX@libpcre.dll.a $(DESTDIR)$(LIBDIR)/@WIN_PREFIX@libpcre.dll.a
-strip -g $(DESTDIR)$(BINDIR)/@WIN_PREFIX@pcre.dll
-strip -g $(DESTDIR)$(BINDIR)/@WIN_PREFIX@pcreposix.dll
-strip $(DESTDIR)$(BINDIR)/pcregrep@EXEEXT@
diff --git a/configure b/configure
index 7218c49..51ac584 100755
--- a/configure
+++ b/configure
@@ -936,15 +936,6 @@ if test "$ac_init_help" = "long"; then
# The list generated by autoconf has been trimmed to remove many
# options that are totally irrelevant to PCRE (e.g. relating to X),
# or are not supported by its Makefile.
- # The list generated by autoconf has been trimmed to remove many
- # options that are totally irrelevant to PCRE (e.g. relating to X),
- # or are not supported by its Makefile.
- # The list generated by autoconf has been trimmed to remove many
- # options that are totally irrelevant to PCRE (e.g. relating to X),
- # or are not supported by its Makefile.
- # The list generated by autoconf has been trimmed to remove many
- # options that are totally irrelevant to PCRE (e.g. relating to X),
- # or are not supported by its Makefile.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures this package to adapt to many kinds of systems.
@@ -1441,8 +1432,8 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
PCRE_MAJOR=4
-PCRE_MINOR=2
-PCRE_DATE=14-Apr-2003
+PCRE_MINOR=3
+PCRE_DATE=21-May-2003
PCRE_VERSION=${PCRE_MAJOR}.${PCRE_MINOR}
@@ -5103,7 +5094,7 @@ else
;;
darwin* | rhapsody*)
- # This patch put in by hand by PH (12-Mar-2003) for Darwin 1.3.
+ # This patch put in by hand by PH (22-May-2003) for Darwin 1.3.
case "$host_os" in
rhapsody* | darwin1.[[012]])
allow_undefined_flag='-undefined suppress'
@@ -5113,7 +5104,7 @@ else
;;
esac
# End of hand-inserted patch
-
+
# FIXME: Relying on posixy $() will cause problems for
# cross-compilation, but unfortunately the echo tests do not
# yet detect zsh echo's removal of \ escapes.
diff --git a/configure.in b/configure.in
index 56580ee..5394f4f 100644
--- a/configure.in
+++ b/configure.in
@@ -21,8 +21,8 @@ dnl digits for minor numbers less than 10. There are unlikely to be
dnl that many releases anyway.
PCRE_MAJOR=4
-PCRE_MINOR=2
-PCRE_DATE=14-Apr-2003
+PCRE_MINOR=3
+PCRE_DATE=21-May-2003
PCRE_VERSION=${PCRE_MAJOR}.${PCRE_MINOR}
dnl Default values for miscellaneous macros
diff --git a/doc/html/pcreposix.html b/doc/html/pcreposix.html
index bdd9f3f..d0a5e12 100644
--- a/doc/html/pcreposix.html
+++ b/doc/html/pcreposix.html
@@ -62,7 +62,9 @@ a replacement library. Other POSIX options are not even defined.
When PCRE is called via these functions, it is only the API that is POSIX-like
in style. The syntax and semantics of the regular expressions themselves are
still those of Perl, subject to the setting of various PCRE options, as
-described below.
+described below. "POSIX-like in style" means that the API approximates to the
+POSIX definition; it is not fully POSIX-compatible, and in multi-byte encoding
+domains it is probably even less compatible.
</P>
<P>
The header for these functions is supplied as <b>pcreposix.h</b> to avoid any
diff --git a/doc/pcre.txt b/doc/pcre.txt
index af147c0..1ec5f2c 100644
--- a/doc/pcre.txt
+++ b/doc/pcre.txt
@@ -3090,7 +3090,10 @@ DESCRIPTION
that is POSIX-like in style. The syntax and semantics of the
regular expressions themselves are still those of Perl, sub-
ject to the setting of various PCRE options, as described
- below.
+ below. "POSIX-like in style" means that the API approximates
+ to the POSIX definition; it is not fully POSIX-compatible,
+ and in multi-byte encoding domains it is probably even less
+ compatible.
The header for these functions is supplied as pcreposix.h to
avoid any potential clash with other POSIX libraries. It
diff --git a/doc/pcreposix.3 b/doc/pcreposix.3
index 018abdb..5198630 100644
--- a/doc/pcreposix.3
+++ b/doc/pcreposix.3
@@ -50,7 +50,9 @@ a replacement library. Other POSIX options are not even defined.
When PCRE is called via these functions, it is only the API that is POSIX-like
in style. The syntax and semantics of the regular expressions themselves are
still those of Perl, subject to the setting of various PCRE options, as
-described below.
+described below. "POSIX-like in style" means that the API approximates to the
+POSIX definition; it is not fully POSIX-compatible, and in multi-byte encoding
+domains it is probably even less compatible.
The header for these functions is supplied as \fBpcreposix.h\fR to avoid any
potential clash with other POSIX libraries. It can, of course, be renamed or
diff --git a/internal.h b/internal.h
index 346f672..973e7ee 100644
--- a/internal.h
+++ b/internal.h
@@ -247,8 +247,11 @@ ESC_n is defined as yet another macro, which is set in config.h to either \n
#define ESC_r '\r'
#endif
-#ifndef ESC_t
-#define ESC_t '\t'
+/* We can't officially use ESC_t because it is a POSIX reserved identifier
+(presumably because of all the others like size_t). */
+
+#ifndef ESC_tee
+#define ESC_tee '\t'
#endif
/* These are escaped items that aren't just an encoding of a particular data
diff --git a/pcre.c b/pcre.c
index 83e6e94..5da0f76 100644
--- a/pcre.c
+++ b/pcre.c
@@ -113,7 +113,7 @@ static const short int escapes[] = {
0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
'`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
- 0, 0, ESC_r, -ESC_s, ESC_t, 0, 0, -ESC_w, /* p - w */
+ 0, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
0, 0, -ESC_z /* x - z */
};
@@ -150,6 +150,56 @@ static const int posix_class_maps[] = {
cbit_xdigit,-1, -1 /* xdigit */
};
+/* Table to identify ASCII digits and hex digits. This is used when compiling
+patterns. Note that the tables in chartables are dependent on the locale, and
+may mark arbitrary characters as digits - but the PCRE compiling code expects
+to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
+a private table here. It costs 256 bytes, but it is a lot faster than doing
+character value tests (at least in some simple cases I timed), and in some
+applications one wants PCRE to compile efficiently as well as match
+efficiently.
+
+For convenience, we use the same bit definitions as in chartables:
+
+ 0x04 decimal digit
+ 0x08 hexadecimal digit
+
+Then we can use ctype_digit and ctype_xdigit in the code. */
+
+static const unsigned char digitab[] =
+ {
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* space - ' */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
+ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7: decimal (0x04) and hex (0x08) digits */
+ 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ?: only 8 and 9 are digits */
+ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G: A-F are hex digits only */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
+ 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g: a-f are hex digits only */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
/* Definition to allow mutual recursion */
@@ -315,19 +365,20 @@ tables. */
/* These are the breakpoints for different numbers of bytes in a UTF-8
character. */
-static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
+static const int utf8_table1[] =
+ { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
/* These are the indicator bits and the mask for the data bits to set in the
first byte of a character, indexed by the number of additional bytes. */
-static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
-static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
+static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
+static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
/* Table of the number of extra characters, indexed by the first character
masked with 0x3f. The highest number for a valid UTF-8 character is in fact
0x3d. */
-static uschar utf8_table4[] = {
+static const uschar utf8_table4[] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
@@ -686,7 +737,7 @@ else
{
oldptr = ptr;
c -= '0';
- while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
+ while ((digitab[ptr[1]] & ctype_digit) != 0)
c = c * 10 + *(++ptr) - '0';
if (c < 10 || c <= bracount)
{
@@ -712,8 +763,7 @@ else
case '0':
c -= '0';
- while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
- ptr[1] != '8' && ptr[1] != '9')
+ while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
c = c * 8 + *(++ptr) - '0';
c &= 255; /* Take least significant 8 bits */
break;
@@ -728,12 +778,12 @@ else
const uschar *pt = ptr + 2;
register int count = 0;
c = 0;
- while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
+ while ((digitab[*pt] & ctype_xdigit) != 0)
{
+ int cc = *pt++;
+ if (cc >= 'a') cc -= 32; /* Convert to upper case */
count++;
- c = c * 16 + cd->lcc[*pt] -
- (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
- pt++;
+ c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
}
if (*pt == '}')
{
@@ -749,11 +799,11 @@ else
/* Read just a single hex char */
c = 0;
- while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
+ while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
{
- ptr++;
- c = c * 16 + cd->lcc[*ptr] -
- (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
+ int cc = *(++ptr);
+ if (cc >= 'a') cc -= 32; /* Convert to upper case */
+ c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
}
break;
@@ -767,9 +817,10 @@ else
return 0;
}
- /* A letter is upper-cased; then the 0x40 bit is flipped */
+ /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
+ is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
- if (c >= 'a' && c <= 'z') c = cd->fcc[c];
+ if (c >= 'a' && c <= 'z') c -= 32;
c ^= 0x40;
break;
@@ -815,15 +866,16 @@ Returns: TRUE or FALSE
static BOOL
is_counted_repeat(const uschar *p, compile_data *cd)
{
-if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
-while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
+if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  /* bitwise mask: first char must be a decimal digit, not just any flagged (hex) char */
+while ((digitab[*p] & ctype_digit) != 0) p++;          /* skip the rest of the minimum count */
if (*p == '}') return TRUE;
if (*p++ != ',') return FALSE;
if (*p == '}') return TRUE;
-if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
-while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
+if ((digitab[*p++] & ctype_digit) == 0) return FALSE;  /* bitwise mask again: maximum count must also start with a decimal digit */
+while ((digitab[*p] & ctype_digit) != 0) p++;          /* skip the rest of the maximum count */
+/* Only a closing brace may follow the maximum; anything else is not a quantifier. */
return (*p == '}');
}
@@ -856,14 +908,14 @@ read_repeat_counts(const uschar *p, int *minp, int *maxp,
int min = 0;
int max = -1;
-while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
+while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
if (*p == '}') max = min; else
{
if (*(++p) != '}')
{
max = 0;
- while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
+ while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
if (max < min)
{
*errorptr = ERR4;
@@ -2570,9 +2622,11 @@ for (;; ptr++)
ptr += 3;
}
- /* Condition to test for a numbered subpattern match */
+ /* Condition to test for a numbered subpattern match. We know that
+ if a digit follows ( then there will just be digits until ) because
+ the syntax was checked in the first pass. */
- else if ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
+ else if ((digitab[ptr[1]] && ctype_digit) != 0)
{
int condref; /* Don't amalgamate; some compilers */
condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
@@ -2625,7 +2679,7 @@ for (;; ptr++)
*code++ = OP_CALLOUT;
{
int n = 0;
- while ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
+ while ((digitab[*(++ptr)] & ctype_digit) != 0)
n = n * 10 + *ptr - '0';
if (n > 255)
{
@@ -2725,8 +2779,7 @@ for (;; ptr++)
{
const uschar *called;
recno = 0;
-
- while ((cd->ctypes[*ptr] & ctype_digit) != 0)
+ while((digitab[*ptr] & ctype_digit) != 0)
recno = recno * 10 + *ptr++ - '0';
/* Come here from code above that handles a named recursion */
@@ -4164,7 +4217,7 @@ while ((c = *(++ptr)) != 0)
case '5': case '6': case '7': case '8': case '9':
ptr += 2;
if (c != 'R')
- while ((compile_block.ctypes[*(++ptr)] & ctype_digit) != 0);
+ while ((digitab[*(++ptr)] & ctype_digit) != 0);
if (*ptr != ')')
{
*errorptr = ERR29;
@@ -4190,7 +4243,7 @@ while ((c = *(++ptr)) != 0)
case 'C':
ptr += 2;
- while ((compile_block.ctypes[*(++ptr)] & ctype_digit) != 0);
+ while ((digitab[*(++ptr)] & ctype_digit) != 0);
if (*ptr != ')')
{
*errorptr = ERR39;
@@ -4257,11 +4310,11 @@ while ((c = *(++ptr)) != 0)
ptr += 4;
length += 3;
}
- else if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
+ else if ((digitab[ptr[3]] & ctype_digit) != 0)
{
ptr += 4;
length += 3;
- while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
+ while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
if (*ptr != ')')
{
*errorptr = ERR26;
@@ -5171,17 +5224,28 @@ for (;;)
case OP_REVERSE:
#ifdef SUPPORT_UTF8
- c = GET(ecode,1);
- for (i = 0; i < c; i++)
+ if (md->utf8)
{
- eptr--;
- BACKCHAR(eptr)
+ c = GET(ecode,1);
+ for (i = 0; i < c; i++)
+ {
+ eptr--;
+ if (eptr < md->start_subject) return MATCH_NOMATCH;
+ BACKCHAR(eptr)
+ }
}
-#else
- eptr -= GET(ecode,1);
+ else
#endif
- if (eptr < md->start_subject) return MATCH_NOMATCH;
+ /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
+
+ {
+ eptr -= GET(ecode,1);
+ if (eptr < md->start_subject) return MATCH_NOMATCH;
+ }
+
+ /* Skip to next op code */
+
ecode += 1 + LINK_SIZE;
break;
@@ -5999,11 +6063,12 @@ for (;;)
}
eptr += len;
}
- while (eptr >= pp)
+ for (;;)
{
- if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
+ if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
MATCH_NOMATCH) return rrc;
- BACKCHAR(eptr)
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
+ BACKCHAR(eptr);
}
}
else
@@ -6111,10 +6176,11 @@ for (;;)
if (!match_xclass(c, data)) break;
eptr += len;
}
- while (eptr >= pp)
+ for(;;)
{
- if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
+ if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
MATCH_NOMATCH) return rrc;
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
BACKCHAR(eptr)
}
return MATCH_NOMATCH;
@@ -6490,11 +6556,11 @@ for (;;)
if (c == d) break;
eptr += len;
}
- while (eptr >= pp)
+ for(;;)
{
if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
MATCH_NOMATCH) return rrc;
- eptr--;
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
BACKCHAR(eptr);
}
}
@@ -6595,11 +6661,11 @@ for (;;)
if (c == d) break;
eptr += len;
}
- while (eptr >= pp)
+ for(;;)
{
if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
MATCH_NOMATCH) return rrc;
- eptr--;
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
BACKCHAR(eptr);
}
}
@@ -7053,10 +7119,11 @@ for (;;)
/* eptr is now past the end of the maximum run */
- while (eptr >= pp)
+ for(;;)
{
- if ((rrc = match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) !=
+ if ((rrc = match(eptr, ecode, offset_top, md, ims, eptrb, 0)) !=
MATCH_NOMATCH) return rrc;
+ if (eptr-- == pp) break; /* Stop if tried at original pos */
BACKCHAR(eptr);
}
}
diff --git a/pcreposix.c b/pcreposix.c
index ef7beef..49094f2 100644
--- a/pcreposix.c
+++ b/pcreposix.c
@@ -236,7 +236,7 @@ block of store on the stack, to reduce the use of malloc/free. The threshold is
in a macro that can be changed at configure time. */
int
-regexec(regex_t *preg, const char *string, size_t nmatch,
+regexec(const regex_t *preg, const char *string, size_t nmatch,
regmatch_t pmatch[], int eflags)
{
int rc;
@@ -248,7 +248,7 @@ BOOL allocated_ovector = FALSE;
if ((eflags & REG_NOTBOL) != 0) options |= PCRE_NOTBOL;
if ((eflags & REG_NOTEOL) != 0) options |= PCRE_NOTEOL;
-preg->re_erroffset = (size_t)(-1); /* Only has meaning after compile */
+((regex_t *)preg)->re_erroffset = (size_t)(-1); /* Only has meaning after compile */
if (nmatch > 0)
{
diff --git a/pcreposix.h b/pcreposix.h
index 05a0d7d..2b97bf4 100644
--- a/pcreposix.h
+++ b/pcreposix.h
@@ -77,7 +77,7 @@ typedef struct {
/* The functions */
extern int regcomp(regex_t *, const char *, int);
-extern int regexec(regex_t *, const char *, size_t, regmatch_t *, int);
+extern int regexec(const regex_t *, const char *, size_t, regmatch_t *, int);
extern size_t regerror(int, const regex_t *, char *, size_t);
extern void regfree(regex_t *);
diff --git a/pcretest.c b/pcretest.c
index ed77cf0..ad729b7 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -37,6 +37,9 @@ Makefile. */
#define LOOPREPEAT 50000
+#define BUFFER_SIZE 30000
+#define DBUFFER_SIZE 1024
+
static FILE *outfile;
static int log_store = 0;
@@ -50,13 +53,13 @@ static size_t gotten_store;
-static int utf8_table1[] = {
+static const int utf8_table1[] = {
0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
-static int utf8_table2[] = {
+static const int utf8_table2[] = {
0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
-static int utf8_table3[] = {
+static const int utf8_table3[] = {
0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
@@ -387,8 +390,15 @@ int posix = 0;
#endif
int debug = 0;
int done = 0;
-unsigned char buffer[30000];
-unsigned char dbuffer[1024];
+
+unsigned char *buffer;
+unsigned char *dbuffer;
+
+/* Get buffers from malloc() so that Electric Fence will check their misuse
+when I am debugging. */
+
+buffer = malloc(BUFFER_SIZE);
+dbuffer = malloc(DBUFFER_SIZE);
/* Static so that new_malloc can use it. */
@@ -518,7 +528,7 @@ while (!done)
use_utf8 = 0;
if (infile == stdin) printf(" re> ");
- if (fgets((char *)buffer, sizeof(buffer), infile) == NULL) break;
+ if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL) break;
if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
fflush(outfile);
@@ -549,7 +559,7 @@ while (!done)
}
if (*pp != 0) break;
- len = sizeof(buffer) - (pp - buffer);
+ len = BUFFER_SIZE - (pp - buffer);
if (len < 256)
{
fprintf(outfile, "** Expression too long - missing delimiter?\n");
@@ -648,7 +658,7 @@ while (!done)
if (rc != 0)
{
- (void)regerror(rc, &preg, (char *)buffer, sizeof(buffer));
+ (void)regerror(rc, &preg, (char *)buffer, BUFFER_SIZE);
fprintf(outfile, "Failed: POSIX code %d: %s\n", rc, buffer);
goto SKIP_DATA;
}
@@ -689,7 +699,7 @@ while (!done)
{
for (;;)
{
- if (fgets((char *)buffer, sizeof(buffer), infile) == NULL)
+ if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL)
{
done = 1;
goto CONTINUE;
@@ -921,7 +931,7 @@ while (!done)
callout_fail_id = -1;
if (infile == stdin) printf("data> ");
- if (fgets((char *)buffer, sizeof(buffer), infile) == NULL)
+ if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL)
{
done = 1;
goto CONTINUE;
@@ -1134,7 +1144,7 @@ while (!done)
if (rc != 0)
{
- (void)regerror(rc, &preg, (char *)buffer, sizeof(buffer));
+ (void)regerror(rc, &preg, (char *)buffer, BUFFER_SIZE);
fprintf(outfile, "No match: POSIX code %d: %s\n", rc, buffer);
}
else
diff --git a/testdata/testinput1 b/testdata/testinput1
index 137065f..c4b99c6 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -3835,4 +3835,7 @@
ÅæåäÀ
Åæåäß
+/(?<=Z)X./
+ \x84XAZXB
+
/ End of testinput1 /
diff --git a/testdata/testinput4 b/testdata/testinput4
index 4d73a15..7bd6d26 100644
--- a/testdata/testinput4
+++ b/testdata/testinput4
@@ -495,4 +495,19 @@
XYZ
\x{123}
+/^[ac]*b/8
+ xb
+
+/^[ac\x{100}]*b/8
+ xb
+
+/^[^x]*b/8i
+ xb
+
+/^[^x]*b/8
+ xb
+
+/^\d*b/8
+ xb
+
/ End of testinput4 /
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index a8617af..63214b7 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -1,4 +1,4 @@
-PCRE version 4.2 14-Apr-2003
+PCRE version 4.3 21-May-2003
/the quick brown fox/
the quick brown fox
@@ -6267,5 +6267,9 @@ No match
Åæåäß
0: \xc5\xe6\xe5\xe4\xdf
+/(?<=Z)X./
+ \x84XAZXB
+ 0: XB
+
/ End of testinput1 /
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index e16ccab..22a345b 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -1,4 +1,4 @@
-PCRE version 4.2 14-Apr-2003
+PCRE version 4.3 21-May-2003
/(a)b|/
Capturing subpattern count = 1
diff --git a/testdata/testoutput3 b/testdata/testoutput3
index 42b62ba..5dac092 100644
--- a/testdata/testoutput3
+++ b/testdata/testoutput3
@@ -1,4 +1,4 @@
-PCRE version 4.2 14-Apr-2003
+PCRE version 4.3 21-May-2003
/^[\w]+/
*** Failers
diff --git a/testdata/testoutput4 b/testdata/testoutput4
index 939acae..312cfbe 100644
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@@ -1,4 +1,4 @@
-PCRE version 4.2 14-Apr-2003
+PCRE version 4.3 21-May-2003
/-- Do not use the \x{} construct except with patterns that have the --/
/-- /8 option set, because PCRE doesn't recognize them as UTF-8 unless --/
@@ -854,5 +854,25 @@ No match
\x{123}
0: \x{123}
+/^[ac]*b/8
+ xb
+No match
+
+/^[ac\x{100}]*b/8
+ xb
+No match
+
+/^[^x]*b/8i
+ xb
+No match
+
+/^[^x]*b/8
+ xb
+No match
+
+/^\d*b/8
+ xb
+No match
+
/ End of testinput4 /
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index b86adb8..b681214 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -1,4 +1,4 @@
-PCRE version 4.2 14-Apr-2003
+PCRE version 4.3 21-May-2003
/\x{100}/8DM
Memory allocation (code space): 11