summary refs log tree commit diff
diff options
context:
space:
mode:
author nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2007-02-24 21:38:21 +0000
committer nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2007-02-24 21:38:21 +0000
commit 67741e1934bcbb6e8ceb2a75a5c6f7f14ae28438 (patch)
tree 2bc441cae7879531ea48fdfdc48319f8c19373b1
parent 688871425cacb8bd84efbf423e15ea9fc204f280 (diff)
download pcre-67741e1934bcbb6e8ceb2a75a5c6f7f14ae28438.tar.gz
Load pcre-1.05 into code/trunk.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@13 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 7
-rw-r--r-- Tech.Notes 20
-rw-r--r-- internal.h 3
-rw-r--r-- pcre.c 111
-rw-r--r-- pcretest.c 8
-rw-r--r-- study.c 1
-rw-r--r-- testinput 16
-rw-r--r-- testinput2 16
-rw-r--r-- testoutput 26
-rw-r--r-- testoutput2 38
10 files changed, 210 insertions, 36 deletions
diff --git a/ChangeLog b/ChangeLog
index 7fe6927..a893116 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -2,6 +2,13 @@ ChangeLog for PCRE
------------------
+Version 1.05 23-Dec-97
+----------------------
+
+1. Negated character classes containing more than one character were failing if
+PCRE_CASELESS was set at run time.
+
+
Version 1.04 19-Dec-97
----------------------
diff --git a/Tech.Notes b/Tech.Notes
index f17b661..69c5504 100644
--- a/Tech.Notes
+++ b/Tech.Notes
@@ -117,9 +117,21 @@ instances of OP_CHARS are used.
Character classes
-----------------
-OP_CLASS is used for a character class. It is followed by a 32-byte bit map
-containing a 1 bit for every character that is acceptable. The bits are counted
-from the least significant end of each byte.
+OP_CLASS is used for a character class, and OP_NEGCLASS for a negated character
+class, provided there are at least two characters in the class. If there is
+only one character, OP_CHARS is used for a positive class, and OP_NOT for a
+negative one. A set of repeating opcodes (OP_NOTSTAR etc.) is used for a
+repeated, negated, single-character class.
+
+Both OP_CLASS and OP_NEGCLASS are followed by a 32-byte bit map containing a 1
+bit for every character that is acceptable. The bits are counted from the least
+significant end of each byte. The reason for having two opcodes is to cope with
+negated character classes when caseless matching is specified at run time but
+not at compile time. If it is specified at compile time, the bit map is built
+appropriately. The only time a distinction is made between OP_CLASS and
+OP_NEGCLASS is when the bit map was built in a caseful manner but matching
+must be caseless. For OP_CLASS, a character matches if either of its cases is
+in the bit map, but for OP_NEGCLASS, both cases must be in the bit map.
Back references
@@ -196,4 +208,4 @@ OP_ONCE.
Philip Hazel
-October 1997
+December 1997
diff --git a/internal.h b/internal.h
index af0b42d..5951752 100644
--- a/internal.h
+++ b/internal.h
@@ -3,7 +3,7 @@
*************************************************/
-#define PCRE_VERSION "1.04 22-Dec-1997"
+#define PCRE_VERSION "1.05 23-Dec-1997"
/* This is a library of functions to support regular expressions whose syntax
@@ -181,6 +181,7 @@ enum {
OP_CRMINRANGE,
OP_CLASS, /* Match a character class */
+ OP_NEGCLASS, /* Match a character class, specified negatively */
OP_REF, /* Match a back reference */
OP_ALT, /* Start of alternation */
diff --git a/pcre.c b/pcre.c
index 43dee7c..81532c8 100644
--- a/pcre.c
+++ b/pcre.c
@@ -54,7 +54,7 @@ the external pcre header. */
static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
-/* Text forms of OP_ values and things, for debugging */
+/* Text forms of OP_ values and things, for debugging (not all used) */
#ifdef DEBUG
static const char *OP_names[] = {
@@ -65,7 +65,7 @@ static const char *OP_names[] = {
"*", "*?", "+", "+?", "?", "??", "{", "{", "{",
"*", "*?", "+", "+?", "?", "??", "{", "{", "{",
"*", "*?", "+", "+?", "?", "??", "{", "{",
- "class", "Ref",
+ "class", "negclass", "Ref",
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
"Brazero", "Braminzero", "Bra"
};
@@ -91,7 +91,8 @@ static short int escapes[] = {
/* Definition to allow mutual recursion */
-static BOOL compile_regex(int, int *, uschar **, const uschar **, const char **);
+static BOOL
+ compile_regex(int, int *, uschar **, const uschar **, const char **);
/* Structure for passing "static" information around between the functions
doing the matching, so that they are thread-safe. */
@@ -306,6 +307,7 @@ do {
/* Check a class or a back reference for a zero minimum */
case OP_CLASS:
+ case OP_NEGCLASS:
case OP_REF:
cc += (*cc == OP_REF)? 2 : 33;
@@ -670,16 +672,22 @@ for (;; ptr++)
case '[':
previous = code;
- *code++ = OP_CLASS;
- /* If the first character is '^', set the negation flag */
+ /* If the first character is '^', set the negation flag, and use a
+ different opcode. This only matters if caseless matching is specified at
+ runtime. */
if ((c = *(++ptr)) == '^')
{
negate_class = TRUE;
+ *code++ = OP_NEGCLASS;
c = *(++ptr);
}
- else negate_class = FALSE;
+ else
+ {
+ negate_class = FALSE;
+ *code++ = OP_CLASS;
+ }
/* Keep a count of chars so that we can optimize the case of just a single
character. */
@@ -1015,7 +1023,8 @@ for (;; ptr++)
/* If previous was a character class or a back reference, we put the repeat
stuff after it. */
- else if (*previous == OP_CLASS || *previous == OP_REF)
+ else if (*previous == OP_CLASS || *previous == OP_NEGCLASS ||
+ *previous == OP_REF)
{
if (repeat_min == 0 && repeat_max == -1)
*code++ = OP_CRSTAR + repeat_type;
@@ -2090,11 +2099,12 @@ while (code < code_end)
goto CLASS_REF_REPEAT;
case OP_CLASS:
+ case OP_NEGCLASS:
{
int i, min, max;
- code++;
- printf(" [");
+ if (*code++ == OP_CLASS) printf(" [");
+ else printf(" ^[");
for (i = 0; i < 256; i++)
{
@@ -2714,10 +2724,14 @@ for (;;)
item to see if there is repeat information following. Then obey similar
code to character type repeats - written out again for speed. If caseless
matching was set at runtime but not at compile time, we have to check both
- versions of a character. */
+ versions of a character, and we have to behave differently for positive and
+ negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
+ treated differently. */
case OP_CLASS:
+ case OP_NEGCLASS:
{
+ BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
const uschar *data = ecode + 1; /* Save for matching */
ecode += 33; /* Advance past the item */
@@ -2746,15 +2760,8 @@ for (;;)
break;
default: /* No repeat follows */
- if (eptr >= md->end_subject) return FALSE;
- c = *eptr++;
- if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
- if (md->runtime_caseless)
- {
- c = pcre_fcc[c];
- if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */
- }
- return FALSE;
+ min = max = 1;
+ break;
}
/* First, ensure the minimum number of matches are present. */
@@ -2763,12 +2770,30 @@ for (;;)
{
if (eptr >= md->end_subject) return FALSE;
c = *eptr++;
- if ((data[c/8] & (1 << (c&7))) != 0) continue;
- if (md->runtime_caseless)
+
+ /* Either not runtime caseless, or it was a positive class. For
+ runtime caseless, continue if either case is in the map. */
+
+ if (!nasty_case)
{
+ if ((data[c/8] & (1 << (c&7))) != 0) continue;
+ if (md->runtime_caseless)
+ {
+ c = pcre_fcc[c];
+ if ((data[c/8] & (1 << (c&7))) != 0) continue;
+ }
+ }
+
+ /* Runtime caseless and it was a negative class. Continue only if
+ both cases are in the map. */
+
+ else
+ {
+ if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
c = pcre_fcc[c];
if ((data[c/8] & (1 << (c&7))) != 0) continue;
}
+
return FALSE;
}
@@ -2787,12 +2812,30 @@ for (;;)
if (match(eptr, ecode, offset_top, md)) return TRUE;
if (i >= max || eptr >= md->end_subject) return FALSE;
c = *eptr++;
- if ((data[c/8] & (1 << (c&7))) != 0) continue;
- if (md->runtime_caseless)
+
+ /* Either not runtime caseless, or it was a positive class. For
+ runtime caseless, continue if either case is in the map. */
+
+ if (!nasty_case)
+ {
+ if ((data[c/8] & (1 << (c&7))) != 0) continue;
+ if (md->runtime_caseless)
+ {
+ c = pcre_fcc[c];
+ if ((data[c/8] & (1 << (c&7))) != 0) continue;
+ }
+ }
+
+ /* Runtime caseless and it was a negative class. Continue only if
+ both cases are in the map. */
+
+ else
{
+ if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
c = pcre_fcc[c];
if ((data[c/8] & (1 << (c&7))) != 0) continue;
}
+
return FALSE;
}
/* Control never gets here */
@@ -2807,12 +2850,30 @@ for (;;)
{
if (eptr >= md->end_subject) break;
c = *eptr;
- if ((data[c/8] & (1 << (c&7))) != 0) continue;
- if (md->runtime_caseless)
+
+ /* Either not runtime caseless, or it was a positive class. For
+ runtime caseless, continue if either case is in the map. */
+
+ if (!nasty_case)
+ {
+ if ((data[c/8] & (1 << (c&7))) != 0) continue;
+ if (md->runtime_caseless)
+ {
+ c = pcre_fcc[c];
+ if ((data[c/8] & (1 << (c&7))) != 0) continue;
+ }
+ }
+
+ /* Runtime caseless and it was a negative class. Continue only if
+ both cases are in the map. */
+
+ else
{
+ if ((data[c/8] & (1 << (c&7))) == 0) break;
c = pcre_fcc[c];
if ((data[c/8] & (1 << (c&7))) != 0) continue;
}
+
break;
}
diff --git a/pcretest.c b/pcretest.c
index 06e70e5..3ea4b16 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -38,7 +38,7 @@ static const char *OP_names[] = {
"*", "*?", "+", "+?", "?", "??", "{", "{", "{",
"*", "*?", "+", "+?", "?", "??", "{", "{", "{",
"*", "*?", "+", "+?", "?", "??", "{", "{",
- "class", "Ref",
+ "class", "negclass", "Ref",
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
"Brazero", "Braminzero", "Bra"
};
@@ -161,11 +161,11 @@ for(;;)
goto CLASS_REF_REPEAT;
case OP_CLASS:
+ case OP_NEGCLASS:
{
int i, min, max;
-
- code++;
- printf(" [");
+ if (*code++ == OP_CLASS) printf(" [");
+ else printf(" ^[");
for (i = 0; i < 256; i++)
{
diff --git a/study.c b/study.c
index c81632e..2291b73 100644
--- a/study.c
+++ b/study.c
@@ -208,6 +208,7 @@ do
according to the repeat count. */
case OP_CLASS:
+ case OP_NEGCLASS:
{
tcode++;
for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
diff --git a/testinput b/testinput
index de98e81..6dee4dc 100644
--- a/testinput
+++ b/testinput
@@ -1575,4 +1575,20 @@
*** Failers
aaaaaaa
+/[^a]/
+ aaaabcd
+ aaAabcd
+
+/[^a]/i
+ aaaabcd
+ aaAabcd
+
+/[^az]/
+ aaaabcd
+ aaAabcd
+
+/[^az]/i
+ aaaabcd
+ aaAabcd
+
/ End of test input /
diff --git a/testinput2 b/testinput2
index 0cdb18b..d5aa14a 100644
--- a/testinput2
+++ b/testinput2
@@ -267,4 +267,20 @@
/a[]b/
+/[^a]/
+ \Iaaaabcd
+ \IaaAabcd
+
+/[^az]/
+ \Iaaaabcd
+ \IaaAabcd
+
+/[^az]/
+ \Izazabcd
+ \IAaZabcd
+
+/[^aeiou ]{3,}/
+ co-processors, and for
+ \Ico-processors, and for
+
/ End of test input /
diff --git a/testoutput b/testoutput
index 3fac753..66b0e0a 100644
--- a/testoutput
+++ b/testoutput
@@ -1,5 +1,5 @@
Testing Perl-Compatible Regular Expressions
-PCRE version 1.04 22-Dec-1997
+PCRE version 1.05 23-Dec-1997
/the quick brown fox/
the quick brown fox
@@ -2342,5 +2342,29 @@ No match
aaaaaaa
No match
+/[^a]/
+ aaaabcd
+ 0: b
+ aaAabcd
+ 0: A
+
+/[^a]/i
+ aaaabcd
+ 0: b
+ aaAabcd
+ 0: b
+
+/[^az]/
+ aaaabcd
+ 0: b
+ aaAabcd
+ 0: A
+
+/[^az]/i
+ aaaabcd
+ 0: b
+ aaAabcd
+ 0: b
+
/ End of test input /
diff --git a/testoutput2 b/testoutput2
index 75e20e0..bc1d6b5 100644
--- a/testoutput2
+++ b/testoutput2
@@ -1,5 +1,5 @@
Testing Perl-Compatible Regular Expressions
-PCRE version 1.04 22-Dec-1997
+PCRE version 1.05 23-Dec-1997
/(a)b|/
Identifying subpattern count = 1
@@ -605,6 +605,42 @@ Failed: unmatched brackets at offset 0
/a[]b/
Failed: missing terminating ] for character class at offset 4
+/[^a]/
+Identifying subpattern count = 0
+No options
+No first char
+ \Iaaaabcd
+ 0: b
+ \IaaAabcd
+ 0: b
+
+/[^az]/
+Identifying subpattern count = 0
+No options
+No first char
+ \Iaaaabcd
+ 0: b
+ \IaaAabcd
+ 0: b
+
+/[^az]/
+Identifying subpattern count = 0
+No options
+No first char
+ \Izazabcd
+ 0: b
+ \IAaZabcd
+ 0: b
+
+/[^aeiou ]{3,}/
+Identifying subpattern count = 0
+No options
+No first char
+ co-processors, and for
+ 0: -pr
+ \Ico-processors, and for
+ 0: -pr
+
/ End of test input /
Identifying subpattern count = 0
No options