diff options
author | nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2007-02-24 21:38:21 +0000 |
---|---|---|
committer | nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2007-02-24 21:38:21 +0000 |
commit | 67741e1934bcbb6e8ceb2a75a5c6f7f14ae28438 (patch) | |
tree | 2bc441cae7879531ea48fdfdc48319f8c19373b1 | |
parent | 688871425cacb8bd84efbf423e15ea9fc204f280 (diff) | |
download | pcre-67741e1934bcbb6e8ceb2a75a5c6f7f14ae28438.tar.gz |
Load pcre-1.05 into code/trunk.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@13 2f5784b3-3f2a-0410-8824-cb99058d5e15

Mode | File | Lines changed |
---|---|---|
-rw-r--r-- | ChangeLog | 7 |
-rw-r--r-- | Tech.Notes | 20 |
-rw-r--r-- | internal.h | 3 |
-rw-r--r-- | pcre.c | 111 |
-rw-r--r-- | pcretest.c | 8 |
-rw-r--r-- | study.c | 1 |
-rw-r--r-- | testinput | 16 |
-rw-r--r-- | testinput2 | 16 |
-rw-r--r-- | testoutput | 26 |
-rw-r--r-- | testoutput2 | 38 |

10 files changed, 210 insertions, 36 deletions
@@ -2,6 +2,13 @@ ChangeLog for PCRE ------------------ +Version 1.05 23-Dec-97 +---------------------- + +1. Negated character classes containing more than one character were failing if +PCRE_CASELESS was set at run time. + + Version 1.04 19-Dec-97 ---------------------- @@ -117,9 +117,21 @@ instances of OP_CHARS are used. Character classes ----------------- -OP_CLASS is used for a character class. It is followed by a 32-byte bit map -containing a 1 bit for every character that is acceptable. The bits are counted -from the least significant end of each byte. +OP_CLASS is used for a character class, and OP_NEGCLASS for a negated character +class, provided there are at least two characters in the class. If there is +only one character, OP_CHARS is used for a positive class, and OP_NOT for a +negative one. A set of repeating opcodes (OP_NOTSTAR etc.) are used for a +repeated, negated, single-character class. + +Both OP_CLASS and OP_NEGCLASS are followed by a 32-byte bit map containing a 1 +bit for every character that is acceptable. The bits are counted from the least +significant end of each byte. The reason for having two opcodes is to cope with +negated character classes when caseless matching is specified at run time but +not at compile time. If it is specified at compile time, the bit map is built +appropriately. This is the only time that a distinction is made between +OP_CLASS and OP_NEGCLASS, when the bit map was built in a caseful manner but +matching must be caseless. For OP_CLASS, a character matches if either of its +cases is in the bit map, but for OP_NEGCLASS, both of them must be present. Back references @@ -196,4 +208,4 @@ OP_ONCE. 
Philip Hazel -October 1997 +December 1997 @@ -3,7 +3,7 @@ *************************************************/ -#define PCRE_VERSION "1.04 22-Dec-1997" +#define PCRE_VERSION "1.05 23-Dec-1997" /* This is a library of functions to support regular expressions whose syntax @@ -181,6 +181,7 @@ enum { OP_CRMINRANGE, OP_CLASS, /* Match a character class */ + OP_NEGCLASS, /* Match a character class, specified negatively */ OP_REF, /* Match a back reference */ OP_ALT, /* Start of alternation */ @@ -54,7 +54,7 @@ the external pcre header. */ static char rep_min[] = { 0, 0, 1, 1, 0, 0 }; static char rep_max[] = { 0, 0, 0, 0, 1, 1 }; -/* Text forms of OP_ values and things, for debugging */ +/* Text forms of OP_ values and things, for debugging (not all used) */ #ifdef DEBUG static const char *OP_names[] = { @@ -65,7 +65,7 @@ static const char *OP_names[] = { "*", "*?", "+", "+?", "?", "??", "{", "{", "{", "*", "*?", "+", "+?", "?", "??", "{", "{", "{", "*", "*?", "+", "+?", "?", "??", "{", "{", - "class", "Ref", + "class", "negclass", "Ref", "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once", "Brazero", "Braminzero", "Bra" }; @@ -91,7 +91,8 @@ static short int escapes[] = { /* Definition to allow mutual recursion */ -static BOOL compile_regex(int, int *, uschar **, const uschar **, const char **); +static BOOL + compile_regex(int, int *, uschar **, const uschar **, const char **); /* Structure for passing "static" information around between the functions doing the matching, so that they are thread-safe. */ @@ -306,6 +307,7 @@ do { /* Check a class or a back reference for a zero minimum */ case OP_CLASS: + case OP_NEGCLASS: case OP_REF: cc += (*cc == OP_REF)? 2 : 33; @@ -670,16 +672,22 @@ for (;; ptr++) case '[': previous = code; - *code++ = OP_CLASS; - /* If the first character is '^', set the negation flag */ + /* If the first character is '^', set the negation flag, and use a + different opcode. 
This only matters if caseless matching is specified at + runtime. */ if ((c = *(++ptr)) == '^') { negate_class = TRUE; + *code++ = OP_NEGCLASS; c = *(++ptr); } - else negate_class = FALSE; + else + { + negate_class = FALSE; + *code++ = OP_CLASS; + } /* Keep a count of chars so that we can optimize the case of just a single character. */ @@ -1015,7 +1023,8 @@ for (;; ptr++) /* If previous was a character class or a back reference, we put the repeat stuff after it. */ - else if (*previous == OP_CLASS || *previous == OP_REF) + else if (*previous == OP_CLASS || *previous == OP_NEGCLASS || + *previous == OP_REF) { if (repeat_min == 0 && repeat_max == -1) *code++ = OP_CRSTAR + repeat_type; @@ -2090,11 +2099,12 @@ while (code < code_end) goto CLASS_REF_REPEAT; case OP_CLASS: + case OP_NEGCLASS: { int i, min, max; - code++; - printf(" ["); + if (*code++ == OP_CLASS) printf(" ["); + else printf(" ^["); for (i = 0; i < 256; i++) { @@ -2714,10 +2724,14 @@ for (;;) item to see if there is repeat information following. Then obey similar code to character type repeats - written out again for speed. If caseless matching was set at runtime but not at compile time, we have to check both - versions of a character. */ + versions of a character, and we have to behave differently for positive and + negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are + treated differently. 
*/ case OP_CLASS: + case OP_NEGCLASS: { + BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless; const uschar *data = ecode + 1; /* Save for matching */ ecode += 33; /* Advance past the item */ @@ -2746,15 +2760,8 @@ for (;;) break; default: /* No repeat follows */ - if (eptr >= md->end_subject) return FALSE; - c = *eptr++; - if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */ - if (md->runtime_caseless) - { - c = pcre_fcc[c]; - if ((data[c/8] & (1 << (c&7))) != 0) continue; /* With main loop */ - } - return FALSE; + min = max = 1; + break; } /* First, ensure the minimum number of matches are present. */ @@ -2763,12 +2770,30 @@ for (;;) { if (eptr >= md->end_subject) return FALSE; c = *eptr++; - if ((data[c/8] & (1 << (c&7))) != 0) continue; - if (md->runtime_caseless) + + /* Either not runtime caseless, or it was a positive class. For + runtime caseless, continue if either case is in the map. */ + + if (!nasty_case) { + if ((data[c/8] & (1 << (c&7))) != 0) continue; + if (md->runtime_caseless) + { + c = pcre_fcc[c]; + if ((data[c/8] & (1 << (c&7))) != 0) continue; + } + } + + /* Runtime caseless and it was a negative class. Continue only if + both cases are in the map. */ + + else + { + if ((data[c/8] & (1 << (c&7))) == 0) return FALSE; c = pcre_fcc[c]; if ((data[c/8] & (1 << (c&7))) != 0) continue; } + return FALSE; } @@ -2787,12 +2812,30 @@ for (;;) if (match(eptr, ecode, offset_top, md)) return TRUE; if (i >= max || eptr >= md->end_subject) return FALSE; c = *eptr++; - if ((data[c/8] & (1 << (c&7))) != 0) continue; - if (md->runtime_caseless) + + /* Either not runtime caseless, or it was a positive class. For + runtime caseless, continue if either case is in the map. */ + + if (!nasty_case) + { + if ((data[c/8] & (1 << (c&7))) != 0) continue; + if (md->runtime_caseless) + { + c = pcre_fcc[c]; + if ((data[c/8] & (1 << (c&7))) != 0) continue; + } + } + + /* Runtime caseless and it was a negative class. 
Continue only if + both cases are in the map. */ + + else { + if ((data[c/8] & (1 << (c&7))) == 0) return FALSE; c = pcre_fcc[c]; if ((data[c/8] & (1 << (c&7))) != 0) continue; } + return FALSE; } /* Control never gets here */ @@ -2807,12 +2850,30 @@ for (;;) { if (eptr >= md->end_subject) break; c = *eptr; - if ((data[c/8] & (1 << (c&7))) != 0) continue; - if (md->runtime_caseless) + + /* Either not runtime caseless, or it was a positive class. For + runtime caseless, continue if either case is in the map. */ + + if (!nasty_case) + { + if ((data[c/8] & (1 << (c&7))) != 0) continue; + if (md->runtime_caseless) + { + c = pcre_fcc[c]; + if ((data[c/8] & (1 << (c&7))) != 0) continue; + } + } + + /* Runtime caseless and it was a negative class. Continue only if + both cases are in the map. */ + + else { + if ((data[c/8] & (1 << (c&7))) == 0) break; c = pcre_fcc[c]; if ((data[c/8] & (1 << (c&7))) != 0) continue; } + break; } @@ -38,7 +38,7 @@ static const char *OP_names[] = { "*", "*?", "+", "+?", "?", "??", "{", "{", "{", "*", "*?", "+", "+?", "?", "??", "{", "{", "{", "*", "*?", "+", "+?", "?", "??", "{", "{", - "class", "Ref", + "class", "negclass", "Ref", "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once", "Brazero", "Braminzero", "Bra" }; @@ -161,11 +161,11 @@ for(;;) goto CLASS_REF_REPEAT; case OP_CLASS: + case OP_NEGCLASS: { int i, min, max; - - code++; - printf(" ["); + if (*code++ == OP_CLASS) printf(" ["); + else printf(" ^["); for (i = 0; i < 256; i++) { @@ -208,6 +208,7 @@ do according to the repeat count. 
*/ case OP_CLASS: + case OP_NEGCLASS: { tcode++; for (c = 0; c < 32; c++) start_bits[c] |= tcode[c]; @@ -1575,4 +1575,20 @@ *** Failers aaaaaaa +/[^a]/ + aaaabcd + aaAabcd + +/[^a]/i + aaaabcd + aaAabcd + +/[^az]/ + aaaabcd + aaAabcd + +/[^az]/i + aaaabcd + aaAabcd + / End of test input / @@ -267,4 +267,20 @@ /a[]b/ +/[^a]/ + \Iaaaabcd + \IaaAabcd + +/[^az]/ + \Iaaaabcd + \IaaAabcd + +/[^az]/ + \Izazabcd + \IAaZabcd + +/[^aeiou ]{3,}/ + co-processors, and for + \Ico-processors, and for + / End of test input / @@ -1,5 +1,5 @@ Testing Perl-Compatible Regular Expressions -PCRE version 1.04 22-Dec-1997 +PCRE version 1.05 23-Dec-1997 /the quick brown fox/ the quick brown fox @@ -2342,5 +2342,29 @@ No match aaaaaaa No match +/[^a]/ + aaaabcd + 0: b + aaAabcd + 0: A + +/[^a]/i + aaaabcd + 0: b + aaAabcd + 0: b + +/[^az]/ + aaaabcd + 0: b + aaAabcd + 0: A + +/[^az]/i + aaaabcd + 0: b + aaAabcd + 0: b + / End of test input / diff --git a/testoutput2 b/testoutput2 index 75e20e0..bc1d6b5 100644 --- a/testoutput2 +++ b/testoutput2 @@ -1,5 +1,5 @@ Testing Perl-Compatible Regular Expressions -PCRE version 1.04 22-Dec-1997 +PCRE version 1.05 23-Dec-1997 /(a)b|/ Identifying subpattern count = 1 @@ -605,6 +605,42 @@ Failed: unmatched brackets at offset 0 /a[]b/ Failed: missing terminating ] for character class at offset 4 +/[^a]/ +Identifying subpattern count = 0 +No options +No first char + \Iaaaabcd + 0: b + \IaaAabcd + 0: b + +/[^az]/ +Identifying subpattern count = 0 +No options +No first char + \Iaaaabcd + 0: b + \IaaAabcd + 0: b + +/[^az]/ +Identifying subpattern count = 0 +No options +No first char + \Izazabcd + 0: b + \IAaZabcd + 0: b + +/[^aeiou ]{3,}/ +Identifying subpattern count = 0 +No options +No first char + co-processors, and for + 0: -pr + \Ico-processors, and for + 0: -pr + / End of test input / Identifying subpattern count = 0 No options |