diff options
author | nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2007-02-24 21:38:45 +0000 |
---|---|---|
committer | nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2007-02-24 21:38:45 +0000 |
commit | c87b6bbacc291c0a1e1d8a396de1b621151a7822 (patch) | |
tree | fa4cea127d16be9ca8d47822c5c8e7e76fdc1687 | |
parent | d2884975c80217601913be24ef07254f2b9900cd (diff) | |
download | pcre-c87b6bbacc291c0a1e1d8a396de1b621151a7822.tar.gz |
Load pcre-2.01 into code/trunk.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@25 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 8 | ||||
-rw-r--r-- | Makefile | 20 | ||||
-rw-r--r-- | README | 110 | ||||
-rwxr-xr-x | RunTest | 53 | ||||
-rw-r--r-- | Tech.Notes | 57 | ||||
-rw-r--r-- | deftables.c | 142 | ||||
-rw-r--r-- | internal.h | 62 | ||||
-rw-r--r-- | maketables.c | 144 | ||||
-rw-r--r-- | pcre.3 | 114 | ||||
-rw-r--r-- | pcre.c | 366 | ||||
-rw-r--r-- | pcre.h | 4 | ||||
-rw-r--r-- | pcreposix.c | 2 | ||||
-rw-r--r-- | pcretest.c | 51 | ||||
-rw-r--r-- | pgrep.c | 2 | ||||
-rw-r--r-- | study.c | 67 | ||||
-rw-r--r-- | testinput4 | 64 | ||||
-rw-r--r-- | testoutput | 2 | ||||
-rw-r--r-- | testoutput2 | 2 | ||||
-rw-r--r-- | testoutput3 | 2 | ||||
-rw-r--r-- | testoutput4 | 113 |
20 files changed, 946 insertions, 439 deletions
@@ -2,6 +2,14 @@ ChangeLog for PCRE ------------------ +Version 2.01 21-Oct-98 +---------------------- + +1. Changed the API for pcre_compile() to allow for the provision of a pointer +to character tables built by pcre_maketables() in the current locale. If NULL +is passed, the default tables are used. + + Version 2.00 24-Sep-98 ---------------------- @@ -19,7 +19,7 @@ RANLIB = @true ########################################################################## -OBJ = chartables.o study.o pcre.o +OBJ = maketables.o study.o pcre.o all: libpcre.a libpcreposix.a pcretest pgrep @@ -39,14 +39,14 @@ libpcreposix.a: pcreposix.o $(AR) libpcreposix.a pcreposix.o $(RANLIB) libpcreposix.a -pcre.o: pcre.c pcre.h internal.h Makefile +pcre.o: chartables.c pcre.c pcre.h internal.h Makefile $(CC) -c $(CFLAGS) pcre.c pcreposix.o: pcreposix.c pcreposix.h internal.h pcre.h Makefile $(CC) -c $(CFLAGS) pcreposix.c -chartables.o: chartables.c - $(CC) -c $(CFLAGS) chartables.c +maketables.o: maketables.c pcre.h internal.h Makefile + $(CC) -c $(CFLAGS) maketables.c study.o: study.c pcre.h internal.h Makefile $(CC) -c $(CFLAGS) study.c @@ -57,15 +57,15 @@ pcretest.o: pcretest.c pcre.h Makefile pgrep.o: pgrep.c pcre.h Makefile $(CC) -c $(CFLAGS) pgrep.c -# An auxiliary program makes the character tables +# An auxiliary program makes the default character table source -chartables.c: maketables - ./maketables >chartables.c +chartables.c: deftables + ./deftables >chartables.c -maketables: maketables.c Makefile - $(CC) -o maketables $(CFLAGS) maketables.c +deftables: deftables.c maketables.c pcre.h internal.h Makefile + $(CC) -o deftables $(CFLAGS) deftables.c -# We deliberately omit maketables and chartables.c from 'make clean'; once made +# We deliberately omit deftables and chartables.c from 'make clean'; once made # chartables.c shouldn't change, and if people have edited the tables by hand, # you don't want to throw them away. 
@@ -8,6 +8,14 @@ README file for PCRE (Perl-compatible regular expressions) * ovector is required at matching time, to provide some additional workspace. * * The new man page has details. This change was necessary in order to support * * some of the new functionality in Perl 5.005. * +* * +* IMPORTANT FOR THOSE UPGRADING FROM VERSION 2.00 * +* * +* Another (I hope this is the last!) change has been made to the API for the * +* pcre_compile() function. An additional argument has been added to make it * +* possible to pass over a pointer to character tables built in the current * +* locale by pcre_maketables(). To use the default tables, this new argument * +* should be passed as NULL. * ******************************************************************************* The distribution should contain the following files: @@ -19,7 +27,8 @@ The distribution should contain the following files: Tech.Notes notes on the encoding pcre.3 man page for the functions pcreposix.3 man page for the POSIX wrapper API - maketables.c auxiliary program for building chartables.c + deftables.c auxiliary program for building chartables.c + maketables.c ) study.c ) source of pcre.c ) the functions pcreposix.c ) @@ -33,9 +42,11 @@ The distribution should contain the following files: testinput test data, compatible with Perl 5.004 and 5.005 testinput2 test data for error messages and non-Perl things testinput3 test data, compatible with Perl 5.005 + testinput4 test data for locale-specific tests testoutput test results corresponding to testinput testoutput2 test results corresponding to testinput2 - testoutput3 test results corresponding to testinpug3 + testoutput3 test results corresponding to testinput3 + testoutput4 test results corresponding to testinput4 To build PCRE, edit Makefile for your system (it is a fairly simple make file, and there are some comments at the top) and then run it. It builds two @@ -61,6 +72,19 @@ widespread, these two test files may get amalgamated. 
The second set of tests check pcre_info(), pcre_study(), error detection and run-time flags that are specific to PCRE, as well as the POSIX wrapper API. +The fourth set of tests checks pcre_maketables(), the facility for building a +set of character tables for a specific locale and using them instead of the +default tables. The tests make use of the "fr" (French) locale. Before running +the test, the script checks for the presence of this locale by running the +"locale" command. If that command fails, or if it doesn't include "fr" in the +list of available locales, the fourth test cannot be run, and a comment is +output to say why. If running this test produces instances of the error + + ** Failed to set locale "fr" + +in the comparison output, it means that locale is not available on your system, +despite being listed by "locale". This does not mean that PCRE is broken. + To install PCRE, copy libpcre.a to any suitable library directory (e.g. /usr/local/lib), pcre.h to any suitable include directory (e.g. /usr/local/include), and pcre.3 to any suitable man directory (e.g. @@ -83,23 +107,28 @@ uses the POSIX API, it will have to be renamed or pointed at by a link. Character tables ---------------- -PCRE uses four tables for manipulating and identifying characters. These are -compiled from a source file called chartables.c. This is not supplied in -the distribution, but is built by the program maketables (compiled from -maketables.c), which uses the ANSI C character handling functions such as -isalnum(), isalpha(), isupper(), islower(), etc. to build the table sources. -This means that the default C locale set in your system may affect the contents -of the tables. You can change the tables by editing chartables.c and then -re-building PCRE. If you do this, you should probably also edit Makefile to -ensure that the file doesn't ever get re-generated. - -The first two tables pcre_lcc[] and pcre_fcc[] provide lower casing and a -case flipping functions, respectively. 
The pcre_cbits[] table consists of four -32-byte bit maps which identify digits, letters, "word" characters, and white -space, respectively. These are used when building 32-byte bit maps that -represent character classes. - -The pcre_ctypes[] table has bits indicating various character types, as +PCRE uses four tables for manipulating and identifying characters. The final +argument of the pcre_compile() function is a pointer to a block of memory +containing the concatenated tables. A call to pcre_maketables() is used to +generate a set of tables in the current locale. However, if the final argument +is passed as NULL, a set of default tables that is built into the binary is +used. + +The source file called chartables.c contains the default set of tables. This is +not supplied in the distribution, but is built by the program deftables +(compiled from deftables.c), which uses the ANSI C character handling functions +such as isalnum(), isalpha(), isupper(), islower(), etc. to build the table +sources. This means that the default C locale set in your system will control the +contents of the tables. You can change the default tables by editing +chartables.c and then re-building PCRE. If you do this, you should probably +also edit Makefile to ensure that the file doesn't ever get re-generated. + +The first two 256-byte tables provide lower casing and case flipping functions, +respectively. The next table consists of three 32-byte bit maps which identify +digits, "word" characters, and white space, respectively. These are used when +building 32-byte bit maps that represent character classes. + +The final 256-byte table has bits indicating various character types, as follows: 1 white space character @@ -138,10 +167,28 @@ same effect as they do in Perl. There are also some upper case options that do not match Perl options: /A, /E, and /X set PCRE_ANCHORED, PCRE_DOLLAR_ENDONLY, and PCRE_EXTRA respectively. -The /D option is a PCRE debugging feature. 
It causes the internal form of -compiled regular expressions to be output after compilation. The /S option -causes pcre_study() to be called after the expression has been compiled, and -the results used when the expression is matched. + +The /L option must be followed directly by the name of a locale, for example, + + /pattern/Lfr + +For this reason, it must be the last option letter. The given locale is set, +pcre_maketables() is called to build a set of character tables for the locale, +and this is then passed to pcre_compile() when compiling the regular +expression. Without an /L option, NULL is passed as the tables pointer; that +is, /L applies only to the expression on which it appears. + +The /I option requests that pcretest output information about the compiled +expression (whether it is anchored, has a fixed first character, and so on). It +does this by calling pcre_info() after compiling an expression, and outputting +the information it gets back. If the pattern is studied, the results of that +are also output. + +The /D option is a PCRE debugging feature, which also assumes /I. It causes the +internal form of compiled regular expressions to be output after compilation. + +The /S option causes pcre_study() to be called after the expression has been +compiled, and the results used when the expression is matched. Finally, the /P option causes pcretest to call PCRE via the POSIX wrapper API rather than its native API. When this is done, all other options except /i and @@ -206,9 +253,9 @@ following flags has any effect in this case. If the option -d is given to pcretest, it is equivalent to adding /D to each regular expression: the internal form is output after compilation. -If the option -i (for "information") is given to pcretest, it calls pcre_info() -after compiling an expression, and outputs the information it gets back. If the -pattern is studied, the results of that are also output. 
+If the option -i is given to pcretest, it is equivalent to adding /I to each +regular expression: information about the compiled pattern is given after +compilation. If the option -s is given to pcretest, it outputs the size of each compiled pattern after it has been compiled. @@ -237,10 +284,11 @@ for pcretest, and the special upper case options such as /A that pcretest recognizes are not used in this file. The output should be identical, apart from the initial identifying banner. -The testinput2 file is not suitable for feeding to Perltest, since it does -make use of the special upper case options and escapes that pcretest uses to -test some features of PCRE. It also contains malformed regular expressions, in -order to check that PCRE diagnoses them correctly. +The testinput2 and testinput4 files are not suitable for feeding to Perltest, +since they do make use of the special upper case options and escapes that +pcretest uses to test some features of PCRE. The first of these files also +contains malformed regular expressions, in order to check that PCRE diagnoses +them correctly. Philip Hazel <ph10@cam.ac.uk> -September 1998 +October 1998 @@ -9,31 +9,34 @@ cf=diff do1=no do2=no do3=no +do4=no while [ $# -gt 0 ] ; do case $1 in 1) do1=yes;; 2) do2=yes;; 3) do3=yes;; - *) echo "Unknown test number $1"; exit 1;; + 4) do4=yes;; + *) echo "Unknown test number $1"; exit 1;; esac shift done -if [ $do1 = no -a $do2 = no -a $do3 = no ] ; then +if [ $do1 = no -a $do2 = no -a $do3 = no -a $do4 = no ] ; then do1=yes do2=yes do3=yes -fi - + do4=yes +fi + # Primary test, Perl-compatible if [ $do1 = yes ] ; then echo "Testing main functionality (Perl compatible)" ./pcretest testinput testtry - if [ $? = 0 ] ; then + if [ $? = 0 ] ; then $cf testtry testoutput - if [ $? != 0 ] ; then exit 1; fi + if [ $? 
!= 0 ] ; then exit 1; fi else exit 1 fi fi @@ -43,9 +46,9 @@ fi if [ $do2 = yes ] ; then echo "Testing API and error handling (not Perl compatible)" ./pcretest -i testinput2 testtry - if [ $? = 0 ] ; then - $cf testtry testoutput2 - if [ $? != 0 ] ; then exit 1; fi + if [ $? = 0 ] ; then + $cf testtry testoutput2 + if [ $? != 0 ] ; then exit 1; fi else exit 1 fi fi @@ -55,15 +58,37 @@ fi if [ $do3 = yes ] ; then echo "Testing Perl 5.005 features (Perl 5.005 compatible)" ./pcretest testinput3 testtry - if [ $? = 0 ] ; then + if [ $? = 0 ] ; then $cf testtry testoutput3 - if [ $? != 0 ] ; then exit 1; fi + if [ $? != 0 ] ; then exit 1; fi else exit 1 fi fi if [ $do1 = yes -a $do2 = yes -a $do3 = yes ] ; then - echo "Tests all ran OK" -fi + echo "The three main tests all ran OK" + echo " " +fi + +# Locale-specific tests, provided the "fr" locale is available + +if [ $do4 = yes ] ; then + locale -a | grep '^fr$' >/dev/null + if [ $? -eq 0 ] ; then + echo "Testing locale-specific features (using 'fr' locale)" + ./pcretest testinput4 testtry + if [ $? = 0 ] ; then + $cf testtry testoutput4 + if [ $? != 0 ] ; then exit 1; fi + echo "Locale test ran OK" + echo " " + else exit 1 + fi + else + echo "Cannot test locale-specific features - 'fr' locale not found," + echo "or the \"locale\" command is not available to check for it." + echo " " + fi +fi -# End +# End @@ -23,18 +23,18 @@ optionally, minimizing in Perl) the amount of the subject that matches individual wild portions of the pattern. This is an "NFA algorithm" in Friedl's terminology. -For this set of functions, I tried at first to invent an algorithm that used an -amount of store bounded by a multiple of the number of characters in the -pattern, to save on compiling time. However, because of the greater complexity -in Perl regular expressions, I couldn't do this. In any case, a first pass -through the pattern is needed, in order to find internal flag settings like -(?i) at top level. 
So it works by running a very degenerate first pass to -calculate a maximum store size, and then a second pass to do the real compile - -which may use a bit less than the predicted amount of store. The idea is that -this is going to turn out faster because the first pass is degenerate and the -second can just store stuff straight into the vector. It does make the -compiling functions bigger, of course, but they have got quite big anyway to -handle all the Perl stuff. +For this set of functions that forms PCRE, I tried at first to invent an +algorithm that used an amount of store bounded by a multiple of the number of +characters in the pattern, to save on compiling time. However, because of the +greater complexity in Perl regular expressions, I couldn't do this. In any +case, a first pass through the pattern is needed, in order to find internal +flag settings like (?i) at top level. So it works by running a very degenerate +first pass to calculate a maximum store size, and then a second pass to do the +real compile - which may use a bit less than the predicted amount of store. The +idea is that this is going to turn out faster because the first pass is +degenerate and the second can just store stuff straight into the vector. It +does make the compiling functions bigger, of course, but they have got quite +big anyway to handle all the Perl stuff. The compiled form of a pattern is a vector of bytes, containing items of variable length. The first byte in an item is an opcode, and the length of the @@ -118,21 +118,16 @@ instances of OP_CHARS are used. Character classes ----------------- -OP_CLASS is used for a character class, and OP_NEGCLASS for a negated character -class, provided there are at least two characters in the class. If there is -only one character, OP_CHARS is used for a positive class, and OP_NOT for a -negative one. A set of repeating opcodes (OP_NOTSTAR etc.) are used for a -repeated, negated, single-character class. 
+OP_CLASS is used for a character class, provided there are at least two +characters in the class. If there is only one character, OP_CHARS is used for a +positive class, and OP_NOT for a negative one (that is, for something like +[^a]). Another set of repeating opcodes (OP_NOTSTAR etc.) are used for a +repeated, negated, single-character class. The normal ones (OP_STAR etc.) are +used for a repeated positive single-character class. -Both OP_CLASS and OP_NEGCLASS are followed by a 32-byte bit map containing a 1 +OP_CLASS is followed by a 32-byte bit map containing a 1 bit for every character that is acceptable. The bits are counted from the least -significant end of each byte. The reason for having two opcodes is to cope with -negated character classes when caseless matching is specified at run time but -not at compile time. If it is specified at compile time, the bit map is built -appropriately. This is the only time that a distinction is made between -OP_CLASS and OP_NEGCLASS, when the bit map was built in a caseful manner but -matching must be caseless. For OP_CLASS, a character matches if either of its -cases is in the bit map, but for OP_NEGCLASS, both of them must be present. +significant end of each byte. Back references @@ -144,8 +139,9 @@ OP_REF is followed by a single byte containing the reference number. Repeating character classes and back references ----------------------------------------------- -In both cases, the repeat information follows the base item. The matching code -looks at the following opcode to see if it is one of +Single-character classes are handled specially (see above). This applies to +OP_CLASS and OP_REF. In both cases, the repeat information follows the base +item. The matching code looks at the following opcode to see if it is one of OP_CRSTAR OP_CRMINSTAR @@ -201,8 +197,9 @@ Forward assertions are just like other subpatterns, but starting with one of the opcodes OP_ASSERT or OP_ASSERT_NOT. 
Backward assertions use the opcodes OP_ASSERTBACK and OP_ASSERTBACK_NOT, and the first opcode inside the assertion is OP_REVERSE, followed by a two byte count of the number of characters to move -back. A separate count is present in each alternative of a lookbehind -assertion, allowing them to have different fixed lengths. +back the pointer in the subject string. A separate count is present in each +alternative of a lookbehind assertion, allowing them to have different fixed +lengths. Once-only subpatterns @@ -237,4 +234,4 @@ the compiled data. Philip Hazel -September 1998 +October 1998 diff --git a/deftables.c b/deftables.c new file mode 100644 index 0000000..b6c8e58 --- /dev/null +++ b/deftables.c @@ -0,0 +1,142 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* +PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + +Written by: Philip Hazel <ph10@cam.ac.uk> + + Copyright (c) 1998 University of Cambridge + +----------------------------------------------------------------------------- +Permission is granted to anyone to use this software for any purpose on any +computer system, and to redistribute it freely, subject to the following +restrictions: + +1. This software is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +2. The origin of this software must not be misrepresented, either by + explicit claim or by omission. + +3. Altered versions must be plainly marked as such, and must not be + misrepresented as being the original software. +----------------------------------------------------------------------------- + +See the file Tech.Notes for some information on the internals. 
+*/ + + +/* This is a support program to generate the file chartables.c, containing +character tables of various kinds. They are built according to the default C +locale and used as the default tables by PCRE. Now that pcre_maketables is +a function visible to the outside world, we make use of its code from here in +order to be consistent. */ + +#include <ctype.h> +#include <stdio.h> +#include <string.h> + +#include "internal.h" + +#define DEFTABLES +#include "maketables.c" + + +int main(void) +{ +int i; +unsigned const char *tables = pcre_maketables(); + +printf( + "/*************************************************\n" + "* Perl-Compatible Regular Expressions *\n" + "*************************************************/\n\n" + "/* This file is automatically written by the makechartables auxiliary \n" + "program. If you edit it by hand, you might like to edit the Makefile to \n" + "prevent its ever being regenerated.\n\n" + "This file is #included in the compilation of pcre.c to build the default\n" + "character tables which are used when no tables are passed to the compile\n" + "function. */\n\n" + "static unsigned char pcre_default_tables[] = {\n\n" + "/* This table is a lower casing table. */\n\n"); + +printf(" "); +for (i = 0; i < 256; i++) + { + if ((i & 7) == 0 && i != 0) printf("\n "); + printf("%3d", *tables++); + if (i != 255) printf(","); + } +printf(",\n\n"); + +printf("/* This table is a case flipping table. */\n\n"); + +printf(" "); +for (i = 0; i < 256; i++) + { + if ((i & 7) == 0 && i != 0) printf("\n "); + printf("%3d", *tables++); + if (i != 255) printf(","); + } +printf(",\n\n"); + +printf( + "/* This table contains bit maps for digits, 'word' chars, and white\n" + "space. Each map is 32 bytes long and the bits run from the least\n" + "significant end of each byte. 
*/\n\n"); + +printf(" "); +for (i = 0; i < cbit_length; i++) + { + if ((i & 7) == 0 && i != 0) + { + if ((i & 31) == 0) printf("\n"); + printf("\n "); + } + printf("0x%02x", *tables++); + if (i != cbit_length - 1) printf(","); + } +printf(" ,\n\n"); + +printf( + "/* This table identifies various classes of character by individual bits:\n" + " 0x%02x white space character\n" + " 0x%02x letter\n" + " 0x%02x decimal digit\n" + " 0x%02x hexadecimal digit\n" + " 0x%02x alphanumeric or '_'\n" + " 0x%02x regular expression metacharacter or binary zero\n*/\n\n", + ctype_space, ctype_letter, ctype_digit, ctype_xdigit, ctype_word, + ctype_meta); + +printf(" "); +for (i = 0; i < 256; i++) + { + if ((i & 7) == 0 && i != 0) + { + printf(" /* "); + if (isprint(i-8)) printf(" %c -", i-8); + else printf("%3d-", i-8); + if (isprint(i-1)) printf(" %c ", i-1); + else printf("%3d", i-1); + printf(" */\n "); + } + printf("0x%02x", *tables++); + if (i != 255) printf(","); + } + +printf("};/* "); +if (isprint(i-8)) printf(" %c -", i-8); + else printf("%3d-", i-8); +if (isprint(i-1)) printf(" %c ", i-1); + else printf("%3d", i-1); +printf(" */\n\n/* End of chartables.c */\n"); + +return 0; +} + +/* End of deftables.c */ @@ -3,7 +3,7 @@ *************************************************/ -#define PCRE_VERSION "2.00 24-Sep-1998" +#define PCRE_VERSION "2.01 21-Oct-1998" /* This is a library of functions to support regular expressions whose syntax @@ -259,6 +259,7 @@ runs on as long as necessary after the end. */ typedef struct real_pcre { unsigned int magic_number; + const unsigned char *tables; unsigned short int options; unsigned char top_bracket; unsigned char top_backref; @@ -273,14 +274,38 @@ typedef struct real_pcre_extra { unsigned char start_bits[32]; } real_pcre_extra; -/* Global tables from chartables.c */ -extern uschar pcre_lcc[]; -extern uschar pcre_fcc[]; -extern uschar pcre_cbits[]; -extern uschar pcre_ctypes[]; - -/* Bit definitions for entries in pcre_ctypes[]. 
*/ +/* Structure for passing "static" information around between the functions +doing the compiling, so that they are thread-safe. */ + +typedef struct compile_data { + const uschar *lcc; /* Points to lower casing table */ + const uschar *fcc; /* Points to case-flippint table */ + const uschar *cbits; /* Points to character type table */ + const uschar *ctypes; /* Points to table of type maps */ +} compile_data; + +/* Structure for passing "static" information around between the functions +doing the matching, so that they are thread-safe. */ + +typedef struct match_data { + int errorcode; /* As it says */ + int *offset_vector; /* Offset vector */ + int offset_end; /* One past the end */ + int offset_max; /* The maximum usable for return data */ + const uschar *lcc; /* Points to lower casing table */ + const uschar *ctypes; /* Points to table of type maps */ + BOOL offset_overflow; /* Set if too many extractions */ + BOOL notbol; /* NOTBOL flag */ + BOOL noteol; /* NOTEOL flag */ + BOOL endonly; /* Dollar not before final \n */ + const uschar *start_subject; /* Start of the subject string */ + const uschar *end_subject; /* End of the subject string */ + const uschar *end_match_ptr; /* Subject position at end match */ + int end_offset_top; /* Highwater mark at end of match */ +} match_data; + +/* Bit definitions for entries in the pcre_ctypes table. */ #define ctype_space 0x01 #define ctype_letter 0x02 @@ -289,12 +314,21 @@ extern uschar pcre_ctypes[]; #define ctype_word 0x10 /* alphameric or '_' */ #define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */ -/* Offsets for the bitmap tables */ +/* Offsets for the bitmap tables in pcre_cbits. Each table contains a set +of bits for a class map. */ + +#define cbit_digit 0 /* for \d */ +#define cbit_word 32 /* for \w */ +#define cbit_space 64 /* for \s */ +#define cbit_length 96 /* Length of the cbits table */ + +/* Offsets of the various tables from the base tables pointer, and +total length. 
*/ -#define cbit_digit 0 -#define cbit_letter 32 -#define cbit_word 64 -#define cbit_space 96 -#define cbit_length 128 /* Length of the cbits table */ +#define lcc_offset 0 +#define fcc_offset 256 +#define cbits_offset 512 +#define ctypes_offset (cbits_offset + cbit_length) +#define tables_length (ctypes_offset + 256) /* End of internal.h */ diff --git a/maketables.c b/maketables.c index 4566c36..370a0e9 100644 --- a/maketables.c +++ b/maketables.c @@ -30,97 +30,67 @@ See the file Tech.Notes for some information on the internals. */ -/* This is a support program to generate the file chartables.c, containing -character tables of various kinds. They are built according to the local C -locale. */ - -#include <ctype.h> -#include <stdio.h> -#include <string.h> +/* This file is compiled on its own as part of the PCRE library. However, +it is also included in the compilation of deftables.c, in which case the macro +DEFTABLES is defined. */ +#ifndef DEFTABLES #include "internal.h" +#endif + + + +/************************************************* +* Create PCRE character tables * +*************************************************/ -int main(void) +/* This function builds a set of character tables for use by PCRE and returns +a pointer to them. They are build using the ctype functions, and consequently +their contents will depend upon the current locale setting. When compiled as +part of the library, the store is obtained via pcre_malloc(), but when compiled +inside deftables, use malloc(). + +Arguments: none +Returns: pointer to the contiguous block of data +*/ + +unsigned const char * +pcre_maketables(void) { +unsigned char *yield, *p; int i; -unsigned char cbits[cbit_length]; - -printf( - "/*************************************************\n" - "* Perl-Compatible Regular Expressions *\n" - "*************************************************/\n\n" - "/* This file is automatically written by the makechartables auxiliary \n" - "program. 
If you edit it by hand, you might like to edit the Makefile to \n" - "prevent its ever being regenerated. */\n\n" - "/* This table is a lower casing table. */\n\n" - "unsigned char pcre_lcc[] = {\n"); - -printf(" "); -for (i = 0; i < 256; i++) - { - if ((i & 7) == 0 && i != 0) printf("\n "); - printf("%3d", tolower(i)); - if (i != 255) printf(","); - } -printf(" };\n\n"); -printf( - "/* This table is a case flipping table. */\n\n" - "unsigned char pcre_fcc[] = {\n"); +#ifndef DEFTABLES +yield = (pcre_malloc)(tables_length); +#else +yield = malloc(tables_length); +#endif -printf(" "); -for (i = 0; i < 256; i++) - { - if ((i & 7) == 0 && i != 0) printf("\n "); - printf("%3d", islower(i)? toupper(i) : tolower(i)); - if (i != 255) printf(","); - } -printf(" };\n\n"); +if (yield == NULL) return NULL; +p = yield; + +/* First comes the lower casing table */ + +for (i = 0; i < 256; i++) *p++ = tolower(i); + +/* Next the case-flipping table */ -printf( - "/* This table contains bit maps for digits, letters, 'word' chars, and\n" - "white space. Each map is 32 bytes long and the bits run from the least\n" - "significant end of each byte. */\n\n" - "unsigned char pcre_cbits[] = {\n"); +for (i = 0; i < 256; i++) *p++ = islower(i)? 
toupper(i) : tolower(i); -memset(cbits, 0, sizeof(cbits)); +/* Then the character class tables */ +memset(p, 0, cbit_length); for (i = 0; i < 256; i++) { - if (isdigit(i)) cbits[cbit_digit + i/8] |= 1 << (i&7); - if (isalpha(i)) cbits[cbit_letter + i/8] |= 1 << (i&7); + if (isdigit(i)) p[cbit_digit + i/8] |= 1 << (i&7); if (isalnum(i) || i == '_') - cbits[cbit_word + i/8] |= 1 << (i&7); - if (isspace(i)) cbits[cbit_space + i/8] |= 1 << (i&7); + p[cbit_word + i/8] |= 1 << (i&7); + if (isspace(i)) p[cbit_space + i/8] |= 1 << (i&7); } +p += cbit_length; + +/* Finally, the character type table */ -printf(" "); -for (i = 0; i < cbit_length; i++) - { - if ((i & 7) == 0 && i != 0) - { - if ((i & 31) == 0) printf("\n"); - printf("\n "); - } - printf("0x%02x", cbits[i]); - if (i != cbit_length - 1) printf(","); - } -printf(" };\n\n"); - -printf( - "/* This table identifies various classes of character by individual bits:\n" - " 0x%02x white space character\n" - " 0x%02x letter\n" - " 0x%02x decimal digit\n" - " 0x%02x hexadecimal digit\n" - " 0x%02x alphanumeric or '_'\n" - " 0x%02x regular expression metacharacter or binary zero\n*/\n\n", - ctype_space, ctype_letter, ctype_digit, ctype_xdigit, ctype_word, - ctype_meta); - -printf("unsigned char pcre_ctypes[] = {\n"); - -printf(" "); for (i = 0; i < 256; i++) { int x = 0; @@ -130,28 +100,10 @@ for (i = 0; i < 256; i++) if (isxdigit(i)) x += ctype_xdigit; if (isalnum(i) || i == '_') x += ctype_word; if (strchr("*+?{^.$|()[", i) != 0) x += ctype_meta; - - if ((i & 7) == 0 && i != 0) - { - printf(" /* "); - if (isprint(i-8)) printf(" %c -", i-8); - else printf("%3d-", i-8); - if (isprint(i-1)) printf(" %c ", i-1); - else printf("%3d", i-1); - printf(" */\n "); - } - printf("0x%02x", x); - if (i != 255) printf(","); + *p++ = x; } -printf("};/* "); -if (isprint(i-8)) printf(" %c -", i-8); - else printf("%3d-", i-8); -if (isprint(i-1)) printf(" %c ", i-1); - else printf("%3d", i-1); -printf(" */\n\n/* End of chartables.c */\n"); 
- -return 0; +return yield; } /* End of maketables.c */ @@ -8,7 +8,12 @@ pcre - Perl-compatible regular expressions. .br .B pcre *pcre_compile(const char *\fIpattern\fR, int \fIoptions\fR, .ti +5n -.B const char **\fIerrptr\fR, int *\fIerroffset\fR); +.B const char **\fIerrptr\fR, int *\fIerroffset\fR, +.ti +5n +.B const unsigned char *\fItableptr\fR); +.PP +.br +.B const unsigned char *pcre_maketables(void); .PP .br .B pcre_extra *pcre_study(const pcre *\fIcode\fR, int \fIoptions\fR, @@ -34,18 +39,6 @@ pcre - Perl-compatible regular expressions. .PP .br .B void (*pcre_free)(void *); -.PP -.br -.B unsigned char *pcre_cbits[128]; -.PP -.br -.B unsigned char *pcre_ctypes[256]; -.PP -.br -.B unsigned char *pcre_fcc[256]; -.PP -.br -.B unsigned char *pcre_lcc[256]; @@ -60,7 +53,10 @@ a set of wrapper functions that correspond to the POSIX API. See The three functions \fBpcre_compile()\fR, \fBpcre_study()\fR, and \fBpcre_exec()\fR are used for compiling and matching regular expressions. The -function \fBpcre_info()\fR is used to find out information about a compiled +function \fBpcre_maketables()\fR is used (optionally) to build a set of +character tables in the current locale for passing to \fBpcre_compile()\fR. + +The function \fBpcre_info()\fR is used to find out information about a compiled pattern, while the function \fBpcre_version()\fR returns a pointer to a string containing the version of PCRE and its date of release. @@ -70,16 +66,11 @@ respectively. PCRE calls the memory management functions via these variables, so a calling program can replace them if it wishes to intercept the calls. This should be done before calling any PCRE functions. -The other global variables are character tables. They are initialized when PCRE -is compiled, from source that is generated by reference to the C character type -functions, but which a user of PCRE is free to modify. In principle the tables -could also be modified at run time. See PCRE's README file for more details. 
- .SH MULTI-THREADING The PCRE functions can be used in multi-threading applications, with the -proviso that the character tables and the memory management functions pointed -to by \fBpcre_malloc\fR and \fBpcre_free\fR are shared by all threads. +proviso that the memory management functions pointed to by \fBpcre_malloc\fR +and \fBpcre_free\fR are shared by all threads. The compiled form of a regular expression is not altered during matching, so the same compiled pattern can safely be used by several threads at once. @@ -88,10 +79,12 @@ the same compiled pattern can safely be used by several threads at once. .SH COMPILING A PATTERN The function \fBpcre_compile()\fR is called to compile a pattern into an internal form. The pattern is a C string terminated by a binary zero, and -is passed in the argument \fIpattern\fR. A pointer to the compiled code block -is returned. The \fBpcre\fR type is defined for this for convenience, but in -fact \fBpcre\fR is just a typedef for \fBvoid\fR, since the contents of the -block are not defined. +is passed in the argument \fIpattern\fR. A pointer to a single block of memory +that is obtained via \fBpcre_malloc\fR is returned. This contains the +compiled code and related data. The \fBpcre\fR type is defined for this for +convenience, but in fact \fBpcre\fR is just a typedef for \fBvoid\fR, since the +contents of the block are not externally defined. It is up to the caller to +free the memory when it is no longer required. .PP The size of a compiled pattern is roughly proportional to the length of the pattern string, except that each character class (other than those containing @@ -111,11 +104,14 @@ time. If \fIerrptr\fR is NULL, \fBpcre_compile()\fR returns NULL immediately. Otherwise, if compilation of a pattern fails, \fBpcre_compile()\fR returns NULL, and sets the variable pointed to by \fIerrptr\fR to point to a textual -error message. 
- -The offset from the start of the pattern to the character where the error was -discovered is placed in the variable pointed to by \fIerroffset\fR, which must -not be NULL. If it is, an immediate error is given. +error message. The offset from the start of the pattern to the character where +the error was discovered is placed in the variable pointed to by +\fIerroffset\fR, which must not be NULL. If it is, an immediate error is given. +.PP +If the final argument, \fItableptr\fR, is NULL, PCRE uses a default set of +character tables which are built when it is compiled, using the default C +locale. Otherwise, \fItableptr\fR must be the result of a call to +\fBpcre_maketables()\fR. See the section on locale support below. .PP The following option bits are defined in the header file: @@ -210,6 +206,33 @@ not have a single fixed starting character. A bitmap of possible starting characters is created. +.SH LOCALE SUPPORT +PCRE handles caseless matching, and determines whether characters are letters, +digits, or whatever, by reference to a set of tables. The library contains a +default set of tables which is created in the default C locale when PCRE is +compiled. This is used when the final argument of \fBpcre_compile()\fR is NULL, +and is sufficient for many applications. + +An alternative set of tables can, however, be supplied. Such tables are built +by calling the \fBpcre_maketables()\fR function, which has no arguments, in the +relevant locale. The result can then be passed to \fBpcre_compile()\fR as often +as necessary. For example, to build and use tables that are appropriate for the +French locale (where accented characters with codes greater than 128 are +treated as letters), the following code could be used: + + setlocale(LC_CTYPE, "fr"); + tables = pcre_maketables(); + re = pcre_compile(..., tables); + +The tables are built in memory that is obtained via \fBpcre_malloc\fR.
The +pointer that is passed to \fBpcre_compile\fR is saved with the compiled +pattern, and the same tables are used via this pointer by \fBpcre_study()\fR +and \fBpcre_exec()\fR. Thus for any single pattern, compilation, studying and +matching all happen in the same locale, but different patterns can be compiled +in different locales. It is the caller's responsibility to ensure that the +memory containing the tables remains available for as long as it is needed. + + .SH MATCHING A PATTERN The function \fBpcre_exec()\fR is called to match a subject string against a pre-compiled pattern, which is passed in the \fIcode\fR argument. If the @@ -579,11 +602,16 @@ Each pair of escape sequences partitions the complete set of characters into two disjoint sets. Any given character matches one, and only one, of each pair. A "word" character is any letter or digit or the underscore character, that is, -any character which can be part of a Perl "word". These character type -sequences can appear both inside and outside character classes. They each match -one character of the appropriate type. If the current matching point is at the -end of the subject string, all of them fail, since there is no character to -match. +any character which can be part of a Perl "word". The definition of letters and +digits is controlled by PCRE's character tables, and may vary if locale- +specific matching is taking place (see "Locale support" above). For example, in +the "fr" (French) locale, some character codes greater than 128 are used for +accented letters, and these are matched by \\w. + +These character type sequences can appear both inside and outside character +classes. They each match one character of the appropriate type. If the current +matching point is at the end of the subject string, all of them fail, since +there is no character to match. The fourth use of backslash is for certain simple assertions.
An assertion specifies a condition that has to be met at a particular point in a match, @@ -682,10 +710,10 @@ are in the class by enumerating those that are not. It is not an assertion: it still consumes a character from the subject string, and fails if the current pointer is at the end of the string. -When PCRE_CASELESS is set, any letters in a class represent both their upper -case and lower case versions, so for example, a caseless [aeiou] matches "A" as -well as "a", and a caseless [^aeiou] does not match "A", whereas a caseful -version would. +When caseless matching is set, any letters in a class represent both their +upper case and lower case versions, so for example, a caseless [aeiou] matches +"A" as well as "a", and a caseless [^aeiou] does not match "A", whereas a +caseful version would. The newline character is never treated in any special way in character classes, whatever the setting of the PCRE_DOTALL or PCRE_MULTILINE options is. A class @@ -702,9 +730,11 @@ octal or hexadecimal representation of "]" can, however, be used to end a range. Ranges operate in ASCII collating sequence. They can also be used for -characters specified numerically, for example [\\000-\\037]. If a range such as -[W-c] is used when PCRE_CASELESS is set, it matches the letters involved in -either case, so is equivalent to [][\\^_`wxyzabc], matched caselessly. +characters specified numerically, for example [\\000-\\037]. If a range that +includes letters is used when caseless matching is set, it matches the letters +in either case. For example, [W-c] is equivalent to [][\\^_`wxyzabc], matched +caselessly, and if character tables for the "fr" locale are in use, +[\\xc8-\\xcb] matches accented E characters in both cases. The character types \\d, \\D, \\s, \\S, \\w, and \\W may also appear in a character class, and add the characters that they match to the class. 
For @@ -107,25 +107,7 @@ static const short int escapes[] = { static BOOL compile_regex(int, int, int *, uschar **, const uschar **, const char **, - BOOL, int); - -/* Structure for passing "static" information around between the functions -doing the matching, so that they are thread-safe. */ - -typedef struct match_data { - int errorcode; /* As it says */ - int *offset_vector; /* Offset vector */ - int offset_end; /* One past the end */ - int offset_max; /* The maximum usable for return data */ - BOOL offset_overflow; /* Set if too many extractions */ - BOOL notbol; /* NOTBOL flag */ - BOOL noteol; /* NOTEOL flag */ - BOOL endonly; /* Dollar not before final \n */ - const uschar *start_subject; /* Start of the subject string */ - const uschar *end_subject; /* End of the subject string */ - const uschar *end_match_ptr; /* Subject position at end match */ - int end_offset_top; /* Highwater mark at end of match */ -} match_data; + BOOL, int, compile_data *); @@ -145,6 +127,20 @@ void (*pcre_free)(void *) = free; /************************************************* +* Default character tables * +*************************************************/ + +/* A default set of character tables is included in the PCRE binary. Its source +is built by the deftables auxiliary program, which uses the default C ctypes +functions, and put in the file chartables.c. These tables are used by PCRE +whenever the caller of pcre_compile() does not provide an alternate set of +tables.
*/ + +#include "chartables.c" + + + +/************************************************* * Return version string * *************************************************/ @@ -237,6 +233,7 @@ Arguments: bracount number of previous extracting brackets options the options bits isclass TRUE if inside a character class + cd pointer to char tables block Returns: zero or positive => a data character negative => a special escape sequence @@ -245,7 +242,7 @@ Returns: zero or positive => a data character static int check_escape(const uschar **ptrptr, const char **errorptr, int bracount, - int options, BOOL isclass) + int options, BOOL isclass, compile_data *cd) { const uschar *ptr = *ptrptr; int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */ @@ -288,7 +285,7 @@ else { oldptr = ptr; c -= '0'; - while ((pcre_ctypes[ptr[1]] & ctype_digit) != 0) + while ((cd->ctypes[ptr[1]] & ctype_digit) != 0) c = c * 10 + *(++ptr) - '0'; if (c < 10 || c <= bracount) { @@ -314,7 +311,7 @@ else case '0': c -= '0'; - while(i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_digit) != 0 && + while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 && ptr[1] != '8' && ptr[1] != '9') c = c * 8 + *(++ptr) - '0'; break; @@ -323,11 +320,11 @@ else case 'x': c = 0; - while (i++ < 2 && (pcre_ctypes[ptr[1]] & ctype_xdigit) != 0) + while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0) { ptr++; - c = c * 16 + pcre_lcc[*ptr] - - (((pcre_ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W'); + c = c * 16 + cd->lcc[*ptr] - + (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W'); } break; @@ -341,13 +338,15 @@ else /* A letter is upper-cased; then the 0x40 bit is flipped */ - if (c >= 'a' && c <= 'z') c = pcre_fcc[c]; + if (c >= 'a' && c <= 'z') c = cd->fcc[c]; c ^= 0x40; break; /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any other alphameric following \ is an error if PCRE_EXTRA was set; otherwise, - for Perl compatibility, it is a literal. */ + for Perl compatibility, it is a literal. 
This code looks a bit odd, but + there used to be some cases other than the default, and there may be again + in future, so I haven't "optimized" it. */ default: if ((options & PCRE_EXTRA) != 0) switch(c) @@ -377,22 +376,23 @@ where the ddds are digits. Arguments: p pointer to the first char after '{' + cd pointer to char tables block Returns: TRUE or FALSE */ static BOOL -is_counted_repeat(const uschar *p) +is_counted_repeat(const uschar *p, compile_data *cd) { -if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE; -while ((pcre_ctypes[*p] & ctype_digit) != 0) p++; +if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE; +while ((cd->ctypes[*p] & ctype_digit) != 0) p++; if (*p == '}') return TRUE; if (*p++ != ',') return FALSE; if (*p == '}') return TRUE; -if ((pcre_ctypes[*p++] & ctype_digit) == 0) return FALSE; -while ((pcre_ctypes[*p] & ctype_digit) != 0) p++; +if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE; +while ((cd->ctypes[*p] & ctype_digit) != 0) p++; return (*p == '}'); } @@ -412,25 +412,27 @@ Arguments: maxp pointer to int for max returned as -1 if no max errorptr points to pointer to error message + cd pointer to character tables block Returns: pointer to '}' on success; current ptr on error, with errorptr set */ static const uschar * -read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr) +read_repeat_counts(const uschar *p, int *minp, int *maxp, + const char **errorptr, compile_data *cd) { int min = 0; int max = -1; -while ((pcre_ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0'; +while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0'; if (*p == '}') max = min; else { if (*(++p) != '}') { max = 0; - while((pcre_ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0'; + while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0'; if (max < min) { *errorptr = ERR4; @@ -615,20 +617,22 @@ for (;;) /* Scan the pattern, compiling it into the code vector.
Arguments: - options the option bits - brackets points to number of brackets used - code points to the pointer to the current code point - ptrptr points to the current pattern pointer - errorptr points to pointer to error message - optchanged set to the value of the last OP_OPT item compiled - -Returns: TRUE on success - FALSE, with *errorptr set on error + options the option bits + brackets points to number of brackets used + code points to the pointer to the current code point + ptrptr points to the current pattern pointer + errorptr points to pointer to error message + optchanged set to the value of the last OP_OPT item compiled + cd contains pointers to tables + +Returns: TRUE on success + FALSE, with *errorptr set on error */ static BOOL compile_branch(int options, int *brackets, uschar **codeptr, - const uschar **ptrptr, const char **errorptr, int *optchanged) + const uschar **ptrptr, const char **errorptr, int *optchanged, + compile_data *cd) { int repeat_type, op_type; int repeat_min, repeat_max; @@ -660,7 +664,7 @@ for (;; ptr++) c = *ptr; if ((options & PCRE_EXTENDED) != 0) { - if ((pcre_ctypes[c] & ctype_space) != 0) continue; + if ((cd->ctypes[c] & ctype_space) != 0) continue; if (c == '#') { while ((c = *(++ptr)) != 0 && c != '\n'); @@ -748,37 +752,38 @@ for (;; ptr++) if (c == '\\') { - c = check_escape(&ptr, errorptr, *brackets, options, TRUE); + c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd); if (-c == ESC_b) c = '\b'; else if (c < 0) { + register const uschar *cbits = cd->cbits; class_charcount = 10; switch (-c) { case ESC_d: - for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_digit]; + for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit]; continue; case ESC_D: - for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_digit]; + for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit]; continue; case ESC_w: for (c = 0; c < 32; c++) - class[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]); + class[c] |= (cbits[c+cbit_digit] | 
cbits[c+cbit_word]); continue; case ESC_W: for (c = 0; c < 32; c++) - class[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]); + class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]); continue; case ESC_s: - for (c = 0; c < 32; c++) class[c] |= pcre_cbits[c+cbit_space]; + for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space]; continue; case ESC_S: - for (c = 0; c < 32; c++) class[c] |= ~pcre_cbits[c+cbit_space]; + for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space]; continue; default: @@ -810,7 +815,7 @@ for (;; ptr++) if (d == '\\') { - d = check_escape(&ptr, errorptr, *brackets, options, TRUE); + d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd); if (d < 0) { if (d == -ESC_b) d = '\b'; else @@ -832,7 +837,7 @@ for (;; ptr++) class[c/8] |= (1 << (c&7)); if ((options & PCRE_CASELESS) != 0) { - int uc = pcre_fcc[c]; /* flip case */ + int uc = cd->fcc[c]; /* flip case */ class[uc/8] |= (1 << (uc&7)); } class_charcount++; /* in case a one-char range */ @@ -847,7 +852,7 @@ for (;; ptr++) class [c/8] |= (1 << (c&7)); if ((options & PCRE_CASELESS) != 0) { - c = pcre_fcc[c]; /* flip case */ + c = cd->fcc[c]; /* flip case */ class[c/8] |= (1 << (c&7)); } class_charcount++; @@ -894,8 +899,8 @@ for (;; ptr++) /* Various kinds of repeat */ case '{': - if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR; - ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr); + if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR; + ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd); if (*errorptr != NULL) goto FAILED; goto REPEAT; @@ -1191,7 +1196,7 @@ for (;; ptr++) case '(': bravalue = OP_COND; /* Conditional group */ - if ((pcre_ctypes[*(++ptr)] & ctype_digit) != 0) + if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0) { condref = *ptr - '0'; while (*(++ptr) != ')') condref = condref*10 + *ptr - '0'; @@ -1324,7 +1329,8 @@ for (;; ptr++) errorptr, /* Where to put an error message */ (bravalue == OP_ASSERTBACK || bravalue == 
OP_ASSERTBACK_NOT), /* TRUE if back assert */ - condref)) /* Condition reference number */ + condref, /* Condition reference number */ + cd)) /* Tables block */ goto FAILED; /* At the end of compiling, code is still pointing to the start of the @@ -1372,7 +1378,7 @@ for (;; ptr++) case '\\': tempptr = ptr; - c = check_escape(&ptr, errorptr, *brackets, options, FALSE); + c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd); /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values are arranged to be the negation of the corresponding OP_values. For the @@ -1417,7 +1423,7 @@ for (;; ptr++) { if ((options & PCRE_EXTENDED) != 0) { - if ((pcre_ctypes[c] & ctype_space) != 0) continue; + if ((cd->ctypes[c] & ctype_space) != 0) continue; if (c == '#') { while ((c = *(++ptr)) != 0 && c != '\n'); @@ -1433,7 +1439,7 @@ for (;; ptr++) if (c == '\\') { tempptr = ptr; - c = check_escape(&ptr, errorptr, *brackets, options, FALSE); + c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd); if (c < 0) { ptr = tempptr; break; } } @@ -1445,7 +1451,7 @@ for (;; ptr++) /* This "while" is the end of the "do" above. */ - while (length < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0); + while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0); /* Compute the length and set it in the data vector, and advance to the next state. 
*/ @@ -1490,13 +1496,15 @@ Argument: errorptr -> pointer to error message lookbehind TRUE if this is a lookbehind assertion condref > 0 for OPT_CREF setting at start of conditional group + cd points to the data block with tables pointers Returns: TRUE on success */ static BOOL compile_regex(int options, int optchanged, int *brackets, uschar **codeptr, - const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref) + const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref, + compile_data *cd) { const uschar *ptr = *ptrptr; uschar *code = *codeptr; @@ -1543,7 +1551,7 @@ for (;;) /* Now compile the branch */ - if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged)) + if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd)) { *ptrptr = ptr; return FALSE; @@ -1813,6 +1821,7 @@ Arguments: options various option bits errorptr pointer to pointer to error text erroroffset ptr offset in pattern where error was detected + tables pointer to character tables or NULL Returns: pointer to compiled data block, or NULL on error, with errorptr and erroroffset set @@ -1820,7 +1829,7 @@ Returns: pointer to compiled data block, or NULL on error, pcre * pcre_compile(const char *pattern, int options, const char **errorptr, - int *erroroffset) + int *erroroffset, const unsigned char *tables) { real_pcre *re; int length = 3; /* For initial BRA plus length */ @@ -1833,6 +1842,7 @@ int branch_newextra; unsigned int brastackptr = 0; uschar *code; const uschar *ptr; +compile_data compile_block; int brastack[BRASTACK_SIZE]; uschar bralenstack[BRASTACK_SIZE]; @@ -1861,6 +1871,16 @@ if ((options & ~PUBLIC_OPTIONS) != 0) return NULL; } +/* Set up pointers to the individual character tables */ + +if (tables == NULL) tables = pcre_default_tables; +compile_block.lcc = tables + lcc_offset; +compile_block.fcc = tables + fcc_offset; +compile_block.cbits = tables + cbits_offset; +compile_block.ctypes = tables + ctypes_offset; + +/* 
Reflect pattern for debugging output */ + DPRINTF(("------------------------------------------------------------------\n")); DPRINTF(("%s\n", pattern)); @@ -1879,7 +1899,7 @@ while ((c = *(++ptr)) != 0) if ((options & PCRE_EXTENDED) != 0) { - if ((pcre_ctypes[c] & ctype_space) != 0) continue; + if ((compile_block.ctypes[c] & ctype_space) != 0) continue; if (c == '#') { while ((c = *(++ptr)) != 0 && c != '\n'); @@ -1897,7 +1917,7 @@ while ((c = *(++ptr)) != 0) case '\\': { const uschar *save_ptr = ptr; - c = check_escape(&ptr, errorptr, bracount, options, FALSE); + c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block); if (*errorptr != NULL) goto PCRE_ERROR_RETURN; if (c >= 0) { @@ -1917,9 +1937,9 @@ while ((c = *(++ptr)) != 0) int refnum = -c - ESC_REF; if (refnum > top_backref) top_backref = refnum; length++; /* For single back reference */ - if (ptr[1] == '{' && is_counted_repeat(ptr+2)) + if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block)) { - ptr = read_repeat_counts(ptr+2, &min, &max, errorptr); + ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block); if (*errorptr != NULL) goto PCRE_ERROR_RETURN; if ((min == 0 && (max == 1 || max == -1)) || (min == 1 && max == -1)) @@ -1943,8 +1963,8 @@ while ((c = *(++ptr)) != 0) or back reference. 
*/ case '{': - if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR; - ptr = read_repeat_counts(ptr+1, &min, &max, errorptr); + if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR; + ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block); if (*errorptr != NULL) goto PCRE_ERROR_RETURN; if ((min == 0 && (max == 1 || max == -1)) || (min == 1 && max == -1)) @@ -1979,7 +1999,8 @@ while ((c = *(++ptr)) != 0) { if (*ptr == '\\') { - int ch = check_escape(&ptr, errorptr, bracount, options, TRUE); + int ch = check_escape(&ptr, errorptr, bracount, options, TRUE, + &compile_block); if (*errorptr != NULL) goto PCRE_ERROR_RETURN; if (-ch == ESC_b) class_charcount++; else class_charcount = 10; } @@ -1996,9 +2017,9 @@ while ((c = *(++ptr)) != 0) /* A repeat needs either 1 or 5 bytes. */ - if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2)) + if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block)) { - ptr = read_repeat_counts(ptr+2, &min, &max, errorptr); + ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block); if (*errorptr != NULL) goto PCRE_ERROR_RETURN; if ((min == 0 && (max == 1 || max == -1)) || (min == 1 && max == -1)) @@ -2064,11 +2085,11 @@ while ((c = *(++ptr)) != 0) group. */ case '(': - if ((pcre_ctypes[ptr[3]] & ctype_digit) != 0) + if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0) { ptr += 4; length += 2; - while ((pcre_ctypes[*ptr] & ctype_digit) != 0) ptr++; + while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++; if (*ptr != ')') { *errorptr = ERR26; @@ -2237,9 +2258,10 @@ while ((c = *(++ptr)) != 0) /* Leave ptr at the final char; for read_repeat_counts this happens automatically; for the others we need an increment. 
*/ - if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2)) + if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block)) { - ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr); + ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr, + &compile_block); if (*errorptr != NULL) goto PCRE_ERROR_RETURN; } else if (c == '*') { minval = 0; maxval = -1; ptr++; } @@ -2270,7 +2292,7 @@ while ((c = *(++ptr)) != 0) { if ((options & PCRE_EXTENDED) != 0) { - if ((pcre_ctypes[c] & ctype_space) != 0) continue; + if ((compile_block.ctypes[c] & ctype_space) != 0) continue; if (c == '#') { while ((c = *(++ptr)) != 0 && c != '\n'); @@ -2284,7 +2306,8 @@ while ((c = *(++ptr)) != 0) if (c == '\\') { const uschar *saveptr = ptr; - c = check_escape(&ptr, errorptr, bracount, options, FALSE); + c = check_escape(&ptr, errorptr, bracount, options, FALSE, + &compile_block); if (*errorptr != NULL) goto PCRE_ERROR_RETURN; if (c < 0) { ptr = saveptr; break; } } @@ -2296,7 +2319,8 @@ while ((c = *(++ptr)) != 0) /* This "while" is the end of the "do" above. */ - while (runlength < 255 && (pcre_ctypes[c = *(++ptr)] & ctype_meta) == 0); + while (runlength < 255 && + (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0); ptr--; length += runlength; @@ -2331,6 +2355,7 @@ if (re == NULL) re->magic_number = MAGIC_NUMBER; re->options = options; +re->tables = tables; /* Set up a starting, non-extracting bracket, then compile the expression. 
On error, *errorptr will be set non-NULL, so we don't need to look at the result @@ -2340,7 +2365,8 @@ ptr = (const uschar *)pattern; code = re->code; *code = OP_BRA; bracount = 0; -(void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1); +(void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1, + &compile_block); re->top_bracket = bracount; re->top_backref = top_backref; @@ -2637,46 +2663,6 @@ return (pcre *)re; /************************************************* -* Match a character type * -*************************************************/ - -/* Not used in all the places it might be as it's sometimes faster -to put the code inline. - -Arguments: - type the character type - c the character - dotall the dotall flag - -Returns: TRUE if character is of the type -*/ - -static BOOL -match_type(int type, int c, BOOL dotall) -{ - -#ifdef DEBUG -if (isprint(c)) printf("matching subject %c against ", c); - else printf("matching subject \\x%02x against ", c); -printf("%s\n", OP_names[type]); -#endif - -switch(type) - { - case OP_ANY: return dotall || c != '\n'; - case OP_NOT_DIGIT: return (pcre_ctypes[c] & ctype_digit) == 0; - case OP_DIGIT: return (pcre_ctypes[c] & ctype_digit) != 0; - case OP_NOT_WHITESPACE: return (pcre_ctypes[c] & ctype_space) == 0; - case OP_WHITESPACE: return (pcre_ctypes[c] & ctype_space) != 0; - case OP_NOT_WORDCHAR: return (pcre_ctypes[c] & ctype_word) == 0; - case OP_WORDCHAR: return (pcre_ctypes[c] & ctype_word) != 0; - } -return FALSE; -} - - - -/************************************************* * Match a back-reference * *************************************************/ @@ -2719,7 +2705,10 @@ if (length > md->end_subject - eptr) return FALSE; /* Separate the caselesss case for speed */ if ((ims & PCRE_CASELESS) != 0) - { while (length-- > 0) if (pcre_lcc[*p++] != pcre_lcc[*eptr++]) return FALSE; } + { + while (length-- > 0) + if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; + } else { while 
(length-- > 0) if (*p++ != *eptr++) return FALSE; } @@ -3172,9 +3161,9 @@ for (;;) case OP_WORD_BOUNDARY: { BOOL prev_is_word = (eptr != md->start_subject) && - ((pcre_ctypes[eptr[-1]] & ctype_word) != 0); + ((md->ctypes[eptr[-1]] & ctype_word) != 0); BOOL cur_is_word = (eptr < md->end_subject) && - ((pcre_ctypes[*eptr] & ctype_word) != 0); + ((md->ctypes[*eptr] & ctype_word) != 0); if ((*ecode++ == OP_WORD_BOUNDARY)? cur_is_word == prev_is_word : cur_is_word != prev_is_word) return FALSE; @@ -3191,37 +3180,43 @@ for (;;) break; case OP_NOT_DIGIT: - if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) != 0) + if (eptr >= md->end_subject || + (md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE; ecode++; break; case OP_DIGIT: - if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_digit) == 0) + if (eptr >= md->end_subject || + (md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE; ecode++; break; case OP_NOT_WHITESPACE: - if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) != 0) + if (eptr >= md->end_subject || + (md->ctypes[*eptr++] & ctype_space) != 0) return FALSE; ecode++; break; case OP_WHITESPACE: - if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_space) == 0) + if (eptr >= md->end_subject || + (md->ctypes[*eptr++] & ctype_space) == 0) return FALSE; ecode++; break; case OP_NOT_WORDCHAR: - if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) != 0) + if (eptr >= md->end_subject || + (md->ctypes[*eptr++] & ctype_word) != 0) return FALSE; ecode++; break; case OP_WORDCHAR: - if (eptr >= md->end_subject || (pcre_ctypes[*eptr++] & ctype_word) == 0) + if (eptr >= md->end_subject || + (md->ctypes[*eptr++] & ctype_word) == 0) return FALSE; ecode++; break; @@ -3453,7 +3448,9 @@ for (;;) if (length > md->end_subject - eptr) return FALSE; if ((ims & PCRE_CASELESS) != 0) { - while (length-- > 0) if (pcre_lcc[*ecode++] != pcre_lcc[*eptr++]) return FALSE; + while (length-- > 0) + if (md->lcc[*ecode++] != 
md->lcc[*eptr++]) + return FALSE; } else { @@ -3510,8 +3507,9 @@ for (;;) if ((ims & PCRE_CASELESS) != 0) { - c = pcre_lcc[c]; - for (i = 1; i <= min; i++) if (c != pcre_lcc[*eptr++]) return FALSE; + c = md->lcc[c]; + for (i = 1; i <= min; i++) + if (c != md->lcc[*eptr++]) return FALSE; if (min == max) continue; if (minimize) { @@ -3519,7 +3517,8 @@ for (;;) { if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE; - if (i >= max || eptr >= md->end_subject || c != pcre_lcc[*eptr++]) + if (i >= max || eptr >= md->end_subject || + c != md->lcc[*eptr++]) return FALSE; } /* Control never gets here */ @@ -3529,7 +3528,7 @@ for (;;) const uschar *pp = eptr; for (i = min; i < max; i++) { - if (eptr >= md->end_subject || c != pcre_lcc[*eptr]) break; + if (eptr >= md->end_subject || c != md->lcc[*eptr]) break; eptr++; } while (eptr >= pp) @@ -3579,7 +3578,7 @@ for (;;) ecode++; if ((ims & PCRE_CASELESS) != 0) { - if (pcre_lcc[*ecode++] == pcre_lcc[*eptr++]) return FALSE; + if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE; } else { @@ -3639,8 +3638,9 @@ for (;;) if ((ims & PCRE_CASELESS) != 0) { - c = pcre_lcc[c]; - for (i = 1; i <= min; i++) if (c == pcre_lcc[*eptr++]) return FALSE; + c = md->lcc[c]; + for (i = 1; i <= min; i++) + if (c == md->lcc[*eptr++]) return FALSE; if (min == max) continue; if (minimize) { @@ -3648,7 +3648,8 @@ for (;;) { if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE; - if (i >= max || eptr >= md->end_subject || c == pcre_lcc[*eptr++]) + if (i >= max || eptr >= md->end_subject || + c == md->lcc[*eptr++]) return FALSE; } /* Control never gets here */ @@ -3658,7 +3659,7 @@ for (;;) const uschar *pp = eptr; for (i = min; i < max; i++) { - if (eptr >= md->end_subject || c == pcre_lcc[*eptr]) break; + if (eptr >= md->end_subject || c == md->lcc[*eptr]) break; eptr++; } while (eptr >= pp) @@ -3752,32 +3753,34 @@ for (;;) case OP_NOT_DIGIT: for (i = 1; i <= min; i++) - if ((pcre_ctypes[*eptr++] & ctype_digit) 
!= 0) return FALSE; + if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE; break; case OP_DIGIT: for (i = 1; i <= min; i++) - if ((pcre_ctypes[*eptr++] & ctype_digit) == 0) return FALSE; + if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE; break; case OP_NOT_WHITESPACE: for (i = 1; i <= min; i++) - if ((pcre_ctypes[*eptr++] & ctype_space) != 0) return FALSE; + if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE; break; case OP_WHITESPACE: for (i = 1; i <= min; i++) - if ((pcre_ctypes[*eptr++] & ctype_space) == 0) return FALSE; + if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE; break; case OP_NOT_WORDCHAR: - for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) != 0) - return FALSE; + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_word) != 0) + return FALSE; break; case OP_WORDCHAR: - for (i = 1; i <= min; i++) if ((pcre_ctypes[*eptr++] & ctype_word) == 0) - return FALSE; + for (i = 1; i <= min; i++) + if ((md->ctypes[*eptr++] & ctype_word) == 0) + return FALSE; break; } @@ -3786,16 +3789,46 @@ for (;;) if (min == max) continue; /* If minimizing, we have to test the rest of the pattern before each - subsequent match, so inlining isn't much help; just use the function. */ + subsequent match. 
*/ if (minimize) { for (i = min;; i++) { if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE; - if (i >= max || eptr >= md->end_subject || - !match_type(ctype, *eptr++, (ims & PCRE_DOTALL) != 0)) - return FALSE; + if (i >= max || eptr >= md->end_subject) return FALSE; + + c = *eptr++; + switch(ctype) + { + case OP_ANY: + if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE; + break; + + case OP_NOT_DIGIT: + if ((md->ctypes[c] & ctype_digit) != 0) return FALSE; + break; + + case OP_DIGIT: + if ((md->ctypes[c] & ctype_digit) == 0) return FALSE; + break; + + case OP_NOT_WHITESPACE: + if ((md->ctypes[c] & ctype_space) != 0) return FALSE; + break; + + case OP_WHITESPACE: + if ((md->ctypes[c] & ctype_space) == 0) return FALSE; + break; + + case OP_NOT_WORDCHAR: + if ((md->ctypes[c] & ctype_word) != 0) return FALSE; + break; + + case OP_WORDCHAR: + if ((md->ctypes[c] & ctype_word) == 0) return FALSE; + break; + } } /* Control never gets here */ } @@ -3828,7 +3861,7 @@ for (;;) case OP_NOT_DIGIT: for (i = min; i < max; i++) { - if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) != 0) + if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0) break; eptr++; } @@ -3837,7 +3870,7 @@ for (;;) case OP_DIGIT: for (i = min; i < max; i++) { - if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_digit) == 0) + if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0) break; eptr++; } @@ -3846,7 +3879,7 @@ for (;;) case OP_NOT_WHITESPACE: for (i = min; i < max; i++) { - if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) != 0) + if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0) break; eptr++; } @@ -3855,7 +3888,7 @@ for (;;) case OP_WHITESPACE: for (i = min; i < max; i++) { - if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_space) == 0) + if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0) break; eptr++; } @@ -3864,7 +3897,7 @@ for (;;) case 
OP_NOT_WORDCHAR: for (i = min; i < max; i++) { - if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) != 0) + if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0) break; eptr++; } @@ -3873,7 +3906,7 @@ for (;;) case OP_WORDCHAR: for (i = min; i < max; i++) { - if (eptr >= md->end_subject || (pcre_ctypes[*eptr] & ctype_word) == 0) + if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0) break; eptr++; } @@ -3963,6 +3996,9 @@ match_block.noteol = (options & PCRE_NOTEOL) != 0; match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */ +match_block.lcc = re->tables + lcc_offset; +match_block.ctypes = re->tables + ctypes_offset; + /* The ims options can vary during the matching as a result of the presence of (?ims) items in the pattern. They are kept in a local variable so that restoring at the exit of a group is easy. */ @@ -4008,7 +4044,7 @@ if (!anchored) if ((re->options & PCRE_FIRSTSET) != 0) { first_char = re->first_char; - if ((ims & PCRE_CASELESS) != 0) first_char = pcre_lcc[first_char]; + if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char]; } else if (!startline && extra != NULL && @@ -4033,7 +4069,8 @@ do if (first_char >= 0) { if ((ims & PCRE_CASELESS) != 0) - while (start_match < end_subject && pcre_lcc[*start_match] != first_char) + while (start_match < end_subject && + match_block.lcc[*start_match] != first_char) start_match++; else while (start_match < end_subject && *start_match != first_char) @@ -4106,6 +4143,9 @@ do DPRINTF((">>>> returning %d\n", rc)); return rc; } + +/* This "while" is the end of the "do" above */ + while (!anchored && match_block.errorcode == PCRE_ERROR_NOMATCH && start_match++ < end_subject); @@ -54,9 +54,11 @@ extern void (*pcre_free)(void *); /* Functions */ -extern pcre *pcre_compile(const char *, int, const char **, int *); +extern pcre *pcre_compile(const char *, int, const char **, int *, + const unsigned char *); extern int pcre_exec(const pcre *, const 
pcre_extra *, const char *, int, int, int *, int); +extern unsigned const char *pcre_maketables(void); extern int pcre_info(const pcre *, int *, int *); extern pcre_extra *pcre_study(const pcre *, int, const char **); extern const char *pcre_version(void); diff --git a/pcreposix.c b/pcreposix.c index 27c810e..63b99ef 100644 --- a/pcreposix.c +++ b/pcreposix.c @@ -191,7 +191,7 @@ int options = 0; if ((cflags & REG_ICASE) != 0) options |= PCRE_CASELESS; if ((cflags & REG_NEWLINE) != 0) options |= PCRE_MULTILINE; -preg->re_pcre = pcre_compile(pattern, options, &errorptr, &erroffset); +preg->re_pcre = pcre_compile(pattern, options, &errorptr, &erroffset, NULL); preg->re_erroffset = erroffset; if (preg->re_pcre == NULL) return pcre_posix_error_code(errorptr); @@ -7,6 +7,7 @@ #include <string.h> #include <stdlib.h> #include <time.h> +#include <locale.h> /* Use the internal info for displaying the results of pcre_study(). */ @@ -313,6 +314,12 @@ while (argc > 1 && argv[op][0] == '-') else { printf("*** Unknown option %s\n", argv[op]); + printf("Usage: pcretest [-d] [-i] [-p] [-s] [-t] [<input> [<output>]]\n"); + printf(" -d debug: show compiled code; implies -i\n" + " -i show information about compiled pattern\n" + " -p use POSIX interface\n" + " -s output store information\n" + " -t time compilation and execution\n"); return 1; } op++; @@ -357,9 +364,11 @@ while (!done) pcre_extra *extra = NULL; regex_t preg; const char *error; - unsigned char *p, *pp; + unsigned char *p, *pp, *ppp; + unsigned const char *tables = NULL; int do_study = 0; - int do_debug = 0; + int do_debug = debug; + int do_showinfo = showinfo; int do_posix = 0; int erroroffset, len, delimiter; @@ -422,13 +431,29 @@ while (!done) case 'm': options |= PCRE_MULTILINE; break; case 's': options |= PCRE_DOTALL; break; case 'x': options |= PCRE_EXTENDED; break; + case 'A': options |= PCRE_ANCHORED; break; - case 'D': do_debug = 1; break; + case 'D': do_debug = do_showinfo = 1; break; case 'E': options |= 
PCRE_DOLLAR_ENDONLY; break; + case 'I': do_showinfo = 1; break; case 'P': do_posix = 1; break; case 'S': do_study = 1; break; case 'U': options |= PCRE_UNGREEDY; break; case 'X': options |= PCRE_EXTRA; break; + + case 'L': + ppp = pp; + while (*ppp != '\n' && *ppp != ' ') ppp++; + *ppp = 0; + if (setlocale(LC_CTYPE, (const char *)pp) == NULL) + { + fprintf(outfile, "** Failed to set locale \"%s\"\n", pp); + goto SKIP_DATA; + } + tables = pcre_maketables(); + pp = ppp; + break; + case '\n': case ' ': break; default: fprintf(outfile, "** Unknown option '%c'\n", pp[-1]); @@ -437,7 +462,8 @@ while (!done) } /* Handle compiling via the POSIX interface, which doesn't support the - timing, showing, or debugging options. */ + timing, showing, or debugging options, nor the ability to pass over + local character tables. */ if (posix || do_posix) { @@ -469,7 +495,7 @@ while (!done) clock_t start_time = clock(); for (i = 0; i < LOOPREPEAT; i++) { - re = pcre_compile((char *)p, options, &error, &erroroffset); + re = pcre_compile((char *)p, options, &error, &erroroffset, tables); if (re != NULL) free(re); } time_taken = clock() - start_time; @@ -477,7 +503,7 @@ while (!done) ((double)time_taken)/(4 * CLOCKS_PER_SEC)); } - re = pcre_compile((char *)p, options, &error, &erroroffset); + re = pcre_compile((char *)p, options, &error, &erroroffset, tables); /* Compilation failed; go back for another re, skipping to blank line if non-interactive. */ @@ -501,16 +527,16 @@ while (!done) } fprintf(outfile, "\n"); } - continue; + goto CONTINUE; } /* Compilation succeeded; print data if required */ - if (showinfo || do_debug) + if (do_showinfo) { int first_char, count; - if (debug || do_debug) print_internals(re, outfile); + if (do_debug) print_internals(re, outfile); count = pcre_info(re, &options, &first_char); if (count < 0) fprintf(outfile, @@ -573,7 +599,7 @@ while (!done) /* This looks at internal information. A bit kludgy to do it this way, but it is useful for testing. 
*/ - else if (showinfo || do_debug) + else if (do_showinfo) { real_pcre_extra *xx = (real_pcre_extra *)extra; if ((xx->options & PCRE_STUDY_MAPPED) == 0) @@ -784,6 +810,11 @@ while (!done) if (posix || do_posix) regfree(&preg); if (re != NULL) free(re); if (extra != NULL) free(extra); + if (tables != NULL) + { + free((void *)tables); + setlocale(LC_CTYPE, "C"); + } } fprintf(outfile, "\n"); @@ -172,7 +172,7 @@ if (i >= argc) return usage(0); /* Compile the regular expression. */ -pattern = pcre_compile(argv[i++], options, &error, &errptr); +pattern = pcre_compile(argv[i++], options, &error, &errptr, NULL); if (pattern == NULL) { fprintf(stderr, "pgrep: error in regex at offset %d: %s\n", errptr, error); @@ -47,16 +47,17 @@ Arguments: start_bits points to the bit map c is the character caseless the caseless flag + cd the block with char table pointers Returns: nothing */ static void -set_bit(uschar *start_bits, int c, BOOL caseless) +set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd) { start_bits[c/8] |= (1 << (c&7)); -if (caseless && (pcre_ctypes[c] & ctype_letter) != 0) - start_bits[pcre_fcc[c]/8] |= (1 << (pcre_fcc[c]&7)); +if (caseless && (cd->ctypes[c] & ctype_letter) != 0) + start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7)); } @@ -73,12 +74,14 @@ Arguments: code points to an expression start_bits points to a 32-byte table, initialized to 0 caseless the current state of the caseless flag + cd the block with char table pointers Returns: TRUE if table built, FALSE otherwise */ static BOOL -set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless) +set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, + compile_data *cd) { register int c; @@ -96,7 +99,8 @@ do if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT) { - if (!set_start_bits(tcode, start_bits, caseless)) return FALSE; + if (!set_start_bits(tcode, start_bits, caseless, cd)) + return FALSE; } else switch(*tcode) @@ -126,7 +130,8 @@ do case OP_BRAZERO: case 
OP_BRAMINZERO: - if (!set_start_bits(++tcode, start_bits, caseless)) return FALSE; + if (!set_start_bits(++tcode, start_bits, caseless, cd)) + return FALSE; do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT); tcode += 3; try_next = TRUE; @@ -138,7 +143,7 @@ do case OP_MINSTAR: case OP_QUERY: case OP_MINQUERY: - set_bit(start_bits, tcode[1], caseless); + set_bit(start_bits, tcode[1], caseless, cd); tcode += 2; try_next = TRUE; break; @@ -147,7 +152,7 @@ do case OP_UPTO: case OP_MINUPTO: - set_bit(start_bits, tcode[3], caseless); + set_bit(start_bits, tcode[3], caseless, cd); tcode += 4; try_next = TRUE; break; @@ -162,35 +167,39 @@ do case OP_PLUS: case OP_MINPLUS: - set_bit(start_bits, tcode[1], caseless); + set_bit(start_bits, tcode[1], caseless, cd); break; /* Single character type sets the bits and stops */ case OP_NOT_DIGIT: - for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit]; + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_digit]; break; case OP_DIGIT: - for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit]; + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_digit]; break; case OP_NOT_WHITESPACE: - for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space]; + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_space]; break; case OP_WHITESPACE: - for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space]; + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_space]; break; case OP_NOT_WORDCHAR: for (c = 0; c < 32; c++) - start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]); + start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]); break; case OP_WORDCHAR: for (c = 0; c < 32; c++) - start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]); + start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]); break; /* One or more character type fudges the pointer and restarts, knowing @@ -221,29 +230,33 @@ do switch(tcode[1]) { case OP_NOT_DIGIT: - for 
(c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_digit]; + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_digit]; break; case OP_DIGIT: - for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_digit]; + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_digit]; break; case OP_NOT_WHITESPACE: - for (c = 0; c < 32; c++) start_bits[c] |= ~pcre_cbits[c+cbit_space]; + for (c = 0; c < 32; c++) + start_bits[c] |= ~cd->cbits[c+cbit_space]; break; case OP_WHITESPACE: - for (c = 0; c < 32; c++) start_bits[c] |= pcre_cbits[c+cbit_space]; + for (c = 0; c < 32; c++) + start_bits[c] |= cd->cbits[c+cbit_space]; break; case OP_NOT_WORDCHAR: for (c = 0; c < 32; c++) - start_bits[c] |= ~(pcre_cbits[c] | pcre_cbits[c+cbit_word]); + start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]); break; case OP_WORDCHAR: for (c = 0; c < 32; c++) - start_bits[c] |= (pcre_cbits[c] | pcre_cbits[c+cbit_word]); + start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]); break; } @@ -316,6 +329,7 @@ pcre_study(const pcre *external_re, int options, const char **errorptr) uschar start_bits[32]; real_pcre_extra *extra; const real_pcre *re = (const real_pcre *)external_re; +compile_data compile_block; *errorptr = NULL; @@ -338,11 +352,18 @@ present. */ if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0) return NULL; +/* Set the character tables in the block which is passed around */ + +compile_block.lcc = re->tables + lcc_offset; +compile_block.fcc = re->tables + fcc_offset; +compile_block.cbits = re->tables + cbits_offset; +compile_block.ctypes = re->tables + ctypes_offset; + /* See if we can find a fixed set of initial characters for the pattern. 
*/ memset(start_bits, 0, 32 * sizeof(uschar)); -if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0)) - return NULL; +if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0, + &compile_block)) return NULL; /* Get an "extra" block and put the information therein. */ diff --git a/testinput4 b/testinput4 new file mode 100644 index 0000000..c23b52a --- /dev/null +++ b/testinput4 @@ -0,0 +1,64 @@ +/^[\w]+/ + *** Failers + École + +/^[\w]+/Lfr + École + +/^[\w]+/ + *** Failers + École + +/^[\W]+/ + École + +/^[\W]+/Lfr + *** Failers + École + +/[\b]/ + \b + *** Failers + a + +/[\b]/Lfr + \b + *** Failers + a + +/^\w+/ + *** Failers + École + +/^\w+/Lfr + École + +/(.+)\b(.+)/ + École + +/(.+)\b(.+)/Lfr + *** Failers + École + +/École/i + École + *** Failers + école + +/École/iLfr + École + école + +/\w/IS + +/\w/ISLfr + +/^[\xc8-\xc9]/iLfr + École + école + +/^[\xc8-\xc9]/Lfr + École + *** Failers + école + @@ -1,4 +1,4 @@ -PCRE version 2.00 24-Sep-1998 +PCRE version 2.01 21-Oct-1998 /the quick brown fox/ the quick brown fox diff --git a/testoutput2 b/testoutput2 index 5625359..c2e0148 100644 --- a/testoutput2 +++ b/testoutput2 @@ -1,4 +1,4 @@ -PCRE version 2.00 24-Sep-1998 +PCRE version 2.01 21-Oct-1998 /(a)b|/ Identifying subpattern count = 1 diff --git a/testoutput3 b/testoutput3 index 3b7b158..a5d960c 100644 --- a/testoutput3 +++ b/testoutput3 @@ -1,4 +1,4 @@ -PCRE version 2.00 24-Sep-1998 +PCRE version 2.01 21-Oct-1998 /(?<!bar)foo/ foo diff --git a/testoutput4 b/testoutput4 new file mode 100644 index 0000000..9848f5a --- /dev/null +++ b/testoutput4 @@ -0,0 +1,113 @@ +PCRE version 2.01 21-Oct-1998 + +/^[\w]+/ + *** Failers +No match + École +No match + +/^[\w]+/Lfr + École + 0: École + +/^[\w]+/ + *** Failers +No match + École +No match + +/^[\W]+/ + École + 0: \xc9 + +/^[\W]+/Lfr + *** Failers + 0: *** + École +No match + +/[\b]/ + \b + 0: \x08 + *** Failers +No match + a +No match + +/[\b]/Lfr + \b + 0: \x08 + *** 
Failers +No match + a +No match + +/^\w+/ + *** Failers +No match + École +No match + +/^\w+/Lfr + École + 0: École + +/(.+)\b(.+)/ + École + 0: \xc9cole + 1: \xc9 + 2: cole + +/(.+)\b(.+)/Lfr + *** Failers + 0: *** Failers + 1: *** + 2: Failers + École +No match + +/École/i + École + 0: \xc9cole + *** Failers +No match + école +No match + +/École/iLfr + École + 0: École + école + 0: école + +/\w/IS +Identifying subpattern count = 0 +No options +No first char +Starting character set: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P + Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z + +/\w/ISLfr +Identifying subpattern count = 0 +No options +No first char +Starting character set: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P + Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z + À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å + æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ø ù ú û ü ý þ ÿ + +/^[\xc8-\xc9]/iLfr + École + 0: É + école + 0: é + +/^[\xc8-\xc9]/Lfr + École + 0: É + *** Failers +No match + école +No match + + |