summaryrefslogtreecommitdiff
path: root/pcre_study.c
diff options
context:
space:
mode:
authorph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2011-12-28 17:16:11 +0000
committerph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>2011-12-28 17:16:11 +0000
commit5e8b286b566d7ec502b80892e5b709025631d58c (patch)
treec8a6c83ad13e79a9b64718b784fe0126764d98b7 /pcre_study.c
parenta29cc4dc66d82b59de7616c53517c58271e6e0e8 (diff)
downloadpcre-5e8b286b566d7ec502b80892e5b709025631d58c.tar.gz
Merging all the changes from the pcre16 branch into the trunk.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@836 2f5784b3-3f2a-0410-8824-cb99058d5e15
Diffstat (limited to 'pcre_study.c')
-rw-r--r--pcre_study.c328
1 files changed, 236 insertions, 92 deletions
diff --git a/pcre_study.c b/pcre_study.c
index 9da92bf..cc7ba8b 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
- Copyright (c) 1997-2010 University of Cambridge
+ Copyright (c) 1997-2012 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -78,17 +78,18 @@ Returns: the minimum length
*/
static int
-find_minlength(const uschar *code, const uschar *startcode, int options,
+find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
int recurse_depth)
{
int length = -1;
-BOOL utf8 = (options & PCRE_UTF8) != 0;
+/* PCRE_UTF16 has the same value as PCRE_UTF8. */
+BOOL utf = (options & PCRE_UTF8) != 0;
BOOL had_recurse = FALSE;
register int branchlength = 0;
-register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
+register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
if (*code == OP_CBRA || *code == OP_SCBRA ||
- *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += 2;
+ *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
/* Scan along the opcodes for this branch. If we get to the end of the
branch, check the length against that of the other branches. */
@@ -96,7 +97,7 @@ branch, check the length against that of the other branches. */
for (;;)
{
int d, min;
- uschar *cs, *ce;
+ pcre_uchar *cs, *ce;
register int op = *cc;
switch (op)
@@ -189,7 +190,7 @@ for (;;)
case OP_DOLLM:
case OP_NOT_WORD_BOUNDARY:
case OP_WORD_BOUNDARY:
- cc += _pcre_OP_lengths[*cc];
+ cc += PRIV(OP_lengths)[*cc];
break;
/* Skip over a subpattern that has a {0} or {0,x} quantifier */
@@ -198,7 +199,7 @@ for (;;)
case OP_BRAMINZERO:
case OP_BRAPOSZERO:
case OP_SKIPZERO:
- cc += _pcre_OP_lengths[*cc];
+ cc += PRIV(OP_lengths)[*cc];
do cc += GET(cc, 1); while (*cc == OP_ALT);
cc += 1 + LINK_SIZE;
break;
@@ -223,8 +224,8 @@ for (;;)
case OP_NOTPOSPLUSI:
branchlength++;
cc += 2;
-#ifdef SUPPORT_UTF8
- if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
+#ifdef SUPPORT_UTF
+ if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@@ -243,15 +244,16 @@ for (;;)
case OP_NOTEXACT:
case OP_NOTEXACTI:
branchlength += GET2(cc,1);
- cc += 4;
-#ifdef SUPPORT_UTF8
- if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
+ cc += 2 + IMM2_SIZE;
+#ifdef SUPPORT_UTF
+ if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
case OP_TYPEEXACT:
branchlength += GET2(cc,1);
- cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
+ cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
+ || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
break;
/* Handle single-char non-literal matchers */
@@ -286,13 +288,13 @@ for (;;)
cc++;
break;
- /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
- non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
+ /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
+ non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
appear, but leave the code, just in case.) */
case OP_ANYBYTE:
-#ifdef SUPPORT_UTF8
- if (utf8) return -1;
+#ifdef SUPPORT_UTF
+ if (utf) return -1;
#endif
branchlength++;
cc++;
@@ -308,27 +310,28 @@ for (;;)
case OP_TYPEPOSSTAR:
case OP_TYPEPOSQUERY:
if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
- cc += _pcre_OP_lengths[op];
+ cc += PRIV(OP_lengths)[op];
break;
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
case OP_TYPEPOSUPTO:
- if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
- cc += _pcre_OP_lengths[op];
+ if (cc[1 + IMM2_SIZE] == OP_PROP
+ || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
+ cc += PRIV(OP_lengths)[op];
break;
/* Check a class for variable quantification */
-#ifdef SUPPORT_UTF8
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
case OP_XCLASS:
- cc += GET(cc, 1) - 33;
+ cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
/* Fall through */
#endif
case OP_CLASS:
case OP_NCLASS:
- cc += 33;
+ cc += PRIV(OP_lengths)[OP_CLASS];
switch (*cc)
{
@@ -347,7 +350,7 @@ for (;;)
case OP_CRRANGE:
case OP_CRMINRANGE:
branchlength += GET2(cc,1);
- cc += 5;
+ cc += 1 + 2 * IMM2_SIZE;
break;
default:
@@ -372,7 +375,7 @@ for (;;)
case OP_REFI:
if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
{
- ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
+ ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
if (cs == NULL) return -2;
do ce += GET(ce, 1); while (*ce == OP_ALT);
if (cc > cs && cc < ce)
@@ -386,7 +389,7 @@ for (;;)
}
}
else d = 0;
- cc += 3;
+ cc += 1 + IMM2_SIZE;
/* Handle repeated back references */
@@ -409,7 +412,7 @@ for (;;)
case OP_CRRANGE:
case OP_CRMINRANGE:
min = GET2(cc, 1);
- cc += 5;
+ cc += 1 + 2 * IMM2_SIZE;
break;
default:
@@ -424,7 +427,7 @@ for (;;)
caught by a recursion depth count. */
case OP_RECURSE:
- cs = ce = (uschar *)startcode + GET(cc, 1);
+ cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
do ce += GET(ce, 1); while (*ce == OP_ALT);
if ((cc > cs && cc < ce) || recurse_depth > 10)
had_recurse = TRUE;
@@ -482,9 +485,9 @@ for (;;)
case OP_NOTPOSQUERY:
case OP_NOTPOSQUERYI:
- cc += _pcre_OP_lengths[op];
-#ifdef SUPPORT_UTF8
- if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
+ cc += PRIV(OP_lengths)[op];
+#ifdef SUPPORT_UTF
+ if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@@ -494,7 +497,7 @@ for (;;)
case OP_PRUNE_ARG:
case OP_SKIP_ARG:
case OP_THEN_ARG:
- cc += _pcre_OP_lengths[op] + cc[1];
+ cc += PRIV(OP_lengths)[op] + cc[1];
break;
/* The remaining opcodes are just skipped over. */
@@ -506,7 +509,7 @@ for (;;)
case OP_SET_SOM:
case OP_SKIP:
case OP_THEN:
- cc += _pcre_OP_lengths[op];
+ cc += PRIV(OP_lengths)[op];
break;
/* This should not occur: we list all opcodes explicitly so that when
@@ -535,29 +538,30 @@ Arguments:
p points to the character
caseless the caseless flag
cd the block with char table pointers
- utf8 TRUE for UTF-8 mode
+ utf TRUE for UTF-8 / UTF-16 mode
Returns: pointer after the character
*/
-static const uschar *
-set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
- compile_data *cd, BOOL utf8)
+static const pcre_uchar *
+set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
+ compile_data *cd, BOOL utf)
{
unsigned int c = *p;
+#ifdef COMPILE_PCRE8
SET_BIT(c);
-#ifdef SUPPORT_UTF8
-if (utf8 && c > 127)
+#ifdef SUPPORT_UTF
+if (utf && c > 127)
{
GETCHARINC(c, p);
#ifdef SUPPORT_UCP
if (caseless)
{
- uschar buff[8];
+ pcre_uchar buff[6];
c = UCD_OTHERCASE(c);
- (void)_pcre_ord2utf8(c, buff);
+ (void)PRIV(ord2utf)(c, buff);
SET_BIT(buff[0]);
}
#endif
@@ -569,6 +573,36 @@ if (utf8 && c > 127)
if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
return p + 1;
+#endif
+
+#ifdef COMPILE_PCRE16
+if (c > 0xff)
+ {
+ c = 0xff;
+ caseless = FALSE;
+ }
+SET_BIT(c);
+
+#ifdef SUPPORT_UTF
+if (utf && c > 127)
+ {
+ GETCHARINC(c, p);
+#ifdef SUPPORT_UCP
+ if (caseless)
+ {
+ c = UCD_OTHERCASE(c);
+ if (c > 0xff)
+ c = 0xff;
+ SET_BIT(c);
+ }
+#endif
+ return p;
+ }
+#endif
+
+if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
+return p + 1;
+#endif
}
@@ -594,21 +628,23 @@ Returns: nothing
*/
static void
-set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
+set_type_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
compile_data *cd)
{
register int c;
for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (table_limit == 32) return;
for (c = 128; c < 256; c++)
{
if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
{
- uschar buff[8];
- (void)_pcre_ord2utf8(c, buff);
+ pcre_uchar buff[6];
+ (void)PRIV(ord2utf)(c, buff);
SET_BIT(buff[0]);
}
}
+#endif
}
@@ -634,12 +670,14 @@ Returns: nothing
*/
static void
-set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
+set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
compile_data *cd)
{
register int c;
for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
+#endif
}
@@ -659,7 +697,7 @@ function fails unless the result is SSB_DONE.
Arguments:
code points to an expression
start_bits points to a 32-byte table, initialized to 0
- utf8 TRUE if in UTF-8 mode
+ utf TRUE if in UTF-8 / UTF-16 mode
cd the block with char table pointers
Returns: SSB_FAIL => Failed to find any starting bytes
@@ -669,12 +707,16 @@ Returns: SSB_FAIL => Failed to find any starting bytes
*/
static int
-set_start_bits(const uschar *code, uschar *start_bits, BOOL utf8,
+set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
compile_data *cd)
{
register int c;
int yield = SSB_DONE;
-int table_limit = utf8? 16:32;
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
+int table_limit = utf? 16:32;
+#else
+int table_limit = 32;
+#endif
#if 0
/* ========================================================================= */
@@ -696,10 +738,10 @@ volatile int dummy;
do
{
BOOL try_next = TRUE;
- const uschar *tcode = code + 1 + LINK_SIZE;
+ const pcre_uchar *tcode = code + 1 + LINK_SIZE;
if (*code == OP_CBRA || *code == OP_SCBRA ||
- *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += 2;
+ *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
while (try_next) /* Loop for items in this branch */
{
@@ -785,7 +827,9 @@ do
case OP_SOM:
case OP_THEN:
case OP_THEN_ARG:
+#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
case OP_XCLASS:
+#endif
return SSB_FAIL;
/* We can ignore word boundary tests. */
@@ -811,7 +855,7 @@ do
case OP_ONCE:
case OP_ONCE_NC:
case OP_ASSERT:
- rc = set_start_bits(tcode, start_bits, utf8, cd);
+ rc = set_start_bits(tcode, start_bits, utf, cd);
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
if (rc == SSB_DONE) try_next = FALSE; else
{
@@ -858,7 +902,7 @@ do
case OP_BRAZERO:
case OP_BRAMINZERO:
case OP_BRAPOSZERO:
- rc = set_start_bits(++tcode, start_bits, utf8, cd);
+ rc = set_start_bits(++tcode, start_bits, utf, cd);
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
/* =========================================================================
See the comment at the head of this function concerning the next line,
@@ -885,7 +929,7 @@ do
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
- tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
+ tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
break;
case OP_STARI:
@@ -894,7 +938,7 @@ do
case OP_QUERYI:
case OP_MINQUERYI:
case OP_POSQUERYI:
- tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
+ tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
break;
/* Single-char upto sets the bit and tries the next */
@@ -902,36 +946,36 @@ do
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
- tcode = set_table_bit(start_bits, tcode + 3, FALSE, cd, utf8);
+ tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
break;
case OP_UPTOI:
case OP_MINUPTOI:
case OP_POSUPTOI:
- tcode = set_table_bit(start_bits, tcode + 3, TRUE, cd, utf8);
+ tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
break;
/* At least one single char sets the bit and stops */
case OP_EXACT:
- tcode += 2;
+ tcode += IMM2_SIZE;
/* Fall through */
case OP_CHAR:
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
- (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8);
+ (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
try_next = FALSE;
break;
case OP_EXACTI:
- tcode += 2;
+ tcode += IMM2_SIZE;
/* Fall through */
case OP_CHARI:
case OP_PLUSI:
case OP_MINPLUSI:
case OP_POSPLUSI:
- (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8);
+ (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
try_next = FALSE;
break;
@@ -944,14 +988,28 @@ do
case OP_HSPACE:
SET_BIT(0x09);
SET_BIT(0x20);
- if (utf8)
+#ifdef SUPPORT_UTF
+ if (utf)
{
+#ifdef COMPILE_PCRE8
SET_BIT(0xC2); /* For U+00A0 */
SET_BIT(0xE1); /* For U+1680, U+180E */
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
SET_BIT(0xE3); /* For U+3000 */
+#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0xA0);
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
+ }
+ else
+#endif /* SUPPORT_UTF */
+ {
+ SET_BIT(0xA0);
+#ifdef COMPILE_PCRE16
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
}
- else SET_BIT(0xA0);
try_next = FALSE;
break;
@@ -961,12 +1019,26 @@ do
SET_BIT(0x0B);
SET_BIT(0x0C);
SET_BIT(0x0D);
- if (utf8)
+#ifdef SUPPORT_UTF
+ if (utf)
{
+#ifdef COMPILE_PCRE8
SET_BIT(0xC2); /* For U+0085 */
SET_BIT(0xE2); /* For U+2028, U+2029 */
+#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0x85);
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
+ }
+ else
+#endif /* SUPPORT_UTF */
+ {
+ SET_BIT(0x85);
+#ifdef COMPILE_PCRE16
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
}
- else SET_BIT(0x85);
try_next = FALSE;
break;
@@ -1024,7 +1096,7 @@ do
break;
case OP_TYPEEXACT:
- tcode += 3;
+ tcode += 1 + IMM2_SIZE;
break;
/* Zero or more repeats of character types set the bits and then
@@ -1033,7 +1105,7 @@ do
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
case OP_TYPEPOSUPTO:
- tcode += 2; /* Fall through */
+ tcode += IMM2_SIZE; /* Fall through */
case OP_TYPESTAR:
case OP_TYPEMINSTAR:
@@ -1051,14 +1123,23 @@ do
case OP_HSPACE:
SET_BIT(0x09);
SET_BIT(0x20);
- if (utf8)
+#ifdef SUPPORT_UTF
+ if (utf)
{
+#ifdef COMPILE_PCRE8
SET_BIT(0xC2); /* For U+00A0 */
SET_BIT(0xE1); /* For U+1680, U+180E */
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
SET_BIT(0xE3); /* For U+3000 */
+#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0xA0);
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
}
- else SET_BIT(0xA0);
+ else
+#endif /* SUPPORT_UTF */
+ SET_BIT(0xA0);
break;
case OP_ANYNL:
@@ -1067,12 +1148,21 @@ do
SET_BIT(0x0B);
SET_BIT(0x0C);
SET_BIT(0x0D);
- if (utf8)
+#ifdef SUPPORT_UTF
+ if (utf)
{
+#ifdef COMPILE_PCRE8
SET_BIT(0xC2); /* For U+0085 */
SET_BIT(0xE2); /* For U+2028, U+2029 */
+#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0x85);
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
}
- else SET_BIT(0x85);
+ else
+#endif /* SUPPORT_UTF */
+ SET_BIT(0x85);
break;
case OP_NOT_DIGIT:
@@ -1119,18 +1209,23 @@ do
character with a value > 255. */
case OP_NCLASS:
-#ifdef SUPPORT_UTF8
- if (utf8)
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
+ if (utf)
{
start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
}
#endif
+#ifdef COMPILE_PCRE16
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
/* Fall through */
case OP_CLASS:
{
+ pcre_uint8 *map;
tcode++;
+ map = (pcre_uint8 *)tcode;
/* In UTF-8 mode, the bits in a bit map correspond to character
values, not to byte values. However, the bit map we are constructing is
@@ -1138,13 +1233,13 @@ do
value is > 127. In fact, there are only two possible starting bytes for
characters in the range 128 - 255. */
-#ifdef SUPPORT_UTF8
- if (utf8)
+#if defined SUPPORT_UTF && defined COMPILE_PCRE8
+ if (utf)
{
- for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
+ for (c = 0; c < 16; c++) start_bits[c] |= map[c];
for (c = 128; c < 256; c++)
{
- if ((tcode[c/8] && (1 << (c&7))) != 0)
+ if ((map[c/8] & (1 << (c&7))) != 0) /* bitwise test of the bit for c */
{
int d = (c >> 6) | 0xc0; /* Set bit for this starter */
start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
@@ -1152,19 +1247,17 @@ do
}
}
}
-
- /* In non-UTF-8 mode, the two bit maps are completely compatible. */
-
else
#endif
{
- for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
+ /* In non-UTF-8 mode, the two bit maps are completely compatible. */
+ for (c = 0; c < 32; c++) start_bits[c] |= map[c];
}
/* Advance past the bit map, and act on what follows. For a zero
minimum repeat, continue; otherwise stop processing. */
- tcode += 32;
+ tcode += 32 / sizeof(pcre_uchar);
switch (*tcode)
{
case OP_CRSTAR:
@@ -1176,7 +1269,7 @@ do
case OP_CRRANGE:
case OP_CRMINRANGE:
- if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
+ if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
else try_next = FALSE;
break;
@@ -1219,16 +1312,21 @@ Returns: pointer to a pcre_extra block, with study_data filled in and the
NULL on error or if no optimization possible
*/
+#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
pcre_study(const pcre *external_re, int options, const char **errorptr)
+#else
+PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
+pcre16_study(const pcre *external_re, int options, const char **errorptr)
+#endif
{
int min;
BOOL bits_set = FALSE;
-uschar start_bits[32];
+pcre_uint8 start_bits[32];
pcre_extra *extra = NULL;
pcre_study_data *study;
-const uschar *tables;
-uschar *code;
+const pcre_uint8 *tables;
+pcre_uchar *code;
compile_data compile_block;
const real_pcre *re = (const real_pcre *)external_re;
@@ -1240,13 +1338,23 @@ if (re == NULL || re->magic_number != MAGIC_NUMBER)
return NULL;
}
+if ((re->flags & PCRE_MODE) == 0)
+ {
+#ifdef COMPILE_PCRE8
+ *errorptr = "argument is compiled in 16 bit mode";
+#else
+ *errorptr = "argument is compiled in 8 bit mode";
+#endif
+ return NULL;
+ }
+
if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
{
*errorptr = "unknown or incorrect option bit(s) set";
return NULL;
}
-code = (uschar *)re + re->name_table_offset +
+code = (pcre_uchar *)re + re->name_table_offset +
(re->name_count * re->name_entry_size);
/* For an anchored pattern, or an unanchored pattern that has a first char, or
@@ -1261,9 +1369,16 @@ if ((re->options & PCRE_ANCHORED) == 0 &&
/* Set the character tables in the block that is passed around */
tables = re->tables;
+
+#ifdef COMPILE_PCRE8
if (tables == NULL)
(void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
(void *)(&tables));
+#else
+ if (tables == NULL)
+ (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
+ (void *)(&tables));
+#endif
compile_block.lcc = tables + lcc_offset;
compile_block.fcc = tables + fcc_offset;
@@ -1272,7 +1387,7 @@ if ((re->options & PCRE_ANCHORED) == 0 &&
/* See if we can find a fixed set of initial characters for the pattern. */
- memset(start_bits, 0, 32 * sizeof(uschar));
+ memset(start_bits, 0, 32 * sizeof(pcre_uint8));
rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
&compile_block);
bits_set = rc == SSB_DONE;
@@ -1307,7 +1422,7 @@ if (bits_set || min > 0
#endif
)
{
- extra = (pcre_extra *)(pcre_malloc)
+ extra = (pcre_extra *)(PUBL(malloc))
(sizeof(pcre_extra) + sizeof(pcre_study_data));
if (extra == NULL)
{
@@ -1322,11 +1437,28 @@ if (bits_set || min > 0
study->size = sizeof(pcre_study_data);
study->flags = 0;
+ /* Set the start bits always, to avoid unset memory errors if the
+ study data is written to a file, but set the flag only if any of the bits
+ are set, to save time looking when none are. */
+
if (bits_set)
{
study->flags |= PCRE_STUDY_MAPPED;
memcpy(study->start_bits, start_bits, sizeof(start_bits));
}
+ else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
+
+#ifdef PCRE_DEBUG
+ if (bits_set)
+ {
+ pcre_uint8 *ptr = start_bits;
+ int i;
+
+ printf("Start bits:\n");
+ for (i = 0; i < 32; i++)
+ printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
+ }
+#endif
/* Always set the minlength value in the block, because the JIT compiler
makes use of it. However, don't set the bit unless the length is greater than
@@ -1346,10 +1478,15 @@ if (bits_set || min > 0
#ifdef SUPPORT_JIT
extra->executable_jit = NULL;
- if ((options & PCRE_STUDY_JIT_COMPILE) != 0) _pcre_jit_compile(re, extra);
+ if ((options & PCRE_STUDY_JIT_COMPILE) != 0) PRIV(jit_compile)(re, extra);
if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0)
{
+#ifdef COMPILE_PCRE8
pcre_free_study(extra);
+#endif
+#ifdef COMPILE_PCRE16
+ pcre16_free_study(extra);
+#endif
extra = NULL;
}
#endif
@@ -1369,15 +1506,22 @@ Argument: a pointer to the pcre_extra block
Returns: nothing
*/
+#ifdef COMPILE_PCRE8
PCRE_EXP_DEFN void
pcre_free_study(pcre_extra *extra)
+#else
+PCRE_EXP_DEFN void
+pcre16_free_study(pcre_extra *extra)
+#endif
{
+if (extra == NULL)
+ return;
#ifdef SUPPORT_JIT
if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
extra->executable_jit != NULL)
- _pcre_jit_free(extra->executable_jit);
+ PRIV(jit_free)(extra->executable_jit);
#endif
-pcre_free(extra);
+PUBL(free)(extra);
}
/* End of pcre_study.c */