summary | refs | log | tree | commit | diff
path: root/study.c
diff options
context:
space:
mode:
Diffstat (limited to 'study.c')
-rw-r--r-- study.c 87
1 files changed, 62 insertions, 25 deletions
diff --git a/study.c b/study.c
index f924543..4320bd2 100644
--- a/study.c
+++ b/study.c
@@ -9,7 +9,7 @@ the file Tech.Notes for some information on the internals.
Written by: Philip Hazel <ph10@cam.ac.uk>
- Copyright (c) 1997-2001 University of Cambridge
+ Copyright (c) 1997-2002 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
@@ -78,6 +78,7 @@ Arguments:
code points to an expression
start_bits points to a 32-byte table, initialized to 0
caseless the current state of the caseless flag
+ utf8 TRUE if in UTF-8 mode
cd the block with char table pointers
Returns: TRUE if table built, FALSE otherwise
@@ -85,7 +86,7 @@ Returns: TRUE if table built, FALSE otherwise
static BOOL
set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
- compile_data *cd)
+ BOOL utf8, compile_data *cd)
{
register int c;
@@ -99,7 +100,7 @@ volatile int dummy;
do
{
- const uschar *tcode = code + 3;
+ const uschar *tcode = code + 1 + LINK_SIZE;
BOOL try_next = TRUE;
while (try_next)
@@ -109,7 +110,7 @@ do
if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
{
- if (!set_start_bits(tcode, start_bits, caseless, cd))
+ if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
return FALSE;
try_next = FALSE;
}
@@ -119,6 +120,12 @@ do
default:
return FALSE;
+ /* Skip over callout */
+
+ case OP_CALLOUT:
+ tcode += 2;
+ break;
+
/* Skip over extended extraction bracket number */
case OP_BRANUMBER:
@@ -130,8 +137,8 @@ do
case OP_ASSERT_NOT:
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
- do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
- tcode += 3;
+ do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
+ tcode += 1+LINK_SIZE;
break;
/* Skip over an option setting, changing the caseless flag */
@@ -145,11 +152,11 @@ do
case OP_BRAZERO:
case OP_BRAMINZERO:
- if (!set_start_bits(++tcode, start_bits, caseless, cd))
+ if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
return FALSE;
dummy = 1;
- do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
- tcode += 3;
+ do tcode += GET(tcode,1); while (*tcode == OP_ALT);
+ tcode += 1+LINK_SIZE;
break;
/* Single-char * or ? sets the bit and tries the next item */
@@ -160,6 +167,9 @@ do
case OP_MINQUERY:
set_bit(start_bits, tcode[1], caseless, cd);
tcode += 2;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
+#endif
break;
/* Single-char upto sets the bit and tries the next */
@@ -168,6 +178,9 @@ do
case OP_MINUPTO:
set_bit(start_bits, tcode[3], caseless, cd);
tcode += 4;
+#ifdef SUPPORT_UTF8
+ if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
+#endif
break;
/* At least one single char sets the bit and stops */
@@ -281,8 +294,17 @@ do
tcode += 2;
break;
- /* Character class: set the bits and either carry on or not,
- according to the repeat count. */
+ /* Character class where all the information is in a bit map: set the
+ bits and either carry on or not, according to the repeat count. If it was
+ a negative class, and we are operating with UTF-8 characters, any byte
+ with the top-bit set is a potentially valid starter because it may start
+ a character with a value > 255. (This is sub-optimal in that the
+ character may be in the range 128-255, and those characters might be
+ unwanted, but that's as far as we go for the moment.) */
+
+ case OP_NCLASS:
+ if (utf8) memset(start_bits+16, 0xff, 16);
+ /* Fall through */
case OP_CLASS:
{
@@ -309,12 +331,12 @@ do
break;
}
}
- break; /* End of class handling */
+ break; /* End of bitmap class handling */
} /* End of switch */
} /* End of try_next loop */
- code += (code[1] << 8) + code[2]; /* Advance to next branch */
+ code += GET(code, 1); /* Advance to next branch */
}
while (*code == OP_ALT);
return TRUE;
@@ -336,7 +358,8 @@ Arguments:
errorptr points to where to place error messages;
set NULL unless error
-Returns: pointer to a pcre_extra block,
+Returns: pointer to a pcre_extra block, with study_data filled in and the
+ appropriate flag set;
NULL on error or if no optimization possible
*/
@@ -344,8 +367,11 @@ pcre_extra *
pcre_study(const pcre *external_re, int options, const char **errorptr)
{
uschar start_bits[32];
-real_pcre_extra *extra;
+pcre_extra *extra;
+pcre_study_data *study;
const real_pcre *re = (const real_pcre *)external_re;
+uschar *code = (uschar *)re + sizeof(real_pcre) +
+ (re->name_count * re->name_entry_size);
compile_data compile_block;
*errorptr = NULL;
@@ -362,9 +388,9 @@ if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
return NULL;
}
-/* For an anchored pattern, or an unchored pattern that has a first char, or a
-multiline pattern that matches only at "line starts", no further processing at
-present. */
+/* For an anchored pattern, or an unanchored pattern that has a first char, or
+a multiline pattern that matches only at "line starts", no further processing
+at present. */
if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
return NULL;
@@ -379,12 +405,18 @@ compile_block.ctypes = re->tables + ctypes_offset;
/* See if we can find a fixed set of initial characters for the pattern. */
memset(start_bits, 0, 32 * sizeof(uschar));
-if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
- &compile_block)) return NULL;
+if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
+ (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
-/* Get an "extra" block and put the information therein. */
+/* Get a pcre_extra block and a pcre_study_data block. The study data is put in
+the latter, which is pointed to by the former, which may also get additional
+data set later by the calling program. At the moment, the size of
+pcre_study_data is fixed. We nevertheless save it in a field for returning via
+the pcre_fullinfo() function so that if it becomes variable in the future, we
+don't have to change that code. */
-extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
+extra = (pcre_extra *)(pcre_malloc)
+ (sizeof(pcre_extra) + sizeof(pcre_study_data));
if (extra == NULL)
{
@@ -392,10 +424,15 @@ if (extra == NULL)
return NULL;
}
-extra->options = PCRE_STUDY_MAPPED;
-memcpy(extra->start_bits, start_bits, sizeof(start_bits));
+study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
+extra->flags = PCRE_EXTRA_STUDY_DATA;
+extra->study_data = study;
+
+study->size = sizeof(pcre_study_data);
+study->options = PCRE_STUDY_MAPPED;
+memcpy(study->start_bits, start_bits, sizeof(start_bits));
-return (pcre_extra *)extra;
+return extra;
}
/* End of study.c */