summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorzherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>2014-01-11 21:54:20 +0000
committerzherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>2014-01-11 21:54:20 +0000
commitb390085e004caf7f53666fabd1d6da8474c51d73 (patch)
tree017fcf56528e33d4a186dba2557ee4b636543b6f
parent72e25ffe7010391778433c9f5ed6a5e217e1d7c4 (diff)
downloadpcre-b390085e004caf7f53666fabd1d6da8474c51d73.tar.gz
Improve pattern prefix search by a simplified Boyer-Moore algorithm in JIT.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1440 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--ChangeLog7
-rw-r--r--pcre_jit_compile.c133
-rw-r--r--testdata/testinput23
-rw-r--r--testdata/testoutput24
4 files changed, 137 insertions, 10 deletions
diff --git a/ChangeLog b/ChangeLog
index a87aadf..8dd1647 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -52,7 +52,8 @@ Version 8.35-RC1 xx-xxxx-201x
bracket.
11. Empty match is not possible, when the minimum length is greater than zero,
- and there is no \K in the pattern. Remove these unnecessary checks form JIT.
+ and there is no \K in the pattern. JIT should avoid empty match checks in
+ such cases.
12. In a caseless character class with UCP support, when a character with more
than one alternative case was not the first character of a range, not all
@@ -64,6 +65,10 @@ Version 8.35-RC1 xx-xxxx-201x
enabled. This is not used in Windows, so I have put this test inside a
check for the presence of windows.h (which was already tested for).
+12. Improve pattern prefix search by a simplified Boyer-Moore algorithm in JIT.
+ The algorithm provides a way to skip certain starting offsets, and usually
+ faster than linear prefix searches.
+
Version 8.34 15-December-2013
-----------------------------
diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c
index 6d9800a..cfd03f1 100644
--- a/pcre_jit_compile.c
+++ b/pcre_jit_compile.c
@@ -3345,9 +3345,9 @@ while (TRUE)
chars[0] = mask;
chars[1] = mask;
+ consumed++;
if (--max_chars == 0)
return consumed;
- consumed++;
chars += 2;
}
while (--repeat > 0);
@@ -3396,7 +3396,11 @@ while (TRUE)
chr |= mask;
}
+#ifdef COMPILE_PCRE32
+ if (chars[0] == NOTACHAR && chars[1] == 0)
+#else
if (chars[0] == NOTACHAR)
+#endif
{
chars[0] = chr;
chars[1] = mask;
@@ -3410,9 +3414,9 @@ while (TRUE)
}
len--;
+ consumed++;
if (--max_chars == 0)
return consumed;
- consumed++;
chars += 2;
cc++;
}
@@ -3432,6 +3436,7 @@ while (TRUE)
}
#define MAX_N_CHARS 16
+#define MIN_RANGE_SIZE 4
static SLJIT_INLINE BOOL fast_forward_first_n_chars(compiler_common *common, BOOL firstline)
{
@@ -3440,9 +3445,15 @@ struct sljit_label *start;
struct sljit_jump *quit;
pcre_uint32 chars[MAX_N_CHARS * 2];
pcre_uint8 ones[MAX_N_CHARS];
-pcre_uint32 mask;
-int i, max;
int offsets[3];
+pcre_uint32 mask, byte;
+int i, max, from;
+int range_right = -1, range_len = -1;
+sljit_ub *update_table = NULL;
+BOOL in_range;
+
+/* This is even TRUE, if both are NULL. */
+SLJIT_ASSERT(common->read_only_data_ptr == common->read_only_data);
for (i = 0; i < MAX_N_CHARS; i++)
{
@@ -3467,6 +3478,59 @@ for (i = 0; i < max; i++)
}
}
+in_range = FALSE;
+for (i = 0; i <= max; i++)
+ {
+ if (i < max && ones[i] <= 1)
+ {
+ if (!in_range)
+ {
+ in_range = TRUE;
+ from = i;
+ }
+ }
+ else if (in_range)
+ {
+ if (range_len == -1 || (i - from) > range_len)
+ {
+ range_len = i - from;
+ range_right = i - 1;
+ }
+ in_range = FALSE;
+ }
+ }
+
+if (range_len >= MIN_RANGE_SIZE)
+ {
+ /* Since no data is consumed (see the assert in the beginning
+ of this function), this space can be reallocated. */
+ if (common->read_only_data)
+ SLJIT_FREE(common->read_only_data);
+
+ common->read_only_data_size += 256;
+ common->read_only_data = (sljit_uw *)SLJIT_MALLOC(common->read_only_data_size);
+ if (common->read_only_data == NULL)
+ return TRUE;
+
+ update_table = (sljit_ub *)common->read_only_data;
+ common->read_only_data_ptr = (sljit_uw *)(update_table + 256);
+ memset(update_table, IN_UCHARS(range_len), 256);
+
+ for (i = 0; i < range_len; i++)
+ {
+ byte = chars[(range_right - i) << 1] & 0xff;
+ if (update_table[byte] > IN_UCHARS(i))
+ update_table[byte] = IN_UCHARS(i);
+ mask = chars[((range_right - i) << 1) + 1] & 0xff;
+ if (mask != 0)
+ {
+ byte ^= mask;
+ if (update_table[byte] > IN_UCHARS(i))
+ update_table[byte] = IN_UCHARS(i);
+ }
+ }
+ }
+
offsets[0] = -1;
/* Scan forward. */
for (i = 0; i < max; i++)
@@ -3481,13 +3545,18 @@ if (offsets[0] == -1)
/* Scan backward. */
offsets[1] = -1;
for (i = max - 1; i > offsets[0]; i--)
- if (ones[i] <= 2) {
+ if (ones[i] <= 2 && i != range_right)
+ {
offsets[1] = i;
break;
- }
+ }
+
+/* This case is handled better by fast_forward_first_char. */
+if (offsets[1] == -1 && offsets[0] == 0)
+ return FALSE;
offsets[2] = -1;
-if (offsets[1] >= 0)
+if (offsets[1] >= 0 && range_right == -1)
{
/* Scan from middle. */
for (i = (offsets[0] + offsets[1]) / 2 + 1; i < offsets[1]; i++)
@@ -3528,15 +3597,41 @@ max -= 1;
if (firstline)
{
SLJIT_ASSERT(common->first_line_end != 0);
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), common->first_line_end);
OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
- OP2(SLJIT_SUB, STR_END, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), common->first_line_end, SLJIT_IMM, IN_UCHARS(max));
+ OP2(SLJIT_SUB, STR_END, 0, STR_END, 0, SLJIT_IMM, IN_UCHARS(max));
+ quit = CMP(SLJIT_C_LESS_EQUAL, STR_END, 0, TMP1, 0);
+ OP1(SLJIT_MOV, STR_END, 0, TMP1, 0);
+ JUMPHERE(quit);
}
else
OP2(SLJIT_SUB, STR_END, 0, STR_END, 0, SLJIT_IMM, IN_UCHARS(max));
+#if !(defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+if (range_len >= MIN_RANGE_SIZE)
+ OP1(SLJIT_MOV, RETURN_ADDR, 0, SLJIT_IMM, (sljit_sw)update_table);
+#endif
+
start = LABEL();
quit = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
+if (range_len >= MIN_RANGE_SIZE)
+ {
+#if defined COMPILE_PCRE8 || (defined SLJIT_LITTLE_ENDIAN && SLJIT_LITTLE_ENDIAN)
+ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(range_right));
+#else
+ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(range_right + 1) - 1);
+#endif
+
+#if !(defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
+ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM2(RETURN_ADDR, TMP1), 0);
+#else
+ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)update_table);
+#endif
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+ CMPTO(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 0, start);
+ }
+
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(offsets[0]));
if (offsets[1] >= 0)
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(offsets[1]));
@@ -3566,13 +3661,24 @@ OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
JUMPHERE(quit);
if (firstline)
+ {
+ if (range_len >= MIN_RANGE_SIZE)
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), common->first_line_end);
OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
+ if (range_len >= MIN_RANGE_SIZE)
+ {
+ quit = CMP(SLJIT_C_LESS_EQUAL, STR_PTR, 0, TMP1, 0);
+ OP1(SLJIT_MOV, STR_PTR, 0, TMP1, 0);
+ JUMPHERE(quit);
+ }
+ }
else
OP2(SLJIT_ADD, STR_END, 0, STR_END, 0, SLJIT_IMM, IN_UCHARS(max));
return TRUE;
}
#undef MAX_N_CHARS
+#undef MIN_RANGE_SIZE
static SLJIT_INLINE void fast_forward_first_char(compiler_common *common, pcre_uchar first_char, BOOL caseless, BOOL firstline)
{
@@ -9808,7 +9914,16 @@ if ((re->options & PCRE_ANCHORED) == 0)
if ((re->options & PCRE_NO_START_OPTIMIZE) == 0)
{
if (mode == JIT_COMPILE && fast_forward_first_n_chars(common, (re->options & PCRE_FIRSTLINE) != 0))
- { /* Do nothing */ }
+ {
+ /* If read_only_data is reallocated, we might have an allocation failure. */
+ if (common->read_only_data_size > 0 && common->read_only_data == NULL)
+ {
+ sljit_free_compiler(compiler);
+ SLJIT_FREE(common->optimized_cbracket);
+ SLJIT_FREE(common->private_data_ptrs);
+ return;
+ }
+ }
else if ((re->flags & PCRE_FIRSTSET) != 0)
fast_forward_first_char(common, (pcre_uchar)re->first_char, (re->flags & PCRE_FCH_CASELESS) != 0, (re->options & PCRE_FIRSTLINE) != 0);
else if ((re->flags & PCRE_STARTLINE) != 0)
diff --git a/testdata/testinput2 b/testdata/testinput2
index 072650c..c1b1db1 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -4048,4 +4048,7 @@ backtracking verbs. --/
/(?=ab\K)/+
abcd
+/abcd/f<lf>
+ xx\nxabcd
+
/-- End of testinput2 --/
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index b82488c..e3e2f58 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -14131,4 +14131,8 @@ Start of matched string is beyond its end - displaying from end to start.
0: ab
0+ abcd
+/abcd/f<lf>
+ xx\nxabcd
+No match
+
/-- End of testinput2 --/