diff options
author | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2014-01-13 20:18:33 +0000 |
---|---|---|
committer | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2014-01-13 20:18:33 +0000 |
commit | e12305c1937338ee0e2c68c8b9f566b354257ed4 (patch) | |
tree | 736067fff1115a53805f22fa7785883b3b915cba | |
parent | daa78b5ec623fc871934f84d5b1220f454f80d59 (diff) | |
download | pcre-e12305c1937338ee0e2c68c8b9f566b354257ed4.tar.gz |
JIT: Improved update table for the fast forward search algorithm.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1447 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | pcre_jit_compile.c | 211 |
1 files changed, 130 insertions, 81 deletions
diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c index e4b7858..99d3af8 100644 --- a/pcre_jit_compile.c +++ b/pcre_jit_compile.c @@ -3149,13 +3149,46 @@ if (newlinecheck) return mainloop; } -static int scan_prefix(compiler_common *common, pcre_uchar *cc, pcre_uint32 *chars, int max_chars) +#define MAX_N_CHARS 16 +#define MAX_N_BYTES 8 + +static SLJIT_INLINE void add_prefix_byte(pcre_uint8 byte, pcre_uint8 *bytes) +{ +pcre_uint8 len = bytes[0]; +int i; + +if (len == 255) + return; + +if (len == 0) + { + bytes[0] = 1; + bytes[1] = byte; + return; + } + +for (i = len; i > 0; i--) + if (bytes[i] == byte) + return; + +if (len >= MAX_N_BYTES - 1) + { + bytes[0] = 255; + return; + } + +len++; +bytes[len] = byte; +bytes[0] = len; +} + +static int scan_prefix(compiler_common *common, pcre_uchar *cc, pcre_uint32 *chars, pcre_uint8 *bytes, int max_chars) { /* Recursive function, which scans prefix literals. */ +BOOL last, any, caseless; int len, repeat, len_save, consumed = 0; pcre_uint32 chr, mask; pcre_uchar *alternative, *cc_save, *oc; -BOOL last, any, caseless; #if defined SUPPORT_UTF && defined COMPILE_PCRE8 pcre_uchar othercase[8]; #elif defined SUPPORT_UTF && defined COMPILE_PCRE16 @@ -3239,7 +3272,7 @@ while (TRUE) alternative = cc + GET(cc, 1); while (*alternative == OP_ALT) { - max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, max_chars); + max_chars = scan_prefix(common, alternative + 1 + LINK_SIZE, chars, bytes, max_chars); if (max_chars == 0) return consumed; alternative += GET(alternative, 1); @@ -3351,11 +3384,13 @@ while (TRUE) { chars[0] = mask; chars[1] = mask; + bytes[0] = 255; consumed++; if (--max_chars == 0) return consumed; chars += 2; + bytes += MAX_N_BYTES; } while (--repeat > 0); @@ -3374,7 +3409,7 @@ while (TRUE) if (common->utf) { GETCHAR(chr, cc); - if (PRIV(ord2utf)(char_othercase(common, chr), othercase) != len) + if ((int)PRIV(ord2utf)(char_othercase(common, chr), othercase) != len) return consumed; } else @@ -3399,9 +3434,12 @@ while (TRUE) if (SLJIT_UNLIKELY(chr == NOTACHAR)) return consumed; #endif + add_prefix_byte((pcre_uint8)chr, bytes); + mask = 0; if (caseless) { + add_prefix_byte((pcre_uint8)*oc, bytes); mask = *cc ^ *oc; chr |= mask; } @@ -3428,6 +3466,7 @@ while (TRUE) if (--max_chars == 0) return consumed; chars += 2; + bytes += MAX_N_BYTES; cc++; oc++; } @@ -3446,17 +3485,17 @@ while (TRUE) } } -#define MAX_N_CHARS 16 - static SLJIT_INLINE BOOL fast_forward_first_n_chars(compiler_common *common, BOOL firstline) { DEFINE_COMPILER; struct sljit_label *start; struct sljit_jump *quit; pcre_uint32 chars[MAX_N_CHARS * 2]; +pcre_uint8 bytes[MAX_N_CHARS * MAX_N_BYTES]; pcre_uint8 ones[MAX_N_CHARS]; int offsets[3]; -pcre_uint32 mask, byte; +pcre_uint32 mask; +pcre_uint8 *byte_set, *byte_set_end; int i, max, from; int range_right = -1, range_len = 4 - 1; sljit_ub *update_table = NULL; @@ -3469,9 +3508,10 @@ for (i = 0; i < MAX_N_CHARS; i++) { chars[i << 1] = NOTACHAR; chars[(i << 1) + 1] = 0; + bytes[i * MAX_N_BYTES] = 0; } -max = scan_prefix(common, common->start, chars, MAX_N_CHARS); +max = scan_prefix(common, common->start, chars, bytes, MAX_N_CHARS); if (max <= 1) return FALSE; @@ -3491,7 +3531,13 @@ for (i = 0; i < max; i++) in_range = FALSE; for (i = 0; i <= max; i++) { - if (i < max && ones[i] <= 1) + if (in_range && (i - from) > range_len && (bytes[(i - 1) * MAX_N_BYTES] <= 4)) + { + range_len = i - from; + range_right = i - 1; + } + + if (i < max && bytes[i * MAX_N_BYTES] < 255) { if (!in_range) { @@ -3500,14 +3546,7 @@ for (i = 0; i <= max; i++) } } else if (in_range) - { - if ((i - from) > range_len) - { - range_len = i - from; - range_right = i - 1; - } in_range = FALSE; - } } if (range_right >= 0) @@ -3528,15 +3567,15 @@ if (range_right >= 0) for (i = 0; i < range_len; i++) { - byte = chars[(range_right - i) << 1] & 0xff; - if (update_table[byte] > IN_UCHARS(i)) - update_table[byte] = IN_UCHARS(i); - mask = chars[((range_right - i) << 1) + 1] & 0xff; - if (mask != 0) + byte_set = bytes + ((range_right - i) * MAX_N_BYTES); + SLJIT_ASSERT(byte_set[0] > 0 && byte_set[0] < 255); + byte_set_end = byte_set + byte_set[0]; + byte_set++; + while (byte_set <= byte_set_end) { - byte ^= mask; - if (update_table[byte] > IN_UCHARS(i)) - update_table[byte] = IN_UCHARS(i); + if (update_table[*byte_set] > IN_UCHARS(i)) + update_table[*byte_set] = IN_UCHARS(i); + byte_set++; } } } @@ -3549,58 +3588,62 @@ for (i = 0; i < max; i++) break; } -if (offsets[0] == -1) - return FALSE; - -/* Scan backward. */ -offsets[1] = -1; -for (i = max - 1; i > offsets[0]; i--) - if (ones[i] <= 2 && i != range_right) - { - offsets[1] = i; - break; - } - -/* This case is handled better by fast_forward_first_char. */ -if (offsets[1] == -1 && offsets[0] == 0) +if (offsets[0] < 0 && range_right < 0) return FALSE; -offsets[2] = -1; -if (offsets[1] >= 0 && range_right == -1) +if (offsets[0] >= 0) { - /* Scan from middle. */ - for (i = (offsets[0] + offsets[1]) / 2 + 1; i < offsets[1]; i++) - if (ones[i] <= 2) + /* Scan backward. */ + offsets[1] = -1; + for (i = max - 1; i > offsets[0]; i--) + if (ones[i] <= 2 && i != range_right) { - offsets[2] = i; + offsets[1] = i; break; } - if (offsets[2] == -1) + /* This case is handled better by fast_forward_first_char. */ + if (offsets[1] == -1 && offsets[0] == 0 && range_right < 0) + return FALSE; + + offsets[2] = -1; + /* We only search for a middle character if there is no range check. */ + if (offsets[1] >= 0 && range_right == -1) { - for (i = (offsets[0] + offsets[1]) / 2; i > offsets[0]; i--) + /* Scan from middle. */ + for (i = (offsets[0] + offsets[1]) / 2 + 1; i < offsets[1]; i++) if (ones[i] <= 2) { offsets[2] = i; break; } + + if (offsets[2] == -1) + { + for (i = (offsets[0] + offsets[1]) / 2; i > offsets[0]; i--) + if (ones[i] <= 2) + { + offsets[2] = i; + break; + } + } } - } -SLJIT_ASSERT(offsets[1] == -1 || (offsets[0] < offsets[1])); -SLJIT_ASSERT(offsets[2] == -1 || (offsets[0] < offsets[2] && offsets[1] > offsets[2])); + SLJIT_ASSERT(offsets[1] == -1 || (offsets[0] < offsets[1])); + SLJIT_ASSERT(offsets[2] == -1 || (offsets[0] < offsets[2] && offsets[1] > offsets[2])); -chars[0] = chars[offsets[0] << 1]; -chars[1] = chars[(offsets[0] << 1) + 1]; -if (offsets[2] >= 0) - { - chars[2] = chars[offsets[2] << 1]; - chars[3] = chars[(offsets[2] << 1) + 1]; - } -if (offsets[1] >= 0) - { - chars[4] = chars[offsets[1] << 1]; - chars[5] = chars[(offsets[1] << 1) + 1]; + chars[0] = chars[offsets[0] << 1]; + chars[1] = chars[(offsets[0] << 1) + 1]; + if (offsets[2] >= 0) + { + chars[2] = chars[offsets[2] << 1]; + chars[3] = chars[(offsets[2] << 1) + 1]; + } + if (offsets[1] >= 0) + { + chars[4] = chars[offsets[1] << 1]; + chars[5] = chars[(offsets[1] << 1) + 1]; + } } max -= 1; @@ -3625,6 +3668,8 @@ if (range_right >= 0) start = LABEL(); quit = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); +SLJIT_ASSERT(range_right >= 0 || offsets[0] >= 0); + if (range_right >= 0) { #if defined COMPILE_PCRE8 || (defined SLJIT_LITTLE_ENDIAN && SLJIT_LITTLE_ENDIAN) @@ -3642,31 +3687,34 @@ if (range_right >= 0) CMPTO(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, 0, start); } -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(offsets[0])); -if (offsets[1] >= 0) - OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(offsets[1])); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); +if (offsets[0] >= 0) + { + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(offsets[0])); + if (offsets[1] >= 0) + OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(offsets[1])); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -if (chars[1] != 0) - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, chars[1]); -CMPTO(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[0], start); -if (offsets[2] >= 0) - OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(offsets[2] - 1)); + if (chars[1] != 0) + OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, chars[1]); + CMPTO(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[0], start); + if (offsets[2] >= 0) + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(offsets[2] - 1)); -if (offsets[1] >= 0) - { - if (chars[5] != 0) - OP2(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_IMM, chars[5]); - CMPTO(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, chars[4], start); - } + if (offsets[1] >= 0) + { + if (chars[5] != 0) + OP2(SLJIT_OR, TMP2, 0, TMP2, 0, SLJIT_IMM, chars[5]); + CMPTO(SLJIT_C_NOT_EQUAL, TMP2, 0, SLJIT_IMM, chars[4], start); + } -if (offsets[2] >= 0) - { - if (chars[3] != 0) - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, chars[3]); - CMPTO(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[2], start); + if (offsets[2] >= 0) + { + if (chars[3] != 0) + OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, chars[3]); + CMPTO(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, chars[2], start); + } + OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); } -OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); JUMPHERE(quit); @@ -3688,6 +3736,7 @@ return TRUE; } #undef MAX_N_CHARS +#undef MAX_N_BYTES static SLJIT_INLINE void fast_forward_first_char(compiler_common *common, pcre_uchar first_char, BOOL caseless, BOOL firstline) { |