diff options
author | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2013-12-31 07:57:56 +0000 |
---|---|---|
committer | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2013-12-31 07:57:56 +0000 |
commit | 77fe3aba8932f6546f15a8ff06d59b98709f7a64 (patch) | |
tree | 85d15a054196e9fba66d0c119167d70781932f17 | |
parent | 6979d01ad83f11e5a2d26fd5172f8874501d7761 (diff) | |
download | pcre-77fe3aba8932f6546f15a8ff06d59b98709f7a64.tar.gz |
JIT: Improved matching of newlines.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1423 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | pcre_jit_compile.c | 81 | ||||
-rw-r--r-- | testdata/testinput4 | 5 | ||||
-rw-r--r-- | testdata/testoutput4 | 6 |
3 files changed, 67 insertions, 25 deletions
diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c index ac1b353..6a9e7db 100644 --- a/pcre_jit_compile.c +++ b/pcre_jit_compile.c @@ -363,8 +363,10 @@ typedef struct compiler_common { BOOL positive_assert; /* Newline control. */ int nltype; + pcre_uint32 nlmax; int newline; int bsr_nltype; + pcre_uint32 bsr_nlmax; /* Dollar endonly. */ int endonly; /* Tables. */ @@ -522,6 +524,8 @@ the start pointers when the end of the capturing group has not yet reached. */ #define GET_LOCAL_BASE(dst, dstw, offset) \ sljit_get_local_base(compiler, (dst), (dstw), (offset)) +#define READ_CHAR_ANY 0x7fffffff + static pcre_uchar* bracketend(pcre_uchar* cc) { SLJIT_ASSERT((*cc >= OP_ASSERT && *cc <= OP_ASSERTBACK_NOT) || (*cc >= OP_ONCE && *cc <= OP_SCOND)); @@ -2626,7 +2630,7 @@ if (common->utf) static SLJIT_INLINE void read_char(compiler_common *common) { -read_char_max(common, 0x7fffffff, TRUE); +read_char_max(common, READ_CHAR_ANY, TRUE); } static void read_char8_type(compiler_common *common, BOOL full_read) @@ -2730,28 +2734,35 @@ if (common->utf) OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); } -static void check_newlinechar(compiler_common *common, int nltype, jump_list **backtracks, BOOL jumpiftrue) +static void check_newlinechar(compiler_common *common, int nltype, jump_list **backtracks, BOOL jumpifmatch) { /* Character comes in TMP1. Checks if it is a newline. TMP2 may be destroyed. */ DEFINE_COMPILER; +struct sljit_jump *jump; if (nltype == NLTYPE_ANY) { add_jump(compiler, &common->anynewline, JUMP(SLJIT_FAST_CALL)); - add_jump(compiler, backtracks, JUMP(jumpiftrue ? SLJIT_C_NOT_ZERO : SLJIT_C_ZERO)); + add_jump(compiler, backtracks, JUMP(jumpifmatch ? SLJIT_C_NOT_ZERO : SLJIT_C_ZERO)); } else if (nltype == NLTYPE_ANYCRLF) { - OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, CHAR_CR); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_C_EQUAL); - OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, CHAR_NL); - OP_FLAGS(SLJIT_OR | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_C_EQUAL); - add_jump(compiler, backtracks, JUMP(jumpiftrue ? SLJIT_C_NOT_ZERO : SLJIT_C_ZERO)); + if (jumpifmatch) + { + add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR)); + add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL)); + } + else + { + jump = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR); + add_jump(compiler, backtracks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL)); + JUMPHERE(jump); + } } else { SLJIT_ASSERT(nltype == NLTYPE_FIXED && common->newline < 256); - add_jump(compiler, backtracks, CMP(jumpiftrue ? SLJIT_C_EQUAL : SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline)); + add_jump(compiler, backtracks, CMP(jumpifmatch ? SLJIT_C_EQUAL : SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline)); } } @@ -2828,6 +2839,10 @@ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); JUMPHERE(jump); +OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x400); +OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_C_NOT_ZERO); +/* This code runs only in 8 bit mode. No need to shift the value. */ +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x800); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); @@ -2949,7 +2964,7 @@ if (firstline) mainloop = LABEL(); /* Continual stores does not cause data dependency. */ OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), common->first_line_end, STR_PTR, 0); - read_char(common); + read_char_max(common, common->nlmax, TRUE); check_newlinechar(common, common->nltype, &newline, TRUE); CMPTO(SLJIT_C_LESS, STR_PTR, 0, STR_END, 0, mainloop); JUMPHERE(end); @@ -3517,7 +3532,7 @@ firstchar = CMP(SLJIT_C_LESS_EQUAL, STR_PTR, 0, TMP2, 0); skip_char_back(common); loop = LABEL(); -read_char(common); +read_char_max(common, common->nlmax, TRUE); lastchar = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF) foundcr = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR); @@ -4945,7 +4960,7 @@ switch(type) case OP_ANY: detect_partial_match(common, backtracks); - read_char(common); + read_char_max(common, common->nlmax, TRUE); if (common->nltype == NLTYPE_FIXED && common->newline > 255) { jump[0] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff); @@ -5013,7 +5028,7 @@ switch(type) case OP_ANYNL: detect_partial_match(common, backtracks); - read_char(common); + read_char_max(common, common->bsr_nlmax, FALSE); jump[0] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR); /* We don't need to handle soft partial matching case. */ end_list = NULL; @@ -5035,7 +5050,7 @@ switch(type) case OP_NOT_HSPACE: case OP_HSPACE: detect_partial_match(common, backtracks); - read_char(common); + read_char_max(common, 0x3000, type == OP_NOT_HSPACE); add_jump(compiler, &common->hspace, JUMP(SLJIT_FAST_CALL)); add_jump(compiler, backtracks, JUMP(type == OP_NOT_HSPACE ? SLJIT_C_NOT_ZERO : SLJIT_C_ZERO)); return cc; @@ -5043,7 +5058,7 @@ switch(type) case OP_NOT_VSPACE: case OP_VSPACE: detect_partial_match(common, backtracks); - read_char(common); + read_char_max(common, 0x2029, type == OP_NOT_VSPACE); add_jump(compiler, &common->vspace, JUMP(SLJIT_FAST_CALL)); add_jump(compiler, backtracks, JUMP(type == OP_NOT_VSPACE ? SLJIT_C_NOT_ZERO : SLJIT_C_ZERO)); return cc; @@ -5142,7 +5157,7 @@ switch(type) else { OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS1, STR_PTR, 0); - read_char(common); + read_char_max(common, common->nlmax, TRUE); add_jump(compiler, backtracks, CMP(SLJIT_C_NOT_EQUAL, STR_PTR, 0, STR_END, 0)); add_jump(compiler, &common->anynewline, JUMP(SLJIT_FAST_CALL)); add_jump(compiler, backtracks, JUMP(SLJIT_C_ZERO)); @@ -5190,7 +5205,7 @@ switch(type) else { skip_char_back(common); - read_char(common); + read_char_max(common, common->nlmax, TRUE); check_newlinechar(common, common->nltype, backtracks, FALSE); } JUMPHERE(jump[0]); @@ -5265,8 +5280,8 @@ switch(type) #endif return byte_sequence_compare(common, type == OP_CHARI, cc, &context, backtracks); } + detect_partial_match(common, backtracks); - read_char(common); #ifdef SUPPORT_UTF if (common->utf) { @@ -5275,12 +5290,15 @@ switch(type) else #endif c = *cc; + if (type == OP_CHAR || !char_has_othercase(common, cc)) { + read_char_max(common, c, FALSE); add_jump(compiler, backtracks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c)); return cc + length; } oc = char_othercase(common, c); + read_char_max(common, c > oc ? c : oc, FALSE); bit = c ^ oc; if (is_powerof2(bit)) { @@ -5288,11 +5306,9 @@ switch(type) add_jump(compiler, backtracks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c | bit)); return cc + length; } - OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, c); - OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_C_EQUAL); - OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, oc); - OP_FLAGS(SLJIT_OR | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_C_EQUAL); - add_jump(compiler, backtracks, JUMP(SLJIT_C_ZERO)); + jump[0] = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, c); + add_jump(compiler, backtracks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, oc)); + JUMPHERE(jump[0]); return cc + length; case OP_NOT: @@ -9420,6 +9436,7 @@ switch(re->options & PCRE_NEWLINE_BITS) case PCRE_NEWLINE_ANYCRLF: common->newline = (CHAR_CR << 8) | CHAR_NL; common->nltype = NLTYPE_ANYCRLF; break; default: return; } +common->nlmax = READ_CHAR_ANY; if ((re->options & PCRE_BSR_ANYCRLF) != 0) common->bsr_nltype = NLTYPE_ANYCRLF; else if ((re->options & PCRE_BSR_UNICODE) != 0) @@ -9432,6 +9449,7 @@ else common->bsr_nltype = NLTYPE_ANY; #endif } +common->bsr_nlmax = READ_CHAR_ANY; common->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; common->ctypes = (sljit_sw)(tables + ctypes_offset); common->name_table = ((pcre_uchar *)re) + re->name_table_offset; @@ -9444,6 +9462,23 @@ common->utf = (re->options & PCRE_UTF8) != 0; #ifdef SUPPORT_UCP common->use_ucp = (re->options & PCRE_UCP) != 0; #endif +if (common->utf) + { + if (common->nltype == NLTYPE_ANY) + common->nlmax = 0x2029; + else if (common->nltype == NLTYPE_ANYCRLF) + common->nlmax = (CHAR_CR > CHAR_NL) ? CHAR_CR : CHAR_NL; + else + { + /* We only care about the first newline character. */ + common->nlmax = common->newline & 0xff; + } + + if (common->bsr_nltype == NLTYPE_ANY) + common->bsr_nlmax = 0x2029; + else + common->bsr_nlmax = (CHAR_CR > CHAR_NL) ? CHAR_CR : CHAR_NL; + } #endif /* SUPPORT_UTF */ ccend = bracketend(rootbacktrack.cc); diff --git a/testdata/testinput4 b/testdata/testinput4 index 6345f06..0110267 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -716,7 +716,10 @@ /^a+[a\x{200}]/8 aa -/.\B.\B./8 +/^.\B.\B./8 \x{10123}\x{10124}\x{10125} +/^#[^\x{ffff}]#[^\x{ffff}]#[^\x{ffff}]#/8 + #\x{10000}#\x{100}#\x{10ffff}# + /-- End of testinput4 --/ diff --git a/testdata/testoutput4 b/testdata/testoutput4 index 302b2cf..dcf13b0 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -1263,8 +1263,12 @@ No match aa 0: aa -/.\B.\B./8 +/^.\B.\B./8 \x{10123}\x{10124}\x{10125} 0: \x{10123}\x{10124}\x{10125} +/^#[^\x{ffff}]#[^\x{ffff}]#[^\x{ffff}]#/8 + #\x{10000}#\x{100}#\x{10ffff}# + 0: #\x{10000}#\x{100}#\x{10ffff}# + /-- End of testinput4 --/ |