diff options
author | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-10-16 15:48:03 +0000 |
---|---|---|
committer | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-10-16 15:48:03 +0000 |
commit | 6f209d5f91b2eaaedefbbd9093f992e13cdf2d98 (patch) | |
tree | 8af9d0abddc74d901ded0b86514d9db2629dd152 | |
parent | 8fcc1628b61893fa9719c1f8ae8a032494b98946 (diff) | |
download | pcre-6f209d5f91b2eaaedefbbd9093f992e13cdf2d98.tar.gz |
Support OP_ANYBYTE in JIT when utf8 is disabled and optimizing utf8 character length computation
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@736 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | pcre_jit_compile.c | 79 | ||||
-rw-r--r-- | pcre_jit_test.c | 4 | ||||
-rw-r--r-- | pcre_tables.c | 18 |
3 files changed, 57 insertions, 44 deletions
diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c index d9b338f..87e5bc5 100644 --- a/pcre_jit_compile.c +++ b/pcre_jit_compile.c @@ -467,6 +467,12 @@ switch(*cc) case OP_SKIPZERO: return cc + 1; + case OP_ANYBYTE: +#ifdef SUPPORT_UTF8 + if (common->utf8) return NULL; +#endif + return cc + 1; + case OP_CHAR: case OP_CHARI: case OP_NOT: @@ -1336,8 +1342,7 @@ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); #ifdef SUPPORT_UTF8 if (common->utf8) { - /* Should not found a value between 128 and 192 here. */ - jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 192); + jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL)); JUMPHERE(jump); } @@ -1358,8 +1363,7 @@ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); #ifdef SUPPORT_UTF8 if (common->utf8) { - /* Should not found a value between 128 and 192 here. */ - jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 192); + jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL)); OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0); JUMPHERE(jump); @@ -1383,8 +1387,7 @@ if (common->utf8) /* This can be an extra read in some situations, but hopefully it is a clever early read in most cases. */ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); - /* Should not found a value between 128 and 192 here. */ - jump = CMP(SLJIT_C_LESS, TMP2, 0, SLJIT_IMM, 192); + jump = CMP(SLJIT_C_LESS, TMP2, 0, SLJIT_IMM, 0xc0); add_jump(compiler, &common->utf8readtype8, JUMP(SLJIT_FAST_CALL)); JUMPHERE(jump); return; @@ -1444,7 +1447,7 @@ else static void do_utf8readchar(compiler_common *common) { /* Fast decoding an utf8 character. TMP1 contains the first byte -of the character (>= 192). Return char value in TMP1, length - 1 in TMP2. */ +of the character (>= 0xc0). Return char value in TMP1, length - 1 in TMP2. */ DEFINE_COMPILER; struct sljit_jump *jump; @@ -1527,7 +1530,7 @@ sljit_emit_fast_return(compiler, RETURN_ADDR, 0); static void do_utf8readtype8(compiler_common *common) { /* Fast decoding an utf8 character type. TMP2 contains the first byte -of the character (>= 192) and TMP1 is destroyed. Return value in TMP1. */ +of the character (>= 0xc0) and TMP1 is destroyed. Return value in TMP1. */ DEFINE_COMPILER; struct sljit_jump *jump; struct sljit_jump *compare; @@ -1553,8 +1556,7 @@ sljit_emit_fast_return(compiler, RETURN_ADDR, 0); JUMPHERE(jump); /* We only have types for characters less than 256. */ -OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_w)_pcre_utf8_char_sizes); -OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); +OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_w)_pcre_utf8_char_sizes - 0xc0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); @@ -1598,6 +1600,9 @@ struct sljit_label *newlinelabel = NULL; struct sljit_jump *start; struct sljit_jump *end = NULL; struct sljit_jump *nl = NULL; +#ifdef SUPPORT_UTF8 +struct sljit_jump *singlebyte; +#endif jump_list *newline = NULL; BOOL newlinecheck = FALSE; BOOL readbyte = FALSE; @@ -1668,16 +1673,15 @@ if (readbyte) if (newlinecheck) CMPTO(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, newlinelabel); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); #ifdef SUPPORT_UTF8 if (common->utf8) { - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes); + singlebyte = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); + OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); + JUMPHERE(singlebyte); } -else - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); -#else -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); #endif JUMPHERE(start); @@ -1730,16 +1734,14 @@ else } } +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); #ifdef SUPPORT_UTF8 if (common->utf8) { - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes); + CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start); + OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); } -else - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); -#else -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); #endif JUMPTO(SLJIT_JUMP, start); JUMPHERE(found); @@ -1846,7 +1848,7 @@ leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0); OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); #ifdef SUPPORT_UTF8 if (common->utf8) - OP1(SLJIT_MOV_UB, TMP3, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes); + OP1(SLJIT_MOV, TMP3, 0, TMP1, 0); #endif OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7); OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3); @@ -1857,11 +1859,16 @@ found = JUMP(SLJIT_C_NOT_ZERO); #ifdef SUPPORT_UTF8 if (common->utf8) - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP3, 0); -else - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); -#else + OP1(SLJIT_MOV, TMP1, 0, TMP3, 0); +#endif OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); +#ifdef SUPPORT_UTF8 +if (common->utf8) + { + CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start); + OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); + } #endif JUMPTO(SLJIT_JUMP, start); JUMPHERE(found); @@ -2788,14 +2795,22 @@ switch(type) if (common->utf8) { OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); + jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); + OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); + JUMPHERE(jump[0]); return cc; } #endif OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); return cc; + case OP_ANYBYTE: + check_input_end(common, fallbacks); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); + return cc; + #ifdef SUPPORT_UTF8 #ifdef SUPPORT_UCP case OP_NOTPROP: @@ -3042,17 +3057,20 @@ switch(type) if (c <= 127) { OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); - OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes); if (type == OP_NOT || !char_has_othercase(common, cc)) add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, c)); else { /* Since UTF8 code page is fixed, we know that c is in [a-z] or [A-Z] range. */ - OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x20); - add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, c | 0x20)); + OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x20); + add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, c | 0x20)); } /* Skip the variable-length character. */ - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); + jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); + OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); + JUMPHERE(jump[0]); return cc + length; } else @@ -4744,6 +4762,7 @@ while (cc < ccend) case OP_WORDCHAR: case OP_ANY: case OP_ALLANY: + case OP_ANYBYTE: case OP_NOTPROP: case OP_PROP: case OP_ANYNL: diff --git a/pcre_jit_test.c b/pcre_jit_test.c index 0619921..8bdc2d1 100644 --- a/pcre_jit_test.c +++ b/pcre_jit_test.c @@ -135,6 +135,10 @@ static struct regression_test_case regression_test_cases[] = { { PCRE_CASELESS, 0, "\xff#a", "\xff#\xff\xfe##\xff#A" }, { PCRE_CASELESS, 0, "\xfe", "\xff\xfc#\xfe\xfe" }, { PCRE_CASELESS, 0, "a1", "Aa1" }, + { MA, 0, "\\Ca", "cda" }, + { CMA, 0, "\\Ca", "CDA" }, + { MA, 0, "\\Cx", "cda" }, + { CMA, 0, "\\Cx", "CDA" }, /* Assertions. */ { MUA, 0, "\\b[^A]", "A_B#" }, diff --git a/pcre_tables.c b/pcre_tables.c index ab12b48..45c2211 100644 --- a/pcre_tables.c +++ b/pcre_tables.c @@ -88,25 +88,15 @@ const uschar _pcre_utf8_table4[] = { 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; #ifdef SUPPORT_JIT -/* Full table of the number of extra bytes. See _pcre_utf8_table4 above. */ +/* Full table of the number of extra bytes when the +character code is greater or equal than 0xc0. +See _pcre_utf8_table4 above. */ const uschar _pcre_utf8_char_sizes[] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5, + 3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4, }; #endif |