summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-10-16 15:48:03 +0000
committer: zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-10-16 15:48:03 +0000
commit: 6f209d5f91b2eaaedefbbd9093f992e13cdf2d98 (patch)
tree: 8af9d0abddc74d901ded0b86514d9db2629dd152
parent: 8fcc1628b61893fa9719c1f8ae8a032494b98946 (diff)
download: pcre-6f209d5f91b2eaaedefbbd9093f992e13cdf2d98.tar.gz
Support OP_ANYBYTE in JIT when utf8 is disabled, and optimize the utf8 character length computation
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@736 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--pcre_jit_compile.c79
-rw-r--r--pcre_jit_test.c4
-rw-r--r--pcre_tables.c18
3 files changed, 57 insertions, 44 deletions
diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c
index d9b338f..87e5bc5 100644
--- a/pcre_jit_compile.c
+++ b/pcre_jit_compile.c
@@ -467,6 +467,12 @@ switch(*cc)
case OP_SKIPZERO:
return cc + 1;
+ case OP_ANYBYTE:
+#ifdef SUPPORT_UTF8
+ if (common->utf8) return NULL;
+#endif
+ return cc + 1;
+
case OP_CHAR:
case OP_CHARI:
case OP_NOT:
@@ -1336,8 +1342,7 @@ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
#ifdef SUPPORT_UTF8
if (common->utf8)
{
- /* Should not found a value between 128 and 192 here. */
- jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 192);
+ jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL));
JUMPHERE(jump);
}
@@ -1358,8 +1363,7 @@ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
#ifdef SUPPORT_UTF8
if (common->utf8)
{
- /* Should not found a value between 128 and 192 here. */
- jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 192);
+ jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL));
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
JUMPHERE(jump);
@@ -1383,8 +1387,7 @@ if (common->utf8)
/* This can be an extra read in some situations, but hopefully
it is a clever early read in most cases. */
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
- /* Should not found a value between 128 and 192 here. */
- jump = CMP(SLJIT_C_LESS, TMP2, 0, SLJIT_IMM, 192);
+ jump = CMP(SLJIT_C_LESS, TMP2, 0, SLJIT_IMM, 0xc0);
add_jump(compiler, &common->utf8readtype8, JUMP(SLJIT_FAST_CALL));
JUMPHERE(jump);
return;
@@ -1444,7 +1447,7 @@ else
static void do_utf8readchar(compiler_common *common)
{
/* Fast decoding an utf8 character. TMP1 contains the first byte
-of the character (>= 192). Return char value in TMP1, length - 1 in TMP2. */
+of the character (>= 0xc0). Return char value in TMP1, length - 1 in TMP2. */
DEFINE_COMPILER;
struct sljit_jump *jump;
@@ -1527,7 +1530,7 @@ sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
static void do_utf8readtype8(compiler_common *common)
{
/* Fast decoding an utf8 character type. TMP2 contains the first byte
-of the character (>= 192) and TMP1 is destroyed. Return value in TMP1. */
+of the character (>= 0xc0) and TMP1 is destroyed. Return value in TMP1. */
DEFINE_COMPILER;
struct sljit_jump *jump;
struct sljit_jump *compare;
@@ -1553,8 +1556,7 @@ sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
JUMPHERE(jump);
/* We only have types for characters less than 256. */
-OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_w)_pcre_utf8_char_sizes);
-OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
+OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_w)_pcre_utf8_char_sizes - 0xc0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
@@ -1598,6 +1600,9 @@ struct sljit_label *newlinelabel = NULL;
struct sljit_jump *start;
struct sljit_jump *end = NULL;
struct sljit_jump *nl = NULL;
+#ifdef SUPPORT_UTF8
+struct sljit_jump *singlebyte;
+#endif
jump_list *newline = NULL;
BOOL newlinecheck = FALSE;
BOOL readbyte = FALSE;
@@ -1668,16 +1673,15 @@ if (readbyte)
if (newlinecheck)
CMPTO(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, newlinelabel);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
#ifdef SUPPORT_UTF8
if (common->utf8)
{
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes);
+ singlebyte = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
+ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+ JUMPHERE(singlebyte);
}
-else
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
-#else
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
#endif
JUMPHERE(start);
@@ -1730,16 +1734,14 @@ else
}
}
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
#ifdef SUPPORT_UTF8
if (common->utf8)
{
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes);
+ CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start);
+ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
}
-else
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
-#else
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
#endif
JUMPTO(SLJIT_JUMP, start);
JUMPHERE(found);
@@ -1846,7 +1848,7 @@ leave = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
#ifdef SUPPORT_UTF8
if (common->utf8)
- OP1(SLJIT_MOV_UB, TMP3, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes);
+ OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
#endif
OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3);
@@ -1857,11 +1859,16 @@ found = JUMP(SLJIT_C_NOT_ZERO);
#ifdef SUPPORT_UTF8
if (common->utf8)
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP3, 0);
-else
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
-#else
+ OP1(SLJIT_MOV, TMP1, 0, TMP3, 0);
+#endif
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+#ifdef SUPPORT_UTF8
+if (common->utf8)
+ {
+ CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start);
+ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+ }
#endif
JUMPTO(SLJIT_JUMP, start);
JUMPHERE(found);
@@ -2788,14 +2795,22 @@ switch(type)
if (common->utf8)
{
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
- OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
+ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+ JUMPHERE(jump[0]);
return cc;
}
#endif
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
return cc;
+ case OP_ANYBYTE:
+ check_input_end(common, fallbacks);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ return cc;
+
#ifdef SUPPORT_UTF8
#ifdef SUPPORT_UCP
case OP_NOTPROP:
@@ -3042,17 +3057,20 @@ switch(type)
if (c <= 127)
{
OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
- OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes);
if (type == OP_NOT || !char_has_othercase(common, cc))
add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, c));
else
{
/* Since UTF8 code page is fixed, we know that c is in [a-z] or [A-Z] range. */
- OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x20);
- add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, c | 0x20));
+ OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x20);
+ add_jump(compiler, fallbacks, CMP(SLJIT_C_EQUAL, TMP2, 0, SLJIT_IMM, c | 0x20));
}
/* Skip the variable-length character. */
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1);
+ jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
+ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)_pcre_utf8_char_sizes - 0xc0);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
+ JUMPHERE(jump[0]);
return cc + length;
}
else
@@ -4744,6 +4762,7 @@ while (cc < ccend)
case OP_WORDCHAR:
case OP_ANY:
case OP_ALLANY:
+ case OP_ANYBYTE:
case OP_NOTPROP:
case OP_PROP:
case OP_ANYNL:
diff --git a/pcre_jit_test.c b/pcre_jit_test.c
index 0619921..8bdc2d1 100644
--- a/pcre_jit_test.c
+++ b/pcre_jit_test.c
@@ -135,6 +135,10 @@ static struct regression_test_case regression_test_cases[] = {
{ PCRE_CASELESS, 0, "\xff#a", "\xff#\xff\xfe##\xff#A" },
{ PCRE_CASELESS, 0, "\xfe", "\xff\xfc#\xfe\xfe" },
{ PCRE_CASELESS, 0, "a1", "Aa1" },
+ { MA, 0, "\\Ca", "cda" },
+ { CMA, 0, "\\Ca", "CDA" },
+ { MA, 0, "\\Cx", "cda" },
+ { CMA, 0, "\\Cx", "CDA" },
/* Assertions. */
{ MUA, 0, "\\b[^A]", "A_B#" },
diff --git a/pcre_tables.c b/pcre_tables.c
index ab12b48..45c2211 100644
--- a/pcre_tables.c
+++ b/pcre_tables.c
@@ -88,25 +88,15 @@ const uschar _pcre_utf8_table4[] = {
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
#ifdef SUPPORT_JIT
-/* Full table of the number of extra bytes. See _pcre_utf8_table4 above. */
+/* Full table of the number of extra bytes when the
+character code is greater than or equal to 0xc0.
+See _pcre_utf8_table4 above. */
const uschar _pcre_utf8_char_sizes[] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
- 4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,
+ 3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,
};
#endif