summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2013-12-31 07:57:56 +0000
committer: zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2013-12-31 07:57:56 +0000
commit: 77fe3aba8932f6546f15a8ff06d59b98709f7a64 (patch)
tree: 85d15a054196e9fba66d0c119167d70781932f17
parent: 6979d01ad83f11e5a2d26fd5172f8874501d7761 (diff)
download: pcre-77fe3aba8932f6546f15a8ff06d59b98709f7a64.tar.gz
JIT: Improved matching of newlines.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1423 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--  pcre_jit_compile.c    81
-rw-r--r--  testdata/testinput4    5
-rw-r--r--  testdata/testoutput4   6
3 files changed, 67 insertions, 25 deletions
diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c
index ac1b353..6a9e7db 100644
--- a/pcre_jit_compile.c
+++ b/pcre_jit_compile.c
@@ -363,8 +363,10 @@ typedef struct compiler_common {
BOOL positive_assert;
/* Newline control. */
int nltype;
+ pcre_uint32 nlmax;
int newline;
int bsr_nltype;
+ pcre_uint32 bsr_nlmax;
/* Dollar endonly. */
int endonly;
/* Tables. */
@@ -522,6 +524,8 @@ the start pointers when the end of the capturing group has not yet reached. */
#define GET_LOCAL_BASE(dst, dstw, offset) \
sljit_get_local_base(compiler, (dst), (dstw), (offset))
+#define READ_CHAR_ANY 0x7fffffff
+
static pcre_uchar* bracketend(pcre_uchar* cc)
{
SLJIT_ASSERT((*cc >= OP_ASSERT && *cc <= OP_ASSERTBACK_NOT) || (*cc >= OP_ONCE && *cc <= OP_SCOND));
@@ -2626,7 +2630,7 @@ if (common->utf)
static SLJIT_INLINE void read_char(compiler_common *common)
{
-read_char_max(common, 0x7fffffff, TRUE);
+read_char_max(common, READ_CHAR_ANY, TRUE);
}
static void read_char8_type(compiler_common *common, BOOL full_read)
@@ -2730,28 +2734,35 @@ if (common->utf)
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
}
-static void check_newlinechar(compiler_common *common, int nltype, jump_list **backtracks, BOOL jumpiftrue)
+static void check_newlinechar(compiler_common *common, int nltype, jump_list **backtracks, BOOL jumpifmatch)
{
/* Character comes in TMP1. Checks if it is a newline. TMP2 may be destroyed. */
DEFINE_COMPILER;
+struct sljit_jump *jump;
if (nltype == NLTYPE_ANY)
{
add_jump(compiler, &common->anynewline, JUMP(SLJIT_FAST_CALL));
- add_jump(compiler, backtracks, JUMP(jumpiftrue ? SLJIT_C_NOT_ZERO : SLJIT_C_ZERO));
+ add_jump(compiler, backtracks, JUMP(jumpifmatch ? SLJIT_C_NOT_ZERO : SLJIT_C_ZERO));
}
else if (nltype == NLTYPE_ANYCRLF)
{
- OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, CHAR_CR);
- OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_C_EQUAL);
- OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, CHAR_NL);
- OP_FLAGS(SLJIT_OR | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_C_EQUAL);
- add_jump(compiler, backtracks, JUMP(jumpiftrue ? SLJIT_C_NOT_ZERO : SLJIT_C_ZERO));
+ if (jumpifmatch)
+ {
+ add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR));
+ add_jump(compiler, backtracks, CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL));
+ }
+ else
+ {
+ jump = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
+ add_jump(compiler, backtracks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_NL));
+ JUMPHERE(jump);
+ }
}
else
{
SLJIT_ASSERT(nltype == NLTYPE_FIXED && common->newline < 256);
- add_jump(compiler, backtracks, CMP(jumpiftrue ? SLJIT_C_EQUAL : SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline));
+ add_jump(compiler, backtracks, CMP(jumpifmatch ? SLJIT_C_EQUAL : SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline));
}
}
@@ -2828,6 +2839,10 @@ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
JUMPHERE(jump);
+OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x400);
+OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_C_NOT_ZERO);
+/* This code runs only in 8 bit mode. No need to shift the value. */
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x800);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
@@ -2949,7 +2964,7 @@ if (firstline)
mainloop = LABEL();
/* Continual stores does not cause data dependency. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), common->first_line_end, STR_PTR, 0);
- read_char(common);
+ read_char_max(common, common->nlmax, TRUE);
check_newlinechar(common, common->nltype, &newline, TRUE);
CMPTO(SLJIT_C_LESS, STR_PTR, 0, STR_END, 0, mainloop);
JUMPHERE(end);
@@ -3517,7 +3532,7 @@ firstchar = CMP(SLJIT_C_LESS_EQUAL, STR_PTR, 0, TMP2, 0);
skip_char_back(common);
loop = LABEL();
-read_char(common);
+read_char_max(common, common->nlmax, TRUE);
lastchar = CMP(SLJIT_C_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF)
foundcr = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
@@ -4945,7 +4960,7 @@ switch(type)
case OP_ANY:
detect_partial_match(common, backtracks);
- read_char(common);
+ read_char_max(common, common->nlmax, TRUE);
if (common->nltype == NLTYPE_FIXED && common->newline > 255)
{
jump[0] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff);
@@ -5013,7 +5028,7 @@ switch(type)
case OP_ANYNL:
detect_partial_match(common, backtracks);
- read_char(common);
+ read_char_max(common, common->bsr_nlmax, FALSE);
jump[0] = CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
/* We don't need to handle soft partial matching case. */
end_list = NULL;
@@ -5035,7 +5050,7 @@ switch(type)
case OP_NOT_HSPACE:
case OP_HSPACE:
detect_partial_match(common, backtracks);
- read_char(common);
+ read_char_max(common, 0x3000, type == OP_NOT_HSPACE);
add_jump(compiler, &common->hspace, JUMP(SLJIT_FAST_CALL));
add_jump(compiler, backtracks, JUMP(type == OP_NOT_HSPACE ? SLJIT_C_NOT_ZERO : SLJIT_C_ZERO));
return cc;
@@ -5043,7 +5058,7 @@ switch(type)
case OP_NOT_VSPACE:
case OP_VSPACE:
detect_partial_match(common, backtracks);
- read_char(common);
+ read_char_max(common, 0x2029, type == OP_NOT_VSPACE);
add_jump(compiler, &common->vspace, JUMP(SLJIT_FAST_CALL));
add_jump(compiler, backtracks, JUMP(type == OP_NOT_VSPACE ? SLJIT_C_NOT_ZERO : SLJIT_C_ZERO));
return cc;
@@ -5142,7 +5157,7 @@ switch(type)
else
{
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), LOCALS1, STR_PTR, 0);
- read_char(common);
+ read_char_max(common, common->nlmax, TRUE);
add_jump(compiler, backtracks, CMP(SLJIT_C_NOT_EQUAL, STR_PTR, 0, STR_END, 0));
add_jump(compiler, &common->anynewline, JUMP(SLJIT_FAST_CALL));
add_jump(compiler, backtracks, JUMP(SLJIT_C_ZERO));
@@ -5190,7 +5205,7 @@ switch(type)
else
{
skip_char_back(common);
- read_char(common);
+ read_char_max(common, common->nlmax, TRUE);
check_newlinechar(common, common->nltype, backtracks, FALSE);
}
JUMPHERE(jump[0]);
@@ -5265,8 +5280,8 @@ switch(type)
#endif
return byte_sequence_compare(common, type == OP_CHARI, cc, &context, backtracks);
}
+
detect_partial_match(common, backtracks);
- read_char(common);
#ifdef SUPPORT_UTF
if (common->utf)
{
@@ -5275,12 +5290,15 @@ switch(type)
else
#endif
c = *cc;
+
if (type == OP_CHAR || !char_has_othercase(common, cc))
{
+ read_char_max(common, c, FALSE);
add_jump(compiler, backtracks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c));
return cc + length;
}
oc = char_othercase(common, c);
+ read_char_max(common, c > oc ? c : oc, FALSE);
bit = c ^ oc;
if (is_powerof2(bit))
{
@@ -5288,11 +5306,9 @@ switch(type)
add_jump(compiler, backtracks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c | bit));
return cc + length;
}
- OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, c);
- OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_UNUSED, 0, SLJIT_C_EQUAL);
- OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, oc);
- OP_FLAGS(SLJIT_OR | SLJIT_SET_E, TMP2, 0, TMP2, 0, SLJIT_C_EQUAL);
- add_jump(compiler, backtracks, JUMP(SLJIT_C_ZERO));
+ jump[0] = CMP(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, c);
+ add_jump(compiler, backtracks, CMP(SLJIT_C_NOT_EQUAL, TMP1, 0, SLJIT_IMM, oc));
+ JUMPHERE(jump[0]);
return cc + length;
case OP_NOT:
@@ -9420,6 +9436,7 @@ switch(re->options & PCRE_NEWLINE_BITS)
case PCRE_NEWLINE_ANYCRLF: common->newline = (CHAR_CR << 8) | CHAR_NL; common->nltype = NLTYPE_ANYCRLF; break;
default: return;
}
+common->nlmax = READ_CHAR_ANY;
if ((re->options & PCRE_BSR_ANYCRLF) != 0)
common->bsr_nltype = NLTYPE_ANYCRLF;
else if ((re->options & PCRE_BSR_UNICODE) != 0)
@@ -9432,6 +9449,7 @@ else
common->bsr_nltype = NLTYPE_ANY;
#endif
}
+common->bsr_nlmax = READ_CHAR_ANY;
common->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
common->ctypes = (sljit_sw)(tables + ctypes_offset);
common->name_table = ((pcre_uchar *)re) + re->name_table_offset;
@@ -9444,6 +9462,23 @@ common->utf = (re->options & PCRE_UTF8) != 0;
#ifdef SUPPORT_UCP
common->use_ucp = (re->options & PCRE_UCP) != 0;
#endif
+if (common->utf)
+ {
+ if (common->nltype == NLTYPE_ANY)
+ common->nlmax = 0x2029;
+ else if (common->nltype == NLTYPE_ANYCRLF)
+ common->nlmax = (CHAR_CR > CHAR_NL) ? CHAR_CR : CHAR_NL;
+ else
+ {
+ /* We only care about the first newline character. */
+ common->nlmax = common->newline & 0xff;
+ }
+
+ if (common->bsr_nltype == NLTYPE_ANY)
+ common->bsr_nlmax = 0x2029;
+ else
+ common->bsr_nlmax = (CHAR_CR > CHAR_NL) ? CHAR_CR : CHAR_NL;
+ }
#endif /* SUPPORT_UTF */
ccend = bracketend(rootbacktrack.cc);
diff --git a/testdata/testinput4 b/testdata/testinput4
index 6345f06..0110267 100644
--- a/testdata/testinput4
+++ b/testdata/testinput4
@@ -716,7 +716,10 @@
/^a+[a\x{200}]/8
aa
-/.\B.\B./8
+/^.\B.\B./8
\x{10123}\x{10124}\x{10125}
+/^#[^\x{ffff}]#[^\x{ffff}]#[^\x{ffff}]#/8
+ #\x{10000}#\x{100}#\x{10ffff}#
+
/-- End of testinput4 --/
diff --git a/testdata/testoutput4 b/testdata/testoutput4
index 302b2cf..dcf13b0 100644
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@@ -1263,8 +1263,12 @@ No match
aa
0: aa
-/.\B.\B./8
+/^.\B.\B./8
\x{10123}\x{10124}\x{10125}
0: \x{10123}\x{10124}\x{10125}
+/^#[^\x{ffff}]#[^\x{ffff}]#[^\x{ffff}]#/8
+ #\x{10000}#\x{100}#\x{10ffff}#
+ 0: #\x{10000}#\x{100}#\x{10ffff}#
+
/-- End of testinput4 --/