summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorzherczeg <zherczeg@6239d852-aaf2-0410-a92c-79f79f948069>2018-09-12 19:06:29 +0000
committerzherczeg <zherczeg@6239d852-aaf2-0410-a92c-79f79f948069>2018-09-12 19:06:29 +0000
commit62265bf56dc6b40d998bcfcf8d5c207df16e96e1 (patch)
tree5bbfb1a5a56725e7ff78ca3690a36e1ead795721 /src
parent6dab29f548322f43a860bfe8bf1d36eb091a3de0 (diff)
downloadpcre2-62265bf56dc6b40d998bcfcf8d5c207df16e96e1.tar.gz
Start working on invalid utf subject support in JIT.
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1003 6239d852-aaf2-0410-a92c-79f79f948069
Diffstat (limited to 'src')
-rw-r--r--src/pcre2.h.generic1
-rw-r--r--src/pcre2.h.in1
-rw-r--r--src/pcre2_jit_compile.c1403
-rw-r--r--src/pcre2_jit_test.c466
4 files changed, 1636 insertions, 235 deletions
diff --git a/src/pcre2.h.generic b/src/pcre2.h.generic
index 3d2feb7..f6ed686 100644
--- a/src/pcre2.h.generic
+++ b/src/pcre2.h.generic
@@ -164,6 +164,7 @@ D is inspected during pcre2_dfa_match() execution
#define PCRE2_JIT_COMPLETE 0x00000001u /* For full matching */
#define PCRE2_JIT_PARTIAL_SOFT 0x00000002u
#define PCRE2_JIT_PARTIAL_HARD 0x00000004u
+#define PCRE2_JIT_INVALID_UTF 0x00000100u
/* These are for pcre2_match(), pcre2_dfa_match(), and pcre2_jit_match(). Note
that PCRE2_ANCHORED and PCRE2_NO_UTF_CHECK can also be passed to these
diff --git a/src/pcre2.h.in b/src/pcre2.h.in
index a9396e0..5bb4bb6 100644
--- a/src/pcre2.h.in
+++ b/src/pcre2.h.in
@@ -164,6 +164,7 @@ D is inspected during pcre2_dfa_match() execution
#define PCRE2_JIT_COMPLETE 0x00000001u /* For full matching */
#define PCRE2_JIT_PARTIAL_SOFT 0x00000002u
#define PCRE2_JIT_PARTIAL_HARD 0x00000004u
+#define PCRE2_JIT_INVALID_UTF 0x00000100u
/* These are for pcre2_match(), pcre2_dfa_match(), and pcre2_jit_match(). Note
that PCRE2_ANCHORED and PCRE2_NO_UTF_CHECK can also be passed to these
diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c
index 32e985b..89d302c 100644
--- a/src/pcre2_jit_compile.c
+++ b/src/pcre2_jit_compile.c
@@ -477,12 +477,18 @@ typedef struct compiler_common {
BOOL alt_circumflex;
#ifdef SUPPORT_UNICODE
BOOL utf;
+ BOOL invalid_utf;
BOOL use_ucp;
jump_list *getucd;
#if PCRE2_CODE_UNIT_WIDTH == 8
jump_list *utfreadchar;
- jump_list *utfreadchar16;
jump_list *utfreadtype8;
+ jump_list *utfpeakcharback;
+#endif
+#if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16
+ jump_list *utfreadchar_invalid;
+ jump_list *utfmoveback_invalid;
+ jump_list *utfpeakcharback_invalid;
#endif
#endif /* SUPPORT_UNICODE */
} compiler_common;
@@ -616,7 +622,173 @@ the start pointers when the end of the capturing group has not yet reached. */
#define READ_CHAR_MAX 0x7fffffff
-#define INVALID_UTF_CHAR 888
+#define INVALID_UTF_CHAR -1
+#define UNASSIGNED_UTF_CHAR 888
+
+#if defined SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+
+#define GETCHARINC_INVALID(c, ptr, end, invalid_action) \
+ { \
+ if (ptr[0] <= 0x7f) \
+ c = *ptr++; \
+ else if (ptr + 1 < end && ptr[1] >= 0x80 && ptr[1] < 0xc0) \
+ { \
+ c = ptr[1] - 0x80; \
+ \
+ if (ptr[0] >= 0xc0 && ptr[0] <= 0xdf) \
+ { \
+ c |= (ptr[0] - 0xc0) << 6; \
+ ptr += 2; \
+ } \
+ else if (ptr + 2 < end && ptr[2] >= 0x80 && ptr[2] < 0xc0) \
+ { \
+ c = c << 6 | (ptr[2] - 0x80); \
+ \
+ if (ptr[0] >= 0xe0 && ptr[0] <= 0xef) \
+ { \
+ c |= (ptr[0] - 0xe0) << 12; \
+ ptr += 3; \
+ } \
+ else if (ptr + 3 < end && ptr[3] >= 0x80 && ptr[3] < 0xc0) \
+ { \
+ c = c << 6 | (ptr[3] - 0x80); \
+ \
+ if (ptr[0] >= 0xf0 && ptr[0] <= 0xf4) \
+ { \
+ c |= (ptr[0] - 0xf0) << 18; \
+ ptr += 4; \
+ \
+ if (c >= 0x110000) \
+ { \
+ invalid_action; \
+ } \
+ } \
+ else \
+ { \
+ invalid_action; \
+ } \
+ } \
+ else \
+ { \
+ invalid_action; \
+ } \
+ } \
+ else \
+ { \
+ invalid_action; \
+ } \
+ } \
+ else \
+ { \
+ invalid_action; \
+ } \
+ }
+
+#define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \
+ { \
+ if (ptr[-1] <= 0x7f) \
+ c = *ptr--; \
+ else if (ptr - 1 > start && ptr[-1] >= 0x80 && ptr[-1] < 0xc0) \
+ { \
+ c = ptr[-1] - 0x80; \
+ \
+ if (ptr[-2] >= 0xc0 && ptr[-2] <= 0xdf) \
+ { \
+ c |= (ptr[-2] - 0xc0) << 6; \
+ ptr -= 2; \
+ } \
+ else if (ptr - 2 > start && ptr[-2] >= 0x80 && ptr[-2] < 0xc0) \
+ { \
+ c = c << 6 | (ptr[-2] - 0x80); \
+ \
+ if (ptr[-3] >= 0xe0 && ptr[-3] <= 0xef) \
+ { \
+ c |= (ptr[-3] - 0xe0) << 12; \
+ ptr -= 3; \
+ } \
+ else if (ptr - 3 > start && ptr[-3] >= 0x80 && ptr[-3] < 0xc0) \
+ { \
+ c = c << 6 | (ptr[-3] - 0x80); \
+ \
+ if (ptr[-4] >= 0xf0 && ptr[-4] <= 0xf4) \
+ { \
+ c |= (ptr[-4] - 0xf0) << 18; \
+ ptr -= 4; \
+ \
+ if (c >= 0x110000) \
+ { \
+ invalid_action; \
+ } \
+ } \
+ else \
+ { \
+ invalid_action; \
+ } \
+ } \
+ else \
+ { \
+ invalid_action; \
+ } \
+ } \
+ else \
+ { \
+ invalid_action; \
+ } \
+ } \
+ else \
+ { \
+ invalid_action; \
+ } \
+ }
+
+#elif PCRE2_CODE_UNIT_WIDTH == 16
+
+#define GETCHARINC_INVALID(c, ptr, end, invalid_action) \
+ { \
+ if (ptr[0] < 0xd800 || ptr[0] >= 0xe000) \
+ c = *ptr++; \
+ else if (ptr[0] < 0xdc00 && ptr + 1 < end && ptr[1] >= 0xdc00 && ptr[1] < 0xe000) \
+ { \
+ c = (((ptr[0] - 0xd800) << 10) | (ptr[1] - 0xdc00)) + 0x10000; \
+ ptr += 2; \
+ } \
+ else \
+ { \
+ invalid_action; \
+ } \
+ }
+
+#define GETCHARBACK_INVALID(c, ptr, start, invalid_action) \
+ { \
+ if (ptr[-1] < 0xd800 || ptr[-1] >= 0xe000) \
+ c = *ptr--; \
+ else if (ptr[-1] >= 0xdc00 && ptr - 1 > start && ptr[-2] >= 0xd800 && ptr[-2] < 0xdc00) \
+ { \
+ c = (((ptr[-2] - 0xd800) << 10) | (ptr[-1] - 0xdc00)) + 0x10000; \
+ ptr -= 2; \
+ } \
+ else \
+ { \
+ invalid_action; \
+ } \
+ }
+
+
+#elif PCRE2_CODE_UNIT_WIDTH == 32
+
+#define GETCHARINC_INVALID(c, ptr, end, invalid_action) \
+ { \
+ if (ptr[0] < 0x110000) \
+ c = *ptr++; \
+ else \
+ { \
+ invalid_action; \
+ } \
+ }
+
+#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
+#endif /* SUPPORT_UNICODE */
static PCRE2_SPTR bracketend(PCRE2_SPTR cc)
{
@@ -3159,95 +3331,128 @@ else
JUMPHERE(jump);
}
-static void peek_char(compiler_common *common, sljit_u32 max)
+static void peek_char(compiler_common *common, sljit_u32 max, sljit_s32 dst, sljit_sw dstw)
{
/* Reads the character into TMP1, keeps STR_PTR.
-Does not check STR_END. TMP2 Destroyed. */
+Does not check STR_END. TMP2, dst, RETURN_ADDR Destroyed. */
DEFINE_COMPILER;
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
struct sljit_jump *jump;
-#endif
+#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */
SLJIT_UNUSED_ARG(max);
+SLJIT_UNUSED_ARG(dst);
+SLJIT_UNUSED_ARG(dstw);
-OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf)
{
if (max < 128) return;
- jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
+ jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80);
+ OP1(SLJIT_MOV, dst, dstw, STR_PTR, 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
- add_jump(compiler, &common->utfreadchar, JUMP(SLJIT_FAST_CALL));
- OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+ add_jump(compiler, common->invalid_utf ? &common->utfreadchar_invalid : &common->utfreadchar, JUMP(SLJIT_FAST_CALL));
+ OP1(SLJIT_MOV, STR_PTR, 0, dst, dstw);
JUMPHERE(jump);
}
-#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */
-
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 16
+#elif PCRE2_CODE_UNIT_WIDTH == 16
if (common->utf)
{
if (max < 0xd800) return;
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
- jump = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800 - 1);
- /* TMP2 contains the high surrogate. */
- OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
- OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x40);
- OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10);
- OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3ff);
- OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+
+ if (common->invalid_utf)
+ {
+ jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
+ add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
+ }
+ else
+ {
+ /* TMP2 contains the high surrogate. */
+ jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800);
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+ OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10);
+ OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00);
+ OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
+ }
JUMPHERE(jump);
}
-#endif
-}
-
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
-
-static BOOL is_char7_bitset(const sljit_u8 *bitset, BOOL nclass)
-{
-/* Tells whether the character codes below 128 are enough
-to determine a match. */
-const sljit_u8 value = nclass ? 0xff : 0;
-const sljit_u8 *end = bitset + 32;
+#elif PCRE2_CODE_UNIT_WIDTH == 32
-bitset += 16;
-do
- {
- if (*bitset++ != value)
- return FALSE;
- }
-while (bitset < end);
-return TRUE;
+#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
+#endif /* SUPPORT_UNICODE */
}
-static void read_char7_type(compiler_common *common, BOOL full_read)
+static void peek_char_back(compiler_common *common, sljit_u32 max, jump_list **backtracks)
{
-/* Reads the precise character type of a character into TMP1, if the character
-is less than 128. Otherwise it returns with zero. Does not check STR_END. The
-full_read argument tells whether characters above max are accepted or not. */
+/* Reads one character back without moving STR_PTR. TMP2 must
+contain the start of the subject buffer. Affects TMP1, TMP2, and RETURN_ADDR. */
DEFINE_COMPILER;
-struct sljit_jump *jump;
-SLJIT_ASSERT(common->utf);
+#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
+struct sljit_jump *jump;
+#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */
-OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+SLJIT_UNUSED_ARG(max);
+SLJIT_UNUSED_ARG(backtracks);
-OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
-if (full_read)
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+if (common->utf)
{
- jump = CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xc0);
- OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(utf8_table4) - 0xc0);
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+ if (max < 128) return;
+
+ jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80);
+ if (common->invalid_utf)
+ {
+ add_jump(compiler, &common->utfpeakcharback_invalid, JUMP(SLJIT_FAST_CALL));
+ if (backtracks != NULL)
+ add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
+ }
+ else
+ add_jump(compiler, &common->utfpeakcharback, JUMP(SLJIT_FAST_CALL));
JUMPHERE(jump);
}
-}
+#elif PCRE2_CODE_UNIT_WIDTH == 16
+if (common->utf)
+ {
+ if (max < 0xd800) return;
-#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
+ if (common->invalid_utf)
+ {
+ jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xd800);
+ add_jump(compiler, &common->utfpeakcharback_invalid, JUMP(SLJIT_FAST_CALL));
+ if (backtracks != NULL)
+ add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
+ }
+ else
+ {
+ OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xdc00);
+ jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xdc00);
+ /* TMP2 contains the low surrogate. */
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
+ OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x10000);
+ OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+ OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 10);
+ OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
+ }
+ JUMPHERE(jump);
+ }
+#elif PCRE2_CODE_UNIT_WIDTH == 32
-static void read_char_range(compiler_common *common, sljit_u32 min, sljit_u32 max, BOOL update_str_ptr)
+#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
+#endif /* SUPPORT_UNICODE */
+}
+
+static void read_char_range(compiler_common *common, sljit_u32 min, sljit_u32 max,
+ jump_list **backtracks, BOOL update_str_ptr)
{
/* Reads the precise value of a character into TMP1, if the character is
between min and max (c >= min && c <= max). Otherwise it returns with a value
@@ -3263,6 +3468,7 @@ struct sljit_jump *jump2;
SLJIT_UNUSED_ARG(update_str_ptr);
SLJIT_UNUSED_ARG(min);
SLJIT_UNUSED_ARG(max);
+SLJIT_UNUSED_ARG(backtracks);
SLJIT_ASSERT(min <= max);
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
@@ -3273,6 +3479,16 @@ if (common->utf)
{
if (max < 128 && !update_str_ptr) return;
+ if (common->invalid_utf)
+ {
+ jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80);
+ add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
+ if (backtracks != NULL)
+ add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
+ JUMPHERE(jump);
+ return;
+ }
+
jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0);
if (min >= 0x10000)
{
@@ -3319,7 +3535,9 @@ if (common->utf)
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, RETURN_ADDR, 0);
}
else if (max >= 0x800)
- add_jump(compiler, (max < 0x10000) ? &common->utfreadchar16 : &common->utfreadchar, JUMP(SLJIT_FAST_CALL));
+ {
+ add_jump(compiler, &common->utfreadchar, JUMP(SLJIT_FAST_CALL));
+ }
else if (max < 128)
{
OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0);
@@ -3346,26 +3564,36 @@ if (common->utf)
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 16
if (common->utf)
{
- if (max >= 0x10000)
+ if (max < 0xd800 && !update_str_ptr) return;
+
+ if (max >= 0x10000 || common->invalid_utf)
{
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
- jump = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800 - 1);
- /* TMP2 contains the high surrogate. */
- OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
- OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x40);
- OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10);
- OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
- OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3ff);
- OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+
+ if (common->invalid_utf)
+ {
+ jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
+ add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
+ if (backtracks != NULL)
+ add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
+ }
+ else
+ {
+ jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800);
+ /* TMP2 contains the high surrogate. */
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+ OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+ OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00);
+ OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
+ }
JUMPHERE(jump);
return;
}
- if (max < 0xd800 && !update_str_ptr) return;
-
/* Skip low surrogate if necessary. */
OP2(SLJIT_SUB, TMP2, 0, TMP1, 0, SLJIT_IMM, 0xd800);
- jump = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800 - 1);
+ jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800);
if (update_str_ptr)
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
if (max >= 0xd800)
@@ -3375,12 +3603,62 @@ if (common->utf)
#endif
}
-static SLJIT_INLINE void read_char(compiler_common *common)
+#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
+
+static BOOL is_char7_bitset(const sljit_u8 *bitset, BOOL nclass)
+{
+/* Tells whether the character codes below 128 are enough
+to determine a match. */
+const sljit_u8 value = nclass ? 0xff : 0;
+const sljit_u8 *end = bitset + 32;
+
+bitset += 16;
+do
+ {
+ if (*bitset++ != value)
+ return FALSE;
+ }
+while (bitset < end);
+return TRUE;
+}
+
+static void read_char7_type(compiler_common *common, jump_list **backtracks, BOOL negated)
{
-read_char_range(common, 0, READ_CHAR_MAX, TRUE);
+/* Reads the precise character type of a character into TMP1, if the character
+is less than 128. Otherwise it returns with zero. Does not check STR_END. The
+full_read argument tells whether characters above max are accepted or not. */
+DEFINE_COMPILER;
+struct sljit_jump *jump;
+
+SLJIT_ASSERT(common->utf);
+
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+
+OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
+
+if (negated)
+ {
+ jump = CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x80);
+
+ if (common->invalid_utf)
+ {
+ add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
+ add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
+ }
+ else
+ {
+ OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(utf8_table4) - 0xc0);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
+ }
+ JUMPHERE(jump);
+ }
}
-static void read_char8_type(compiler_common *common, BOOL update_str_ptr)
+#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
+
+static void read_char8_type(compiler_common *common, jump_list **backtracks, BOOL negated)
{
/* Reads the character type into TMP1, updates STR_PTR. Does not check STR_END. */
DEFINE_COMPILER;
@@ -3391,7 +3669,8 @@ struct sljit_jump *jump;
struct sljit_jump *jump2;
#endif
-SLJIT_UNUSED_ARG(update_str_ptr);
+SLJIT_UNUSED_ARG(backtracks);
+SLJIT_UNUSED_ARG(negated);
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
@@ -3399,62 +3678,124 @@ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf)
{
- /* This can be an extra read in some situations, but hopefully
- it is needed in most cases. */
+ /* The result of this read may be unused, but saves an "else" part. */
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
- jump = CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0xc0);
- if (!update_str_ptr)
+ jump = CMP(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x80);
+
+ if (!negated)
{
+ if (common->invalid_utf)
+ add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
+
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
- OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
+ OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc0);
+ if (common->invalid_utf)
+ add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x1f));
+
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
- OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f);
+ OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80);
+ if (common->invalid_utf)
+ add_jump(compiler, backtracks, CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0x3f));
+
OP2(SLJIT_OR, TMP2, 0, TMP2, 0, TMP1, 0);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 255);
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
JUMPHERE(jump2);
}
+ else if (common->invalid_utf)
+ {
+ add_jump(compiler, &common->utfreadchar_invalid, JUMP(SLJIT_FAST_CALL));
+ add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR));
+
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
+ jump2 = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 255);
+ OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
+ JUMPHERE(jump2);
+ }
else
add_jump(compiler, &common->utfreadtype8, JUMP(SLJIT_FAST_CALL));
+
JUMPHERE(jump);
return;
}
#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */
+#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 32
+if (common->invalid_utf && negated)
+ add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x110000));
+#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 32 */
+
#if PCRE2_CODE_UNIT_WIDTH != 8
/* The ctypes array contains only 256 values. */
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
jump = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 255);
-#endif
+#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes);
#if PCRE2_CODE_UNIT_WIDTH != 8
JUMPHERE(jump);
-#endif
+#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 16
-if (common->utf && update_str_ptr)
+if (common->utf && negated)
{
/* Skip low surrogate if necessary. */
+ if (!common->invalid_utf)
+ {
+ OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xd800);
+ jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x400);
+ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+ JUMPHERE(jump);
+ }
+
OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xd800);
- jump = CMP(SLJIT_GREATER, TMP2, 0, SLJIT_IMM, 0xdc00 - 0xd800 - 1);
+ jump = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0xe000 - 0xd800);
+ add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x400));
+ add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
+
+ OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+ OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xdc00);
+ add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x400));
+
JUMPHERE(jump);
}
#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 16 */
}
-static void skip_char_back(compiler_common *common)
+static void move_back(compiler_common *common, jump_list **backtracks, BOOL must_be_valid)
{
-/* Goes one character back. Affects STR_PTR and TMP1. Does not check begin. */
+/* Goes one character back. TMP2 must contain the start of
+the subject buffer. Affects STR_PTR and TMP1. Does not modify
+STR_PTR for invalid character sequences. */
DEFINE_COMPILER;
+
+SLJIT_UNUSED_ARG(backtracks);
+SLJIT_UNUSED_ARG(must_be_valid);
+
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
+struct sljit_jump *jump;
+#endif
+
+#ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8
struct sljit_label *label;
if (common->utf)
{
+ if (!must_be_valid && common->invalid_utf)
+ {
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1));
+ OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+ jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x80);
+ add_jump(compiler, &common->utfmoveback_invalid, JUMP(SLJIT_FAST_CALL));
+ if (backtracks != NULL)
+ add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0));
+ JUMPHERE(jump);
+ return;
+ }
+
label = LABEL();
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1));
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
@@ -3467,6 +3808,18 @@ if (common->utf)
{
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1));
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+
+ if (!must_be_valid && common->invalid_utf)
+ {
+ OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+ jump = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xe000 - 0xd800);
+ add_jump(compiler, &common->utfmoveback_invalid, JUMP(SLJIT_FAST_CALL));
+ if (backtracks != NULL)
+ add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, 0));
+ JUMPHERE(jump);
+ return;
+ }
+
/* Skip low surrogate if necessary. */
OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00);
OP2(SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xdc00);
@@ -3475,8 +3828,14 @@ if (common->utf)
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
return;
}
-#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16] */
-#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */
+#elif PCRE2_CODE_UNIT_WIDTH == 32
+if (common->invalid_utf && !must_be_valid)
+ {
+ OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1));
+ add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x110000));
+ }
+#endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */
+#endif /* SUPPORT_UNICODE */
OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
}
@@ -3519,13 +3878,12 @@ else
static void do_utfreadchar(compiler_common *common)
{
/* Fast decoding a UTF-8 character. TMP1 contains the first byte
-of the character (>= 0xc0). Return char value in TMP1, length in TMP2. */
+of the character (>= 0xc0). Return char value in TMP1. */
DEFINE_COMPILER;
struct sljit_jump *jump;
sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
-OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
@@ -3534,13 +3892,12 @@ OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
jump = JUMP(SLJIT_NOT_ZERO);
/* Two byte sequence. */
+OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3000);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(2));
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
JUMPHERE(jump);
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
-OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x800);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
@@ -3548,55 +3905,18 @@ OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x10000);
jump = JUMP(SLJIT_NOT_ZERO);
/* Three byte sequence. */
+OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0000);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
-OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(3));
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
/* Four byte sequence. */
JUMPHERE(jump);
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2));
-OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000);
-OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xf0000);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
-OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
-OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(4));
-sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
-}
-
-static void do_utfreadchar16(compiler_common *common)
-{
-/* Fast decoding a UTF-8 character. TMP1 contains the first byte
-of the character (>= 0xc0). Return value in TMP1. */
-DEFINE_COMPILER;
-struct sljit_jump *jump;
-
-sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
-OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
-OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f);
-OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
-OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
-OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-
-/* Searching for the first zero. */
-OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
-jump = JUMP(SLJIT_NOT_ZERO);
-/* Two byte sequence. */
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
-sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
-
-JUMPHERE(jump);
-OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x400);
-OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO);
-/* This code runs only in 8 bit mode. No need to shift the value. */
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
-OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
-OP2(SLJIT_XOR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x800);
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
-/* Three byte sequence. */
-OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
}
@@ -3636,8 +3956,476 @@ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
}
+static void do_utfreadchar_invalid(compiler_common *common)
+{
+/* Slow decoding a UTF-8 character. TMP1 contains the first byte
+of the character (>= 0xc0). Return char value in TMP1. */
+DEFINE_COMPILER;
+struct sljit_jump *jump;
+struct sljit_jump *buffer_end_close;
+struct sljit_label *three_byte_entry;
+struct sljit_label *exit_invalid_label;
+struct sljit_jump *exit_invalid[11];
+
+sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
+
+/* Continuation byte, or invalid two byte sequence. */
+exit_invalid[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc2);
+
+/* Usually more than 3 characters remained in the subject buffer. */
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0);
+
+buffer_end_close = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0);
+
+/* Two-byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3));
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
+exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+
+OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
+jump = JUMP(SLJIT_NOT_ZERO);
+
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(jump);
+
+/* Three-byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+
+OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x10000);
+jump = JUMP(SLJIT_NOT_ZERO);
+
+three_byte_entry = LABEL();
+
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x2d800);
+exit_invalid[3] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+
+exit_invalid[4] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(jump);
+
+/* Four-byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+exit_invalid[5] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc10000);
+exit_invalid[6] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x100000);
+
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(buffer_end_close);
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+exit_invalid[7] = CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0);
+
+/* Two-byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+exit_invalid[8] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+
+OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x800);
+jump = JUMP(SLJIT_NOT_ZERO);
+
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(jump);
+
+/* Three-byte sequence. */
+exit_invalid[9] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
+
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+exit_invalid[10] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+
+/* One will be substracted from STR_PTR later. */
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+
+/* Four byte sequences are not possible. */
+CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x30000, three_byte_entry);
+
+exit_invalid_label = LABEL();
+sljit_set_label(exit_invalid[3], exit_invalid_label);
+sljit_set_label(exit_invalid[5], exit_invalid_label);
+sljit_set_label(exit_invalid[7], exit_invalid_label);
+sljit_set_label(exit_invalid[8], exit_invalid_label);
+
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+
+exit_invalid_label = LABEL();
+sljit_set_label(exit_invalid[0], exit_invalid_label);
+sljit_set_label(exit_invalid[4], exit_invalid_label);
+sljit_set_label(exit_invalid[6], exit_invalid_label);
+sljit_set_label(exit_invalid[9], exit_invalid_label);
+sljit_set_label(exit_invalid[10], exit_invalid_label);
+
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(exit_invalid[1]);
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+
+JUMPHERE(exit_invalid[2]);
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+}
+
+static void do_utfmoveback_invalid(compiler_common *common)
+{
+/* Goes one character back. */
+DEFINE_COMPILER;
+sljit_s32 i;
+struct sljit_jump *jump;
+struct sljit_jump *buffer_start_close;
+struct sljit_label *exit_ok_label;
+struct sljit_label *exit_invalid_label;
+struct sljit_jump *exit_invalid[7];
+
+sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
+
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
+exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xc0);
+
+/* Two-byte sequence. */
+buffer_start_close = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
+
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2));
+
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0);
+jump = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x20);
+
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 1);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+/* Three-byte sequence. */
+JUMPHERE(jump);
+exit_invalid[1] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, -0x40);
+
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1));
+
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0);
+jump = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x10);
+
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 1);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+/* Four-byte sequence. */
+JUMPHERE(jump);
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0 - 0x80);
+exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x40);
+
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xf0);
+exit_invalid[3] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x05);
+
+exit_ok_label = LABEL();
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 1);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+/* Two-byte sequence. */
+JUMPHERE(buffer_start_close);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+
+exit_invalid[4] = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
+
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0);
+CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x20, exit_ok_label);
+
+/* Three-byte sequence. */
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+exit_invalid[5] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, -0x40);
+exit_invalid[6] = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
+
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0);
+CMPTO(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x10, exit_ok_label);
+
+/* Four-byte sequences are not possible. */
+
+exit_invalid_label = LABEL();
+sljit_set_label(exit_invalid[5], exit_invalid_label);
+sljit_set_label(exit_invalid[6], exit_invalid_label);
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3));
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(exit_invalid[4]);
+/* -2 + 4 = 2 */
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+
+exit_invalid_label = LABEL();
+for (i = 0; i < 4; i++)
+ sljit_set_label(exit_invalid[i], exit_invalid_label);
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(4));
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+}
+
+static void do_utfpeakcharback(compiler_common *common)
+{
+/* Peak a character back. */
+DEFINE_COMPILER;
+struct sljit_jump *jump[2];
+
+sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
+
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0);
+jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x20);
+
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3));
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0);
+jump[1] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x10);
+
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-4));
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xe0 - 0x80);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf0);
+OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+
+JUMPHERE(jump[1]);
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+
+JUMPHERE(jump[0]);
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
+OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x80);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+}
+
+static void do_utfpeakcharback_invalid(compiler_common *common)
+{
+/* Peak a character back. */
+DEFINE_COMPILER;
+sljit_s32 i;
+struct sljit_jump *jump[2];
+struct sljit_label *two_byte_entry;
+struct sljit_label *three_byte_entry;
+struct sljit_label *exit_invalid_label;
+struct sljit_jump *exit_invalid[8];
+
+sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
+
+OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(3));
+exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xc0);
+jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, STR_PTR, 0);
+
+/* Two-byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2);
+jump[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x1e);
+
+two_byte_entry = LABEL();
+OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x02);
+OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(jump[1]);
+OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2 - 0x80);
+exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+
+/* Three-byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3));
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xe0);
+jump[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x10);
+
+three_byte_entry = LABEL();
+OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 12);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+
+exit_invalid[2] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+exit_invalid[3] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x800);
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(jump[1]);
+OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xe0 - 0x80);
+exit_invalid[4] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 12);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+
+/* Four-byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-4));
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf0);
+OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 18);
+/* ADD is used instead of OR because of the SUB 0x10000 above. */
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
+
+exit_invalid[5] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x100000);
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(jump[0]);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
+jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, STR_PTR, 0);
+
+/* Two-byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2);
+CMPTO(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x1e, two_byte_entry);
+
+OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2 - 0x80);
+exit_invalid[6] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x40);
+OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6);
+OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0);
+
+/* Three-byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-3));
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xe0);
+CMPTO(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x10, three_byte_entry);
+
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(jump[0]);
+exit_invalid[7] = CMP(SLJIT_GREATER, TMP2, 0, STR_PTR, 0);
+
+/* Two-byte sequence. */
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x80);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xc2);
+CMPTO(SLJIT_LESS, TMP2, 0, SLJIT_IMM, 0x1e, two_byte_entry);
+
+exit_invalid_label = LABEL();
+for (i = 0; i < 8; i++)
+ sljit_set_label(exit_invalid[i], exit_invalid_label);
+
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+}
+
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
+#if PCRE2_CODE_UNIT_WIDTH == 16
+
+static void do_utfreadchar_invalid(compiler_common *common)
+{
+/* Slow decoding a UTF-16 character. TMP1 contains the first half
+of the character (>= 0xd800). Return char value in TMP1, length in TMP2. */
+DEFINE_COMPILER;
+struct sljit_jump *exit_invalid[3];
+
+sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
+
+/* TMP2 contains the high surrogate. */
+exit_invalid[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xdc00);
+exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
+
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0));
+OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10);
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xdc00);
+OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x10000);
+exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x400);
+
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(exit_invalid[0]);
+JUMPHERE(exit_invalid[1]);
+JUMPHERE(exit_invalid[2]);
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+}
+
+static void do_utfmoveback_invalid(compiler_common *common)
+{
+/* Goes one character back. */
+DEFINE_COMPILER;
+struct sljit_jump *exit_invalid[3];
+
+sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
+
+exit_invalid[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0x400);
+exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, STR_PTR, 0);
+
+OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
+OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xd800);
+exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0x400);
+
+OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 1);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(exit_invalid[0]);
+JUMPHERE(exit_invalid[1]);
+JUMPHERE(exit_invalid[2]);
+
+OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+}
+
+static void do_utfpeakcharback_invalid(compiler_common *common)
+{
+/* Peak a character back. */
+DEFINE_COMPILER;
+struct sljit_jump *jump;
+struct sljit_jump *exit_invalid[3];
+
+sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
+
+jump = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xe000);
+OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, IN_UCHARS(1));
+exit_invalid[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xdc00);
+exit_invalid[1] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, STR_PTR, 0);
+
+OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000 - 0xdc00);
+OP2(SLJIT_SUB, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xd800);
+exit_invalid[2] = CMP(SLJIT_GREATER_EQUAL, TMP2, 0, SLJIT_IMM, 0x400);
+OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 10);
+OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
+
+JUMPHERE(jump);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+
+JUMPHERE(exit_invalid[0]);
+JUMPHERE(exit_invalid[1]);
+JUMPHERE(exit_invalid[2]);
+
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
+sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
+}
+
+#endif /* PCRE2_CODE_UNIT_WIDTH == 16 */
+
/* UCD_BLOCK_SIZE must be 128 (see the assert below). */
#define UCD_BLOCK_MASK 127
#define UCD_BLOCK_SHIFT 7
@@ -3653,7 +4441,7 @@ struct sljit_jump *jump;
#if defined SLJIT_DEBUG && SLJIT_DEBUG
/* dummy_ucd_record */
-const ucd_record *record = GET_UCD(INVALID_UTF_CHAR);
+const ucd_record *record = GET_UCD(UNASSIGNED_UTF_CHAR);
SLJIT_ASSERT(record->script == ucp_Common && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther);
SLJIT_ASSERT(record->caseset == 0 && record->other_case == 0);
#endif
@@ -3666,7 +4454,7 @@ sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
if (!common->utf)
{
jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, MAX_UTF_CODE_POINT + 1);
- OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, UNASSIGNED_UTF_CHAR);
JUMPHERE(jump);
}
#endif
@@ -3733,7 +4521,7 @@ if ((overall_options & PCRE2_FIRSTLINE) != 0)
mainloop = LABEL();
/* Continual stores does not cause data dependency. */
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr, STR_PTR, 0);
- read_char_range(common, common->nlmin, common->nlmax, TRUE);
+ read_char_range(common, common->nlmin, common->nlmax, NULL, TRUE);
check_newlinechar(common, common->nltype, &newline, TRUE);
CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, mainloop);
JUMPHERE(end);
@@ -5358,12 +6146,12 @@ if (common->nltype == NLTYPE_FIXED && common->newline > 255)
OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0);
OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, str));
firstchar = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0);
-skip_char_back(common);
+move_back(common, NULL, FALSE);
loop = LABEL();
common->ff_newline_shortcut = loop;
-read_char_range(common, common->nlmin, common->nlmax, TRUE);
+read_char_range(common, common->nlmin, common->nlmax, NULL, TRUE);
lastchar = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
if (common->nltype == NLTYPE_ANY || common->nltype == NLTYPE_ANYCRLF)
foundcr = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
@@ -5588,21 +6376,28 @@ static void check_wordboundary(compiler_common *common)
DEFINE_COMPILER;
struct sljit_jump *skipread;
jump_list *skipread_list = NULL;
+jump_list *invalid_utf = NULL;
#if PCRE2_CODE_UNIT_WIDTH != 8 || defined SUPPORT_UNICODE
struct sljit_jump *jump;
-#endif
+#endif /* PCRE2_CODE_UNIT_WIDTH != 8 || SUPPORT_UNICODE */
SLJIT_COMPILE_ASSERT(ctype_word == 0x10, ctype_word_must_be_16);
sljit_emit_fast_enter(compiler, SLJIT_MEM1(SLJIT_SP), LOCALS0);
-/* Get type of the previous char, and put it to LOCALS1. */
+/* Get type of the previous char, and put it to TMP3. */
OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0);
-OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin));
-OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, SLJIT_IMM, 0);
-skipread = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP1, 0);
-skip_char_back(common);
-check_start_used_ptr(common);
-read_char(common);
+OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin));
+OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, 0);
+skipread = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0);
+
+if (common->mode == PCRE2_JIT_COMPLETE)
+ peek_char_back(common, READ_CHAR_MAX, &invalid_utf);
+else
+ {
+ move_back(common, &invalid_utf, FALSE);
+ check_start_used_ptr(common);
+ read_char_range(common, 0, READ_CHAR_MAX, &invalid_utf, TRUE);
+ }
/* Testing char type. */
#ifdef SUPPORT_UNICODE
@@ -5618,23 +6413,22 @@ if (common->use_ucp)
OP2(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ucp_No - ucp_Nd);
OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL);
JUMPHERE(jump);
- OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP2, 0);
+ OP1(SLJIT_MOV, TMP3, 0, TMP2, 0);
}
else
-#endif
+#endif /* SUPPORT_UNICODE */
{
#if PCRE2_CODE_UNIT_WIDTH != 8
jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255);
#elif defined SUPPORT_UNICODE
- /* Here LOCALS1 has already been zeroed. */
+ /* Here TMP3 has already been zeroed. */
jump = NULL;
if (common->utf)
jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255);
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes);
OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 4 /* ctype_word */);
- OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 1);
- OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP1, 0);
+ OP2(SLJIT_AND, TMP3, 0, TMP1, 0, SLJIT_IMM, 1);
#if PCRE2_CODE_UNIT_WIDTH != 8
JUMPHERE(jump);
#elif defined SUPPORT_UNICODE
@@ -5646,7 +6440,7 @@ JUMPHERE(skipread);
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 0);
check_str_end(common, &skipread_list);
-peek_char(common, READ_CHAR_MAX);
+peek_char(common, READ_CHAR_MAX, SLJIT_MEM1(SLJIT_SP), LOCALS1);
/* Testing char type. This is a code duplication. */
#ifdef SUPPORT_UNICODE
@@ -5664,7 +6458,7 @@ if (common->use_ucp)
JUMPHERE(jump);
}
else
-#endif
+#endif /* SUPPORT_UNICODE */
{
#if PCRE2_CODE_UNIT_WIDTH != 8
/* TMP2 may be destroyed by peek_char. */
@@ -5688,8 +6482,23 @@ else
}
set_jumps(skipread_list, LABEL());
-OP2(SLJIT_XOR | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1);
-sljit_emit_fast_return(compiler, SLJIT_MEM1(SLJIT_SP), LOCALS0);
+OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
+OP2(SLJIT_XOR, TMP2, 0, TMP2, 0, TMP3, 0);
+sljit_emit_fast_return(compiler, TMP1, 0);
+
+#ifdef SUPPORT_UNICODE
+if (common->invalid_utf)
+ {
+ SLJIT_ASSERT(invalid_utf != NULL);
+
+ set_jumps(invalid_utf, LABEL());
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
+ OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, -1);
+ sljit_emit_fast_return(compiler, TMP1, 0);
+ return;
+ }
+#endif /* SUPPORT_UNICODE */
+SLJIT_ASSERT(invalid_utf == NULL);
}
static BOOL optimize_class_ranges(compiler_common *common, const sljit_u8 *bits, BOOL nclass, BOOL invert, jump_list **backtracks)
@@ -6237,7 +7046,7 @@ const sljit_u32 *pp;
while (src1 < end1)
{
if (src2 >= end2)
- return (PCRE2_SPTR)1;
+ return (PCRE2_SPTR)0x1;
GETCHARINC(c1, src1);
GETCHARINC(c2, src2);
ur = GET_UCD(c2);
@@ -6254,6 +7063,33 @@ while (src1 < end1)
return src2;
}
+static PCRE2_SPTR SLJIT_FUNC do_utf_caselesscmp_invalid(PCRE2_SPTR src1, PCRE2_SPTR src2, PCRE2_SPTR end1, PCRE2_SPTR end2)
+{
+/* This function would be ineffective to do in JIT level. */
+sljit_u32 c1, c2;
+const ucd_record *ur;
+const sljit_u32 *pp;
+
+while (src1 < end1)
+ {
+ if (src2 >= end2)
+ return (PCRE2_SPTR)0x1;
+ GETCHARINC(c1, src1);
+ GETCHARINC_INVALID(c2, src2, end2, return NULL);
+ ur = GET_UCD(c2);
+ if (c1 != c2 && c1 != c2 + ur->other_case)
+ {
+ pp = PRIV(ucd_caseless_sets) + ur->caseset;
+ for (;;)
+ {
+ if (c1 < *pp) return NULL;
+ if (c1 == *pp++) break;
+ }
+ }
+ }
+return src2;
+}
+
#endif /* SUPPORT_UNICODE */
static PCRE2_SPTR byte_sequence_compare(compiler_common *common, BOOL caseless, PCRE2_SPTR cc,
@@ -6561,7 +7397,7 @@ SLJIT_ASSERT(compares > 0);
/* We are not necessary in utf mode even in 8 bit mode. */
cc = ccbegin;
-read_char_range(common, min, max, (cc[-1] & XCL_NOT) != 0);
+read_char_range(common, min, max, ((cc[-1] & XCL_NOT) != 0) ? backtracks : NULL, (cc[-1] & XCL_NOT) != 0);
if ((cc[-1] & XCL_HASPROP) == 0)
{
@@ -6630,7 +7466,7 @@ if (needstype || needsscript)
if (!common->utf)
{
jump = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, MAX_UTF_CODE_POINT + 1);
- OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, INVALID_UTF_CHAR);
+ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, UNASSIGNED_UTF_CHAR);
JUMPHERE(jump);
}
#endif
@@ -7020,6 +7856,15 @@ switch(type)
case OP_NOT_WORD_BOUNDARY:
case OP_WORD_BOUNDARY:
add_jump(compiler, &common->wordboundary, JUMP(SLJIT_FAST_CALL));
+#ifdef SUPPORT_UNICODE
+ if (common->invalid_utf)
+ {
+ OP2(SLJIT_SUB | SLJIT_SET_Z | SLJIT_SET_SIG_LESS, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0);
+ add_jump(compiler, backtracks, JUMP(SLJIT_SIG_LESS));
+ add_jump(compiler, backtracks, JUMP(type == OP_NOT_WORD_BOUNDARY ? SLJIT_NOT_ZERO : SLJIT_ZERO));
+ return cc;
+ }
+#endif /* SUPPORT_UNICODE */
sljit_set_current_flags(compiler, SLJIT_SET_Z);
add_jump(compiler, backtracks, JUMP(type == OP_NOT_WORD_BOUNDARY ? SLJIT_NOT_ZERO : SLJIT_ZERO));
return cc;
@@ -7079,7 +7924,7 @@ switch(type)
else
{
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, STR_PTR, 0);
- read_char_range(common, common->nlmin, common->nlmax, TRUE);
+ read_char_range(common, common->nlmin, common->nlmax, NULL, TRUE);
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, STR_PTR, 0, STR_END, 0));
add_jump(compiler, &common->anynewline, JUMP(SLJIT_FAST_CALL));
sljit_set_current_flags(compiler, SLJIT_SET_Z);
@@ -7143,7 +7988,7 @@ switch(type)
}
else
{
- peek_char(common, common->nlmax);
+ peek_char(common, common->nlmax, TMP3, 0);
check_newlinechar(common, common->nltype, backtracks, FALSE);
}
JUMPHERE(jump[0]);
@@ -7158,10 +8003,10 @@ switch(type)
return cc;
case OP_CIRCM:
- OP1(SLJIT_MOV, TMP2, 0, ARGUMENTS, 0);
- OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, begin));
- jump[1] = CMP(SLJIT_GREATER, STR_PTR, 0, TMP1, 0);
- OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTBOL);
+ OP1(SLJIT_MOV, TMP1, 0, ARGUMENTS, 0);
+ OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin));
+ jump[1] = CMP(SLJIT_GREATER, STR_PTR, 0, TMP2, 0);
+ OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, options), SLJIT_IMM, PCRE2_NOTBOL);
add_jump(compiler, backtracks, JUMP(SLJIT_NOT_ZERO32));
jump[0] = JUMP(SLJIT_JUMP);
JUMPHERE(jump[1]);
@@ -7171,8 +8016,8 @@ switch(type)
if (common->nltype == NLTYPE_FIXED && common->newline > 255)
{
- OP2(SLJIT_SUB, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
- add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP2, 0, TMP1, 0));
+ OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2));
+ add_jump(compiler, backtracks, CMP(SLJIT_LESS, TMP1, 0, TMP2, 0));
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-2));
OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-1));
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff));
@@ -7180,8 +8025,7 @@ switch(type)
}
else
{
- skip_char_back(common);
- read_char_range(common, common->nlmin, common->nlmax, TRUE);
+ peek_char_back(common, common->nlmax, backtracks);
check_newlinechar(common, common->nltype, backtracks, FALSE);
}
JUMPHERE(jump[0]);
@@ -7195,12 +8039,12 @@ switch(type)
#ifdef SUPPORT_UNICODE
if (common->utf)
{
- OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin));
- OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, length);
+ OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(TMP1), SLJIT_OFFSETOF(jit_arguments, begin));
+ OP1(SLJIT_MOV, TMP3, 0, SLJIT_IMM, length);
label = LABEL();
- add_jump(compiler, backtracks, CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP3, 0));
- skip_char_back(common);
- OP2(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, TMP2, 0, SLJIT_IMM, 1);
+ add_jump(compiler, backtracks, CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, TMP2, 0));
+ move_back(common, backtracks, FALSE);
+ OP2(SLJIT_SUB | SLJIT_SET_Z, TMP3, 0, TMP3, 0, SLJIT_IMM, 1);
JUMPTO(SLJIT_NOT_ZERO, label);
}
else
@@ -7225,21 +8069,28 @@ static PCRE2_SPTR SLJIT_FUNC do_extuni_utf(jit_arguments *args, PCRE2_SPTR cc)
{
PCRE2_SPTR start_subject = args->begin;
PCRE2_SPTR end_subject = args->end;
-int lgb, rgb, len, ricount;
-PCRE2_SPTR prevcc, bptr;
+int lgb, rgb, ricount;
+PCRE2_SPTR prevcc, startcc, bptr;
+BOOL first = TRUE;
uint32_t c;
prevcc = cc;
-GETCHARINC(c, cc);
-lgb = UCD_GRAPHBREAK(c);
-
-while (cc < end_subject)
+startcc = NULL;
+do
{
- len = 1;
- GETCHARLEN(c, cc, len);
+ GETCHARINC(c, cc);
rgb = UCD_GRAPHBREAK(c);
- if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
+ if (first)
+ {
+ lgb = rgb;
+ startcc = cc;
+ first = FALSE;
+ continue;
+ }
+
+ if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0)
+ break;
/* Not breaking between Regional Indicators is allowed only if there
are an even number of preceding RIs. */
@@ -7256,7 +8107,8 @@ while (cc < end_subject)
BACKCHAR(bptr);
GETCHAR(c, bptr);
- if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) break;
+ if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator)
+ break;
ricount++;
}
@@ -7271,14 +8123,80 @@ while (cc < end_subject)
lgb != ucp_gbExtended_Pictographic)
lgb = rgb;
- prevcc = cc;
- cc += len;
+ prevcc = startcc;
+ startcc = cc;
}
+while (cc < end_subject);
-return cc;
+return startcc;
}
-#endif
+static PCRE2_SPTR SLJIT_FUNC do_extuni_utf_invalid(jit_arguments *args, PCRE2_SPTR cc)
+{
+PCRE2_SPTR start_subject = args->begin;
+PCRE2_SPTR end_subject = args->end;
+int lgb, rgb, ricount;
+PCRE2_SPTR prevcc, startcc, bptr;
+BOOL first = TRUE;
+uint32_t c;
+
+prevcc = cc;
+startcc = NULL;
+do
+ {
+ GETCHARINC_INVALID(c, cc, end_subject, break);
+ rgb = UCD_GRAPHBREAK(c);
+
+ if (first)
+ {
+ lgb = rgb;
+ startcc = cc;
+ first = FALSE;
+ continue;
+ }
+
+ if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0)
+ break;
+
+ /* Not breaking between Regional Indicators is allowed only if there
+ are an even number of preceding RIs. */
+
+ if (lgb == ucp_gbRegionalIndicator && rgb == ucp_gbRegionalIndicator)
+ {
+ ricount = 0;
+ bptr = prevcc;
+
+ /* bptr is pointing to the left-hand character */
+ while (bptr > start_subject)
+ {
+ GETCHARBACK_INVALID(c, bptr, start_subject, break);
+
+ if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator)
+ break;
+
+ ricount++;
+ }
+
+ if ((ricount & 1) != 0)
+ break; /* Grapheme break required */
+ }
+
+ /* If Extend or ZWJ follows Extended_Pictographic, do not update lgb; this
+ allows any number of them before a following Extended_Pictographic. */
+
+ if ((rgb != ucp_gbExtend && rgb != ucp_gbZWJ) ||
+ lgb != ucp_gbExtended_Pictographic)
+ lgb = rgb;
+
+ prevcc = startcc;
+ startcc = cc;
+ }
+while (cc < end_subject);
+
+return startcc;
+}
+
+#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
static PCRE2_SPTR SLJIT_FUNC do_extuni_no_utf(jit_arguments *args, PCRE2_SPTR cc)
{
@@ -7289,14 +8207,23 @@ PCRE2_SPTR bptr;
uint32_t c;
GETCHARINC(c, cc);
+#if PCRE2_CODE_UNIT_WIDTH == 32
+if (c >= 0x110000)
+ return NULL;
+#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
lgb = UCD_GRAPHBREAK(c);
while (cc < end_subject)
{
c = *cc;
+#if PCRE2_CODE_UNIT_WIDTH == 32
+ if (c >= 0x110000)
+ break;
+#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
rgb = UCD_GRAPHBREAK(c);
- if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
+ if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0)
+ break;
/* Not breaking between Regional Indicators is allowed only if there
are an even number of preceding RIs. */
@@ -7311,13 +8238,18 @@ while (cc < end_subject)
{
bptr--;
c = *bptr;
+#if PCRE2_CODE_UNIT_WIDTH == 32
+ if (c >= 0x110000)
+ break;
+#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
if (UCD_GRAPHBREAK(c) != ucp_gbRegionalIndicator) break;
ricount++;
}
- if ((ricount & 1) != 0) break; /* Grapheme break required */
+ if ((ricount & 1) != 0)
+ break; /* Grapheme break required */
}
/* If Extend or ZWJ follows Extended_Pictographic, do not update lgb; this
@@ -7333,7 +8265,7 @@ while (cc < end_subject)
return cc;
}
-#endif
+#endif /* SUPPORT_UNICODE */
static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr)
{
@@ -7356,10 +8288,10 @@ switch(type)
detect_partial_match(common, backtracks);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_digit, FALSE))
- read_char7_type(common, type == OP_NOT_DIGIT);
+ read_char7_type(common, backtracks, type == OP_NOT_DIGIT);
else
#endif
- read_char8_type(common, type == OP_NOT_DIGIT);
+ read_char8_type(common, backtracks, type == OP_NOT_DIGIT);
/* Flip the starting bit in the negative case. */
OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ctype_digit);
add_jump(compiler, backtracks, JUMP(type == OP_DIGIT ? SLJIT_ZERO : SLJIT_NOT_ZERO));
@@ -7371,10 +8303,10 @@ switch(type)
detect_partial_match(common, backtracks);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_space, FALSE))
- read_char7_type(common, type == OP_NOT_WHITESPACE);
+ read_char7_type(common, backtracks, type == OP_NOT_WHITESPACE);
else
#endif
- read_char8_type(common, type == OP_NOT_WHITESPACE);
+ read_char8_type(common, backtracks, type == OP_NOT_WHITESPACE);
OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ctype_space);
add_jump(compiler, backtracks, JUMP(type == OP_WHITESPACE ? SLJIT_ZERO : SLJIT_NOT_ZERO));
return cc;
@@ -7385,10 +8317,10 @@ switch(type)
detect_partial_match(common, backtracks);
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_word, FALSE))
- read_char7_type(common, type == OP_NOT_WORDCHAR);
+ read_char7_type(common, backtracks, type == OP_NOT_WORDCHAR);
else
#endif
- read_char8_type(common, type == OP_NOT_WORDCHAR);
+ read_char8_type(common, backtracks, type == OP_NOT_WORDCHAR);
OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, ctype_word);
add_jump(compiler, backtracks, JUMP(type == OP_WORDCHAR ? SLJIT_ZERO : SLJIT_NOT_ZERO));
return cc;
@@ -7396,7 +8328,7 @@ switch(type)
case OP_ANY:
if (check_str_ptr)
detect_partial_match(common, backtracks);
- read_char_range(common, common->nlmin, common->nlmax, TRUE);
+ read_char_range(common, common->nlmin, common->nlmax, backtracks, TRUE);
if (common->nltype == NLTYPE_FIXED && common->newline > 255)
{
jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff);
@@ -7421,6 +8353,12 @@ switch(type)
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (common->utf)
{
+ if (common->invalid_utf)
+ {
+ read_char_range(common, 0, READ_CHAR_MAX, backtracks, TRUE);
+ return cc;
+ }
+
OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0);
OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
#if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16
@@ -7467,7 +8405,7 @@ switch(type)
case OP_ANYNL:
if (check_str_ptr)
detect_partial_match(common, backtracks);
- read_char_range(common, common->bsr_nlmin, common->bsr_nlmax, FALSE);
+ read_char_range(common, common->bsr_nlmin, common->bsr_nlmax, NULL, FALSE);
jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR);
/* We don't need to handle soft partial matching case. */
end_list = NULL;
@@ -7490,7 +8428,7 @@ switch(type)
case OP_HSPACE:
if (check_str_ptr)
detect_partial_match(common, backtracks);
- read_char_range(common, 0x9, 0x3000, type == OP_NOT_HSPACE);
+ read_char_range(common, 0x9, 0x3000, NULL, type == OP_NOT_HSPACE);
add_jump(compiler, &common->hspace, JUMP(SLJIT_FAST_CALL));
sljit_set_current_flags(compiler, SLJIT_SET_Z);
add_jump(compiler, backtracks, JUMP(type == OP_NOT_HSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO));
@@ -7500,7 +8438,7 @@ switch(type)
case OP_VSPACE:
if (check_str_ptr)
detect_partial_match(common, backtracks);
- read_char_range(common, 0xa, 0x2029, type == OP_NOT_VSPACE);
+ read_char_range(common, 0xa, 0x2029, NULL, type == OP_NOT_VSPACE);
add_jump(compiler, &common->vspace, JUMP(SLJIT_FAST_CALL));
sljit_set_current_flags(compiler, SLJIT_SET_Z);
add_jump(compiler, backtracks, JUMP(type == OP_NOT_VSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO));
@@ -7516,9 +8454,12 @@ switch(type)
#if PCRE2_CODE_UNIT_WIDTH != 32
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW), SLJIT_IMM,
- common->utf ? SLJIT_FUNC_OFFSET(do_extuni_utf) : SLJIT_FUNC_OFFSET(do_extuni_no_utf));
+ common->utf ? (common->invalid_utf ? SLJIT_FUNC_OFFSET(do_extuni_utf_invalid) : SLJIT_FUNC_OFFSET(do_extuni_utf)) : SLJIT_FUNC_OFFSET(do_extuni_no_utf));
+ if (common->invalid_utf)
+ add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));
#else
sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(do_extuni_no_utf));
+ add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0));
#endif
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
@@ -7566,12 +8507,12 @@ switch(type)
if (type == OP_CHAR || !char_has_othercase(common, cc))
{
- read_char_range(common, c, c, FALSE);
+ read_char_range(common, c, c, NULL, FALSE);
add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c));
return cc + length;
}
oc = char_othercase(common, c);
- read_char_range(common, c < oc ? c : oc, c > oc ? c : oc, FALSE);
+ read_char_range(common, c < oc ? c : oc, c > oc ? c : oc, NULL, FALSE);
bit = c ^ oc;
if (is_powerof2(bit))
{
@@ -7626,13 +8567,13 @@ switch(type)
if (type == OP_NOT || !char_has_othercase(common, cc))
{
- read_char_range(common, c, c, TRUE);
+ read_char_range(common, c, c, NULL, TRUE);
add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c));
}
else
{
oc = char_othercase(common, c);
- read_char_range(common, c < oc ? c : oc, c > oc ? c : oc, TRUE);
+ read_char_range(common, c < oc ? c : oc, c > oc ? c : oc, NULL, TRUE);
bit = c ^ oc;
if (is_powerof2(bit))
{
@@ -7654,9 +8595,9 @@ switch(type)
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
bit = (common->utf && is_char7_bitset((const sljit_u8 *)cc, type == OP_NCLASS)) ? 127 : 255;
- read_char_range(common, 0, bit, type == OP_NCLASS);
+ read_char_range(common, 0, bit, NULL, type == OP_NCLASS);
#else
- read_char_range(common, 0, 255, type == OP_NCLASS);
+ read_char_range(common, 0, 255, NULL, type == OP_NCLASS);
#endif
if (optimize_class(common, (const sljit_u8 *)cc, type == OP_NCLASS, FALSE, backtracks))
@@ -7869,7 +8810,8 @@ if (common->utf && *cc == OP_REFI)
/* No free saved registers so save data on stack. */
OP1(SLJIT_MOV, SLJIT_R3, 0, STR_END, 0);
- sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW), SLJIT_IMM, SLJIT_FUNC_OFFSET(do_utf_caselesscmp));
+ sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
+ SLJIT_IMM, (common->invalid_utf) ? SLJIT_FUNC_OFFSET(do_utf_caselesscmp_invalid) : SLJIT_FUNC_OFFSET(do_utf_caselesscmp));
OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
if (common->mode == PCRE2_JIT_COMPLETE)
@@ -10787,14 +11729,14 @@ switch(opcode)
if (CURRENT_AS(char_iterator_backtrack)->u.charpos.othercasebit != 0)
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, CURRENT_AS(char_iterator_backtrack)->u.charpos.othercasebit);
CMPTO(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, CURRENT_AS(char_iterator_backtrack)->u.charpos.chr, CURRENT_AS(char_iterator_backtrack)->matchingpath);
- skip_char_back(common);
+ move_back(common, NULL, TRUE);
CMPTO(SLJIT_GREATER, STR_PTR, 0, TMP2, 0, label);
}
else
{
OP1(SLJIT_MOV, STR_PTR, 0, base, offset0);
jump = CMP(SLJIT_LESS_EQUAL, STR_PTR, 0, base, offset1);
- skip_char_back(common);
+ move_back(common, NULL, TRUE);
OP1(SLJIT_MOV, base, offset0, STR_PTR, 0);
JUMPTO(SLJIT_JUMP, CURRENT_AS(char_iterator_backtrack)->matchingpath);
}
@@ -12016,6 +12958,9 @@ sljit_emit_fast_return(compiler, TMP2, 0);
#undef COMPILE_BACKTRACKINGPATH
#undef CURRENT_AS
+#define PUBLIC_JIT_COMPILE_CONFIGURATION_OPTIONS \
+ (PCRE2_JIT_INVALID_UTF)
+
static int jit_compile(pcre2_code *code, sljit_u32 mode)
{
pcre2_real_code *re = (pcre2_real_code *)code;
@@ -12052,6 +12997,11 @@ common->re = re;
common->name_table = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code));
rootbacktrack.cc = common->name_table + re->name_count * re->name_entry_size;
+#ifdef SUPPORT_UNICODE
+common->invalid_utf = (mode & PCRE2_JIT_INVALID_UTF) != 0;
+#endif /* SUPPORT_UNICODE */
+mode &= ~PUBLIC_JIT_COMPILE_CONFIGURATION_OPTIONS;
+
common->start = rootbacktrack.cc;
common->read_only_data_head = NULL;
common->fcc = tables + fcc_offset;
@@ -12117,6 +13067,8 @@ if (common->utf)
common->bsr_nlmax = (CHAR_CR > CHAR_NL) ? CHAR_CR : CHAR_NL;
common->bsr_nlmin = (CHAR_CR < CHAR_NL) ? CHAR_CR : CHAR_NL;
}
+else
+ common->invalid_utf = FALSE;
#endif /* SUPPORT_UNICODE */
ccend = bracketend(common->start);
@@ -12557,17 +13509,34 @@ if (common->utfreadchar != NULL)
set_jumps(common->utfreadchar, LABEL());
do_utfreadchar(common);
}
-if (common->utfreadchar16 != NULL)
- {
- set_jumps(common->utfreadchar16, LABEL());
- do_utfreadchar16(common);
- }
if (common->utfreadtype8 != NULL)
{
set_jumps(common->utfreadtype8, LABEL());
do_utfreadtype8(common);
}
+if (common->utfpeakcharback != NULL)
+ {
+ set_jumps(common->utfpeakcharback, LABEL());
+ do_utfpeakcharback(common);
+ }
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
+#if PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16
+if (common->utfreadchar_invalid != NULL)
+ {
+ set_jumps(common->utfreadchar_invalid, LABEL());
+ do_utfreadchar_invalid(common);
+ }
+if (common->utfmoveback_invalid)
+ {
+ set_jumps(common->utfmoveback_invalid, LABEL());
+ do_utfmoveback_invalid(common);
+ }
+if (common->utfpeakcharback_invalid)
+ {
+ set_jumps(common->utfpeakcharback_invalid, LABEL());
+ do_utfpeakcharback_invalid(common);
+ }
+#endif /* PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16 */
if (common->getucd != NULL)
{
set_jumps(common->getucd, LABEL());
@@ -12644,7 +13613,7 @@ Returns: 0: success or (*NOJIT) was used
*/
#define PUBLIC_JIT_COMPILE_OPTIONS \
- (PCRE2_JIT_COMPLETE|PCRE2_JIT_PARTIAL_SOFT|PCRE2_JIT_PARTIAL_HARD)
+ (PCRE2_JIT_COMPLETE|PCRE2_JIT_PARTIAL_SOFT|PCRE2_JIT_PARTIAL_HARD|PCRE2_JIT_INVALID_UTF)
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_jit_compile(pcre2_code *code, uint32_t options)
@@ -12659,6 +13628,7 @@ return PCRE2_ERROR_JIT_BADOPTION;
pcre2_real_code *re = (pcre2_real_code *)code;
executable_functions *functions;
+uint32_t excluded_options;
int result;
if (code == NULL)
@@ -12673,21 +13643,24 @@ functions = (executable_functions *)re->executable_jit;
if ((options & PCRE2_JIT_COMPLETE) != 0 && (functions == NULL
|| functions->executable_funcs[0] == NULL)) {
- result = jit_compile(code, PCRE2_JIT_COMPLETE);
+ excluded_options = (PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_PARTIAL_HARD);
+ result = jit_compile(code, options & ~excluded_options);
if (result != 0)
return result;
}
if ((options & PCRE2_JIT_PARTIAL_SOFT) != 0 && (functions == NULL
|| functions->executable_funcs[1] == NULL)) {
- result = jit_compile(code, PCRE2_JIT_PARTIAL_SOFT);
+ excluded_options = (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_HARD);
+ result = jit_compile(code, options & ~excluded_options);
if (result != 0)
return result;
}
if ((options & PCRE2_JIT_PARTIAL_HARD) != 0 && (functions == NULL
|| functions->executable_funcs[2] == NULL)) {
- result = jit_compile(code, PCRE2_JIT_PARTIAL_HARD);
+ excluded_options = (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT);
+ result = jit_compile(code, options & ~excluded_options);
if (result != 0)
return result;
}
diff --git a/src/pcre2_jit_test.c b/src/pcre2_jit_test.c
index a28e9a0..78fdcca 100644
--- a/src/pcre2_jit_test.c
+++ b/src/pcre2_jit_test.c
@@ -93,6 +93,8 @@ POSSIBILITY OF SUCH DAMAGE.
*/
static int regression_tests(void);
+static int invalid_utf8_regression_tests(void);
+static int invalid_utf16_regression_tests(void);
int main(void)
{
@@ -108,7 +110,9 @@ int main(void)
printf("JIT must be enabled to run pcre_jit_test\n");
return 1;
}
- return regression_tests();
+ return regression_tests()
+ || invalid_utf8_regression_tests()
+ || invalid_utf16_regression_tests();
}
/* --------------------------------------------------------------------------------------- */
@@ -184,7 +188,7 @@ static struct regression_test_case regression_test_cases[] = {
{ CM, A, 0, 0, "\\Ca", "CDA" },
{ M, A, 0, 0 | F_NOMATCH, "\\Cx", "cda" },
{ CM, A, 0, 0 | F_NOMATCH, "\\Cx", "CDA" },
-#endif
+#endif /* !NEVER_BACKSLASH_C */
{ CMUP, A, 0, 0, "\xf0\x90\x90\x80\xf0\x90\x90\xa8", "\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
{ CMUP, A, 0, 0, "\xf0\x90\x90\x80{2}", "\xf0\x90\x90\x80#\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
{ CMUP, A, 0, 0, "\xf0\x90\x90\xa8{2}", "\xf0\x90\x90\x80#\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
@@ -262,6 +266,9 @@ static struct regression_test_case regression_test_cases[] = {
{ MU, A, 0, 0, "\xc6\x82\xc6\x82|\xc7\x83\xc7\x83|\xc8\x84\xc8\x84", "\xf1\x83\x82\x82\xc8\x84\xc8\x84" },
{ U, A, 0, 0, "\xe1\x81\x80|\xe2\x82\x80|\xe4\x84\x80", "\xdf\xbf\xc2\x80\xe4\x84\x80" },
{ U, A, 0, 0, "(?:\xe1\x81\x80|\xe2\x82\x80|\xe4\x84\x80)#", "\xdf\xbf\xc2\x80#\xe4\x84\x80#" },
+ { CM, A, 0, 0, "ab|cd", "CD" },
+ { CM, A, 0, 0, "a1277|a1377|bX487", "bx487" },
+ { CM, A, 0, 0, "a1277|a1377|bx487", "bX487" },
/* Greedy and non-greedy ? operators. */
{ MU, A, 0, 0, "(?:a)?a", "laab" },
@@ -1163,7 +1170,7 @@ static int regression_tests(void)
#elif defined SUPPORT_PCRE2_32
PCRE2_UCHAR32 cpu_info[128];
#endif
-#if defined SUPPORT_UTF && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2)
+#if defined SUPPORT_UNICODE && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2)
int return_value;
#endif
@@ -1331,9 +1338,8 @@ static int regression_tests(void)
ovector8_2[i] = -2;
}
if (re8) {
- (void)pcre2_set_match_limit_8(mcontext8, 10000000);
return_value8[1] = pcre2_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
- current->start_offset & OFFSET_MASK, current->match_options, mdata8_2, mcontext8);
+ current->start_offset & OFFSET_MASK, current->match_options, mdata8_2, NULL);
if (pcre2_jit_compile_8(re8, jit_compile_mode)) {
printf("\n8 bit: JIT compiler does not support \"%s\"\n", current->pattern);
@@ -1376,9 +1382,8 @@ static int regression_tests(void)
else
length16 = copy_char8_to_char16((PCRE2_SPTR8)current->input, regtest_buf16, REGTEST_MAX_LENGTH16);
- (void)pcre2_set_match_limit_16(mcontext16, 10000000);
return_value16[1] = pcre2_match_16(re16, regtest_buf16, length16,
- current->start_offset & OFFSET_MASK, current->match_options, mdata16_2, mcontext16);
+ current->start_offset & OFFSET_MASK, current->match_options, mdata16_2, NULL);
if (pcre2_jit_compile_16(re16, jit_compile_mode)) {
printf("\n16 bit: JIT compiler does not support \"%s\"\n", current->pattern);
@@ -1421,9 +1426,8 @@ static int regression_tests(void)
else
length32 = copy_char8_to_char32((PCRE2_SPTR8)current->input, regtest_buf32, REGTEST_MAX_LENGTH32);
- (void)pcre2_set_match_limit_32(mcontext32, 10000000);
return_value32[1] = pcre2_match_32(re32, regtest_buf32, length32,
- current->start_offset & OFFSET_MASK, current->match_options, mdata32_2, mcontext32);
+ current->start_offset & OFFSET_MASK, current->match_options, mdata32_2, NULL);
if (pcre2_jit_compile_32(re32, jit_compile_mode)) {
printf("\n32 bit: JIT compiler does not support \"%s\"\n", current->pattern);
@@ -1451,7 +1455,7 @@ static int regression_tests(void)
is_successful = 1;
if (!(current->start_offset & F_DIFF)) {
-#if defined SUPPORT_UTF && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2)
+#if defined SUPPORT_UNICODE && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2)
if (!(current->start_offset & F_FORCECONV)) {
/* All results must be the same. */
@@ -1500,8 +1504,8 @@ static int regression_tests(void)
is_successful = 0;
} else
#endif
- if (return_value >= 0 || return_value == PCRE_ERROR_PARTIAL) {
- if (return_value == PCRE_ERROR_PARTIAL) {
+ if (return_value >= 0 || return_value == PCRE2_ERROR_PARTIAL) {
+ if (return_value == PCRE2_ERROR_PARTIAL) {
return_value = 2;
} else {
return_value *= 2;
@@ -1516,20 +1520,20 @@ static int regression_tests(void)
return_value32[0] = return_value;
#endif
/* Transform back the results. */
- if (current->flags & PCRE_UTF8) {
+ if (current->compile_options & PCRE2_UTF) {
#ifdef SUPPORT_PCRE2_16
for (i = 0; i < return_value; ++i) {
- if (ovector16_1[i] >= 0)
+ if (ovector16_1[i] != PCRE2_UNSET)
ovector16_1[i] = regtest_offsetmap16[ovector16_1[i]];
- if (ovector16_2[i] >= 0)
+ if (ovector16_2[i] != PCRE2_UNSET)
ovector16_2[i] = regtest_offsetmap16[ovector16_2[i]];
}
#endif
#ifdef SUPPORT_PCRE2_32
for (i = 0; i < return_value; ++i) {
- if (ovector32_1[i] >= 0)
+ if (ovector32_1[i] != PCRE2_UNSET)
ovector32_1[i] = regtest_offsetmap32[ovector32_1[i]];
- if (ovector32_2[i] >= 0)
+ if (ovector32_2[i] != PCRE2_UNSET)
ovector32_2[i] = regtest_offsetmap32[ovector32_2[i]];
}
#endif
@@ -1539,7 +1543,7 @@ static int regression_tests(void)
#if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_16
if (ovector8_1[i] != ovector8_2[i] || ovector8_1[i] != ovector16_1[i] || ovector8_1[i] != ovector16_2[i]) {
printf("\n8 and 16 bit: Ovector[%d] value differs(J8:%d,I8:%d,J16:%d,I16:%d): [%d] '%s' @ '%s' \n",
- i, ovector8_1[i], ovector8_2[i], ovector16_1[i], ovector16_2[i],
+ i, (int)ovector8_1[i], (int)ovector8_2[i], (int)ovector16_1[i], (int)ovector16_2[i],
total, current->pattern, current->input);
is_successful = 0;
}
@@ -1547,7 +1551,7 @@ static int regression_tests(void)
#if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_32
if (ovector8_1[i] != ovector8_2[i] || ovector8_1[i] != ovector32_1[i] || ovector8_1[i] != ovector32_2[i]) {
printf("\n8 and 32 bit: Ovector[%d] value differs(J8:%d,I8:%d,J32:%d,I32:%d): [%d] '%s' @ '%s' \n",
- i, ovector8_1[i], ovector8_2[i], ovector32_1[i], ovector32_2[i],
+ i, (int)ovector8_1[i], (int)ovector8_2[i], (int)ovector32_1[i], (int)ovector32_2[i],
total, current->pattern, current->input);
is_successful = 0;
}
@@ -1555,7 +1559,7 @@ static int regression_tests(void)
#if defined SUPPORT_PCRE2_16 && defined SUPPORT_PCRE2_32
if (ovector16_1[i] != ovector16_2[i] || ovector16_1[i] != ovector32_1[i] || ovector16_1[i] != ovector32_2[i]) {
printf("\n16 and 32 bit: Ovector[%d] value differs(J16:%d,I16:%d,J32:%d,I32:%d): [%d] '%s' @ '%s' \n",
- i, ovector16_1[i], ovector16_2[i], ovector32_1[i], ovector32_2[i],
+ i, (int)ovector16_1[i], (int)ovector16_2[i], (int)ovector32_1[i], (int)ovector32_2[i],
total, current->pattern, current->input);
is_successful = 0;
}
@@ -1751,4 +1755,426 @@ static int regression_tests(void)
}
}
+#if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_8
+
+#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
+#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
+#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF)
+
+struct invalid_utf8_regression_test_case {
+ int compile_options;
+ int jit_compile_options;
+ int start_offset;
+ int skip_left;
+ int skip_right;
+ int expected_result;
+ const char *pattern[2];
+ const char *input;
+};
+
+static struct invalid_utf8_regression_test_case invalid_utf8_regression_test_cases[] = {
+ { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
+ { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xf0\x90\x80\x80" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf4\x90\x80\x80" },
+ { UDA, CI, 0, 0, 1, -1, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf0\x90\x80\x7f" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf0\x90\x80\xc0" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf0\x8f\xbf\xbf" },
+ { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xef\xbf\xbf#" },
+ { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xef\xbf\xbf" },
+ { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xe0\xa0\x80#" },
+ { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xe0\xa0\x80" },
+ { UDA, CI, 0, 0, 2, -1, { ".", NULL }, "\xef\xbf\xbf#" },
+ { UDA, CI, 0, 0, 1, -1, { ".", NULL }, "\xef\xbf\xbf" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xef\xbf\x7f#" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xef\xbf\xc0" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x9f\xbf#" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x9f\xbf" },
+ { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xed\x9f\xbf#" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xed\xa0\x80#" },
+ { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xee\x80\x80#" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xed\xbf\xbf#" },
+ { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xdf\xbf##" },
+ { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xdf\xbf#" },
+ { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xdf\xbf" },
+ { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xc2\x80##" },
+ { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xc2\x80#" },
+ { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\xc2\x80" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x80##" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xdf\xc0##" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xe0\x80" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xdf\xc0" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xc1\xbf##" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xc1\xbf" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\x80###" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\x80" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf8###" },
+ { UDA, CI, 0, 0, 0, -1, { ".", NULL }, "\xf8" },
+ { UDA, CI, 0, 0, 0, 1, { ".", NULL }, "\x7f" },
+
+ { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "\xf4\x8f\xbf\xbf#" },
+ { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\xa0\x80\x80#" },
+ { UDA, CPI, 4, 1, 0, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xbf#" },
+ { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "#\xef\xbf\xbf#" },
+ { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "#\xe0\xa0\x80#" },
+ { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "\xf0\x90\x80\x80#" },
+ { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "\xf3\xbf\xbf\xbf#" },
+ { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf0\x8f\xbf\xbf#" },
+ { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf5\x80\x80\x80#" },
+ { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\x90\x80\x80#" },
+ { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xff#" },
+ { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xf4\x8f\xff\xbf#" },
+ { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\xef\x80\x80\x80#" },
+ { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "\x80\x80\x80\x80#" },
+ { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "#\xe0\x9f\xbf#" },
+ { UDA, CPI, 4, 2, 0, -1, { "\\B", "\\b" }, "#\xe0\xa0\x80#" },
+ { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "#\xf0\x80\x80#" },
+ { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "#\xed\xa0\x80#" },
+ { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "##\xdf\xbf#" },
+ { UDA, CPI, 4, 2, 0, 1, { "\\B", NULL }, "##\xdf\xbf#" },
+ { UDA, CPI, 4, 0, 0, 1, { "\\B", NULL }, "##\xc2\x80#" },
+ { UDA, CPI, 4, 2, 0, 1, { "\\B", NULL }, "##\xc2\x80#" },
+ { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "##\xc1\xbf#" },
+ { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "##\xdf\xc0#" },
+ { UDA, CPI, 4, 0, 0, -1, { "\\B", "\\b" }, "##\xe0\x80#" },
+ { UDA, CPI, 4, 2, 0, -1, { "\\B", "\\b" }, "##\xe0\x80#" },
+
+ { UDA, CPI, 3, 0, 0, 1, { "\\B", NULL }, "\xef\xbf\xbf#" },
+ { UDA, CPI, 3, 0, 0, 1, { "\\B", NULL }, "\xe0\xa0\x80#" },
+ { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xe0\x9f\xbf#" },
+ { UDA, CPI, 3, 1, 0, -1, { "\\B", "\\b" }, "\xef\xbf\xbf#" },
+ { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xdf\x80\x80#" },
+ { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xef\xbf\xff#" },
+ { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xef\xff\xbf#" },
+ { UDA, CPI, 3, 0, 0, -1, { "\\B", "\\b" }, "\xed\xbf\xbf#" },
+
+ { UDA, CPI, 2, 0, 0, 1, { "\\B", NULL }, "\xdf\xbf#" },
+ { UDA, CPI, 2, 0, 0, 1, { "\\B", NULL }, "\xc2\x80#" },
+ { UDA, CPI, 2, 1, 0, -1, { "\\B", "\\b" }, "\xdf\xbf#" },
+ { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xc1\xbf#" },
+ { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xe0\x80#" },
+ { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xdf\xff#" },
+ { UDA, CPI, 2, 0, 0, -1, { "\\B", "\\b" }, "\xff\xbf#" },
+
+ { UDA, CPI, 1, 0, 0, 1, { "\\B", NULL }, "\x7f#" },
+ { UDA, CPI, 1, 0, 0, 1, { "\\B", NULL }, "\x01#" },
+ { UDA, CPI, 1, 0, 0, -1, { "\\B", "\\b" }, "\x80#" },
+ { UDA, CPI, 1, 0, 0, -1, { "\\B", "\\b" }, "\x80#" },
+
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "aA" },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, { "(.)\\1", NULL }, "a\xff" },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, { "(.)\\1", NULL }, "\xc2\x80\x80" },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
+
+ { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "A" },
+ { UDA, CPI, 0, 0, 0, -1, { "\\X", NULL }, "\xff" },
+ { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "\xc3\xa1" },
+ { UDA, CPI, 0, 0, 1, -1, { "\\X", NULL }, "\xc3\xa1" },
+ { UDA, CPI, 0, 0, 0, -1, { "\\X", NULL }, "\xc3\x7f" },
+ { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "\xe1\xbd\xb8" },
+ { UDA, CPI, 0, 0, 1, -1, { "\\X", NULL }, "\xe1\xbd\xb8" },
+ { UDA, CPI, 0, 0, 0, 1, { "\\X", NULL }, "\xf0\x90\x90\x80" },
+ { UDA, CPI, 0, 0, 1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" },
+
+ { 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
+};
+
+#undef UDA
+#undef CI
+#undef CPI
+
+static int run_invalid_utf8_test(struct invalid_utf8_regression_test_case *current,
+ int pattern_index, int i, pcre2_match_data_8 *mdata)
+{
+ pcre2_code_8 *code;
+ int result, errorcode;
+ PCRE2_SIZE length, erroroffset;
+
+ if (current->pattern[i] == NULL)
+ return 1;
+
+ code = pcre2_compile_8((PCRE2_UCHAR8*)current->pattern[i], PCRE2_ZERO_TERMINATED,
+ current->compile_options, &errorcode, &erroroffset, NULL);
+
+ if (!code) {
+ printf("Pattern[%d:0] cannot be compiled. Error offset: %d\n", pattern_index, (int)erroroffset);
+ return 0;
+ }
+
+ if (pcre2_jit_compile_8(code, current->jit_compile_options) != 0) {
+ printf("Pattern[%d:0] cannot be compiled by the JIT compiler.\n", pattern_index);
+ pcre2_code_free_8(code);
+ return 0;
+ }
+
+ length = (PCRE2_SIZE)(strlen(current->input) - current->skip_left - current->skip_right);
+
+ if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
+ result = pcre2_jit_match_8(code, (PCRE2_UCHAR8*)(current->input + current->skip_left),
+ length, current->start_offset - current->skip_left, 0, mdata, NULL);
+
+ if (result != current->expected_result) {
+ printf("Pattern[%d:0] match result %d differs from expected %d.\n", pattern_index, result, current->expected_result);
+ pcre2_code_free_8(code);
+ return 0;
+ }
+ }
+
+ if (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT) {
+ result = pcre2_jit_match_8(code, (PCRE2_UCHAR8*)(current->input + current->skip_left),
+ length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);
+
+ if (result != current->expected_result) {
+ printf("Pattern[%d:0] partial match result %d differs from expected %d.\n", pattern_index, result, current->expected_result);
+ pcre2_code_free_8(code);
+ return 0;
+ }
+ }
+
+ pcre2_code_free_8(code);
+ return 1;
+}
+
+static int invalid_utf8_regression_tests(void)
+{
+ struct invalid_utf8_regression_test_case *current;
+ pcre2_match_data_8 *mdata;
+ int total = 0, successful = 0;
+ int result;
+
+ printf("\nRunning invalid-utf8 JIT regression tests\n");
+
+ mdata = pcre2_match_data_create_8(4, NULL);
+
+ for (current = invalid_utf8_regression_test_cases; current->pattern[0]; current++) {
+ /* printf("\nPattern: %s :\n", current->pattern); */
+ total++;
+
+ result = 1;
+ if (!run_invalid_utf8_test(current, total - 1, 0, mdata))
+ result = 0;
+ if (!run_invalid_utf8_test(current, total - 1, 1, mdata))
+ result = 0;
+
+ if (result) {
+ successful++;
+ }
+
+ printf(".");
+ if ((total % 60) == 0)
+ printf("\n");
+ }
+
+ if ((total % 60) != 0)
+ printf("\n");
+
+ pcre2_match_data_free_8(mdata);
+
+ if (total == successful) {
+ printf("\nAll invalid UTF8 JIT regression tests are successfully passed.\n");
+ return 0;
+ } else {
+ printf("\nInvalid UTF8 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
+ return 1;
+ }
+}
+
+#else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_8 */
+
+static int invalid_utf8_regression_tests(void)
+{
+ return 0;
+}
+
+#endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_8 */
+
+#if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_16
+
+#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
+#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
+#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF)
+
+struct invalid_utf16_regression_test_case {
+ int compile_options;
+ int jit_compile_options;
+ int start_offset;
+ int skip_left;
+ int skip_right;
+ int expected_result;
+ const PCRE2_UCHAR16 *pattern[2];
+ const PCRE2_UCHAR16 *input;
+};
+
+static PCRE2_UCHAR16 allany[] = { '.', 0 };
+static PCRE2_UCHAR16 non_word_boundary[] = { '\\', 'B', 0 };
+static PCRE2_UCHAR16 word_boundary[] = { '\\', 'b', 0 };
+static PCRE2_UCHAR16 backreference[] = { '(', '.', ')', '\\', '1', 0 };
+static PCRE2_UCHAR16 grapheme[] = { '\\', 'X', 0 };
+static PCRE2_UCHAR16 test1[] = { 0xd7ff, 0xe000, 0xffff, 0x01, '#', 0 };
+static PCRE2_UCHAR16 test2[] = { 0xd800, 0xdc00, '#', 0 };
+static PCRE2_UCHAR16 test3[] = { 0xdbff, 0xdfff, '#', 0 };
+static PCRE2_UCHAR16 test4[] = { 0xd800, 0xdbff, '#', 0 };
+static PCRE2_UCHAR16 test5[] = { '#', 0xd800, '#', 0 };
+static PCRE2_UCHAR16 test6[] = { 'a', 'A', 0xdc28, 0 };
+static PCRE2_UCHAR16 test7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 };
+
+static struct invalid_utf16_regression_test_case invalid_utf16_regression_test_cases[] = {
+ { UDA, CI, 0, 0, 0, 1, { allany, NULL }, test1 },
+ { UDA, CI, 1, 0, 0, 1, { allany, NULL }, test1 },
+ { UDA, CI, 2, 0, 0, 1, { allany, NULL }, test1 },
+ { UDA, CI, 3, 0, 0, 1, { allany, NULL }, test1 },
+ { UDA, CI, 0, 0, 0, 1, { allany, NULL }, test2 },
+ { UDA, CI, 0, 0, 2, -1, { allany, NULL }, test2 },
+ { UDA, CI, 1, 0, 0, -1, { allany, NULL }, test2 },
+ { UDA, CI, 0, 0, 0, 1, { allany, NULL }, test3 },
+ { UDA, CI, 0, 0, 2, -1, { allany, NULL }, test3 },
+ { UDA, CI, 1, 0, 0, -1, { allany, NULL }, test3 },
+
+ { UDA, CPI, 1, 0, 0, 1, { non_word_boundary, NULL }, test1 },
+ { UDA, CPI, 2, 0, 0, 1, { non_word_boundary, NULL }, test1 },
+ { UDA, CPI, 3, 0, 0, 1, { non_word_boundary, NULL }, test1 },
+ { UDA, CPI, 4, 0, 0, 1, { non_word_boundary, NULL }, test1 },
+ { UDA, CPI, 2, 0, 0, 1, { non_word_boundary, NULL }, test2 },
+ { UDA, CPI, 2, 0, 0, 1, { non_word_boundary, NULL }, test3 },
+ { UDA, CPI, 2, 1, 0, -1, { non_word_boundary, word_boundary }, test2 },
+ { UDA, CPI, 2, 1, 0, -1, { non_word_boundary, word_boundary }, test3 },
+ { UDA, CPI, 2, 0, 0, -1, { non_word_boundary, word_boundary }, test4 },
+ { UDA, CPI, 2, 0, 0, -1, { non_word_boundary, word_boundary }, test5 },
+
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { backreference, NULL }, test6 },
+ { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, { backreference, NULL }, test6 },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 2, { backreference, NULL }, test7 },
+ { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, { backreference, NULL }, test7 },
+
+ { UDA, CPI, 0, 0, 0, 1, { grapheme, NULL }, test6 },
+ { UDA, CPI, 1, 0, 0, 1, { grapheme, NULL }, test6 },
+ { UDA, CPI, 2, 0, 0, -1, { grapheme, NULL }, test6 },
+ { UDA, CPI, 0, 0, 0, 1, { grapheme, NULL }, test7 },
+ { UDA, CPI, 2, 0, 0, 1, { grapheme, NULL }, test7 },
+ { UDA, CPI, 1, 0, 0, -1, { grapheme, NULL }, test7 },
+
+ { 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
+};
+
+#undef UDA
+#undef CI
+#undef CPI
+
+static int run_invalid_utf16_test(struct invalid_utf16_regression_test_case *current,
+ int pattern_index, int i, pcre2_match_data_16 *mdata)
+{
+ pcre2_code_16 *code;
+ int result, errorcode;
+ PCRE2_SIZE length, erroroffset;
+ const PCRE2_UCHAR16 *input;
+
+ if (current->pattern[i] == NULL)
+ return 1;
+
+ code = pcre2_compile_16(current->pattern[i], PCRE2_ZERO_TERMINATED,
+ current->compile_options, &errorcode, &erroroffset, NULL);
+
+ if (!code) {
+ printf("Pattern[%d:0] cannot be compiled. Error offset: %d\n", pattern_index, (int)erroroffset);
+ return 0;
+ }
+
+ if (pcre2_jit_compile_16(code, current->jit_compile_options) != 0) {
+ printf("Pattern[%d:0] cannot be compiled by the JIT compiler.\n", pattern_index);
+ pcre2_code_free_16(code);
+ return 0;
+ }
+
+ input = current->input;
+ length = 0;
+
+ while (*input++ != 0)
+ length++;
+
+ length -= current->skip_left + current->skip_right;
+
+ if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
+ result = pcre2_jit_match_16(code, (current->input + current->skip_left),
+ length, current->start_offset - current->skip_left, 0, mdata, NULL);
+
+ if (result != current->expected_result) {
+ printf("Pattern[%d:0] match result %d differs from expected %d.\n", pattern_index, result, current->expected_result);
+ pcre2_code_free_16(code);
+ return 0;
+ }
+ }
+
+ if (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT) {
+ result = pcre2_jit_match_16(code, (current->input + current->skip_left),
+ length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);
+
+ if (result != current->expected_result) {
+ printf("Pattern[%d:0] partial match result %d differs from expected %d.\n", pattern_index, result, current->expected_result);
+ pcre2_code_free_16(code);
+ return 0;
+ }
+ }
+
+ pcre2_code_free_16(code);
+ return 1;
+}
+
+static int invalid_utf16_regression_tests(void)
+{
+ struct invalid_utf16_regression_test_case *current;
+ pcre2_match_data_16 *mdata;
+ int total = 0, successful = 0;
+ int result;
+
+ printf("\nRunning invalid-utf16 JIT regression tests\n");
+
+ mdata = pcre2_match_data_create_16(4, NULL);
+
+ for (current = invalid_utf16_regression_test_cases; current->pattern[0]; current++) {
+ /* printf("\nPattern: %s :\n", current->pattern); */
+ total++;
+
+ result = 1;
+ if (!run_invalid_utf16_test(current, total - 1, 0, mdata))
+ result = 0;
+ if (!run_invalid_utf16_test(current, total - 1, 1, mdata))
+ result = 0;
+
+ if (result) {
+ successful++;
+ }
+
+ printf(".");
+ if ((total % 60) == 0)
+ printf("\n");
+ }
+
+ if ((total % 60) != 0)
+ printf("\n");
+
+ pcre2_match_data_free_16(mdata);
+
+ if (total == successful) {
+ printf("\nAll invalid UTF16 JIT regression tests are successfully passed.\n");
+ return 0;
+ } else {
+ printf("\nInvalid UTF16 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
+ return 1;
+ }
+}
+
+#else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_16 */
+
+static int invalid_utf16_regression_tests(void)
+{
+ return 0;
+}
+
+#endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_16 */
+
/* End of pcre2_jit_test.c */