diff options
Diffstat (limited to 'pcre_jit_compile.c')
-rw-r--r-- | pcre_jit_compile.c | 373 |
1 files changed, 251 insertions, 122 deletions
diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c index 16611f1..03c7b2c 100644 --- a/pcre_jit_compile.c +++ b/pcre_jit_compile.c @@ -302,9 +302,11 @@ typedef struct compiler_common { #ifdef SUPPORT_UCP BOOL useucp; #endif - jump_list *utf8readchar; - jump_list *utf8readtype8; + jump_list *utfreadchar; +#ifdef COMPILE_PCRE8 + jump_list *utfreadtype8; #endif +#endif /* SUPPORT_UTF8 */ #ifdef SUPPORT_UCP jump_list *getucd; #endif @@ -543,8 +545,8 @@ switch(*cc) case OP_NOTPOSPLUSI: case OP_NOTPOSQUERYI: cc += 2; -#ifdef SUPPORT_UTF8 - if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; +#ifdef SUPPORT_UTF + if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif return cc; @@ -565,8 +567,8 @@ switch(*cc) case OP_NOTEXACTI: case OP_NOTPOSUPTOI: cc += 2 + IMM2_SIZE; -#ifdef SUPPORT_UTF8 - if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; +#ifdef SUPPORT_UTF + if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif return cc; @@ -1285,7 +1287,7 @@ return MAX_255(c) ? common->fcc[c] != c : FALSE; static SLJIT_INLINE unsigned int char_othercase(compiler_common *common, unsigned int c) { /* Returns with the othercase. */ -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (common->utf && c > 127) { #ifdef SUPPORT_UCP @@ -1302,11 +1304,11 @@ static unsigned int char_get_othercase_bit(compiler_common *common, pcre_uchar* { /* Detects if the character and its othercase has only 1 bit difference. */ unsigned int c, oc, bit; -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF8 && defined COMPILE_PCRE8 int n; #endif -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (common->utf) { GETCHAR(c, cc); @@ -1324,11 +1326,11 @@ if (common->utf) else { c = *cc; - oc = common->fcc[c]; + oc = TABLE_GET(c, common->fcc, c); } #else c = *cc; -oc = common->fcc[c]; +oc = TABLE_GET(c, common->fcc, c); #endif SLJIT_ASSERT(c != oc); @@ -1342,10 +1344,12 @@ if (c <= 127 && bit == 0x20) if (!ispowerof2(bit)) return 0; +#ifdef COMPILE_PCRE8 + #ifdef SUPPORT_UTF8 if (common->utf && c > 127) { - n = PRIV(utf8_table4)[*cc & 0x3f]; + n = GET_EXTRALEN(*cc); while ((bit & 0x3f) == 0) { n--; @@ -1353,8 +1357,25 @@ if (common->utf && c > 127) } return (n << 8) | bit; } -#endif +#endif /* SUPPORT_UTF8 */ return (0 << 8) | bit; + +#else /* COMPILE_PCRE8 */ + +#ifdef COMPILE_PCRE16 +#ifdef SUPPORT_UTF16 +if (common->utf && c > 65535) + { + if (bit >= (1 << 10)) + bit >>= 10; + else + return (bit <= 255) ? ((2 << 8) | bit) : ((3 << 8) | (bit >> 8)); + } +#endif /* SUPPORT_UTF16 */ +return (bit <= 255) ? ((0 << 8) | bit) : ((1 << 8) | (bit >> 8)); +#endif /* COMPILE_PCRE16 */ + +#endif /* COMPILE_PCRE8 */ } static SLJIT_INLINE void check_input_end(compiler_common *common, jump_list **fallbacks) @@ -1368,16 +1389,22 @@ static void read_char(compiler_common *common) /* Reads the character into TMP1, updates STR_PTR. Does not check STR_END. TMP2 Destroyed. */ DEFINE_COMPILER; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF struct sljit_jump *jump; #endif OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (common->utf) { +#ifdef COMPILE_PCRE8 jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); - add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL)); +#else +#ifdef COMPILE_PCRE16 + jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800); +#endif +#endif /* COMPILE_PCRE8 */ + add_jump(compiler, &common->utfreadchar, JUMP(SLJIT_FAST_CALL)); JUMPHERE(jump); } #endif @@ -1389,16 +1416,22 @@ static void peek_char(compiler_common *common) /* Reads the character into TMP1, keeps STR_PTR. Does not check STR_END. TMP2 Destroyed. */ DEFINE_COMPILER; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF struct sljit_jump *jump; #endif OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (common->utf) { +#ifdef COMPILE_PCRE8 jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); - add_jump(compiler, &common->utf8readchar, JUMP(SLJIT_FAST_CALL)); +#else +#ifdef COMPILE_PCRE16 + jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800); +#endif +#endif /* COMPILE_PCRE8 */ + add_jump(compiler, &common->utfreadchar, JUMP(SLJIT_FAST_CALL)); OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0); JUMPHERE(jump); } @@ -1409,46 +1442,83 @@ static void read_char8_type(compiler_common *common) { /* Reads the character type into TMP1, updates STR_PTR. Does not check STR_END. */ DEFINE_COMPILER; -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF || defined COMPILE_PCRE16 struct sljit_jump *jump; #endif -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (common->utf) { - OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 0); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); + OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); +#ifdef COMPILE_PCRE8 /* This can be an extra read in some situations, but hopefully - it is a clever early read in most cases. */ + it is needed in most cases. */ OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); jump = CMP(SLJIT_C_LESS, TMP2, 0, SLJIT_IMM, 0xc0); - add_jump(compiler, &common->utf8readtype8, JUMP(SLJIT_FAST_CALL)); + add_jump(compiler, &common->utfreadtype8, JUMP(SLJIT_FAST_CALL)); JUMPHERE(jump); +#else +#ifdef COMPILE_PCRE16 + OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); + jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff); + OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); + JUMPHERE(jump); + /* Skip low surrogate if necessary. */ + OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xfc00); + OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0xd800); + COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_EQUAL); + OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0); +#endif +#endif /* COMPILE_PCRE8 */ return; } #endif -OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), common->ctypes); +#ifdef COMPILE_PCRE16 +/* The ctypes array contains only 255 values. */ +OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); +jump = CMP(SLJIT_C_GREATER, TMP2, 0, SLJIT_IMM, 0xff); +#endif +OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP2), common->ctypes); +#ifdef COMPILE_PCRE16 +JUMPHERE(jump); +#endif } static void skip_char_back(compiler_common *common) { /* Goes one character back. Only affects STR_PTR. Does not check begin. */ DEFINE_COMPILER; -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 struct sljit_label *label; if (common->utf) { label = LABEL(); - OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1)); + OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xc0); CMPTO(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, 0x80, label); return; } #endif +#if defined SUPPORT_UTF && defined COMPILE_PCRE16 +if (common->utf) + { + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -IN_UCHARS(1)); + OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); + /* Skip low surrogate if necessary. */ + OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00); + OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xdc00); + COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL); + OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); + OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP1, 0); + return; + } +#endif OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); } @@ -1477,10 +1547,12 @@ else } } -#ifdef SUPPORT_UTF8 -static void do_utf8readchar(compiler_common *common) +#ifdef SUPPORT_UTF + +#ifdef COMPILE_PCRE8 +static void do_utfreadchar(compiler_common *common) { -/* Fast decoding an utf8 character. TMP1 contains the first byte +/* Fast decoding a UTF-8 character. TMP1 contains the first byte of the character (>= 0xc0). Return char value in TMP1, length - 1 in TMP2. */ DEFINE_COMPILER; struct sljit_jump *jump; @@ -1489,82 +1561,57 @@ sljit_emit_fast_enter(compiler, RETURN_ADDR, 0, 1, 5, 5, common->localsize); /* Searching for the first zero. */ OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x20); jump = JUMP(SLJIT_C_NOT_ZERO); -/* 2 byte sequence */ -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); +/* Two byte sequence. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1f); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 6); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 1); +OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(1)); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); JUMPHERE(jump); OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x10); jump = JUMP(SLJIT_C_NOT_ZERO); -/* 3 byte sequence */ -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1); +/* Three byte sequence. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x0f); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 12); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 2); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 2); +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2)); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(2)); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 2); +OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(2)); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); JUMPHERE(jump); -OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x08); -jump = JUMP(SLJIT_C_NOT_ZERO); -/* 4 byte sequence */ -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1); +/* Four byte sequence. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(1)); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x07); OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 18); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 12); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 2); +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(2)); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 3); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 3); +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(3)); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(3)); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 3); -sljit_emit_fast_return(compiler, RETURN_ADDR, 0); -JUMPHERE(jump); - -/* 5 byte sequence */ -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 1); -OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x03); -OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 24); -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 18); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 2); -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 12); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 3); -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); -OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV_UB, TMP2, 0, SLJIT_MEM1(STR_PTR), 4); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 4); -OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3f); -OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); -OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, 4); +OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(3)); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); } -static void do_utf8readtype8(compiler_common *common) +static void do_utfreadtype8(compiler_common *common) { -/* Fast decoding an utf8 character type. TMP2 contains the first byte -of the character (>= 0xc0) and TMP1 is destroyed. Return value in TMP1. */ +/* Fast decoding a UTF-8 character type. TMP2 contains the first byte +of the character (>= 0xc0). Return value in TMP1. */ DEFINE_COMPILER; struct sljit_jump *jump; struct sljit_jump *compare; @@ -1573,9 +1620,9 @@ sljit_emit_fast_enter(compiler, RETURN_ADDR, 0, 1, 5, 5, common->localsize); OP2(SLJIT_AND | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP2, 0, SLJIT_IMM, 0x20); jump = JUMP(SLJIT_C_NOT_ZERO); -/* 2 byte sequence */ -OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); -OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); +/* Two byte sequence. */ +OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(0)); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x1f); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 6); OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3f); @@ -1596,7 +1643,38 @@ OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, 0); sljit_emit_fast_return(compiler, RETURN_ADDR, 0); } -#endif +#else /* COMPILE_PCRE8 */ + +#ifdef COMPILE_PCRE16 +static void do_utfreadchar(compiler_common *common) +{ +/* Fast decoding a UTF-16 character. TMP1 contains the first 16 bit char +of the character (>= 0xd800). Return char value in TMP1, length - 1 in TMP2. */ +DEFINE_COMPILER; +struct sljit_jump *jump; + +sljit_emit_fast_enter(compiler, RETURN_ADDR, 0, 1, 5, 5, common->localsize); +jump = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xdc00); +/* Do nothing, only return. */ +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); + +JUMPHERE(jump); +/* Combine two 16 bit characters. */ +OP1(MOV_UCHAR, TMP2, 0, SLJIT_MEM1(STR_PTR), 1); +OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); +OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x3ff); +OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 10); +OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0x3ff); +OP2(SLJIT_OR, TMP1, 0, TMP1, 0, TMP2, 0); +OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, IN_UCHARS(1)); +OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x10000); +sljit_emit_fast_return(compiler, RETURN_ADDR, 0); +} +#endif /* COMPILE_PCRE16 */ + +#endif /* COMPILE_PCRE8 */ + +#endif /* SUPPORT_UTF */ #ifdef SUPPORT_UCP @@ -1634,8 +1712,8 @@ struct sljit_label *newlinelabel = NULL; struct sljit_jump *start; struct sljit_jump *end = NULL; struct sljit_jump *nl = NULL; -#ifdef SUPPORT_UTF8 -struct sljit_jump *singlebyte; +#ifdef SUPPORT_UTF +struct sljit_jump *singlechar; #endif jump_list *newline = NULL; BOOL newlinecheck = FALSE; @@ -1708,13 +1786,25 @@ if (newlinecheck) CMPTO(SLJIT_C_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff, newlinelabel); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 if (common->utf) { - singlebyte = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); + singlechar = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); - JUMPHERE(singlebyte); + JUMPHERE(singlechar); + } +#endif +#if defined SUPPORT_UTF && defined COMPILE_PCRE16 +if (common->utf) + { + singlechar = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800); + OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00); + OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800); + COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL); + OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); + JUMPHERE(singlechar); } #endif JUMPHERE(start); @@ -1770,7 +1860,7 @@ else } OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 if (common->utf) { CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start); @@ -1778,6 +1868,17 @@ if (common->utf) OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); } #endif +#if defined SUPPORT_UTF && defined COMPILE_PCRE16 +if (common->utf) + { + CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800, start); + OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00); + OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800); + COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL); + OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); + } +#endif JUMPTO(SLJIT_JUMP, start); JUMPHERE(found); JUMPHERE(leave); @@ -1900,7 +2001,7 @@ if (common->utf) OP1(SLJIT_MOV, TMP1, 0, TMP3, 0); #endif OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 if (common->utf) { CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0, start); @@ -1908,6 +2009,17 @@ if (common->utf) OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); } #endif +#if defined SUPPORT_UTF && defined COMPILE_PCRE16 +if (common->utf) + { + CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800, start); + OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00); + OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800); + COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL); + OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); + } +#endif JUMPTO(SLJIT_JUMP, start); JUMPHERE(found); JUMPHERE(leave); @@ -2335,10 +2447,10 @@ if (context->sourcereg == -1) context->sourcereg = TMP2; } -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF utflength = 1; -if (common->utf && *cc >= 0xc0) - utflength += PRIV(utf8_table4)[*cc & 0x3f]; +if (common->utf && HAS_EXTRALEN(*cc)) + utflength += GET_EXTRALEN(*cc); do { @@ -2523,8 +2635,8 @@ while (*cc != XCL_END) if (*cc == XCL_SINGLE) { cc += 2; -#ifdef SUPPORT_UTF8 - if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; +#ifdef SUPPORT_UTF + if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif #ifdef SUPPORT_UCP needschar = TRUE; @@ -2533,12 +2645,12 @@ while (*cc != XCL_END) else if (*cc == XCL_RANGE) { cc += 2; -#ifdef SUPPORT_UTF8 - if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; +#ifdef SUPPORT_UTF + if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif cc++; -#ifdef SUPPORT_UTF8 - if (common->utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; +#ifdef SUPPORT_UTF + if (common->utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif #ifdef SUPPORT_UCP needschar = TRUE; @@ -2875,24 +2987,35 @@ switch(type) case OP_ALLANY: check_input_end(common, fallbacks); -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (common->utf) { - OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); + OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); +#ifdef COMPILE_PCRE8 jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0); OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); +#else /* COMPILE_PCRE8 */ +#ifdef COMPILE_PCRE16 + jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xd800); + OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, 0xfc00); + OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xd800); + COND_VALUE(SLJIT_MOV, TMP1, 0, SLJIT_C_EQUAL); + OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); +#endif /* COMPILE_PCRE16 */ +#endif /* COMPILE_PCRE8 */ JUMPHERE(jump[0]); return cc; } #endif - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); return cc; case OP_ANYBYTE: check_input_end(common, fallbacks); - OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); + OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); return cc; #ifdef SUPPORT_UTF8 @@ -3095,8 +3218,8 @@ switch(type) case OP_CHAR: case OP_CHARI: length = 1; -#ifdef SUPPORT_UTF8 - if (common->utf && *cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f]; +#ifdef SUPPORT_UTF + if (common->utf && HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc); #endif if (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0) { @@ -3129,11 +3252,11 @@ switch(type) case OP_NOT: case OP_NOTI: -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (common->utf) { length = 1; - if (*cc >= 0xc0) length += PRIV(utf8_table4)[*cc & 0x3f]; + if (HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc); check_input_end(common, fallbacks); GETCHAR(c, cc); @@ -3152,7 +3275,9 @@ switch(type) /* Skip the variable-length character. */ OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 1); jump[0] = CMP(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, 0xc0); +#ifdef COMPILE_PCRE8 OP1(SLJIT_MOV_UB, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_w)PRIV(utf8_table4) - 0xc0); +#endif OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); JUMPHERE(jump[0]); return cc + length; @@ -3268,21 +3393,21 @@ do if (*cc == OP_CHAR) { size = 1; -#ifdef SUPPORT_UTF8 - if (common->utf && cc[1] >= 0xc0) - size += PRIV(utf8_table4)[cc[1] & 0x3f]; +#ifdef SUPPORT_UTF + if (common->utf && HAS_EXTRALEN(cc[1])) + size += GET_EXTRALEN(cc[1]); #endif } else if (*cc == OP_CHARI) { size = 1; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (common->utf) { if (char_has_othercase(common, cc + 1) && char_get_othercase_bit(common, cc + 1) == 0) size = 0; - else if (cc[1] >= 0xc0) - size += PRIV(utf8_table4)[cc[1] & 0x3f]; + else if (HAS_EXTRALEN(cc[1])) + size += GET_EXTRALEN(cc[1]); } else #endif @@ -4786,8 +4911,8 @@ if (*type == 0) if (end != NULL) { *end = cc + 1; -#ifdef SUPPORT_UTF8 - if (common->utf && *cc >= 0xc0) *end += PRIV(utf8_table4)[*cc & 0x3f]; +#ifdef SUPPORT_UTF + if (common->utf && HAS_EXTRALEN(*cc)) *end += GET_EXTRALEN(*cc); #endif } return cc; @@ -6259,9 +6384,11 @@ common->utf = (re->options & PCRE_UTF8) != 0; #ifdef SUPPORT_UCP common->useucp = (re->options & PCRE_UCP) != 0; #endif -common->utf8readchar = NULL; -common->utf8readtype8 = NULL; +common->utfreadchar = NULL; +#ifdef COMPILE_PCRE8 +common->utfreadtype8 = NULL; #endif +#endif /* SUPPORT_UTF8 */ #ifdef SUPPORT_UCP common->getucd = NULL; #endif @@ -6487,18 +6614,20 @@ if (common->caselesscmp != NULL) set_jumps(common->caselesscmp, LABEL()); do_caselesscmp(common); } -#ifdef SUPPORT_UTF8 -if (common->utf8readchar != NULL) +#ifdef SUPPORT_UTF +if (common->utfreadchar != NULL) { - set_jumps(common->utf8readchar, LABEL()); - do_utf8readchar(common); + set_jumps(common->utfreadchar, LABEL()); + do_utfreadchar(common); } -if (common->utf8readtype8 != NULL) +#ifdef COMPILE_PCRE8 +if (common->utfreadtype8 != NULL) { - set_jumps(common->utf8readtype8, LABEL()); - do_utf8readtype8(common); + set_jumps(common->utfreadtype8, LABEL()); + do_utfreadtype8(common); } #endif +#endif /* COMPILE_PCRE8 */ #ifdef SUPPORT_UCP if (common->getucd != NULL) { |