From 4cf8a94be5fb7b63eae7ffbe47b66afef880b929 Mon Sep 17 00:00:00 2001 From: zherczeg Date: Wed, 6 Nov 2019 14:00:21 +0000 Subject: JIT ARM64 fixes by Sebastian Pop. git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1181 6239d852-aaf2-0410-a92c-79f79f948069 --- src/pcre2_jit_neon_inc.h | 295 +++++++++++++++++++++++++++ src/pcre2_jit_simd_inc.h | 510 +++++++++++++++++++++++------------------------ 2 files changed, 539 insertions(+), 266 deletions(-) create mode 100644 src/pcre2_jit_neon_inc.h diff --git a/src/pcre2_jit_neon_inc.h b/src/pcre2_jit_neon_inc.h new file mode 100644 index 0000000..7d2c8a7 --- /dev/null +++ b/src/pcre2_jit_neon_inc.h @@ -0,0 +1,295 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + This module by Zoltan Herczeg and Sebastian Pop + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2019 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +# if defined(FFCS) +# if defined(FF_UTF) +# define FF_FUN ffcs_utf +# else +# define FF_FUN ffcs +# endif + +# elif defined(FFCS_2) +# if defined(FF_UTF) +# define FF_FUN ffcs_2_utf +# else +# define FF_FUN ffcs_2 +# endif + +# elif defined(FFCS_MASK) +# if defined(FF_UTF) +# define FF_FUN ffcs_mask_utf +# else +# define FF_FUN ffcs_mask +# endif + +# elif defined(FFCPS_0) +# if defined (FF_UTF) +# define FF_FUN ffcps_0_utf +# else +# define FF_FUN ffcps_0 +# endif + +# elif defined (FFCPS_1) +# if defined (FF_UTF) +# define FF_FUN ffcps_1_utf +# else +# define FF_FUN ffcps_1 +# endif + +# elif defined (FFCPS_DEFAULT) +# if defined (FF_UTF) +# define FF_FUN ffcps_default_utf +# else +# define FF_FUN ffcps_default +# endif +# endif + +static sljit_u8* SLJIT_FUNC FF_FUN(sljit_u8 *str_end, sljit_u8 *str_ptr, sljit_uw offs1, sljit_uw offs2, sljit_uw chars) +#undef FF_FUN +{ +quad_word qw; +int_char ic; +ic.x = chars; + +#if defined(FFCS) +sljit_u8 c1 = ic.c.c1; +vect_t vc1 = VDUPQ(c1); + +#elif defined(FFCS_2) +sljit_u8 c1 = ic.c.c1; +vect_t vc1 = VDUPQ(c1); +sljit_u8 c2 = ic.c.c2; +vect_t vc2 = VDUPQ(c2); + +#elif defined(FFCS_MASK) +sljit_u8 c1 = ic.c.c1; +vect_t vc1 = VDUPQ(c1); +sljit_u8 mask = ic.c.c2; +vect_t vmask = VDUPQ(mask); +#endif + +#if defined(FFCPS) +compare_type compare1_type = compare_match1; +compare_type compare2_type = compare_match1; +vect_t cmp1a, cmp1b, cmp2a, cmp2b; +const sljit_u32 diff = IN_UCHARS(offs1 - offs2); +PCRE2_UCHAR char1a = ic.c.c1; +PCRE2_UCHAR char1b = ic.c.c2; +PCRE2_UCHAR char2a = ic.c.c3; +PCRE2_UCHAR char2b = ic.c.c4; + +# ifdef FFCPS_CHAR1A2A +cmp1a = VDUPQ(char1a); +cmp2a = VDUPQ(char2a); +# else +if (char1a == char1b) + cmp1a = VDUPQ(char1a); +else + { + sljit_u32 bit1 = char1a ^ char1b; + if (is_powerof2(bit1)) + { + compare1_type = compare_match1i; + cmp1a = VDUPQ(char1a | bit1); + cmp1b = VDUPQ(bit1); + } + else + { + compare1_type = compare_match2; + cmp1a = VDUPQ(char1a); + cmp1b = VDUPQ(char1b); + } + } + +if (char2a == char2b) + cmp2a = VDUPQ(char2a); +else + { + sljit_u32 bit2 = char2a ^ char2b; + if (is_powerof2(bit2)) + { + compare2_type = compare_match1i; + cmp2a = VDUPQ(char2a | bit2); + cmp2b = VDUPQ(bit2); + } + else + { + compare2_type = compare_match2; + cmp2a = VDUPQ(char2a); + cmp2b = VDUPQ(char2b); + } + } +# endif + +str_ptr += offs1; +#endif + +restart:; +#if defined(FFCPS) +sljit_u8 *p1 = str_ptr - diff; +#endif +sljit_s32 align_offset = ((uint64_t)str_ptr & 0xf); +str_ptr = (sljit_u8 *) ((uint64_t)str_ptr & ~0xf); +vect_t data = VLD1Q(str_ptr); + +#if defined(FFCS) +vect_t eq = VCEQQ(data, vc1); + +#elif defined(FFCS_2) +vect_t eq1 = VCEQQ(data, vc1); +vect_t eq2 = VCEQQ(data, vc2); +vect_t eq = VORRQ(eq1, eq2); + +#elif defined(FFCS_MASK) +vect_t eq = VORRQ(data, vmask); +eq = VCEQQ(eq, vc1); + +#elif defined(FFCPS) +# if defined(FFCPS_DIFF1) +vect_t prev_data = data; +# endif +vect_t data2 = VLD1Q(str_ptr - diff); + +data = fast_forward_char_pair_compare(compare1_type, data, cmp1a, cmp1b); +data2 = fast_forward_char_pair_compare(compare2_type, data2, cmp2a, cmp2b); +vect_t eq = VANDQ(data, data2); +#endif + +VST1Q(qw.mem, eq); +/* Ignore matches before the first STR_PTR. */ +if (align_offset < 8) + { + qw.dw[0] >>= align_offset * 8; + if (qw.dw[0]) + { + str_ptr += align_offset + __builtin_ctzll(qw.dw[0]) / 8; + goto match; + } + if (qw.dw[1]) + { + str_ptr += 8 + __builtin_ctzll(qw.dw[1]) / 8; + goto match; + } + } +else + { + qw.dw[1] >>= (align_offset - 8) * 8; + if (qw.dw[1]) + { + str_ptr += align_offset + __builtin_ctzll(qw.dw[1]) / 8; + goto match; + } + } +str_ptr += 16; + +while (str_ptr < str_end) + { + vect_t orig_data = VLD1Q(str_ptr); + data = orig_data; + +#if defined(FFCS) + eq = VCEQQ(data, vc1); + +#elif defined(FFCS_2) + eq1 = VCEQQ(data, vc1); + eq2 = VCEQQ(data, vc2); + eq = VORRQ(eq1, eq2); + +#elif defined(FFCS_MASK) + eq = VORRQ(data, vmask); + eq = VCEQQ(eq, vc1); +#endif + +#if defined(FFCPS) +# if defined (FFCPS_DIFF1) + data2 = VEXTQ(prev_data, data, 15); +# else + data2 = VLD1Q(str_ptr - diff); +# endif + +# ifdef FFCPS_CHAR1A2A + data = VCEQQ(data, cmp1a); + data2 = VCEQQ(data2, cmp2a); +# else + data = fast_forward_char_pair_compare(compare1_type, data, cmp1a, cmp1b); + data2 = fast_forward_char_pair_compare(compare2_type, data2, cmp2a, cmp2b); +# endif + + eq = VANDQ(data, data2); +#endif + + VST1Q(qw.mem, eq); + if (qw.dw[0]) + str_ptr += __builtin_ctzll(qw.dw[0]) / 8; + else if (qw.dw[1]) + str_ptr += 8 + __builtin_ctzll(qw.dw[1]) / 8; + else { + str_ptr += 16; +#if defined (FFCPS_DIFF1) + prev_data = orig_data; +#endif + continue; + } + +match:; + if (str_ptr >= str_end) + /* Failed match. */ + return NULL; + +#if defined(FF_UTF) + if (utf_continue(str_ptr + IN_UCHARS(-offs1))) + { + /* Not a match. */ + str_ptr += IN_UCHARS(1); + goto restart; + } +#endif + + /* Match. */ +#if defined (FFCPS) + str_ptr -= IN_UCHARS(offs1); +#endif + return str_ptr; + } + +/* Failed match. */ +return NULL; +} diff --git a/src/pcre2_jit_simd_inc.h b/src/pcre2_jit_simd_inc.h index 3dab0aa..cd91578 100644 --- a/src/pcre2_jit_simd_inc.h +++ b/src/pcre2_jit_simd_inc.h @@ -636,101 +636,12 @@ if (common->match_end_ptr != 0) #include -typedef union { - uint8_t mem[16]; - uint64_t dw[2]; -} quad_word; - typedef union { unsigned int x; struct { unsigned char c1, c2, c3, c4; } c; } int_char; -static SLJIT_INLINE void emit_memchr(struct sljit_compiler *compiler, PCRE2_UCHAR char1) -{ -SLJIT_ASSERT(STR_PTR == SLJIT_R1); -/* We need to be careful in the order we store argument passing registers, as STR_PTR is same as SLJIT_R1. */ -OP1(SLJIT_MOV, SLJIT_R0, 0, STR_PTR, 0); -OP2(SLJIT_SUB, SLJIT_R2, 0, STR_END, 0, STR_PTR, 0); -OP1(SLJIT_MOV_U8, SLJIT_R1, 0, SLJIT_IMM, char1); -sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(memchr)); -} - -static sljit_u8* SLJIT_FUNC sljit_memchr_mask(sljit_u8 *str, sljit_uw n, sljit_u8 c1mask, sljit_u8 mask) -{ -if (n >= 16) - { - quad_word qw; - uint8x16_t vmask = vdupq_n_u8(mask); - uint8x16_t vc1mask = vdupq_n_u8(c1mask); - for (; n >= 16; n -= 16, str += 16) - { - uint8x16_t x = vld1q_u8(str); - uint8x16_t xmask = vorrq_u8(x, vmask); - uint8x16_t eq = vceqq_u8(xmask, vc1mask); - vst1q_u8(qw.mem, eq); - if (qw.dw[0]) - return str + __builtin_ctzll(qw.dw[0]) / 8; - if (qw.dw[1]) - return str + 8 + __builtin_ctzll(qw.dw[1]) / 8; - } - } -for (; n > 0; --n, ++str) - if (c1mask == (*str | mask)) - return str; -return NULL; -} - -static SLJIT_INLINE void emit_memchr_mask(struct sljit_compiler *compiler, PCRE2_UCHAR char1, PCRE2_UCHAR mask, sljit_s32 offset) -{ -OP1(SLJIT_MOV_U8, SLJIT_R2, 0, SLJIT_IMM, char1 | mask); -OP1(SLJIT_MOV_U8, SLJIT_R3, 0, SLJIT_IMM, mask); -sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(sljit_memchr_mask)); -} - -/* Like memchr except that we are looking for either one of the two chars c1 or c2. */ -static sljit_u8* SLJIT_FUNC sljit_memchr_2(sljit_u8 *str, sljit_uw n, sljit_u8 c1, sljit_u8 c2) -{ -if (n >= 16) - { - quad_word qw; - uint8x16_t vc1 = vdupq_n_u8(c1); - uint8x16_t vc2 = vdupq_n_u8(c2); - for (; n >= 16; n -= 16, str += 16) - { - uint8x16_t x = vld1q_u8(str); - uint8x16_t eq1 = vceqq_u8(x, vc1); - uint8x16_t eq2 = vceqq_u8(x, vc2); - uint8x16_t eq = vorrq_u8(eq1, eq2); - vst1q_u8(qw.mem, eq); - if (qw.dw[0]) - return str + __builtin_ctzll(qw.dw[0]) / 8; - if (qw.dw[1]) - return str + 8 + __builtin_ctzll(qw.dw[1]) / 8; - } - } -for (; n > 0; --n, ++str) - { - sljit_u8 x = *str; - if (x == c1 || x == c2) - return str; - } -return NULL; -} - -static SLJIT_INLINE void emit_memchr_2(struct sljit_compiler *compiler, PCRE2_UCHAR char1, PCRE2_UCHAR char2) -{ -OP1(SLJIT_MOV_U8, SLJIT_R2, 0, SLJIT_IMM, char1); -OP1(SLJIT_MOV_U8, SLJIT_R3, 0, SLJIT_IMM, char2); -sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(sljit_memchr_2)); -} - - #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - static SLJIT_INLINE int utf_continue(sljit_u8 *s) { #if PCRE2_CODE_UNIT_WIDTH == 8 @@ -741,221 +652,141 @@ return (*s & 0xfc00) == 0xdc00; #error "Unknown code width" #endif } +#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */ -static sljit_u8* SLJIT_FUNC exec_memchr_mask_utf(sljit_u8 *str, sljit_uw n, sljit_uw c, sljit_uw offset) -{ -sljit_u8 c1mask, mask; -int_char ic; -ic.x = c; -c1mask = ic.c.c1; -mask = ic.c.c2; -if (n >= 16) - { - quad_word qw; - uint8x16_t vmask = vdupq_n_u8(mask); - uint8x16_t vc1mask = vdupq_n_u8(c1mask); - for (; n >= 16; n -= 16, str += 16) - { - sljit_u8 *s; - uint8x16_t x = vld1q_u8(str); - uint8x16_t xmask = vorrq_u8(x, vmask); - uint8x16_t eq = vceqq_u8(xmask, vc1mask); - vst1q_u8(qw.mem, eq); - if (qw.dw[0] == 0 && qw.dw[1] == 0) - continue; - if (qw.dw[0]) - s = str + __builtin_ctzll(qw.dw[0]) / 8; - else - s = str + 8 + __builtin_ctzll(qw.dw[1]) / 8; - if (utf_continue(s - offset)) - { - /* Increment by 1 over the matching byte (i.e., -15 + 16). */ - str = s - 15; - continue; - } - return s; - } - } -for (; n > 0; --n, ++str) - { - if (c1mask != (*str | mask)) - continue; - if (utf_continue(str - offset)) - continue; - return str; - } -return NULL; -} - -static SLJIT_INLINE void emit_memchr_mask_utf(struct sljit_compiler *compiler, PCRE2_UCHAR char1, PCRE2_UCHAR mask, sljit_s32 offset) -{ -int_char ic; -ic.c.c1 = char1 | mask; -ic.c.c2 = mask; -OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, ic.x); -OP1(SLJIT_MOV, SLJIT_R3, 0, SLJIT_IMM, offset); -sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(exec_memchr_mask_utf)); -} - - -/* Like sljit_memchr_2 and handle utf. */ -static sljit_u8* SLJIT_FUNC exec_memchr_2_utf(sljit_u8 *str, sljit_uw n, sljit_uw c, sljit_uw offset) -{ -sljit_u8 c1, c2; -int_char ic; -ic.x = c; -c1 = ic.c.c1; -c2 = ic.c.c2; -if (n >= 16) - { - quad_word qw; - uint8x16_t vc1 = vdupq_n_u8(c1); - uint8x16_t vc2 = vdupq_n_u8(c2); - for (; n >= 16; n -= 16, str += 16) - { - sljit_u8 *s; - uint8x16_t x = vld1q_u8(str); - uint8x16_t eq1 = vceqq_u8(x, vc1); - uint8x16_t eq2 = vceqq_u8(x, vc2); - uint8x16_t eq = vorrq_u8(eq1, eq2); - vst1q_u8(qw.mem, eq); - if (qw.dw[0]) - s = str + __builtin_ctzll(qw.dw[0]) / 8; - else if (qw.dw[1]) - s = str + 8 + __builtin_ctzll(qw.dw[1]) / 8; - else - continue; - if (utf_continue(s - offset)) - { - /* Increment by 1 over the matching byte (i.e., -15 + 16). */ - str = s - 15; - continue; - } - return s; - } - } -for (; n > 0; --n, ++str) - { - sljit_u8 x = *str; - if (x != c1 && x != c2) - continue; - if (utf_continue(str - offset)) - continue; - return str; - } -return NULL; -} - -static SLJIT_INLINE void emit_memchr_2_utf(struct sljit_compiler *compiler, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset) -{ -int_char ic; -ic.c.c1 = char1; -ic.c.c2 = char2; - -OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, ic.x); -OP1(SLJIT_MOV, SLJIT_R3, 0, SLJIT_IMM, offset); -sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(exec_memchr_2_utf)); -} - -/* Like memchr and handle utf. */ -static sljit_u8* SLJIT_FUNC exec_memchr_utf(sljit_u8 *str, sljit_uw n, sljit_u8 c, sljit_uw offset) -{ -if (n >= 16) - { - quad_word qw; - uint8x16_t vc = vdupq_n_u8(c); - for (; n >= 16; n -= 16, str += 16) - { - sljit_u8 *s; - uint8x16_t x = vld1q_u8(str); - uint8x16_t eq = vceqq_u8(x, vc); - vst1q_u8(qw.mem, eq); - if (qw.dw[0]) - s = str + __builtin_ctzll(qw.dw[0]) / 8; - else if (qw.dw[1]) - s = str + 8 + __builtin_ctzll(qw.dw[1]) / 8; - else - continue; - if (utf_continue(s - offset)) - { - /* Increment by 1 over the matching byte (i.e., -15 + 16). */ - str = s - 15; - continue; - } - return s; - } - } -for (; n > 0; --n, ++str) - { - if (*str != c) - continue; - if (utf_continue(str - offset)) - continue; - return str; - } -return NULL; -} - -static SLJIT_INLINE void emit_memchr_utf(struct sljit_compiler *compiler, PCRE2_UCHAR char1, sljit_s32 offset) -{ -OP1(SLJIT_MOV, SLJIT_R0, 0, STR_PTR, 0); -OP2(SLJIT_SUB, SLJIT_R1, 0, STR_END, 0, STR_PTR, 0); -OP1(SLJIT_MOV_U8, SLJIT_R2, 0, SLJIT_IMM, char1); -OP1(SLJIT_MOV, SLJIT_R3, 0, SLJIT_IMM, offset); -sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), - SLJIT_IMM, SLJIT_FUNC_OFFSET(exec_memchr_utf)); -} +#if PCRE2_CODE_UNIT_WIDTH == 8 +# define vect_t uint8x16_t +# define VLD1Q vld1q_u8 +# define VCEQQ vceqq_u8 +# define VORRQ vorrq_u8 +# define VST1Q vst1q_u8 +# define VDUPQ vdupq_n_u8 +# define VEXTQ vextq_u8 +# define VANDQ vandq_u8 +typedef union { + uint8_t mem[16]; + uint64_t dw[2]; +} quad_word; +#elif PCRE2_CODE_UNIT_WIDTH == 16 +# define vect_t uint16x8_t +# define VLD1Q vld1q_u16 +# define VCEQQ vceqq_u16 +# define VORRQ vorrq_u16 +# define VST1Q vst1q_u16 +# define VDUPQ vdupq_n_u16 +# define VEXTQ vextq_u16 +# define VANDQ vandq_u16 +typedef union { + uint16_t mem[8]; + uint64_t dw[2]; +} quad_word; +#else +# define vect_t uint32x4_t +# define VLD1Q vld1q_u32 +# define VCEQQ vceqq_u32 +# define VORRQ vorrq_u32 +# define VST1Q vst1q_u32 +# define VDUPQ vdupq_n_u32 +# define VEXTQ vextq_u32 +# define VANDQ vandq_u32 +typedef union { + uint32_t mem[4]; + uint64_t dw[2]; +} quad_word; +#endif -#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */ +#define FFCS +#include "pcre2_jit_neon_inc.h" +#define FF_UTF +#include "pcre2_jit_neon_inc.h" +#undef FFCS +#undef FF_UTF + +#define FFCS_2 +#include "pcre2_jit_neon_inc.h" +#define FF_UTF +#include "pcre2_jit_neon_inc.h" +#undef FF_UTF +#undef FFCS_2 + +#define FFCS_MASK +#include "pcre2_jit_neon_inc.h" +#define FF_UTF +#include "pcre2_jit_neon_inc.h" +#undef FF_UTF +#undef FFCS_MASK #define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1 static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset) { DEFINE_COMPILER; +int_char ic; struct sljit_jump *partial_quit; /* Save temporary registers. */ OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0); OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP3, 0); +/* Prepare function arguments */ +OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0); +OP1(SLJIT_MOV, SLJIT_R1, 0, STR_PTR, 0); +OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, offset); + if (char1 == char2) { + ic.c.c1 = char1; + ic.c.c2 = char2; + OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x); + #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 if (common->utf && offset > 0) - emit_memchr_utf(compiler, char1, offset); + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), + SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_utf)); else - emit_memchr(compiler, char1); + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), + SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs)); #else - emit_memchr(compiler, char1); + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), + SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs)); #endif } else { PCRE2_UCHAR mask = char1 ^ char2; - OP1(SLJIT_MOV, SLJIT_R0, 0, STR_PTR, 0); - OP2(SLJIT_SUB, SLJIT_R1, 0, STR_END, 0, STR_PTR, 0); if (is_powerof2(mask)) { + ic.c.c1 = char1 | mask; + ic.c.c2 = mask; + OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x); + #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 if (common->utf && offset > 0) - emit_memchr_mask_utf(compiler, char1, mask, offset); + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), + SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_mask_utf)); else - emit_memchr_mask(compiler, char1, mask, offset); + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), + SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_mask)); #else - emit_memchr_mask(compiler, char1, mask, offset); + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), + SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_mask)); #endif } else { + ic.c.c1 = char1; + ic.c.c2 = char2; + OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x); + #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 if (common->utf && offset > 0) - emit_memchr_2_utf(compiler, char1, char2, offset); + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), + SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_2_utf)); else - emit_memchr_2(compiler, char1, char2); + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), + SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_2)); #else - emit_memchr_2(compiler, char1, char2); + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW), + SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_2)); #endif } } @@ -974,4 +805,151 @@ OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0); if (common->mode != PCRE2_JIT_COMPLETE) JUMPHERE(partial_quit); } + +typedef enum { + compare_match1, + compare_match1i, + compare_match2, +} compare_type; + +static inline vect_t fast_forward_char_pair_compare(compare_type ctype, vect_t dst, vect_t cmp1, vect_t cmp2) +{ +if (ctype == compare_match2) + { + vect_t tmp = dst; + dst = VCEQQ(dst, cmp1); + tmp = VCEQQ(tmp, cmp2); + dst = VORRQ(dst, tmp); + return dst; + } + +if (ctype == compare_match1i) + dst = VORRQ(dst, cmp2); +dst = VCEQQ(dst, cmp1); +return dst; +} + +static SLJIT_INLINE sljit_u32 max_fast_forward_char_pair_offset(void) +{ +#if PCRE2_CODE_UNIT_WIDTH == 8 +return 15; +#elif PCRE2_CODE_UNIT_WIDTH == 16 +return 7; +#elif PCRE2_CODE_UNIT_WIDTH == 32 +return 3; +#else +#error "Unsupported unit width" +#endif +} + +#define FFCPS +#define FFCPS_DIFF1 +#define FFCPS_CHAR1A2A + +#define FFCPS_0 +#include "pcre2_jit_neon_inc.h" +#define FF_UTF +#include "pcre2_jit_neon_inc.h" +#undef FF_UTF +#undef FFCPS_0 + +#undef FFCPS_CHAR1A2A + +#define FFCPS_1 +#include "pcre2_jit_neon_inc.h" +#define FF_UTF +#include "pcre2_jit_neon_inc.h" +#undef FF_UTF +#undef FFCPS_1 + +#undef FFCPS_DIFF1 + +#define FFCPS_DEFAULT +#include "pcre2_jit_neon_inc.h" +#define FF_UTF +#include "pcre2_jit_neon_inc.h" +#undef FF_UTF +#undef FFCPS + +#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1 + +static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1, + PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b) +{ +DEFINE_COMPILER; +sljit_u32 diff = IN_UCHARS(offs1 - offs2); +struct sljit_jump *partial_quit; +int_char ic; +SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2); +SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset())); +SLJIT_ASSERT(compiler->scratches == 5); + +/* Save temporary register STR_PTR. */ +OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0); + +/* Prepare arguments for the function call. */ +if (common->match_end_ptr == 0) + OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0); +else + { + OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr); + OP2(SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1)); + + OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, STR_END, 0, SLJIT_R0, 0); + CMOV(SLJIT_LESS, SLJIT_R0, STR_END, 0); + } + +OP1(SLJIT_MOV, SLJIT_R1, 0, STR_PTR, 0); +OP1(SLJIT_MOV_S32, SLJIT_R2, 0, SLJIT_IMM, offs1); +OP1(SLJIT_MOV_S32, SLJIT_R3, 0, SLJIT_IMM, offs2); +ic.c.c1 = char1a; +ic.c.c2 = char1b; +ic.c.c3 = char2a; +ic.c.c4 = char2b; +OP1(SLJIT_MOV_U32, SLJIT_R4, 0, SLJIT_IMM, ic.x); + +if (diff == 1) { + if (char1a == char1b && char2a == char2b) { +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 + if (common->utf) + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW), + SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_0_utf)); + else +#endif + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW), + SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_0)); + } else { +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 + if (common->utf) + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW), + SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_1_utf)); + else +#endif + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW), + SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_1)); + } +} else { +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 + if (common->utf) + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW), + SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_default_utf)); + else +#endif + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW), + SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_default)); +} + +/* Restore STR_PTR register. */ +OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0); + +/* Check return value. */ +partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0); +add_jump(compiler, &common->failed_match, partial_quit); + +/* Fast forward STR_PTR to the result of memchr. */ +OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0); + +JUMPHERE(partial_quit); +} + #endif /* SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 */ -- cgit v1.2.1