diff options
Diffstat (limited to 'chromium/v8/src/regexp')
33 files changed, 383 insertions, 237 deletions
diff --git a/chromium/v8/src/regexp/arm/regexp-macro-assembler-arm.cc b/chromium/v8/src/regexp/arm/regexp-macro-assembler-arm.cc index 10dad83c28c..aaee9b196c6 100644 --- a/chromium/v8/src/regexp/arm/regexp-macro-assembler-arm.cc +++ b/chromium/v8/src/regexp/arm/regexp-macro-assembler-arm.cc @@ -224,7 +224,7 @@ void RegExpMacroAssemblerARM::CheckGreedyLoop(Label* on_equal) { } void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, Label* on_no_match) { + int start_reg, bool read_backward, bool unicode, Label* on_no_match) { Label fallthrough; __ ldr(r0, register_location(start_reg)); // Index of start of capture __ ldr(r1, register_location(start_reg + 1)); // Index of end of capture @@ -335,7 +335,10 @@ void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase( { AllowExternalCallThatCantCauseGC scope(masm_); ExternalReference function = - ExternalReference::re_case_insensitive_compare_uc16(isolate()); + unicode ? ExternalReference::re_case_insensitive_compare_unicode( + isolate()) + : ExternalReference::re_case_insensitive_compare_non_unicode( + isolate()); __ CallCFunction(function, argument_count); } diff --git a/chromium/v8/src/regexp/arm/regexp-macro-assembler-arm.h b/chromium/v8/src/regexp/arm/regexp-macro-assembler-arm.h index 549636a6744..910e5c46079 100644 --- a/chromium/v8/src/regexp/arm/regexp-macro-assembler-arm.h +++ b/chromium/v8/src/regexp/arm/regexp-macro-assembler-arm.h @@ -37,7 +37,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerARM virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, + bool read_backward, bool unicode, Label* on_no_match); virtual void CheckNotCharacter(unsigned c, Label* on_not_equal); virtual void CheckNotCharacterAfterAnd(unsigned c, diff --git a/chromium/v8/src/regexp/arm64/regexp-macro-assembler-arm64.cc b/chromium/v8/src/regexp/arm64/regexp-macro-assembler-arm64.cc index 055f5639f5b..b56a8ac709c 100644 --- a/chromium/v8/src/regexp/arm64/regexp-macro-assembler-arm64.cc +++ b/chromium/v8/src/regexp/arm64/regexp-macro-assembler-arm64.cc @@ -294,7 +294,7 @@ void RegExpMacroAssemblerARM64::CheckGreedyLoop(Label* on_equal) { } void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, Label* on_no_match) { + int start_reg, bool read_backward, bool unicode, Label* on_no_match) { Label fallthrough; Register capture_start_offset = w10; @@ -425,7 +425,10 @@ void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase( { AllowExternalCallThatCantCauseGC scope(masm_); ExternalReference function = - ExternalReference::re_case_insensitive_compare_uc16(isolate()); + unicode ? ExternalReference::re_case_insensitive_compare_unicode( + isolate()) + : ExternalReference::re_case_insensitive_compare_non_unicode( + isolate()); __ CallCFunction(function, argument_count); } diff --git a/chromium/v8/src/regexp/arm64/regexp-macro-assembler-arm64.h b/chromium/v8/src/regexp/arm64/regexp-macro-assembler-arm64.h index 2b5feb1dbdc..aeb49aa9fff 100644 --- a/chromium/v8/src/regexp/arm64/regexp-macro-assembler-arm64.h +++ b/chromium/v8/src/regexp/arm64/regexp-macro-assembler-arm64.h @@ -42,7 +42,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerARM64 virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, + bool read_backward, bool unicode, Label* on_no_match); virtual void CheckNotCharacter(unsigned c, Label* on_not_equal); virtual void CheckNotCharacterAfterAnd(unsigned c, diff --git a/chromium/v8/src/regexp/gen-regexp-special-case.cc b/chromium/v8/src/regexp/gen-regexp-special-case.cc index 9606c5d70d9..9ed338fc1d8 100644 --- a/chromium/v8/src/regexp/gen-regexp-special-case.cc +++ b/chromium/v8/src/regexp/gen-regexp-special-case.cc @@ -55,8 +55,9 @@ void PrintSpecial(std::ofstream& out) { CHECK(U_SUCCESS(status)); // Iterate through all chars in BMP except surrogates. - for (UChar32 i = 0; i < kNonBmpStart; i++) { - if (i >= kSurrogateStart && i <= kSurrogateEnd) { + for (UChar32 i = 0; i < static_cast<UChar32>(kNonBmpStart); i++) { + if (i >= static_cast<UChar32>(kSurrogateStart) && + i <= static_cast<UChar32>(kSurrogateEnd)) { continue; // Ignore surrogate range } current.set(i, i); diff --git a/chromium/v8/src/regexp/ia32/regexp-macro-assembler-ia32.cc b/chromium/v8/src/regexp/ia32/regexp-macro-assembler-ia32.cc index 501a0aff604..f439ae7de07 100644 --- a/chromium/v8/src/regexp/ia32/regexp-macro-assembler-ia32.cc +++ b/chromium/v8/src/regexp/ia32/regexp-macro-assembler-ia32.cc @@ -206,7 +206,7 @@ void RegExpMacroAssemblerIA32::CheckGreedyLoop(Label* on_equal) { } void RegExpMacroAssemblerIA32::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, Label* on_no_match) { + int start_reg, bool read_backward, bool unicode, Label* on_no_match) { Label fallthrough; __ mov(edx, register_location(start_reg)); // Index of start of capture __ mov(ebx, register_location(start_reg + 1)); // Index of end of capture @@ -336,7 +336,10 @@ void RegExpMacroAssemblerIA32::CheckNotBackReferenceIgnoreCase( { AllowExternalCallThatCantCauseGC scope(masm_); ExternalReference compare = - ExternalReference::re_case_insensitive_compare_uc16(isolate()); + unicode ? ExternalReference::re_case_insensitive_compare_unicode( + isolate()) + : ExternalReference::re_case_insensitive_compare_non_unicode( + isolate()); __ CallCFunction(compare, argument_count); } // Pop original values before reacting on result value. diff --git a/chromium/v8/src/regexp/ia32/regexp-macro-assembler-ia32.h b/chromium/v8/src/regexp/ia32/regexp-macro-assembler-ia32.h index 2339ca57e15..a30bff29a15 100644 --- a/chromium/v8/src/regexp/ia32/regexp-macro-assembler-ia32.h +++ b/chromium/v8/src/regexp/ia32/regexp-macro-assembler-ia32.h @@ -37,7 +37,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerIA32 virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, + bool read_backward, bool unicode, Label* on_no_match); virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal); virtual void CheckNotCharacterAfterAnd(uint32_t c, diff --git a/chromium/v8/src/regexp/mips/regexp-macro-assembler-mips.cc b/chromium/v8/src/regexp/mips/regexp-macro-assembler-mips.cc index 5f8eb4c6d33..a6289254457 100644 --- a/chromium/v8/src/regexp/mips/regexp-macro-assembler-mips.cc +++ b/chromium/v8/src/regexp/mips/regexp-macro-assembler-mips.cc @@ -226,7 +226,7 @@ void RegExpMacroAssemblerMIPS::CheckGreedyLoop(Label* on_equal) { } void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, Label* on_no_match) { + int start_reg, bool read_backward, bool unicode, Label* on_no_match) { Label fallthrough; __ lw(a0, register_location(start_reg)); // Index of start of capture. __ lw(a1, register_location(start_reg + 1)); // Index of end of capture. @@ -340,7 +340,10 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase( { AllowExternalCallThatCantCauseGC scope(masm_); ExternalReference function = - ExternalReference::re_case_insensitive_compare_uc16(masm_->isolate()); + unicode ? ExternalReference::re_case_insensitive_compare_unicode( + isolate()) + : ExternalReference::re_case_insensitive_compare_non_unicode( + isolate()); __ CallCFunction(function, argument_count); } diff --git a/chromium/v8/src/regexp/mips/regexp-macro-assembler-mips.h b/chromium/v8/src/regexp/mips/regexp-macro-assembler-mips.h index cafa7851803..e2aea1b0910 100644 --- a/chromium/v8/src/regexp/mips/regexp-macro-assembler-mips.h +++ b/chromium/v8/src/regexp/mips/regexp-macro-assembler-mips.h @@ -37,7 +37,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerMIPS virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, + bool read_backward, bool unicode, Label* on_no_match); virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal); virtual void CheckNotCharacterAfterAnd(uint32_t c, diff --git a/chromium/v8/src/regexp/mips64/regexp-macro-assembler-mips64.cc b/chromium/v8/src/regexp/mips64/regexp-macro-assembler-mips64.cc index c443c8da467..e79038b00b7 100644 --- a/chromium/v8/src/regexp/mips64/regexp-macro-assembler-mips64.cc +++ b/chromium/v8/src/regexp/mips64/regexp-macro-assembler-mips64.cc @@ -262,7 +262,7 @@ void RegExpMacroAssemblerMIPS::CheckGreedyLoop(Label* on_equal) { } void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, Label* on_no_match) { + int start_reg, bool read_backward, bool unicode, Label* on_no_match) { Label fallthrough; __ Ld(a0, register_location(start_reg)); // Index of start of capture. __ Ld(a1, register_location(start_reg + 1)); // Index of end of capture. @@ -376,7 +376,10 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase( { AllowExternalCallThatCantCauseGC scope(masm_); ExternalReference function = - ExternalReference::re_case_insensitive_compare_uc16(masm_->isolate()); + unicode ? ExternalReference::re_case_insensitive_compare_unicode( + isolate()) + : ExternalReference::re_case_insensitive_compare_non_unicode( + isolate()); __ CallCFunction(function, argument_count); } diff --git a/chromium/v8/src/regexp/mips64/regexp-macro-assembler-mips64.h b/chromium/v8/src/regexp/mips64/regexp-macro-assembler-mips64.h index 161a01e2fca..aebfec10604 100644 --- a/chromium/v8/src/regexp/mips64/regexp-macro-assembler-mips64.h +++ b/chromium/v8/src/regexp/mips64/regexp-macro-assembler-mips64.h @@ -37,7 +37,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerMIPS virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, + bool read_backward, bool unicode, Label* on_no_match); virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal); virtual void CheckNotCharacterAfterAnd(uint32_t c, diff --git a/chromium/v8/src/regexp/ppc/regexp-macro-assembler-ppc.cc b/chromium/v8/src/regexp/ppc/regexp-macro-assembler-ppc.cc index 5a6eb315103..9db26777d31 100644 --- a/chromium/v8/src/regexp/ppc/regexp-macro-assembler-ppc.cc +++ b/chromium/v8/src/regexp/ppc/regexp-macro-assembler-ppc.cc @@ -242,7 +242,7 @@ void RegExpMacroAssemblerPPC::CheckGreedyLoop(Label* on_equal) { } void RegExpMacroAssemblerPPC::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, Label* on_no_match) { + int start_reg, bool read_backward, bool unicode, Label* on_no_match) { Label fallthrough; __ LoadP(r3, register_location(start_reg), r0); // Index of start of capture __ LoadP(r4, register_location(start_reg + 1), r0); // Index of end @@ -356,7 +356,10 @@ void RegExpMacroAssemblerPPC::CheckNotBackReferenceIgnoreCase( { AllowExternalCallThatCantCauseGC scope(masm_); ExternalReference function = - ExternalReference::re_case_insensitive_compare_uc16(isolate()); + unicode ? ExternalReference::re_case_insensitive_compare_unicode( + isolate()) + : ExternalReference::re_case_insensitive_compare_non_unicode( + isolate()); __ CallCFunction(function, argument_count); } diff --git a/chromium/v8/src/regexp/ppc/regexp-macro-assembler-ppc.h b/chromium/v8/src/regexp/ppc/regexp-macro-assembler-ppc.h index 598691d9883..f6b959837fc 100644 --- a/chromium/v8/src/regexp/ppc/regexp-macro-assembler-ppc.h +++ b/chromium/v8/src/regexp/ppc/regexp-macro-assembler-ppc.h @@ -36,7 +36,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerPPC virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, + bool read_backward, bool unicode, Label* on_no_match); virtual void CheckNotCharacter(unsigned c, Label* on_not_equal); virtual void CheckNotCharacterAfterAnd(unsigned c, unsigned mask, diff --git a/chromium/v8/src/regexp/regexp-ast.h b/chromium/v8/src/regexp/regexp-ast.h index a9106d3d304..643e1fc983f 100644 --- a/chromium/v8/src/regexp/regexp-ast.h +++ b/chromium/v8/src/regexp/regexp-ast.h @@ -76,9 +76,8 @@ class Interval { int to_; }; - -// Represents code units in the range from from_ to to_, both ends are -// inclusive. +// Represents code points (with values up to 0x10FFFF) in the range from from_ +// to to_, both ends are inclusive. class CharacterRange { public: CharacterRange() : from_(0), to_(0) {} diff --git a/chromium/v8/src/regexp/regexp-bytecode-generator.cc b/chromium/v8/src/regexp/regexp-bytecode-generator.cc index e82b67b530a..8abd15384e7 100644 --- a/chromium/v8/src/regexp/regexp-bytecode-generator.cc +++ b/chromium/v8/src/regexp/regexp-bytecode-generator.cc @@ -182,7 +182,7 @@ void RegExpBytecodeGenerator::LoadCurrentCharacterImpl(int cp_offset, int eats_at_least) { DCHECK_GE(eats_at_least, characters); if (eats_at_least > characters && check_bounds) { - DCHECK(is_uint24(cp_offset + eats_at_least)); + DCHECK(is_int24(cp_offset + eats_at_least)); Emit(BC_CHECK_CURRENT_POSITION, cp_offset + eats_at_least); EmitOrLink(on_failure); check_bounds = false; // Load below doesn't need to check. @@ -329,11 +329,13 @@ void RegExpBytecodeGenerator::CheckNotBackReference(int start_reg, } void RegExpBytecodeGenerator::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, Label* on_not_equal) { + int start_reg, bool read_backward, bool unicode, Label* on_not_equal) { DCHECK_LE(0, start_reg); DCHECK_GE(kMaxRegister, start_reg); - Emit(read_backward ? BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD - : BC_CHECK_NOT_BACK_REF_NO_CASE, + Emit(read_backward ? (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD + : BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) + : (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE + : BC_CHECK_NOT_BACK_REF_NO_CASE), start_reg); EmitOrLink(on_not_equal); } diff --git a/chromium/v8/src/regexp/regexp-bytecode-generator.h b/chromium/v8/src/regexp/regexp-bytecode-generator.h index fdb9b468619..9c4b6057c23 100644 --- a/chromium/v8/src/regexp/regexp-bytecode-generator.h +++ b/chromium/v8/src/regexp/regexp-bytecode-generator.h @@ -69,6 +69,7 @@ class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler { void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match) override; void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward, + bool unicode, Label* on_no_match) override; void IfRegisterLT(int register_index, int comparand, Label* if_lt) override; void IfRegisterGE(int register_index, int comparand, Label* if_ge) override; diff --git a/chromium/v8/src/regexp/regexp-bytecode-peephole.cc b/chromium/v8/src/regexp/regexp-bytecode-peephole.cc index f0957f0779a..dcbafac334f 100644 --- a/chromium/v8/src/regexp/regexp-bytecode-peephole.cc +++ b/chromium/v8/src/regexp/regexp-bytecode-peephole.cc @@ -187,7 +187,8 @@ class RegExpBytecodePeephole { BytecodeSequenceNode& CreateSequence(int bytecode); // Checks for optimization candidates at pc and emits optimized bytecode to // the internal buffer. Returns the length of replaced bytecodes in bytes. - int TryOptimizeSequence(const byte* bytecode, int start_pc); + int TryOptimizeSequence(const byte* bytecode, int bytecode_length, + int start_pc); // Emits optimized bytecode to the internal buffer. start_pc points to the // start of the sequence in bytecode and last_node is the last // BytecodeSequenceNode of the matching sequence found. @@ -626,7 +627,7 @@ bool RegExpBytecodePeephole::OptimizeBytecode(const byte* bytecode, bool did_optimize = false; while (old_pc < length) { - int replaced_len = TryOptimizeSequence(bytecode, old_pc); + int replaced_len = TryOptimizeSequence(bytecode, length, old_pc); if (replaced_len > 0) { old_pc += replaced_len; did_optimize = true; @@ -659,6 +660,7 @@ BytecodeSequenceNode& RegExpBytecodePeephole::CreateSequence(int bytecode) { } int RegExpBytecodePeephole::TryOptimizeSequence(const byte* bytecode, + int bytecode_length, int start_pc) { BytecodeSequenceNode* seq_node = sequences_; BytecodeSequenceNode* valid_seq_end = nullptr; @@ -667,13 +669,12 @@ int RegExpBytecodePeephole::TryOptimizeSequence(const byte* bytecode, // Check for the longest valid sequence matching any of the pre-defined // sequences in the Trie data structure. - while ((seq_node = seq_node->Find(bytecode[current_pc]))) { - if (!seq_node->CheckArguments(bytecode, start_pc)) { - break; - } - if (seq_node->IsSequence()) { - valid_seq_end = seq_node; - } + while (current_pc < bytecode_length) { + seq_node = seq_node->Find(bytecode[current_pc]); + if (seq_node == nullptr) break; + if (!seq_node->CheckArguments(bytecode, start_pc)) break; + + if (seq_node->IsSequence()) valid_seq_end = seq_node; current_pc += RegExpBytecodeLength(bytecode[current_pc]); } diff --git a/chromium/v8/src/regexp/regexp-bytecodes.h b/chromium/v8/src/regexp/regexp-bytecodes.h index 1664a476d29..e3248d7b837 100644 --- a/chromium/v8/src/regexp/regexp-bytecodes.h +++ b/chromium/v8/src/regexp/regexp-bytecodes.h @@ -5,6 +5,7 @@ #ifndef V8_REGEXP_REGEXP_BYTECODES_H_ #define V8_REGEXP_REGEXP_BYTECODES_H_ +#include "src/base/bounds.h" #include "src/base/macros.h" #include "src/common/globals.h" @@ -27,6 +28,7 @@ STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK); // TODO(pthier): Argument offsets of bytecodes should be easily accessible by // name or at least by position. +// TODO(jgruber): More precise types (e.g. int32/uint32 instead of value32). #define BYTECODE_ITERATOR(V) \ V(BREAK, 0, 4) /* bc8 */ \ V(PUSH_CP, 1, 4) /* bc8 pad24 */ \ @@ -101,12 +103,12 @@ STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK); V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \ V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \ V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \ - V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \ - V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \ - V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) /* UNUSED */ \ + V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \ + V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \ + V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) \ V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \ V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */ \ - V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) /* UNUSED */ \ + V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) \ V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */ \ V(CHECK_REGISTER_LT, 44, 12) /* bc8 reg_idx24 value32 addr32 */ \ V(CHECK_REGISTER_GE, 45, 12) /* bc8 reg_idx24 value32 addr32 */ \ @@ -229,16 +231,18 @@ static constexpr int kRegExpBytecodeLengths[] = { }; inline constexpr int RegExpBytecodeLength(int bytecode) { + CONSTEXPR_DCHECK(base::IsInRange(bytecode, 0, kRegExpBytecodeCount - 1)); return kRegExpBytecodeLengths[bytecode]; } -static const char* const kRegExpBytecodeNames[] = { +static constexpr const char* const kRegExpBytecodeNames[] = { #define DECLARE_BYTECODE_NAME(name, ...) #name, BYTECODE_ITERATOR(DECLARE_BYTECODE_NAME) #undef DECLARE_BYTECODE_NAME }; -inline const char* RegExpBytecodeName(int bytecode) { +inline constexpr const char* RegExpBytecodeName(int bytecode) { + CONSTEXPR_DCHECK(base::IsInRange(bytecode, 0, kRegExpBytecodeCount - 1)); return kRegExpBytecodeNames[bytecode]; } diff --git a/chromium/v8/src/regexp/regexp-compiler-tonode.cc b/chromium/v8/src/regexp/regexp-compiler-tonode.cc index 9496de83e10..5fd53390797 100644 --- a/chromium/v8/src/regexp/regexp-compiler-tonode.cc +++ b/chromium/v8/src/regexp/regexp-compiler-tonode.cc @@ -56,11 +56,11 @@ static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges, return false; } for (int i = 0; i < length; i += 2) { - if (special_class[i] != (range.to() + 1)) { + if (static_cast<uc32>(special_class[i]) != (range.to() + 1)) { return false; } range = ranges->at((i >> 1) + 1); - if (special_class[i + 1] != range.from()) { + if (static_cast<uc32>(special_class[i + 1]) != range.from()) { return false; } } @@ -79,8 +79,8 @@ static bool CompareRanges(ZoneList<CharacterRange>* ranges, } for (int i = 0; i < length; i += 2) { CharacterRange range = ranges->at(i >> 1); - if (range.from() != special_class[i] || - range.to() != special_class[i + 1] - 1) { + if (range.from() != static_cast<uc32>(special_class[i]) || + range.to() != static_cast<uc32>(special_class[i + 1] - 1)) { return false; } } @@ -1154,7 +1154,7 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, CharacterRange range = ranges->at(i); uc32 from = range.from(); if (from > String::kMaxUtf16CodeUnit) continue; - uc32 to = Min(range.to(), String::kMaxUtf16CodeUnit); + uc32 to = Min(range.to(), String::kMaxUtf16CodeUnitU); // Nothing to be done for surrogates. if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue; if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { @@ -1197,7 +1197,7 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, CharacterRange range = ranges->at(i); uc32 bottom = range.from(); if (bottom > String::kMaxUtf16CodeUnit) continue; - uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit); + uc32 top = Min(range.to(), String::kMaxUtf16CodeUnitU); // Nothing to be done for surrogates. if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue; if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { @@ -1232,7 +1232,7 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, // block we do this for all the blocks covered by the range (handling // characters that is not in a block as a "singleton block"). unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth]; - int pos = bottom; + uc32 pos = bottom; while (pos <= top) { int length = isolate->jsregexp_canonrange()->get(pos, '\0', equivalents); @@ -1265,7 +1265,7 @@ bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { DCHECK_NOT_NULL(ranges); int n = ranges->length(); if (n <= 1) return true; - int max = ranges->at(0).to(); + uc32 max = ranges->at(0).to(); for (int i = 1; i < n; i++) { CharacterRange next_range = ranges->at(i); if (next_range.from() <= max + 1) return false; @@ -1366,7 +1366,7 @@ void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) { // Check whether ranges are already canonical (increasing, non-overlapping, // non-adjacent). int n = character_ranges->length(); - int max = character_ranges->at(0).to(); + uc32 max = character_ranges->at(0).to(); int i = 1; while (i < n) { CharacterRange current = character_ranges->at(i); diff --git a/chromium/v8/src/regexp/regexp-compiler.cc b/chromium/v8/src/regexp/regexp-compiler.cc index a04180fd346..58d598ca768 100644 --- a/chromium/v8/src/regexp/regexp-compiler.cc +++ b/chromium/v8/src/regexp/regexp-compiler.cc @@ -174,6 +174,24 @@ using namespace regexp_compiler_constants; // NOLINT(build/namespaces) // trace is not recorded in the node and so it cannot currently be reused in // the event that code generation is requested for an identical trace. +namespace { + +constexpr uc32 MaxCodeUnit(const bool one_byte) { + STATIC_ASSERT(String::kMaxOneByteCharCodeU <= + std::numeric_limits<uint16_t>::max()); + STATIC_ASSERT(String::kMaxUtf16CodeUnitU <= + std::numeric_limits<uint16_t>::max()); + return one_byte ? String::kMaxOneByteCharCodeU : String::kMaxUtf16CodeUnitU; +} + +constexpr uint32_t CharMask(const bool one_byte) { + STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1)); + STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1)); + return MaxCodeUnit(one_byte); +} + +} // namespace + void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { UNREACHABLE(); } void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) { @@ -386,9 +404,7 @@ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, int pushes = 0; for (int reg = 0; reg <= max_register; reg++) { - if (!affected_registers.Get(reg)) { - continue; - } + if (!affected_registers.Get(reg)) continue; // The chronologically first deferred action in the trace // is used to infer the action needed to restore a register @@ -710,6 +726,20 @@ void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, } } +namespace { + +#ifdef DEBUG +bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) { + STATIC_ASSERT(sizeof(unibrow::uchar) == 4); + for (int i = 0; i < length; i++) { + if (chars[i] > String::kMaxUtf16CodeUnit) return false; + } + return true; +} +#endif // DEBUG + +} // namespace + // Returns the number of characters in the equivalence class, omitting those // that cannot occur in the source string because it is Latin1. static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, @@ -719,6 +749,7 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, #ifdef V8_INTL_SUPPORT if (RegExpCaseFolding::IgnoreSet().contains(character)) { letters[0] = character; + DCHECK(ContainsOnlyUtf16CodeUnits(letters, 1)); return 1; } bool in_special_add_set = @@ -744,9 +775,10 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) { continue; } - letters[items++] = (unibrow::uchar)(cu); + letters[items++] = static_cast<unibrow::uchar>(cu); } } + DCHECK(ContainsOnlyUtf16CodeUnits(letters, items)); return items; #else int length = @@ -768,6 +800,7 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, length = new_length; } + DCHECK(ContainsOnlyUtf16CodeUnits(letters, length)); return length; #endif // V8_INTL_SUPPORT } @@ -820,12 +853,7 @@ static inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler, static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, bool one_byte, uc16 c1, uc16 c2, Label* on_failure) { - uc16 char_mask; - if (one_byte) { - char_mask = String::kMaxOneByteCharCode; - } else { - char_mask = String::kMaxUtf16CodeUnit; - } + const uint32_t char_mask = CharMask(one_byte); uc16 exor = c1 ^ c2; // Check whether exor has only one bit set. if (((exor - 1) & exor) == 0) { @@ -1126,7 +1154,7 @@ static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges, return; } - if ((min_char >> kBits) != (first >> kBits)) { + if ((min_char >> kBits) != static_cast<uc32>(first >> kBits)) { masm->CheckCharacterLT(first, odd_label); GenerateBranches(masm, ranges, start_index + 1, end_index, first, max_char, fall_through, odd_label, even_label); @@ -1185,21 +1213,13 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, ZoneList<CharacterRange>* ranges = cc->ranges(zone); CharacterRange::Canonicalize(ranges); - int max_char; - if (one_byte) { - max_char = String::kMaxOneByteCharCode; - } else { - max_char = String::kMaxUtf16CodeUnit; - } - + const uc32 max_char = MaxCodeUnit(one_byte); int range_count = ranges->length(); int last_valid_range = range_count - 1; while (last_valid_range >= 0) { CharacterRange& range = ranges->at(last_valid_range); - if (range.from() <= max_char) { - break; - } + if (range.from() <= max_char) break; last_valid_range--; } @@ -1240,6 +1260,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, // entry at zero which goes to the failure label, but if there // was already one there we fall through for success on that entry. // Subsequent entries have alternating meaning (success/failure). + // TODO(jgruber,v8:10568): Change `range_boundaries` to a ZoneList<uc32>. ZoneList<int>* range_boundaries = new (zone) ZoneList<int>(last_valid_range, zone); @@ -1256,7 +1277,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, range_boundaries->Add(range.to() + 1, zone); } int end_index = range_boundaries->length() - 1; - if (range_boundaries->at(end_index) > max_char) { + if (static_cast<uc32>(range_boundaries->at(end_index)) > max_char) { end_index--; } @@ -1370,12 +1391,7 @@ static inline uint32_t SmearBitsRight(uint32_t v) { bool QuickCheckDetails::Rationalize(bool asc) { bool found_useful_op = false; - uint32_t char_mask; - if (asc) { - char_mask = String::kMaxOneByteCharCode; - } else { - char_mask = String::kMaxUtf16CodeUnit; - } + const uint32_t char_mask = CharMask(asc); mask_ = 0; value_ = 0; int char_shift = 0; @@ -1495,12 +1511,7 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, if (details->characters() == 1) { // If number of characters preloaded is 1 then we used a byte or 16 bit // load so the value is already masked down. - uint32_t char_mask; - if (compiler->one_byte()) { - char_mask = String::kMaxOneByteCharCode; - } else { - char_mask = String::kMaxUtf16CodeUnit; - } + const uint32_t char_mask = CharMask(compiler->one_byte()); if ((mask & char_mask) == char_mask) need_mask = false; mask &= char_mask; } else { @@ -1551,12 +1562,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, Isolate* isolate = compiler->macro_assembler()->isolate(); DCHECK(characters_filled_in < details->characters()); int characters = details->characters(); - int char_mask; - if (compiler->one_byte()) { - char_mask = String::kMaxOneByteCharCode; - } else { - char_mask = String::kMaxUtf16CodeUnit; - } + const uint32_t char_mask = CharMask(compiler->one_byte()); for (int k = 0; k < elements()->length(); k++) { TextElement elm = elements()->at(k); if (elm.text_type() == TextElement::ATOM) { @@ -1645,26 +1651,22 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, } } CharacterRange range = ranges->at(first_range); - uc16 from = range.from(); - uc16 to = range.to(); - if (to > char_mask) { - to = char_mask; - } - uint32_t differing_bits = (from ^ to); + const uc32 first_from = range.from(); + const uc32 first_to = (range.to() > char_mask) ? char_mask : range.to(); + const uint32_t differing_bits = (first_from ^ first_to); // A mask and compare is only perfect if the differing bits form a // number like 00011111 with one single block of trailing 1s. if ((differing_bits & (differing_bits + 1)) == 0 && - from + differing_bits == to) { + first_from + differing_bits == first_to) { pos->determines_perfectly = true; } uint32_t common_bits = ~SmearBitsRight(differing_bits); - uint32_t bits = (from & common_bits); + uint32_t bits = (first_from & common_bits); for (int i = first_range + 1; i < ranges->length(); i++) { CharacterRange range = ranges->at(i); - uc16 from = range.from(); - uc16 to = range.to(); + const uc32 from = range.from(); if (from > char_mask) continue; - if (to > char_mask) to = char_mask; + const uc32 to = (range.to() > char_mask) ? char_mask : range.to(); // Here we are combining more ranges into the mask and compare // value. With each new range the mask becomes more sparse and // so the chances of a false positive rise. A character class @@ -1684,9 +1686,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, } characters_filled_in++; DCHECK(characters_filled_in <= details->characters()); - if (characters_filled_in == details->characters()) { - return; - } + if (characters_filled_in == details->characters()) return; } } DCHECK(characters_filled_in != details->characters()); @@ -1748,7 +1748,7 @@ void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { pos->mask &= other_pos->mask; pos->value &= pos->mask; other_pos->value &= pos->mask; - uc16 differing_bits = (pos->value ^ other_pos->value); + uint32_t differing_bits = (pos->value ^ other_pos->value); pos->mask &= ~differing_bits; pos->value &= pos->mask; } @@ -1858,16 +1858,20 @@ RegExpNode* TextNode::FilterOneByte(int depth) { if (range_count != 0 && ranges->at(0).from() == 0 && ranges->at(0).to() >= String::kMaxOneByteCharCode) { // This will be handled in a later filter. - if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges)) + if (IgnoreCase(cc->flags()) && + RangesContainLatin1Equivalents(ranges)) { continue; + } return set_replacement(nullptr); } } else { if (range_count == 0 || ranges->at(0).from() > String::kMaxOneByteCharCode) { // This will be handled in a later filter. - if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges)) + if (IgnoreCase(cc->flags()) && + RangesContainLatin1Equivalents(ranges)) { continue; + } return set_replacement(nullptr); } } @@ -2504,12 +2508,7 @@ RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( return ranges->length() == 0 ? on_success() : nullptr; } if (ranges->length() != 1) return nullptr; - uint32_t max_char; - if (compiler->one_byte()) { - max_char = String::kMaxOneByteCharCode; - } else { - max_char = String::kMaxUtf16CodeUnit; - } + const uc32 max_char = MaxCodeUnit(compiler->one_byte()); return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr; } @@ -2719,12 +2718,9 @@ void BoyerMoorePositionInfo::SetAll() { BoyerMooreLookahead::BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone) - : length_(length), compiler_(compiler) { - if (compiler->one_byte()) { - max_char_ = String::kMaxOneByteCharCode; - } else { - max_char_ = String::kMaxUtf16CodeUnit; - } + : length_(length), + compiler_(compiler), + max_char_(MaxCodeUnit(compiler->one_byte())) { bitmaps_ = new (zone) ZoneList<BoyerMoorePositionInfo*>(length, zone); for (int i = 0; i < length; i++) { bitmaps_->Add(new (zone) BoyerMoorePositionInfo(), zone); @@ -3421,8 +3417,9 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { DCHECK_EQ(start_reg_ + 1, end_reg_); if (IgnoreCase(flags_)) { + bool unicode = IsUnicode(flags_); assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(), - trace->backtrack()); + unicode, trace->backtrack()); } else { assembler->CheckNotBackReference(start_reg_, read_backward(), trace->backtrack()); @@ -3787,7 +3784,7 @@ void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget, } else { for (int k = 0; k < ranges->length(); k++) { CharacterRange& range = ranges->at(k); - if (range.from() > max_char) continue; + if (static_cast<int>(range.from()) > max_char) continue; int to = Min(max_char, static_cast<int>(range.to())); bm->SetInterval(offset, Interval(range.from(), to)); } diff --git a/chromium/v8/src/regexp/regexp-compiler.h b/chromium/v8/src/regexp/regexp-compiler.h index a35ffcd01a2..4e7652883c4 100644 --- a/chromium/v8/src/regexp/regexp-compiler.h +++ b/chromium/v8/src/regexp/regexp-compiler.h @@ -96,8 +96,8 @@ class QuickCheckDetails { void set_cannot_match() { cannot_match_ = true; } struct Position { Position() : mask(0), value(0), determines_perfectly(false) {} - uc16 mask; - uc16 value; + uc32 mask; + uc32 value; bool determines_perfectly; }; int characters() { return characters_; } diff --git a/chromium/v8/src/regexp/regexp-dotprinter.cc b/chromium/v8/src/regexp/regexp-dotprinter.cc index b6640626f2c..7cf1e82c4d0 100644 --- a/chromium/v8/src/regexp/regexp-dotprinter.cc +++ b/chromium/v8/src/regexp/regexp-dotprinter.cc @@ -143,7 +143,7 @@ void DotPrinterImpl::VisitText(TextNode* that) { if (node->is_negated()) os_ << "^"; for (int j = 0; j < node->ranges(zone)->length(); j++) { CharacterRange range = node->ranges(zone)->at(j); - os_ << AsUC16(range.from()) << "-" << AsUC16(range.to()); + os_ << AsUC32(range.from()) << "-" << AsUC32(range.to()); } os_ << "]"; break; diff --git a/chromium/v8/src/regexp/regexp-interpreter.cc b/chromium/v8/src/regexp/regexp-interpreter.cc index 0c6d8d5b4be..49215a25446 100644 --- a/chromium/v8/src/regexp/regexp-interpreter.cc +++ b/chromium/v8/src/regexp/regexp-interpreter.cc @@ -35,18 +35,23 @@ namespace internal { namespace { bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len, - Vector<const uc16> subject) { + Vector<const uc16> subject, bool unicode) { Address offset_a = reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(from))); Address offset_b = reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(current))); size_t length = len * kUC16Size; - return RegExpMacroAssembler::CaseInsensitiveCompareUC16(offset_a, offset_b, - length, isolate) == 1; + + bool result = unicode + ? RegExpMacroAssembler::CaseInsensitiveCompareUnicode( + offset_a, offset_b, length, isolate) + : RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode( + offset_a, offset_b, length, isolate); + return result == 1; } bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len, - Vector<const uint8_t> subject) { + Vector<const uint8_t> subject, bool unicode) { // For Latin1 characters the unicode flag makes no difference. for (int i = 0; i < len; i++) { unsigned int old_char = subject[from++]; @@ -100,6 +105,18 @@ int32_t Load16AlignedSigned(const byte* pc) { return *reinterpret_cast<const int16_t*>(pc); } +// Helpers to access the packed argument. Takes the 32 bits containing the +// current bytecode, where the 8 LSB contain the bytecode and the rest contains +// a packed 24-bit argument. +// TODO(jgruber): Specify signed-ness in bytecode signature declarations, and +// police restrictions during bytecode generation. +int32_t LoadPacked24Signed(int32_t bytecode_and_packed_arg) { + return bytecode_and_packed_arg >> BYTECODE_SHIFT; +} +uint32_t LoadPacked24Unsigned(int32_t bytecode_and_packed_arg) { + return static_cast<uint32_t>(bytecode_and_packed_arg) >> BYTECODE_SHIFT; +} + // A simple abstraction over the backtracking stack used by the interpreter. // // Despite the name 'backtracking' stack, it's actually used as a generic stack @@ -296,6 +313,12 @@ bool CheckBitInTable(const uint32_t current_char, const byte* const table) { return (b & (1 << bit)) != 0; } +// Returns true iff 0 <= index < length. +bool IndexIsInBounds(int index, int length) { + DCHECK_GE(length, 0); + return static_cast<uintptr_t>(index) < static_cast<uintptr_t>(length); +} + // If computed gotos are supported by the compiler, we can get addresses to // labels directly in C/C++. Every bytecode handler has its own label and we // store the addresses in a dispatch table indexed by bytecode. To execute the @@ -337,6 +360,14 @@ bool CheckBitInTable(const uint32_t current_char, const byte* const table) { next_pc = code_base + offset; \ DECODE() +// Current position mutations. +#define SET_CURRENT_POSITION(value) \ + do { \ + current = (value); \ + DCHECK(base::IsInRange(current, 0, subject.length())); \ + } while (false) +#define ADVANCE_CURRENT_POSITION(by) SET_CURRENT_POSITION(current + (by)) + #ifdef DEBUG #define BYTECODE(name) \ BC_LABEL(name) \ @@ -447,44 +478,44 @@ IrregexpInterpreter::Result RawMatch( } BYTECODE(PUSH_REGISTER) { ADVANCE(PUSH_REGISTER); - if (!backtrack_stack.push(registers[insn >> BYTECODE_SHIFT])) { + if (!backtrack_stack.push(registers[LoadPacked24Unsigned(insn)])) { return MaybeThrowStackOverflow(isolate, call_origin); } DISPATCH(); } BYTECODE(SET_REGISTER) { ADVANCE(SET_REGISTER); - registers[insn >> BYTECODE_SHIFT] = Load32Aligned(pc + 4); + registers[LoadPacked24Unsigned(insn)] = Load32Aligned(pc + 4); DISPATCH(); } BYTECODE(ADVANCE_REGISTER) { ADVANCE(ADVANCE_REGISTER); - registers[insn >> BYTECODE_SHIFT] += Load32Aligned(pc + 4); + registers[LoadPacked24Unsigned(insn)] += Load32Aligned(pc + 4); DISPATCH(); } BYTECODE(SET_REGISTER_TO_CP) { ADVANCE(SET_REGISTER_TO_CP); - registers[insn >> BYTECODE_SHIFT] = current + Load32Aligned(pc + 4); + registers[LoadPacked24Unsigned(insn)] = current + Load32Aligned(pc + 4); DISPATCH(); } BYTECODE(SET_CP_TO_REGISTER) { ADVANCE(SET_CP_TO_REGISTER); - current = registers[insn >> BYTECODE_SHIFT]; + SET_CURRENT_POSITION(registers[LoadPacked24Unsigned(insn)]); DISPATCH(); } BYTECODE(SET_REGISTER_TO_SP) { ADVANCE(SET_REGISTER_TO_SP); - registers[insn >> BYTECODE_SHIFT] = backtrack_stack.sp(); + registers[LoadPacked24Unsigned(insn)] = backtrack_stack.sp(); DISPATCH(); } BYTECODE(SET_SP_TO_REGISTER) { ADVANCE(SET_SP_TO_REGISTER); - backtrack_stack.set_sp(registers[insn >> BYTECODE_SHIFT]); + backtrack_stack.set_sp(registers[LoadPacked24Unsigned(insn)]); DISPATCH(); } BYTECODE(POP_CP) { ADVANCE(POP_CP); - current = backtrack_stack.pop(); + SET_CURRENT_POSITION(backtrack_stack.pop()); DISPATCH(); } BYTECODE(POP_BT) { @@ -504,7 +535,7 @@ IrregexpInterpreter::Result RawMatch( } BYTECODE(POP_REGISTER) { ADVANCE(POP_REGISTER); - registers[insn >> BYTECODE_SHIFT] = backtrack_stack.pop(); + registers[LoadPacked24Unsigned(insn)] = backtrack_stack.pop(); DISPATCH(); } BYTECODE(FAIL) { @@ -520,7 +551,7 @@ IrregexpInterpreter::Result RawMatch( } BYTECODE(ADVANCE_CP) { ADVANCE(ADVANCE_CP); - current += insn >> BYTECODE_SHIFT; + ADVANCE_CURRENT_POSITION(LoadPacked24Signed(insn)); DISPATCH(); } BYTECODE(GOTO) { @@ -529,7 +560,7 @@ IrregexpInterpreter::Result RawMatch( } BYTECODE(ADVANCE_CP_AND_GOTO) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - current += insn >> BYTECODE_SHIFT; + ADVANCE_CURRENT_POSITION(LoadPacked24Signed(insn)); DISPATCH(); } BYTECODE(CHECK_GREEDY) { @@ -542,7 +573,7 @@ IrregexpInterpreter::Result RawMatch( DISPATCH(); } BYTECODE(LOAD_CURRENT_CHAR) { - int pos = current + (insn >> BYTECODE_SHIFT); + int pos = current + LoadPacked24Signed(insn); if (pos >= subject.length() || pos < 0) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); } else { @@ -553,12 +584,12 @@ IrregexpInterpreter::Result RawMatch( } BYTECODE(LOAD_CURRENT_CHAR_UNCHECKED) { ADVANCE(LOAD_CURRENT_CHAR_UNCHECKED); - int pos = current + (insn >> BYTECODE_SHIFT); + int pos = current + LoadPacked24Signed(insn); current_char = subject[pos]; DISPATCH(); } BYTECODE(LOAD_2_CURRENT_CHARS) { - int pos = current + (insn >> BYTECODE_SHIFT); + int pos = current + LoadPacked24Signed(insn); if (pos + 2 > subject.length() || pos < 0) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); } else { @@ -570,14 +601,14 @@ IrregexpInterpreter::Result RawMatch( } BYTECODE(LOAD_2_CURRENT_CHARS_UNCHECKED) { ADVANCE(LOAD_2_CURRENT_CHARS_UNCHECKED); - int pos = current + (insn >> BYTECODE_SHIFT); + int pos = current + LoadPacked24Signed(insn); Char next = subject[pos + 1]; current_char = (subject[pos] | (next << (kBitsPerByte * sizeof(Char)))); DISPATCH(); } BYTECODE(LOAD_4_CURRENT_CHARS) { DCHECK_EQ(1, sizeof(Char)); - int pos = current + (insn >> BYTECODE_SHIFT); + int pos = current + LoadPacked24Signed(insn); if (pos + 4 > subject.length() || pos < 0) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); } else { @@ -593,7 +624,7 @@ IrregexpInterpreter::Result RawMatch( BYTECODE(LOAD_4_CURRENT_CHARS_UNCHECKED) { ADVANCE(LOAD_4_CURRENT_CHARS_UNCHECKED); DCHECK_EQ(1, sizeof(Char)); - int pos = current + (insn >> BYTECODE_SHIFT); + int pos = current + LoadPacked24Signed(insn); Char next1 = subject[pos + 1]; Char next2 = subject[pos + 2]; Char next3 = subject[pos + 3]; @@ -611,7 +642,7 @@ IrregexpInterpreter::Result RawMatch( DISPATCH(); } BYTECODE(CHECK_CHAR) { - uint32_t c = (insn >> BYTECODE_SHIFT); + uint32_t c = LoadPacked24Unsigned(insn); if (c == current_char) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); } else { @@ -629,7 +660,7 @@ IrregexpInterpreter::Result RawMatch( DISPATCH(); } BYTECODE(CHECK_NOT_CHAR) { - uint32_t c = (insn >> BYTECODE_SHIFT); + uint32_t c = LoadPacked24Unsigned(insn); if (c != current_char) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); } else { @@ -647,7 +678,7 @@ IrregexpInterpreter::Result RawMatch( DISPATCH(); } BYTECODE(AND_CHECK_CHAR) { - uint32_t c = (insn >> BYTECODE_SHIFT); + uint32_t c = LoadPacked24Unsigned(insn); if (c == (current_char & Load32Aligned(pc + 4))) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); } else { @@ -665,7 +696,7 @@ IrregexpInterpreter::Result RawMatch( DISPATCH(); } BYTECODE(AND_CHECK_NOT_CHAR) { - uint32_t c = (insn >> BYTECODE_SHIFT); + uint32_t c = LoadPacked24Unsigned(insn); if (c != (current_char & Load32Aligned(pc + 4))) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); } else { @@ -674,7 +705,7 @@ IrregexpInterpreter::Result RawMatch( DISPATCH(); } BYTECODE(MINUS_AND_CHECK_NOT_CHAR) { - uint32_t c = (insn >> BYTECODE_SHIFT); + uint32_t c = LoadPacked24Unsigned(insn); uint32_t minus = Load16Aligned(pc + 4); uint32_t mask = Load16Aligned(pc + 6); if (c != ((current_char - minus) & mask)) { @@ -713,7 +744,7 @@ IrregexpInterpreter::Result RawMatch( DISPATCH(); } BYTECODE(CHECK_LT) { - uint32_t limit = (insn >> BYTECODE_SHIFT); + uint32_t limit = LoadPacked24Unsigned(insn); if (current_char < limit) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); } else { @@ -722,7 +753,7 @@ IrregexpInterpreter::Result RawMatch( DISPATCH(); } BYTECODE(CHECK_GT) { - uint32_t limit = (insn >> BYTECODE_SHIFT); + uint32_t limit = LoadPacked24Unsigned(insn); if (current_char > limit) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); } else { @@ -731,7 +762,7 @@ IrregexpInterpreter::Result RawMatch( DISPATCH(); } BYTECODE(CHECK_REGISTER_LT) { - if (registers[insn >> BYTECODE_SHIFT] < Load32Aligned(pc + 4)) { + if (registers[LoadPacked24Unsigned(insn)] < Load32Aligned(pc + 4)) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); } else { ADVANCE(CHECK_REGISTER_LT); @@ -739,7 +770,7 @@ IrregexpInterpreter::Result RawMatch( DISPATCH(); } BYTECODE(CHECK_REGISTER_GE) { - if (registers[insn >> BYTECODE_SHIFT] >= Load32Aligned(pc + 4)) { + if (registers[LoadPacked24Unsigned(insn)] >= Load32Aligned(pc + 4)) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); } else { ADVANCE(CHECK_REGISTER_GE); @@ -747,7 +778,7 @@ IrregexpInterpreter::Result RawMatch( DISPATCH(); } BYTECODE(CHECK_REGISTER_EQ_POS) { - if (registers[insn >> BYTECODE_SHIFT] == current) { + if (registers[LoadPacked24Unsigned(insn)] == current) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); } else { ADVANCE(CHECK_REGISTER_EQ_POS); @@ -755,7 +786,7 @@ IrregexpInterpreter::Result RawMatch( DISPATCH(); } BYTECODE(CHECK_NOT_REGS_EQUAL) { - if (registers[insn >> BYTECODE_SHIFT] == + if (registers[LoadPacked24Unsigned(insn)] == registers[Load32Aligned(pc + 4)]) { ADVANCE(CHECK_NOT_REGS_EQUAL); } else { @@ -764,69 +795,94 @@ IrregexpInterpreter::Result RawMatch( DISPATCH(); } BYTECODE(CHECK_NOT_BACK_REF) { - int from = registers[insn >> BYTECODE_SHIFT]; - int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; + int from = registers[LoadPacked24Unsigned(insn)]; + int len = registers[LoadPacked24Unsigned(insn) + 1] - from; if (from >= 0 && len > 0) { if (current + len > subject.length() || CompareChars(&subject[from], &subject[current], len) != 0) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); DISPATCH(); } - current += len; + ADVANCE_CURRENT_POSITION(len); } ADVANCE(CHECK_NOT_BACK_REF); DISPATCH(); } BYTECODE(CHECK_NOT_BACK_REF_BACKWARD) { - int from = registers[insn >> BYTECODE_SHIFT]; - int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; + int from = registers[LoadPacked24Unsigned(insn)]; + int len = registers[LoadPacked24Unsigned(insn) + 1] - from; if (from >= 0 && len > 0) { if (current - len < 0 || CompareChars(&subject[from], &subject[current - len], len) != 0) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); DISPATCH(); } - current -= len; + SET_CURRENT_POSITION(current - len); } ADVANCE(CHECK_NOT_BACK_REF_BACKWARD); DISPATCH(); } BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE) { - UNREACHABLE(); // TODO(jgruber): Remove this unused bytecode. + int from = registers[LoadPacked24Unsigned(insn)]; + int len = registers[LoadPacked24Unsigned(insn) + 1] - from; + if (from >= 0 && len > 0) { + if (current + len > subject.length() || + !BackRefMatchesNoCase(isolate, from, current, len, subject, true)) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + DISPATCH(); + } + ADVANCE_CURRENT_POSITION(len); + } + ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE); + DISPATCH(); } BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) { - int from = registers[insn >> BYTECODE_SHIFT]; - int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; + int from = registers[LoadPacked24Unsigned(insn)]; + int len = registers[LoadPacked24Unsigned(insn) + 1] - from; if (from >= 0 && len > 0) { if (current + len > subject.length() || - !BackRefMatchesNoCase(isolate, from, current, len, subject)) { + !BackRefMatchesNoCase(isolate, from, current, len, subject, + false)) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); DISPATCH(); } - current += len; + ADVANCE_CURRENT_POSITION(len); } ADVANCE(CHECK_NOT_BACK_REF_NO_CASE); DISPATCH(); } BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD) { - UNREACHABLE(); // TODO(jgruber): Remove this unused bytecode. + int from = registers[LoadPacked24Unsigned(insn)]; + int len = registers[LoadPacked24Unsigned(insn) + 1] - from; + if (from >= 0 && len > 0) { + if (current - len < 0 || + !BackRefMatchesNoCase(isolate, from, current - len, len, subject, + true)) { + SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); + DISPATCH(); + } + SET_CURRENT_POSITION(current - len); + } + ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD); + DISPATCH(); } BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) { - int from = registers[insn >> BYTECODE_SHIFT]; - int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; + int from = registers[LoadPacked24Unsigned(insn)]; + int len = registers[LoadPacked24Unsigned(insn) + 1] - from; if (from >= 0 && len > 0) { if (current - len < 0 || - !BackRefMatchesNoCase(isolate, from, current - len, len, subject)) { + !BackRefMatchesNoCase(isolate, from, current - len, len, subject, + false)) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); DISPATCH(); } - current -= len; + SET_CURRENT_POSITION(current - len); } ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD); DISPATCH(); } BYTECODE(CHECK_AT_START) { - if (current + (insn >> BYTECODE_SHIFT) == 0) { + if (current + LoadPacked24Signed(insn) == 0) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); } else { ADVANCE(CHECK_AT_START); @@ -834,7 +890,7 @@ IrregexpInterpreter::Result RawMatch( DISPATCH(); } BYTECODE(CHECK_NOT_AT_START) { - if (current + (insn >> BYTECODE_SHIFT) == 0) { + if (current + LoadPacked24Signed(insn) == 0) { ADVANCE(CHECK_NOT_AT_START); } else { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); @@ -843,15 +899,15 @@ IrregexpInterpreter::Result RawMatch( } BYTECODE(SET_CURRENT_POSITION_FROM_END) { ADVANCE(SET_CURRENT_POSITION_FROM_END); - int by = static_cast<uint32_t>(insn) >> BYTECODE_SHIFT; + int by = LoadPacked24Unsigned(insn); if (subject.length() - current > by) { - current = subject.length() - by; + SET_CURRENT_POSITION(subject.length() - by); current_char = subject[current - 1]; } DISPATCH(); } BYTECODE(CHECK_CURRENT_POSITION) { - int pos = current + (insn >> BYTECODE_SHIFT); + int pos = current + LoadPacked24Signed(insn); if (pos > subject.length() || pos < 0) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); } else { @@ -860,23 +916,22 @@ IrregexpInterpreter::Result RawMatch( DISPATCH(); } BYTECODE(SKIP_UNTIL_CHAR) { - int load_offset = (insn >> BYTECODE_SHIFT); + int32_t load_offset = LoadPacked24Signed(insn); int32_t advance = Load16AlignedSigned(pc + 4); uint32_t c = Load16Aligned(pc + 6); - while (static_cast<uintptr_t>(current + load_offset) < - static_cast<uintptr_t>(subject.length())) { + while (IndexIsInBounds(current + load_offset, subject.length())) { current_char = subject[current + load_offset]; if (c == current_char) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 8)); DISPATCH(); } - current += advance; + ADVANCE_CURRENT_POSITION(advance); } SET_PC_FROM_OFFSET(Load32Aligned(pc + 12)); DISPATCH(); } BYTECODE(SKIP_UNTIL_CHAR_AND) { - int load_offset = (insn >> BYTECODE_SHIFT); + int32_t load_offset = LoadPacked24Signed(insn); int32_t advance = Load16AlignedSigned(pc + 4); uint16_t c = Load16Aligned(pc + 6); uint32_t mask = Load32Aligned(pc + 8); @@ -888,13 +943,13 @@ IrregexpInterpreter::Result RawMatch( SET_PC_FROM_OFFSET(Load32Aligned(pc + 16)); DISPATCH(); } - current += advance; + ADVANCE_CURRENT_POSITION(advance); } SET_PC_FROM_OFFSET(Load32Aligned(pc + 20)); DISPATCH(); } BYTECODE(SKIP_UNTIL_CHAR_POS_CHECKED) { - int load_offset = (insn >> BYTECODE_SHIFT); + int32_t load_offset = LoadPacked24Signed(insn); int32_t advance = Load16AlignedSigned(pc + 4); uint16_t c = Load16Aligned(pc + 6); int32_t maximum_offset = Load32Aligned(pc + 8); @@ -905,34 +960,32 @@ IrregexpInterpreter::Result RawMatch( SET_PC_FROM_OFFSET(Load32Aligned(pc + 12)); DISPATCH(); } - current += advance; + ADVANCE_CURRENT_POSITION(advance); } SET_PC_FROM_OFFSET(Load32Aligned(pc + 16)); DISPATCH(); } BYTECODE(SKIP_UNTIL_BIT_IN_TABLE) { - int load_offset = (insn >> BYTECODE_SHIFT); + int32_t load_offset = LoadPacked24Signed(insn); int32_t advance = Load16AlignedSigned(pc + 4); const byte* table = pc + 8; - while (static_cast<uintptr_t>(current + load_offset) < - static_cast<uintptr_t>(subject.length())) { + while (IndexIsInBounds(current + load_offset, subject.length())) { current_char = subject[current + load_offset]; if (CheckBitInTable(current_char, table)) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 24)); DISPATCH(); } - current += advance; + ADVANCE_CURRENT_POSITION(advance); } SET_PC_FROM_OFFSET(Load32Aligned(pc + 28)); DISPATCH(); } BYTECODE(SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE) { - int load_offset = (insn >> BYTECODE_SHIFT); + int32_t load_offset = LoadPacked24Signed(insn); int32_t advance = Load16AlignedSigned(pc + 4); uint16_t limit = Load16Aligned(pc + 6); const byte* table = pc + 8; - while (static_cast<uintptr_t>(current + load_offset) < - static_cast<uintptr_t>(subject.length())) { + while (IndexIsInBounds(current + load_offset, subject.length())) { current_char = subject[current + load_offset]; if (current_char > limit) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 24)); @@ -942,18 +995,17 @@ IrregexpInterpreter::Result RawMatch( SET_PC_FROM_OFFSET(Load32Aligned(pc + 24)); DISPATCH(); } - current += advance; + ADVANCE_CURRENT_POSITION(advance); } SET_PC_FROM_OFFSET(Load32Aligned(pc + 28)); DISPATCH(); } BYTECODE(SKIP_UNTIL_CHAR_OR_CHAR) { - int load_offset = (insn >> BYTECODE_SHIFT); + int32_t load_offset = LoadPacked24Signed(insn); int32_t advance = Load32Aligned(pc + 4); uint16_t c = Load16Aligned(pc + 8); uint16_t c2 = Load16Aligned(pc + 10); - while (static_cast<uintptr_t>(current + load_offset) < - static_cast<uintptr_t>(subject.length())) { + while (IndexIsInBounds(current + load_offset, subject.length())) { current_char = subject[current + load_offset]; // The two if-statements below are split up intentionally, as combining // them seems to result in register allocation behaving quite @@ -966,7 +1018,7 @@ IrregexpInterpreter::Result RawMatch( SET_PC_FROM_OFFSET(Load32Aligned(pc + 12)); DISPATCH(); } - current += advance; + ADVANCE_CURRENT_POSITION(advance); } SET_PC_FROM_OFFSET(Load32Aligned(pc + 16)); DISPATCH(); @@ -986,6 +1038,8 @@ IrregexpInterpreter::Result RawMatch( } #undef BYTECODE +#undef ADVANCE_CURRENT_POSITION +#undef SET_CURRENT_POSITION #undef DISPATCH #undef DECODE #undef SET_PC_FROM_OFFSET diff --git a/chromium/v8/src/regexp/regexp-macro-assembler-tracer.cc b/chromium/v8/src/regexp/regexp-macro-assembler-tracer.cc index 0a122017437..d1feec4c33d 100644 --- a/chromium/v8/src/regexp/regexp-macro-assembler-tracer.cc +++ b/chromium/v8/src/regexp/regexp-macro-assembler-tracer.cc @@ -352,11 +352,11 @@ void RegExpMacroAssemblerTracer::CheckNotBackReference(int start_reg, } void RegExpMacroAssemblerTracer::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, Label* on_no_match) { - PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s, label[%08x]);\n", + int start_reg, bool read_backward, bool unicode, Label* on_no_match) { + PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s %s, label[%08x]);\n", start_reg, read_backward ? "backward" : "forward", - LabelToInt(on_no_match)); - assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward, + unicode ? "unicode" : "non-unicode", LabelToInt(on_no_match)); + assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward, unicode, on_no_match); } diff --git a/chromium/v8/src/regexp/regexp-macro-assembler-tracer.h b/chromium/v8/src/regexp/regexp-macro-assembler-tracer.h index b6ad63071f4..2a44146e738 100644 --- a/chromium/v8/src/regexp/regexp-macro-assembler-tracer.h +++ b/chromium/v8/src/regexp/regexp-macro-assembler-tracer.h @@ -33,6 +33,7 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler { void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match) override; void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward, + bool unicode, Label* on_no_match) override; void CheckNotCharacter(unsigned c, Label* on_not_equal) override; void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with, diff --git a/chromium/v8/src/regexp/regexp-macro-assembler.cc b/chromium/v8/src/regexp/regexp-macro-assembler.cc index 6cc9cae6e1d..cf4346309eb 100644 --- a/chromium/v8/src/regexp/regexp-macro-assembler.cc +++ b/chromium/v8/src/regexp/regexp-macro-assembler.cc @@ -9,6 +9,7 @@ #include "src/execution/pointer-authentication.h" #include "src/execution/simulator.h" #include "src/regexp/regexp-stack.h" +#include "src/regexp/special-case.h" #include "src/strings/unicode-inl.h" #ifdef V8_INTL_SUPPORT @@ -27,17 +28,46 @@ RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone) RegExpMacroAssembler::~RegExpMacroAssembler() = default; -int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1, - Address byte_offset2, - size_t byte_length, - Isolate* isolate) { +int RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(Address byte_offset1, + Address byte_offset2, + size_t byte_length, + Isolate* isolate) { +#ifdef V8_INTL_SUPPORT + // This function is not allowed to cause a garbage collection. + // A GC might move the calling generated code and invalidate the + // return address on the stack. + DisallowHeapAllocation no_gc; + DCHECK_EQ(0, byte_length % 2); + size_t length = byte_length / 2; + uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1); + uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2); + + for (size_t i = 0; i < length; i++) { + UChar32 c1 = RegExpCaseFolding::Canonicalize(substring1[i]); + UChar32 c2 = RegExpCaseFolding::Canonicalize(substring2[i]); + if (c1 != c2) { + return 0; + } + } + return 1; +#else + return CaseInsensitiveCompareUnicode(byte_offset1, byte_offset2, byte_length, + isolate); +#endif +} + +int RegExpMacroAssembler::CaseInsensitiveCompareUnicode(Address byte_offset1, + Address byte_offset2, + size_t byte_length, + Isolate* isolate) { // This function is not allowed to cause a garbage collection. // A GC might move the calling generated code and invalidate the // return address on the stack. + DisallowHeapAllocation no_gc; DCHECK_EQ(0, byte_length % 2); #ifdef V8_INTL_SUPPORT - int32_t length = (int32_t)(byte_length >> 1); + int32_t length = static_cast<int32_t>(byte_length >> 1); icu::UnicodeString uni_str_1(reinterpret_cast<const char16_t*>(byte_offset1), length); return uni_str_1.caseCompare(reinterpret_cast<const char16_t*>(byte_offset2), @@ -68,7 +98,6 @@ int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1, #endif // V8_INTL_SUPPORT } - void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset, Label* on_failure) { Label ok; diff --git a/chromium/v8/src/regexp/regexp-macro-assembler.h b/chromium/v8/src/regexp/regexp-macro-assembler.h index 289c2a979e6..52465610cb6 100644 --- a/chromium/v8/src/regexp/regexp-macro-assembler.h +++ b/chromium/v8/src/regexp/regexp-macro-assembler.h @@ -88,7 +88,7 @@ class RegExpMacroAssembler { virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match) = 0; virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, + bool read_backward, bool unicode, Label* on_no_match) = 0; // Check the current character for a match with a literal character. If we // fail to match then goto the on_failure label. End of input always @@ -165,11 +165,16 @@ class RegExpMacroAssembler { virtual void ClearRegisters(int reg_from, int reg_to) = 0; virtual void WriteStackPointerToRegister(int reg) = 0; - // Compares two-byte strings case insensitively. + // Compare two-byte strings case insensitively. // Called from generated RegExp code. - static int CaseInsensitiveCompareUC16(Address byte_offset1, - Address byte_offset2, - size_t byte_length, Isolate* isolate); + static int CaseInsensitiveCompareNonUnicode(Address byte_offset1, + Address byte_offset2, + size_t byte_length, + Isolate* isolate); + static int CaseInsensitiveCompareUnicode(Address byte_offset1, + Address byte_offset2, + size_t byte_length, + Isolate* isolate); // Check that we are not in the middle of a surrogate pair. void CheckNotInSurrogatePair(int cp_offset, Label* on_failure); diff --git a/chromium/v8/src/regexp/regexp-parser.cc b/chromium/v8/src/regexp/regexp-parser.cc index 3c1115414fb..7b87044ca65 100644 --- a/chromium/v8/src/regexp/regexp-parser.cc +++ b/chromium/v8/src/regexp/regexp-parser.cc @@ -1301,7 +1301,7 @@ bool LookupSpecialPropertyValueName(const char* name, return true; } -// Explicitly whitelist supported binary properties. The spec forbids supporting +// Explicitly allowlist supported binary properties. The spec forbids supporting // properties outside of this set to ensure interoperability. bool IsSupportedBinaryProperty(UProperty property) { switch (property) { @@ -1550,7 +1550,7 @@ bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { } while (d >= 0) { x = x * 16 + d; - if (x > max_value) { + if (x > static_cast<uc32>(max_value)) { return false; } Advance(); @@ -1789,34 +1789,54 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { #undef CHECK_FAILED - -bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, - FlatStringReader* input, JSRegExp::Flags flags, - RegExpCompileData* result) { +bool RegExpParser::Parse(RegExpCompileData* result, + const DisallowHeapAllocation&) { DCHECK(result != nullptr); - RegExpParser parser(input, flags, isolate, zone); - RegExpTree* tree = parser.ParsePattern(); - if (parser.failed()) { + RegExpTree* tree = ParsePattern(); + if (failed()) { DCHECK(tree == nullptr); - DCHECK(parser.error_ != RegExpError::kNone); - result->error = parser.error_; - result->error_pos = parser.error_pos_; + DCHECK(error_ != RegExpError::kNone); + result->error = error_; + result->error_pos = error_pos_; } else { DCHECK(tree != nullptr); - DCHECK(parser.error_ == RegExpError::kNone); + DCHECK(error_ == RegExpError::kNone); if (FLAG_trace_regexp_parser) { StdoutStream os; - tree->Print(os, zone); + tree->Print(os, zone()); os << "\n"; } result->tree = tree; - int capture_count = parser.captures_started(); - result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; - result->contains_anchor = parser.contains_anchor(); - result->capture_name_map = parser.CreateCaptureNameMap(); + int capture_count = captures_started(); + result->simple = tree->IsAtom() && simple() && capture_count == 0; + result->contains_anchor = contains_anchor(); result->capture_count = capture_count; } - return !parser.failed(); + return !failed(); +} + +bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, + FlatStringReader* input, JSRegExp::Flags flags, + RegExpCompileData* result) { + RegExpParser parser(input, flags, isolate, zone); + bool success; + { + DisallowHeapAllocation no_gc; + success = parser.Parse(result, no_gc); + } + if (success) { + result->capture_name_map = parser.CreateCaptureNameMap(); + } + return success; +} + +bool RegExpParser::VerifyRegExpSyntax(Isolate* isolate, Zone* zone, + FlatStringReader* input, + JSRegExp::Flags flags, + RegExpCompileData* result, + const DisallowHeapAllocation& no_gc) { + RegExpParser parser(input, flags, isolate, zone); + return parser.Parse(result, no_gc); } RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags) diff --git a/chromium/v8/src/regexp/regexp-parser.h b/chromium/v8/src/regexp/regexp-parser.h index aff1746bc53..bfb08208980 100644 --- a/chromium/v8/src/regexp/regexp-parser.h +++ b/chromium/v8/src/regexp/regexp-parser.h @@ -159,6 +159,13 @@ class V8_EXPORT_PRIVATE RegExpParser { static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input, JSRegExp::Flags flags, RegExpCompileData* result); + static bool VerifyRegExpSyntax(Isolate* isolate, Zone* zone, + FlatStringReader* input, JSRegExp::Flags flags, + RegExpCompileData* result, + const DisallowHeapAllocation& no_gc); + + private: + bool Parse(RegExpCompileData* result, const DisallowHeapAllocation&); RegExpTree* ParsePattern(); RegExpTree* ParseDisjunction(); diff --git a/chromium/v8/src/regexp/s390/regexp-macro-assembler-s390.cc b/chromium/v8/src/regexp/s390/regexp-macro-assembler-s390.cc index 2109b45314a..9ac4f755227 100644 --- a/chromium/v8/src/regexp/s390/regexp-macro-assembler-s390.cc +++ b/chromium/v8/src/regexp/s390/regexp-macro-assembler-s390.cc @@ -230,7 +230,7 @@ void RegExpMacroAssemblerS390::CheckGreedyLoop(Label* on_equal) { } void RegExpMacroAssemblerS390::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, Label* on_no_match) { + int start_reg, bool read_backward, bool unicode, Label* on_no_match) { Label fallthrough; __ LoadP(r2, register_location(start_reg)); // Index of start of // capture @@ -346,7 +346,10 @@ void RegExpMacroAssemblerS390::CheckNotBackReferenceIgnoreCase( { AllowExternalCallThatCantCauseGC scope(masm_); ExternalReference function = - ExternalReference::re_case_insensitive_compare_uc16(isolate()); + unicode ? ExternalReference::re_case_insensitive_compare_unicode( + isolate()) + : ExternalReference::re_case_insensitive_compare_non_unicode( + isolate()); __ CallCFunction(function, argument_count); } diff --git a/chromium/v8/src/regexp/s390/regexp-macro-assembler-s390.h b/chromium/v8/src/regexp/s390/regexp-macro-assembler-s390.h index 9ced67fe274..e4f88f51b9a 100644 --- a/chromium/v8/src/regexp/s390/regexp-macro-assembler-s390.h +++ b/chromium/v8/src/regexp/s390/regexp-macro-assembler-s390.h @@ -36,7 +36,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerS390 virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, + bool read_backward, bool unicode, Label* on_no_match); virtual void CheckNotCharacter(unsigned c, Label* on_not_equal); virtual void CheckNotCharacterAfterAnd(unsigned c, unsigned mask, diff --git a/chromium/v8/src/regexp/x64/regexp-macro-assembler-x64.cc b/chromium/v8/src/regexp/x64/regexp-macro-assembler-x64.cc index cf8eb6604c9..ef3e48428f0 100644 --- a/chromium/v8/src/regexp/x64/regexp-macro-assembler-x64.cc +++ b/chromium/v8/src/regexp/x64/regexp-macro-assembler-x64.cc @@ -215,7 +215,7 @@ void RegExpMacroAssemblerX64::CheckGreedyLoop(Label* on_equal) { } void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, Label* on_no_match) { + int start_reg, bool read_backward, bool unicode, Label* on_no_match) { Label fallthrough; ReadPositionFromRegister(rdx, start_reg); // Offset of start of capture ReadPositionFromRegister(rbx, start_reg + 1); // Offset of end of capture @@ -354,7 +354,10 @@ void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase( // linter. AllowExternalCallThatCantCauseGC scope(&masm_); ExternalReference compare = - ExternalReference::re_case_insensitive_compare_uc16(isolate()); + unicode ? ExternalReference::re_case_insensitive_compare_unicode( + isolate()) + : ExternalReference::re_case_insensitive_compare_non_unicode( + isolate()); __ CallCFunction(compare, num_arguments); } diff --git a/chromium/v8/src/regexp/x64/regexp-macro-assembler-x64.h b/chromium/v8/src/regexp/x64/regexp-macro-assembler-x64.h index 551e9bc6ec7..ea4d45edba8 100644 --- a/chromium/v8/src/regexp/x64/regexp-macro-assembler-x64.h +++ b/chromium/v8/src/regexp/x64/regexp-macro-assembler-x64.h @@ -37,6 +37,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerX64 void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match) override; void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward, + bool unicode, Label* on_no_match) override; void CheckNotCharacter(uint32_t c, Label* on_not_equal) override; void CheckNotCharacterAfterAnd(uint32_t c, uint32_t mask, |