diff options
author | Michaël Zasso <targos@protonmail.com> | 2020-05-05 09:19:02 +0200 |
---|---|---|
committer | Michaël Zasso <targos@protonmail.com> | 2020-05-12 16:12:13 +0200 |
commit | 1d6adf7432defeb39b751a19c68335e8afb0d8ee (patch) | |
tree | 7ab67931110b8d9db770d774c7a6d0d14c976c15 /deps/v8/src/regexp | |
parent | aee36a04475a20c13663d1037aa6f175ff368bc7 (diff) | |
download | node-new-1d6adf7432defeb39b751a19c68335e8afb0d8ee.tar.gz |
deps: update V8 to 8.3.110.9
PR-URL: https://github.com/nodejs/node/pull/32831
Reviewed-By: Anna Henningsen <anna@addaleax.net>
Reviewed-By: Michaël Zasso <targos@protonmail.com>
Reviewed-By: Jiawen Geng <technicalcute@gmail.com>
Reviewed-By: Colin Ihrig <cjihrig@gmail.com>
Diffstat (limited to 'deps/v8/src/regexp')
39 files changed, 583 insertions, 584 deletions
diff --git a/deps/v8/src/regexp/arm/regexp-macro-assembler-arm.cc b/deps/v8/src/regexp/arm/regexp-macro-assembler-arm.cc index 8f9da563a9..03dac337e0 100644 --- a/deps/v8/src/regexp/arm/regexp-macro-assembler-arm.cc +++ b/deps/v8/src/regexp/arm/regexp-macro-assembler-arm.cc @@ -110,6 +110,8 @@ RegExpMacroAssemblerARM::RegExpMacroAssemblerARM(Isolate* isolate, Zone* zone, success_label_(), backtrack_label_(), exit_label_() { + masm_->set_root_array_available(false); + DCHECK_EQ(0, registers_to_save % 2); __ jmp(&entry_label_); // We'll write the entry code later. __ bind(&start_label_); // And then continue from here. @@ -221,9 +223,8 @@ void RegExpMacroAssemblerARM::CheckGreedyLoop(Label* on_equal) { BranchOrBacktrack(eq, on_equal); } - void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, bool unicode, Label* on_no_match) { + int start_reg, bool read_backward, Label* on_no_match) { Label fallthrough; __ ldr(r0, register_location(start_reg)); // Index of start of capture __ ldr(r1, register_location(start_reg + 1)); // Index of end of capture @@ -315,7 +316,7 @@ void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase( // r0: Address byte_offset1 - Address captured substring's start. // r1: Address byte_offset2 - Address of current character position. // r2: size_t byte_length - length of capture in bytes(!) - // r3: Isolate* isolate or 0 if unicode flag. + // r3: Isolate* isolate. // Address of start of capture. __ add(r0, r0, Operand(end_of_input_address())); @@ -329,14 +330,7 @@ void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase( __ sub(r1, r1, r4); } // Isolate. -#ifdef V8_INTL_SUPPORT - if (unicode) { - __ mov(r3, Operand(0)); - } else // NOLINT -#endif // V8_INTL_SUPPORT - { - __ mov(r3, Operand(ExternalReference::isolate_address(isolate()))); - } + __ mov(r3, Operand(ExternalReference::isolate_address(isolate()))); { AllowExternalCallThatCantCauseGC scope(masm_); @@ -360,7 +354,6 @@ void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase( __ bind(&fallthrough); } - void RegExpMacroAssemblerARM::CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match) { diff --git a/deps/v8/src/regexp/arm/regexp-macro-assembler-arm.h b/deps/v8/src/regexp/arm/regexp-macro-assembler-arm.h index 6320913f4c..22628fb760 100644 --- a/deps/v8/src/regexp/arm/regexp-macro-assembler-arm.h +++ b/deps/v8/src/regexp/arm/regexp-macro-assembler-arm.h @@ -37,7 +37,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerARM virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, bool unicode, + bool read_backward, Label* on_no_match); virtual void CheckNotCharacter(unsigned c, Label* on_not_equal); virtual void CheckNotCharacterAfterAnd(unsigned c, diff --git a/deps/v8/src/regexp/arm64/regexp-macro-assembler-arm64.cc b/deps/v8/src/regexp/arm64/regexp-macro-assembler-arm64.cc index 56658819b1..43a6bdf912 100644 --- a/deps/v8/src/regexp/arm64/regexp-macro-assembler-arm64.cc +++ b/deps/v8/src/regexp/arm64/regexp-macro-assembler-arm64.cc @@ -120,10 +120,14 @@ RegExpMacroAssemblerARM64::RegExpMacroAssemblerARM64(Isolate* isolate, success_label_(), backtrack_label_(), exit_label_() { + masm_->set_root_array_available(false); + DCHECK_EQ(0, registers_to_save % 2); // We can cache at most 16 W registers in x0-x7. STATIC_ASSERT(kNumCachedRegisters <= 16); STATIC_ASSERT((kNumCachedRegisters % 2) == 0); + __ CallTarget(); + __ B(&entry_label_); // We'll write the entry code later. __ Bind(&start_label_); // And then continue from here. } @@ -212,6 +216,9 @@ void RegExpMacroAssemblerARM64::Bind(Label* label) { __ Bind(label); } +void RegExpMacroAssemblerARM64::BindJumpTarget(Label* label) { + __ BindJumpTarget(label); +} void RegExpMacroAssemblerARM64::CheckCharacter(uint32_t c, Label* on_equal) { CompareAndBranchOrBacktrack(current_character(), c, eq, on_equal); @@ -286,9 +293,8 @@ void RegExpMacroAssemblerARM64::CheckGreedyLoop(Label* on_equal) { BranchOrBacktrack(eq, on_equal); } - void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, bool unicode, Label* on_no_match) { + int start_reg, bool read_backward, Label* on_no_match) { Label fallthrough; Register capture_start_offset = w10; @@ -402,7 +408,7 @@ void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase( // x0: Address byte_offset1 - Address captured substring's start. // x1: Address byte_offset2 - Address of current character position. // w2: size_t byte_length - length of capture in bytes(!) - // x3: Isolate* isolate or 0 if unicode flag + // x3: Isolate* isolate. // Address of start of capture. __ Add(x0, input_end(), Operand(capture_start_offset, SXTW)); @@ -414,14 +420,7 @@ void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase( __ Sub(x1, x1, Operand(capture_length, SXTW)); } // Isolate. -#ifdef V8_INTL_SUPPORT - if (unicode) { - __ Mov(x3, Operand(0)); - } else // NOLINT -#endif // V8_INTL_SUPPORT - { - __ Mov(x3, ExternalReference::isolate_address(isolate())); - } + __ Mov(x3, ExternalReference::isolate_address(isolate())); { AllowExternalCallThatCantCauseGC scope(masm_); @@ -737,10 +736,11 @@ Handle<HeapObject> RegExpMacroAssemblerARM64::GetCode(Handle<String> source) { CPURegList argument_registers(x0, x5, x6, x7); CPURegList registers_to_retain = kCalleeSaved; - DCHECK_EQ(11, kCalleeSaved.Count()); + registers_to_retain.Combine(fp); registers_to_retain.Combine(lr); - __ PushCPURegList(registers_to_retain); + DCHECK(registers_to_retain.IncludesAliasOf(lr)); + __ PushCPURegList<TurboAssembler::kSignLR>(registers_to_retain); __ PushCPURegList(argument_registers); // Set frame pointer in place. @@ -1035,7 +1035,7 @@ Handle<HeapObject> RegExpMacroAssemblerARM64::GetCode(Handle<String> source) { __ Mov(sp, fp); // Restore registers. - __ PopCPURegList(registers_to_retain); + __ PopCPURegList<TurboAssembler::kAuthLR>(registers_to_retain); __ Ret(); @@ -1585,14 +1585,14 @@ void RegExpMacroAssemblerARM64::CallIf(Label* to, Condition condition) { void RegExpMacroAssemblerARM64::RestoreLinkRegister() { - __ Pop(lr, xzr); + __ Pop<TurboAssembler::kAuthLR>(padreg, lr); __ Add(lr, lr, Operand(masm_->CodeObject())); } void RegExpMacroAssemblerARM64::SaveLinkRegister() { __ Sub(lr, lr, Operand(masm_->CodeObject())); - __ Push(xzr, lr); + __ Push<TurboAssembler::kSignLR>(lr, padreg); } diff --git a/deps/v8/src/regexp/arm64/regexp-macro-assembler-arm64.h b/deps/v8/src/regexp/arm64/regexp-macro-assembler-arm64.h index cee9e2c97e..91b5e90bf5 100644 --- a/deps/v8/src/regexp/arm64/regexp-macro-assembler-arm64.h +++ b/deps/v8/src/regexp/arm64/regexp-macro-assembler-arm64.h @@ -42,7 +42,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerARM64 virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, bool unicode, + bool read_backward, Label* on_no_match); virtual void CheckNotCharacter(unsigned c, Label* on_not_equal); virtual void CheckNotCharacterAfterAnd(unsigned c, @@ -65,6 +65,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerARM64 virtual void CheckPosition(int cp_offset, Label* on_outside_input); virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match); + virtual void BindJumpTarget(Label* label = nullptr); virtual void Fail(); virtual Handle<HeapObject> GetCode(Handle<String> source); virtual void GoTo(Label* label); diff --git a/deps/v8/src/regexp/gen-regexp-special-case.cc b/deps/v8/src/regexp/gen-regexp-special-case.cc index 8aace6ab88..9606c5d70d 100644 --- a/deps/v8/src/regexp/gen-regexp-special-case.cc +++ b/deps/v8/src/regexp/gen-regexp-special-case.cc @@ -1,4 +1,4 @@ -// Copyright 2019 the V8 project authors. All rights reserved. +// Copyright 2020 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -7,19 +7,19 @@ #include <iostream> #include <sstream> -#include "src/base/logging.h" -#include "unicode/uchar.h" -#include "unicode/uniset.h" +#include "src/regexp/special-case.h" namespace v8 { namespace internal { -// The following code generates BuildSpecialAddSet() and BuildIgnoreSet() -// functions into "src/regexp/special-case.cc". -// See more details in http://shorturl.at/adfO5 -void PrintSet(std::ofstream& out, const char* func_name, +static const uc32 kSurrogateStart = 0xd800; +static const uc32 kSurrogateEnd = 0xdfff; +static const uc32 kNonBmpStart = 0x10000; + +// The following code generates "src/regexp/special-case.cc". +void PrintSet(std::ofstream& out, const char* name, const icu::UnicodeSet& set) { - out << "icu::UnicodeSet " << func_name << "() {\n" + out << "icu::UnicodeSet Build" << name << "() {\n" << " icu::UnicodeSet set;\n"; for (int32_t i = 0; i < set.getRangeCount(); i++) { if (set.getRangeStart(i) == set.getRangeEnd(i)) { @@ -31,73 +31,113 @@ void PrintSet(std::ofstream& out, const char* func_name, } out << " set.freeze();\n" << " return set;\n" - << "}\n"; + << "}\n\n"; + + out << "struct " << name << "Data {\n" + << " " << name << "Data() : set(Build" << name << "()) {}\n" + << " const icu::UnicodeSet set;\n" + << "};\n\n"; + + out << "//static\n" + << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n" + << " static base::LazyInstance<" << name << "Data>::type set =\n" + << " LAZY_INSTANCE_INITIALIZER;\n" + << " return set.Pointer()->set;\n" + << "}\n\n"; } void PrintSpecial(std::ofstream& out) { icu::UnicodeSet current; - icu::UnicodeSet processed(0xd800, 0xdbff); // Ignore surrogate range. icu::UnicodeSet special_add; icu::UnicodeSet ignore; UErrorCode status = U_ZERO_ERROR; icu::UnicodeSet upper("[\\p{Lu}]", status); CHECK(U_SUCCESS(status)); - // Iterate through all chars in BMP except ASCII and Surrogate. - for (UChar32 i = 0x80; i < 0x010000; i++) { - // Ignore those characters which is already processed. - if (!processed.contains(i)) { - current.set(i, i); - current.closeOver(USET_CASE_INSENSITIVE); - // Remember we already processed current. - processed.addAll(current); - - // All uppercase characters in current. - icu::UnicodeSet keep_upper(current); - keep_upper.retainAll(upper); - - // Check if we have more than one uppercase character in current. - // If there are more than one uppercase character, then it is a special - // set which need to be added into either "Special Add" set or "Ignore" - // set. - int32_t number_of_upper = 0; - for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) { - number_of_upper += - keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1; + // Iterate through all chars in BMP except surrogates. + for (UChar32 i = 0; i < kNonBmpStart; i++) { + if (i >= kSurrogateStart && i <= kSurrogateEnd) { + continue; // Ignore surrogate range + } + current.set(i, i); + current.closeOver(USET_CASE_INSENSITIVE); + + // Check to see if all characters in the case-folding equivalence + // class as defined by UnicodeSet::closeOver all map to the same + // canonical value. + UChar32 canonical = RegExpCaseFolding::Canonicalize(i); + bool class_has_matching_canonical_char = false; + bool class_has_non_matching_canonical_char = false; + for (int32_t j = 0; j < current.getRangeCount(); j++) { + for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j); + c++) { + if (c == i) { + continue; + } + UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c); + if (canonical == other_canonical) { + class_has_matching_canonical_char = true; + } else { + class_has_non_matching_canonical_char = true; + } + } + } + // If any other character in i's equivalence class has a + // different canonical value, then i needs special handling. If + // no other character shares a canonical value with i, we can + // ignore i when adding alternatives for case-independent + // comparison. If at least one other character shares a + // canonical value, then i needs special handling. + if (class_has_non_matching_canonical_char) { + if (class_has_matching_canonical_char) { + special_add.add(i); + } else { + ignore.add(i); } - if (number_of_upper > 1) { - // Add all non uppercase characters (could be Ll or Mn) to special add - // set. - current.removeAll(upper); - special_add.addAll(current); - - // Add the uppercase characters of non uppercase character to - // special add set. - CHECK_GT(current.getRangeCount(), 0); - UChar32 main_upper = u_toupper(current.getRangeStart(0)); - special_add.add(main_upper); - - // Add all uppercase except the main upper to ignore set. - keep_upper.remove(main_upper); - ignore.addAll(keep_upper); + } + } + + // Verify that no Unicode equivalence class contains two non-trivial + // JS equivalence classes. Every character in SpecialAddSet has the + // same canonical value as every other non-IgnoreSet character in + // its Unicode equivalence class. Therefore, if we call closeOver on + // a set containing no IgnoreSet characters, the only characters + // that must be removed from the result are in IgnoreSet. This fact + // is used in CharacterRange::AddCaseEquivalents. + for (int32_t i = 0; i < special_add.getRangeCount(); i++) { + for (UChar32 c = special_add.getRangeStart(i); + c <= special_add.getRangeEnd(i); c++) { + UChar32 canonical = RegExpCaseFolding::Canonicalize(c); + current.set(c, c); + current.closeOver(USET_CASE_INSENSITIVE); + current.removeAll(ignore); + for (int32_t j = 0; j < current.getRangeCount(); j++) { + for (UChar32 c2 = current.getRangeStart(j); + c2 <= current.getRangeEnd(j); c2++) { + CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2)); + } } } } - // Remove any ASCII - special_add.remove(0x0000, 0x007f); - PrintSet(out, "BuildIgnoreSet", ignore); - PrintSet(out, "BuildSpecialAddSet", special_add); + PrintSet(out, "IgnoreSet", ignore); + PrintSet(out, "SpecialAddSet", special_add); } void WriteHeader(const char* header_filename) { std::ofstream out(header_filename); out << std::hex << std::setfill('0') << std::setw(4); - - out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n" - << "// The following functions are used to build icu::UnicodeSet\n" - << "// for specical cases different between Unicode and ECMA262.\n" + out << "// Copyright 2020 the V8 project authors. All rights reserved.\n" + << "// Use of this source code is governed by a BSD-style license that\n" + << "// can be found in the LICENSE file.\n\n" + << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n" + << "// The following functions are used to build UnicodeSets\n" + << "// for special cases where the case-folding algorithm used by\n" + << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n" + << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n" + << "// Semantics: Canonicalize) step 3.\n\n" << "#ifdef V8_INTL_SUPPORT\n" + << "#include \"src/base/lazy-instance.h\"\n\n" << "#include \"src/regexp/special-case.h\"\n\n" << "#include \"unicode/uniset.h\"\n" << "namespace v8 {\n" diff --git a/deps/v8/src/regexp/ia32/regexp-macro-assembler-ia32.cc b/deps/v8/src/regexp/ia32/regexp-macro-assembler-ia32.cc index f9015287f9..7f6bd5e296 100644 --- a/deps/v8/src/regexp/ia32/regexp-macro-assembler-ia32.cc +++ b/deps/v8/src/regexp/ia32/regexp-macro-assembler-ia32.cc @@ -205,9 +205,8 @@ void RegExpMacroAssemblerIA32::CheckGreedyLoop(Label* on_equal) { __ bind(&fallthrough); } - void RegExpMacroAssemblerIA32::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, bool unicode, Label* on_no_match) { + int start_reg, bool read_backward, Label* on_no_match) { Label fallthrough; __ mov(edx, register_location(start_reg)); // Index of start of capture __ mov(ebx, register_location(start_reg + 1)); // Index of end of capture @@ -314,18 +313,11 @@ void RegExpMacroAssemblerIA32::CheckNotBackReferenceIgnoreCase( // Address byte_offset1 - Address captured substring's start. // Address byte_offset2 - Address of current character position. // size_t byte_length - length of capture in bytes(!) -// Isolate* isolate or 0 if unicode flag. + // Isolate* isolate. // Set isolate. -#ifdef V8_INTL_SUPPORT - if (unicode) { - __ mov(Operand(esp, 3 * kSystemPointerSize), Immediate(0)); - } else // NOLINT -#endif // V8_INTL_SUPPORT - { - __ mov(Operand(esp, 3 * kSystemPointerSize), - Immediate(ExternalReference::isolate_address(isolate()))); - } + __ mov(Operand(esp, 3 * kSystemPointerSize), + Immediate(ExternalReference::isolate_address(isolate()))); // Set byte_length. __ mov(Operand(esp, 2 * kSystemPointerSize), ebx); // Set byte_offset2. @@ -366,7 +358,6 @@ void RegExpMacroAssemblerIA32::CheckNotBackReferenceIgnoreCase( __ bind(&fallthrough); } - void RegExpMacroAssemblerIA32::CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match) { diff --git a/deps/v8/src/regexp/ia32/regexp-macro-assembler-ia32.h b/deps/v8/src/regexp/ia32/regexp-macro-assembler-ia32.h index b2c6fab7b3..f68dd0b1b7 100644 --- a/deps/v8/src/regexp/ia32/regexp-macro-assembler-ia32.h +++ b/deps/v8/src/regexp/ia32/regexp-macro-assembler-ia32.h @@ -37,7 +37,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerIA32 virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, bool unicode, + bool read_backward, Label* on_no_match); virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal); virtual void CheckNotCharacterAfterAnd(uint32_t c, diff --git a/deps/v8/src/regexp/mips/regexp-macro-assembler-mips.cc b/deps/v8/src/regexp/mips/regexp-macro-assembler-mips.cc index 1e7839c219..e3f2ea6292 100644 --- a/deps/v8/src/regexp/mips/regexp-macro-assembler-mips.cc +++ b/deps/v8/src/regexp/mips/regexp-macro-assembler-mips.cc @@ -106,6 +106,8 @@ RegExpMacroAssemblerMIPS::RegExpMacroAssemblerMIPS(Isolate* isolate, Zone* zone, backtrack_label_(), exit_label_(), internal_failure_label_() { + masm_->set_root_array_available(false); + DCHECK_EQ(0, registers_to_save % 2); __ jmp(&entry_label_); // We'll write the entry code later. // If the code gets too big or corrupted, an internal exception will be @@ -223,9 +225,8 @@ void RegExpMacroAssemblerMIPS::CheckGreedyLoop(Label* on_equal) { BranchOrBacktrack(on_equal, eq, current_input_offset(), Operand(a0)); } - void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, bool unicode, Label* on_no_match) { + int start_reg, bool read_backward, Label* on_no_match) { Label fallthrough; __ lw(a0, register_location(start_reg)); // Index of start of capture. __ lw(a1, register_location(start_reg + 1)); // Index of end of capture. @@ -320,7 +321,7 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase( // a0: Address byte_offset1 - Address captured substring's start. // a1: Address byte_offset2 - Address of current character position. // a2: size_t byte_length - length of capture in bytes(!). - // a3: Isolate* isolate or 0 if unicode flag. + // a3: Isolate* isolate. // Address of start of capture. __ Addu(a0, a0, Operand(end_of_input_address())); @@ -334,14 +335,7 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase( __ Subu(a1, a1, Operand(s3)); } // Isolate. -#ifdef V8_INTL_SUPPORT - if (unicode) { - __ mov(a3, zero_reg); - } else // NOLINT -#endif // V8_INTL_SUPPORT - { - __ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate()))); - } + __ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate()))); { AllowExternalCallThatCantCauseGC scope(masm_); @@ -368,7 +362,6 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase( __ bind(&fallthrough); } - void RegExpMacroAssemblerMIPS::CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match) { diff --git a/deps/v8/src/regexp/mips/regexp-macro-assembler-mips.h b/deps/v8/src/regexp/mips/regexp-macro-assembler-mips.h index 9281b0174d..5733bbe046 100644 --- a/deps/v8/src/regexp/mips/regexp-macro-assembler-mips.h +++ b/deps/v8/src/regexp/mips/regexp-macro-assembler-mips.h @@ -37,7 +37,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerMIPS virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, bool unicode, + bool read_backward, Label* on_no_match); virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal); virtual void CheckNotCharacterAfterAnd(uint32_t c, diff --git a/deps/v8/src/regexp/mips64/regexp-macro-assembler-mips64.cc b/deps/v8/src/regexp/mips64/regexp-macro-assembler-mips64.cc index 3dd1548685..fc3cad8b0e 100644 --- a/deps/v8/src/regexp/mips64/regexp-macro-assembler-mips64.cc +++ b/deps/v8/src/regexp/mips64/regexp-macro-assembler-mips64.cc @@ -142,6 +142,8 @@ RegExpMacroAssemblerMIPS::RegExpMacroAssemblerMIPS(Isolate* isolate, Zone* zone, backtrack_label_(), exit_label_(), internal_failure_label_() { + masm_->set_root_array_available(false); + DCHECK_EQ(0, registers_to_save % 2); __ jmp(&entry_label_); // We'll write the entry code later. // If the code gets too big or corrupted, an internal exception will be @@ -259,9 +261,8 @@ void RegExpMacroAssemblerMIPS::CheckGreedyLoop(Label* on_equal) { BranchOrBacktrack(on_equal, eq, current_input_offset(), Operand(a0)); } - void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, bool unicode, Label* on_no_match) { + int start_reg, bool read_backward, Label* on_no_match) { Label fallthrough; __ Ld(a0, register_location(start_reg)); // Index of start of capture. __ Ld(a1, register_location(start_reg + 1)); // Index of end of capture. @@ -356,7 +357,7 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase( // a0: Address byte_offset1 - Address captured substring's start. // a1: Address byte_offset2 - Address of current character position. // a2: size_t byte_length - length of capture in bytes(!). - // a3: Isolate* isolate or 0 if unicode flag. + // a3: Isolate* isolate. // Address of start of capture. __ Daddu(a0, a0, Operand(end_of_input_address())); @@ -370,14 +371,7 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase( __ Dsubu(a1, a1, Operand(s3)); } // Isolate. -#ifdef V8_INTL_SUPPORT - if (unicode) { - __ mov(a3, zero_reg); - } else // NOLINT -#endif // V8_INTL_SUPPORT - { - __ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate()))); - } + __ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate()))); { AllowExternalCallThatCantCauseGC scope(masm_); @@ -404,7 +398,6 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase( __ bind(&fallthrough); } - void RegExpMacroAssemblerMIPS::CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match) { diff --git a/deps/v8/src/regexp/mips64/regexp-macro-assembler-mips64.h b/deps/v8/src/regexp/mips64/regexp-macro-assembler-mips64.h index bc7f83e6e9..b267297c24 100644 --- a/deps/v8/src/regexp/mips64/regexp-macro-assembler-mips64.h +++ b/deps/v8/src/regexp/mips64/regexp-macro-assembler-mips64.h @@ -37,7 +37,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerMIPS virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, bool unicode, + bool read_backward, Label* on_no_match); virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal); virtual void CheckNotCharacterAfterAnd(uint32_t c, diff --git a/deps/v8/src/regexp/ppc/regexp-macro-assembler-ppc.cc b/deps/v8/src/regexp/ppc/regexp-macro-assembler-ppc.cc index 50bf71e6d5..376103324a 100644 --- a/deps/v8/src/regexp/ppc/regexp-macro-assembler-ppc.cc +++ b/deps/v8/src/regexp/ppc/regexp-macro-assembler-ppc.cc @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -#if V8_TARGET_ARCH_PPC +#if V8_TARGET_ARCH_PPC || V8_TARGET_ARCH_PPC64 #include "src/regexp/ppc/regexp-macro-assembler-ppc.h" @@ -111,6 +111,8 @@ RegExpMacroAssemblerPPC::RegExpMacroAssemblerPPC(Isolate* isolate, Zone* zone, backtrack_label_(), exit_label_(), internal_failure_label_() { + masm_->set_root_array_available(false); + DCHECK_EQ(0, registers_to_save % 2); @@ -123,7 +125,6 @@ RegExpMacroAssemblerPPC::RegExpMacroAssemblerPPC(Isolate* isolate, Zone* zone, __ bind(&start_label_); // And then continue from here. } - RegExpMacroAssemblerPPC::~RegExpMacroAssemblerPPC() { delete masm_; // Unuse labels in case we throw away the assembler without calling GetCode. @@ -241,7 +242,7 @@ void RegExpMacroAssemblerPPC::CheckGreedyLoop(Label* on_equal) { } void RegExpMacroAssemblerPPC::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, bool unicode, Label* on_no_match) { + int start_reg, bool read_backward, Label* on_no_match) { Label fallthrough; __ LoadP(r3, register_location(start_reg), r0); // Index of start of capture __ LoadP(r4, register_location(start_reg + 1), r0); // Index of end @@ -336,7 +337,7 @@ void RegExpMacroAssemblerPPC::CheckNotBackReferenceIgnoreCase( // r3: Address byte_offset1 - Address captured substring's start. // r4: Address byte_offset2 - Address of current character position. // r5: size_t byte_length - length of capture in bytes(!) - // r6: Isolate* isolate or 0 if unicode flag. + // r6: Isolate* isolate. // Address of start of capture. __ add(r3, r3, end_of_input_address()); @@ -350,14 +351,7 @@ void RegExpMacroAssemblerPPC::CheckNotBackReferenceIgnoreCase( __ sub(r4, r4, r25); } // Isolate. -#ifdef V8_INTL_SUPPORT - if (unicode) { - __ li(r6, Operand::Zero()); - } else // NOLINT -#endif // V8_INTL_SUPPORT - { - __ mov(r6, Operand(ExternalReference::isolate_address(isolate()))); - } + __ mov(r6, Operand(ExternalReference::isolate_address(isolate()))); { AllowExternalCallThatCantCauseGC scope(masm_); @@ -381,7 +375,6 @@ void RegExpMacroAssemblerPPC::CheckNotBackReferenceIgnoreCase( __ bind(&fallthrough); } - void RegExpMacroAssemblerPPC::CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match) { @@ -1371,4 +1364,4 @@ void RegExpMacroAssemblerPPC::LoadCurrentCharacterUnchecked(int cp_offset, } // namespace internal } // namespace v8 -#endif // V8_TARGET_ARCH_PPC +#endif // V8_TARGET_ARCH_PPC || V8_TARGET_ARCH_PPC64 diff --git a/deps/v8/src/regexp/ppc/regexp-macro-assembler-ppc.h b/deps/v8/src/regexp/ppc/regexp-macro-assembler-ppc.h index c726a5f0d7..3e64f139a8 100644 --- a/deps/v8/src/regexp/ppc/regexp-macro-assembler-ppc.h +++ b/deps/v8/src/regexp/ppc/regexp-macro-assembler-ppc.h @@ -36,7 +36,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerPPC virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, bool unicode, + bool read_backward, Label* on_no_match); virtual void CheckNotCharacter(unsigned c, Label* on_not_equal); virtual void CheckNotCharacterAfterAnd(unsigned c, unsigned mask, diff --git a/deps/v8/src/regexp/regexp-ast.h b/deps/v8/src/regexp/regexp-ast.h index 3de29512ea..a9106d3d30 100644 --- a/deps/v8/src/regexp/regexp-ast.h +++ b/deps/v8/src/regexp/regexp-ast.h @@ -463,7 +463,11 @@ class RegExpQuantifier final : public RegExpTree { class RegExpCapture final : public RegExpTree { public: explicit RegExpCapture(int index) - : body_(nullptr), index_(index), name_(nullptr) {} + : body_(nullptr), + index_(index), + min_match_(0), + max_match_(0), + name_(nullptr) {} void* Accept(RegExpVisitor* visitor, void* data) override; RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; static RegExpNode* ToNode(RegExpTree* body, int index, @@ -473,10 +477,14 @@ class RegExpCapture final : public RegExpTree { bool IsAnchoredAtEnd() override; Interval CaptureRegisters() override; bool IsCapture() override; - int min_match() override { return body_->min_match(); } - int max_match() override { return body_->max_match(); } + int min_match() override { return min_match_; } + int max_match() override { return max_match_; } RegExpTree* body() { return body_; } - void set_body(RegExpTree* body) { body_ = body; } + void set_body(RegExpTree* body) { + body_ = body; + min_match_ = body->min_match(); + max_match_ = body->max_match(); + } int index() const { return index_; } const ZoneVector<uc16>* name() const { return name_; } void set_name(const ZoneVector<uc16>* name) { name_ = name; } @@ -486,12 +494,17 @@ class RegExpCapture final : public RegExpTree { private: RegExpTree* body_; int index_; + int min_match_; + int max_match_; const ZoneVector<uc16>* name_; }; class RegExpGroup final : public RegExpTree { public: - explicit RegExpGroup(RegExpTree* body) : body_(body) {} + explicit RegExpGroup(RegExpTree* body) + : body_(body), + min_match_(body->min_match()), + max_match_(body->max_match()) {} void* Accept(RegExpVisitor* visitor, void* data) override; RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override { @@ -501,13 +514,15 @@ class RegExpGroup final : public RegExpTree { bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); } bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); } bool IsGroup() override; - int min_match() override { return body_->min_match(); } - int max_match() override { return body_->max_match(); } + int min_match() override { return min_match_; } + int max_match() override { return max_match_; } Interval CaptureRegisters() override { return body_->CaptureRegisters(); } RegExpTree* body() { return body_; } private: RegExpTree* body_; + int min_match_; + int max_match_; }; class RegExpLookaround final : public RegExpTree { diff --git a/deps/v8/src/regexp/regexp-bytecode-generator.cc b/deps/v8/src/regexp/regexp-bytecode-generator.cc index 0dcc288d3c..e82b67b530 100644 --- a/deps/v8/src/regexp/regexp-bytecode-generator.cc +++ b/deps/v8/src/regexp/regexp-bytecode-generator.cc @@ -329,13 +329,11 @@ void RegExpBytecodeGenerator::CheckNotBackReference(int start_reg, } void RegExpBytecodeGenerator::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, bool unicode, Label* on_not_equal) { + int start_reg, bool read_backward, Label* on_not_equal) { DCHECK_LE(0, start_reg); DCHECK_GE(kMaxRegister, start_reg); - Emit(read_backward ? (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD - : BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) - : (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE - : BC_CHECK_NOT_BACK_REF_NO_CASE), + Emit(read_backward ? BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD + : BC_CHECK_NOT_BACK_REF_NO_CASE, start_reg); EmitOrLink(on_not_equal); } diff --git a/deps/v8/src/regexp/regexp-bytecode-generator.h b/deps/v8/src/regexp/regexp-bytecode-generator.h index dfcc2ca5f8..85073cc99d 100644 --- a/deps/v8/src/regexp/regexp-bytecode-generator.h +++ b/deps/v8/src/regexp/regexp-bytecode-generator.h @@ -69,7 +69,7 @@ class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler { virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, bool unicode, + bool read_backward, Label* on_no_match); virtual void IfRegisterLT(int register_index, int comparand, Label* if_lt); virtual void IfRegisterGE(int register_index, int comparand, Label* if_ge); diff --git a/deps/v8/src/regexp/regexp-bytecode-peephole.cc b/deps/v8/src/regexp/regexp-bytecode-peephole.cc index 8f1f1d95a9..f0957f0779 100644 --- a/deps/v8/src/regexp/regexp-bytecode-peephole.cc +++ b/deps/v8/src/regexp/regexp-bytecode-peephole.cc @@ -436,7 +436,6 @@ BytecodeArgumentMapping BytecodeSequenceNode::ArgumentMapping( size_t index) const { DCHECK(IsSequence()); DCHECK(argument_mapping_ != nullptr); - DCHECK_GE(index, 0); DCHECK_LT(index, argument_mapping_->size()); return argument_mapping_->at(index); diff --git a/deps/v8/src/regexp/regexp-bytecodes.h b/deps/v8/src/regexp/regexp-bytecodes.h index e25945d0a0..1664a476d2 100644 --- a/deps/v8/src/regexp/regexp-bytecodes.h +++ b/deps/v8/src/regexp/regexp-bytecodes.h @@ -101,12 +101,12 @@ STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK); V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \ V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \ V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \ - V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \ - V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \ - V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) \ + V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \ + V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \ + V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) /* UNUSED */ \ V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \ V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */ \ - V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) \ + V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) /* UNUSED */ \ V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */ \ V(CHECK_REGISTER_LT, 44, 12) /* bc8 reg_idx24 value32 addr32 */ \ V(CHECK_REGISTER_GE, 45, 12) /* bc8 reg_idx24 value32 addr32 */ \ diff --git a/deps/v8/src/regexp/regexp-compiler-tonode.cc b/deps/v8/src/regexp/regexp-compiler-tonode.cc index 2d86d3ea9e..40ecee0f91 100644 --- a/deps/v8/src/regexp/regexp-compiler-tonode.cc +++ b/deps/v8/src/regexp/regexp-compiler-tonode.cc @@ -1140,39 +1140,6 @@ Vector<const int> CharacterRange::GetWordBounds() { return Vector<const int>(kWordRanges, kWordRangeCount - 1); } -#ifdef V8_INTL_SUPPORT -struct IgnoreSet { - IgnoreSet() : set(BuildIgnoreSet()) {} - const icu::UnicodeSet set; -}; - -struct SpecialAddSet { - SpecialAddSet() : set(BuildSpecialAddSet()) {} - const icu::UnicodeSet set; -}; - -icu::UnicodeSet BuildAsciiAToZSet() { - icu::UnicodeSet set('a', 'z'); - set.add('A', 'Z'); - set.freeze(); - return set; -} - -struct AsciiAToZSet { - AsciiAToZSet() : set(BuildAsciiAToZSet()) {} - const icu::UnicodeSet set; -}; - -static base::LazyInstance<IgnoreSet>::type ignore_set = - LAZY_INSTANCE_INITIALIZER; - -static base::LazyInstance<SpecialAddSet>::type special_add_set = - LAZY_INSTANCE_INITIALIZER; - -static base::LazyInstance<AsciiAToZSet>::type ascii_a_to_z_set = - LAZY_INSTANCE_INITIALIZER; -#endif // V8_INTL_SUPPORT - // static void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges, @@ -1195,75 +1162,22 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, others.add(from, to); } - // Set of characters already added to ranges that do not need to be added - // again. + // Compute the set of additional characters that should be added, + // using UnicodeSet::closeOver. ECMA 262 defines slightly different + // case-folding rules than Unicode, so some characters that are + // added by closeOver do not match anything other than themselves in + // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the + // same case-insensitive character as 's' or 'S' according to + // Unicode, but does not match any other character in JS. To handle + // this case, we add such characters to the IgnoreSet and filter + // them out. We filter twice: once before calling closeOver (to + // prevent 'ſ' from adding 's'), and once after calling closeOver + // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for + // more information. icu::UnicodeSet already_added(others); - - // Set of characters in ranges that are in the 52 ASCII characters [a-zA-Z]. - icu::UnicodeSet in_ascii_a_to_z(others); - in_ascii_a_to_z.retainAll(ascii_a_to_z_set.Pointer()->set); - - // Remove all chars in [a-zA-Z] from others. - others.removeAll(in_ascii_a_to_z); - - // Set of characters in ranges that are overlapping with special add set. - icu::UnicodeSet in_special_add(others); - in_special_add.retainAll(special_add_set.Pointer()->set); - - others.removeAll(in_special_add); - - // Ignore all chars in ignore set. - others.removeAll(ignore_set.Pointer()->set); - - // For most of the chars in ranges that is still in others, find the case - // equivlant set by calling closeOver(USET_CASE_INSENSITIVE). + others.removeAll(RegExpCaseFolding::IgnoreSet()); others.closeOver(USET_CASE_INSENSITIVE); - - // Because closeOver(USET_CASE_INSENSITIVE) may add ASCII [a-zA-Z] to others, - // but ECMA262 "i" mode won't consider that, remove them from others. - // Ex: U+017F add 'S' and 's' to others. - others.removeAll(ascii_a_to_z_set.Pointer()->set); - - // Special handling for in_ascii_a_to_z. - for (int32_t i = 0; i < in_ascii_a_to_z.getRangeCount(); i++) { - UChar32 start = in_ascii_a_to_z.getRangeStart(i); - UChar32 end = in_ascii_a_to_z.getRangeEnd(i); - // Check if it is uppercase A-Z by checking bit 6. - if (start & 0x0020) { - // Add the lowercases - others.add(start & 0x005F, end & 0x005F); - } else { - // Add the uppercases - others.add(start | 0x0020, end | 0x0020); - } - } - - // Special handling for chars in "Special Add" set. - for (int32_t i = 0; i < in_special_add.getRangeCount(); i++) { - UChar32 end = in_special_add.getRangeEnd(i); - for (UChar32 ch = in_special_add.getRangeStart(i); ch <= end; ch++) { - // Add the uppercase of this character if itself is not an uppercase - // character. - // Note: The if condiction cannot be u_islower(ch) because ch could be - // neither uppercase nor lowercase but Mn. - if (!u_isupper(ch)) { - others.add(u_toupper(ch)); - } - icu::UnicodeSet candidates(ch, ch); - candidates.closeOver(USET_CASE_INSENSITIVE); - for (int32_t j = 0; j < candidates.getRangeCount(); j++) { - UChar32 end2 = candidates.getRangeEnd(j); - for (UChar32 ch2 = candidates.getRangeStart(j); ch2 <= end2; ch2++) { - // Add character that is not uppercase to others. - if (!u_isupper(ch2)) { - others.add(ch2); - } - } - } - } - } - - // Remove all characters which already in the ranges. + others.removeAll(RegExpCaseFolding::IgnoreSet()); others.removeAll(already_added); // Add others to the ranges diff --git a/deps/v8/src/regexp/regexp-compiler.cc b/deps/v8/src/regexp/regexp-compiler.cc index d141f3c490..a6c7cdbe2f 100644 --- a/deps/v8/src/regexp/regexp-compiler.cc +++ b/deps/v8/src/regexp/regexp-compiler.cc @@ -8,7 +8,9 @@ #include "src/execution/isolate.h" #include "src/objects/objects-inl.h" #include "src/regexp/regexp-macro-assembler-arch.h" -#include "src/regexp/regexp-macro-assembler-tracer.h" +#ifdef V8_INTL_SUPPORT +#include "src/regexp/special-case.h" +#endif // V8_INTL_SUPPORT #include "src/strings/unicode-inl.h" #include "src/zone/zone-list-inl.h" @@ -242,20 +244,15 @@ RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, RegExpCompiler::CompilationResult RegExpCompiler::Assemble( Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start, int capture_count, Handle<String> pattern) { -#ifdef DEBUG - if (FLAG_trace_regexp_assembler) - macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler); - else -#endif - macro_assembler_ = macro_assembler; + macro_assembler_ = macro_assembler; - std::vector<RegExpNode*> work_list; + ZoneVector<RegExpNode*> work_list(zone()); work_list_ = &work_list; Label fail; macro_assembler_->PushBacktrack(&fail); Trace new_trace; start->Emit(this, &new_trace); - macro_assembler_->Bind(&fail); + macro_assembler_->BindJumpTarget(&fail); macro_assembler_->Fail(); while (!work_list.empty()) { RegExpNode* node = work_list.back(); @@ -269,14 +266,9 @@ RegExpCompiler::CompilationResult RegExpCompiler::Assemble( } Handle<HeapObject> code = macro_assembler_->GetCode(pattern); - isolate->IncreaseTotalRegexpCodeGenerated(code->Size()); + isolate->IncreaseTotalRegexpCodeGenerated(code); work_list_ = nullptr; -#ifdef DEBUG - if (FLAG_trace_regexp_assembler) { - delete macro_assembler_; - } -#endif return {*code, next_register_}; } @@ -562,7 +554,7 @@ void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) { } // On backtrack we need to restore state. - assembler->Bind(&undo); + assembler->BindJumpTarget(&undo); RestoreAffectedRegisters(assembler, max_register, registers_to_pop, registers_to_clear); if (backtrack() == nullptr) { @@ -725,32 +717,34 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, unibrow::uchar* letters, int letter_length) { #ifdef V8_INTL_SUPPORT - // Special case for U+017F which has upper case in ASCII range. - if (character == 0x017f) { + if (RegExpCaseFolding::IgnoreSet().contains(character)) { letters[0] = character; return 1; } + bool in_special_add_set = + RegExpCaseFolding::SpecialAddSet().contains(character); + icu::UnicodeSet set; set.add(character); set = set.closeOver(USET_CASE_INSENSITIVE); + + UChar32 canon = 0; + if (in_special_add_set) { + canon = RegExpCaseFolding::Canonicalize(character); + } + int32_t range_count = set.getRangeCount(); int items = 0; for (int32_t i = 0; i < range_count; i++) { UChar32 start = set.getRangeStart(i); UChar32 end = set.getRangeEnd(i); CHECK(end - start + items <= letter_length); - // Only add to the output if character is not in ASCII range - // or the case equivalent character is in ASCII range. - // #sec-runtime-semantics-canonicalize-ch - // 3.g If the numeric value of ch ≥ 128 and the numeric value of cu < 128, - // return ch. - if (!((start >= 128) && (character < 128))) { - // No range have start and end span across code point 128. - DCHECK((start >= 128) == (end >= 128)); - for (UChar32 cu = start; cu <= end; cu++) { - if (one_byte_subject && cu > String::kMaxOneByteCharCode) break; - letters[items++] = (unibrow::uchar)(cu); + for (UChar32 cu = start; cu <= end; cu++) { + if (one_byte_subject && cu > String::kMaxOneByteCharCode) break; + if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) { + continue; } + letters[items++] = (unibrow::uchar)(cu); } } return items; @@ -857,10 +851,6 @@ static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, return false; } -using EmitCharacterFunction = bool(Isolate* isolate, RegExpCompiler* compiler, - uc16 c, Label* on_failure, int cp_offset, - bool check, bool preloaded); - // Only emits letters (things that have case). Only used for case independent // matches. static inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler, @@ -1848,13 +1838,13 @@ RegExpNode* TextNode::FilterOneByte(int depth) { if (elm.text_type() == TextElement::ATOM) { Vector<const uc16> quarks = elm.atom()->data(); for (int j = 0; j < quarks.length(); j++) { - uint16_t c = quarks[j]; + uc16 c = quarks[j]; if (elm.atom()->ignore_case()) { c = unibrow::Latin1::TryConvertToLatin1(c); } if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr); // Replace quark in case we converted to Latin-1. - uint16_t* writable_quarks = const_cast<uint16_t*>(quarks.begin()); + uc16* writable_quarks = const_cast<uc16*>(quarks.begin()); writable_quarks[j] = c; } } else { @@ -2309,7 +2299,6 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { if (first_element_checked && i == 0 && j == 0) continue; if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; - EmitCharacterFunction* emit_function = nullptr; uc16 quark = quarks[j]; if (elm.atom()->ignore_case()) { // Everywhere else we assume that a non-Latin-1 character cannot match @@ -2317,6 +2306,9 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, // invalid by using the Latin1 equivalent instead. quark = unibrow::Latin1::TryConvertToLatin1(quark); } + bool needs_bounds_check = + *checked_up_to < cp_offset + j || read_backward(); + bool bounds_checked = false; switch (pass) { case NON_LATIN1_MATCH: DCHECK(one_byte); @@ -2326,24 +2318,24 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, } break; case NON_LETTER_CHARACTER_MATCH: - emit_function = &EmitAtomNonLetter; + bounds_checked = + EmitAtomNonLetter(isolate, compiler, quark, backtrack, + cp_offset + j, needs_bounds_check, preloaded); break; case SIMPLE_CHARACTER_MATCH: - emit_function = &EmitSimpleCharacter; + bounds_checked = EmitSimpleCharacter(isolate, compiler, quark, + backtrack, cp_offset + j, + needs_bounds_check, preloaded); break; case CASE_CHARACTER_MATCH: - emit_function = &EmitAtomLetter; + bounds_checked = + EmitAtomLetter(isolate, compiler, quark, backtrack, + cp_offset + j, needs_bounds_check, preloaded); break; default: break; } - if (emit_function != nullptr) { - bool bounds_check = *checked_up_to < cp_offset + j || read_backward(); - bool bound_checked = - emit_function(isolate, compiler, quark, backtrack, cp_offset + j, - bounds_check, preloaded); - if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); - } + if (bounds_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); } } else { DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type()); @@ -3429,8 +3421,8 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { DCHECK_EQ(start_reg_ + 1, end_reg_); if (IgnoreCase(flags_)) { - assembler->CheckNotBackReferenceIgnoreCase( - start_reg_, read_backward(), IsUnicode(flags_), trace->backtrack()); + assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(), + trace->backtrack()); } else { assembler->CheckNotBackReference(start_reg_, read_backward(), trace->backtrack()); @@ -3602,12 +3594,17 @@ template <typename... Propagators> class Analysis : public NodeVisitor { public: Analysis(Isolate* isolate, bool is_one_byte) - : isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {} + : isolate_(isolate), + is_one_byte_(is_one_byte), + error_(RegExpError::kNone) {} void EnsureAnalyzed(RegExpNode* that) { StackLimitCheck check(isolate()); if (check.HasOverflowed()) { - fail("Stack overflow"); + if (FLAG_correctness_fuzzer_suppressions) { + FATAL("Analysis: Aborting on stack overflow"); + } + fail(RegExpError::kAnalysisStackOverflow); return; } if (that->info()->been_analyzed || that->info()->being_analyzed) return; @@ -3617,12 +3614,12 @@ class Analysis : public NodeVisitor { that->info()->been_analyzed = true; } - bool has_failed() { return error_message_ != nullptr; } - const char* error_message() { - DCHECK(error_message_ != nullptr); - return error_message_; + bool has_failed() { return error_ != RegExpError::kNone; } + RegExpError error() { + DCHECK(error_ != RegExpError::kNone); + return error_; } - void fail(const char* error_message) { error_message_ = error_message; } + void fail(RegExpError error) { error_ = error; } Isolate* isolate() const { return isolate_; } @@ -3707,19 +3704,19 @@ class Analysis : public NodeVisitor { private: Isolate* isolate_; bool is_one_byte_; - const char* error_message_; + RegExpError error_; DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis); }; -const char* AnalyzeRegExp(Isolate* isolate, bool is_one_byte, +RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node) { Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis(isolate, is_one_byte); DCHECK_EQ(node->info()->been_analyzed, false); analysis.EnsureAnalyzed(node); - DCHECK_IMPLIES(analysis.has_failed(), analysis.error_message() != nullptr); - return analysis.has_failed() ? analysis.error_message() : nullptr; + DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone); + return analysis.has_failed() ? analysis.error() : RegExpError::kNone; } void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, diff --git a/deps/v8/src/regexp/regexp-compiler.h b/deps/v8/src/regexp/regexp-compiler.h index 2de221f35d..d083d5d9dd 100644 --- a/deps/v8/src/regexp/regexp-compiler.h +++ b/deps/v8/src/regexp/regexp-compiler.h @@ -423,10 +423,7 @@ struct PreloadState { // Analysis performs assertion propagation and computes eats_at_least_ values. // See the comments on AssertionPropagator and EatsAtLeastPropagator for more // details. -// -// This method returns nullptr on success or a null-terminated failure message -// on failure. -const char* AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node); +RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node); class FrequencyCollator { public: @@ -503,18 +500,17 @@ class RegExpCompiler { } struct CompilationResult final { - explicit CompilationResult(const char* error_message) - : error_message(error_message) {} + explicit CompilationResult(RegExpError err) : error(err) {} CompilationResult(Object code, int registers) : code(code), num_registers(registers) {} static CompilationResult RegExpTooBig() { - return CompilationResult("RegExp too big"); + return CompilationResult(RegExpError::kTooLarge); } - bool Succeeded() const { return error_message == nullptr; } + bool Succeeded() const { return error == RegExpError::kNone; } - const char* const error_message = nullptr; + const RegExpError error = RegExpError::kNone; Object code; int num_registers = 0; }; @@ -576,7 +572,7 @@ class RegExpCompiler { int next_register_; int unicode_lookaround_stack_register_; int unicode_lookaround_position_register_; - std::vector<RegExpNode*>* work_list_; + ZoneVector<RegExpNode*>* work_list_; int recursion_depth_; RegExpMacroAssembler* macro_assembler_; bool one_byte_; diff --git a/deps/v8/src/regexp/regexp-error.cc b/deps/v8/src/regexp/regexp-error.cc new file mode 100644 index 0000000000..d7763c64f8 --- /dev/null +++ b/deps/v8/src/regexp/regexp-error.cc @@ -0,0 +1,22 @@ +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "src/regexp/regexp-error.h" + +namespace v8 { +namespace internal { + +const char* const kRegExpErrorStrings[] = { +#define TEMPLATE(NAME, STRING) STRING, + REGEXP_ERROR_MESSAGES(TEMPLATE) +#undef TEMPLATE +}; + +const char* RegExpErrorString(RegExpError error) { + DCHECK_LT(error, RegExpError::NumErrors); + return kRegExpErrorStrings[static_cast<int>(error)]; +} + +} // namespace internal +} // namespace v8 diff --git a/deps/v8/src/regexp/regexp-error.h b/deps/v8/src/regexp/regexp-error.h new file mode 100644 index 0000000000..6145b404ab --- /dev/null +++ b/deps/v8/src/regexp/regexp-error.h @@ -0,0 +1,58 @@ +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_REGEXP_ERROR_H_ +#define V8_REGEXP_REGEXP_ERROR_H_ + +#include "src/base/logging.h" +#include "src/base/macros.h" + +namespace v8 { +namespace internal { + +#define REGEXP_ERROR_MESSAGES(T) \ + T(None, "") \ + T(StackOverflow, "Maximum call stack size exceeded") \ + T(AnalysisStackOverflow, "Stack overflow") \ + T(TooLarge, "Regular expression too large") \ + T(UnterminatedGroup, "Unterminated group") \ + T(UnmatchedParen, "Unmatched ')'") \ + T(EscapeAtEndOfPattern, "\\ at end of pattern") \ + T(InvalidPropertyName, "Invalid property name") \ + T(InvalidEscape, "Invalid escape") \ + T(InvalidDecimalEscape, "Invalid decimal escape") \ + T(InvalidUnicodeEscape, "Invalid Unicode escape") \ + T(NothingToRepeat, "Nothing to repeat") \ + T(LoneQuantifierBrackets, "Lone quantifier brackets") \ + T(RangeOutOfOrder, "numbers out of order in {} quantifier") \ + T(IncompleteQuantifier, "Incomplete quantifier") \ + T(InvalidQuantifier, "Invalid quantifier") \ + T(InvalidGroup, "Invalid group") \ + T(MultipleFlagDashes, "Multiple dashes in flag group") \ + T(RepeatedFlag, "Repeated flag in flag group") \ + T(InvalidFlagGroup, "Invalid flag group") \ + T(TooManyCaptures, "Too many captures") \ + T(InvalidCaptureGroupName, "Invalid capture group name") \ + T(DuplicateCaptureGroupName, "Duplicate capture group name") \ + T(InvalidNamedReference, "Invalid named reference") \ + T(InvalidNamedCaptureReference, "Invalid named capture referenced") \ + T(InvalidClassEscape, "Invalid class escape") \ + T(InvalidClassPropertyName, "Invalid property name in character class") \ + T(InvalidCharacterClass, "Invalid character class") \ + T(UnterminatedCharacterClass, "Unterminated character class") \ + T(OutOfOrderCharacterClass, "Range out of order in character class") + +enum class RegExpError : uint32_t { +#define TEMPLATE(NAME, STRING) k##NAME, + REGEXP_ERROR_MESSAGES(TEMPLATE) +#undef TEMPLATE + NumErrors +}; + +V8_EXPORT_PRIVATE const char* RegExpErrorString(RegExpError error); + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_REGEXP_ERROR_H_ diff --git a/deps/v8/src/regexp/regexp-interpreter.cc b/deps/v8/src/regexp/regexp-interpreter.cc index a74df90c1d..d3efa65bf1 100644 --- a/deps/v8/src/regexp/regexp-interpreter.cc +++ b/deps/v8/src/regexp/regexp-interpreter.cc @@ -35,18 +35,18 @@ namespace internal { namespace { bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len, - Vector<const uc16> subject, bool unicode) { + Vector<const uc16> subject) { Address offset_a = reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(from))); Address offset_b = reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(current))); size_t length = len * kUC16Size; - return RegExpMacroAssembler::CaseInsensitiveCompareUC16( - offset_a, offset_b, length, unicode ? nullptr : isolate) == 1; + return RegExpMacroAssembler::CaseInsensitiveCompareUC16(offset_a, offset_b, + length, isolate) == 1; } bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len, - Vector<const uint8_t> subject, bool unicode) { + Vector<const uint8_t> subject) { // For Latin1 characters the unicode flag makes no difference. for (int i = 0; i < len; i++) { unsigned int old_char = subject[from++]; @@ -747,26 +747,14 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array, DISPATCH(); } BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE) { - int from = registers[insn >> BYTECODE_SHIFT]; - int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; - if (from >= 0 && len > 0) { - if (current + len > subject.length() || - !BackRefMatchesNoCase(isolate, from, current, len, subject, true)) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - DISPATCH(); - } - current += len; - } - ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE); - DISPATCH(); + UNREACHABLE(); // TODO(jgruber): Remove this unused bytecode. } BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) { int from = registers[insn >> BYTECODE_SHIFT]; int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; if (from >= 0 && len > 0) { if (current + len > subject.length() || - !BackRefMatchesNoCase(isolate, from, current, len, subject, - false)) { + !BackRefMatchesNoCase(isolate, from, current, len, subject)) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); DISPATCH(); } @@ -776,27 +764,14 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array, DISPATCH(); } BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD) { - int from = registers[insn >> BYTECODE_SHIFT]; - int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; - if (from >= 0 && len > 0) { - if (current - len < 0 || - !BackRefMatchesNoCase(isolate, from, current - len, len, subject, - true)) { - SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); - DISPATCH(); - } - current -= len; - } - ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD); - DISPATCH(); + UNREACHABLE(); // TODO(jgruber): Remove this unused bytecode. } BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) { int from = registers[insn >> BYTECODE_SHIFT]; int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; if (from >= 0 && len > 0) { if (current - len < 0 || - !BackRefMatchesNoCase(isolate, from, current - len, len, subject, - false)) { + !BackRefMatchesNoCase(isolate, from, current - len, len, subject)) { SET_PC_FROM_OFFSET(Load32Aligned(pc + 4)); DISPATCH(); } @@ -1029,6 +1004,8 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal( } } +#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER + // This method is called through an external reference from RegExpExecInternal // builtin. IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs( @@ -1076,6 +1053,8 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs( return result; } +#endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER + IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromRuntime( Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject_string, int* registers, int registers_length, int start_position) { diff --git a/deps/v8/src/regexp/regexp-macro-assembler-arch.h b/deps/v8/src/regexp/regexp-macro-assembler-arch.h index 2dc6739e42..8ec12a0ae6 100644 --- a/deps/v8/src/regexp/regexp-macro-assembler-arch.h +++ b/deps/v8/src/regexp/regexp-macro-assembler-arch.h @@ -15,7 +15,7 @@ #include "src/regexp/arm64/regexp-macro-assembler-arm64.h" #elif V8_TARGET_ARCH_ARM #include "src/regexp/arm/regexp-macro-assembler-arm.h" -#elif V8_TARGET_ARCH_PPC +#elif V8_TARGET_ARCH_PPC || V8_TARGET_ARCH_PPC64 #include "src/regexp/ppc/regexp-macro-assembler-ppc.h" #elif V8_TARGET_ARCH_MIPS #include "src/regexp/mips/regexp-macro-assembler-mips.h" diff --git a/deps/v8/src/regexp/regexp-macro-assembler-tracer.cc b/deps/v8/src/regexp/regexp-macro-assembler-tracer.cc index 5dca04a18c..0a12201743 100644 --- a/deps/v8/src/regexp/regexp-macro-assembler-tracer.cc +++ b/deps/v8/src/regexp/regexp-macro-assembler-tracer.cc @@ -351,17 +351,15 @@ void RegExpMacroAssemblerTracer::CheckNotBackReference(int start_reg, assembler_->CheckNotBackReference(start_reg, read_backward, on_no_match); } - void RegExpMacroAssemblerTracer::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, bool unicode, Label* on_no_match) { - PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s %s, label[%08x]);\n", + int start_reg, bool read_backward, Label* on_no_match) { + PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s, label[%08x]);\n", start_reg, read_backward ? "backward" : "forward", - unicode ? "unicode" : "non-unicode", LabelToInt(on_no_match)); - assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward, unicode, + LabelToInt(on_no_match)); + assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward, on_no_match); } - void RegExpMacroAssemblerTracer::CheckPosition(int cp_offset, Label* on_outside_input) { PrintF(" CheckPosition(cp_offset=%d, label[%08x]);\n", cp_offset, diff --git a/deps/v8/src/regexp/regexp-macro-assembler-tracer.h b/deps/v8/src/regexp/regexp-macro-assembler-tracer.h index 2a44146e73..b6ad63071f 100644 --- a/deps/v8/src/regexp/regexp-macro-assembler-tracer.h +++ b/deps/v8/src/regexp/regexp-macro-assembler-tracer.h @@ -33,7 +33,6 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler { void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match) override; void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward, - bool unicode, Label* on_no_match) override; void CheckNotCharacter(unsigned c, Label* on_not_equal) override; void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with, diff --git a/deps/v8/src/regexp/regexp-macro-assembler.cc b/deps/v8/src/regexp/regexp-macro-assembler.cc index 30a9955dc3..3ac1bb7f57 100644 --- a/deps/v8/src/regexp/regexp-macro-assembler.cc +++ b/deps/v8/src/regexp/regexp-macro-assembler.cc @@ -6,6 +6,7 @@ #include "src/codegen/assembler.h" #include "src/execution/isolate-inl.h" +#include "src/execution/pointer-authentication.h" #include "src/execution/simulator.h" #include "src/regexp/regexp-stack.h" #include "src/strings/unicode-inl.h" @@ -114,34 +115,7 @@ bool NativeRegExpMacroAssembler::CanReadUnaligned() { return FLAG_enable_regexp_unaligned_accesses && !slow_safe(); } -const byte* NativeRegExpMacroAssembler::StringCharacterPosition( - String subject, int start_index, const DisallowHeapAllocation& no_gc) { - if (subject.IsConsString()) { - subject = ConsString::cast(subject).first(); - } else if (subject.IsSlicedString()) { - start_index += SlicedString::cast(subject).offset(); - subject = SlicedString::cast(subject).parent(); - } - if (subject.IsThinString()) { - subject = ThinString::cast(subject).actual(); - } - DCHECK_LE(0, start_index); - DCHECK_LE(start_index, subject.length()); - if (subject.IsSeqOneByteString()) { - return reinterpret_cast<const byte*>( - SeqOneByteString::cast(subject).GetChars(no_gc) + start_index); - } else if (subject.IsSeqTwoByteString()) { - return reinterpret_cast<const byte*>( - SeqTwoByteString::cast(subject).GetChars(no_gc) + start_index); - } else if (subject.IsExternalOneByteString()) { - return reinterpret_cast<const byte*>( - ExternalOneByteString::cast(subject).GetChars() + start_index); - } else { - DCHECK(subject.IsExternalTwoByteString()); - return reinterpret_cast<const byte*>( - ExternalTwoByteString::cast(subject).GetChars() + start_index); - } -} +#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER // This method may only be called after an interrupt. int NativeRegExpMacroAssembler::CheckStackGuardState( @@ -149,9 +123,10 @@ int NativeRegExpMacroAssembler::CheckStackGuardState( Address* return_address, Code re_code, Address* subject, const byte** input_start, const byte** input_end) { DisallowHeapAllocation no_gc; + Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0); + DCHECK_LE(re_code.raw_instruction_start(), old_pc); + DCHECK_LE(old_pc, re_code.raw_instruction_end()); - DCHECK(re_code.raw_instruction_start() <= *return_address); - DCHECK(*return_address <= re_code.raw_instruction_end()); StackLimitCheck check(isolate); bool js_has_overflowed = check.JsHasOverflowed(); @@ -193,9 +168,11 @@ int NativeRegExpMacroAssembler::CheckStackGuardState( } if (*code_handle != re_code) { // Return address no longer valid - intptr_t delta = code_handle->address() - re_code.address(); // Overwrite the return address on the stack. - *return_address += delta; + intptr_t delta = code_handle->address() - re_code.address(); + Address new_pc = old_pc + delta; + // TODO(v8:10026): avoid replacing a signed pointer. + PointerAuthentication::ReplacePC(return_address, new_pc, 0); } // If we continue, we need to update the subject string addresses. @@ -210,8 +187,7 @@ int NativeRegExpMacroAssembler::CheckStackGuardState( } else { *subject = subject_handle->ptr(); intptr_t byte_length = *input_end - *input_start; - *input_start = - StringCharacterPosition(*subject_handle, start_index, no_gc); + *input_start = subject_handle->AddressOfCharacterAt(start_index, no_gc); *input_end = *input_start + byte_length; } } @@ -259,7 +235,7 @@ int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp, DisallowHeapAllocation no_gc; const byte* input_start = - StringCharacterPosition(subject_ptr, start_offset + slice_offset, no_gc); + subject_ptr.AddressOfCharacterAt(start_offset + slice_offset, no_gc); int byte_length = char_length << char_size_shift; const byte* input_end = input_start + byte_length; return Execute(*subject, start_offset, input_start, input_end, offsets_vector, @@ -305,6 +281,8 @@ int NativeRegExpMacroAssembler::Execute( return result; } +#endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER + // clang-format off const byte NativeRegExpMacroAssembler::word_character_map[] = { 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, diff --git a/deps/v8/src/regexp/regexp-macro-assembler.h b/deps/v8/src/regexp/regexp-macro-assembler.h index bda7e5cce1..e83446cdc9 100644 --- a/deps/v8/src/regexp/regexp-macro-assembler.h +++ b/deps/v8/src/regexp/regexp-macro-assembler.h @@ -87,7 +87,7 @@ class RegExpMacroAssembler { virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match) = 0; virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, bool unicode, + bool read_backward, Label* on_no_match) = 0; // Check the current character for a match with a literal character. If we // fail to match then goto the on_failure label. End of input always @@ -122,6 +122,11 @@ class RegExpMacroAssembler { // not have custom support. // May clobber the current loaded character. virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match); + + // Control-flow integrity: + // Define a jump target and bind a label. + virtual void BindJumpTarget(Label* label) { Bind(label); } + virtual void Fail() = 0; virtual Handle<HeapObject> GetCode(Handle<String> source) = 0; virtual void GoTo(Label* label) = 0; @@ -246,9 +251,6 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler { static Address GrowStack(Address stack_pointer, Address* stack_top, Isolate* isolate); - static const byte* StringCharacterPosition( - String subject, int start_index, const DisallowHeapAllocation& no_gc); - static int CheckStackGuardState(Isolate* isolate, int start_index, RegExp::CallOrigin call_origin, Address* return_address, Code re_code, diff --git a/deps/v8/src/regexp/regexp-parser.cc b/deps/v8/src/regexp/regexp-parser.cc index 951f815374..3c1115414f 100644 --- a/deps/v8/src/regexp/regexp-parser.cc +++ b/deps/v8/src/regexp/regexp-parser.cc @@ -24,11 +24,10 @@ namespace v8 { namespace internal { -RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, - JSRegExp::Flags flags, Isolate* isolate, Zone* zone) +RegExpParser::RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, + Isolate* isolate, Zone* zone) : isolate_(isolate), zone_(zone), - error_(error), captures_(nullptr), named_captures_(nullptr), named_back_references_(nullptr), @@ -81,13 +80,12 @@ void RegExpParser::Advance() { if (FLAG_correctness_fuzzer_suppressions) { FATAL("Aborting on stack overflow"); } - ReportError(CStrVector( - MessageFormatter::TemplateString(MessageTemplate::kStackOverflow))); + ReportError(RegExpError::kStackOverflow); } else if (zone()->excess_allocation()) { if (FLAG_correctness_fuzzer_suppressions) { FATAL("Aborting on excess zone allocation"); } - ReportError(CStrVector("Regular expression too large")); + ReportError(RegExpError::kTooLarge); } else { current_ = ReadNext<true>(); } @@ -139,15 +137,12 @@ bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { return false; } - -RegExpTree* RegExpParser::ReportError(Vector<const char> message) { +RegExpTree* RegExpParser::ReportError(RegExpError error) { if (failed_) return nullptr; // Do not overwrite any existing error. failed_ = true; - *error_ = isolate() - ->factory() - ->NewStringFromOneByte(Vector<const uint8_t>::cast(message)) - .ToHandleChecked(); - // Zip to the end to make sure the no more input is read. + error_ = error; + error_pos_ = position(); + // Zip to the end to make sure no more input is read. current_ = kEndMarker; next_pos_ = in()->length(); return nullptr; @@ -194,14 +189,14 @@ RegExpTree* RegExpParser::ParseDisjunction() { case kEndMarker: if (state->IsSubexpression()) { // Inside a parenthesized group when hitting end of input. - return ReportError(CStrVector("Unterminated group")); + return ReportError(RegExpError::kUnterminatedGroup); } DCHECK_EQ(INITIAL, state->group_type()); // Parsing completed successfully. return builder->ToRegExp(); case ')': { if (!state->IsSubexpression()) { - return ReportError(CStrVector("Unmatched ')'")); + return ReportError(RegExpError::kUnmatchedParen); } DCHECK_NE(INITIAL, state->group_type()); @@ -252,7 +247,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '*': case '+': case '?': - return ReportError(CStrVector("Nothing to repeat")); + return ReportError(RegExpError::kNothingToRepeat); case '^': { Advance(); if (builder->multiline()) { @@ -307,7 +302,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '\\': switch (Next()) { case kEndMarker: - return ReportError(CStrVector("\\ at end of pattern")); + return ReportError(RegExpError::kEscapeAtEndOfPattern); case 'b': Advance(2); builder->AddAssertion(new (zone()) RegExpAssertion( @@ -347,7 +342,8 @@ RegExpTree* RegExpParser::ParseDisjunction() { if (unicode()) { ZoneList<CharacterRange>* ranges = new (zone()) ZoneList<CharacterRange>(2, zone()); - std::vector<char> name_1, name_2; + ZoneVector<char> name_1(zone()); + ZoneVector<char> name_2(zone()); if (ParsePropertyClassName(&name_1, &name_2)) { if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) { RegExpCharacterClass* cc = new (zone()) @@ -363,7 +359,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { } } } - return ReportError(CStrVector("Invalid property name")); + return ReportError(RegExpError::kInvalidPropertyName); } else { builder->AddCharacter(p); } @@ -399,7 +395,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { // With /u, no identity escapes except for syntax characters // are allowed. Otherwise, all identity escapes are allowed. if (unicode()) { - return ReportError(CStrVector("Invalid escape")); + return ReportError(RegExpError::kInvalidEscape); } uc32 first_digit = Next(); if (first_digit == '8' || first_digit == '9') { @@ -413,7 +409,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { Advance(); if (unicode() && Next() >= '0' && Next() <= '9') { // With /u, decimal escape with leading 0 are not parsed as octal. - return ReportError(CStrVector("Invalid decimal escape")); + return ReportError(RegExpError::kInvalidDecimalEscape); } uc32 octal = ParseOctalLiteral(); builder->AddCharacter(octal); @@ -454,7 +450,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { // ES#prod-annexB-ExtendedPatternCharacter if (unicode()) { // With /u, invalid escapes are not treated as identity escapes. - return ReportError(CStrVector("Invalid unicode escape")); + return ReportError(RegExpError::kInvalidUnicodeEscape); } builder->AddCharacter('\\'); } else { @@ -472,7 +468,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { builder->AddCharacter('x'); } else { // With /u, invalid escapes are not treated as identity escapes. - return ReportError(CStrVector("Invalid escape")); + return ReportError(RegExpError::kInvalidEscape); } break; } @@ -485,7 +481,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { builder->AddCharacter('u'); } else { // With /u, invalid escapes are not treated as identity escapes. - return ReportError(CStrVector("Invalid Unicode escape")); + return ReportError(RegExpError::kInvalidUnicodeEscape); } break; } @@ -509,7 +505,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { builder->AddCharacter(current()); Advance(); } else { - return ReportError(CStrVector("Invalid escape")); + return ReportError(RegExpError::kInvalidEscape); } break; } @@ -517,13 +513,13 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '{': { int dummy; bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED); - if (parsed) return ReportError(CStrVector("Nothing to repeat")); + if (parsed) return ReportError(RegExpError::kNothingToRepeat); V8_FALLTHROUGH; } case '}': case ']': if (unicode()) { - return ReportError(CStrVector("Lone quantifier brackets")); + return ReportError(RegExpError::kLoneQuantifierBrackets); } V8_FALLTHROUGH; default: @@ -558,13 +554,12 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '{': if (ParseIntervalQuantifier(&min, &max)) { if (max < min) { - return ReportError( - CStrVector("numbers out of order in {} quantifier")); + return ReportError(RegExpError::kRangeOutOfOrder); } break; } else if (unicode()) { // With /u, incomplete quantifiers are not allowed. - return ReportError(CStrVector("Incomplete quantifier")); + return ReportError(RegExpError::kIncompleteQuantifier); } continue; default: @@ -580,7 +575,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { Advance(); } if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { - return ReportError(CStrVector("Invalid quantifier")); + return ReportError(RegExpError::kInvalidQuantifier); } } } @@ -615,7 +610,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( case 's': case 'm': { if (!FLAG_regexp_mode_modifiers) { - ReportError(CStrVector("Invalid group")); + ReportError(RegExpError::kInvalidGroup); return nullptr; } Advance(); @@ -624,7 +619,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( switch (current()) { case '-': if (!flags_sense) { - ReportError(CStrVector("Multiple dashes in flag group")); + ReportError(RegExpError::kMultipleFlagDashes); return nullptr; } flags_sense = false; @@ -638,7 +633,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( if (current() == 'm') bit = JSRegExp::kMultiline; if (current() == 's') bit = JSRegExp::kDotAll; if (((switch_on | switch_off) & bit) != 0) { - ReportError(CStrVector("Repeated flag in flag group")); + ReportError(RegExpError::kRepeatedFlag); return nullptr; } if (flags_sense) { @@ -666,7 +661,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( subexpr_type = GROUPING; // Will break us out of the outer loop. continue; default: - ReportError(CStrVector("Invalid flag group")); + ReportError(RegExpError::kInvalidFlagGroup); return nullptr; } } @@ -690,13 +685,13 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( Advance(); break; default: - ReportError(CStrVector("Invalid group")); + ReportError(RegExpError::kInvalidGroup); return nullptr; } } if (subexpr_type == CAPTURE) { if (captures_started_ >= JSRegExp::kMaxCaptures) { - ReportError(CStrVector("Too many captures")); + ReportError(RegExpError::kTooManyCaptures); return nullptr; } captures_started_++; @@ -845,20 +840,20 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { if (c == '\\' && current() == 'u') { Advance(); if (!ParseUnicodeEscape(&c)) { - ReportError(CStrVector("Invalid Unicode escape sequence")); + ReportError(RegExpError::kInvalidUnicodeEscape); return nullptr; } } // The backslash char is misclassified as both ID_Start and ID_Continue. if (c == '\\') { - ReportError(CStrVector("Invalid capture group name")); + ReportError(RegExpError::kInvalidCaptureGroupName); return nullptr; } if (at_start) { if (!IsIdentifierStart(c)) { - ReportError(CStrVector("Invalid capture group name")); + ReportError(RegExpError::kInvalidCaptureGroupName); return nullptr; } push_code_unit(name, c); @@ -869,7 +864,7 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { } else if (IsIdentifierPart(c)) { push_code_unit(name, c); } else { - ReportError(CStrVector("Invalid capture group name")); + ReportError(RegExpError::kInvalidCaptureGroupName); return nullptr; } } @@ -896,7 +891,7 @@ bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, const auto& named_capture_it = named_captures_->find(capture); if (named_capture_it != named_captures_->end()) { - ReportError(CStrVector("Duplicate capture group name")); + ReportError(RegExpError::kDuplicateCaptureGroupName); return false; } } @@ -910,7 +905,7 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, RegExpParserState* state) { // The parser is assumed to be on the '<' in \k<name>. if (current() != '<') { - ReportError(CStrVector("Invalid named reference")); + ReportError(RegExpError::kInvalidNamedReference); return false; } @@ -943,7 +938,7 @@ void RegExpParser::PatchNamedBackReferences() { if (named_back_references_ == nullptr) return; if (named_captures_ == nullptr) { - ReportError(CStrVector("Invalid named capture referenced")); + ReportError(RegExpError::kInvalidNamedCaptureReference); return; } @@ -964,7 +959,7 @@ void RegExpParser::PatchNamedBackReferences() { if (capture_it != named_captures_->end()) { index = (*capture_it)->index(); } else { - ReportError(CStrVector("Invalid named capture referenced")); + ReportError(RegExpError::kInvalidNamedCaptureReference); return; } @@ -1385,8 +1380,8 @@ bool IsUnicodePropertyValueCharacter(char c) { } // anonymous namespace -bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1, - std::vector<char>* name_2) { +bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1, + ZoneVector<char>* name_2) { DCHECK(name_1->empty()); DCHECK(name_2->empty()); // Parse the property class as follows: @@ -1425,8 +1420,8 @@ bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1, bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate, - const std::vector<char>& name_1, - const std::vector<char>& name_2) { + const ZoneVector<char>& name_1, + const ZoneVector<char>& name_2) { if (name_2.empty()) { // First attempt to interpret as general category property value name. const char* name = name_1.data(); @@ -1463,7 +1458,7 @@ bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to, } } -RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name_1) { +RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name_1) { if (!FLAG_harmony_regexp_sequence) return nullptr; const char* name = name_1.data(); const uc32* sequence_list = nullptr; @@ -1529,19 +1524,19 @@ RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name_1) { #else // V8_INTL_SUPPORT -bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1, - std::vector<char>* name_2) { +bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1, + ZoneVector<char>* name_2) { return false; } bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate, - const std::vector<char>& name_1, - const std::vector<char>& name_2) { + const ZoneVector<char>& name_1, + const ZoneVector<char>& name_2) { return false; } -RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name) { +RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name) { return nullptr; } @@ -1605,7 +1600,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() { } if (unicode()) { // With /u, invalid escapes are not treated as identity escapes. - ReportError(CStrVector("Invalid class escape")); + ReportError(RegExpError::kInvalidClassEscape); return 0; } if ((controlLetter >= '0' && controlLetter <= '9') || @@ -1638,7 +1633,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() { // ES#prod-annexB-LegacyOctalEscapeSequence if (unicode()) { // With /u, decimal escape is not interpreted as octal character code. - ReportError(CStrVector("Invalid class escape")); + ReportError(RegExpError::kInvalidClassEscape); return 0; } return ParseOctalLiteral(); @@ -1648,7 +1643,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() { if (ParseHexEscape(2, &value)) return value; if (unicode()) { // With /u, invalid escapes are not treated as identity escapes. - ReportError(CStrVector("Invalid escape")); + ReportError(RegExpError::kInvalidEscape); return 0; } // If \x is not followed by a two-digit hexadecimal, treat it @@ -1661,7 +1656,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() { if (ParseUnicodeEscape(&value)) return value; if (unicode()) { // With /u, invalid escapes are not treated as identity escapes. - ReportError(CStrVector("Invalid unicode escape")); + ReportError(RegExpError::kInvalidUnicodeEscape); return 0; } // If \u is not followed by a two-digit hexadecimal, treat it @@ -1676,11 +1671,11 @@ uc32 RegExpParser::ParseClassCharacterEscape() { Advance(); return result; } - ReportError(CStrVector("Invalid escape")); + ReportError(RegExpError::kInvalidEscape); return 0; } } - return 0; + UNREACHABLE(); } void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges, @@ -1703,17 +1698,18 @@ void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges, return; } case kEndMarker: - ReportError(CStrVector("\\ at end of pattern")); + ReportError(RegExpError::kEscapeAtEndOfPattern); return; case 'p': case 'P': if (unicode()) { bool negate = Next() == 'P'; Advance(2); - std::vector<char> name_1, name_2; + ZoneVector<char> name_1(zone); + ZoneVector<char> name_2(zone); if (!ParsePropertyClassName(&name_1, &name_2) || !AddPropertyClassRange(ranges, negate, name_1, name_2)) { - ReportError(CStrVector("Invalid property name in character class")); + ReportError(RegExpError::kInvalidClassPropertyName); } *is_class_escape = true; return; @@ -1732,10 +1728,6 @@ void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges, } RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { - static const char* kUnterminated = "Unterminated character class"; - static const char* kRangeInvalid = "Invalid character class"; - static const char* kRangeOutOfOrder = "Range out of order in character class"; - DCHECK_EQ(current(), '['); Advance(); bool is_negated = false; @@ -1768,7 +1760,7 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { // Either end is an escaped character class. Treat the '-' verbatim. if (unicode()) { // ES2015 21.2.2.15.1 step 1. - return ReportError(CStrVector(kRangeInvalid)); + return ReportError(RegExpError::kInvalidCharacterClass); } if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); ranges->Add(CharacterRange::Singleton('-'), zone()); @@ -1777,7 +1769,7 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { } // ES2015 21.2.2.15.1 step 6. if (char_1 > char_2) { - return ReportError(CStrVector(kRangeOutOfOrder)); + return ReportError(RegExpError::kOutOfOrderCharacterClass); } ranges->Add(CharacterRange::Range(char_1, char_2), zone()); } else { @@ -1785,7 +1777,7 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { } } if (!has_more()) { - return ReportError(CStrVector(kUnterminated)); + return ReportError(RegExpError::kUnterminatedCharacterClass); } Advance(); RegExpCharacterClass::CharacterClassFlags character_class_flags; @@ -1802,14 +1794,16 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input, JSRegExp::Flags flags, RegExpCompileData* result) { DCHECK(result != nullptr); - RegExpParser parser(input, &result->error, flags, isolate, zone); + RegExpParser parser(input, flags, isolate, zone); RegExpTree* tree = parser.ParsePattern(); if (parser.failed()) { DCHECK(tree == nullptr); - DCHECK(!result->error.is_null()); + DCHECK(parser.error_ != RegExpError::kNone); + result->error = parser.error_; + result->error_pos = parser.error_pos_; } else { DCHECK(tree != nullptr); - DCHECK(result->error.is_null()); + DCHECK(parser.error_ == RegExpError::kNone); if (FLAG_trace_regexp_parser) { StdoutStream os; tree->Print(os, zone); diff --git a/deps/v8/src/regexp/regexp-parser.h b/deps/v8/src/regexp/regexp-parser.h index cc1948b101..aff1746bc5 100644 --- a/deps/v8/src/regexp/regexp-parser.h +++ b/deps/v8/src/regexp/regexp-parser.h @@ -8,6 +8,7 @@ #include "src/objects/js-regexp.h" #include "src/objects/objects.h" #include "src/regexp/regexp-ast.h" +#include "src/regexp/regexp-error.h" #include "src/zone/zone.h" namespace v8 { @@ -153,8 +154,8 @@ class RegExpBuilder : public ZoneObject { class V8_EXPORT_PRIVATE RegExpParser { public: - RegExpParser(FlatStringReader* in, Handle<String>* error, - JSRegExp::Flags flags, Isolate* isolate, Zone* zone); + RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate, + Zone* zone); static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input, JSRegExp::Flags flags, RegExpCompileData* result); @@ -177,13 +178,13 @@ class V8_EXPORT_PRIVATE RegExpParser { bool ParseUnicodeEscape(uc32* value); bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value); - bool ParsePropertyClassName(std::vector<char>* name_1, - std::vector<char>* name_2); + bool ParsePropertyClassName(ZoneVector<char>* name_1, + ZoneVector<char>* name_2); bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate, - const std::vector<char>& name_1, - const std::vector<char>& name_2); + const ZoneVector<char>& name_1, + const ZoneVector<char>& name_2); - RegExpTree* GetPropertySequence(const std::vector<char>& name_1); + RegExpTree* GetPropertySequence(const ZoneVector<char>& name_1); RegExpTree* ParseCharacterClass(const RegExpBuilder* state); uc32 ParseOctalLiteral(); @@ -202,7 +203,7 @@ class V8_EXPORT_PRIVATE RegExpParser { char ParseClassEscape(); - RegExpTree* ReportError(Vector<const char> message); + RegExpTree* ReportError(RegExpError error); void Advance(); void Advance(int dist); void Reset(int pos); @@ -335,7 +336,8 @@ class V8_EXPORT_PRIVATE RegExpParser { Isolate* isolate_; Zone* zone_; - Handle<String>* error_; + RegExpError error_ = RegExpError::kNone; + int error_pos_ = 0; ZoneList<RegExpCapture*>* captures_; ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_; ZoneList<RegExpBackReference*>* named_back_references_; diff --git a/deps/v8/src/regexp/regexp-stack.h b/deps/v8/src/regexp/regexp-stack.h index cd199adfb2..9394398fcc 100644 --- a/deps/v8/src/regexp/regexp-stack.h +++ b/deps/v8/src/regexp/regexp-stack.h @@ -38,6 +38,9 @@ class RegExpStackScope { class RegExpStack { public: + RegExpStack(); + ~RegExpStack(); + // Number of allocated locations on the stack below the limit. // No sequence of pushes must be longer that this without doing a stack-limit // check. @@ -77,9 +80,6 @@ class RegExpStack { static constexpr size_t kMaximumStackSize = 64 * MB; private: - RegExpStack(); - ~RegExpStack(); - // Artificial limit used when the thread-local state has been destroyed. static const Address kMemoryTop = static_cast<Address>(static_cast<uintptr_t>(-1)); diff --git a/deps/v8/src/regexp/regexp.cc b/deps/v8/src/regexp/regexp.cc index 3632deaeb8..4319990a39 100644 --- a/deps/v8/src/regexp/regexp.cc +++ b/deps/v8/src/regexp/regexp.cc @@ -14,6 +14,7 @@ #include "src/regexp/regexp-dotprinter.h" #include "src/regexp/regexp-interpreter.h" #include "src/regexp/regexp-macro-assembler-arch.h" +#include "src/regexp/regexp-macro-assembler-tracer.h" #include "src/regexp/regexp-parser.h" #include "src/strings/string-search.h" #include "src/utils/ostreams.h" @@ -91,9 +92,15 @@ class RegExpImpl final : public AllStatic { }; V8_WARN_UNUSED_RESULT -static inline MaybeHandle<Object> ThrowRegExpException( - Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern, - Handle<String> error_text) { +static inline MaybeHandle<Object> ThrowRegExpException(Isolate* isolate, + Handle<JSRegExp> re, + Handle<String> pattern, + RegExpError error) { + Vector<const char> error_data = CStrVector(RegExpErrorString(error)); + Handle<String> error_text = + isolate->factory() + ->NewStringFromOneByte(Vector<const uint8_t>::cast(error_data)) + .ToHandleChecked(); THROW_NEW_ERROR( isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp, pattern, error_text), @@ -101,7 +108,7 @@ static inline MaybeHandle<Object> ThrowRegExpException( } inline void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re, - Handle<String> error_text) { + RegExpError error_text) { USE(ThrowRegExpException(isolate, re, Handle<String>(re->Pattern(), isolate), error_text)); } @@ -407,7 +414,7 @@ bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re, Compile(isolate, &zone, &compile_data, flags, pattern, sample_subject, is_one_byte, re->BacktrackLimit()); if (!compilation_succeeded) { - DCHECK(!compile_data.error.is_null()); + DCHECK(compile_data.error != RegExpError::kNone); ThrowRegExpException(isolate, re, compile_data.error); return false; } @@ -740,8 +747,7 @@ bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data, Handle<String> sample_subject, bool is_one_byte, uint32_t backtrack_limit) { if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { - data->error = - isolate->factory()->NewStringFromAsciiChecked("RegExp too big"); + data->error = RegExpError::kTooLarge; return false; } @@ -809,8 +815,8 @@ bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data, if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone); data->node = node; - if (const char* error_message = AnalyzeRegExp(isolate, is_one_byte, node)) { - data->error = isolate->factory()->NewStringFromAsciiChecked(error_message); + data->error = AnalyzeRegExp(isolate, is_one_byte, node); + if (data->error != RegExpError::kNone) { return false; } @@ -839,7 +845,7 @@ bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data, #elif V8_TARGET_ARCH_S390 macro_assembler.reset(new RegExpMacroAssemblerS390( isolate, zone, mode, (data->capture_count + 1) * 2)); -#elif V8_TARGET_ARCH_PPC +#elif V8_TARGET_ARCH_PPC || V8_TARGET_ARCH_PPC64 macro_assembler.reset(new RegExpMacroAssemblerPPC( isolate, zone, mode, (data->capture_count + 1) * 2)); #elif V8_TARGET_ARCH_MIPS @@ -878,8 +884,18 @@ bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data, macro_assembler->set_global_mode(mode); } + RegExpMacroAssembler* macro_assembler_ptr = macro_assembler.get(); +#ifdef DEBUG + std::unique_ptr<RegExpMacroAssembler> tracer_macro_assembler; + if (FLAG_trace_regexp_assembler) { + tracer_macro_assembler.reset( + new RegExpMacroAssemblerTracer(isolate, macro_assembler_ptr)); + macro_assembler_ptr = tracer_macro_assembler.get(); + } +#endif + RegExpCompiler::CompilationResult result = compiler.Assemble( - isolate, macro_assembler.get(), node, data->capture_count, pattern); + isolate, macro_assembler_ptr, node, data->capture_count, pattern); // Code / bytecode printing. { @@ -902,13 +918,12 @@ bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data, } } - if (result.error_message != nullptr) { + if (result.error != RegExpError::kNone) { if (FLAG_correctness_fuzzer_suppressions && - strncmp(result.error_message, "Stack overflow", 15) == 0) { + result.error == RegExpError::kStackOverflow) { FATAL("Aborting on stack overflow"); } - data->error = - isolate->factory()->NewStringFromAsciiChecked(result.error_message); + data->error = result.error; } data->code = result.code; diff --git a/deps/v8/src/regexp/regexp.h b/deps/v8/src/regexp/regexp.h index 9f3581d18e..27ccbb47ba 100644 --- a/deps/v8/src/regexp/regexp.h +++ b/deps/v8/src/regexp/regexp.h @@ -6,6 +6,7 @@ #define V8_REGEXP_REGEXP_H_ #include "src/objects/js-regexp.h" +#include "src/regexp/regexp-error.h" namespace v8 { namespace internal { @@ -42,7 +43,11 @@ struct RegExpCompileData { // The error message. Only used if an error occurred during parsing or // compilation. - Handle<String> error; + RegExpError error = RegExpError::kNone; + + // The position at which the error was detected. Only used if an + // error occurred. + int error_pos = 0; // The number of capture groups, without the global capture \0. int capture_count = 0; diff --git a/deps/v8/src/regexp/s390/regexp-macro-assembler-s390.cc b/deps/v8/src/regexp/s390/regexp-macro-assembler-s390.cc index bcef02369f..be4b85df4f 100644 --- a/deps/v8/src/regexp/s390/regexp-macro-assembler-s390.cc +++ b/deps/v8/src/regexp/s390/regexp-macro-assembler-s390.cc @@ -113,6 +113,8 @@ RegExpMacroAssemblerS390::RegExpMacroAssemblerS390(Isolate* isolate, Zone* zone, backtrack_label_(), exit_label_(), internal_failure_label_() { + masm_->set_root_array_available(false); + DCHECK_EQ(0, registers_to_save % 2); __ b(&entry_label_); // We'll write the entry code later. @@ -228,7 +230,7 @@ void RegExpMacroAssemblerS390::CheckGreedyLoop(Label* on_equal) { } void RegExpMacroAssemblerS390::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, bool unicode, Label* on_no_match) { + int start_reg, bool read_backward, Label* on_no_match) { Label fallthrough; __ LoadP(r2, register_location(start_reg)); // Index of start of // capture @@ -325,7 +327,7 @@ void RegExpMacroAssemblerS390::CheckNotBackReferenceIgnoreCase( // r2: Address byte_offset1 - Address captured substring's start. // r3: Address byte_offset2 - Address of current character position. // r4: size_t byte_length - length of capture in bytes(!) - // r5: Isolate* isolate or 0 if unicode flag. + // r5: Isolate* isolate. // Address of start of capture. __ AddP(r2, end_of_input_address()); @@ -339,14 +341,7 @@ void RegExpMacroAssemblerS390::CheckNotBackReferenceIgnoreCase( __ SubP(r3, r3, r6); } // Isolate. -#ifdef V8_INTL_SUPPORT - if (unicode) { - __ LoadImmP(r5, Operand::Zero()); - } else // NOLINT -#endif // V8_INTL_SUPPORT - { - __ mov(r5, Operand(ExternalReference::isolate_address(isolate()))); - } + __ mov(r5, Operand(ExternalReference::isolate_address(isolate()))); { AllowExternalCallThatCantCauseGC scope(masm_); diff --git a/deps/v8/src/regexp/s390/regexp-macro-assembler-s390.h b/deps/v8/src/regexp/s390/regexp-macro-assembler-s390.h index 4f79296d78..eced564d7f 100644 --- a/deps/v8/src/regexp/s390/regexp-macro-assembler-s390.h +++ b/deps/v8/src/regexp/s390/regexp-macro-assembler-s390.h @@ -36,7 +36,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerS390 virtual void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, bool unicode, + bool read_backward, Label* on_no_match); virtual void CheckNotCharacter(unsigned c, Label* on_not_equal); virtual void CheckNotCharacterAfterAnd(unsigned c, unsigned mask, diff --git a/deps/v8/src/regexp/special-case.h b/deps/v8/src/regexp/special-case.h index 1ccec5d31a..753c9231ed 100644 --- a/deps/v8/src/regexp/special-case.h +++ b/deps/v8/src/regexp/special-case.h @@ -6,70 +6,109 @@ #define V8_REGEXP_SPECIAL_CASE_H_ #ifdef V8_INTL_SUPPORT -#include "unicode/uversion.h" -namespace U_ICU_NAMESPACE { -class UnicodeSet; -} // namespace U_ICU_NAMESPACE +#include "src/base/logging.h" +#include "src/common/globals.h" + +#include "unicode/uchar.h" +#include "unicode/uniset.h" +#include "unicode/unistr.h" namespace v8 { namespace internal { -// Functions to build special sets of Unicode characters that need special -// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE). +// Sets of Unicode characters that need special handling under "i" mode + +// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262 +// defines slightly different case-folding rules than Unicode. An +// input character should match a pattern character if the result of +// the Canonicalize algorithm is the same for both characters. // -// For the characters in the "ignore set", the process should not treat other -// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case -// equivlant under the ECMA262 RegExp "i" mode because these characters are -// uppercase themselves that no other characters in the set uppercase to. +// Roughly speaking, for "i" regexps, Canonicalize(c) is the same as +// c.toUpperCase(), unless a) c.toUpperCase() is a multi-character +// string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See +// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for +// the precise definition. // -// For the characters in the "special add set", the proecess should add only -// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which is -// not uppercase characters as case equivlant under the ECMA262 RegExp "i" mode -// and also that ONE uppercase character that other non uppercase character -// uppercase into to the set. Other uppercase characters in the result of -// closeOver(USET_CASE_INSENSITIVE) should not be considered because ECMA262 -// RegExp "i" mode consider two characters as "case equivlant" if both -// characters uppercase to the same character. +// While compiling such regular expressions, we need to compute the +// set of characters that should match a given input character. (See +// GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.) +// For almost all characters, this can be efficiently computed using +// UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent +// the remaining special cases. // -// For example, consider the following case equivalent set defined by Unicode -// standard. Notice there are more than one uppercase characters in this set: -// U+212B Å Angstrom Sign - an uppercase character. -// U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character. -// U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which -// uppercase to U+00C5. -// In this case equivlant set is a special set and need special handling while -// considering "case equivlant" under the ECMA262 RegExp "i" mode which is -// different than Unicode Standard: -// * U+212B should be included into the "ignore" set because there are no other -// characters, under the ECMA262 "i" mode, are considered as "case equivlant" -// to it because U+212B is itself an uppercase but neither U+00C5 nor U+00E5 -// uppercase to U+212B. -// * U+00C5 and U+00E5 will both be included into the "special add" set. While -// calculate the "equivlant set" under ECMA262 "i" mode, the process will -// add U+00E5, because it is not an uppercase character in the set. The -// process will also add U+00C5, because it is the uppercase character which -// other non uppercase character, U+00C5, uppercase into. +// For a character c, the rules are as follows: // -// For characters not included in "ignore set" and "special add set", the -// process will just use closeOver(USET_CASE_INSENSITIVE) to calcualte, which is -// much faster. +// 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling +// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet +// containing c will produce the set of characters that should +// match /c/i (or /[c]/i), and only those characters. // -// Under Unicode 12.0, there are only 7 characters in the "special add set" and -// 4 characters in "ignore set" so even the special add process is slower, it is -// limited to a small set of cases only. +// 2. If c is in IgnoreSet, then the only character it should match is +// itself. However, closeOver will add additional incorrect +// matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ' +// (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is +// "SS". Step 3.e therefore requires that 'ß' canonicalizes to +// itself, and should not match 'ẞ'. In these cases, we can skip +// the closeOver entirely, because it will never add an equivalent +// character. // -// The implementation of these two function will be generated by calling ICU -// icu::UnicodeSet during the build time into gen/src/regexp/special-case.cc by -// the code in src/regexp/gen-regexp-special-case.cc. +// 3. If c is in SpecialAddSet, then it should match at least one +// character other than itself. However, closeOver will add at +// least one additional incorrect match. For example, consider the +// letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase +// K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN +// SIGN should not match either of the other two characters. As a +// result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in +// IgnoreSet). To find the correct matches for characters in +// SpecialAddSet, we closeOver the original character, but filter +// out the results that do not have the same canonical value. // -// These two function will be used with LazyInstance<> template to generate -// global sharable set to reduce memory usage and speed up performance. +// The contents of these sets are calculated at build time by +// src/regexp/gen-regexp-special-case.cc, which generates +// gen/src/regexp/special-case.cc. This is done by iterating over the +// result of closeOver for each BMP character, and finding sets for +// which at least one character has a different canonical value than +// another character. Characters that match no other characters in +// their equivalence class are added to IgnoreSet. Characters that +// match at least one other character are added to SpecialAddSet. + +class RegExpCaseFolding final : public AllStatic { + public: + static const icu::UnicodeSet& IgnoreSet(); + static const icu::UnicodeSet& SpecialAddSet(); + + // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics: + // Canonicalize) step 3, which is used to determine whether + // characters match when ignoreCase is true and unicode is false. + static UChar32 Canonicalize(UChar32 ch) { + // a. Assert: ch is a UTF-16 code unit. + CHECK_LE(ch, 0xffff); + + // b. Let s be the String value consisting of the single code unit ch. + icu::UnicodeString s(ch); + + // c. Let u be the same result produced as if by performing the algorithm + // for String.prototype.toUpperCase using s as the this value. + // d. Assert: Type(u) is String. + icu::UnicodeString& u = s.toUpper(); + + // e. If u does not consist of a single code unit, return ch. + if (u.length() != 1) { + return ch; + } + + // f. Let cu be u's single code unit element. + UChar32 cu = u.char32At(0); -// Function to build and return the Ignore set. -icu::UnicodeSet BuildIgnoreSet(); + // g. If the value of ch >= 128 and the value of cu < 128, return ch. + if (ch >= 128 && cu < 128) { + return ch; + } -// Function to build and return the Special Add set. -icu::UnicodeSet BuildSpecialAddSet(); + // h. Return cu. + return cu; + } +}; } // namespace internal } // namespace v8 diff --git a/deps/v8/src/regexp/x64/regexp-macro-assembler-x64.cc b/deps/v8/src/regexp/x64/regexp-macro-assembler-x64.cc index 5620c6b9ce..5edbf5e579 100644 --- a/deps/v8/src/regexp/x64/regexp-macro-assembler-x64.cc +++ b/deps/v8/src/regexp/x64/regexp-macro-assembler-x64.cc @@ -214,9 +214,8 @@ void RegExpMacroAssemblerX64::CheckGreedyLoop(Label* on_equal) { __ bind(&fallthrough); } - void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase( - int start_reg, bool read_backward, bool unicode, Label* on_no_match) { + int start_reg, bool read_backward, Label* on_no_match) { Label fallthrough; ReadPositionFromRegister(rdx, start_reg); // Offset of start of capture ReadPositionFromRegister(rbx, start_reg + 1); // Offset of end of capture @@ -321,7 +320,7 @@ void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase( // Address byte_offset1 - Address captured substring's start. // Address byte_offset2 - Address of current character position. // size_t byte_length - length of capture in bytes(!) -// Isolate* isolate or 0 if unicode flag. + // Isolate* isolate. #ifdef V8_TARGET_OS_WIN DCHECK(rcx == arg_reg_1); DCHECK(rdx == arg_reg_2); @@ -349,14 +348,7 @@ void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase( // Set byte_length. __ movq(arg_reg_3, rbx); // Isolate. -#ifdef V8_INTL_SUPPORT - if (unicode) { - __ movq(arg_reg_4, Immediate(0)); - } else // NOLINT -#endif // V8_INTL_SUPPORT - { - __ LoadAddress(arg_reg_4, ExternalReference::isolate_address(isolate())); - } + __ LoadAddress(arg_reg_4, ExternalReference::isolate_address(isolate())); { // NOLINT: Can't find a way to open this scope without confusing the // linter. @@ -388,7 +380,6 @@ void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase( __ bind(&fallthrough); } - void RegExpMacroAssemblerX64::CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match) { diff --git a/deps/v8/src/regexp/x64/regexp-macro-assembler-x64.h b/deps/v8/src/regexp/x64/regexp-macro-assembler-x64.h index 0bf1c2e150..64614e228a 100644 --- a/deps/v8/src/regexp/x64/regexp-macro-assembler-x64.h +++ b/deps/v8/src/regexp/x64/regexp-macro-assembler-x64.h @@ -37,7 +37,6 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerX64 void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match) override; void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward, - bool unicode, Label* on_no_match) override; void CheckNotCharacter(uint32_t c, Label* on_not_equal) override; void CheckNotCharacterAfterAnd(uint32_t c, uint32_t mask, |