summary | refs | log | tree | commit | diff
path: root/deps/v8/src/regexp
diff options
context:
space:
mode:
author: Michaël Zasso <targos@protonmail.com>, 2020-05-05 09:19:02 +0200
committer: Michaël Zasso <targos@protonmail.com>, 2020-05-12 16:12:13 +0200
commit: 1d6adf7432defeb39b751a19c68335e8afb0d8ee (patch)
tree: 7ab67931110b8d9db770d774c7a6d0d14c976c15 /deps/v8/src/regexp
parent: aee36a04475a20c13663d1037aa6f175ff368bc7 (diff)
download: node-new-1d6adf7432defeb39b751a19c68335e8afb0d8ee.tar.gz
deps: update V8 to 8.3.110.9
PR-URL: https://github.com/nodejs/node/pull/32831
Reviewed-By: Anna Henningsen <anna@addaleax.net>
Reviewed-By: Michaël Zasso <targos@protonmail.com>
Reviewed-By: Jiawen Geng <technicalcute@gmail.com>
Reviewed-By: Colin Ihrig <cjihrig@gmail.com>
Diffstat (limited to 'deps/v8/src/regexp')
-rw-r--r-- deps/v8/src/regexp/arm/regexp-macro-assembler-arm.cc | 17
-rw-r--r-- deps/v8/src/regexp/arm/regexp-macro-assembler-arm.h | 2
-rw-r--r-- deps/v8/src/regexp/arm64/regexp-macro-assembler-arm64.cc | 32
-rw-r--r-- deps/v8/src/regexp/arm64/regexp-macro-assembler-arm64.h | 3
-rw-r--r-- deps/v8/src/regexp/gen-regexp-special-case.cc | 150
-rw-r--r-- deps/v8/src/regexp/ia32/regexp-macro-assembler-ia32.cc | 17
-rw-r--r-- deps/v8/src/regexp/ia32/regexp-macro-assembler-ia32.h | 2
-rw-r--r-- deps/v8/src/regexp/mips/regexp-macro-assembler-mips.cc | 17
-rw-r--r-- deps/v8/src/regexp/mips/regexp-macro-assembler-mips.h | 2
-rw-r--r-- deps/v8/src/regexp/mips64/regexp-macro-assembler-mips64.cc | 17
-rw-r--r-- deps/v8/src/regexp/mips64/regexp-macro-assembler-mips64.h | 2
-rw-r--r-- deps/v8/src/regexp/ppc/regexp-macro-assembler-ppc.cc | 21
-rw-r--r-- deps/v8/src/regexp/ppc/regexp-macro-assembler-ppc.h | 2
-rw-r--r-- deps/v8/src/regexp/regexp-ast.h | 29
-rw-r--r-- deps/v8/src/regexp/regexp-bytecode-generator.cc | 8
-rw-r--r-- deps/v8/src/regexp/regexp-bytecode-generator.h | 2
-rw-r--r-- deps/v8/src/regexp/regexp-bytecode-peephole.cc | 1
-rw-r--r-- deps/v8/src/regexp/regexp-bytecodes.h | 8
-rw-r--r-- deps/v8/src/regexp/regexp-compiler-tonode.cc | 114
-rw-r--r-- deps/v8/src/regexp/regexp-compiler.cc | 115
-rw-r--r-- deps/v8/src/regexp/regexp-compiler.h | 16
-rw-r--r-- deps/v8/src/regexp/regexp-error.cc | 22
-rw-r--r-- deps/v8/src/regexp/regexp-error.h | 58
-rw-r--r-- deps/v8/src/regexp/regexp-interpreter.cc | 45
-rw-r--r-- deps/v8/src/regexp/regexp-macro-assembler-arch.h | 2
-rw-r--r-- deps/v8/src/regexp/regexp-macro-assembler-tracer.cc | 10
-rw-r--r-- deps/v8/src/regexp/regexp-macro-assembler-tracer.h | 1
-rw-r--r-- deps/v8/src/regexp/regexp-macro-assembler.cc | 48
-rw-r--r-- deps/v8/src/regexp/regexp-macro-assembler.h | 10
-rw-r--r-- deps/v8/src/regexp/regexp-parser.cc | 142
-rw-r--r-- deps/v8/src/regexp/regexp-parser.h | 20
-rw-r--r-- deps/v8/src/regexp/regexp-stack.h | 6
-rw-r--r-- deps/v8/src/regexp/regexp.cc | 45
-rw-r--r-- deps/v8/src/regexp/regexp.h | 7
-rw-r--r-- deps/v8/src/regexp/s390/regexp-macro-assembler-s390.cc | 15
-rw-r--r-- deps/v8/src/regexp/s390/regexp-macro-assembler-s390.h | 2
-rw-r--r-- deps/v8/src/regexp/special-case.h | 141
-rw-r--r-- deps/v8/src/regexp/x64/regexp-macro-assembler-x64.cc | 15
-rw-r--r-- deps/v8/src/regexp/x64/regexp-macro-assembler-x64.h | 1
39 files changed, 583 insertions, 584 deletions
diff --git a/deps/v8/src/regexp/arm/regexp-macro-assembler-arm.cc b/deps/v8/src/regexp/arm/regexp-macro-assembler-arm.cc
index 8f9da563a9..03dac337e0 100644
--- a/deps/v8/src/regexp/arm/regexp-macro-assembler-arm.cc
+++ b/deps/v8/src/regexp/arm/regexp-macro-assembler-arm.cc
@@ -110,6 +110,8 @@ RegExpMacroAssemblerARM::RegExpMacroAssemblerARM(Isolate* isolate, Zone* zone,
success_label_(),
backtrack_label_(),
exit_label_() {
+ masm_->set_root_array_available(false);
+
DCHECK_EQ(0, registers_to_save % 2);
__ jmp(&entry_label_); // We'll write the entry code later.
__ bind(&start_label_); // And then continue from here.
@@ -221,9 +223,8 @@ void RegExpMacroAssemblerARM::CheckGreedyLoop(Label* on_equal) {
BranchOrBacktrack(eq, on_equal);
}
-
void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
+ int start_reg, bool read_backward, Label* on_no_match) {
Label fallthrough;
__ ldr(r0, register_location(start_reg)); // Index of start of capture
__ ldr(r1, register_location(start_reg + 1)); // Index of end of capture
@@ -315,7 +316,7 @@ void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase(
// r0: Address byte_offset1 - Address captured substring's start.
// r1: Address byte_offset2 - Address of current character position.
// r2: size_t byte_length - length of capture in bytes(!)
- // r3: Isolate* isolate or 0 if unicode flag.
+ // r3: Isolate* isolate.
// Address of start of capture.
__ add(r0, r0, Operand(end_of_input_address()));
@@ -329,14 +330,7 @@ void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase(
__ sub(r1, r1, r4);
}
// Isolate.
-#ifdef V8_INTL_SUPPORT
- if (unicode) {
- __ mov(r3, Operand(0));
- } else // NOLINT
-#endif // V8_INTL_SUPPORT
- {
- __ mov(r3, Operand(ExternalReference::isolate_address(isolate())));
- }
+ __ mov(r3, Operand(ExternalReference::isolate_address(isolate())));
{
AllowExternalCallThatCantCauseGC scope(masm_);
@@ -360,7 +354,6 @@ void RegExpMacroAssemblerARM::CheckNotBackReferenceIgnoreCase(
__ bind(&fallthrough);
}
-
void RegExpMacroAssemblerARM::CheckNotBackReference(int start_reg,
bool read_backward,
Label* on_no_match) {
diff --git a/deps/v8/src/regexp/arm/regexp-macro-assembler-arm.h b/deps/v8/src/regexp/arm/regexp-macro-assembler-arm.h
index 6320913f4c..22628fb760 100644
--- a/deps/v8/src/regexp/arm/regexp-macro-assembler-arm.h
+++ b/deps/v8/src/regexp/arm/regexp-macro-assembler-arm.h
@@ -37,7 +37,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerARM
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward, bool unicode,
+ bool read_backward,
Label* on_no_match);
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(unsigned c,
diff --git a/deps/v8/src/regexp/arm64/regexp-macro-assembler-arm64.cc b/deps/v8/src/regexp/arm64/regexp-macro-assembler-arm64.cc
index 56658819b1..43a6bdf912 100644
--- a/deps/v8/src/regexp/arm64/regexp-macro-assembler-arm64.cc
+++ b/deps/v8/src/regexp/arm64/regexp-macro-assembler-arm64.cc
@@ -120,10 +120,14 @@ RegExpMacroAssemblerARM64::RegExpMacroAssemblerARM64(Isolate* isolate,
success_label_(),
backtrack_label_(),
exit_label_() {
+ masm_->set_root_array_available(false);
+
DCHECK_EQ(0, registers_to_save % 2);
// We can cache at most 16 W registers in x0-x7.
STATIC_ASSERT(kNumCachedRegisters <= 16);
STATIC_ASSERT((kNumCachedRegisters % 2) == 0);
+ __ CallTarget();
+
__ B(&entry_label_); // We'll write the entry code later.
__ Bind(&start_label_); // And then continue from here.
}
@@ -212,6 +216,9 @@ void RegExpMacroAssemblerARM64::Bind(Label* label) {
__ Bind(label);
}
+void RegExpMacroAssemblerARM64::BindJumpTarget(Label* label) {
+ __ BindJumpTarget(label);
+}
void RegExpMacroAssemblerARM64::CheckCharacter(uint32_t c, Label* on_equal) {
CompareAndBranchOrBacktrack(current_character(), c, eq, on_equal);
@@ -286,9 +293,8 @@ void RegExpMacroAssemblerARM64::CheckGreedyLoop(Label* on_equal) {
BranchOrBacktrack(eq, on_equal);
}
-
void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
+ int start_reg, bool read_backward, Label* on_no_match) {
Label fallthrough;
Register capture_start_offset = w10;
@@ -402,7 +408,7 @@ void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase(
// x0: Address byte_offset1 - Address captured substring's start.
// x1: Address byte_offset2 - Address of current character position.
// w2: size_t byte_length - length of capture in bytes(!)
- // x3: Isolate* isolate or 0 if unicode flag
+ // x3: Isolate* isolate.
// Address of start of capture.
__ Add(x0, input_end(), Operand(capture_start_offset, SXTW));
@@ -414,14 +420,7 @@ void RegExpMacroAssemblerARM64::CheckNotBackReferenceIgnoreCase(
__ Sub(x1, x1, Operand(capture_length, SXTW));
}
// Isolate.
-#ifdef V8_INTL_SUPPORT
- if (unicode) {
- __ Mov(x3, Operand(0));
- } else // NOLINT
-#endif // V8_INTL_SUPPORT
- {
- __ Mov(x3, ExternalReference::isolate_address(isolate()));
- }
+ __ Mov(x3, ExternalReference::isolate_address(isolate()));
{
AllowExternalCallThatCantCauseGC scope(masm_);
@@ -737,10 +736,11 @@ Handle<HeapObject> RegExpMacroAssemblerARM64::GetCode(Handle<String> source) {
CPURegList argument_registers(x0, x5, x6, x7);
CPURegList registers_to_retain = kCalleeSaved;
- DCHECK_EQ(11, kCalleeSaved.Count());
+ registers_to_retain.Combine(fp);
registers_to_retain.Combine(lr);
- __ PushCPURegList(registers_to_retain);
+ DCHECK(registers_to_retain.IncludesAliasOf(lr));
+ __ PushCPURegList<TurboAssembler::kSignLR>(registers_to_retain);
__ PushCPURegList(argument_registers);
// Set frame pointer in place.
@@ -1035,7 +1035,7 @@ Handle<HeapObject> RegExpMacroAssemblerARM64::GetCode(Handle<String> source) {
__ Mov(sp, fp);
// Restore registers.
- __ PopCPURegList(registers_to_retain);
+ __ PopCPURegList<TurboAssembler::kAuthLR>(registers_to_retain);
__ Ret();
@@ -1585,14 +1585,14 @@ void RegExpMacroAssemblerARM64::CallIf(Label* to, Condition condition) {
void RegExpMacroAssemblerARM64::RestoreLinkRegister() {
- __ Pop(lr, xzr);
+ __ Pop<TurboAssembler::kAuthLR>(padreg, lr);
__ Add(lr, lr, Operand(masm_->CodeObject()));
}
void RegExpMacroAssemblerARM64::SaveLinkRegister() {
__ Sub(lr, lr, Operand(masm_->CodeObject()));
- __ Push(xzr, lr);
+ __ Push<TurboAssembler::kSignLR>(lr, padreg);
}
diff --git a/deps/v8/src/regexp/arm64/regexp-macro-assembler-arm64.h b/deps/v8/src/regexp/arm64/regexp-macro-assembler-arm64.h
index cee9e2c97e..91b5e90bf5 100644
--- a/deps/v8/src/regexp/arm64/regexp-macro-assembler-arm64.h
+++ b/deps/v8/src/regexp/arm64/regexp-macro-assembler-arm64.h
@@ -42,7 +42,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerARM64
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward, bool unicode,
+ bool read_backward,
Label* on_no_match);
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(unsigned c,
@@ -65,6 +65,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerARM64
virtual void CheckPosition(int cp_offset, Label* on_outside_input);
virtual bool CheckSpecialCharacterClass(uc16 type,
Label* on_no_match);
+ virtual void BindJumpTarget(Label* label = nullptr);
virtual void Fail();
virtual Handle<HeapObject> GetCode(Handle<String> source);
virtual void GoTo(Label* label);
diff --git a/deps/v8/src/regexp/gen-regexp-special-case.cc b/deps/v8/src/regexp/gen-regexp-special-case.cc
index 8aace6ab88..9606c5d70d 100644
--- a/deps/v8/src/regexp/gen-regexp-special-case.cc
+++ b/deps/v8/src/regexp/gen-regexp-special-case.cc
@@ -1,4 +1,4 @@
-// Copyright 2019 the V8 project authors. All rights reserved.
+// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
@@ -7,19 +7,19 @@
#include <iostream>
#include <sstream>
-#include "src/base/logging.h"
-#include "unicode/uchar.h"
-#include "unicode/uniset.h"
+#include "src/regexp/special-case.h"
namespace v8 {
namespace internal {
-// The following code generates BuildSpecialAddSet() and BuildIgnoreSet()
-// functions into "src/regexp/special-case.cc".
-// See more details in http://shorturl.at/adfO5
-void PrintSet(std::ofstream& out, const char* func_name,
+static const uc32 kSurrogateStart = 0xd800;
+static const uc32 kSurrogateEnd = 0xdfff;
+static const uc32 kNonBmpStart = 0x10000;
+
+// The following code generates "src/regexp/special-case.cc".
+void PrintSet(std::ofstream& out, const char* name,
const icu::UnicodeSet& set) {
- out << "icu::UnicodeSet " << func_name << "() {\n"
+ out << "icu::UnicodeSet Build" << name << "() {\n"
<< " icu::UnicodeSet set;\n";
for (int32_t i = 0; i < set.getRangeCount(); i++) {
if (set.getRangeStart(i) == set.getRangeEnd(i)) {
@@ -31,73 +31,113 @@ void PrintSet(std::ofstream& out, const char* func_name,
}
out << " set.freeze();\n"
<< " return set;\n"
- << "}\n";
+ << "}\n\n";
+
+ out << "struct " << name << "Data {\n"
+ << " " << name << "Data() : set(Build" << name << "()) {}\n"
+ << " const icu::UnicodeSet set;\n"
+ << "};\n\n";
+
+ out << "//static\n"
+ << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
+ << " static base::LazyInstance<" << name << "Data>::type set =\n"
+ << " LAZY_INSTANCE_INITIALIZER;\n"
+ << " return set.Pointer()->set;\n"
+ << "}\n\n";
}
void PrintSpecial(std::ofstream& out) {
icu::UnicodeSet current;
- icu::UnicodeSet processed(0xd800, 0xdbff); // Ignore surrogate range.
icu::UnicodeSet special_add;
icu::UnicodeSet ignore;
UErrorCode status = U_ZERO_ERROR;
icu::UnicodeSet upper("[\\p{Lu}]", status);
CHECK(U_SUCCESS(status));
- // Iterate through all chars in BMP except ASCII and Surrogate.
- for (UChar32 i = 0x80; i < 0x010000; i++) {
- // Ignore those characters which is already processed.
- if (!processed.contains(i)) {
- current.set(i, i);
- current.closeOver(USET_CASE_INSENSITIVE);
- // Remember we already processed current.
- processed.addAll(current);
-
- // All uppercase characters in current.
- icu::UnicodeSet keep_upper(current);
- keep_upper.retainAll(upper);
-
- // Check if we have more than one uppercase character in current.
- // If there are more than one uppercase character, then it is a special
- // set which need to be added into either "Special Add" set or "Ignore"
- // set.
- int32_t number_of_upper = 0;
- for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) {
- number_of_upper +=
- keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1;
+ // Iterate through all chars in BMP except surrogates.
+ for (UChar32 i = 0; i < kNonBmpStart; i++) {
+ if (i >= kSurrogateStart && i <= kSurrogateEnd) {
+ continue; // Ignore surrogate range
+ }
+ current.set(i, i);
+ current.closeOver(USET_CASE_INSENSITIVE);
+
+ // Check to see if all characters in the case-folding equivalence
+ // class as defined by UnicodeSet::closeOver all map to the same
+ // canonical value.
+ UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
+ bool class_has_matching_canonical_char = false;
+ bool class_has_non_matching_canonical_char = false;
+ for (int32_t j = 0; j < current.getRangeCount(); j++) {
+ for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
+ c++) {
+ if (c == i) {
+ continue;
+ }
+ UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
+ if (canonical == other_canonical) {
+ class_has_matching_canonical_char = true;
+ } else {
+ class_has_non_matching_canonical_char = true;
+ }
+ }
+ }
+ // If any other character in i's equivalence class has a
+ // different canonical value, then i needs special handling. If
+ // no other character shares a canonical value with i, we can
+ // ignore i when adding alternatives for case-independent
+ // comparison. If at least one other character shares a
+ // canonical value, then i needs special handling.
+ if (class_has_non_matching_canonical_char) {
+ if (class_has_matching_canonical_char) {
+ special_add.add(i);
+ } else {
+ ignore.add(i);
}
- if (number_of_upper > 1) {
- // Add all non uppercase characters (could be Ll or Mn) to special add
- // set.
- current.removeAll(upper);
- special_add.addAll(current);
-
- // Add the uppercase characters of non uppercase character to
- // special add set.
- CHECK_GT(current.getRangeCount(), 0);
- UChar32 main_upper = u_toupper(current.getRangeStart(0));
- special_add.add(main_upper);
-
- // Add all uppercase except the main upper to ignore set.
- keep_upper.remove(main_upper);
- ignore.addAll(keep_upper);
+ }
+ }
+
+ // Verify that no Unicode equivalence class contains two non-trivial
+ // JS equivalence classes. Every character in SpecialAddSet has the
+ // same canonical value as every other non-IgnoreSet character in
+ // its Unicode equivalence class. Therefore, if we call closeOver on
+ // a set containing no IgnoreSet characters, the only characters
+ // that must be removed from the result are in IgnoreSet. This fact
+ // is used in CharacterRange::AddCaseEquivalents.
+ for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
+ for (UChar32 c = special_add.getRangeStart(i);
+ c <= special_add.getRangeEnd(i); c++) {
+ UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
+ current.set(c, c);
+ current.closeOver(USET_CASE_INSENSITIVE);
+ current.removeAll(ignore);
+ for (int32_t j = 0; j < current.getRangeCount(); j++) {
+ for (UChar32 c2 = current.getRangeStart(j);
+ c2 <= current.getRangeEnd(j); c2++) {
+ CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
+ }
}
}
}
- // Remove any ASCII
- special_add.remove(0x0000, 0x007f);
- PrintSet(out, "BuildIgnoreSet", ignore);
- PrintSet(out, "BuildSpecialAddSet", special_add);
+ PrintSet(out, "IgnoreSet", ignore);
+ PrintSet(out, "SpecialAddSet", special_add);
}
void WriteHeader(const char* header_filename) {
std::ofstream out(header_filename);
out << std::hex << std::setfill('0') << std::setw(4);
-
- out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n"
- << "// The following functions are used to build icu::UnicodeSet\n"
- << "// for specical cases different between Unicode and ECMA262.\n"
+ out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
+ << "// Use of this source code is governed by a BSD-style license that\n"
+ << "// can be found in the LICENSE file.\n\n"
+ << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
+ << "// The following functions are used to build UnicodeSets\n"
+ << "// for special cases where the case-folding algorithm used by\n"
+ << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
+ << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
+ << "// Semantics: Canonicalize) step 3.\n\n"
<< "#ifdef V8_INTL_SUPPORT\n"
+ << "#include \"src/base/lazy-instance.h\"\n\n"
<< "#include \"src/regexp/special-case.h\"\n\n"
<< "#include \"unicode/uniset.h\"\n"
<< "namespace v8 {\n"
diff --git a/deps/v8/src/regexp/ia32/regexp-macro-assembler-ia32.cc b/deps/v8/src/regexp/ia32/regexp-macro-assembler-ia32.cc
index f9015287f9..7f6bd5e296 100644
--- a/deps/v8/src/regexp/ia32/regexp-macro-assembler-ia32.cc
+++ b/deps/v8/src/regexp/ia32/regexp-macro-assembler-ia32.cc
@@ -205,9 +205,8 @@ void RegExpMacroAssemblerIA32::CheckGreedyLoop(Label* on_equal) {
__ bind(&fallthrough);
}
-
void RegExpMacroAssemblerIA32::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
+ int start_reg, bool read_backward, Label* on_no_match) {
Label fallthrough;
__ mov(edx, register_location(start_reg)); // Index of start of capture
__ mov(ebx, register_location(start_reg + 1)); // Index of end of capture
@@ -314,18 +313,11 @@ void RegExpMacroAssemblerIA32::CheckNotBackReferenceIgnoreCase(
// Address byte_offset1 - Address captured substring's start.
// Address byte_offset2 - Address of current character position.
// size_t byte_length - length of capture in bytes(!)
-// Isolate* isolate or 0 if unicode flag.
+ // Isolate* isolate.
// Set isolate.
-#ifdef V8_INTL_SUPPORT
- if (unicode) {
- __ mov(Operand(esp, 3 * kSystemPointerSize), Immediate(0));
- } else // NOLINT
-#endif // V8_INTL_SUPPORT
- {
- __ mov(Operand(esp, 3 * kSystemPointerSize),
- Immediate(ExternalReference::isolate_address(isolate())));
- }
+ __ mov(Operand(esp, 3 * kSystemPointerSize),
+ Immediate(ExternalReference::isolate_address(isolate())));
// Set byte_length.
__ mov(Operand(esp, 2 * kSystemPointerSize), ebx);
// Set byte_offset2.
@@ -366,7 +358,6 @@ void RegExpMacroAssemblerIA32::CheckNotBackReferenceIgnoreCase(
__ bind(&fallthrough);
}
-
void RegExpMacroAssemblerIA32::CheckNotBackReference(int start_reg,
bool read_backward,
Label* on_no_match) {
diff --git a/deps/v8/src/regexp/ia32/regexp-macro-assembler-ia32.h b/deps/v8/src/regexp/ia32/regexp-macro-assembler-ia32.h
index b2c6fab7b3..f68dd0b1b7 100644
--- a/deps/v8/src/regexp/ia32/regexp-macro-assembler-ia32.h
+++ b/deps/v8/src/regexp/ia32/regexp-macro-assembler-ia32.h
@@ -37,7 +37,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerIA32
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward, bool unicode,
+ bool read_backward,
Label* on_no_match);
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c,
diff --git a/deps/v8/src/regexp/mips/regexp-macro-assembler-mips.cc b/deps/v8/src/regexp/mips/regexp-macro-assembler-mips.cc
index 1e7839c219..e3f2ea6292 100644
--- a/deps/v8/src/regexp/mips/regexp-macro-assembler-mips.cc
+++ b/deps/v8/src/regexp/mips/regexp-macro-assembler-mips.cc
@@ -106,6 +106,8 @@ RegExpMacroAssemblerMIPS::RegExpMacroAssemblerMIPS(Isolate* isolate, Zone* zone,
backtrack_label_(),
exit_label_(),
internal_failure_label_() {
+ masm_->set_root_array_available(false);
+
DCHECK_EQ(0, registers_to_save % 2);
__ jmp(&entry_label_); // We'll write the entry code later.
// If the code gets too big or corrupted, an internal exception will be
@@ -223,9 +225,8 @@ void RegExpMacroAssemblerMIPS::CheckGreedyLoop(Label* on_equal) {
BranchOrBacktrack(on_equal, eq, current_input_offset(), Operand(a0));
}
-
void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
+ int start_reg, bool read_backward, Label* on_no_match) {
Label fallthrough;
__ lw(a0, register_location(start_reg)); // Index of start of capture.
__ lw(a1, register_location(start_reg + 1)); // Index of end of capture.
@@ -320,7 +321,7 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
// a0: Address byte_offset1 - Address captured substring's start.
// a1: Address byte_offset2 - Address of current character position.
// a2: size_t byte_length - length of capture in bytes(!).
- // a3: Isolate* isolate or 0 if unicode flag.
+ // a3: Isolate* isolate.
// Address of start of capture.
__ Addu(a0, a0, Operand(end_of_input_address()));
@@ -334,14 +335,7 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
__ Subu(a1, a1, Operand(s3));
}
// Isolate.
-#ifdef V8_INTL_SUPPORT
- if (unicode) {
- __ mov(a3, zero_reg);
- } else // NOLINT
-#endif // V8_INTL_SUPPORT
- {
- __ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate())));
- }
+ __ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate())));
{
AllowExternalCallThatCantCauseGC scope(masm_);
@@ -368,7 +362,6 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
__ bind(&fallthrough);
}
-
void RegExpMacroAssemblerMIPS::CheckNotBackReference(int start_reg,
bool read_backward,
Label* on_no_match) {
diff --git a/deps/v8/src/regexp/mips/regexp-macro-assembler-mips.h b/deps/v8/src/regexp/mips/regexp-macro-assembler-mips.h
index 9281b0174d..5733bbe046 100644
--- a/deps/v8/src/regexp/mips/regexp-macro-assembler-mips.h
+++ b/deps/v8/src/regexp/mips/regexp-macro-assembler-mips.h
@@ -37,7 +37,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerMIPS
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward, bool unicode,
+ bool read_backward,
Label* on_no_match);
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c,
diff --git a/deps/v8/src/regexp/mips64/regexp-macro-assembler-mips64.cc b/deps/v8/src/regexp/mips64/regexp-macro-assembler-mips64.cc
index 3dd1548685..fc3cad8b0e 100644
--- a/deps/v8/src/regexp/mips64/regexp-macro-assembler-mips64.cc
+++ b/deps/v8/src/regexp/mips64/regexp-macro-assembler-mips64.cc
@@ -142,6 +142,8 @@ RegExpMacroAssemblerMIPS::RegExpMacroAssemblerMIPS(Isolate* isolate, Zone* zone,
backtrack_label_(),
exit_label_(),
internal_failure_label_() {
+ masm_->set_root_array_available(false);
+
DCHECK_EQ(0, registers_to_save % 2);
__ jmp(&entry_label_); // We'll write the entry code later.
// If the code gets too big or corrupted, an internal exception will be
@@ -259,9 +261,8 @@ void RegExpMacroAssemblerMIPS::CheckGreedyLoop(Label* on_equal) {
BranchOrBacktrack(on_equal, eq, current_input_offset(), Operand(a0));
}
-
void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
+ int start_reg, bool read_backward, Label* on_no_match) {
Label fallthrough;
__ Ld(a0, register_location(start_reg)); // Index of start of capture.
__ Ld(a1, register_location(start_reg + 1)); // Index of end of capture.
@@ -356,7 +357,7 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
// a0: Address byte_offset1 - Address captured substring's start.
// a1: Address byte_offset2 - Address of current character position.
// a2: size_t byte_length - length of capture in bytes(!).
- // a3: Isolate* isolate or 0 if unicode flag.
+ // a3: Isolate* isolate.
// Address of start of capture.
__ Daddu(a0, a0, Operand(end_of_input_address()));
@@ -370,14 +371,7 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
__ Dsubu(a1, a1, Operand(s3));
}
// Isolate.
-#ifdef V8_INTL_SUPPORT
- if (unicode) {
- __ mov(a3, zero_reg);
- } else // NOLINT
-#endif // V8_INTL_SUPPORT
- {
- __ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate())));
- }
+ __ li(a3, Operand(ExternalReference::isolate_address(masm_->isolate())));
{
AllowExternalCallThatCantCauseGC scope(masm_);
@@ -404,7 +398,6 @@ void RegExpMacroAssemblerMIPS::CheckNotBackReferenceIgnoreCase(
__ bind(&fallthrough);
}
-
void RegExpMacroAssemblerMIPS::CheckNotBackReference(int start_reg,
bool read_backward,
Label* on_no_match) {
diff --git a/deps/v8/src/regexp/mips64/regexp-macro-assembler-mips64.h b/deps/v8/src/regexp/mips64/regexp-macro-assembler-mips64.h
index bc7f83e6e9..b267297c24 100644
--- a/deps/v8/src/regexp/mips64/regexp-macro-assembler-mips64.h
+++ b/deps/v8/src/regexp/mips64/regexp-macro-assembler-mips64.h
@@ -37,7 +37,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerMIPS
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward, bool unicode,
+ bool read_backward,
Label* on_no_match);
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c,
diff --git a/deps/v8/src/regexp/ppc/regexp-macro-assembler-ppc.cc b/deps/v8/src/regexp/ppc/regexp-macro-assembler-ppc.cc
index 50bf71e6d5..376103324a 100644
--- a/deps/v8/src/regexp/ppc/regexp-macro-assembler-ppc.cc
+++ b/deps/v8/src/regexp/ppc/regexp-macro-assembler-ppc.cc
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
-#if V8_TARGET_ARCH_PPC
+#if V8_TARGET_ARCH_PPC || V8_TARGET_ARCH_PPC64
#include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
@@ -111,6 +111,8 @@ RegExpMacroAssemblerPPC::RegExpMacroAssemblerPPC(Isolate* isolate, Zone* zone,
backtrack_label_(),
exit_label_(),
internal_failure_label_() {
+ masm_->set_root_array_available(false);
+
DCHECK_EQ(0, registers_to_save % 2);
@@ -123,7 +125,6 @@ RegExpMacroAssemblerPPC::RegExpMacroAssemblerPPC(Isolate* isolate, Zone* zone,
__ bind(&start_label_); // And then continue from here.
}
-
RegExpMacroAssemblerPPC::~RegExpMacroAssemblerPPC() {
delete masm_;
// Unuse labels in case we throw away the assembler without calling GetCode.
@@ -241,7 +242,7 @@ void RegExpMacroAssemblerPPC::CheckGreedyLoop(Label* on_equal) {
}
void RegExpMacroAssemblerPPC::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
+ int start_reg, bool read_backward, Label* on_no_match) {
Label fallthrough;
__ LoadP(r3, register_location(start_reg), r0); // Index of start of capture
__ LoadP(r4, register_location(start_reg + 1), r0); // Index of end
@@ -336,7 +337,7 @@ void RegExpMacroAssemblerPPC::CheckNotBackReferenceIgnoreCase(
// r3: Address byte_offset1 - Address captured substring's start.
// r4: Address byte_offset2 - Address of current character position.
// r5: size_t byte_length - length of capture in bytes(!)
- // r6: Isolate* isolate or 0 if unicode flag.
+ // r6: Isolate* isolate.
// Address of start of capture.
__ add(r3, r3, end_of_input_address());
@@ -350,14 +351,7 @@ void RegExpMacroAssemblerPPC::CheckNotBackReferenceIgnoreCase(
__ sub(r4, r4, r25);
}
// Isolate.
-#ifdef V8_INTL_SUPPORT
- if (unicode) {
- __ li(r6, Operand::Zero());
- } else // NOLINT
-#endif // V8_INTL_SUPPORT
- {
- __ mov(r6, Operand(ExternalReference::isolate_address(isolate())));
- }
+ __ mov(r6, Operand(ExternalReference::isolate_address(isolate())));
{
AllowExternalCallThatCantCauseGC scope(masm_);
@@ -381,7 +375,6 @@ void RegExpMacroAssemblerPPC::CheckNotBackReferenceIgnoreCase(
__ bind(&fallthrough);
}
-
void RegExpMacroAssemblerPPC::CheckNotBackReference(int start_reg,
bool read_backward,
Label* on_no_match) {
@@ -1371,4 +1364,4 @@ void RegExpMacroAssemblerPPC::LoadCurrentCharacterUnchecked(int cp_offset,
} // namespace internal
} // namespace v8
-#endif // V8_TARGET_ARCH_PPC
+#endif // V8_TARGET_ARCH_PPC || V8_TARGET_ARCH_PPC64
diff --git a/deps/v8/src/regexp/ppc/regexp-macro-assembler-ppc.h b/deps/v8/src/regexp/ppc/regexp-macro-assembler-ppc.h
index c726a5f0d7..3e64f139a8 100644
--- a/deps/v8/src/regexp/ppc/regexp-macro-assembler-ppc.h
+++ b/deps/v8/src/regexp/ppc/regexp-macro-assembler-ppc.h
@@ -36,7 +36,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerPPC
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward, bool unicode,
+ bool read_backward,
Label* on_no_match);
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(unsigned c, unsigned mask,
diff --git a/deps/v8/src/regexp/regexp-ast.h b/deps/v8/src/regexp/regexp-ast.h
index 3de29512ea..a9106d3d30 100644
--- a/deps/v8/src/regexp/regexp-ast.h
+++ b/deps/v8/src/regexp/regexp-ast.h
@@ -463,7 +463,11 @@ class RegExpQuantifier final : public RegExpTree {
class RegExpCapture final : public RegExpTree {
public:
explicit RegExpCapture(int index)
- : body_(nullptr), index_(index), name_(nullptr) {}
+ : body_(nullptr),
+ index_(index),
+ min_match_(0),
+ max_match_(0),
+ name_(nullptr) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
static RegExpNode* ToNode(RegExpTree* body, int index,
@@ -473,10 +477,14 @@ class RegExpCapture final : public RegExpTree {
bool IsAnchoredAtEnd() override;
Interval CaptureRegisters() override;
bool IsCapture() override;
- int min_match() override { return body_->min_match(); }
- int max_match() override { return body_->max_match(); }
+ int min_match() override { return min_match_; }
+ int max_match() override { return max_match_; }
RegExpTree* body() { return body_; }
- void set_body(RegExpTree* body) { body_ = body; }
+ void set_body(RegExpTree* body) {
+ body_ = body;
+ min_match_ = body->min_match();
+ max_match_ = body->max_match();
+ }
int index() const { return index_; }
const ZoneVector<uc16>* name() const { return name_; }
void set_name(const ZoneVector<uc16>* name) { name_ = name; }
@@ -486,12 +494,17 @@ class RegExpCapture final : public RegExpTree {
private:
RegExpTree* body_;
int index_;
+ int min_match_;
+ int max_match_;
const ZoneVector<uc16>* name_;
};
class RegExpGroup final : public RegExpTree {
public:
- explicit RegExpGroup(RegExpTree* body) : body_(body) {}
+ explicit RegExpGroup(RegExpTree* body)
+ : body_(body),
+ min_match_(body->min_match()),
+ max_match_(body->max_match()) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) override {
@@ -501,13 +514,15 @@ class RegExpGroup final : public RegExpTree {
bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); }
bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); }
bool IsGroup() override;
- int min_match() override { return body_->min_match(); }
- int max_match() override { return body_->max_match(); }
+ int min_match() override { return min_match_; }
+ int max_match() override { return max_match_; }
Interval CaptureRegisters() override { return body_->CaptureRegisters(); }
RegExpTree* body() { return body_; }
private:
RegExpTree* body_;
+ int min_match_;
+ int max_match_;
};
class RegExpLookaround final : public RegExpTree {
diff --git a/deps/v8/src/regexp/regexp-bytecode-generator.cc b/deps/v8/src/regexp/regexp-bytecode-generator.cc
index 0dcc288d3c..e82b67b530 100644
--- a/deps/v8/src/regexp/regexp-bytecode-generator.cc
+++ b/deps/v8/src/regexp/regexp-bytecode-generator.cc
@@ -329,13 +329,11 @@ void RegExpBytecodeGenerator::CheckNotBackReference(int start_reg,
}
void RegExpBytecodeGenerator::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, bool unicode, Label* on_not_equal) {
+ int start_reg, bool read_backward, Label* on_not_equal) {
DCHECK_LE(0, start_reg);
DCHECK_GE(kMaxRegister, start_reg);
- Emit(read_backward ? (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD
- : BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD)
- : (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE
- : BC_CHECK_NOT_BACK_REF_NO_CASE),
+ Emit(read_backward ? BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD
+ : BC_CHECK_NOT_BACK_REF_NO_CASE,
start_reg);
EmitOrLink(on_not_equal);
}
diff --git a/deps/v8/src/regexp/regexp-bytecode-generator.h b/deps/v8/src/regexp/regexp-bytecode-generator.h
index dfcc2ca5f8..85073cc99d 100644
--- a/deps/v8/src/regexp/regexp-bytecode-generator.h
+++ b/deps/v8/src/regexp/regexp-bytecode-generator.h
@@ -69,7 +69,7 @@ class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward, bool unicode,
+ bool read_backward,
Label* on_no_match);
virtual void IfRegisterLT(int register_index, int comparand, Label* if_lt);
virtual void IfRegisterGE(int register_index, int comparand, Label* if_ge);
diff --git a/deps/v8/src/regexp/regexp-bytecode-peephole.cc b/deps/v8/src/regexp/regexp-bytecode-peephole.cc
index 8f1f1d95a9..f0957f0779 100644
--- a/deps/v8/src/regexp/regexp-bytecode-peephole.cc
+++ b/deps/v8/src/regexp/regexp-bytecode-peephole.cc
@@ -436,7 +436,6 @@ BytecodeArgumentMapping BytecodeSequenceNode::ArgumentMapping(
size_t index) const {
DCHECK(IsSequence());
DCHECK(argument_mapping_ != nullptr);
- DCHECK_GE(index, 0);
DCHECK_LT(index, argument_mapping_->size());
return argument_mapping_->at(index);
diff --git a/deps/v8/src/regexp/regexp-bytecodes.h b/deps/v8/src/regexp/regexp-bytecodes.h
index e25945d0a0..1664a476d2 100644
--- a/deps/v8/src/regexp/regexp-bytecodes.h
+++ b/deps/v8/src/regexp/regexp-bytecodes.h
@@ -101,12 +101,12 @@ STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK);
V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \
V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \
V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \
- V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \
- V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \
- V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) \
+ V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \
+ V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \
+ V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) /* UNUSED */ \
V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */ \
- V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) \
+ V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) /* UNUSED */ \
V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */ \
V(CHECK_REGISTER_LT, 44, 12) /* bc8 reg_idx24 value32 addr32 */ \
V(CHECK_REGISTER_GE, 45, 12) /* bc8 reg_idx24 value32 addr32 */ \
diff --git a/deps/v8/src/regexp/regexp-compiler-tonode.cc b/deps/v8/src/regexp/regexp-compiler-tonode.cc
index 2d86d3ea9e..40ecee0f91 100644
--- a/deps/v8/src/regexp/regexp-compiler-tonode.cc
+++ b/deps/v8/src/regexp/regexp-compiler-tonode.cc
@@ -1140,39 +1140,6 @@ Vector<const int> CharacterRange::GetWordBounds() {
return Vector<const int>(kWordRanges, kWordRangeCount - 1);
}
-#ifdef V8_INTL_SUPPORT
-struct IgnoreSet {
- IgnoreSet() : set(BuildIgnoreSet()) {}
- const icu::UnicodeSet set;
-};
-
-struct SpecialAddSet {
- SpecialAddSet() : set(BuildSpecialAddSet()) {}
- const icu::UnicodeSet set;
-};
-
-icu::UnicodeSet BuildAsciiAToZSet() {
- icu::UnicodeSet set('a', 'z');
- set.add('A', 'Z');
- set.freeze();
- return set;
-}
-
-struct AsciiAToZSet {
- AsciiAToZSet() : set(BuildAsciiAToZSet()) {}
- const icu::UnicodeSet set;
-};
-
-static base::LazyInstance<IgnoreSet>::type ignore_set =
- LAZY_INSTANCE_INITIALIZER;
-
-static base::LazyInstance<SpecialAddSet>::type special_add_set =
- LAZY_INSTANCE_INITIALIZER;
-
-static base::LazyInstance<AsciiAToZSet>::type ascii_a_to_z_set =
- LAZY_INSTANCE_INITIALIZER;
-#endif // V8_INTL_SUPPORT
-
// static
void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
ZoneList<CharacterRange>* ranges,
@@ -1195,75 +1162,22 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
others.add(from, to);
}
- // Set of characters already added to ranges that do not need to be added
- // again.
+ // Compute the set of additional characters that should be added,
+ // using UnicodeSet::closeOver. ECMA 262 defines slightly different
+ // case-folding rules than Unicode, so some characters that are
+ // added by closeOver do not match anything other than themselves in
+ // JS. For example, 'ſ' (U+017F LATIN SMALL LETTER LONG S) is the
+ // same case-insensitive character as 's' or 'S' according to
+ // Unicode, but does not match any other character in JS. To handle
+ // this case, we add such characters to the IgnoreSet and filter
+ // them out. We filter twice: once before calling closeOver (to
+ // prevent 'ſ' from adding 's'), and once after calling closeOver
+ // (to prevent 's' from adding 'ſ'). See regexp/special-case.h for
+ // more information.
icu::UnicodeSet already_added(others);
-
- // Set of characters in ranges that are in the 52 ASCII characters [a-zA-Z].
- icu::UnicodeSet in_ascii_a_to_z(others);
- in_ascii_a_to_z.retainAll(ascii_a_to_z_set.Pointer()->set);
-
- // Remove all chars in [a-zA-Z] from others.
- others.removeAll(in_ascii_a_to_z);
-
- // Set of characters in ranges that are overlapping with special add set.
- icu::UnicodeSet in_special_add(others);
- in_special_add.retainAll(special_add_set.Pointer()->set);
-
- others.removeAll(in_special_add);
-
- // Ignore all chars in ignore set.
- others.removeAll(ignore_set.Pointer()->set);
-
- // For most of the chars in ranges that is still in others, find the case
- // equivlant set by calling closeOver(USET_CASE_INSENSITIVE).
+ others.removeAll(RegExpCaseFolding::IgnoreSet());
others.closeOver(USET_CASE_INSENSITIVE);
-
- // Because closeOver(USET_CASE_INSENSITIVE) may add ASCII [a-zA-Z] to others,
- // but ECMA262 "i" mode won't consider that, remove them from others.
- // Ex: U+017F add 'S' and 's' to others.
- others.removeAll(ascii_a_to_z_set.Pointer()->set);
-
- // Special handling for in_ascii_a_to_z.
- for (int32_t i = 0; i < in_ascii_a_to_z.getRangeCount(); i++) {
- UChar32 start = in_ascii_a_to_z.getRangeStart(i);
- UChar32 end = in_ascii_a_to_z.getRangeEnd(i);
- // Check if it is uppercase A-Z by checking bit 6.
- if (start & 0x0020) {
- // Add the lowercases
- others.add(start & 0x005F, end & 0x005F);
- } else {
- // Add the uppercases
- others.add(start | 0x0020, end | 0x0020);
- }
- }
-
- // Special handling for chars in "Special Add" set.
- for (int32_t i = 0; i < in_special_add.getRangeCount(); i++) {
- UChar32 end = in_special_add.getRangeEnd(i);
- for (UChar32 ch = in_special_add.getRangeStart(i); ch <= end; ch++) {
- // Add the uppercase of this character if itself is not an uppercase
- // character.
- // Note: The if condiction cannot be u_islower(ch) because ch could be
- // neither uppercase nor lowercase but Mn.
- if (!u_isupper(ch)) {
- others.add(u_toupper(ch));
- }
- icu::UnicodeSet candidates(ch, ch);
- candidates.closeOver(USET_CASE_INSENSITIVE);
- for (int32_t j = 0; j < candidates.getRangeCount(); j++) {
- UChar32 end2 = candidates.getRangeEnd(j);
- for (UChar32 ch2 = candidates.getRangeStart(j); ch2 <= end2; ch2++) {
- // Add character that is not uppercase to others.
- if (!u_isupper(ch2)) {
- others.add(ch2);
- }
- }
- }
- }
- }
-
- // Remove all characters which already in the ranges.
+ others.removeAll(RegExpCaseFolding::IgnoreSet());
others.removeAll(already_added);
// Add others to the ranges
diff --git a/deps/v8/src/regexp/regexp-compiler.cc b/deps/v8/src/regexp/regexp-compiler.cc
index d141f3c490..a6c7cdbe2f 100644
--- a/deps/v8/src/regexp/regexp-compiler.cc
+++ b/deps/v8/src/regexp/regexp-compiler.cc
@@ -8,7 +8,9 @@
#include "src/execution/isolate.h"
#include "src/objects/objects-inl.h"
#include "src/regexp/regexp-macro-assembler-arch.h"
-#include "src/regexp/regexp-macro-assembler-tracer.h"
+#ifdef V8_INTL_SUPPORT
+#include "src/regexp/special-case.h"
+#endif // V8_INTL_SUPPORT
#include "src/strings/unicode-inl.h"
#include "src/zone/zone-list-inl.h"
@@ -242,20 +244,15 @@ RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
RegExpCompiler::CompilationResult RegExpCompiler::Assemble(
Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
int capture_count, Handle<String> pattern) {
-#ifdef DEBUG
- if (FLAG_trace_regexp_assembler)
- macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler);
- else
-#endif
- macro_assembler_ = macro_assembler;
+ macro_assembler_ = macro_assembler;
- std::vector<RegExpNode*> work_list;
+ ZoneVector<RegExpNode*> work_list(zone());
work_list_ = &work_list;
Label fail;
macro_assembler_->PushBacktrack(&fail);
Trace new_trace;
start->Emit(this, &new_trace);
- macro_assembler_->Bind(&fail);
+ macro_assembler_->BindJumpTarget(&fail);
macro_assembler_->Fail();
while (!work_list.empty()) {
RegExpNode* node = work_list.back();
@@ -269,14 +266,9 @@ RegExpCompiler::CompilationResult RegExpCompiler::Assemble(
}
Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
- isolate->IncreaseTotalRegexpCodeGenerated(code->Size());
+ isolate->IncreaseTotalRegexpCodeGenerated(code);
work_list_ = nullptr;
-#ifdef DEBUG
- if (FLAG_trace_regexp_assembler) {
- delete macro_assembler_;
- }
-#endif
return {*code, next_register_};
}
@@ -562,7 +554,7 @@ void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
}
// On backtrack we need to restore state.
- assembler->Bind(&undo);
+ assembler->BindJumpTarget(&undo);
RestoreAffectedRegisters(assembler, max_register, registers_to_pop,
registers_to_clear);
if (backtrack() == nullptr) {
@@ -725,32 +717,34 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
unibrow::uchar* letters,
int letter_length) {
#ifdef V8_INTL_SUPPORT
- // Special case for U+017F which has upper case in ASCII range.
- if (character == 0x017f) {
+ if (RegExpCaseFolding::IgnoreSet().contains(character)) {
letters[0] = character;
return 1;
}
+ bool in_special_add_set =
+ RegExpCaseFolding::SpecialAddSet().contains(character);
+
icu::UnicodeSet set;
set.add(character);
set = set.closeOver(USET_CASE_INSENSITIVE);
+
+ UChar32 canon = 0;
+ if (in_special_add_set) {
+ canon = RegExpCaseFolding::Canonicalize(character);
+ }
+
int32_t range_count = set.getRangeCount();
int items = 0;
for (int32_t i = 0; i < range_count; i++) {
UChar32 start = set.getRangeStart(i);
UChar32 end = set.getRangeEnd(i);
CHECK(end - start + items <= letter_length);
- // Only add to the output if character is not in ASCII range
- // or the case equivalent character is in ASCII range.
- // #sec-runtime-semantics-canonicalize-ch
- // 3.g If the numeric value of ch ≥ 128 and the numeric value of cu < 128,
- // return ch.
- if (!((start >= 128) && (character < 128))) {
- // No range have start and end span across code point 128.
- DCHECK((start >= 128) == (end >= 128));
- for (UChar32 cu = start; cu <= end; cu++) {
- if (one_byte_subject && cu > String::kMaxOneByteCharCode) break;
- letters[items++] = (unibrow::uchar)(cu);
+ for (UChar32 cu = start; cu <= end; cu++) {
+ if (one_byte_subject && cu > String::kMaxOneByteCharCode) break;
+ if (in_special_add_set && RegExpCaseFolding::Canonicalize(cu) != canon) {
+ continue;
}
+ letters[items++] = (unibrow::uchar)(cu);
}
}
return items;
@@ -857,10 +851,6 @@ static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
return false;
}
-using EmitCharacterFunction = bool(Isolate* isolate, RegExpCompiler* compiler,
- uc16 c, Label* on_failure, int cp_offset,
- bool check, bool preloaded);
-
// Only emits letters (things that have case). Only used for case independent
// matches.
static inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler,
@@ -1848,13 +1838,13 @@ RegExpNode* TextNode::FilterOneByte(int depth) {
if (elm.text_type() == TextElement::ATOM) {
Vector<const uc16> quarks = elm.atom()->data();
for (int j = 0; j < quarks.length(); j++) {
- uint16_t c = quarks[j];
+ uc16 c = quarks[j];
if (elm.atom()->ignore_case()) {
c = unibrow::Latin1::TryConvertToLatin1(c);
}
if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
// Replace quark in case we converted to Latin-1.
- uint16_t* writable_quarks = const_cast<uint16_t*>(quarks.begin());
+ uc16* writable_quarks = const_cast<uc16*>(quarks.begin());
writable_quarks[j] = c;
}
} else {
@@ -2309,7 +2299,6 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
if (first_element_checked && i == 0 && j == 0) continue;
if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
- EmitCharacterFunction* emit_function = nullptr;
uc16 quark = quarks[j];
if (elm.atom()->ignore_case()) {
// Everywhere else we assume that a non-Latin-1 character cannot match
@@ -2317,6 +2306,9 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
// invalid by using the Latin1 equivalent instead.
quark = unibrow::Latin1::TryConvertToLatin1(quark);
}
+ bool needs_bounds_check =
+ *checked_up_to < cp_offset + j || read_backward();
+ bool bounds_checked = false;
switch (pass) {
case NON_LATIN1_MATCH:
DCHECK(one_byte);
@@ -2326,24 +2318,24 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
}
break;
case NON_LETTER_CHARACTER_MATCH:
- emit_function = &EmitAtomNonLetter;
+ bounds_checked =
+ EmitAtomNonLetter(isolate, compiler, quark, backtrack,
+ cp_offset + j, needs_bounds_check, preloaded);
break;
case SIMPLE_CHARACTER_MATCH:
- emit_function = &EmitSimpleCharacter;
+ bounds_checked = EmitSimpleCharacter(isolate, compiler, quark,
+ backtrack, cp_offset + j,
+ needs_bounds_check, preloaded);
break;
case CASE_CHARACTER_MATCH:
- emit_function = &EmitAtomLetter;
+ bounds_checked =
+ EmitAtomLetter(isolate, compiler, quark, backtrack,
+ cp_offset + j, needs_bounds_check, preloaded);
break;
default:
break;
}
- if (emit_function != nullptr) {
- bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
- bool bound_checked =
- emit_function(isolate, compiler, quark, backtrack, cp_offset + j,
- bounds_check, preloaded);
- if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
- }
+ if (bounds_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
}
} else {
DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
@@ -3429,8 +3421,8 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
DCHECK_EQ(start_reg_ + 1, end_reg_);
if (IgnoreCase(flags_)) {
- assembler->CheckNotBackReferenceIgnoreCase(
- start_reg_, read_backward(), IsUnicode(flags_), trace->backtrack());
+ assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
+ trace->backtrack());
} else {
assembler->CheckNotBackReference(start_reg_, read_backward(),
trace->backtrack());
@@ -3602,12 +3594,17 @@ template <typename... Propagators>
class Analysis : public NodeVisitor {
public:
Analysis(Isolate* isolate, bool is_one_byte)
- : isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {}
+ : isolate_(isolate),
+ is_one_byte_(is_one_byte),
+ error_(RegExpError::kNone) {}
void EnsureAnalyzed(RegExpNode* that) {
StackLimitCheck check(isolate());
if (check.HasOverflowed()) {
- fail("Stack overflow");
+ if (FLAG_correctness_fuzzer_suppressions) {
+ FATAL("Analysis: Aborting on stack overflow");
+ }
+ fail(RegExpError::kAnalysisStackOverflow);
return;
}
if (that->info()->been_analyzed || that->info()->being_analyzed) return;
@@ -3617,12 +3614,12 @@ class Analysis : public NodeVisitor {
that->info()->been_analyzed = true;
}
- bool has_failed() { return error_message_ != nullptr; }
- const char* error_message() {
- DCHECK(error_message_ != nullptr);
- return error_message_;
+ bool has_failed() { return error_ != RegExpError::kNone; }
+ RegExpError error() {
+ DCHECK(error_ != RegExpError::kNone);
+ return error_;
}
- void fail(const char* error_message) { error_message_ = error_message; }
+ void fail(RegExpError error) { error_ = error; }
Isolate* isolate() const { return isolate_; }
@@ -3707,19 +3704,19 @@ class Analysis : public NodeVisitor {
private:
Isolate* isolate_;
bool is_one_byte_;
- const char* error_message_;
+ RegExpError error_;
DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
};
-const char* AnalyzeRegExp(Isolate* isolate, bool is_one_byte,
+RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte,
RegExpNode* node) {
Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis(isolate,
is_one_byte);
DCHECK_EQ(node->info()->been_analyzed, false);
analysis.EnsureAnalyzed(node);
- DCHECK_IMPLIES(analysis.has_failed(), analysis.error_message() != nullptr);
- return analysis.has_failed() ? analysis.error_message() : nullptr;
+ DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone);
+ return analysis.has_failed() ? analysis.error() : RegExpError::kNone;
}
void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
diff --git a/deps/v8/src/regexp/regexp-compiler.h b/deps/v8/src/regexp/regexp-compiler.h
index 2de221f35d..d083d5d9dd 100644
--- a/deps/v8/src/regexp/regexp-compiler.h
+++ b/deps/v8/src/regexp/regexp-compiler.h
@@ -423,10 +423,7 @@ struct PreloadState {
// Analysis performs assertion propagation and computes eats_at_least_ values.
// See the comments on AssertionPropagator and EatsAtLeastPropagator for more
// details.
-//
-// This method returns nullptr on success or a null-terminated failure message
-// on failure.
-const char* AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node);
+RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node);
class FrequencyCollator {
public:
@@ -503,18 +500,17 @@ class RegExpCompiler {
}
struct CompilationResult final {
- explicit CompilationResult(const char* error_message)
- : error_message(error_message) {}
+ explicit CompilationResult(RegExpError err) : error(err) {}
CompilationResult(Object code, int registers)
: code(code), num_registers(registers) {}
static CompilationResult RegExpTooBig() {
- return CompilationResult("RegExp too big");
+ return CompilationResult(RegExpError::kTooLarge);
}
- bool Succeeded() const { return error_message == nullptr; }
+ bool Succeeded() const { return error == RegExpError::kNone; }
- const char* const error_message = nullptr;
+ const RegExpError error = RegExpError::kNone;
Object code;
int num_registers = 0;
};
@@ -576,7 +572,7 @@ class RegExpCompiler {
int next_register_;
int unicode_lookaround_stack_register_;
int unicode_lookaround_position_register_;
- std::vector<RegExpNode*>* work_list_;
+ ZoneVector<RegExpNode*>* work_list_;
int recursion_depth_;
RegExpMacroAssembler* macro_assembler_;
bool one_byte_;
diff --git a/deps/v8/src/regexp/regexp-error.cc b/deps/v8/src/regexp/regexp-error.cc
new file mode 100644
index 0000000000..d7763c64f8
--- /dev/null
+++ b/deps/v8/src/regexp/regexp-error.cc
@@ -0,0 +1,22 @@
+// Copyright 2020 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "src/regexp/regexp-error.h"
+
+namespace v8 {
+namespace internal {
+
+const char* const kRegExpErrorStrings[] = {
+#define TEMPLATE(NAME, STRING) STRING,
+ REGEXP_ERROR_MESSAGES(TEMPLATE)
+#undef TEMPLATE
+};
+
+const char* RegExpErrorString(RegExpError error) {
+ DCHECK_LT(error, RegExpError::NumErrors);
+ return kRegExpErrorStrings[static_cast<int>(error)];
+}
+
+} // namespace internal
+} // namespace v8
diff --git a/deps/v8/src/regexp/regexp-error.h b/deps/v8/src/regexp/regexp-error.h
new file mode 100644
index 0000000000..6145b404ab
--- /dev/null
+++ b/deps/v8/src/regexp/regexp-error.h
@@ -0,0 +1,58 @@
+// Copyright 2020 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_ERROR_H_
+#define V8_REGEXP_REGEXP_ERROR_H_
+
+#include "src/base/logging.h"
+#include "src/base/macros.h"
+
+namespace v8 {
+namespace internal {
+
+#define REGEXP_ERROR_MESSAGES(T) \
+ T(None, "") \
+ T(StackOverflow, "Maximum call stack size exceeded") \
+ T(AnalysisStackOverflow, "Stack overflow") \
+ T(TooLarge, "Regular expression too large") \
+ T(UnterminatedGroup, "Unterminated group") \
+ T(UnmatchedParen, "Unmatched ')'") \
+ T(EscapeAtEndOfPattern, "\\ at end of pattern") \
+ T(InvalidPropertyName, "Invalid property name") \
+ T(InvalidEscape, "Invalid escape") \
+ T(InvalidDecimalEscape, "Invalid decimal escape") \
+ T(InvalidUnicodeEscape, "Invalid Unicode escape") \
+ T(NothingToRepeat, "Nothing to repeat") \
+ T(LoneQuantifierBrackets, "Lone quantifier brackets") \
+ T(RangeOutOfOrder, "numbers out of order in {} quantifier") \
+ T(IncompleteQuantifier, "Incomplete quantifier") \
+ T(InvalidQuantifier, "Invalid quantifier") \
+ T(InvalidGroup, "Invalid group") \
+ T(MultipleFlagDashes, "Multiple dashes in flag group") \
+ T(RepeatedFlag, "Repeated flag in flag group") \
+ T(InvalidFlagGroup, "Invalid flag group") \
+ T(TooManyCaptures, "Too many captures") \
+ T(InvalidCaptureGroupName, "Invalid capture group name") \
+ T(DuplicateCaptureGroupName, "Duplicate capture group name") \
+ T(InvalidNamedReference, "Invalid named reference") \
+ T(InvalidNamedCaptureReference, "Invalid named capture referenced") \
+ T(InvalidClassEscape, "Invalid class escape") \
+ T(InvalidClassPropertyName, "Invalid property name in character class") \
+ T(InvalidCharacterClass, "Invalid character class") \
+ T(UnterminatedCharacterClass, "Unterminated character class") \
+ T(OutOfOrderCharacterClass, "Range out of order in character class")
+
+enum class RegExpError : uint32_t {
+#define TEMPLATE(NAME, STRING) k##NAME,
+ REGEXP_ERROR_MESSAGES(TEMPLATE)
+#undef TEMPLATE
+ NumErrors
+};
+
+V8_EXPORT_PRIVATE const char* RegExpErrorString(RegExpError error);
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_REGEXP_REGEXP_ERROR_H_
diff --git a/deps/v8/src/regexp/regexp-interpreter.cc b/deps/v8/src/regexp/regexp-interpreter.cc
index a74df90c1d..d3efa65bf1 100644
--- a/deps/v8/src/regexp/regexp-interpreter.cc
+++ b/deps/v8/src/regexp/regexp-interpreter.cc
@@ -35,18 +35,18 @@ namespace internal {
namespace {
bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len,
- Vector<const uc16> subject, bool unicode) {
+ Vector<const uc16> subject) {
Address offset_a =
reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(from)));
Address offset_b =
reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(current)));
size_t length = len * kUC16Size;
- return RegExpMacroAssembler::CaseInsensitiveCompareUC16(
- offset_a, offset_b, length, unicode ? nullptr : isolate) == 1;
+ return RegExpMacroAssembler::CaseInsensitiveCompareUC16(offset_a, offset_b,
+ length, isolate) == 1;
}
bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len,
- Vector<const uint8_t> subject, bool unicode) {
+ Vector<const uint8_t> subject) {
// For Latin1 characters the unicode flag makes no difference.
for (int i = 0; i < len; i++) {
unsigned int old_char = subject[from++];
@@ -747,26 +747,14 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
DISPATCH();
}
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE) {
- int from = registers[insn >> BYTECODE_SHIFT];
- int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
- if (from >= 0 && len > 0) {
- if (current + len > subject.length() ||
- !BackRefMatchesNoCase(isolate, from, current, len, subject, true)) {
- SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
- DISPATCH();
- }
- current += len;
- }
- ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE);
- DISPATCH();
+ UNREACHABLE(); // TODO(jgruber): Remove this unused bytecode.
}
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) {
int from = registers[insn >> BYTECODE_SHIFT];
int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
if (from >= 0 && len > 0) {
if (current + len > subject.length() ||
- !BackRefMatchesNoCase(isolate, from, current, len, subject,
- false)) {
+ !BackRefMatchesNoCase(isolate, from, current, len, subject)) {
SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
DISPATCH();
}
@@ -776,27 +764,14 @@ IrregexpInterpreter::Result RawMatch(Isolate* isolate, ByteArray code_array,
DISPATCH();
}
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD) {
- int from = registers[insn >> BYTECODE_SHIFT];
- int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
- if (from >= 0 && len > 0) {
- if (current - len < 0 ||
- !BackRefMatchesNoCase(isolate, from, current - len, len, subject,
- true)) {
- SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
- DISPATCH();
- }
- current -= len;
- }
- ADVANCE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD);
- DISPATCH();
+ UNREACHABLE(); // TODO(jgruber): Remove this unused bytecode.
}
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) {
int from = registers[insn >> BYTECODE_SHIFT];
int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
if (from >= 0 && len > 0) {
if (current - len < 0 ||
- !BackRefMatchesNoCase(isolate, from, current - len, len, subject,
- false)) {
+ !BackRefMatchesNoCase(isolate, from, current - len, len, subject)) {
SET_PC_FROM_OFFSET(Load32Aligned(pc + 4));
DISPATCH();
}
@@ -1029,6 +1004,8 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
}
}
+#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
+
// This method is called through an external reference from RegExpExecInternal
// builtin.
IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs(
@@ -1076,6 +1053,8 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs(
return result;
}
+#endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
+
IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromRuntime(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject_string,
int* registers, int registers_length, int start_position) {
diff --git a/deps/v8/src/regexp/regexp-macro-assembler-arch.h b/deps/v8/src/regexp/regexp-macro-assembler-arch.h
index 2dc6739e42..8ec12a0ae6 100644
--- a/deps/v8/src/regexp/regexp-macro-assembler-arch.h
+++ b/deps/v8/src/regexp/regexp-macro-assembler-arch.h
@@ -15,7 +15,7 @@
#include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
#elif V8_TARGET_ARCH_ARM
#include "src/regexp/arm/regexp-macro-assembler-arm.h"
-#elif V8_TARGET_ARCH_PPC
+#elif V8_TARGET_ARCH_PPC || V8_TARGET_ARCH_PPC64
#include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
#elif V8_TARGET_ARCH_MIPS
#include "src/regexp/mips/regexp-macro-assembler-mips.h"
diff --git a/deps/v8/src/regexp/regexp-macro-assembler-tracer.cc b/deps/v8/src/regexp/regexp-macro-assembler-tracer.cc
index 5dca04a18c..0a12201743 100644
--- a/deps/v8/src/regexp/regexp-macro-assembler-tracer.cc
+++ b/deps/v8/src/regexp/regexp-macro-assembler-tracer.cc
@@ -351,17 +351,15 @@ void RegExpMacroAssemblerTracer::CheckNotBackReference(int start_reg,
assembler_->CheckNotBackReference(start_reg, read_backward, on_no_match);
}
-
void RegExpMacroAssemblerTracer::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
- PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s %s, label[%08x]);\n",
+ int start_reg, bool read_backward, Label* on_no_match) {
+ PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s, label[%08x]);\n",
start_reg, read_backward ? "backward" : "forward",
- unicode ? "unicode" : "non-unicode", LabelToInt(on_no_match));
- assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward, unicode,
+ LabelToInt(on_no_match));
+ assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward,
on_no_match);
}
-
void RegExpMacroAssemblerTracer::CheckPosition(int cp_offset,
Label* on_outside_input) {
PrintF(" CheckPosition(cp_offset=%d, label[%08x]);\n", cp_offset,
diff --git a/deps/v8/src/regexp/regexp-macro-assembler-tracer.h b/deps/v8/src/regexp/regexp-macro-assembler-tracer.h
index 2a44146e73..b6ad63071f 100644
--- a/deps/v8/src/regexp/regexp-macro-assembler-tracer.h
+++ b/deps/v8/src/regexp/regexp-macro-assembler-tracer.h
@@ -33,7 +33,6 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match) override;
void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward,
- bool unicode,
Label* on_no_match) override;
void CheckNotCharacter(unsigned c, Label* on_not_equal) override;
void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with,
diff --git a/deps/v8/src/regexp/regexp-macro-assembler.cc b/deps/v8/src/regexp/regexp-macro-assembler.cc
index 30a9955dc3..3ac1bb7f57 100644
--- a/deps/v8/src/regexp/regexp-macro-assembler.cc
+++ b/deps/v8/src/regexp/regexp-macro-assembler.cc
@@ -6,6 +6,7 @@
#include "src/codegen/assembler.h"
#include "src/execution/isolate-inl.h"
+#include "src/execution/pointer-authentication.h"
#include "src/execution/simulator.h"
#include "src/regexp/regexp-stack.h"
#include "src/strings/unicode-inl.h"
@@ -114,34 +115,7 @@ bool NativeRegExpMacroAssembler::CanReadUnaligned() {
return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
}
-const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
- String subject, int start_index, const DisallowHeapAllocation& no_gc) {
- if (subject.IsConsString()) {
- subject = ConsString::cast(subject).first();
- } else if (subject.IsSlicedString()) {
- start_index += SlicedString::cast(subject).offset();
- subject = SlicedString::cast(subject).parent();
- }
- if (subject.IsThinString()) {
- subject = ThinString::cast(subject).actual();
- }
- DCHECK_LE(0, start_index);
- DCHECK_LE(start_index, subject.length());
- if (subject.IsSeqOneByteString()) {
- return reinterpret_cast<const byte*>(
- SeqOneByteString::cast(subject).GetChars(no_gc) + start_index);
- } else if (subject.IsSeqTwoByteString()) {
- return reinterpret_cast<const byte*>(
- SeqTwoByteString::cast(subject).GetChars(no_gc) + start_index);
- } else if (subject.IsExternalOneByteString()) {
- return reinterpret_cast<const byte*>(
- ExternalOneByteString::cast(subject).GetChars() + start_index);
- } else {
- DCHECK(subject.IsExternalTwoByteString());
- return reinterpret_cast<const byte*>(
- ExternalTwoByteString::cast(subject).GetChars() + start_index);
- }
-}
+#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
// This method may only be called after an interrupt.
int NativeRegExpMacroAssembler::CheckStackGuardState(
@@ -149,9 +123,10 @@ int NativeRegExpMacroAssembler::CheckStackGuardState(
Address* return_address, Code re_code, Address* subject,
const byte** input_start, const byte** input_end) {
DisallowHeapAllocation no_gc;
+ Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0);
+ DCHECK_LE(re_code.raw_instruction_start(), old_pc);
+ DCHECK_LE(old_pc, re_code.raw_instruction_end());
- DCHECK(re_code.raw_instruction_start() <= *return_address);
- DCHECK(*return_address <= re_code.raw_instruction_end());
StackLimitCheck check(isolate);
bool js_has_overflowed = check.JsHasOverflowed();
@@ -193,9 +168,11 @@ int NativeRegExpMacroAssembler::CheckStackGuardState(
}
if (*code_handle != re_code) { // Return address no longer valid
- intptr_t delta = code_handle->address() - re_code.address();
// Overwrite the return address on the stack.
- *return_address += delta;
+ intptr_t delta = code_handle->address() - re_code.address();
+ Address new_pc = old_pc + delta;
+ // TODO(v8:10026): avoid replacing a signed pointer.
+ PointerAuthentication::ReplacePC(return_address, new_pc, 0);
}
// If we continue, we need to update the subject string addresses.
@@ -210,8 +187,7 @@ int NativeRegExpMacroAssembler::CheckStackGuardState(
} else {
*subject = subject_handle->ptr();
intptr_t byte_length = *input_end - *input_start;
- *input_start =
- StringCharacterPosition(*subject_handle, start_index, no_gc);
+ *input_start = subject_handle->AddressOfCharacterAt(start_index, no_gc);
*input_end = *input_start + byte_length;
}
}
@@ -259,7 +235,7 @@ int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp,
DisallowHeapAllocation no_gc;
const byte* input_start =
- StringCharacterPosition(subject_ptr, start_offset + slice_offset, no_gc);
+ subject_ptr.AddressOfCharacterAt(start_offset + slice_offset, no_gc);
int byte_length = char_length << char_size_shift;
const byte* input_end = input_start + byte_length;
return Execute(*subject, start_offset, input_start, input_end, offsets_vector,
@@ -305,6 +281,8 @@ int NativeRegExpMacroAssembler::Execute(
return result;
}
+#endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
+
// clang-format off
const byte NativeRegExpMacroAssembler::word_character_map[] = {
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
diff --git a/deps/v8/src/regexp/regexp-macro-assembler.h b/deps/v8/src/regexp/regexp-macro-assembler.h
index bda7e5cce1..e83446cdc9 100644
--- a/deps/v8/src/regexp/regexp-macro-assembler.h
+++ b/deps/v8/src/regexp/regexp-macro-assembler.h
@@ -87,7 +87,7 @@ class RegExpMacroAssembler {
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match) = 0;
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward, bool unicode,
+ bool read_backward,
Label* on_no_match) = 0;
// Check the current character for a match with a literal character. If we
// fail to match then goto the on_failure label. End of input always
@@ -122,6 +122,11 @@ class RegExpMacroAssembler {
// not have custom support.
// May clobber the current loaded character.
virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match);
+
+ // Control-flow integrity:
+ // Define a jump target and bind a label.
+ virtual void BindJumpTarget(Label* label) { Bind(label); }
+
virtual void Fail() = 0;
virtual Handle<HeapObject> GetCode(Handle<String> source) = 0;
virtual void GoTo(Label* label) = 0;
@@ -246,9 +251,6 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
static Address GrowStack(Address stack_pointer, Address* stack_top,
Isolate* isolate);
- static const byte* StringCharacterPosition(
- String subject, int start_index, const DisallowHeapAllocation& no_gc);
-
static int CheckStackGuardState(Isolate* isolate, int start_index,
RegExp::CallOrigin call_origin,
Address* return_address, Code re_code,
diff --git a/deps/v8/src/regexp/regexp-parser.cc b/deps/v8/src/regexp/regexp-parser.cc
index 951f815374..3c1115414f 100644
--- a/deps/v8/src/regexp/regexp-parser.cc
+++ b/deps/v8/src/regexp/regexp-parser.cc
@@ -24,11 +24,10 @@
namespace v8 {
namespace internal {
-RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
- JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
+RegExpParser::RegExpParser(FlatStringReader* in, JSRegExp::Flags flags,
+ Isolate* isolate, Zone* zone)
: isolate_(isolate),
zone_(zone),
- error_(error),
captures_(nullptr),
named_captures_(nullptr),
named_back_references_(nullptr),
@@ -81,13 +80,12 @@ void RegExpParser::Advance() {
if (FLAG_correctness_fuzzer_suppressions) {
FATAL("Aborting on stack overflow");
}
- ReportError(CStrVector(
- MessageFormatter::TemplateString(MessageTemplate::kStackOverflow)));
+ ReportError(RegExpError::kStackOverflow);
} else if (zone()->excess_allocation()) {
if (FLAG_correctness_fuzzer_suppressions) {
FATAL("Aborting on excess zone allocation");
}
- ReportError(CStrVector("Regular expression too large"));
+ ReportError(RegExpError::kTooLarge);
} else {
current_ = ReadNext<true>();
}
@@ -139,15 +137,12 @@ bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
return false;
}
-
-RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
+RegExpTree* RegExpParser::ReportError(RegExpError error) {
if (failed_) return nullptr; // Do not overwrite any existing error.
failed_ = true;
- *error_ = isolate()
- ->factory()
- ->NewStringFromOneByte(Vector<const uint8_t>::cast(message))
- .ToHandleChecked();
- // Zip to the end to make sure the no more input is read.
+ error_ = error;
+ error_pos_ = position();
+ // Zip to the end to make sure no more input is read.
current_ = kEndMarker;
next_pos_ = in()->length();
return nullptr;
@@ -194,14 +189,14 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case kEndMarker:
if (state->IsSubexpression()) {
// Inside a parenthesized group when hitting end of input.
- return ReportError(CStrVector("Unterminated group"));
+ return ReportError(RegExpError::kUnterminatedGroup);
}
DCHECK_EQ(INITIAL, state->group_type());
// Parsing completed successfully.
return builder->ToRegExp();
case ')': {
if (!state->IsSubexpression()) {
- return ReportError(CStrVector("Unmatched ')'"));
+ return ReportError(RegExpError::kUnmatchedParen);
}
DCHECK_NE(INITIAL, state->group_type());
@@ -252,7 +247,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '*':
case '+':
case '?':
- return ReportError(CStrVector("Nothing to repeat"));
+ return ReportError(RegExpError::kNothingToRepeat);
case '^': {
Advance();
if (builder->multiline()) {
@@ -307,7 +302,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '\\':
switch (Next()) {
case kEndMarker:
- return ReportError(CStrVector("\\ at end of pattern"));
+ return ReportError(RegExpError::kEscapeAtEndOfPattern);
case 'b':
Advance(2);
builder->AddAssertion(new (zone()) RegExpAssertion(
@@ -347,7 +342,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {
if (unicode()) {
ZoneList<CharacterRange>* ranges =
new (zone()) ZoneList<CharacterRange>(2, zone());
- std::vector<char> name_1, name_2;
+ ZoneVector<char> name_1(zone());
+ ZoneVector<char> name_2(zone());
if (ParsePropertyClassName(&name_1, &name_2)) {
if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
RegExpCharacterClass* cc = new (zone())
@@ -363,7 +359,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
}
}
}
- return ReportError(CStrVector("Invalid property name"));
+ return ReportError(RegExpError::kInvalidPropertyName);
} else {
builder->AddCharacter(p);
}
@@ -399,7 +395,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
// With /u, no identity escapes except for syntax characters
// are allowed. Otherwise, all identity escapes are allowed.
if (unicode()) {
- return ReportError(CStrVector("Invalid escape"));
+ return ReportError(RegExpError::kInvalidEscape);
}
uc32 first_digit = Next();
if (first_digit == '8' || first_digit == '9') {
@@ -413,7 +409,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
Advance();
if (unicode() && Next() >= '0' && Next() <= '9') {
// With /u, decimal escape with leading 0 are not parsed as octal.
- return ReportError(CStrVector("Invalid decimal escape"));
+ return ReportError(RegExpError::kInvalidDecimalEscape);
}
uc32 octal = ParseOctalLiteral();
builder->AddCharacter(octal);
@@ -454,7 +450,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
// ES#prod-annexB-ExtendedPatternCharacter
if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes.
- return ReportError(CStrVector("Invalid unicode escape"));
+ return ReportError(RegExpError::kInvalidUnicodeEscape);
}
builder->AddCharacter('\\');
} else {
@@ -472,7 +468,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
builder->AddCharacter('x');
} else {
// With /u, invalid escapes are not treated as identity escapes.
- return ReportError(CStrVector("Invalid escape"));
+ return ReportError(RegExpError::kInvalidEscape);
}
break;
}
@@ -485,7 +481,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
builder->AddCharacter('u');
} else {
// With /u, invalid escapes are not treated as identity escapes.
- return ReportError(CStrVector("Invalid Unicode escape"));
+ return ReportError(RegExpError::kInvalidUnicodeEscape);
}
break;
}
@@ -509,7 +505,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
builder->AddCharacter(current());
Advance();
} else {
- return ReportError(CStrVector("Invalid escape"));
+ return ReportError(RegExpError::kInvalidEscape);
}
break;
}
@@ -517,13 +513,13 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '{': {
int dummy;
bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
- if (parsed) return ReportError(CStrVector("Nothing to repeat"));
+ if (parsed) return ReportError(RegExpError::kNothingToRepeat);
V8_FALLTHROUGH;
}
case '}':
case ']':
if (unicode()) {
- return ReportError(CStrVector("Lone quantifier brackets"));
+ return ReportError(RegExpError::kLoneQuantifierBrackets);
}
V8_FALLTHROUGH;
default:
@@ -558,13 +554,12 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '{':
if (ParseIntervalQuantifier(&min, &max)) {
if (max < min) {
- return ReportError(
- CStrVector("numbers out of order in {} quantifier"));
+ return ReportError(RegExpError::kRangeOutOfOrder);
}
break;
} else if (unicode()) {
// With /u, incomplete quantifiers are not allowed.
- return ReportError(CStrVector("Incomplete quantifier"));
+ return ReportError(RegExpError::kIncompleteQuantifier);
}
continue;
default:
@@ -580,7 +575,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
Advance();
}
if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
- return ReportError(CStrVector("Invalid quantifier"));
+ return ReportError(RegExpError::kInvalidQuantifier);
}
}
}
@@ -615,7 +610,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
case 's':
case 'm': {
if (!FLAG_regexp_mode_modifiers) {
- ReportError(CStrVector("Invalid group"));
+ ReportError(RegExpError::kInvalidGroup);
return nullptr;
}
Advance();
@@ -624,7 +619,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
switch (current()) {
case '-':
if (!flags_sense) {
- ReportError(CStrVector("Multiple dashes in flag group"));
+ ReportError(RegExpError::kMultipleFlagDashes);
return nullptr;
}
flags_sense = false;
@@ -638,7 +633,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
if (current() == 'm') bit = JSRegExp::kMultiline;
if (current() == 's') bit = JSRegExp::kDotAll;
if (((switch_on | switch_off) & bit) != 0) {
- ReportError(CStrVector("Repeated flag in flag group"));
+ ReportError(RegExpError::kRepeatedFlag);
return nullptr;
}
if (flags_sense) {
@@ -666,7 +661,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
subexpr_type = GROUPING; // Will break us out of the outer loop.
continue;
default:
- ReportError(CStrVector("Invalid flag group"));
+ ReportError(RegExpError::kInvalidFlagGroup);
return nullptr;
}
}
@@ -690,13 +685,13 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
Advance();
break;
default:
- ReportError(CStrVector("Invalid group"));
+ ReportError(RegExpError::kInvalidGroup);
return nullptr;
}
}
if (subexpr_type == CAPTURE) {
if (captures_started_ >= JSRegExp::kMaxCaptures) {
- ReportError(CStrVector("Too many captures"));
+ ReportError(RegExpError::kTooManyCaptures);
return nullptr;
}
captures_started_++;
@@ -845,20 +840,20 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
if (c == '\\' && current() == 'u') {
Advance();
if (!ParseUnicodeEscape(&c)) {
- ReportError(CStrVector("Invalid Unicode escape sequence"));
+ ReportError(RegExpError::kInvalidUnicodeEscape);
return nullptr;
}
}
// The backslash char is misclassified as both ID_Start and ID_Continue.
if (c == '\\') {
- ReportError(CStrVector("Invalid capture group name"));
+ ReportError(RegExpError::kInvalidCaptureGroupName);
return nullptr;
}
if (at_start) {
if (!IsIdentifierStart(c)) {
- ReportError(CStrVector("Invalid capture group name"));
+ ReportError(RegExpError::kInvalidCaptureGroupName);
return nullptr;
}
push_code_unit(name, c);
@@ -869,7 +864,7 @@ const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
} else if (IsIdentifierPart(c)) {
push_code_unit(name, c);
} else {
- ReportError(CStrVector("Invalid capture group name"));
+ ReportError(RegExpError::kInvalidCaptureGroupName);
return nullptr;
}
}
@@ -896,7 +891,7 @@ bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
const auto& named_capture_it = named_captures_->find(capture);
if (named_capture_it != named_captures_->end()) {
- ReportError(CStrVector("Duplicate capture group name"));
+ ReportError(RegExpError::kDuplicateCaptureGroupName);
return false;
}
}
@@ -910,7 +905,7 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
RegExpParserState* state) {
// The parser is assumed to be on the '<' in \k<name>.
if (current() != '<') {
- ReportError(CStrVector("Invalid named reference"));
+ ReportError(RegExpError::kInvalidNamedReference);
return false;
}
@@ -943,7 +938,7 @@ void RegExpParser::PatchNamedBackReferences() {
if (named_back_references_ == nullptr) return;
if (named_captures_ == nullptr) {
- ReportError(CStrVector("Invalid named capture referenced"));
+ ReportError(RegExpError::kInvalidNamedCaptureReference);
return;
}
@@ -964,7 +959,7 @@ void RegExpParser::PatchNamedBackReferences() {
if (capture_it != named_captures_->end()) {
index = (*capture_it)->index();
} else {
- ReportError(CStrVector("Invalid named capture referenced"));
+ ReportError(RegExpError::kInvalidNamedCaptureReference);
return;
}
@@ -1385,8 +1380,8 @@ bool IsUnicodePropertyValueCharacter(char c) {
} // anonymous namespace
-bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
- std::vector<char>* name_2) {
+bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1,
+ ZoneVector<char>* name_2) {
DCHECK(name_1->empty());
DCHECK(name_2->empty());
// Parse the property class as follows:
@@ -1425,8 +1420,8 @@ bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
bool negate,
- const std::vector<char>& name_1,
- const std::vector<char>& name_2) {
+ const ZoneVector<char>& name_1,
+ const ZoneVector<char>& name_2) {
if (name_2.empty()) {
// First attempt to interpret as general category property value name.
const char* name = name_1.data();
@@ -1463,7 +1458,7 @@ bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
}
}
-RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name_1) {
+RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name_1) {
if (!FLAG_harmony_regexp_sequence) return nullptr;
const char* name = name_1.data();
const uc32* sequence_list = nullptr;
@@ -1529,19 +1524,19 @@ RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name_1) {
#else // V8_INTL_SUPPORT
-bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
- std::vector<char>* name_2) {
+bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1,
+ ZoneVector<char>* name_2) {
return false;
}
bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
bool negate,
- const std::vector<char>& name_1,
- const std::vector<char>& name_2) {
+ const ZoneVector<char>& name_1,
+ const ZoneVector<char>& name_2) {
return false;
}
-RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name) {
+RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name) {
return nullptr;
}
@@ -1605,7 +1600,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
}
if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes.
- ReportError(CStrVector("Invalid class escape"));
+ ReportError(RegExpError::kInvalidClassEscape);
return 0;
}
if ((controlLetter >= '0' && controlLetter <= '9') ||
@@ -1638,7 +1633,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
// ES#prod-annexB-LegacyOctalEscapeSequence
if (unicode()) {
// With /u, decimal escape is not interpreted as octal character code.
- ReportError(CStrVector("Invalid class escape"));
+ ReportError(RegExpError::kInvalidClassEscape);
return 0;
}
return ParseOctalLiteral();
@@ -1648,7 +1643,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
if (ParseHexEscape(2, &value)) return value;
if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes.
- ReportError(CStrVector("Invalid escape"));
+ ReportError(RegExpError::kInvalidEscape);
return 0;
}
// If \x is not followed by a two-digit hexadecimal, treat it
@@ -1661,7 +1656,7 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
if (ParseUnicodeEscape(&value)) return value;
if (unicode()) {
// With /u, invalid escapes are not treated as identity escapes.
- ReportError(CStrVector("Invalid unicode escape"));
+ ReportError(RegExpError::kInvalidUnicodeEscape);
return 0;
}
// If \u is not followed by a two-digit hexadecimal, treat it
@@ -1676,11 +1671,11 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
Advance();
return result;
}
- ReportError(CStrVector("Invalid escape"));
+ ReportError(RegExpError::kInvalidEscape);
return 0;
}
}
- return 0;
+ UNREACHABLE();
}
void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
@@ -1703,17 +1698,18 @@ void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
return;
}
case kEndMarker:
- ReportError(CStrVector("\\ at end of pattern"));
+ ReportError(RegExpError::kEscapeAtEndOfPattern);
return;
case 'p':
case 'P':
if (unicode()) {
bool negate = Next() == 'P';
Advance(2);
- std::vector<char> name_1, name_2;
+ ZoneVector<char> name_1(zone);
+ ZoneVector<char> name_2(zone);
if (!ParsePropertyClassName(&name_1, &name_2) ||
!AddPropertyClassRange(ranges, negate, name_1, name_2)) {
- ReportError(CStrVector("Invalid property name in character class"));
+ ReportError(RegExpError::kInvalidClassPropertyName);
}
*is_class_escape = true;
return;
@@ -1732,10 +1728,6 @@ void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
}
RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
- static const char* kUnterminated = "Unterminated character class";
- static const char* kRangeInvalid = "Invalid character class";
- static const char* kRangeOutOfOrder = "Range out of order in character class";
-
DCHECK_EQ(current(), '[');
Advance();
bool is_negated = false;
@@ -1768,7 +1760,7 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
// Either end is an escaped character class. Treat the '-' verbatim.
if (unicode()) {
// ES2015 21.2.2.15.1 step 1.
- return ReportError(CStrVector(kRangeInvalid));
+ return ReportError(RegExpError::kInvalidCharacterClass);
}
if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
ranges->Add(CharacterRange::Singleton('-'), zone());
@@ -1777,7 +1769,7 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
}
// ES2015 21.2.2.15.1 step 6.
if (char_1 > char_2) {
- return ReportError(CStrVector(kRangeOutOfOrder));
+ return ReportError(RegExpError::kOutOfOrderCharacterClass);
}
ranges->Add(CharacterRange::Range(char_1, char_2), zone());
} else {
@@ -1785,7 +1777,7 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
}
}
if (!has_more()) {
- return ReportError(CStrVector(kUnterminated));
+ return ReportError(RegExpError::kUnterminatedCharacterClass);
}
Advance();
RegExpCharacterClass::CharacterClassFlags character_class_flags;
@@ -1802,14 +1794,16 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
FlatStringReader* input, JSRegExp::Flags flags,
RegExpCompileData* result) {
DCHECK(result != nullptr);
- RegExpParser parser(input, &result->error, flags, isolate, zone);
+ RegExpParser parser(input, flags, isolate, zone);
RegExpTree* tree = parser.ParsePattern();
if (parser.failed()) {
DCHECK(tree == nullptr);
- DCHECK(!result->error.is_null());
+ DCHECK(parser.error_ != RegExpError::kNone);
+ result->error = parser.error_;
+ result->error_pos = parser.error_pos_;
} else {
DCHECK(tree != nullptr);
- DCHECK(result->error.is_null());
+ DCHECK(parser.error_ == RegExpError::kNone);
if (FLAG_trace_regexp_parser) {
StdoutStream os;
tree->Print(os, zone);
diff --git a/deps/v8/src/regexp/regexp-parser.h b/deps/v8/src/regexp/regexp-parser.h
index cc1948b101..aff1746bc5 100644
--- a/deps/v8/src/regexp/regexp-parser.h
+++ b/deps/v8/src/regexp/regexp-parser.h
@@ -8,6 +8,7 @@
#include "src/objects/js-regexp.h"
#include "src/objects/objects.h"
#include "src/regexp/regexp-ast.h"
+#include "src/regexp/regexp-error.h"
#include "src/zone/zone.h"
namespace v8 {
@@ -153,8 +154,8 @@ class RegExpBuilder : public ZoneObject {
class V8_EXPORT_PRIVATE RegExpParser {
public:
- RegExpParser(FlatStringReader* in, Handle<String>* error,
- JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
+ RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate,
+ Zone* zone);
static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
JSRegExp::Flags flags, RegExpCompileData* result);
@@ -177,13 +178,13 @@ class V8_EXPORT_PRIVATE RegExpParser {
bool ParseUnicodeEscape(uc32* value);
bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
- bool ParsePropertyClassName(std::vector<char>* name_1,
- std::vector<char>* name_2);
+ bool ParsePropertyClassName(ZoneVector<char>* name_1,
+ ZoneVector<char>* name_2);
bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
- const std::vector<char>& name_1,
- const std::vector<char>& name_2);
+ const ZoneVector<char>& name_1,
+ const ZoneVector<char>& name_2);
- RegExpTree* GetPropertySequence(const std::vector<char>& name_1);
+ RegExpTree* GetPropertySequence(const ZoneVector<char>& name_1);
RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
uc32 ParseOctalLiteral();
@@ -202,7 +203,7 @@ class V8_EXPORT_PRIVATE RegExpParser {
char ParseClassEscape();
- RegExpTree* ReportError(Vector<const char> message);
+ RegExpTree* ReportError(RegExpError error);
void Advance();
void Advance(int dist);
void Reset(int pos);
@@ -335,7 +336,8 @@ class V8_EXPORT_PRIVATE RegExpParser {
Isolate* isolate_;
Zone* zone_;
- Handle<String>* error_;
+ RegExpError error_ = RegExpError::kNone;
+ int error_pos_ = 0;
ZoneList<RegExpCapture*>* captures_;
ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
ZoneList<RegExpBackReference*>* named_back_references_;
diff --git a/deps/v8/src/regexp/regexp-stack.h b/deps/v8/src/regexp/regexp-stack.h
index cd199adfb2..9394398fcc 100644
--- a/deps/v8/src/regexp/regexp-stack.h
+++ b/deps/v8/src/regexp/regexp-stack.h
@@ -38,6 +38,9 @@ class RegExpStackScope {
class RegExpStack {
public:
+ RegExpStack();
+ ~RegExpStack();
+
// Number of allocated locations on the stack below the limit.
// No sequence of pushes must be longer that this without doing a stack-limit
// check.
@@ -77,9 +80,6 @@ class RegExpStack {
static constexpr size_t kMaximumStackSize = 64 * MB;
private:
- RegExpStack();
- ~RegExpStack();
-
// Artificial limit used when the thread-local state has been destroyed.
static const Address kMemoryTop =
static_cast<Address>(static_cast<uintptr_t>(-1));
diff --git a/deps/v8/src/regexp/regexp.cc b/deps/v8/src/regexp/regexp.cc
index 3632deaeb8..4319990a39 100644
--- a/deps/v8/src/regexp/regexp.cc
+++ b/deps/v8/src/regexp/regexp.cc
@@ -14,6 +14,7 @@
#include "src/regexp/regexp-dotprinter.h"
#include "src/regexp/regexp-interpreter.h"
#include "src/regexp/regexp-macro-assembler-arch.h"
+#include "src/regexp/regexp-macro-assembler-tracer.h"
#include "src/regexp/regexp-parser.h"
#include "src/strings/string-search.h"
#include "src/utils/ostreams.h"
@@ -91,9 +92,15 @@ class RegExpImpl final : public AllStatic {
};
V8_WARN_UNUSED_RESULT
-static inline MaybeHandle<Object> ThrowRegExpException(
- Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
- Handle<String> error_text) {
+static inline MaybeHandle<Object> ThrowRegExpException(Isolate* isolate,
+ Handle<JSRegExp> re,
+ Handle<String> pattern,
+ RegExpError error) {
+ Vector<const char> error_data = CStrVector(RegExpErrorString(error));
+ Handle<String> error_text =
+ isolate->factory()
+ ->NewStringFromOneByte(Vector<const uint8_t>::cast(error_data))
+ .ToHandleChecked();
THROW_NEW_ERROR(
isolate,
NewSyntaxError(MessageTemplate::kMalformedRegExp, pattern, error_text),
@@ -101,7 +108,7 @@ static inline MaybeHandle<Object> ThrowRegExpException(
}
inline void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
- Handle<String> error_text) {
+ RegExpError error_text) {
USE(ThrowRegExpException(isolate, re, Handle<String>(re->Pattern(), isolate),
error_text));
}
@@ -407,7 +414,7 @@ bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
Compile(isolate, &zone, &compile_data, flags, pattern, sample_subject,
is_one_byte, re->BacktrackLimit());
if (!compilation_succeeded) {
- DCHECK(!compile_data.error.is_null());
+ DCHECK(compile_data.error != RegExpError::kNone);
ThrowRegExpException(isolate, re, compile_data.error);
return false;
}
@@ -740,8 +747,7 @@ bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
Handle<String> sample_subject, bool is_one_byte,
uint32_t backtrack_limit) {
if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
- data->error =
- isolate->factory()->NewStringFromAsciiChecked("RegExp too big");
+ data->error = RegExpError::kTooLarge;
return false;
}
@@ -809,8 +815,8 @@ bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
data->node = node;
- if (const char* error_message = AnalyzeRegExp(isolate, is_one_byte, node)) {
- data->error = isolate->factory()->NewStringFromAsciiChecked(error_message);
+ data->error = AnalyzeRegExp(isolate, is_one_byte, node);
+ if (data->error != RegExpError::kNone) {
return false;
}
@@ -839,7 +845,7 @@ bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
#elif V8_TARGET_ARCH_S390
macro_assembler.reset(new RegExpMacroAssemblerS390(
isolate, zone, mode, (data->capture_count + 1) * 2));
-#elif V8_TARGET_ARCH_PPC
+#elif V8_TARGET_ARCH_PPC || V8_TARGET_ARCH_PPC64
macro_assembler.reset(new RegExpMacroAssemblerPPC(
isolate, zone, mode, (data->capture_count + 1) * 2));
#elif V8_TARGET_ARCH_MIPS
@@ -878,8 +884,18 @@ bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
macro_assembler->set_global_mode(mode);
}
+ RegExpMacroAssembler* macro_assembler_ptr = macro_assembler.get();
+#ifdef DEBUG
+ std::unique_ptr<RegExpMacroAssembler> tracer_macro_assembler;
+ if (FLAG_trace_regexp_assembler) {
+ tracer_macro_assembler.reset(
+ new RegExpMacroAssemblerTracer(isolate, macro_assembler_ptr));
+ macro_assembler_ptr = tracer_macro_assembler.get();
+ }
+#endif
+
RegExpCompiler::CompilationResult result = compiler.Assemble(
- isolate, macro_assembler.get(), node, data->capture_count, pattern);
+ isolate, macro_assembler_ptr, node, data->capture_count, pattern);
// Code / bytecode printing.
{
@@ -902,13 +918,12 @@ bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
}
}
- if (result.error_message != nullptr) {
+ if (result.error != RegExpError::kNone) {
if (FLAG_correctness_fuzzer_suppressions &&
- strncmp(result.error_message, "Stack overflow", 15) == 0) {
+ result.error == RegExpError::kStackOverflow) {
FATAL("Aborting on stack overflow");
}
- data->error =
- isolate->factory()->NewStringFromAsciiChecked(result.error_message);
+ data->error = result.error;
}
data->code = result.code;
diff --git a/deps/v8/src/regexp/regexp.h b/deps/v8/src/regexp/regexp.h
index 9f3581d18e..27ccbb47ba 100644
--- a/deps/v8/src/regexp/regexp.h
+++ b/deps/v8/src/regexp/regexp.h
@@ -6,6 +6,7 @@
#define V8_REGEXP_REGEXP_H_
#include "src/objects/js-regexp.h"
+#include "src/regexp/regexp-error.h"
namespace v8 {
namespace internal {
@@ -42,7 +43,11 @@ struct RegExpCompileData {
// The error message. Only used if an error occurred during parsing or
// compilation.
- Handle<String> error;
+ RegExpError error = RegExpError::kNone;
+
+ // The position at which the error was detected. Only used if an
+ // error occurred.
+ int error_pos = 0;
// The number of capture groups, without the global capture \0.
int capture_count = 0;
diff --git a/deps/v8/src/regexp/s390/regexp-macro-assembler-s390.cc b/deps/v8/src/regexp/s390/regexp-macro-assembler-s390.cc
index bcef02369f..be4b85df4f 100644
--- a/deps/v8/src/regexp/s390/regexp-macro-assembler-s390.cc
+++ b/deps/v8/src/regexp/s390/regexp-macro-assembler-s390.cc
@@ -113,6 +113,8 @@ RegExpMacroAssemblerS390::RegExpMacroAssemblerS390(Isolate* isolate, Zone* zone,
backtrack_label_(),
exit_label_(),
internal_failure_label_() {
+ masm_->set_root_array_available(false);
+
DCHECK_EQ(0, registers_to_save % 2);
__ b(&entry_label_); // We'll write the entry code later.
@@ -228,7 +230,7 @@ void RegExpMacroAssemblerS390::CheckGreedyLoop(Label* on_equal) {
}
void RegExpMacroAssemblerS390::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
+ int start_reg, bool read_backward, Label* on_no_match) {
Label fallthrough;
__ LoadP(r2, register_location(start_reg)); // Index of start of
// capture
@@ -325,7 +327,7 @@ void RegExpMacroAssemblerS390::CheckNotBackReferenceIgnoreCase(
// r2: Address byte_offset1 - Address captured substring's start.
// r3: Address byte_offset2 - Address of current character position.
// r4: size_t byte_length - length of capture in bytes(!)
- // r5: Isolate* isolate or 0 if unicode flag.
+ // r5: Isolate* isolate.
// Address of start of capture.
__ AddP(r2, end_of_input_address());
@@ -339,14 +341,7 @@ void RegExpMacroAssemblerS390::CheckNotBackReferenceIgnoreCase(
__ SubP(r3, r3, r6);
}
// Isolate.
-#ifdef V8_INTL_SUPPORT
- if (unicode) {
- __ LoadImmP(r5, Operand::Zero());
- } else // NOLINT
-#endif // V8_INTL_SUPPORT
- {
- __ mov(r5, Operand(ExternalReference::isolate_address(isolate())));
- }
+ __ mov(r5, Operand(ExternalReference::isolate_address(isolate())));
{
AllowExternalCallThatCantCauseGC scope(masm_);
diff --git a/deps/v8/src/regexp/s390/regexp-macro-assembler-s390.h b/deps/v8/src/regexp/s390/regexp-macro-assembler-s390.h
index 4f79296d78..eced564d7f 100644
--- a/deps/v8/src/regexp/s390/regexp-macro-assembler-s390.h
+++ b/deps/v8/src/regexp/s390/regexp-macro-assembler-s390.h
@@ -36,7 +36,7 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerS390
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
- bool read_backward, bool unicode,
+ bool read_backward,
Label* on_no_match);
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(unsigned c, unsigned mask,
diff --git a/deps/v8/src/regexp/special-case.h b/deps/v8/src/regexp/special-case.h
index 1ccec5d31a..753c9231ed 100644
--- a/deps/v8/src/regexp/special-case.h
+++ b/deps/v8/src/regexp/special-case.h
@@ -6,70 +6,109 @@
#define V8_REGEXP_SPECIAL_CASE_H_
#ifdef V8_INTL_SUPPORT
-#include "unicode/uversion.h"
-namespace U_ICU_NAMESPACE {
-class UnicodeSet;
-} // namespace U_ICU_NAMESPACE
+#include "src/base/logging.h"
+#include "src/common/globals.h"
+
+#include "unicode/locid.h"
+#include "unicode/uchar.h"
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
namespace v8 {
namespace internal {
-// Functions to build special sets of Unicode characters that need special
-// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE).
+// Sets of Unicode characters that need special handling under "i" mode
+
+// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262
+// defines slightly different case-folding rules than Unicode. An
+// input character should match a pattern character if the result of
+// the Canonicalize algorithm is the same for both characters.
//
-// For the characters in the "ignore set", the process should not treat other
-// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case
-// equivlant under the ECMA262 RegExp "i" mode because these characters are
-// uppercase themselves that no other characters in the set uppercase to.
+// Roughly speaking, for "i" regexps, Canonicalize(c) is the same as
+// c.toUpperCase(), unless a) c.toUpperCase() is a multi-character
+// string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See
+// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for
+// the precise definition.
//
-// For the characters in the "special add set", the proecess should add only
-// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which is
-// not uppercase characters as case equivlant under the ECMA262 RegExp "i" mode
-// and also that ONE uppercase character that other non uppercase character
-// uppercase into to the set. Other uppercase characters in the result of
-// closeOver(USET_CASE_INSENSITIVE) should not be considered because ECMA262
-// RegExp "i" mode consider two characters as "case equivlant" if both
-// characters uppercase to the same character.
+// While compiling such regular expressions, we need to compute the
+// set of characters that should match a given input character. (See
+// GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.)
+// For almost all characters, this can be efficiently computed using
+// UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent
+// the remaining special cases.
//
-// For example, consider the following case equivalent set defined by Unicode
-// standard. Notice there are more than one uppercase characters in this set:
-// U+212B Å Angstrom Sign - an uppercase character.
-// U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character.
-// U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which
-// uppercase to U+00C5.
-// In this case equivlant set is a special set and need special handling while
-// considering "case equivlant" under the ECMA262 RegExp "i" mode which is
-// different than Unicode Standard:
-// * U+212B should be included into the "ignore" set because there are no other
-// characters, under the ECMA262 "i" mode, are considered as "case equivlant"
-// to it because U+212B is itself an uppercase but neither U+00C5 nor U+00E5
-// uppercase to U+212B.
-// * U+00C5 and U+00E5 will both be included into the "special add" set. While
-// calculate the "equivlant set" under ECMA262 "i" mode, the process will
-// add U+00E5, because it is not an uppercase character in the set. The
-// process will also add U+00C5, because it is the uppercase character which
-// other non uppercase character, U+00C5, uppercase into.
+// For a character c, the rules are as follows:
//
-// For characters not included in "ignore set" and "special add set", the
-// process will just use closeOver(USET_CASE_INSENSITIVE) to calcualte, which is
-// much faster.
+// 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling
+// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet
+// containing c will produce the set of characters that should
+// match /c/i (or /[c]/i), and only those characters.
//
-// Under Unicode 12.0, there are only 7 characters in the "special add set" and
-// 4 characters in "ignore set" so even the special add process is slower, it is
-// limited to a small set of cases only.
+// 2. If c is in IgnoreSet, then the only character it should match is
+// itself. However, closeOver will add additional incorrect
+// matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ'
+// (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is
+// "SS". Canonicalize step 3.e therefore requires that 'ß' canonicalizes to
+// itself, and should not match 'ẞ'. In these cases, we can skip
+// the closeOver entirely, because it will never add an equivalent
+// character.
//
-// The implementation of these two function will be generated by calling ICU
-// icu::UnicodeSet during the build time into gen/src/regexp/special-case.cc by
-// the code in src/regexp/gen-regexp-special-case.cc.
+// 3. If c is in SpecialAddSet, then it should match at least one
+// character other than itself. However, closeOver will add at
+// least one additional incorrect match. For example, consider the
+// letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase
+// K, U+212A KELVIN SIGN). However, because of Canonicalize step 3.g, KELVIN
+// SIGN should not match either of the other two characters. As a
+// result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in
+// IgnoreSet). To find the correct matches for characters in
+// SpecialAddSet, we closeOver the original character, but filter
+// out the results that do not have the same canonical value.
//
-// These two function will be used with LazyInstance<> template to generate
-// global sharable set to reduce memory usage and speed up performance.
+// The contents of these sets are calculated at build time by
+// src/regexp/gen-regexp-special-case.cc, which generates
+// gen/src/regexp/special-case.cc. This is done by iterating over the
+// result of closeOver for each BMP character, and finding sets for
+// which at least one character has a different canonical value than
+// another character. Characters that match no other characters in
+// their equivalence class are added to IgnoreSet. Characters that
+// match at least one other character are added to SpecialAddSet.
+
+class RegExpCaseFolding final : public AllStatic {
+ public:
+ // Characters that, under non-unicode ignoreCase ("i") matching, should
+ // match only themselves even though closeOver would add other characters.
+ static const icu::UnicodeSet& IgnoreSet();
+ // Characters that match at least one other character, but for which the
+ // result of closeOver must be filtered by canonical value.
+ static const icu::UnicodeSet& SpecialAddSet();
+
+ // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
+ // Canonicalize) step 3, which is used to determine whether
+ // characters match when ignoreCase is true and unicode is false.
+ // Takes a single UTF-16 code unit and returns its canonical code unit.
+ static UChar32 Canonicalize(UChar32 ch) {
+ // a. Assert: ch is a UTF-16 code unit.
+ CHECK_LE(ch, 0xffff);
+
+ // b. Let s be the String value consisting of the single code unit ch.
+ icu::UnicodeString s(ch);
+
+ // c. Let u be the same result produced as if by performing the algorithm
+ // for String.prototype.toUpperCase using s as the this value. That
+ // algorithm is locale-insensitive, so the root locale is passed
+ // explicitly; the no-argument toUpper() follows the default locale and
+ // would make the result depend on the environment (e.g. Turkish 'i').
+ // d. Assert: Type(u) is String.
+ icu::UnicodeString& u = s.toUpper(icu::Locale::getRoot());
+
+ // e. If u does not consist of a single code unit, return ch.
+ if (u.length() != 1) {
+ return ch;
+ }
+
+ // f. Let cu be u's single code unit element.
+ UChar32 cu = u.char32At(0);
-// Function to build and return the Ignore set.
-icu::UnicodeSet BuildIgnoreSet();
+ // g. If the value of ch >= 128 and the value of cu < 128, return ch.
+ if (ch >= 128 && cu < 128) {
+ return ch;
+ }
-// Function to build and return the Special Add set.
-icu::UnicodeSet BuildSpecialAddSet();
+ // h. Return cu.
+ return cu;
+ }
+};
} // namespace internal
} // namespace v8
diff --git a/deps/v8/src/regexp/x64/regexp-macro-assembler-x64.cc b/deps/v8/src/regexp/x64/regexp-macro-assembler-x64.cc
index 5620c6b9ce..5edbf5e579 100644
--- a/deps/v8/src/regexp/x64/regexp-macro-assembler-x64.cc
+++ b/deps/v8/src/regexp/x64/regexp-macro-assembler-x64.cc
@@ -214,9 +214,8 @@ void RegExpMacroAssemblerX64::CheckGreedyLoop(Label* on_equal) {
__ bind(&fallthrough);
}
-
void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase(
- int start_reg, bool read_backward, bool unicode, Label* on_no_match) {
+ int start_reg, bool read_backward, Label* on_no_match) {
Label fallthrough;
ReadPositionFromRegister(rdx, start_reg); // Offset of start of capture
ReadPositionFromRegister(rbx, start_reg + 1); // Offset of end of capture
@@ -321,7 +320,7 @@ void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase(
// Address byte_offset1 - Address captured substring's start.
// Address byte_offset2 - Address of current character position.
// size_t byte_length - length of capture in bytes(!)
-// Isolate* isolate or 0 if unicode flag.
+ // Isolate* isolate.
#ifdef V8_TARGET_OS_WIN
DCHECK(rcx == arg_reg_1);
DCHECK(rdx == arg_reg_2);
@@ -349,14 +348,7 @@ void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase(
// Set byte_length.
__ movq(arg_reg_3, rbx);
// Isolate.
-#ifdef V8_INTL_SUPPORT
- if (unicode) {
- __ movq(arg_reg_4, Immediate(0));
- } else // NOLINT
-#endif // V8_INTL_SUPPORT
- {
- __ LoadAddress(arg_reg_4, ExternalReference::isolate_address(isolate()));
- }
+ __ LoadAddress(arg_reg_4, ExternalReference::isolate_address(isolate()));
{ // NOLINT: Can't find a way to open this scope without confusing the
// linter.
@@ -388,7 +380,6 @@ void RegExpMacroAssemblerX64::CheckNotBackReferenceIgnoreCase(
__ bind(&fallthrough);
}
-
void RegExpMacroAssemblerX64::CheckNotBackReference(int start_reg,
bool read_backward,
Label* on_no_match) {
diff --git a/deps/v8/src/regexp/x64/regexp-macro-assembler-x64.h b/deps/v8/src/regexp/x64/regexp-macro-assembler-x64.h
index 0bf1c2e150..64614e228a 100644
--- a/deps/v8/src/regexp/x64/regexp-macro-assembler-x64.h
+++ b/deps/v8/src/regexp/x64/regexp-macro-assembler-x64.h
@@ -37,7 +37,6 @@ class V8_EXPORT_PRIVATE RegExpMacroAssemblerX64
void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match) override;
void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward,
- bool unicode,
Label* on_no_match) override;
void CheckNotCharacter(uint32_t c, Label* on_not_equal) override;
void CheckNotCharacterAfterAnd(uint32_t c, uint32_t mask,