summary refs log tree commit diff
path: root/deps/v8/src/regexp/regexp-parser.cc
diff options
context:
space:
mode:
Diffstat (limited to 'deps/v8/src/regexp/regexp-parser.cc')
-rw-r--r-- deps/v8/src/regexp/regexp-parser.cc 523
1 files changed, 398 insertions, 125 deletions
diff --git a/deps/v8/src/regexp/regexp-parser.cc b/deps/v8/src/regexp/regexp-parser.cc
index fa8900342c..2fe6fde82a 100644
--- a/deps/v8/src/regexp/regexp-parser.cc
+++ b/deps/v8/src/regexp/regexp-parser.cc
@@ -8,27 +8,32 @@
#include "src/factory.h"
#include "src/isolate.h"
#include "src/objects-inl.h"
+#include "src/ostreams.h"
#include "src/regexp/jsregexp.h"
#include "src/utils.h"
+#ifdef V8_I18N_SUPPORT
+#include "unicode/uset.h"
+#endif // V8_I18N_SUPPORT
+
namespace v8 {
namespace internal {
RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
- bool multiline, bool unicode, Isolate* isolate,
- Zone* zone)
+ JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
: isolate_(isolate),
zone_(zone),
error_(error),
captures_(NULL),
in_(in),
current_(kEndMarker),
+ ignore_case_(flags & JSRegExp::kIgnoreCase),
+ multiline_(flags & JSRegExp::kMultiline),
+ unicode_(flags & JSRegExp::kUnicode),
next_pos_(0),
captures_started_(0),
capture_count_(0),
has_more_(true),
- multiline_(multiline),
- unicode_(unicode),
simple_(false),
contains_anchor_(false),
is_scanned_for_captures_(false),
@@ -36,10 +41,28 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
Advance();
}
+template <bool update_position>
+inline uc32 RegExpParser::ReadNext() {
+ int position = next_pos_;
+ uc32 c0 = in()->Get(position);
+ position++;
+ // Read the whole surrogate pair in case of unicode flag, if possible.
+ if (unicode() && position < in()->length() &&
+ unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
+ uc16 c1 = in()->Get(position);
+ if (unibrow::Utf16::IsTrailSurrogate(c1)) {
+ c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
+ position++;
+ }
+ }
+ if (update_position) next_pos_ = position;
+ return c0;
+}
+
uc32 RegExpParser::Next() {
if (has_next()) {
- return in()->Get(next_pos_);
+ return ReadNext<false>();
} else {
return kEndMarker;
}
@@ -47,25 +70,14 @@ uc32 RegExpParser::Next() {
void RegExpParser::Advance() {
- if (next_pos_ < in()->length()) {
+ if (has_next()) {
StackLimitCheck check(isolate());
if (check.HasOverflowed()) {
ReportError(CStrVector(Isolate::kStackOverflowMessage));
} else if (zone()->excess_allocation()) {
ReportError(CStrVector("Regular expression too large"));
} else {
- current_ = in()->Get(next_pos_);
- next_pos_++;
- // Read the whole surrogate pair in case of unicode flag, if possible.
- if (unicode_ && next_pos_ < in()->length() &&
- unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) {
- uc16 trail = in()->Get(next_pos_);
- if (unibrow::Utf16::IsTrailSurrogate(trail)) {
- current_ = unibrow::Utf16::CombineSurrogatePair(
- static_cast<uc16>(current_), trail);
- next_pos_++;
- }
- }
+ current_ = ReadNext<true>();
}
} else {
current_ = kEndMarker;
@@ -92,11 +104,28 @@ void RegExpParser::Advance(int dist) {
bool RegExpParser::simple() { return simple_; }
-
-bool RegExpParser::IsSyntaxCharacter(uc32 c) {
- return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||
- c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||
- c == '{' || c == '}' || c == '|';
+bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
+ switch (c) {
+ case '^':
+ case '$':
+ case '\\':
+ case '.':
+ case '*':
+ case '+':
+ case '?':
+ case '(':
+ case ')':
+ case '[':
+ case ']':
+ case '{':
+ case '}':
+ case '|':
+ case '/':
+ return true;
+ default:
+ break;
+ }
+ return false;
}
@@ -142,7 +171,7 @@ RegExpTree* RegExpParser::ParsePattern() {
RegExpTree* RegExpParser::ParseDisjunction() {
// Used to store current state while parsing subexpressions.
RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
- zone());
+ ignore_case(), unicode(), zone());
RegExpParserState* state = &initial_state;
// Cache the builder in a local variable for quick access.
RegExpBuilder* builder = initial_state.builder();
@@ -151,14 +180,14 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case kEndMarker:
if (state->IsSubexpression()) {
// Inside a parenthesized group when hitting end of input.
- ReportError(CStrVector("Unterminated group") CHECK_FAILED);
+ return ReportError(CStrVector("Unterminated group"));
}
DCHECK_EQ(INITIAL, state->group_type());
// Parsing completed successfully.
return builder->ToRegExp();
case ')': {
if (!state->IsSubexpression()) {
- ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);
+ return ReportError(CStrVector("Unmatched ')'"));
}
DCHECK_NE(INITIAL, state->group_type());
@@ -206,7 +235,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
return ReportError(CStrVector("Nothing to repeat"));
case '^': {
Advance();
- if (multiline_) {
+ if (multiline()) {
builder->AddAssertion(
new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
} else {
@@ -219,8 +248,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '$': {
Advance();
RegExpAssertion::AssertionType assertion_type =
- multiline_ ? RegExpAssertion::END_OF_LINE
- : RegExpAssertion::END_OF_INPUT;
+ multiline() ? RegExpAssertion::END_OF_LINE
+ : RegExpAssertion::END_OF_INPUT;
builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
continue;
}
@@ -230,8 +259,9 @@ RegExpTree* RegExpParser::ParseDisjunction() {
ZoneList<CharacterRange>* ranges =
new (zone()) ZoneList<CharacterRange>(2, zone());
CharacterRange::AddClassEscape('.', ranges, zone());
- RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
- builder->AddAtom(atom);
+ RegExpCharacterClass* cc =
+ new (zone()) RegExpCharacterClass(ranges, false);
+ builder->AddCharacterClass(cc);
break;
}
case '(': {
@@ -265,25 +295,25 @@ RegExpTree* RegExpParser::ParseDisjunction() {
}
// Fall through.
default:
- ReportError(CStrVector("Invalid group") CHECK_FAILED);
- break;
+ return ReportError(CStrVector("Invalid group"));
}
Advance(2);
} else {
if (captures_started_ >= kMaxCaptures) {
- ReportError(CStrVector("Too many captures") CHECK_FAILED);
+ return ReportError(CStrVector("Too many captures"));
}
captures_started_++;
}
// Store current state and begin new disjunction parsing.
state = new (zone()) RegExpParserState(
- state, subexpr_type, lookaround_type, captures_started_, zone());
+ state, subexpr_type, lookaround_type, captures_started_,
+ ignore_case(), unicode(), zone());
builder = state->builder();
continue;
}
case '[': {
- RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);
- builder->AddAtom(atom);
+ RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
+ builder->AddCharacterClass(cc->AsCharacterClass());
break;
}
// Atom ::
@@ -318,8 +348,26 @@ RegExpTree* RegExpParser::ParseDisjunction() {
ZoneList<CharacterRange>* ranges =
new (zone()) ZoneList<CharacterRange>(2, zone());
CharacterRange::AddClassEscape(c, ranges, zone());
- RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
- builder->AddAtom(atom);
+ RegExpCharacterClass* cc =
+ new (zone()) RegExpCharacterClass(ranges, false);
+ builder->AddCharacterClass(cc);
+ break;
+ }
+ case 'p':
+ case 'P': {
+ uc32 p = Next();
+ Advance(2);
+ if (unicode()) {
+ ZoneList<CharacterRange>* ranges = ParsePropertyClass();
+ if (ranges == nullptr) {
+ return ReportError(CStrVector("Invalid property name"));
+ }
+ RegExpCharacterClass* cc =
+ new (zone()) RegExpCharacterClass(ranges, p == 'P');
+ builder->AddCharacterClass(cc);
+ } else {
+ builder->AddCharacter(p);
+ }
break;
}
case '1':
@@ -332,7 +380,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '8':
case '9': {
int index = 0;
- if (ParseBackReferenceIndex(&index)) {
+ bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
+ if (is_backref) {
if (state->IsInsideCaptureGroup(index)) {
// The back reference is inside the capture group it refers to.
// Nothing can possibly have been captured yet, so we use empty
@@ -347,24 +396,25 @@ RegExpTree* RegExpParser::ParseDisjunction() {
}
break;
}
+ // With /u, no identity escapes except for syntax characters
+ // are allowed. Otherwise, all identity escapes are allowed.
+ if (unicode()) {
+ return ReportError(CStrVector("Invalid escape"));
+ }
uc32 first_digit = Next();
if (first_digit == '8' || first_digit == '9') {
- // If the 'u' flag is present, only syntax characters can be
- // escaped,
- // no other identity escapes are allowed. If the 'u' flag is not
- // present, all identity escapes are allowed.
- if (!unicode_) {
- builder->AddCharacter(first_digit);
- Advance(2);
- } else {
- return ReportError(CStrVector("Invalid escape"));
- }
+ builder->AddCharacter(first_digit);
+ Advance(2);
break;
}
}
// FALLTHROUGH
case '0': {
Advance();
+ if (unicode() && Next() >= '0' && Next() <= '9') {
+ // With /u, decimal escapes with a leading 0 are not parsed as octal.
+ return ReportError(CStrVector("Invalid decimal escape"));
+ }
uc32 octal = ParseOctalLiteral();
builder->AddCharacter(octal);
break;
@@ -402,6 +452,10 @@ RegExpTree* RegExpParser::ParseDisjunction() {
// This is outside the specification. We match JSC in
// reading the backslash as a literal character instead
// of as starting an escape.
+ if (unicode()) {
+ // With /u, invalid escapes are not treated as identity escapes.
+ return ReportError(CStrVector("Invalid unicode escape"));
+ }
builder->AddCharacter('\\');
} else {
Advance(2);
@@ -414,11 +468,10 @@ RegExpTree* RegExpParser::ParseDisjunction() {
uc32 value;
if (ParseHexEscape(2, &value)) {
builder->AddCharacter(value);
- } else if (!unicode_) {
+ } else if (!unicode()) {
builder->AddCharacter('x');
} else {
- // If the 'u' flag is present, invalid escapes are not treated as
- // identity escapes.
+ // With /u, invalid escapes are not treated as identity escapes.
return ReportError(CStrVector("Invalid escape"));
}
break;
@@ -427,24 +480,20 @@ RegExpTree* RegExpParser::ParseDisjunction() {
Advance(2);
uc32 value;
if (ParseUnicodeEscape(&value)) {
- builder->AddUnicodeCharacter(value);
- } else if (!unicode_) {
+ builder->AddEscapedUnicodeCharacter(value);
+ } else if (!unicode()) {
builder->AddCharacter('u');
} else {
- // If the 'u' flag is present, invalid escapes are not treated as
- // identity escapes.
+ // With /u, invalid escapes are not treated as identity escapes.
return ReportError(CStrVector("Invalid unicode escape"));
}
break;
}
default:
Advance();
- // If the 'u' flag is present, only syntax characters can be
- // escaped, no
- // other identity escapes are allowed. If the 'u' flag is not
- // present,
- // all identity escapes are allowed.
- if (!unicode_ || IsSyntaxCharacter(current())) {
+ // With /u, no identity escapes except for syntax characters
+ // are allowed. Otherwise, all identity escapes are allowed.
+ if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
builder->AddCharacter(current());
Advance();
} else {
@@ -456,10 +505,16 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '{': {
int dummy;
if (ParseIntervalQuantifier(&dummy, &dummy)) {
- ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
+ return ReportError(CStrVector("Nothing to repeat"));
}
// fallthrough
}
+ case '}':
+ case ']':
+ if (unicode()) {
+ return ReportError(CStrVector("Lone quantifier brackets"));
+ }
+ // fallthrough
default:
builder->AddUnicodeCharacter(current());
Advance();
@@ -492,13 +547,15 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '{':
if (ParseIntervalQuantifier(&min, &max)) {
if (max < min) {
- ReportError(CStrVector("numbers out of order in {} quantifier.")
- CHECK_FAILED);
+ return ReportError(
+ CStrVector("numbers out of order in {} quantifier"));
}
break;
- } else {
- continue;
+ } else if (unicode()) {
+ // With /u, incomplete quantifiers are not allowed.
+ return ReportError(CStrVector("Incomplete quantifier"));
}
+ continue;
default:
continue;
}
@@ -511,7 +568,9 @@ RegExpTree* RegExpParser::ParseDisjunction() {
quantifier_type = RegExpQuantifier::POSSESSIVE;
Advance();
}
- builder->AddQuantifierToAtom(min, max, quantifier_type);
+ if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
+ return ReportError(CStrVector("Invalid quantifier"));
+ }
}
}
@@ -740,12 +799,12 @@ bool RegExpParser::ParseHexEscape(int length, uc32* value) {
return true;
}
-
+// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
bool RegExpParser::ParseUnicodeEscape(uc32* value) {
// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
// allowed). In the latter case, the number of hex digits between { } is
// arbitrary. \ and u have already been read.
- if (current() == '{' && unicode_) {
+ if (current() == '{' && unicode()) {
int start = position();
Advance();
if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
@@ -758,9 +817,75 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) {
return false;
}
// \u but no {, or \u{...} escapes not allowed.
- return ParseHexEscape(4, value);
+ bool result = ParseHexEscape(4, value);
+ if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
+ current() == '\\') {
+ // Attempt to read trail surrogate.
+ int start = position();
+ if (Next() == 'u') {
+ Advance(2);
+ uc32 trail;
+ if (ParseHexEscape(4, &trail) &&
+ unibrow::Utf16::IsTrailSurrogate(trail)) {
+ *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
+ static_cast<uc16>(trail));
+ return true;
+ }
+ }
+ Reset(start);
+ }
+ return result;
}
+ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() {
+#ifdef V8_I18N_SUPPORT
+ char property_name[3];
+ memset(property_name, 0, sizeof(property_name));
+ if (current() == '{') {
+ Advance();
+ if (current() < 'A' || current() > 'Z') return nullptr;
+ property_name[0] = static_cast<char>(current());
+ Advance();
+ if (current() >= 'a' && current() <= 'z') {
+ property_name[1] = static_cast<char>(current());
+ Advance();
+ }
+ if (current() != '}') return nullptr;
+ } else if (current() >= 'A' && current() <= 'Z') {
+ property_name[0] = static_cast<char>(current());
+ } else {
+ return nullptr;
+ }
+ Advance();
+
+ int32_t category =
+ u_getPropertyValueEnum(UCHAR_GENERAL_CATEGORY_MASK, property_name);
+ if (category == UCHAR_INVALID_CODE) return nullptr;
+
+ USet* set = uset_openEmpty();
+ UErrorCode ec = U_ZERO_ERROR;
+ uset_applyIntPropertyValue(set, UCHAR_GENERAL_CATEGORY_MASK, category, &ec);
+ ZoneList<CharacterRange>* ranges = nullptr;
+ if (ec == U_ZERO_ERROR && !uset_isEmpty(set)) {
+ uset_removeAllStrings(set);
+ int item_count = uset_getItemCount(set);
+ ranges = new (zone()) ZoneList<CharacterRange>(item_count, zone());
+ int item_result = 0;
+ for (int i = 0; i < item_count; i++) {
+ uc32 start = 0;
+ uc32 end = 0;
+ item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
+ ranges->Add(CharacterRange::Range(start, end), zone());
+ }
+ DCHECK_EQ(U_ZERO_ERROR, ec);
+ DCHECK_EQ(0, item_result);
+ }
+ uset_close(set);
+ return ranges;
+#else // V8_I18N_SUPPORT
+ return nullptr;
+#endif // V8_I18N_SUPPORT
+}
bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
uc32 x = 0;
@@ -809,20 +934,35 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
case 'c': {
uc32 controlLetter = Next();
uc32 letter = controlLetter & ~('A' ^ 'a');
- // For compatibility with JSC, inside a character class
- // we also accept digits and underscore as control characters.
- if ((controlLetter >= '0' && controlLetter <= '9') ||
- controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {
+ // For compatibility with JSC, inside a character class we also accept
+ // digits and underscore as control characters, except with /u.
+ if (letter >= 'A' && letter <= 'Z') {
Advance(2);
// Control letters mapped to ASCII control characters in the range
// 0x00-0x1f.
return controlLetter & 0x1f;
}
+ if (unicode()) {
+ // With /u, invalid escapes are not treated as identity escapes.
+ ReportError(CStrVector("Invalid class escape"));
+ return 0;
+ }
+ if ((controlLetter >= '0' && controlLetter <= '9') ||
+ controlLetter == '_') {
+ Advance(2);
+ return controlLetter & 0x1f;
+ }
// We match JSC in reading the backslash as a literal
// character instead of as starting an escape.
return '\\';
}
case '0':
+ // With /u, \0 is interpreted as NUL if not followed by another digit.
+ if (unicode() && !(Next() >= '0' && Next() <= '9')) {
+ Advance();
+ return 0;
+ }
+ // Fall through.
case '1':
case '2':
case '3':
@@ -833,43 +973,43 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
// For compatibility, we interpret a decimal escape that isn't
// a back reference (and therefore either \0 or not valid according
// to the specification) as a 1..3 digit octal character code.
+ if (unicode()) {
+ // With /u, decimal escape is not interpreted as octal character code.
+ ReportError(CStrVector("Invalid class escape"));
+ return 0;
+ }
return ParseOctalLiteral();
case 'x': {
Advance();
uc32 value;
- if (ParseHexEscape(2, &value)) {
- return value;
+ if (ParseHexEscape(2, &value)) return value;
+ if (unicode()) {
+ // With /u, invalid escapes are not treated as identity escapes.
+ ReportError(CStrVector("Invalid escape"));
+ return 0;
}
- if (!unicode_) {
- // If \x is not followed by a two-digit hexadecimal, treat it
- // as an identity escape.
- return 'x';
- }
- // If the 'u' flag is present, invalid escapes are not treated as
- // identity escapes.
- ReportError(CStrVector("Invalid escape"));
- return 0;
+ // If \x is not followed by a two-digit hexadecimal, treat it
+ // as an identity escape.
+ return 'x';
}
case 'u': {
Advance();
uc32 value;
- if (ParseUnicodeEscape(&value)) {
- return value;
- }
- if (!unicode_) {
- return 'u';
+ if (ParseUnicodeEscape(&value)) return value;
+ if (unicode()) {
+ // With /u, invalid escapes are not treated as identity escapes.
+ ReportError(CStrVector("Invalid unicode escape"));
+ return 0;
}
- // If the 'u' flag is present, invalid escapes are not treated as
- // identity escapes.
- ReportError(CStrVector("Invalid unicode escape"));
- return 0;
+ // If \u is not followed by a four-digit hexadecimal, treat it
+ // as an identity escape.
+ return 'u';
}
default: {
uc32 result = current();
- // If the 'u' flag is present, only syntax characters can be escaped, no
- // other identity escapes are allowed. If the 'u' flag is not present, all
- // identity escapes are allowed.
- if (!unicode_ || IsSyntaxCharacter(result)) {
+ // With /u, no identity escapes except for syntax characters and '-' are
+ // allowed. Otherwise, all identity escapes are allowed.
+ if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
Advance();
return result;
}
@@ -899,13 +1039,13 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
case kEndMarker:
return ReportError(CStrVector("\\ at end of pattern"));
default:
- uc32 c = ParseClassCharacterEscape(CHECK_FAILED);
- return CharacterRange::Singleton(c);
+ first = ParseClassCharacterEscape(CHECK_FAILED);
}
} else {
Advance();
- return CharacterRange::Singleton(first);
}
+
+ return CharacterRange::Singleton(first);
}
@@ -927,6 +1067,7 @@ static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
RegExpTree* RegExpParser::ParseCharacterClass() {
static const char* kUnterminated = "Unterminated character class";
+ static const char* kRangeInvalid = "Invalid character class";
static const char* kRangeOutOfOrder = "Range out of order in character class";
DCHECK_EQ(current(), '[');
@@ -956,13 +1097,18 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
// Either end is an escaped character class. Treat the '-' verbatim.
+ if (unicode()) {
+ // ES2015 21.2.2.15.1 step 1.
+ return ReportError(CStrVector(kRangeInvalid));
+ }
AddRangeOrEscape(ranges, char_class, first, zone());
ranges->Add(CharacterRange::Singleton('-'), zone());
AddRangeOrEscape(ranges, char_class_2, next, zone());
continue;
}
+ // ES2015 21.2.2.15.1 step 6.
if (first.from() > next.to()) {
- return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
+ return ReportError(CStrVector(kRangeOutOfOrder));
}
ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
} else {
@@ -970,7 +1116,7 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
}
}
if (!has_more()) {
- return ReportError(CStrVector(kUnterminated) CHECK_FAILED);
+ return ReportError(CStrVector(kUnterminated));
}
Advance();
if (ranges->length() == 0) {
@@ -985,10 +1131,10 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
- FlatStringReader* input, bool multiline,
- bool unicode, RegExpCompileData* result) {
+ FlatStringReader* input, JSRegExp::Flags flags,
+ RegExpCompileData* result) {
DCHECK(result != NULL);
- RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);
+ RegExpParser parser(input, &result->error, flags, isolate, zone);
RegExpTree* tree = parser.ParsePattern();
if (parser.failed()) {
DCHECK(tree == NULL);
@@ -1010,11 +1156,13 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
return !parser.failed();
}
-
-RegExpBuilder::RegExpBuilder(Zone* zone)
+RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
: zone_(zone),
pending_empty_(false),
+ ignore_case_(ignore_case),
+ unicode_(unicode),
characters_(NULL),
+ pending_surrogate_(kNoPendingSurrogate),
terms_(),
alternatives_()
#ifdef DEBUG
@@ -1025,7 +1173,51 @@ RegExpBuilder::RegExpBuilder(Zone* zone)
}
+void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
+ DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
+ FlushPendingSurrogate();
+ // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
+ pending_surrogate_ = lead_surrogate;
+}
+
+
+void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
+ DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
+ if (pending_surrogate_ != kNoPendingSurrogate) {
+ uc16 lead_surrogate = pending_surrogate_;
+ pending_surrogate_ = kNoPendingSurrogate;
+ DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
+ uc32 combined =
+ unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
+ if (NeedsDesugaringForIgnoreCase(combined)) {
+ AddCharacterClassForDesugaring(combined);
+ } else {
+ ZoneList<uc16> surrogate_pair(2, zone());
+ surrogate_pair.Add(lead_surrogate, zone());
+ surrogate_pair.Add(trail_surrogate, zone());
+ RegExpAtom* atom =
+ new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
+ AddAtom(atom);
+ }
+ } else {
+ pending_surrogate_ = trail_surrogate;
+ FlushPendingSurrogate();
+ }
+}
+
+
+void RegExpBuilder::FlushPendingSurrogate() {
+ if (pending_surrogate_ != kNoPendingSurrogate) {
+ DCHECK(unicode());
+ uc32 c = pending_surrogate_;
+ pending_surrogate_ = kNoPendingSurrogate;
+ AddCharacterClassForDesugaring(c);
+ }
+}
+
+
void RegExpBuilder::FlushCharacters() {
+ FlushPendingSurrogate();
pending_empty_ = false;
if (characters_ != NULL) {
RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
@@ -1053,31 +1245,61 @@ void RegExpBuilder::FlushText() {
void RegExpBuilder::AddCharacter(uc16 c) {
+ FlushPendingSurrogate();
pending_empty_ = false;
- if (characters_ == NULL) {
- characters_ = new (zone()) ZoneList<uc16>(4, zone());
+ if (NeedsDesugaringForIgnoreCase(c)) {
+ AddCharacterClassForDesugaring(c);
+ } else {
+ if (characters_ == NULL) {
+ characters_ = new (zone()) ZoneList<uc16>(4, zone());
+ }
+ characters_->Add(c, zone());
+ LAST(ADD_CHAR);
}
- characters_->Add(c, zone());
- LAST(ADD_CHAR);
}
void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
- ZoneList<uc16> surrogate_pair(2, zone());
- surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone());
- surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone());
- RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
- AddAtom(atom);
+ DCHECK(unicode());
+ AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
+ AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
+ } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
+ AddLeadSurrogate(c);
+ } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
+ AddTrailSurrogate(c);
} else {
AddCharacter(static_cast<uc16>(c));
}
}
+void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
+ // A lead or trail surrogate parsed via escape sequence will not
+ // pair up with any preceding lead or following trail surrogate.
+ FlushPendingSurrogate();
+ AddUnicodeCharacter(character);
+ FlushPendingSurrogate();
+}
void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
+void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
+ if (NeedsDesugaringForUnicode(cc)) {
+ // With /u, character class needs to be desugared, so it
+ // must be a standalone term instead of being part of a RegExpText.
+ AddTerm(cc);
+ } else {
+ AddAtom(cc);
+ }
+}
+
+void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
+ AddTerm(new (zone()) RegExpCharacterClass(
+ CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
+}
+
+
void RegExpBuilder::AddAtom(RegExpTree* term) {
if (term->IsEmpty()) {
AddEmpty();
@@ -1094,6 +1316,13 @@ void RegExpBuilder::AddAtom(RegExpTree* term) {
}
+void RegExpBuilder::AddTerm(RegExpTree* term) {
+ FlushText();
+ terms_.Add(term, zone());
+ LAST(ADD_ATOM);
+}
+
+
void RegExpBuilder::AddAssertion(RegExpTree* assert) {
FlushText();
terms_.Add(assert, zone());
@@ -1121,6 +1350,47 @@ void RegExpBuilder::FlushTerms() {
}
+bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
+ if (!unicode()) return false;
+ switch (cc->standard_type()) {
+ case 's': // white space
+ case 'w': // ASCII word character
+ case 'd': // ASCII digit
+ return false; // These characters do not need desugaring.
+ default:
+ break;
+ }
+ ZoneList<CharacterRange>* ranges = cc->ranges(zone());
+ CharacterRange::Canonicalize(ranges);
+ for (int i = ranges->length() - 1; i >= 0; i--) {
+ uc32 from = ranges->at(i).from();
+ uc32 to = ranges->at(i).to();
+ // Check for non-BMP characters.
+ if (to >= kNonBmpStart) return true;
+ // Check for lone surrogates.
+ if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
+ }
+ return false;
+}
+
+
+bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
+#ifdef V8_I18N_SUPPORT
+ if (unicode() && ignore_case()) {
+ USet* set = uset_open(c, c);
+ uset_closeOver(set, USET_CASE_INSENSITIVE);
+ uset_removeAllStrings(set);
+ bool result = uset_size(set) > 1;
+ uset_close(set);
+ return result;
+ }
+ // In the case where ICU is not included, we act as if the unicode flag is
+ // not set, and do not desugar.
+#endif // V8_I18N_SUPPORT
+ return false;
+}
+
+
RegExpTree* RegExpBuilder::ToRegExp() {
FlushTerms();
int num_alternatives = alternatives_.length();
@@ -1129,12 +1399,12 @@ RegExpTree* RegExpBuilder::ToRegExp() {
return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
}
-
-void RegExpBuilder::AddQuantifierToAtom(
+bool RegExpBuilder::AddQuantifierToAtom(
int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
+ FlushPendingSurrogate();
if (pending_empty_) {
pending_empty_ = false;
- return;
+ return true;
}
RegExpTree* atom;
if (characters_ != NULL) {
@@ -1157,23 +1427,26 @@ void RegExpBuilder::AddQuantifierToAtom(
} else if (terms_.length() > 0) {
DCHECK(last_added_ == ADD_ATOM);
atom = terms_.RemoveLast();
+ // With /u, lookarounds are not quantifiable.
+ if (unicode() && atom->IsLookaround()) return false;
if (atom->max_match() == 0) {
// Guaranteed to only match an empty string.
LAST(ADD_TERM);
if (min == 0) {
- return;
+ return true;
}
terms_.Add(atom, zone());
- return;
+ return true;
}
} else {
// Only call immediately after adding an atom or character!
UNREACHABLE();
- return;
+ return false;
}
terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
zone());
LAST(ADD_TERM);
+ return true;
}
} // namespace internal