diff options
Diffstat (limited to 'deps/v8/src/regexp/regexp-parser.cc')
-rw-r--r-- | deps/v8/src/regexp/regexp-parser.cc | 523 |
1 files changed, 398 insertions, 125 deletions
diff --git a/deps/v8/src/regexp/regexp-parser.cc b/deps/v8/src/regexp/regexp-parser.cc index fa8900342c..2fe6fde82a 100644 --- a/deps/v8/src/regexp/regexp-parser.cc +++ b/deps/v8/src/regexp/regexp-parser.cc @@ -8,27 +8,32 @@ #include "src/factory.h" #include "src/isolate.h" #include "src/objects-inl.h" +#include "src/ostreams.h" #include "src/regexp/jsregexp.h" #include "src/utils.h" +#ifdef V8_I18N_SUPPORT +#include "unicode/uset.h" +#endif // V8_I18N_SUPPORT + namespace v8 { namespace internal { RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, - bool multiline, bool unicode, Isolate* isolate, - Zone* zone) + JSRegExp::Flags flags, Isolate* isolate, Zone* zone) : isolate_(isolate), zone_(zone), error_(error), captures_(NULL), in_(in), current_(kEndMarker), + ignore_case_(flags & JSRegExp::kIgnoreCase), + multiline_(flags & JSRegExp::kMultiline), + unicode_(flags & JSRegExp::kUnicode), next_pos_(0), captures_started_(0), capture_count_(0), has_more_(true), - multiline_(multiline), - unicode_(unicode), simple_(false), contains_anchor_(false), is_scanned_for_captures_(false), @@ -36,10 +41,28 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, Advance(); } +template <bool update_position> +inline uc32 RegExpParser::ReadNext() { + int position = next_pos_; + uc32 c0 = in()->Get(position); + position++; + // Read the whole surrogate pair in case of unicode flag, if possible. + if (unicode() && position < in()->length() && + unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { + uc16 c1 = in()->Get(position); + if (unibrow::Utf16::IsTrailSurrogate(c1)) { + c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); + position++; + } + } + if (update_position) next_pos_ = position; + return c0; +} + uc32 RegExpParser::Next() { if (has_next()) { - return in()->Get(next_pos_); + return ReadNext<false>(); } else { return kEndMarker; } @@ -47,25 +70,14 @@ uc32 RegExpParser::Next() { void RegExpParser::Advance() { - if (next_pos_ < in()->length()) { + if (has_next()) { StackLimitCheck check(isolate()); if (check.HasOverflowed()) { ReportError(CStrVector(Isolate::kStackOverflowMessage)); } else if (zone()->excess_allocation()) { ReportError(CStrVector("Regular expression too large")); } else { - current_ = in()->Get(next_pos_); - next_pos_++; - // Read the whole surrogate pair in case of unicode flag, if possible. - if (unicode_ && next_pos_ < in()->length() && - unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) { - uc16 trail = in()->Get(next_pos_); - if (unibrow::Utf16::IsTrailSurrogate(trail)) { - current_ = unibrow::Utf16::CombineSurrogatePair( - static_cast<uc16>(current_), trail); - next_pos_++; - } - } + current_ = ReadNext<true>(); } } else { current_ = kEndMarker; @@ -92,11 +104,28 @@ void RegExpParser::Advance(int dist) { bool RegExpParser::simple() { return simple_; } - -bool RegExpParser::IsSyntaxCharacter(uc32 c) { - return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' || - c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' || - c == '{' || c == '}' || c == '|'; +bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { + switch (c) { + case '^': + case '$': + case '\\': + case '.': + case '*': + case '+': + case '?': + case '(': + case ')': + case '[': + case ']': + case '{': + case '}': + case '|': + case '/': + return true; + default: + break; + } + return false; } @@ -142,7 +171,7 @@ RegExpTree* RegExpParser::ParsePattern() { RegExpTree* RegExpParser::ParseDisjunction() { // Used to store current state while parsing subexpressions. RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, - zone()); + ignore_case(), unicode(), zone()); RegExpParserState* state = &initial_state; // Cache the builder in a local variable for quick access. RegExpBuilder* builder = initial_state.builder(); @@ -151,14 +180,14 @@ RegExpTree* RegExpParser::ParseDisjunction() { case kEndMarker: if (state->IsSubexpression()) { // Inside a parenthesized group when hitting end of input. - ReportError(CStrVector("Unterminated group") CHECK_FAILED); + return ReportError(CStrVector("Unterminated group")); } DCHECK_EQ(INITIAL, state->group_type()); // Parsing completed successfully. return builder->ToRegExp(); case ')': { if (!state->IsSubexpression()) { - ReportError(CStrVector("Unmatched ')'") CHECK_FAILED); + return ReportError(CStrVector("Unmatched ')'")); } DCHECK_NE(INITIAL, state->group_type()); @@ -206,7 +235,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { return ReportError(CStrVector("Nothing to repeat")); case '^': { Advance(); - if (multiline_) { + if (multiline()) { builder->AddAssertion( new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); } else { @@ -219,8 +248,8 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '$': { Advance(); RegExpAssertion::AssertionType assertion_type = - multiline_ ? RegExpAssertion::END_OF_LINE - : RegExpAssertion::END_OF_INPUT; + multiline() ? RegExpAssertion::END_OF_LINE + : RegExpAssertion::END_OF_INPUT; builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); continue; } @@ -230,8 +259,9 @@ RegExpTree* RegExpParser::ParseDisjunction() { ZoneList<CharacterRange>* ranges = new (zone()) ZoneList<CharacterRange>(2, zone()); CharacterRange::AddClassEscape('.', ranges, zone()); - RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); - builder->AddAtom(atom); + RegExpCharacterClass* cc = + new (zone()) RegExpCharacterClass(ranges, false); + builder->AddCharacterClass(cc); break; } case '(': { @@ -265,25 +295,25 @@ RegExpTree* RegExpParser::ParseDisjunction() { } // Fall through. default: - ReportError(CStrVector("Invalid group") CHECK_FAILED); - break; + return ReportError(CStrVector("Invalid group")); } Advance(2); } else { if (captures_started_ >= kMaxCaptures) { - ReportError(CStrVector("Too many captures") CHECK_FAILED); + return ReportError(CStrVector("Too many captures")); } captures_started_++; } // Store current state and begin new disjunction parsing. state = new (zone()) RegExpParserState( - state, subexpr_type, lookaround_type, captures_started_, zone()); + state, subexpr_type, lookaround_type, captures_started_, + ignore_case(), unicode(), zone()); builder = state->builder(); continue; } case '[': { - RegExpTree* atom = ParseCharacterClass(CHECK_FAILED); - builder->AddAtom(atom); + RegExpTree* cc = ParseCharacterClass(CHECK_FAILED); + builder->AddCharacterClass(cc->AsCharacterClass()); break; } // Atom :: @@ -318,8 +348,26 @@ RegExpTree* RegExpParser::ParseDisjunction() { ZoneList<CharacterRange>* ranges = new (zone()) ZoneList<CharacterRange>(2, zone()); CharacterRange::AddClassEscape(c, ranges, zone()); - RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); - builder->AddAtom(atom); + RegExpCharacterClass* cc = + new (zone()) RegExpCharacterClass(ranges, false); + builder->AddCharacterClass(cc); + break; + } + case 'p': + case 'P': { + uc32 p = Next(); + Advance(2); + if (unicode()) { + ZoneList<CharacterRange>* ranges = ParsePropertyClass(); + if (ranges == nullptr) { + return ReportError(CStrVector("Invalid property name")); + } + RegExpCharacterClass* cc = + new (zone()) RegExpCharacterClass(ranges, p == 'P'); + builder->AddCharacterClass(cc); + } else { + builder->AddCharacter(p); + } break; } case '1': @@ -332,7 +380,8 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '8': case '9': { int index = 0; - if (ParseBackReferenceIndex(&index)) { + bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED); + if (is_backref) { if (state->IsInsideCaptureGroup(index)) { // The back reference is inside the capture group it refers to. // Nothing can possibly have been captured yet, so we use empty @@ -347,24 +396,25 @@ RegExpTree* RegExpParser::ParseDisjunction() { } break; } + // With /u, no identity escapes except for syntax characters + // are allowed. Otherwise, all identity escapes are allowed. + if (unicode()) { + return ReportError(CStrVector("Invalid escape")); + } uc32 first_digit = Next(); if (first_digit == '8' || first_digit == '9') { - // If the 'u' flag is present, only syntax characters can be - // escaped, - // no other identity escapes are allowed. If the 'u' flag is not - // present, all identity escapes are allowed. - if (!unicode_) { - builder->AddCharacter(first_digit); - Advance(2); - } else { - return ReportError(CStrVector("Invalid escape")); - } + builder->AddCharacter(first_digit); + Advance(2); break; } } // FALLTHROUGH case '0': { Advance(); + if (unicode() && Next() >= '0' && Next() <= '9') { + // With /u, decimal escape with leading 0 are not parsed as octal. + return ReportError(CStrVector("Invalid decimal escape")); + } uc32 octal = ParseOctalLiteral(); builder->AddCharacter(octal); break; @@ -402,6 +452,10 @@ RegExpTree* RegExpParser::ParseDisjunction() { // This is outside the specification. We match JSC in // reading the backslash as a literal character instead // of as starting an escape. + if (unicode()) { + // With /u, invalid escapes are not treated as identity escapes. + return ReportError(CStrVector("Invalid unicode escape")); + } builder->AddCharacter('\\'); } else { Advance(2); @@ -414,11 +468,10 @@ RegExpTree* RegExpParser::ParseDisjunction() { uc32 value; if (ParseHexEscape(2, &value)) { builder->AddCharacter(value); - } else if (!unicode_) { + } else if (!unicode()) { builder->AddCharacter('x'); } else { - // If the 'u' flag is present, invalid escapes are not treated as - // identity escapes. + // With /u, invalid escapes are not treated as identity escapes. return ReportError(CStrVector("Invalid escape")); } break; @@ -427,24 +480,20 @@ RegExpTree* RegExpParser::ParseDisjunction() { Advance(2); uc32 value; if (ParseUnicodeEscape(&value)) { - builder->AddUnicodeCharacter(value); - } else if (!unicode_) { + builder->AddEscapedUnicodeCharacter(value); + } else if (!unicode()) { builder->AddCharacter('u'); } else { - // If the 'u' flag is present, invalid escapes are not treated as - // identity escapes. + // With /u, invalid escapes are not treated as identity escapes. return ReportError(CStrVector("Invalid unicode escape")); } break; } default: Advance(); - // If the 'u' flag is present, only syntax characters can be - // escaped, no - // other identity escapes are allowed. If the 'u' flag is not - // present, - // all identity escapes are allowed. - if (!unicode_ || IsSyntaxCharacter(current())) { + // With /u, no identity escapes except for syntax characters + // are allowed. Otherwise, all identity escapes are allowed. + if (!unicode() || IsSyntaxCharacterOrSlash(current())) { builder->AddCharacter(current()); Advance(); } else { @@ -456,10 +505,16 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '{': { int dummy; if (ParseIntervalQuantifier(&dummy, &dummy)) { - ReportError(CStrVector("Nothing to repeat") CHECK_FAILED); + return ReportError(CStrVector("Nothing to repeat")); } // fallthrough } + case '}': + case ']': + if (unicode()) { + return ReportError(CStrVector("Lone quantifier brackets")); + } + // fallthrough default: builder->AddUnicodeCharacter(current()); Advance(); @@ -492,13 +547,15 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '{': if (ParseIntervalQuantifier(&min, &max)) { if (max < min) { - ReportError(CStrVector("numbers out of order in {} quantifier.") - CHECK_FAILED); + return ReportError( + CStrVector("numbers out of order in {} quantifier")); } break; - } else { - continue; + } else if (unicode()) { + // With /u, incomplete quantifiers are not allowed. + return ReportError(CStrVector("Incomplete quantifier")); } + continue; default: continue; } @@ -511,7 +568,9 @@ RegExpTree* RegExpParser::ParseDisjunction() { quantifier_type = RegExpQuantifier::POSSESSIVE; Advance(); } - builder->AddQuantifierToAtom(min, max, quantifier_type); + if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { + return ReportError(CStrVector("Invalid quantifier")); + } } } @@ -740,12 +799,12 @@ bool RegExpParser::ParseHexEscape(int length, uc32* value) { return true; } - +// This parses RegExpUnicodeEscapeSequence as described in ECMA262. bool RegExpParser::ParseUnicodeEscape(uc32* value) { // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are // allowed). In the latter case, the number of hex digits between { } is // arbitrary. \ and u have already been read. - if (current() == '{' && unicode_) { + if (current() == '{' && unicode()) { int start = position(); Advance(); if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { @@ -758,9 +817,75 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) { return false; } // \u but no {, or \u{...} escapes not allowed. - return ParseHexEscape(4, value); + bool result = ParseHexEscape(4, value); + if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) && + current() == '\\') { + // Attempt to read trail surrogate. + int start = position(); + if (Next() == 'u') { + Advance(2); + uc32 trail; + if (ParseHexEscape(4, &trail) && + unibrow::Utf16::IsTrailSurrogate(trail)) { + *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value), + static_cast<uc16>(trail)); + return true; + } + } + Reset(start); + } + return result; } +ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() { +#ifdef V8_I18N_SUPPORT + char property_name[3]; + memset(property_name, 0, sizeof(property_name)); + if (current() == '{') { + Advance(); + if (current() < 'A' || current() > 'Z') return nullptr; + property_name[0] = static_cast<char>(current()); + Advance(); + if (current() >= 'a' && current() <= 'z') { + property_name[1] = static_cast<char>(current()); + Advance(); + } + if (current() != '}') return nullptr; + } else if (current() >= 'A' && current() <= 'Z') { + property_name[0] = static_cast<char>(current()); + } else { + return nullptr; + } + Advance(); + + int32_t category = + u_getPropertyValueEnum(UCHAR_GENERAL_CATEGORY_MASK, property_name); + if (category == UCHAR_INVALID_CODE) return nullptr; + + USet* set = uset_openEmpty(); + UErrorCode ec = U_ZERO_ERROR; + uset_applyIntPropertyValue(set, UCHAR_GENERAL_CATEGORY_MASK, category, &ec); + ZoneList<CharacterRange>* ranges = nullptr; + if (ec == U_ZERO_ERROR && !uset_isEmpty(set)) { + uset_removeAllStrings(set); + int item_count = uset_getItemCount(set); + ranges = new (zone()) ZoneList<CharacterRange>(item_count, zone()); + int item_result = 0; + for (int i = 0; i < item_count; i++) { + uc32 start = 0; + uc32 end = 0; + item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec); + ranges->Add(CharacterRange::Range(start, end), zone()); + } + DCHECK_EQ(U_ZERO_ERROR, ec); + DCHECK_EQ(0, item_result); + } + uset_close(set); + return ranges; +#else // V8_I18N_SUPPORT + return nullptr; +#endif // V8_I18N_SUPPORT +} bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { uc32 x = 0; @@ -809,20 +934,35 @@ uc32 RegExpParser::ParseClassCharacterEscape() { case 'c': { uc32 controlLetter = Next(); uc32 letter = controlLetter & ~('A' ^ 'a'); - // For compatibility with JSC, inside a character class - // we also accept digits and underscore as control characters. - if ((controlLetter >= '0' && controlLetter <= '9') || - controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) { + // For compatibility with JSC, inside a character class. We also accept + // digits and underscore as control characters, unless with /u. + if (letter >= 'A' && letter <= 'Z') { Advance(2); // Control letters mapped to ASCII control characters in the range // 0x00-0x1f. return controlLetter & 0x1f; } + if (unicode()) { + // With /u, invalid escapes are not treated as identity escapes. + ReportError(CStrVector("Invalid class escape")); + return 0; + } + if ((controlLetter >= '0' && controlLetter <= '9') || + controlLetter == '_') { + Advance(2); + return controlLetter & 0x1f; + } // We match JSC in reading the backslash as a literal // character instead of as starting an escape. return '\\'; } case '0': + // With /u, \0 is interpreted as NUL if not followed by another digit. + if (unicode() && !(Next() >= '0' && Next() <= '9')) { + Advance(); + return 0; + } + // Fall through. case '1': case '2': case '3': @@ -833,43 +973,43 @@ uc32 RegExpParser::ParseClassCharacterEscape() { // For compatibility, we interpret a decimal escape that isn't // a back reference (and therefore either \0 or not valid according // to the specification) as a 1..3 digit octal character code. + if (unicode()) { + // With /u, decimal escape is not interpreted as octal character code. + ReportError(CStrVector("Invalid class escape")); + return 0; + } return ParseOctalLiteral(); case 'x': { Advance(); uc32 value; - if (ParseHexEscape(2, &value)) { - return value; + if (ParseHexEscape(2, &value)) return value; + if (unicode()) { + // With /u, invalid escapes are not treated as identity escapes. + ReportError(CStrVector("Invalid escape")); + return 0; } - if (!unicode_) { - // If \x is not followed by a two-digit hexadecimal, treat it - // as an identity escape. - return 'x'; - } - // If the 'u' flag is present, invalid escapes are not treated as - // identity escapes. - ReportError(CStrVector("Invalid escape")); - return 0; + // If \x is not followed by a two-digit hexadecimal, treat it + // as an identity escape. + return 'x'; } case 'u': { Advance(); uc32 value; - if (ParseUnicodeEscape(&value)) { - return value; - } - if (!unicode_) { - return 'u'; + if (ParseUnicodeEscape(&value)) return value; + if (unicode()) { + // With /u, invalid escapes are not treated as identity escapes. + ReportError(CStrVector("Invalid unicode escape")); + return 0; } - // If the 'u' flag is present, invalid escapes are not treated as - // identity escapes. - ReportError(CStrVector("Invalid unicode escape")); - return 0; + // If \u is not followed by a two-digit hexadecimal, treat it + // as an identity escape. + return 'u'; } default: { uc32 result = current(); - // If the 'u' flag is present, only syntax characters can be escaped, no - // other identity escapes are allowed. If the 'u' flag is not present, all - // identity escapes are allowed. - if (!unicode_ || IsSyntaxCharacter(result)) { + // With /u, no identity escapes except for syntax characters and '-' are + // allowed. Otherwise, all identity escapes are allowed. + if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { Advance(); return result; } @@ -899,13 +1039,13 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { case kEndMarker: return ReportError(CStrVector("\\ at end of pattern")); default: - uc32 c = ParseClassCharacterEscape(CHECK_FAILED); - return CharacterRange::Singleton(c); + first = ParseClassCharacterEscape(CHECK_FAILED); } } else { Advance(); - return CharacterRange::Singleton(first); } + + return CharacterRange::Singleton(first); } @@ -927,6 +1067,7 @@ static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, RegExpTree* RegExpParser::ParseCharacterClass() { static const char* kUnterminated = "Unterminated character class"; + static const char* kRangeInvalid = "Invalid character class"; static const char* kRangeOutOfOrder = "Range out of order in character class"; DCHECK_EQ(current(), '['); @@ -956,13 +1097,18 @@ RegExpTree* RegExpParser::ParseCharacterClass() { CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED); if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { // Either end is an escaped character class. Treat the '-' verbatim. + if (unicode()) { + // ES2015 21.2.2.15.1 step 1. + return ReportError(CStrVector(kRangeInvalid)); + } AddRangeOrEscape(ranges, char_class, first, zone()); ranges->Add(CharacterRange::Singleton('-'), zone()); AddRangeOrEscape(ranges, char_class_2, next, zone()); continue; } + // ES2015 21.2.2.15.1 step 6. if (first.from() > next.to()) { - return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED); + return ReportError(CStrVector(kRangeOutOfOrder)); } ranges->Add(CharacterRange::Range(first.from(), next.to()), zone()); } else { @@ -970,7 +1116,7 @@ RegExpTree* RegExpParser::ParseCharacterClass() { } } if (!has_more()) { - return ReportError(CStrVector(kUnterminated) CHECK_FAILED); + return ReportError(CStrVector(kUnterminated)); } Advance(); if (ranges->length() == 0) { @@ -985,10 +1131,10 @@ RegExpTree* RegExpParser::ParseCharacterClass() { bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, - FlatStringReader* input, bool multiline, - bool unicode, RegExpCompileData* result) { + FlatStringReader* input, JSRegExp::Flags flags, + RegExpCompileData* result) { DCHECK(result != NULL); - RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone); + RegExpParser parser(input, &result->error, flags, isolate, zone); RegExpTree* tree = parser.ParsePattern(); if (parser.failed()) { DCHECK(tree == NULL); @@ -1010,11 +1156,13 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, return !parser.failed(); } - -RegExpBuilder::RegExpBuilder(Zone* zone) +RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode) : zone_(zone), pending_empty_(false), + ignore_case_(ignore_case), + unicode_(unicode), characters_(NULL), + pending_surrogate_(kNoPendingSurrogate), terms_(), alternatives_() #ifdef DEBUG @@ -1025,7 +1173,51 @@ RegExpBuilder::RegExpBuilder(Zone* zone) } +void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) { + DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); + FlushPendingSurrogate(); + // Hold onto the lead surrogate, waiting for a trail surrogate to follow. + pending_surrogate_ = lead_surrogate; +} + + +void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { + DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); + if (pending_surrogate_ != kNoPendingSurrogate) { + uc16 lead_surrogate = pending_surrogate_; + pending_surrogate_ = kNoPendingSurrogate; + DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); + uc32 combined = + unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate); + if (NeedsDesugaringForIgnoreCase(combined)) { + AddCharacterClassForDesugaring(combined); + } else { + ZoneList<uc16> surrogate_pair(2, zone()); + surrogate_pair.Add(lead_surrogate, zone()); + surrogate_pair.Add(trail_surrogate, zone()); + RegExpAtom* atom = + new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); + AddAtom(atom); + } + } else { + pending_surrogate_ = trail_surrogate; + FlushPendingSurrogate(); + } +} + + +void RegExpBuilder::FlushPendingSurrogate() { + if (pending_surrogate_ != kNoPendingSurrogate) { + DCHECK(unicode()); + uc32 c = pending_surrogate_; + pending_surrogate_ = kNoPendingSurrogate; + AddCharacterClassForDesugaring(c); + } +} + + void RegExpBuilder::FlushCharacters() { + FlushPendingSurrogate(); pending_empty_ = false; if (characters_ != NULL) { RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); @@ -1053,31 +1245,61 @@ void RegExpBuilder::FlushText() { void RegExpBuilder::AddCharacter(uc16 c) { + FlushPendingSurrogate(); pending_empty_ = false; - if (characters_ == NULL) { - characters_ = new (zone()) ZoneList<uc16>(4, zone()); + if (NeedsDesugaringForIgnoreCase(c)) { + AddCharacterClassForDesugaring(c); + } else { + if (characters_ == NULL) { + characters_ = new (zone()) ZoneList<uc16>(4, zone()); + } + characters_->Add(c, zone()); + LAST(ADD_CHAR); } - characters_->Add(c, zone()); - LAST(ADD_CHAR); } void RegExpBuilder::AddUnicodeCharacter(uc32 c) { if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { - ZoneList<uc16> surrogate_pair(2, zone()); - surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone()); - surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone()); - RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); - AddAtom(atom); + DCHECK(unicode()); + AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); + AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); + } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { + AddLeadSurrogate(c); + } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { + AddTrailSurrogate(c); } else { AddCharacter(static_cast<uc16>(c)); } } +void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) { + // A lead or trail surrogate parsed via escape sequence will not + // pair up with any preceding lead or following trail surrogate. + FlushPendingSurrogate(); + AddUnicodeCharacter(character); + FlushPendingSurrogate(); +} void RegExpBuilder::AddEmpty() { pending_empty_ = true; } +void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { + if (NeedsDesugaringForUnicode(cc)) { + // With /u, character class needs to be desugared, so it + // must be a standalone term instead of being part of a RegExpText. + AddTerm(cc); + } else { + AddAtom(cc); + } +} + +void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) { + AddTerm(new (zone()) RegExpCharacterClass( + CharacterRange::List(zone(), CharacterRange::Singleton(c)), false)); +} + + void RegExpBuilder::AddAtom(RegExpTree* term) { if (term->IsEmpty()) { AddEmpty(); @@ -1094,6 +1316,13 @@ void RegExpBuilder::AddAtom(RegExpTree* term) { } +void RegExpBuilder::AddTerm(RegExpTree* term) { + FlushText(); + terms_.Add(term, zone()); + LAST(ADD_ATOM); +} + + void RegExpBuilder::AddAssertion(RegExpTree* assert) { FlushText(); terms_.Add(assert, zone()); @@ -1121,6 +1350,47 @@ void RegExpBuilder::FlushTerms() { } +bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) { + if (!unicode()) return false; + switch (cc->standard_type()) { + case 's': // white space + case 'w': // ASCII word character + case 'd': // ASCII digit + return false; // These characters do not need desugaring. + default: + break; + } + ZoneList<CharacterRange>* ranges = cc->ranges(zone()); + CharacterRange::Canonicalize(ranges); + for (int i = ranges->length() - 1; i >= 0; i--) { + uc32 from = ranges->at(i).from(); + uc32 to = ranges->at(i).to(); + // Check for non-BMP characters. + if (to >= kNonBmpStart) return true; + // Check for lone surrogates. + if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; + } + return false; +} + + +bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) { +#ifdef V8_I18N_SUPPORT + if (unicode() && ignore_case()) { + USet* set = uset_open(c, c); + uset_closeOver(set, USET_CASE_INSENSITIVE); + uset_removeAllStrings(set); + bool result = uset_size(set) > 1; + uset_close(set); + return result; + } + // In the case where ICU is not included, we act as if the unicode flag is + // not set, and do not desugar. +#endif // V8_I18N_SUPPORT + return false; +} + + RegExpTree* RegExpBuilder::ToRegExp() { FlushTerms(); int num_alternatives = alternatives_.length(); @@ -1129,12 +1399,12 @@ RegExpTree* RegExpBuilder::ToRegExp() { return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); } - -void RegExpBuilder::AddQuantifierToAtom( +bool RegExpBuilder::AddQuantifierToAtom( int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { + FlushPendingSurrogate(); if (pending_empty_) { pending_empty_ = false; - return; + return true; } RegExpTree* atom; if (characters_ != NULL) { @@ -1157,23 +1427,26 @@ void RegExpBuilder::AddQuantifierToAtom( } else if (terms_.length() > 0) { DCHECK(last_added_ == ADD_ATOM); atom = terms_.RemoveLast(); + // With /u, lookarounds are not quantifiable. + if (unicode() && atom->IsLookaround()) return false; if (atom->max_match() == 0) { // Guaranteed to only match an empty string. LAST(ADD_TERM); if (min == 0) { - return; + return true; } terms_.Add(atom, zone()); - return; + return true; } } else { // Only call immediately after adding an atom or character! UNREACHABLE(); - return; + return false; } terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), zone()); LAST(ADD_TERM); + return true; } } // namespace internal |