1 files changed, 398 insertions, 125 deletions
diff --git a/deps/v8/src/regexp/regexp-parser.cc b/deps/v8/src/regexp/regexp-parser.cc
index fa8900342c..2fe6fde82a 100644
--- a/deps/v8/src/regexp/regexp-parser.cc
+++ b/deps/v8/src/regexp/regexp-parser.cc
@@ -8,27 +8,32 @@
 #include "src/factory.h"
 #include "src/isolate.h"
 #include "src/objects-inl.h"
+#include "src/ostreams.h"
 #include "src/regexp/jsregexp.h"
 #include "src/utils.h"
 
+#ifdef V8_I18N_SUPPORT
+#include "unicode/uset.h"
+#endif  // V8_I18N_SUPPORT
+
 namespace v8 {
 namespace internal {
 
 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
-                           bool multiline, bool unicode, Isolate* isolate,
-                           Zone* zone)
+                           JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
     : isolate_(isolate),
       zone_(zone),
       error_(error),
       captures_(NULL),
       in_(in),
       current_(kEndMarker),
+      ignore_case_(flags & JSRegExp::kIgnoreCase),
+      multiline_(flags & JSRegExp::kMultiline),
+      unicode_(flags & JSRegExp::kUnicode),
       next_pos_(0),
       captures_started_(0),
       capture_count_(0),
       has_more_(true),
-      multiline_(multiline),
-      unicode_(unicode),
       simple_(false),
       contains_anchor_(false),
       is_scanned_for_captures_(false),
@@ -36,10 +41,28 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
   Advance();
 }
 
+template <bool update_position>
+inline uc32 RegExpParser::ReadNext() {
+  int position = next_pos_;
+  uc32 c0 = in()->Get(position);
+  position++;
+  // Read the whole surrogate pair in case of unicode flag, if possible.
+  if (unicode() && position < in()->length() &&
+      unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
+    uc16 c1 = in()->Get(position);
+    if (unibrow::Utf16::IsTrailSurrogate(c1)) {
+      c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
+      position++;
+    }
+  }
+  if (update_position) next_pos_ = position;
+  return c0;
+}
+
 
 uc32 RegExpParser::Next() {
   if (has_next()) {
-    return in()->Get(next_pos_);
+    return ReadNext<false>();
   } else {
     return kEndMarker;
   }
@@ -47,25 +70,14 @@ uc32 RegExpParser::Next() {
 
 
 void RegExpParser::Advance() {
-  if (next_pos_ < in()->length()) {
+  if (has_next()) {
     StackLimitCheck check(isolate());
     if (check.HasOverflowed()) {
       ReportError(CStrVector(Isolate::kStackOverflowMessage));
     } else if (zone()->excess_allocation()) {
       ReportError(CStrVector("Regular expression too large"));
     } else {
-      current_ = in()->Get(next_pos_);
-      next_pos_++;
-      // Read the whole surrogate pair in case of unicode flag, if possible.
-      if (unicode_ && next_pos_ < in()->length() &&
-          unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) {
-        uc16 trail = in()->Get(next_pos_);
-        if (unibrow::Utf16::IsTrailSurrogate(trail)) {
-          current_ = unibrow::Utf16::CombineSurrogatePair(
-              static_cast<uc16>(current_), trail);
-          next_pos_++;
-        }
-      }
+      current_ = ReadNext<true>();
     }
   } else {
     current_ = kEndMarker;
@@ -92,11 +104,28 @@ void RegExpParser::Advance(int dist) {
 
 bool RegExpParser::simple() { return simple_; }
 
-
-bool RegExpParser::IsSyntaxCharacter(uc32 c) {
-  return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||
-         c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||
-         c == '{' || c == '}' || c == '|';
+bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
+  switch (c) {
+    case '^':
+    case '$':
+    case '\\':
+    case '.':
+    case '*':
+    case '+':
+    case '?':
+    case '(':
+    case ')':
+    case '[':
+    case ']':
+    case '{':
+    case '}':
+    case '|':
+    case '/':
+      return true;
+    default:
+      break;
+  }
+  return false;
 }
 
 
@@ -142,7 +171,7 @@ RegExpTree* RegExpParser::ParsePattern() {
 RegExpTree* RegExpParser::ParseDisjunction() {
   // Used to store current state while parsing subexpressions.
   RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
-                                  zone());
+                                  ignore_case(), unicode(), zone());
   RegExpParserState* state = &initial_state;
   // Cache the builder in a local variable for quick access.
   RegExpBuilder* builder = initial_state.builder();
@@ -151,14 +180,14 @@ RegExpTree* RegExpParser::ParseDisjunction() {
       case kEndMarker:
         if (state->IsSubexpression()) {
           // Inside a parenthesized group when hitting end of input.
-          ReportError(CStrVector("Unterminated group") CHECK_FAILED);
+          return ReportError(CStrVector("Unterminated group"));
         }
         DCHECK_EQ(INITIAL, state->group_type());
         // Parsing completed successfully.
         return builder->ToRegExp();
       case ')': {
         if (!state->IsSubexpression()) {
-          ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);
+          return ReportError(CStrVector("Unmatched ')'"));
         }
         DCHECK_NE(INITIAL, state->group_type());
 
@@ -206,7 +235,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
         return ReportError(CStrVector("Nothing to repeat"));
       case '^': {
         Advance();
-        if (multiline_) {
+        if (multiline()) {
           builder->AddAssertion(
               new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
         } else {
@@ -219,8 +248,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {
       case '$': {
         Advance();
         RegExpAssertion::AssertionType assertion_type =
-            multiline_ ? RegExpAssertion::END_OF_LINE
-                       : RegExpAssertion::END_OF_INPUT;
+            multiline() ? RegExpAssertion::END_OF_LINE
+                        : RegExpAssertion::END_OF_INPUT;
         builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
         continue;
       }
@@ -230,8 +259,9 @@ RegExpTree* RegExpParser::ParseDisjunction() {
         ZoneList<CharacterRange>* ranges =
             new (zone()) ZoneList<CharacterRange>(2, zone());
         CharacterRange::AddClassEscape('.', ranges, zone());
-        RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
-        builder->AddAtom(atom);
+        RegExpCharacterClass* cc =
+            new (zone()) RegExpCharacterClass(ranges, false);
+        builder->AddCharacterClass(cc);
         break;
       }
       case '(': {
@@ -265,25 +295,25 @@ RegExpTree* RegExpParser::ParseDisjunction() {
               }
             // Fall through.
             default:
-              ReportError(CStrVector("Invalid group") CHECK_FAILED);
-              break;
+              return ReportError(CStrVector("Invalid group"));
           }
           Advance(2);
         } else {
           if (captures_started_ >= kMaxCaptures) {
-            ReportError(CStrVector("Too many captures") CHECK_FAILED);
+            return ReportError(CStrVector("Too many captures"));
           }
           captures_started_++;
         }
         // Store current state and begin new disjunction parsing.
         state = new (zone()) RegExpParserState(
-            state, subexpr_type, lookaround_type, captures_started_, zone());
+            state, subexpr_type, lookaround_type, captures_started_,
+            ignore_case(), unicode(), zone());
         builder = state->builder();
         continue;
       }
       case '[': {
-        RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);
-        builder->AddAtom(atom);
+        RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
+        builder->AddCharacterClass(cc->AsCharacterClass());
         break;
       }
       // Atom ::
@@ -318,8 +348,26 @@ RegExpTree* RegExpParser::ParseDisjunction() {
             ZoneList<CharacterRange>* ranges =
                 new (zone()) ZoneList<CharacterRange>(2, zone());
             CharacterRange::AddClassEscape(c, ranges, zone());
-            RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
-            builder->AddAtom(atom);
+            RegExpCharacterClass* cc =
+                new (zone()) RegExpCharacterClass(ranges, false);
+            builder->AddCharacterClass(cc);
+            break;
+          }
+          case 'p':
+          case 'P': {
+            uc32 p = Next();
+            Advance(2);
+            if (unicode()) {
+              ZoneList<CharacterRange>* ranges = ParsePropertyClass();
+              if (ranges == nullptr) {
+                return ReportError(CStrVector("Invalid property name"));
+              }
+              RegExpCharacterClass* cc =
+                  new (zone()) RegExpCharacterClass(ranges, p == 'P');
+              builder->AddCharacterClass(cc);
+            } else {
+              builder->AddCharacter(p);
+            }
             break;
           }
           case '1':
@@ -332,7 +380,8 @@ RegExpTree* RegExpParser::ParseDisjunction() {
           case '8':
           case '9': {
             int index = 0;
-            if (ParseBackReferenceIndex(&index)) {
+            bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
+            if (is_backref) {
               if (state->IsInsideCaptureGroup(index)) {
                 // The back reference is inside the capture group it refers to.
                 // Nothing can possibly have been captured yet, so we use empty
@@ -347,24 +396,25 @@ RegExpTree* RegExpParser::ParseDisjunction() {
               }
               break;
             }
+            // With /u, no identity escapes except for syntax characters
+            // are allowed. Otherwise, all identity escapes are allowed.
+            if (unicode()) {
+              return ReportError(CStrVector("Invalid escape"));
+            }
             uc32 first_digit = Next();
             if (first_digit == '8' || first_digit == '9') {
-              // If the 'u' flag is present, only syntax characters can be
-              // escaped,
-              // no other identity escapes are allowed. If the 'u' flag is not
-              // present, all identity escapes are allowed.
-              if (!unicode_) {
-                builder->AddCharacter(first_digit);
-                Advance(2);
-              } else {
-                return ReportError(CStrVector("Invalid escape"));
-              }
+              builder->AddCharacter(first_digit);
+              Advance(2);
               break;
             }
           }
           // FALLTHROUGH
           case '0': {
             Advance();
+            if (unicode() && Next() >= '0' && Next() <= '9') {
+              // With /u, decimal escape with leading 0 are not parsed as octal.
+              return ReportError(CStrVector("Invalid decimal escape"));
+            }
             uc32 octal = ParseOctalLiteral();
             builder->AddCharacter(octal);
             break;
@@ -402,6 +452,10 @@ RegExpTree* RegExpParser::ParseDisjunction() {
               // This is outside the specification. We match JSC in
               // reading the backslash as a literal character instead
               // of as starting an escape.
+              if (unicode()) {
+                // With /u, invalid escapes are not treated as identity escapes.
+                return ReportError(CStrVector("Invalid unicode escape"));
+              }
               builder->AddCharacter('\\');
             } else {
               Advance(2);
@@ -414,11 +468,10 @@ RegExpTree* RegExpParser::ParseDisjunction() {
             uc32 value;
             if (ParseHexEscape(2, &value)) {
               builder->AddCharacter(value);
-            } else if (!unicode_) {
+            } else if (!unicode()) {
               builder->AddCharacter('x');
             } else {
-              // If the 'u' flag is present, invalid escapes are not treated as
-              // identity escapes.
+              // With /u, invalid escapes are not treated as identity escapes.
               return ReportError(CStrVector("Invalid escape"));
             }
             break;
@@ -427,24 +480,20 @@ RegExpTree* RegExpParser::ParseDisjunction() {
             Advance(2);
             uc32 value;
             if (ParseUnicodeEscape(&value)) {
-              builder->AddUnicodeCharacter(value);
-            } else if (!unicode_) {
+              builder->AddEscapedUnicodeCharacter(value);
+            } else if (!unicode()) {
               builder->AddCharacter('u');
             } else {
-              // If the 'u' flag is present, invalid escapes are not treated as
-              // identity escapes.
+              // With /u, invalid escapes are not treated as identity escapes.
               return ReportError(CStrVector("Invalid unicode escape"));
             }
             break;
           }
           default:
             Advance();
-            // If the 'u' flag is present, only syntax characters can be
-            // escaped, no
-            // other identity escapes are allowed. If the 'u' flag is not
-            // present,
-            // all identity escapes are allowed.
-            if (!unicode_ || IsSyntaxCharacter(current())) {
+            // With /u, no identity escapes except for syntax characters
+            // are allowed. Otherwise, all identity escapes are allowed.
+            if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
               builder->AddCharacter(current());
               Advance();
             } else {
@@ -456,10 +505,16 @@ RegExpTree* RegExpParser::ParseDisjunction() {
       case '{': {
         int dummy;
         if (ParseIntervalQuantifier(&dummy, &dummy)) {
-          ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
+          return ReportError(CStrVector("Nothing to repeat"));
         }
         // fallthrough
       }
+      case '}':
+      case ']':
+        if (unicode()) {
+          return ReportError(CStrVector("Lone quantifier brackets"));
+        }
+      // fallthrough
       default:
         builder->AddUnicodeCharacter(current());
         Advance();
@@ -492,13 +547,15 @@ RegExpTree* RegExpParser::ParseDisjunction() {
       case '{':
         if (ParseIntervalQuantifier(&min, &max)) {
           if (max < min) {
-            ReportError(CStrVector("numbers out of order in {} quantifier.")
-                            CHECK_FAILED);
+            return ReportError(
+                CStrVector("numbers out of order in {} quantifier"));
           }
           break;
-        } else {
-          continue;
+        } else if (unicode()) {
+          // With /u, incomplete quantifiers are not allowed.
+          return ReportError(CStrVector("Incomplete quantifier"));
         }
+        continue;
       default:
         continue;
     }
@@ -511,7 +568,9 @@ RegExpTree* RegExpParser::ParseDisjunction() {
       quantifier_type = RegExpQuantifier::POSSESSIVE;
       Advance();
     }
-    builder->AddQuantifierToAtom(min, max, quantifier_type);
+    if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
+      return ReportError(CStrVector("Invalid quantifier"));
+    }
   }
 }
 
@@ -740,12 +799,12 @@ bool RegExpParser::ParseHexEscape(int length, uc32* value) {
   return true;
 }
 
-
+// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
   // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
   // allowed). In the latter case, the number of hex digits between { } is
   // arbitrary. \ and u have already been read.
-  if (current() == '{' && unicode_) {
+  if (current() == '{' && unicode()) {
     int start = position();
     Advance();
     if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
@@ -758,9 +817,75 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) {
     return false;
   }
   // \u but no {, or \u{...} escapes not allowed.
-  return ParseHexEscape(4, value);
+  bool result = ParseHexEscape(4, value);
+  if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
+      current() == '\\') {
+    // Attempt to read trail surrogate.
+    int start = position();
+    if (Next() == 'u') {
+      Advance(2);
+      uc32 trail;
+      if (ParseHexEscape(4, &trail) &&
+          unibrow::Utf16::IsTrailSurrogate(trail)) {
+        *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
+                                                      static_cast<uc16>(trail));
+        return true;
+      }
+    }
+    Reset(start);
+  }
+  return result;
 }
 
+ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() {
+#ifdef V8_I18N_SUPPORT
+  char property_name[3];
+  memset(property_name, 0, sizeof(property_name));
+  if (current() == '{') {
+    Advance();
+    if (current() < 'A' || current() > 'Z') return nullptr;
+    property_name[0] = static_cast<char>(current());
+    Advance();
+    if (current() >= 'a' && current() <= 'z') {
+      property_name[1] = static_cast<char>(current());
+      Advance();
+    }
+    if (current() != '}') return nullptr;
+  } else if (current() >= 'A' && current() <= 'Z') {
+    property_name[0] = static_cast<char>(current());
+  } else {
+    return nullptr;
+  }
+  Advance();
+
+  int32_t category =
+      u_getPropertyValueEnum(UCHAR_GENERAL_CATEGORY_MASK, property_name);
+  if (category == UCHAR_INVALID_CODE) return nullptr;
+
+  USet* set = uset_openEmpty();
+  UErrorCode ec = U_ZERO_ERROR;
+  uset_applyIntPropertyValue(set, UCHAR_GENERAL_CATEGORY_MASK, category, &ec);
+  ZoneList<CharacterRange>* ranges = nullptr;
+  if (ec == U_ZERO_ERROR && !uset_isEmpty(set)) {
+    uset_removeAllStrings(set);
+    int item_count = uset_getItemCount(set);
+    ranges = new (zone()) ZoneList<CharacterRange>(item_count, zone());
+    int item_result = 0;
+    for (int i = 0; i < item_count; i++) {
+      uc32 start = 0;
+      uc32 end = 0;
+      item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
+      ranges->Add(CharacterRange::Range(start, end), zone());
+    }
+    DCHECK_EQ(U_ZERO_ERROR, ec);
+    DCHECK_EQ(0, item_result);
+  }
+  uset_close(set);
+  return ranges;
+#else   // V8_I18N_SUPPORT
+  return nullptr;
+#endif  // V8_I18N_SUPPORT
+}
 
 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
   uc32 x = 0;
@@ -809,20 +934,35 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
     case 'c': {
       uc32 controlLetter = Next();
       uc32 letter = controlLetter & ~('A' ^ 'a');
-      // For compatibility with JSC, inside a character class
-      // we also accept digits and underscore as control characters.
-      if ((controlLetter >= '0' && controlLetter <= '9') ||
-          controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {
+      // For compatibility with JSC, inside a character class. We also accept
+      // digits and underscore as control characters, unless with /u.
+      if (letter >= 'A' && letter <= 'Z') {
         Advance(2);
         // Control letters mapped to ASCII control characters in the range
         // 0x00-0x1f.
         return controlLetter & 0x1f;
       }
+      if (unicode()) {
+        // With /u, invalid escapes are not treated as identity escapes.
+        ReportError(CStrVector("Invalid class escape"));
+        return 0;
+      }
+      if ((controlLetter >= '0' && controlLetter <= '9') ||
+          controlLetter == '_') {
+        Advance(2);
+        return controlLetter & 0x1f;
+      }
       // We match JSC in reading the backslash as a literal
       // character instead of as starting an escape.
       return '\\';
     }
     case '0':
+      // With /u, \0 is interpreted as NUL if not followed by another digit.
+      if (unicode() && !(Next() >= '0' && Next() <= '9')) {
+        Advance();
+        return 0;
+      }
+    // Fall through.
     case '1':
     case '2':
     case '3':
@@ -833,43 +973,43 @@ uc32 RegExpParser::ParseClassCharacterEscape() {
       // For compatibility, we interpret a decimal escape that isn't
       // a back reference (and therefore either \0 or not valid according
       // to the specification) as a 1..3 digit octal character code.
+      if (unicode()) {
+        // With /u, decimal escape is not interpreted as octal character code.
+        ReportError(CStrVector("Invalid class escape"));
+        return 0;
+      }
       return ParseOctalLiteral();
     case 'x': {
       Advance();
       uc32 value;
-      if (ParseHexEscape(2, &value)) {
-        return value;
+      if (ParseHexEscape(2, &value)) return value;
+      if (unicode()) {
+        // With /u, invalid escapes are not treated as identity escapes.
+        ReportError(CStrVector("Invalid escape"));
+        return 0;
       }
-      if (!unicode_) {
-        // If \x is not followed by a two-digit hexadecimal, treat it
-        // as an identity escape.
-        return 'x';
-      }
-      // If the 'u' flag is present, invalid escapes are not treated as
-      // identity escapes.
-      ReportError(CStrVector("Invalid escape"));
-      return 0;
+      // If \x is not followed by a two-digit hexadecimal, treat it
+      // as an identity escape.
+      return 'x';
     }
     case 'u': {
       Advance();
       uc32 value;
-      if (ParseUnicodeEscape(&value)) {
-        return value;
-      }
-      if (!unicode_) {
-        return 'u';
+      if (ParseUnicodeEscape(&value)) return value;
+      if (unicode()) {
+        // With /u, invalid escapes are not treated as identity escapes.
+        ReportError(CStrVector("Invalid unicode escape"));
+        return 0;
       }
-      // If the 'u' flag is present, invalid escapes are not treated as
-      // identity escapes.
-      ReportError(CStrVector("Invalid unicode escape"));
-      return 0;
+      // If \u is not followed by a two-digit hexadecimal, treat it
+      // as an identity escape.
+      return 'u';
     }
     default: {
       uc32 result = current();
-      // If the 'u' flag is present, only syntax characters can be escaped, no
-      // other identity escapes are allowed. If the 'u' flag is not present, all
-      // identity escapes are allowed.
-      if (!unicode_ || IsSyntaxCharacter(result)) {
+      // With /u, no identity escapes except for syntax characters and '-' are
+      // allowed. Otherwise, all identity escapes are allowed.
+      if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
         Advance();
         return result;
       }
@@ -899,13 +1039,13 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
       case kEndMarker:
         return ReportError(CStrVector("\\ at end of pattern"));
       default:
-        uc32 c = ParseClassCharacterEscape(CHECK_FAILED);
-        return CharacterRange::Singleton(c);
+        first = ParseClassCharacterEscape(CHECK_FAILED);
     }
   } else {
     Advance();
-    return CharacterRange::Singleton(first);
   }
+
+  return CharacterRange::Singleton(first);
 }
 
 
@@ -927,6 +1067,7 @@ static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
 
 RegExpTree* RegExpParser::ParseCharacterClass() {
   static const char* kUnterminated = "Unterminated character class";
+  static const char* kRangeInvalid = "Invalid character class";
   static const char* kRangeOutOfOrder = "Range out of order in character class";
 
   DCHECK_EQ(current(), '[');
@@ -956,13 +1097,18 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
       CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
       if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
         // Either end is an escaped character class. Treat the '-' verbatim.
+        if (unicode()) {
+          // ES2015 21.2.2.15.1 step 1.
+          return ReportError(CStrVector(kRangeInvalid));
+        }
         AddRangeOrEscape(ranges, char_class, first, zone());
         ranges->Add(CharacterRange::Singleton('-'), zone());
         AddRangeOrEscape(ranges, char_class_2, next, zone());
         continue;
       }
+      // ES2015 21.2.2.15.1 step 6.
       if (first.from() > next.to()) {
-        return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
+        return ReportError(CStrVector(kRangeOutOfOrder));
       }
       ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
     } else {
@@ -970,7 +1116,7 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
     }
   }
   if (!has_more()) {
-    return ReportError(CStrVector(kUnterminated) CHECK_FAILED);
+    return ReportError(CStrVector(kUnterminated));
   }
   Advance();
   if (ranges->length() == 0) {
@@ -985,10 +1131,10 @@ RegExpTree* RegExpParser::ParseCharacterClass() {
 
 
 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
-                               FlatStringReader* input, bool multiline,
-                               bool unicode, RegExpCompileData* result) {
+                               FlatStringReader* input, JSRegExp::Flags flags,
+                               RegExpCompileData* result) {
   DCHECK(result != NULL);
-  RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);
+  RegExpParser parser(input, &result->error, flags, isolate, zone);
   RegExpTree* tree = parser.ParsePattern();
   if (parser.failed()) {
     DCHECK(tree == NULL);
@@ -1010,11 +1156,13 @@ bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
   return !parser.failed();
 }
 
-
-RegExpBuilder::RegExpBuilder(Zone* zone)
+RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
     : zone_(zone),
       pending_empty_(false),
+      ignore_case_(ignore_case),
+      unicode_(unicode),
       characters_(NULL),
+      pending_surrogate_(kNoPendingSurrogate),
       terms_(),
       alternatives_()
 #ifdef DEBUG
@@ -1025,7 +1173,51 @@ RegExpBuilder::RegExpBuilder(Zone* zone)
 }
 
 
+void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
+  DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
+  FlushPendingSurrogate();
+  // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
+  pending_surrogate_ = lead_surrogate;
+}
+
+
+void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
+  DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
+  if (pending_surrogate_ != kNoPendingSurrogate) {
+    uc16 lead_surrogate = pending_surrogate_;
+    pending_surrogate_ = kNoPendingSurrogate;
+    DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
+    uc32 combined =
+        unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
+    if (NeedsDesugaringForIgnoreCase(combined)) {
+      AddCharacterClassForDesugaring(combined);
+    } else {
+      ZoneList<uc16> surrogate_pair(2, zone());
+      surrogate_pair.Add(lead_surrogate, zone());
+      surrogate_pair.Add(trail_surrogate, zone());
+      RegExpAtom* atom =
+          new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
+      AddAtom(atom);
+    }
+  } else {
+    pending_surrogate_ = trail_surrogate;
+    FlushPendingSurrogate();
+  }
+}
+
+
+void RegExpBuilder::FlushPendingSurrogate() {
+  if (pending_surrogate_ != kNoPendingSurrogate) {
+    DCHECK(unicode());
+    uc32 c = pending_surrogate_;
+    pending_surrogate_ = kNoPendingSurrogate;
+    AddCharacterClassForDesugaring(c);
+  }
+}
+
+
 void RegExpBuilder::FlushCharacters() {
+  FlushPendingSurrogate();
   pending_empty_ = false;
   if (characters_ != NULL) {
     RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
@@ -1053,31 +1245,61 @@ void RegExpBuilder::FlushText() {
 
 
 void RegExpBuilder::AddCharacter(uc16 c) {
+  FlushPendingSurrogate();
   pending_empty_ = false;
-  if (characters_ == NULL) {
-    characters_ = new (zone()) ZoneList<uc16>(4, zone());
+  if (NeedsDesugaringForIgnoreCase(c)) {
+    AddCharacterClassForDesugaring(c);
+  } else {
+    if (characters_ == NULL) {
+      characters_ = new (zone()) ZoneList<uc16>(4, zone());
+    }
+    characters_->Add(c, zone());
+    LAST(ADD_CHAR);
   }
-  characters_->Add(c, zone());
-  LAST(ADD_CHAR);
 }
 
 
 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
   if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
-    ZoneList<uc16> surrogate_pair(2, zone());
-    surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone());
-    surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone());
-    RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
-    AddAtom(atom);
+    DCHECK(unicode());
+    AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
+    AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
+  } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
+    AddLeadSurrogate(c);
+  } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
+    AddTrailSurrogate(c);
   } else {
     AddCharacter(static_cast<uc16>(c));
   }
 }
 
+void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
+  // A lead or trail surrogate parsed via escape sequence will not
+  // pair up with any preceding lead or following trail surrogate.
+  FlushPendingSurrogate();
+  AddUnicodeCharacter(character);
+  FlushPendingSurrogate();
+}
 
 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
 
 
+void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
+  if (NeedsDesugaringForUnicode(cc)) {
+    // With /u, character class needs to be desugared, so it
+    // must be a standalone term instead of being part of a RegExpText.
+    AddTerm(cc);
+  } else {
+    AddAtom(cc);
+  }
+}
+
+void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
+  AddTerm(new (zone()) RegExpCharacterClass(
+      CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
+}
+
+
 void RegExpBuilder::AddAtom(RegExpTree* term) {
   if (term->IsEmpty()) {
     AddEmpty();
@@ -1094,6 +1316,13 @@ void RegExpBuilder::AddAtom(RegExpTree* term) {
 }
 
 
+void RegExpBuilder::AddTerm(RegExpTree* term) {
+  FlushText();
+  terms_.Add(term, zone());
+  LAST(ADD_ATOM);
+}
+
+
 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
   FlushText();
   terms_.Add(assert, zone());
@@ -1121,6 +1350,47 @@ void RegExpBuilder::FlushTerms() {
 }
 
 
+bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
+  if (!unicode()) return false;
+  switch (cc->standard_type()) {
+    case 's':        // white space
+    case 'w':        // ASCII word character
+    case 'd':        // ASCII digit
+      return false;  // These characters do not need desugaring.
+    default:
+      break;
+  }
+  ZoneList<CharacterRange>* ranges = cc->ranges(zone());
+  CharacterRange::Canonicalize(ranges);
+  for (int i = ranges->length() - 1; i >= 0; i--) {
+    uc32 from = ranges->at(i).from();
+    uc32 to = ranges->at(i).to();
+    // Check for non-BMP characters.
+    if (to >= kNonBmpStart) return true;
+    // Check for lone surrogates.
+    if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
+  }
+  return false;
+}
+
+
+bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
+#ifdef V8_I18N_SUPPORT
+  if (unicode() && ignore_case()) {
+    USet* set = uset_open(c, c);
+    uset_closeOver(set, USET_CASE_INSENSITIVE);
+    uset_removeAllStrings(set);
+    bool result = uset_size(set) > 1;
+    uset_close(set);
+    return result;
+  }
+  // In the case where ICU is not included, we act as if the unicode flag is
+  // not set, and do not desugar.
+#endif  // V8_I18N_SUPPORT
+  return false;
+}
+
+
 RegExpTree* RegExpBuilder::ToRegExp() {
   FlushTerms();
   int num_alternatives = alternatives_.length();
@@ -1129,12 +1399,12 @@ RegExpTree* RegExpBuilder::ToRegExp() {
   return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
 }
 
-
-void RegExpBuilder::AddQuantifierToAtom(
+bool RegExpBuilder::AddQuantifierToAtom(
     int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
+  FlushPendingSurrogate();
   if (pending_empty_) {
     pending_empty_ = false;
-    return;
+    return true;
   }
   RegExpTree* atom;
   if (characters_ != NULL) {
@@ -1157,23 +1427,26 @@ void RegExpBuilder::AddQuantifierToAtom(
   } else if (terms_.length() > 0) {
     DCHECK(last_added_ == ADD_ATOM);
     atom = terms_.RemoveLast();
+    // With /u, lookarounds are not quantifiable.
+    if (unicode() && atom->IsLookaround()) return false;
     if (atom->max_match() == 0) {
       // Guaranteed to only match an empty string.
       LAST(ADD_TERM);
       if (min == 0) {
-        return;
+        return true;
       }
       terms_.Add(atom, zone());
-      return;
+      return true;
     }
   } else {
     // Only call immediately after adding an atom or character!
     UNREACHABLE();
-    return;
+    return false;
   }
   terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
              zone());
   LAST(ADD_TERM);
+  return true;
 }
 
 }  // namespace internal