summaryrefslogtreecommitdiff
path: root/deps/v8/src/regexp/regexp-parser.cc
diff options
context:
space:
mode:
Diffstat (limited to 'deps/v8/src/regexp/regexp-parser.cc')
-rw-r--r--deps/v8/src/regexp/regexp-parser.cc149
1 files changed, 121 insertions, 28 deletions
diff --git a/deps/v8/src/regexp/regexp-parser.cc b/deps/v8/src/regexp/regexp-parser.cc
index c1d2c7d5cd..797424baf8 100644
--- a/deps/v8/src/regexp/regexp-parser.cc
+++ b/deps/v8/src/regexp/regexp-parser.cc
@@ -12,6 +12,7 @@
#include "src/objects-inl.h"
#include "src/ostreams.h"
#include "src/regexp/jsregexp.h"
+#include "src/regexp/property-sequences.h"
#include "src/utils.h"
#ifdef V8_INTL_SUPPORT
@@ -344,13 +345,23 @@ RegExpTree* RegExpParser::ParseDisjunction() {
if (unicode()) {
ZoneList<CharacterRange>* ranges =
new (zone()) ZoneList<CharacterRange>(2, zone());
- if (!ParsePropertyClass(ranges, p == 'P')) {
- return ReportError(CStrVector("Invalid property name"));
+ std::vector<char> name_1, name_2;
+ if (ParsePropertyClassName(&name_1, &name_2)) {
+ if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
+ RegExpCharacterClass* cc = new (zone())
+ RegExpCharacterClass(zone(), ranges, builder->flags());
+ builder->AddCharacterClass(cc);
+ break;
+ }
+ if (p == 'p' && name_2.empty()) {
+ RegExpTree* sequence = GetPropertySequence(name_1);
+ if (sequence != nullptr) {
+ builder->AddAtom(sequence);
+ break;
+ }
+ }
}
- RegExpCharacterClass* cc = new (zone())
- RegExpCharacterClass(zone(), ranges, builder->flags());
- builder->AddCharacterClass(cc);
-
+ return ReportError(CStrVector("Invalid property name"));
} else {
builder->AddCharacter(p);
}
@@ -1346,8 +1357,10 @@ bool IsUnicodePropertyValueCharacter(char c) {
} // anonymous namespace
-bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
- bool negate) {
+bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
+ std::vector<char>* name_2) {
+ DCHECK(name_1->empty());
+ DCHECK(name_2->empty());
// Parse the property class as follows:
// - In \p{name}, 'name' is interpreted
// - either as a general category property value name.
@@ -1356,55 +1369,58 @@ bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
// and 'value' is interpreted as one of the available property value names.
// - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
// - Loose matching is not applied.
- std::vector<char> first_part;
- std::vector<char> second_part;
if (current() == '{') {
// Parse \p{[PropertyName=]PropertyNameValue}
for (Advance(); current() != '}' && current() != '='; Advance()) {
if (!IsUnicodePropertyValueCharacter(current())) return false;
if (!has_next()) return false;
- first_part.push_back(static_cast<char>(current()));
+ name_1->push_back(static_cast<char>(current()));
}
if (current() == '=') {
for (Advance(); current() != '}'; Advance()) {
if (!IsUnicodePropertyValueCharacter(current())) return false;
if (!has_next()) return false;
- second_part.push_back(static_cast<char>(current()));
+ name_2->push_back(static_cast<char>(current()));
}
- second_part.push_back(0); // null-terminate string.
+ name_2->push_back(0); // null-terminate string.
}
} else {
return false;
}
Advance();
- first_part.push_back(0); // null-terminate string.
+ name_1->push_back(0); // null-terminate string.
- DCHECK(first_part.size() - 1 == std::strlen(first_part.data()));
- DCHECK(second_part.empty() ||
- second_part.size() - 1 == std::strlen(second_part.data()));
+ DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
+ DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
+ return true;
+}
- if (second_part.empty()) {
+bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
+ bool negate,
+ const std::vector<char>& name_1,
+ const std::vector<char>& name_2) {
+ if (name_2.empty()) {
// First attempt to interpret as general category property value name.
- const char* name = first_part.data();
+ const char* name = name_1.data();
if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
- result, zone())) {
+ add_to, zone())) {
return true;
}
// Interpret "Any", "ASCII", and "Assigned".
- if (LookupSpecialPropertyValueName(name, result, negate, zone())) {
+ if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) {
return true;
}
// Then attempt to interpret as binary property name with value name 'Y'.
UProperty property = u_getPropertyEnum(name);
if (!IsSupportedBinaryProperty(property)) return false;
if (!IsExactPropertyAlias(name, property)) return false;
- return LookupPropertyValueName(property, negate ? "N" : "Y", false, result,
+ return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to,
zone());
} else {
// Both property name and value name are specified. Attempt to interpret
// the property name as enumerated property.
- const char* property_name = first_part.data();
- const char* value_name = second_part.data();
+ const char* property_name = name_1.data();
+ const char* value_name = name_2.data();
UProperty property = u_getPropertyEnum(property_name);
if (!IsExactPropertyAlias(property_name, property)) return false;
if (property == UCHAR_GENERAL_CATEGORY) {
@@ -1414,18 +1430,93 @@ bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
property != UCHAR_SCRIPT_EXTENSIONS) {
return false;
}
- return LookupPropertyValueName(property, value_name, negate, result,
+ return LookupPropertyValueName(property, value_name, negate, add_to,
zone());
}
}
+RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name_1) {
+ if (!FLAG_harmony_regexp_sequence) return nullptr;
+ const char* name = name_1.data();
+ const uc32* sequence_list = nullptr;
+ JSRegExp::Flags flags = JSRegExp::kUnicode;
+ if (NameEquals(name, "Emoji_Flag_Sequence")) {
+ sequence_list = UnicodePropertySequences::kEmojiFlagSequences;
+ } else if (NameEquals(name, "Emoji_Tag_Sequence")) {
+ sequence_list = UnicodePropertySequences::kEmojiTagSequences;
+ } else if (NameEquals(name, "Emoji_ZWJ_Sequence")) {
+ sequence_list = UnicodePropertySequences::kEmojiZWJSequences;
+ }
+ if (sequence_list != nullptr) {
+ // TODO(yangguo): this creates huge regexp code. Alternative to this is
+ // to create a new operator that checks for these sequences at runtime.
+ RegExpBuilder builder(zone(), flags);
+ while (true) { // Iterate through list of sequences.
+ while (*sequence_list != 0) { // Iterate through sequence.
+ builder.AddUnicodeCharacter(*sequence_list);
+ sequence_list++;
+ }
+ sequence_list++;
+ if (*sequence_list == 0) break;
+ builder.NewAlternative();
+ }
+ return builder.ToRegExp();
+ }
+
+ if (NameEquals(name, "Emoji_Keycap_Sequence")) {
+ // https://unicode.org/reports/tr51/#def_emoji_keycap_sequence
+ // emoji_keycap_sequence := [0-9#*] \x{FE0F 20E3}
+ RegExpBuilder builder(zone(), flags);
+ ZoneList<CharacterRange>* prefix_ranges =
+ new (zone()) ZoneList<CharacterRange>(2, zone());
+ prefix_ranges->Add(CharacterRange::Range('0', '9'), zone());
+ prefix_ranges->Add(CharacterRange::Singleton('#'), zone());
+ prefix_ranges->Add(CharacterRange::Singleton('*'), zone());
+ builder.AddCharacterClass(
+ new (zone()) RegExpCharacterClass(zone(), prefix_ranges, flags));
+ builder.AddCharacter(0xFE0F);
+ builder.AddCharacter(0x20E3);
+ return builder.ToRegExp();
+ } else if (NameEquals(name, "Emoji_Modifier_Sequence")) {
+ // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence
+ // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
+ RegExpBuilder builder(zone(), flags);
+ ZoneList<CharacterRange>* modifier_base_ranges =
+ new (zone()) ZoneList<CharacterRange>(2, zone());
+ LookupPropertyValueName(UCHAR_EMOJI_MODIFIER_BASE, "Y", false,
+ modifier_base_ranges, zone());
+ builder.AddCharacterClass(
+ new (zone()) RegExpCharacterClass(zone(), modifier_base_ranges, flags));
+ ZoneList<CharacterRange>* modifier_ranges =
+ new (zone()) ZoneList<CharacterRange>(2, zone());
+ LookupPropertyValueName(UCHAR_EMOJI_MODIFIER, "Y", false, modifier_ranges,
+ zone());
+ builder.AddCharacterClass(
+ new (zone()) RegExpCharacterClass(zone(), modifier_ranges, flags));
+ return builder.ToRegExp();
+ }
+
+ return nullptr;
+}
+
#else // V8_INTL_SUPPORT
-bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
- bool negate) {
+bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
+ std::vector<char>* name_2) {
+ return false;
+}
+
+bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
+ bool negate,
+ const std::vector<char>& name_1,
+ const std::vector<char>& name_2) {
return false;
}
+RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name) {
+ return nullptr;
+}
+
#endif // V8_INTL_SUPPORT
bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
@@ -1591,7 +1682,9 @@ void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
if (unicode()) {
bool negate = Next() == 'P';
Advance(2);
- if (!ParsePropertyClass(ranges, negate)) {
+ std::vector<char> name_1, name_2;
+ if (!ParsePropertyClassName(&name_1, &name_2) ||
+ !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
ReportError(CStrVector("Invalid property name in character class"));
}
*is_class_escape = true;