diff options
author | Bert Belder <bertbelder@gmail.com> | 2012-06-13 15:34:45 +0200 |
---|---|---|
committer | Bert Belder <bertbelder@gmail.com> | 2012-06-14 01:37:13 +0200 |
commit | 50464cd4f49e40f4fe792ff46a81052319a222e9 (patch) | |
tree | 1fe524b2e6c0eb3c459142cd27539f88e1a3f63c /deps/v8/src/jsregexp.h | |
parent | 09be360a0fee2c7619bae8c4248f9ed3d79d1b30 (diff) | |
download | node-new-50464cd4f49e40f4fe792ff46a81052319a222e9.tar.gz |
v8: upgrade to v3.11.10
Diffstat (limited to 'deps/v8/src/jsregexp.h')
-rw-r--r-- | deps/v8/src/jsregexp.h | 590 |
1 files changed, 377 insertions, 213 deletions
diff --git a/deps/v8/src/jsregexp.h b/deps/v8/src/jsregexp.h index 8875de9eb2..782c5b0b20 100644 --- a/deps/v8/src/jsregexp.h +++ b/deps/v8/src/jsregexp.h @@ -40,6 +40,7 @@ class RegExpCompiler; class RegExpMacroAssembler; class RegExpNode; class RegExpTree; +class BoyerMooreLookahead; class RegExpImpl { public: @@ -77,7 +78,8 @@ class RegExpImpl { static Handle<Object> Exec(Handle<JSRegExp> regexp, Handle<String> subject, int index, - Handle<JSArray> lastMatchInfo); + Handle<JSArray> lastMatchInfo, + Zone* zone); // Prepares a JSRegExp object with Irregexp-specific data. static void IrregexpInitialize(Handle<JSRegExp> re, @@ -106,18 +108,26 @@ class RegExpImpl { // as its "registers" argument. If the regexp cannot be compiled, // an exception is set as pending, and this function returns negative. static int IrregexpPrepare(Handle<JSRegExp> regexp, - Handle<String> subject); - - // Execute a regular expression once on the subject, starting from - // character "index". - // If successful, returns RE_SUCCESS and set the capture positions - // in the first registers. + Handle<String> subject, + Zone* zone); + + // Calculate the size of offsets vector for the case of global regexp + // and the number of matches this vector is able to store. + static int GlobalOffsetsVectorSize(Handle<JSRegExp> regexp, + int registers_per_match, + int* max_matches); + + // Execute a regular expression on the subject, starting from index. + // If matching succeeds, return the number of matches. This can be larger + // than one in the case of global regular expressions. + // The captures and subcaptures are stored into the registers vector. // If matching fails, returns RE_FAILURE. // If execution fails, sets a pending exception and returns RE_EXCEPTION. - static IrregexpResult IrregexpExecOnce(Handle<JSRegExp> regexp, - Handle<String> subject, - int index, - Vector<int> registers); + static int IrregexpExecRaw(Handle<JSRegExp> regexp, + Handle<String> subject, + int index, + Vector<int> registers, + Zone* zone); // Execute an Irregexp bytecode pattern. // On a successful match, the result is a JSArray containing @@ -126,7 +136,8 @@ class RegExpImpl { static Handle<Object> IrregexpExec(Handle<JSRegExp> regexp, Handle<String> subject, int index, - Handle<JSArray> lastMatchInfo); + Handle<JSArray> lastMatchInfo, + Zone* zone); // Array index in the lastMatchInfo array. static const int kLastCaptureCount = 0; @@ -190,8 +201,12 @@ class RegExpImpl { static String* last_ascii_string_; static String* two_byte_cached_string_; - static bool CompileIrregexp(Handle<JSRegExp> re, bool is_ascii); - static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii); + static bool CompileIrregexp( + Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii, + Zone* zone); + static inline bool EnsureCompiledIrregexp( + Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii, + Zone* zone); // Set the subject cache. The previous string buffer is not deleted, so the @@ -222,56 +237,17 @@ enum ElementInSetsRelation { }; -// Represents the relation of two sets. -// Sets can be either disjoint, partially or fully overlapping, or equal. -class SetRelation BASE_EMBEDDED { - public: - // Relation is represented by a bit saying whether there are elements in - // one set that is not in the other, and a bit saying that there are elements - // that are in both sets. - - // Location of an element. Corresponds to the internal areas of - // a Venn diagram. - enum { - kInFirst = 1 << kInsideFirst, - kInSecond = 1 << kInsideSecond, - kInBoth = 1 << kInsideBoth - }; - SetRelation() : bits_(0) {} - ~SetRelation() {} - // Add the existence of objects in a particular - void SetElementsInFirstSet() { bits_ |= kInFirst; } - void SetElementsInSecondSet() { bits_ |= kInSecond; } - void SetElementsInBothSets() { bits_ |= kInBoth; } - // Check the currently known relation of the sets (common functions only, - // for other combinations, use value() to get the bits and check them - // manually). - // Sets are completely disjoint. - bool Disjoint() { return (bits_ & kInBoth) == 0; } - // Sets are equal. - bool Equals() { return (bits_ & (kInFirst | kInSecond)) == 0; } - // First set contains second. - bool Contains() { return (bits_ & kInSecond) == 0; } - // Second set contains first. - bool ContainedIn() { return (bits_ & kInFirst) == 0; } - bool NonTrivialIntersection() { - return (bits_ == (kInFirst | kInSecond | kInBoth)); - } - int value() { return bits_; } - - private: - int bits_; -}; - - +// Represents code units in the range from from_ to to_, both ends are +// inclusive. class CharacterRange { public: CharacterRange() : from_(0), to_(0) { } // For compatibility with the CHECK_OK macro CharacterRange(void* null) { ASSERT_EQ(NULL, null); } //NOLINT CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) { } - static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges); - static Vector<const uc16> GetWordBounds(); + static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges, + Zone* zone); + static Vector<const int> GetWordBounds(); static inline CharacterRange Singleton(uc16 value) { return CharacterRange(value, value); } @@ -290,11 +266,13 @@ class CharacterRange { bool is_valid() { return from_ <= to_; } bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; } bool IsSingleton() { return (from_ == to_); } - void AddCaseEquivalents(ZoneList<CharacterRange>* ranges, bool is_ascii); + void AddCaseEquivalents(ZoneList<CharacterRange>* ranges, bool is_ascii, + Zone* zone); static void Split(ZoneList<CharacterRange>* base, - Vector<const uc16> overlay, + Vector<const int> overlay, ZoneList<CharacterRange>** included, - ZoneList<CharacterRange>** excluded); + ZoneList<CharacterRange>** excluded, + Zone* zone); // Whether a range list is in canonical form: Ranges ordered by from value, // and ranges non-overlapping and non-adjacent. static bool IsCanonical(ZoneList<CharacterRange>* ranges); @@ -303,31 +281,10 @@ class CharacterRange { // adjacent ranges are merged. The resulting list may be shorter than the // original, but cannot be longer. static void Canonicalize(ZoneList<CharacterRange>* ranges); - // Check how the set of characters defined by a CharacterRange list relates - // to the set of word characters. List must be in canonical form. - static SetRelation WordCharacterRelation(ZoneList<CharacterRange>* ranges); - // Takes two character range lists (representing character sets) in canonical - // form and merges them. - // The characters that are only covered by the first set are added to - // first_set_only_out. the characters that are only in the second set are - // added to second_set_only_out, and the characters that are in both are - // added to both_sets_out. - // The pointers to first_set_only_out, second_set_only_out and both_sets_out - // should be to empty lists, but they need not be distinct, and may be NULL. - // If NULL, the characters are dropped, and if two arguments are the same - // pointer, the result is the union of the two sets that would be created - // if the pointers had been distinct. - // This way, the Merge function can compute all the usual set operations: - // union (all three out-sets are equal), intersection (only both_sets_out is - // non-NULL), and set difference (only first_set is non-NULL). - static void Merge(ZoneList<CharacterRange>* first_set, - ZoneList<CharacterRange>* second_set, - ZoneList<CharacterRange>* first_set_only_out, - ZoneList<CharacterRange>* second_set_only_out, - ZoneList<CharacterRange>* both_sets_out); // Negate the contents of a character range in canonical form. static void Negate(ZoneList<CharacterRange>* src, - ZoneList<CharacterRange>* dst); + ZoneList<CharacterRange>* dst, + Zone* zone); static const int kStartMarker = (1 << 24); static const int kPayloadMask = (1 << 24) - 1; @@ -342,7 +299,7 @@ class CharacterRange { class OutSet: public ZoneObject { public: OutSet() : first_(0), remaining_(NULL), successors_(NULL) { } - OutSet* Extend(unsigned value); + OutSet* Extend(unsigned value, Zone* zone); bool Get(unsigned value); static const unsigned kFirstLimit = 32; @@ -350,12 +307,12 @@ class OutSet: public ZoneObject { // Destructively set a value in this set. In most cases you want // to use Extend instead to ensure that only one instance exists // that contains the same values. - void Set(unsigned value); + void Set(unsigned value, Zone* zone); // The successors are a list of sets that contain the same values // as this set and the one more value that is not present in this // set. - ZoneList<OutSet*>* successors() { return successors_; } + ZoneList<OutSet*>* successors(Zone* zone) { return successors_; } OutSet(uint32_t first, ZoneList<unsigned>* remaining) : first_(first), remaining_(remaining), successors_(NULL) { } @@ -370,6 +327,8 @@ class OutSet: public ZoneObject { // Used for mapping character ranges to choices. class DispatchTable : public ZoneObject { public: + explicit DispatchTable(Zone* zone) : tree_(zone) { } + class Entry { public: Entry() : from_(0), to_(0), out_set_(NULL) { } @@ -378,7 +337,9 @@ class DispatchTable : public ZoneObject { uc16 from() { return from_; } uc16 to() { return to_; } void set_to(uc16 value) { to_ = value; } - void AddValue(int value) { out_set_ = out_set_->Extend(value); } + void AddValue(int value, Zone* zone) { + out_set_ = out_set_->Extend(value, zone); + } OutSet* out_set() { return out_set_; } private: uc16 from_; @@ -402,12 +363,14 @@ class DispatchTable : public ZoneObject { } }; - void AddRange(CharacterRange range, int value); + void AddRange(CharacterRange range, int value, Zone* zone); OutSet* Get(uc16 value); void Dump(); template <typename Callback> - void ForEach(Callback* callback) { return tree()->ForEach(callback); } + void ForEach(Callback* callback) { + return tree()->ForEach(callback); + } private: // There can't be a static empty set since it allocates its @@ -475,7 +438,8 @@ struct NodeInfo { follows_newline_interest(false), follows_start_interest(false), at_end(false), - visited(false) { } + visited(false), + replacement_calculated(false) { } // Returns true if the interests and assumptions of this node // matches the given one. @@ -525,25 +489,7 @@ struct NodeInfo { bool at_end: 1; bool visited: 1; -}; - - -class SiblingList { - public: - SiblingList() : list_(NULL) { } - int length() { - return list_ == NULL ? 0 : list_->length(); - } - void Ensure(RegExpNode* parent) { - if (list_ == NULL) { - list_ = new ZoneList<RegExpNode*>(2); - list_->Add(parent); - } - } - void Add(RegExpNode* node) { list_->Add(node); } - RegExpNode* Get(int index) { return list_->at(index); } - private: - ZoneList<RegExpNode*>* list_; + bool replacement_calculated: 1; }; @@ -599,9 +545,15 @@ class QuickCheckDetails { }; +extern int kUninitializedRegExpNodePlaceHolder; + + class RegExpNode: public ZoneObject { public: - RegExpNode() : first_character_set_(NULL), trace_count_(0) { } + explicit RegExpNode(Zone* zone) + : replacement_(NULL), trace_count_(0), zone_(zone) { + bm_info_[0] = bm_info_[1] = NULL; + } virtual ~RegExpNode(); virtual void Accept(NodeVisitor* visitor) = 0; // Generates a goto to this node or actually generates the code at this point. @@ -635,6 +587,50 @@ class RegExpNode: public ZoneObject { bool not_at_start) = 0; static const int kNodeIsTooComplexForGreedyLoops = -1; virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } + // Only returns the successor for a text node of length 1 that matches any + // character and that has no guards on it. + virtual RegExpNode* GetSuccessorOfOmnivorousTextNode( + RegExpCompiler* compiler) { + return NULL; + } + + // Collects information on the possible code units (mod 128) that can match if + // we look forward. This is used for a Boyer-Moore-like string searching + // implementation. TODO(erikcorry): This should share more code with + // EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit + // the number of nodes we are willing to look at in order to create this data. + static const int kFillInBMBudget = 200; + virtual void FillInBMInfo(int offset, + int recursion_depth, + int budget, + BoyerMooreLookahead* bm, + bool not_at_start) { + UNREACHABLE(); + } + + // If we know that the input is ASCII then there are some nodes that can + // never match. This method returns a node that can be substituted for + // itself, or NULL if the node can never match. + virtual RegExpNode* FilterASCII(int depth) { return this; } + // Helper for FilterASCII. + RegExpNode* replacement() { + ASSERT(info()->replacement_calculated); + return replacement_; + } + RegExpNode* set_replacement(RegExpNode* replacement) { + info()->replacement_calculated = true; + replacement_ = replacement; + return replacement; // For convenience. + } + + // We want to avoid recalculating the lookahead info, so we store it on the + // node. Only info that is for this node is stored. We can tell that the + // info is for this node when offset == 0, so the information is calculated + // relative to this node. + void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) { + if (offset == 0) set_bm_info(not_at_start, bm); + } + Label* label() { return &label_; } // If non-generic code is generated for a node (i.e. the node is not at the // start of the trace) then it cannot be reused. This variable sets a limit @@ -645,72 +641,35 @@ class RegExpNode: public ZoneObject { NodeInfo* info() { return &info_; } - void AddSibling(RegExpNode* node) { siblings_.Add(node); } - - // Static version of EnsureSibling that expresses the fact that the - // result has the same type as the input. - template <class C> - static C* EnsureSibling(C* node, NodeInfo* info, bool* cloned) { - return static_cast<C*>(node->EnsureSibling(info, cloned)); + BoyerMooreLookahead* bm_info(bool not_at_start) { + return bm_info_[not_at_start ? 1 : 0]; } - SiblingList* siblings() { return &siblings_; } - void set_siblings(SiblingList* other) { siblings_ = *other; } - - // Return the set of possible next characters recognized by the regexp - // (or a safe subset, potentially the set of all characters). - ZoneList<CharacterRange>* FirstCharacterSet(); - - // Compute (if possible within the budget of traversed nodes) the - // possible first characters of the input matched by this node and - // its continuation. Returns the remaining budget after the computation. - // If the budget is spent, the result is negative, and the cached - // first_character_set_ value isn't set. - virtual int ComputeFirstCharacterSet(int budget); - - // Get and set the cached first character set value. - ZoneList<CharacterRange>* first_character_set() { - return first_character_set_; - } - void set_first_character_set(ZoneList<CharacterRange>* character_set) { - first_character_set_ = character_set; - } + Zone* zone() const { return zone_; } protected: enum LimitResult { DONE, CONTINUE }; - static const int kComputeFirstCharacterSetFail = -1; + RegExpNode* replacement_; LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace); - // Returns a sibling of this node whose interests and assumptions - // match the ones in the given node info. If no sibling exists NULL - // is returned. - RegExpNode* TryGetSibling(NodeInfo* info); - - // Returns a sibling of this node whose interests match the ones in - // the given node info. The info must not contain any assertions. - // If no node exists a new one will be created by cloning the current - // node. The result will always be an instance of the same concrete - // class as this node. - RegExpNode* EnsureSibling(NodeInfo* info, bool* cloned); - - // Returns a clone of this node initialized using the copy constructor - // of its concrete class. Note that the node may have to be pre- - // processed before it is on a usable state. - virtual RegExpNode* Clone() = 0; + void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) { + bm_info_[not_at_start ? 1 : 0] = bm; + } private: static const int kFirstCharBudget = 10; Label label_; NodeInfo info_; - SiblingList siblings_; - ZoneList<CharacterRange>* first_character_set_; // This variable keeps track of how many times code has been generated for // this node (in different traces). We don't keep track of where the // generated code is located unless the code is generated at the start of // a trace, in which case it is generic and can be reused by flushing the // deferred operations in the current trace and generating a goto. int trace_count_; + BoyerMooreLookahead* bm_info_[2]; + + Zone* zone_; }; @@ -731,8 +690,8 @@ class Interval { return (from_ <= value) && (value <= to_); } bool is_empty() { return from_ == kNone; } - int from() { return from_; } - int to() { return to_; } + int from() const { return from_; } + int to() const { return to_; } static Interval Empty() { return Interval(); } static const int kNone = -1; private: @@ -744,9 +703,23 @@ class Interval { class SeqRegExpNode: public RegExpNode { public: explicit SeqRegExpNode(RegExpNode* on_success) - : on_success_(on_success) { } + : RegExpNode(on_success->zone()), on_success_(on_success) { } RegExpNode* on_success() { return on_success_; } void set_on_success(RegExpNode* node) { on_success_ = node; } + virtual RegExpNode* FilterASCII(int depth); + virtual void FillInBMInfo(int offset, + int recursion_depth, + int budget, + BoyerMooreLookahead* bm, + bool not_at_start) { + on_success_->FillInBMInfo( + offset, recursion_depth + 1, budget - 1, bm, not_at_start); + if (offset == 0) set_bm_info(not_at_start, bm); + } + + protected: + RegExpNode* FilterSuccessor(int depth); + private: RegExpNode* on_success_; }; @@ -793,11 +766,14 @@ class ActionNode: public SeqRegExpNode { return on_success()->GetQuickCheckDetails( details, compiler, filled_in, not_at_start); } + virtual void FillInBMInfo(int offset, + int recursion_depth, + int budget, + BoyerMooreLookahead* bm, + bool not_at_start); Type type() { return type_; } // TODO(erikcorry): We should allow some action nodes in greedy loops. virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; } - virtual ActionNode* Clone() { return new ActionNode(*this); } - virtual int ComputeFirstCharacterSet(int budget); private: union { @@ -845,8 +821,8 @@ class TextNode: public SeqRegExpNode { TextNode(RegExpCharacterClass* that, RegExpNode* on_success) : SeqRegExpNode(on_success), - elms_(new ZoneList<TextElement>(1)) { - elms_->Add(TextElement::CharClass(that)); + elms_(new(zone()) ZoneList<TextElement>(1, zone())) { + elms_->Add(TextElement::CharClass(that), zone()); } virtual void Accept(NodeVisitor* visitor); virtual void Emit(RegExpCompiler* compiler, Trace* trace); @@ -860,13 +836,15 @@ class TextNode: public SeqRegExpNode { ZoneList<TextElement>* elements() { return elms_; } void MakeCaseIndependent(bool is_ascii); virtual int GreedyLoopTextLength(); - virtual TextNode* Clone() { - TextNode* result = new TextNode(*this); - result->CalculateOffsets(); - return result; - } + virtual RegExpNode* GetSuccessorOfOmnivorousTextNode( + RegExpCompiler* compiler); + virtual void FillInBMInfo(int offset, + int recursion_depth, + int budget, + BoyerMooreLookahead* bm, + bool not_at_start); void CalculateOffsets(); - virtual int ComputeFirstCharacterSet(int budget); + virtual RegExpNode* FilterASCII(int depth); private: enum TextEmitPassType { @@ -897,27 +875,22 @@ class AssertionNode: public SeqRegExpNode { AT_START, AT_BOUNDARY, AT_NON_BOUNDARY, - AFTER_NEWLINE, - // Types not directly expressible in regexp syntax. - // Used for modifying a boundary node if its following character is - // known to be word and/or non-word. - AFTER_NONWORD_CHARACTER, - AFTER_WORD_CHARACTER + AFTER_NEWLINE }; static AssertionNode* AtEnd(RegExpNode* on_success) { - return new AssertionNode(AT_END, on_success); + return new(on_success->zone()) AssertionNode(AT_END, on_success); } static AssertionNode* AtStart(RegExpNode* on_success) { - return new AssertionNode(AT_START, on_success); + return new(on_success->zone()) AssertionNode(AT_START, on_success); } static AssertionNode* AtBoundary(RegExpNode* on_success) { - return new AssertionNode(AT_BOUNDARY, on_success); + return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success); } static AssertionNode* AtNonBoundary(RegExpNode* on_success) { - return new AssertionNode(AT_NON_BOUNDARY, on_success); + return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success); } static AssertionNode* AfterNewline(RegExpNode* on_success) { - return new AssertionNode(AFTER_NEWLINE, on_success); + return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success); } virtual void Accept(NodeVisitor* visitor); virtual void Emit(RegExpCompiler* compiler, Trace* trace); @@ -928,12 +901,20 @@ class AssertionNode: public SeqRegExpNode { RegExpCompiler* compiler, int filled_in, bool not_at_start); - virtual int ComputeFirstCharacterSet(int budget); - virtual AssertionNode* Clone() { return new AssertionNode(*this); } + virtual void FillInBMInfo(int offset, + int recursion_depth, + int budget, + BoyerMooreLookahead* bm, + bool not_at_start); AssertionNodeType type() { return type_; } void set_type(AssertionNodeType type) { type_ = type; } private: + void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace); + enum IfPrevious { kIsNonWord, kIsWord }; + void BacktrackIfPrevious(RegExpCompiler* compiler, + Trace* trace, + IfPrevious backtrack_if_previous); AssertionNode(AssertionNodeType t, RegExpNode* on_success) : SeqRegExpNode(on_success), type_(t) { } AssertionNodeType type_; @@ -961,8 +942,11 @@ class BackReferenceNode: public SeqRegExpNode { bool not_at_start) { return; } - virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); } - virtual int ComputeFirstCharacterSet(int budget); + virtual void FillInBMInfo(int offset, + int recursion_depth, + int budget, + BoyerMooreLookahead* bm, + bool not_at_start); private: int start_reg_; @@ -973,7 +957,8 @@ class BackReferenceNode: public SeqRegExpNode { class EndNode: public RegExpNode { public: enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS }; - explicit EndNode(Action action) : action_(action) { } + explicit EndNode(Action action, Zone* zone) + : RegExpNode(zone), action_(action) { } virtual void Accept(NodeVisitor* visitor); virtual void Emit(RegExpCompiler* compiler, Trace* trace); virtual int EatsAtLeast(int still_to_find, @@ -986,7 +971,15 @@ class EndNode: public RegExpNode { // Returning 0 from EatsAtLeast should ensure we never get here. UNREACHABLE(); } - virtual EndNode* Clone() { return new EndNode(*this); } + virtual void FillInBMInfo(int offset, + int recursion_depth, + int budget, + BoyerMooreLookahead* bm, + bool not_at_start) { + // Returning 0 from EatsAtLeast should ensure we never get here. + UNREACHABLE(); + } + private: Action action_; }; @@ -997,8 +990,9 @@ class NegativeSubmatchSuccess: public EndNode { NegativeSubmatchSuccess(int stack_pointer_reg, int position_reg, int clear_capture_count, - int clear_capture_start) - : EndNode(NEGATIVE_SUBMATCH_SUCCESS), + int clear_capture_start, + Zone* zone) + : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone), stack_pointer_register_(stack_pointer_reg), current_position_register_(position_reg), clear_capture_count_(clear_capture_count), @@ -1034,7 +1028,7 @@ class Guard: public ZoneObject { class GuardedAlternative { public: explicit GuardedAlternative(RegExpNode* node) : node_(node), guards_(NULL) { } - void AddGuard(Guard* guard); + void AddGuard(Guard* guard, Zone* zone); RegExpNode* node() { return node_; } void set_node(RegExpNode* node) { node_ = node; } ZoneList<Guard*>* guards() { return guards_; } @@ -1050,13 +1044,17 @@ class AlternativeGeneration; class ChoiceNode: public RegExpNode { public: - explicit ChoiceNode(int expected_size) - : alternatives_(new ZoneList<GuardedAlternative>(expected_size)), + explicit ChoiceNode(int expected_size, Zone* zone) + : RegExpNode(zone), + alternatives_(new(zone) + ZoneList<GuardedAlternative>(expected_size, zone)), table_(NULL), not_at_start_(false), being_calculated_(false) { } virtual void Accept(NodeVisitor* visitor); - void AddAlternative(GuardedAlternative node) { alternatives()->Add(node); } + void AddAlternative(GuardedAlternative node) { + alternatives()->Add(node, zone()); + } ZoneList<GuardedAlternative>* alternatives() { return alternatives_; } DispatchTable* GetTable(bool ignore_case); virtual void Emit(RegExpCompiler* compiler, Trace* trace); @@ -1071,13 +1069,18 @@ class ChoiceNode: public RegExpNode { RegExpCompiler* compiler, int characters_filled_in, bool not_at_start); - virtual ChoiceNode* Clone() { return new ChoiceNode(*this); } + virtual void FillInBMInfo(int offset, + int recursion_depth, + int budget, + BoyerMooreLookahead* bm, + bool not_at_start); bool being_calculated() { return being_calculated_; } bool not_at_start() { return not_at_start_; } void set_not_at_start() { not_at_start_ = true; } void set_being_calculated(bool b) { being_calculated_ = b; } virtual bool try_to_emit_quick_check_for_alternative(int i) { return true; } + virtual RegExpNode* FilterASCII(int depth); protected: int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative); @@ -1089,7 +1092,7 @@ class ChoiceNode: public RegExpNode { void GenerateGuard(RegExpMacroAssembler* macro_assembler, Guard* guard, Trace* trace); - int CalculatePreloadCharacters(RegExpCompiler* compiler, bool not_at_start); + int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least); void EmitOutOfLineContinuation(RegExpCompiler* compiler, Trace* trace, GuardedAlternative alternative, @@ -1107,8 +1110,9 @@ class ChoiceNode: public RegExpNode { class NegativeLookaheadChoiceNode: public ChoiceNode { public: explicit NegativeLookaheadChoiceNode(GuardedAlternative this_must_fail, - GuardedAlternative then_do_this) - : ChoiceNode(2) { + GuardedAlternative then_do_this, + Zone* zone) + : ChoiceNode(2, zone) { AddAlternative(this_must_fail); AddAlternative(then_do_this); } @@ -1119,20 +1123,29 @@ class NegativeLookaheadChoiceNode: public ChoiceNode { RegExpCompiler* compiler, int characters_filled_in, bool not_at_start); + virtual void FillInBMInfo(int offset, + int recursion_depth, + int budget, + BoyerMooreLookahead* bm, + bool not_at_start) { + alternatives_->at(1).node()->FillInBMInfo( + offset, recursion_depth + 1, budget - 1, bm, not_at_start); + if (offset == 0) set_bm_info(not_at_start, bm); + } // For a negative lookahead we don't emit the quick check for the // alternative that is expected to fail. This is because quick check code // starts by loading enough characters for the alternative that takes fewest // characters, but on a negative lookahead the negative branch did not take // part in that calculation (EatsAtLeast) so the assumptions don't hold. virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; } - virtual int ComputeFirstCharacterSet(int budget); + virtual RegExpNode* FilterASCII(int depth); }; class LoopChoiceNode: public ChoiceNode { public: - explicit LoopChoiceNode(bool body_can_be_zero_length) - : ChoiceNode(2), + explicit LoopChoiceNode(bool body_can_be_zero_length, Zone* zone) + : ChoiceNode(2, zone), loop_node_(NULL), continue_node_(NULL), body_can_be_zero_length_(body_can_be_zero_length) { } @@ -1146,12 +1159,16 @@ class LoopChoiceNode: public ChoiceNode { RegExpCompiler* compiler, int characters_filled_in, bool not_at_start); - virtual int ComputeFirstCharacterSet(int budget); - virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); } + virtual void FillInBMInfo(int offset, + int recursion_depth, + int budget, + BoyerMooreLookahead* bm, + bool not_at_start); RegExpNode* loop_node() { return loop_node_; } RegExpNode* continue_node() { return continue_node_; } bool body_can_be_zero_length() { return body_can_be_zero_length_; } virtual void Accept(NodeVisitor* visitor); + virtual RegExpNode* FilterASCII(int depth); private: // AddAlternative is made private for loop nodes because alternatives @@ -1167,6 +1184,146 @@ class LoopChoiceNode: public ChoiceNode { }; +// Improve the speed that we scan for an initial point where a non-anchored +// regexp can match by using a Boyer-Moore-like table. This is done by +// identifying non-greedy non-capturing loops in the nodes that eat any +// character one at a time. For example in the middle of the regexp +// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly +// inserted at the start of any non-anchored regexp. +// +// When we have found such a loop we look ahead in the nodes to find the set of +// characters that can come at given distances. For example for the regexp +// /.?foo/ we know that there are at least 3 characters ahead of us, and the +// sets of characters that can occur are [any, [f, o], [o]]. We find a range in +// the lookahead info where the set of characters is reasonably constrained. In +// our example this is from index 1 to 2 (0 is not constrained). We can now +// look 3 characters ahead and if we don't find one of [f, o] (the union of +// [f, o] and [o]) then we can skip forwards by the range size (in this case 2). +// +// For Unicode input strings we do the same, but modulo 128. +// +// We also look at the first string fed to the regexp and use that to get a hint +// of the character frequencies in the inputs. This affects the assessment of +// whether the set of characters is 'reasonably constrained'. +// +// We also have another lookahead mechanism (called quick check in the code), +// which uses a wide load of multiple characters followed by a mask and compare +// to determine whether a match is possible at this point. +enum ContainedInLattice { + kNotYet = 0, + kLatticeIn = 1, + kLatticeOut = 2, + kLatticeUnknown = 3 // Can also mean both in and out. +}; + + +inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) { + return static_cast<ContainedInLattice>(a | b); +} + + +ContainedInLattice AddRange(ContainedInLattice a, + const int* ranges, + int ranges_size, + Interval new_range); + + +class BoyerMoorePositionInfo : public ZoneObject { + public: + explicit BoyerMoorePositionInfo(Zone* zone) + : map_(new(zone) ZoneList<bool>(kMapSize, zone)), + map_count_(0), + w_(kNotYet), + s_(kNotYet), + d_(kNotYet), + surrogate_(kNotYet) { + for (int i = 0; i < kMapSize; i++) { + map_->Add(false, zone); + } + } + + bool& at(int i) { return map_->at(i); } + + static const int kMapSize = 128; + static const int kMask = kMapSize - 1; + + int map_count() const { return map_count_; } + + void Set(int character); + void SetInterval(const Interval& interval); + void SetAll(); + bool is_non_word() { return w_ == kLatticeOut; } + bool is_word() { return w_ == kLatticeIn; } + + private: + ZoneList<bool>* map_; + int map_count_; // Number of set bits in the map. + ContainedInLattice w_; // The \w character class. + ContainedInLattice s_; // The \s character class. + ContainedInLattice d_; // The \d character class. + ContainedInLattice surrogate_; // Surrogate UTF-16 code units. +}; + + +class BoyerMooreLookahead : public ZoneObject { + public: + BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone); + + int length() { return length_; } + int max_char() { return max_char_; } + RegExpCompiler* compiler() { return compiler_; } + + int Count(int map_number) { + return bitmaps_->at(map_number)->map_count(); + } + + BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); } + + void Set(int map_number, int character) { + if (character > max_char_) return; + BoyerMoorePositionInfo* info = bitmaps_->at(map_number); + info->Set(character); + } + + void SetInterval(int map_number, const Interval& interval) { + if (interval.from() > max_char_) return; + BoyerMoorePositionInfo* info = bitmaps_->at(map_number); + if (interval.to() > max_char_) { + info->SetInterval(Interval(interval.from(), max_char_)); + } else { + info->SetInterval(interval); + } + } + + void SetAll(int map_number) { + bitmaps_->at(map_number)->SetAll(); + } + + void SetRest(int from_map) { + for (int i = from_map; i < length_; i++) SetAll(i); + } + bool EmitSkipInstructions(RegExpMacroAssembler* masm); + + private: + // This is the value obtained by EatsAtLeast. If we do not have at least this + // many characters left in the sample string then the match is bound to fail. + // Therefore it is OK to read a character this far ahead of the current match + // point. + int length_; + RegExpCompiler* compiler_; + // 0x7f for ASCII, 0xffff for UTF-16. + int max_char_; + ZoneList<BoyerMoorePositionInfo*>* bitmaps_; + + int GetSkipTable(int min_lookahead, + int max_lookahead, + Handle<ByteArray> boolean_skip_table); + bool FindWorthwhileInterval(int* from, int* to); + int FindBestInterval( + int max_number_of_chars, int old_biggest_points, int* from, int* to); +}; + + // There are many ways to generate code for a node. This class encapsulates // the current way we should be generating. In other words it encapsulates // the current state of the code generator. The effect of this is that we @@ -1312,12 +1469,13 @@ class Trace { void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler); private: - int FindAffectedRegisters(OutSet* affected_registers); + int FindAffectedRegisters(OutSet* affected_registers, Zone* zone); void PerformDeferredActions(RegExpMacroAssembler* macro, - int max_register, - OutSet& affected_registers, - OutSet* registers_to_pop, - OutSet* registers_to_clear); + int max_register, + OutSet& affected_registers, + OutSet* registers_to_pop, + OutSet* registers_to_clear, + Zone* zone); void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register, OutSet& registers_to_pop, @@ -1350,15 +1508,17 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT) // dispatch table of a choice node. class DispatchTableConstructor: public NodeVisitor { public: - DispatchTableConstructor(DispatchTable* table, bool ignore_case) + DispatchTableConstructor(DispatchTable* table, bool ignore_case, + Zone* zone) : table_(table), choice_index_(-1), - ignore_case_(ignore_case) { } + ignore_case_(ignore_case), + zone_(zone) { } void BuildTable(ChoiceNode* node); void AddRange(CharacterRange range) { - table()->AddRange(range, choice_index_); + table()->AddRange(range, choice_index_, zone_); } void AddInverse(ZoneList<CharacterRange>* ranges); @@ -1375,6 +1535,7 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT) DispatchTable* table_; int choice_index_; bool ignore_case_; + Zone* zone_; }; @@ -1456,9 +1617,11 @@ class RegExpEngine: public AllStatic { static CompilationResult Compile(RegExpCompileData* input, bool ignore_case, + bool global, bool multiline, Handle<String> pattern, - bool is_ascii); + Handle<String> sample_subject, + bool is_ascii, Zone* zone); static void DotPrint(const char* label, RegExpNode* node, bool ignore_case); }; @@ -1483,7 +1646,8 @@ class OffsetsVector { inline int* vector() { return vector_; } inline int length() { return offsets_vector_length_; } - static const int kStaticOffsetsVectorSize = 50; + static const int kStaticOffsetsVectorSize = + Isolate::kJSRegexpStaticOffsetsVectorSize; private: static Address static_offsets_vector_address(Isolate* isolate) { |