summaryrefslogtreecommitdiff
path: root/deps/v8/src/jsregexp.h
diff options
context:
space:
mode:
authorBert Belder <bertbelder@gmail.com>2012-06-13 15:34:45 +0200
committerBert Belder <bertbelder@gmail.com>2012-06-14 01:37:13 +0200
commit50464cd4f49e40f4fe792ff46a81052319a222e9 (patch)
tree1fe524b2e6c0eb3c459142cd27539f88e1a3f63c /deps/v8/src/jsregexp.h
parent09be360a0fee2c7619bae8c4248f9ed3d79d1b30 (diff)
downloadnode-new-50464cd4f49e40f4fe792ff46a81052319a222e9.tar.gz
v8: upgrade to v3.11.10
Diffstat (limited to 'deps/v8/src/jsregexp.h')
-rw-r--r--deps/v8/src/jsregexp.h590
1 files changed, 377 insertions, 213 deletions
diff --git a/deps/v8/src/jsregexp.h b/deps/v8/src/jsregexp.h
index 8875de9eb2..782c5b0b20 100644
--- a/deps/v8/src/jsregexp.h
+++ b/deps/v8/src/jsregexp.h
@@ -40,6 +40,7 @@ class RegExpCompiler;
class RegExpMacroAssembler;
class RegExpNode;
class RegExpTree;
+class BoyerMooreLookahead;
class RegExpImpl {
public:
@@ -77,7 +78,8 @@ class RegExpImpl {
static Handle<Object> Exec(Handle<JSRegExp> regexp,
Handle<String> subject,
int index,
- Handle<JSArray> lastMatchInfo);
+ Handle<JSArray> lastMatchInfo,
+ Zone* zone);
// Prepares a JSRegExp object with Irregexp-specific data.
static void IrregexpInitialize(Handle<JSRegExp> re,
@@ -106,18 +108,26 @@ class RegExpImpl {
// as its "registers" argument. If the regexp cannot be compiled,
// an exception is set as pending, and this function returns negative.
static int IrregexpPrepare(Handle<JSRegExp> regexp,
- Handle<String> subject);
-
- // Execute a regular expression once on the subject, starting from
- // character "index".
- // If successful, returns RE_SUCCESS and set the capture positions
- // in the first registers.
+ Handle<String> subject,
+ Zone* zone);
+
+ // Calculate the size of offsets vector for the case of global regexp
+ // and the number of matches this vector is able to store.
+ static int GlobalOffsetsVectorSize(Handle<JSRegExp> regexp,
+ int registers_per_match,
+ int* max_matches);
+
+ // Execute a regular expression on the subject, starting from index.
+ // If matching succeeds, return the number of matches. This can be larger
+ // than one in the case of global regular expressions.
+ // The captures and subcaptures are stored into the registers vector.
// If matching fails, returns RE_FAILURE.
// If execution fails, sets a pending exception and returns RE_EXCEPTION.
- static IrregexpResult IrregexpExecOnce(Handle<JSRegExp> regexp,
- Handle<String> subject,
- int index,
- Vector<int> registers);
+ static int IrregexpExecRaw(Handle<JSRegExp> regexp,
+ Handle<String> subject,
+ int index,
+ Vector<int> registers,
+ Zone* zone);
// Execute an Irregexp bytecode pattern.
// On a successful match, the result is a JSArray containing
@@ -126,7 +136,8 @@ class RegExpImpl {
static Handle<Object> IrregexpExec(Handle<JSRegExp> regexp,
Handle<String> subject,
int index,
- Handle<JSArray> lastMatchInfo);
+ Handle<JSArray> lastMatchInfo,
+ Zone* zone);
// Array index in the lastMatchInfo array.
static const int kLastCaptureCount = 0;
@@ -190,8 +201,12 @@ class RegExpImpl {
static String* last_ascii_string_;
static String* two_byte_cached_string_;
- static bool CompileIrregexp(Handle<JSRegExp> re, bool is_ascii);
- static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii);
+ static bool CompileIrregexp(
+ Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii,
+ Zone* zone);
+ static inline bool EnsureCompiledIrregexp(
+ Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii,
+ Zone* zone);
// Set the subject cache. The previous string buffer is not deleted, so the
@@ -222,56 +237,17 @@ enum ElementInSetsRelation {
};
-// Represents the relation of two sets.
-// Sets can be either disjoint, partially or fully overlapping, or equal.
-class SetRelation BASE_EMBEDDED {
- public:
- // Relation is represented by a bit saying whether there are elements in
- // one set that is not in the other, and a bit saying that there are elements
- // that are in both sets.
-
- // Location of an element. Corresponds to the internal areas of
- // a Venn diagram.
- enum {
- kInFirst = 1 << kInsideFirst,
- kInSecond = 1 << kInsideSecond,
- kInBoth = 1 << kInsideBoth
- };
- SetRelation() : bits_(0) {}
- ~SetRelation() {}
- // Add the existence of objects in a particular
- void SetElementsInFirstSet() { bits_ |= kInFirst; }
- void SetElementsInSecondSet() { bits_ |= kInSecond; }
- void SetElementsInBothSets() { bits_ |= kInBoth; }
- // Check the currently known relation of the sets (common functions only,
- // for other combinations, use value() to get the bits and check them
- // manually).
- // Sets are completely disjoint.
- bool Disjoint() { return (bits_ & kInBoth) == 0; }
- // Sets are equal.
- bool Equals() { return (bits_ & (kInFirst | kInSecond)) == 0; }
- // First set contains second.
- bool Contains() { return (bits_ & kInSecond) == 0; }
- // Second set contains first.
- bool ContainedIn() { return (bits_ & kInFirst) == 0; }
- bool NonTrivialIntersection() {
- return (bits_ == (kInFirst | kInSecond | kInBoth));
- }
- int value() { return bits_; }
-
- private:
- int bits_;
-};
-
-
+// Represents code units in the range from from_ to to_, both ends are
+// inclusive.
class CharacterRange {
public:
CharacterRange() : from_(0), to_(0) { }
// For compatibility with the CHECK_OK macro
CharacterRange(void* null) { ASSERT_EQ(NULL, null); } //NOLINT
CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) { }
- static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges);
- static Vector<const uc16> GetWordBounds();
+ static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
+ Zone* zone);
+ static Vector<const int> GetWordBounds();
static inline CharacterRange Singleton(uc16 value) {
return CharacterRange(value, value);
}
@@ -290,11 +266,13 @@ class CharacterRange {
bool is_valid() { return from_ <= to_; }
bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
bool IsSingleton() { return (from_ == to_); }
- void AddCaseEquivalents(ZoneList<CharacterRange>* ranges, bool is_ascii);
+ void AddCaseEquivalents(ZoneList<CharacterRange>* ranges, bool is_ascii,
+ Zone* zone);
static void Split(ZoneList<CharacterRange>* base,
- Vector<const uc16> overlay,
+ Vector<const int> overlay,
ZoneList<CharacterRange>** included,
- ZoneList<CharacterRange>** excluded);
+ ZoneList<CharacterRange>** excluded,
+ Zone* zone);
// Whether a range list is in canonical form: Ranges ordered by from value,
// and ranges non-overlapping and non-adjacent.
static bool IsCanonical(ZoneList<CharacterRange>* ranges);
@@ -303,31 +281,10 @@ class CharacterRange {
// adjacent ranges are merged. The resulting list may be shorter than the
// original, but cannot be longer.
static void Canonicalize(ZoneList<CharacterRange>* ranges);
- // Check how the set of characters defined by a CharacterRange list relates
- // to the set of word characters. List must be in canonical form.
- static SetRelation WordCharacterRelation(ZoneList<CharacterRange>* ranges);
- // Takes two character range lists (representing character sets) in canonical
- // form and merges them.
- // The characters that are only covered by the first set are added to
- // first_set_only_out. the characters that are only in the second set are
- // added to second_set_only_out, and the characters that are in both are
- // added to both_sets_out.
- // The pointers to first_set_only_out, second_set_only_out and both_sets_out
- // should be to empty lists, but they need not be distinct, and may be NULL.
- // If NULL, the characters are dropped, and if two arguments are the same
- // pointer, the result is the union of the two sets that would be created
- // if the pointers had been distinct.
- // This way, the Merge function can compute all the usual set operations:
- // union (all three out-sets are equal), intersection (only both_sets_out is
- // non-NULL), and set difference (only first_set is non-NULL).
- static void Merge(ZoneList<CharacterRange>* first_set,
- ZoneList<CharacterRange>* second_set,
- ZoneList<CharacterRange>* first_set_only_out,
- ZoneList<CharacterRange>* second_set_only_out,
- ZoneList<CharacterRange>* both_sets_out);
// Negate the contents of a character range in canonical form.
static void Negate(ZoneList<CharacterRange>* src,
- ZoneList<CharacterRange>* dst);
+ ZoneList<CharacterRange>* dst,
+ Zone* zone);
static const int kStartMarker = (1 << 24);
static const int kPayloadMask = (1 << 24) - 1;
@@ -342,7 +299,7 @@ class CharacterRange {
class OutSet: public ZoneObject {
public:
OutSet() : first_(0), remaining_(NULL), successors_(NULL) { }
- OutSet* Extend(unsigned value);
+ OutSet* Extend(unsigned value, Zone* zone);
bool Get(unsigned value);
static const unsigned kFirstLimit = 32;
@@ -350,12 +307,12 @@ class OutSet: public ZoneObject {
// Destructively set a value in this set. In most cases you want
// to use Extend instead to ensure that only one instance exists
// that contains the same values.
- void Set(unsigned value);
+ void Set(unsigned value, Zone* zone);
// The successors are a list of sets that contain the same values
// as this set and the one more value that is not present in this
// set.
- ZoneList<OutSet*>* successors() { return successors_; }
+ ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }
OutSet(uint32_t first, ZoneList<unsigned>* remaining)
: first_(first), remaining_(remaining), successors_(NULL) { }
@@ -370,6 +327,8 @@ class OutSet: public ZoneObject {
// Used for mapping character ranges to choices.
class DispatchTable : public ZoneObject {
public:
+ explicit DispatchTable(Zone* zone) : tree_(zone) { }
+
class Entry {
public:
Entry() : from_(0), to_(0), out_set_(NULL) { }
@@ -378,7 +337,9 @@ class DispatchTable : public ZoneObject {
uc16 from() { return from_; }
uc16 to() { return to_; }
void set_to(uc16 value) { to_ = value; }
- void AddValue(int value) { out_set_ = out_set_->Extend(value); }
+ void AddValue(int value, Zone* zone) {
+ out_set_ = out_set_->Extend(value, zone);
+ }
OutSet* out_set() { return out_set_; }
private:
uc16 from_;
@@ -402,12 +363,14 @@ class DispatchTable : public ZoneObject {
}
};
- void AddRange(CharacterRange range, int value);
+ void AddRange(CharacterRange range, int value, Zone* zone);
OutSet* Get(uc16 value);
void Dump();
template <typename Callback>
- void ForEach(Callback* callback) { return tree()->ForEach(callback); }
+ void ForEach(Callback* callback) {
+ return tree()->ForEach(callback);
+ }
private:
// There can't be a static empty set since it allocates its
@@ -475,7 +438,8 @@ struct NodeInfo {
follows_newline_interest(false),
follows_start_interest(false),
at_end(false),
- visited(false) { }
+ visited(false),
+ replacement_calculated(false) { }
// Returns true if the interests and assumptions of this node
// matches the given one.
@@ -525,25 +489,7 @@ struct NodeInfo {
bool at_end: 1;
bool visited: 1;
-};
-
-
-class SiblingList {
- public:
- SiblingList() : list_(NULL) { }
- int length() {
- return list_ == NULL ? 0 : list_->length();
- }
- void Ensure(RegExpNode* parent) {
- if (list_ == NULL) {
- list_ = new ZoneList<RegExpNode*>(2);
- list_->Add(parent);
- }
- }
- void Add(RegExpNode* node) { list_->Add(node); }
- RegExpNode* Get(int index) { return list_->at(index); }
- private:
- ZoneList<RegExpNode*>* list_;
+ bool replacement_calculated: 1;
};
@@ -599,9 +545,15 @@ class QuickCheckDetails {
};
+extern int kUninitializedRegExpNodePlaceHolder;
+
+
class RegExpNode: public ZoneObject {
public:
- RegExpNode() : first_character_set_(NULL), trace_count_(0) { }
+ explicit RegExpNode(Zone* zone)
+ : replacement_(NULL), trace_count_(0), zone_(zone) {
+ bm_info_[0] = bm_info_[1] = NULL;
+ }
virtual ~RegExpNode();
virtual void Accept(NodeVisitor* visitor) = 0;
// Generates a goto to this node or actually generates the code at this point.
@@ -635,6 +587,50 @@ class RegExpNode: public ZoneObject {
bool not_at_start) = 0;
static const int kNodeIsTooComplexForGreedyLoops = -1;
virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
+ // Only returns the successor for a text node of length 1 that matches any
+ // character and that has no guards on it.
+ virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
+ RegExpCompiler* compiler) {
+ return NULL;
+ }
+
+ // Collects information on the possible code units (mod 128) that can match if
+ // we look forward. This is used for a Boyer-Moore-like string searching
+ // implementation. TODO(erikcorry): This should share more code with
+ // EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit
+ // the number of nodes we are willing to look at in order to create this data.
+ static const int kFillInBMBudget = 200;
+ virtual void FillInBMInfo(int offset,
+ int recursion_depth,
+ int budget,
+ BoyerMooreLookahead* bm,
+ bool not_at_start) {
+ UNREACHABLE();
+ }
+
+ // If we know that the input is ASCII then there are some nodes that can
+ // never match. This method returns a node that can be substituted for
+ // itself, or NULL if the node can never match.
+ virtual RegExpNode* FilterASCII(int depth) { return this; }
+ // Helper for FilterASCII.
+ RegExpNode* replacement() {
+ ASSERT(info()->replacement_calculated);
+ return replacement_;
+ }
+ RegExpNode* set_replacement(RegExpNode* replacement) {
+ info()->replacement_calculated = true;
+ replacement_ = replacement;
+ return replacement; // For convenience.
+ }
+
+ // We want to avoid recalculating the lookahead info, so we store it on the
+ // node. Only info that is for this node is stored. We can tell that the
+ // info is for this node when offset == 0, so the information is calculated
+ // relative to this node.
+ void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
+ if (offset == 0) set_bm_info(not_at_start, bm);
+ }
+
Label* label() { return &label_; }
// If non-generic code is generated for a node (i.e. the node is not at the
// start of the trace) then it cannot be reused. This variable sets a limit
@@ -645,72 +641,35 @@ class RegExpNode: public ZoneObject {
NodeInfo* info() { return &info_; }
- void AddSibling(RegExpNode* node) { siblings_.Add(node); }
-
- // Static version of EnsureSibling that expresses the fact that the
- // result has the same type as the input.
- template <class C>
- static C* EnsureSibling(C* node, NodeInfo* info, bool* cloned) {
- return static_cast<C*>(node->EnsureSibling(info, cloned));
+ BoyerMooreLookahead* bm_info(bool not_at_start) {
+ return bm_info_[not_at_start ? 1 : 0];
}
- SiblingList* siblings() { return &siblings_; }
- void set_siblings(SiblingList* other) { siblings_ = *other; }
-
- // Return the set of possible next characters recognized by the regexp
- // (or a safe subset, potentially the set of all characters).
- ZoneList<CharacterRange>* FirstCharacterSet();
-
- // Compute (if possible within the budget of traversed nodes) the
- // possible first characters of the input matched by this node and
- // its continuation. Returns the remaining budget after the computation.
- // If the budget is spent, the result is negative, and the cached
- // first_character_set_ value isn't set.
- virtual int ComputeFirstCharacterSet(int budget);
-
- // Get and set the cached first character set value.
- ZoneList<CharacterRange>* first_character_set() {
- return first_character_set_;
- }
- void set_first_character_set(ZoneList<CharacterRange>* character_set) {
- first_character_set_ = character_set;
- }
+ Zone* zone() const { return zone_; }
protected:
enum LimitResult { DONE, CONTINUE };
- static const int kComputeFirstCharacterSetFail = -1;
+ RegExpNode* replacement_;
LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
- // Returns a sibling of this node whose interests and assumptions
- // match the ones in the given node info. If no sibling exists NULL
- // is returned.
- RegExpNode* TryGetSibling(NodeInfo* info);
-
- // Returns a sibling of this node whose interests match the ones in
- // the given node info. The info must not contain any assertions.
- // If no node exists a new one will be created by cloning the current
- // node. The result will always be an instance of the same concrete
- // class as this node.
- RegExpNode* EnsureSibling(NodeInfo* info, bool* cloned);
-
- // Returns a clone of this node initialized using the copy constructor
- // of its concrete class. Note that the node may have to be pre-
- // processed before it is on a usable state.
- virtual RegExpNode* Clone() = 0;
+ void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
+ bm_info_[not_at_start ? 1 : 0] = bm;
+ }
private:
static const int kFirstCharBudget = 10;
Label label_;
NodeInfo info_;
- SiblingList siblings_;
- ZoneList<CharacterRange>* first_character_set_;
// This variable keeps track of how many times code has been generated for
// this node (in different traces). We don't keep track of where the
// generated code is located unless the code is generated at the start of
// a trace, in which case it is generic and can be reused by flushing the
// deferred operations in the current trace and generating a goto.
int trace_count_;
+ BoyerMooreLookahead* bm_info_[2];
+
+ Zone* zone_;
};
@@ -731,8 +690,8 @@ class Interval {
return (from_ <= value) && (value <= to_);
}
bool is_empty() { return from_ == kNone; }
- int from() { return from_; }
- int to() { return to_; }
+ int from() const { return from_; }
+ int to() const { return to_; }
static Interval Empty() { return Interval(); }
static const int kNone = -1;
private:
@@ -744,9 +703,23 @@ class Interval {
class SeqRegExpNode: public RegExpNode {
public:
explicit SeqRegExpNode(RegExpNode* on_success)
- : on_success_(on_success) { }
+ : RegExpNode(on_success->zone()), on_success_(on_success) { }
RegExpNode* on_success() { return on_success_; }
void set_on_success(RegExpNode* node) { on_success_ = node; }
+ virtual RegExpNode* FilterASCII(int depth);
+ virtual void FillInBMInfo(int offset,
+ int recursion_depth,
+ int budget,
+ BoyerMooreLookahead* bm,
+ bool not_at_start) {
+ on_success_->FillInBMInfo(
+ offset, recursion_depth + 1, budget - 1, bm, not_at_start);
+ if (offset == 0) set_bm_info(not_at_start, bm);
+ }
+
+ protected:
+ RegExpNode* FilterSuccessor(int depth);
+
private:
RegExpNode* on_success_;
};
@@ -793,11 +766,14 @@ class ActionNode: public SeqRegExpNode {
return on_success()->GetQuickCheckDetails(
details, compiler, filled_in, not_at_start);
}
+ virtual void FillInBMInfo(int offset,
+ int recursion_depth,
+ int budget,
+ BoyerMooreLookahead* bm,
+ bool not_at_start);
Type type() { return type_; }
// TODO(erikcorry): We should allow some action nodes in greedy loops.
virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
- virtual ActionNode* Clone() { return new ActionNode(*this); }
- virtual int ComputeFirstCharacterSet(int budget);
private:
union {
@@ -845,8 +821,8 @@ class TextNode: public SeqRegExpNode {
TextNode(RegExpCharacterClass* that,
RegExpNode* on_success)
: SeqRegExpNode(on_success),
- elms_(new ZoneList<TextElement>(1)) {
- elms_->Add(TextElement::CharClass(that));
+ elms_(new(zone()) ZoneList<TextElement>(1, zone())) {
+ elms_->Add(TextElement::CharClass(that), zone());
}
virtual void Accept(NodeVisitor* visitor);
virtual void Emit(RegExpCompiler* compiler, Trace* trace);
@@ -860,13 +836,15 @@ class TextNode: public SeqRegExpNode {
ZoneList<TextElement>* elements() { return elms_; }
void MakeCaseIndependent(bool is_ascii);
virtual int GreedyLoopTextLength();
- virtual TextNode* Clone() {
- TextNode* result = new TextNode(*this);
- result->CalculateOffsets();
- return result;
- }
+ virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
+ RegExpCompiler* compiler);
+ virtual void FillInBMInfo(int offset,
+ int recursion_depth,
+ int budget,
+ BoyerMooreLookahead* bm,
+ bool not_at_start);
void CalculateOffsets();
- virtual int ComputeFirstCharacterSet(int budget);
+ virtual RegExpNode* FilterASCII(int depth);
private:
enum TextEmitPassType {
@@ -897,27 +875,22 @@ class AssertionNode: public SeqRegExpNode {
AT_START,
AT_BOUNDARY,
AT_NON_BOUNDARY,
- AFTER_NEWLINE,
- // Types not directly expressible in regexp syntax.
- // Used for modifying a boundary node if its following character is
- // known to be word and/or non-word.
- AFTER_NONWORD_CHARACTER,
- AFTER_WORD_CHARACTER
+ AFTER_NEWLINE
};
static AssertionNode* AtEnd(RegExpNode* on_success) {
- return new AssertionNode(AT_END, on_success);
+ return new(on_success->zone()) AssertionNode(AT_END, on_success);
}
static AssertionNode* AtStart(RegExpNode* on_success) {
- return new AssertionNode(AT_START, on_success);
+ return new(on_success->zone()) AssertionNode(AT_START, on_success);
}
static AssertionNode* AtBoundary(RegExpNode* on_success) {
- return new AssertionNode(AT_BOUNDARY, on_success);
+ return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
}
static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
- return new AssertionNode(AT_NON_BOUNDARY, on_success);
+ return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
}
static AssertionNode* AfterNewline(RegExpNode* on_success) {
- return new AssertionNode(AFTER_NEWLINE, on_success);
+ return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
}
virtual void Accept(NodeVisitor* visitor);
virtual void Emit(RegExpCompiler* compiler, Trace* trace);
@@ -928,12 +901,20 @@ class AssertionNode: public SeqRegExpNode {
RegExpCompiler* compiler,
int filled_in,
bool not_at_start);
- virtual int ComputeFirstCharacterSet(int budget);
- virtual AssertionNode* Clone() { return new AssertionNode(*this); }
+ virtual void FillInBMInfo(int offset,
+ int recursion_depth,
+ int budget,
+ BoyerMooreLookahead* bm,
+ bool not_at_start);
AssertionNodeType type() { return type_; }
void set_type(AssertionNodeType type) { type_ = type; }
private:
+ void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
+ enum IfPrevious { kIsNonWord, kIsWord };
+ void BacktrackIfPrevious(RegExpCompiler* compiler,
+ Trace* trace,
+ IfPrevious backtrack_if_previous);
AssertionNode(AssertionNodeType t, RegExpNode* on_success)
: SeqRegExpNode(on_success), type_(t) { }
AssertionNodeType type_;
@@ -961,8 +942,11 @@ class BackReferenceNode: public SeqRegExpNode {
bool not_at_start) {
return;
}
- virtual BackReferenceNode* Clone() { return new BackReferenceNode(*this); }
- virtual int ComputeFirstCharacterSet(int budget);
+ virtual void FillInBMInfo(int offset,
+ int recursion_depth,
+ int budget,
+ BoyerMooreLookahead* bm,
+ bool not_at_start);
private:
int start_reg_;
@@ -973,7 +957,8 @@ class BackReferenceNode: public SeqRegExpNode {
class EndNode: public RegExpNode {
public:
enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
- explicit EndNode(Action action) : action_(action) { }
+ explicit EndNode(Action action, Zone* zone)
+ : RegExpNode(zone), action_(action) { }
virtual void Accept(NodeVisitor* visitor);
virtual void Emit(RegExpCompiler* compiler, Trace* trace);
virtual int EatsAtLeast(int still_to_find,
@@ -986,7 +971,15 @@ class EndNode: public RegExpNode {
// Returning 0 from EatsAtLeast should ensure we never get here.
UNREACHABLE();
}
- virtual EndNode* Clone() { return new EndNode(*this); }
+ virtual void FillInBMInfo(int offset,
+ int recursion_depth,
+ int budget,
+ BoyerMooreLookahead* bm,
+ bool not_at_start) {
+ // Returning 0 from EatsAtLeast should ensure we never get here.
+ UNREACHABLE();
+ }
+
private:
Action action_;
};
@@ -997,8 +990,9 @@ class NegativeSubmatchSuccess: public EndNode {
NegativeSubmatchSuccess(int stack_pointer_reg,
int position_reg,
int clear_capture_count,
- int clear_capture_start)
- : EndNode(NEGATIVE_SUBMATCH_SUCCESS),
+ int clear_capture_start,
+ Zone* zone)
+ : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
stack_pointer_register_(stack_pointer_reg),
current_position_register_(position_reg),
clear_capture_count_(clear_capture_count),
@@ -1034,7 +1028,7 @@ class Guard: public ZoneObject {
class GuardedAlternative {
public:
explicit GuardedAlternative(RegExpNode* node) : node_(node), guards_(NULL) { }
- void AddGuard(Guard* guard);
+ void AddGuard(Guard* guard, Zone* zone);
RegExpNode* node() { return node_; }
void set_node(RegExpNode* node) { node_ = node; }
ZoneList<Guard*>* guards() { return guards_; }
@@ -1050,13 +1044,17 @@ class AlternativeGeneration;
class ChoiceNode: public RegExpNode {
public:
- explicit ChoiceNode(int expected_size)
- : alternatives_(new ZoneList<GuardedAlternative>(expected_size)),
+ explicit ChoiceNode(int expected_size, Zone* zone)
+ : RegExpNode(zone),
+ alternatives_(new(zone)
+ ZoneList<GuardedAlternative>(expected_size, zone)),
table_(NULL),
not_at_start_(false),
being_calculated_(false) { }
virtual void Accept(NodeVisitor* visitor);
- void AddAlternative(GuardedAlternative node) { alternatives()->Add(node); }
+ void AddAlternative(GuardedAlternative node) {
+ alternatives()->Add(node, zone());
+ }
ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
DispatchTable* GetTable(bool ignore_case);
virtual void Emit(RegExpCompiler* compiler, Trace* trace);
@@ -1071,13 +1069,18 @@ class ChoiceNode: public RegExpNode {
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start);
- virtual ChoiceNode* Clone() { return new ChoiceNode(*this); }
+ virtual void FillInBMInfo(int offset,
+ int recursion_depth,
+ int budget,
+ BoyerMooreLookahead* bm,
+ bool not_at_start);
bool being_calculated() { return being_calculated_; }
bool not_at_start() { return not_at_start_; }
void set_not_at_start() { not_at_start_ = true; }
void set_being_calculated(bool b) { being_calculated_ = b; }
virtual bool try_to_emit_quick_check_for_alternative(int i) { return true; }
+ virtual RegExpNode* FilterASCII(int depth);
protected:
int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
@@ -1089,7 +1092,7 @@ class ChoiceNode: public RegExpNode {
void GenerateGuard(RegExpMacroAssembler* macro_assembler,
Guard* guard,
Trace* trace);
- int CalculatePreloadCharacters(RegExpCompiler* compiler, bool not_at_start);
+ int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
void EmitOutOfLineContinuation(RegExpCompiler* compiler,
Trace* trace,
GuardedAlternative alternative,
@@ -1107,8 +1110,9 @@ class ChoiceNode: public RegExpNode {
class NegativeLookaheadChoiceNode: public ChoiceNode {
public:
explicit NegativeLookaheadChoiceNode(GuardedAlternative this_must_fail,
- GuardedAlternative then_do_this)
- : ChoiceNode(2) {
+ GuardedAlternative then_do_this,
+ Zone* zone)
+ : ChoiceNode(2, zone) {
AddAlternative(this_must_fail);
AddAlternative(then_do_this);
}
@@ -1119,20 +1123,29 @@ class NegativeLookaheadChoiceNode: public ChoiceNode {
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start);
+ virtual void FillInBMInfo(int offset,
+ int recursion_depth,
+ int budget,
+ BoyerMooreLookahead* bm,
+ bool not_at_start) {
+ alternatives_->at(1).node()->FillInBMInfo(
+ offset, recursion_depth + 1, budget - 1, bm, not_at_start);
+ if (offset == 0) set_bm_info(not_at_start, bm);
+ }
// For a negative lookahead we don't emit the quick check for the
// alternative that is expected to fail. This is because quick check code
// starts by loading enough characters for the alternative that takes fewest
// characters, but on a negative lookahead the negative branch did not take
// part in that calculation (EatsAtLeast) so the assumptions don't hold.
virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; }
- virtual int ComputeFirstCharacterSet(int budget);
+ virtual RegExpNode* FilterASCII(int depth);
};
class LoopChoiceNode: public ChoiceNode {
public:
- explicit LoopChoiceNode(bool body_can_be_zero_length)
- : ChoiceNode(2),
+ explicit LoopChoiceNode(bool body_can_be_zero_length, Zone* zone)
+ : ChoiceNode(2, zone),
loop_node_(NULL),
continue_node_(NULL),
body_can_be_zero_length_(body_can_be_zero_length) { }
@@ -1146,12 +1159,16 @@ class LoopChoiceNode: public ChoiceNode {
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start);
- virtual int ComputeFirstCharacterSet(int budget);
- virtual LoopChoiceNode* Clone() { return new LoopChoiceNode(*this); }
+ virtual void FillInBMInfo(int offset,
+ int recursion_depth,
+ int budget,
+ BoyerMooreLookahead* bm,
+ bool not_at_start);
RegExpNode* loop_node() { return loop_node_; }
RegExpNode* continue_node() { return continue_node_; }
bool body_can_be_zero_length() { return body_can_be_zero_length_; }
virtual void Accept(NodeVisitor* visitor);
+ virtual RegExpNode* FilterASCII(int depth);
private:
// AddAlternative is made private for loop nodes because alternatives
@@ -1167,6 +1184,146 @@ class LoopChoiceNode: public ChoiceNode {
};
+// Improve the speed that we scan for an initial point where a non-anchored
+// regexp can match by using a Boyer-Moore-like table. This is done by
+// identifying non-greedy non-capturing loops in the nodes that eat any
+// character one at a time. For example in the middle of the regexp
+// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
+// inserted at the start of any non-anchored regexp.
+//
+// When we have found such a loop we look ahead in the nodes to find the set of
+// characters that can come at given distances. For example for the regexp
+// /.?foo/ we know that there are at least 3 characters ahead of us, and the
+// sets of characters that can occur are [any, [f, o], [o]]. We find a range in
+// the lookahead info where the set of characters is reasonably constrained. In
+// our example this is from index 1 to 2 (0 is not constrained). We can now
+// look 3 characters ahead and if we don't find one of [f, o] (the union of
+// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
+//
+// For Unicode input strings we do the same, but modulo 128.
+//
+// We also look at the first string fed to the regexp and use that to get a hint
+// of the character frequencies in the inputs. This affects the assessment of
+// whether the set of characters is 'reasonably constrained'.
+//
+// We also have another lookahead mechanism (called quick check in the code),
+// which uses a wide load of multiple characters followed by a mask and compare
+// to determine whether a match is possible at this point.
+enum ContainedInLattice {
+ kNotYet = 0,
+ kLatticeIn = 1,
+ kLatticeOut = 2,
+ kLatticeUnknown = 3 // Can also mean both in and out.
+};
+
+
+inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
+ return static_cast<ContainedInLattice>(a | b);
+}
+
+
+ContainedInLattice AddRange(ContainedInLattice a,
+ const int* ranges,
+ int ranges_size,
+ Interval new_range);
+
+
+class BoyerMoorePositionInfo : public ZoneObject {
+ public:
+ explicit BoyerMoorePositionInfo(Zone* zone)
+ : map_(new(zone) ZoneList<bool>(kMapSize, zone)),
+ map_count_(0),
+ w_(kNotYet),
+ s_(kNotYet),
+ d_(kNotYet),
+ surrogate_(kNotYet) {
+ for (int i = 0; i < kMapSize; i++) {
+ map_->Add(false, zone);
+ }
+ }
+
+ bool& at(int i) { return map_->at(i); }
+
+ static const int kMapSize = 128;
+ static const int kMask = kMapSize - 1;
+
+ int map_count() const { return map_count_; }
+
+ void Set(int character);
+ void SetInterval(const Interval& interval);
+ void SetAll();
+ bool is_non_word() { return w_ == kLatticeOut; }
+ bool is_word() { return w_ == kLatticeIn; }
+
+ private:
+ ZoneList<bool>* map_;
+ int map_count_; // Number of set bits in the map.
+ ContainedInLattice w_; // The \w character class.
+ ContainedInLattice s_; // The \s character class.
+ ContainedInLattice d_; // The \d character class.
+ ContainedInLattice surrogate_; // Surrogate UTF-16 code units.
+};
+
+
+class BoyerMooreLookahead : public ZoneObject {
+ public:
+ BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
+
+ int length() { return length_; }
+ int max_char() { return max_char_; }
+ RegExpCompiler* compiler() { return compiler_; }
+
+ int Count(int map_number) {
+ return bitmaps_->at(map_number)->map_count();
+ }
+
+ BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
+
+ void Set(int map_number, int character) {
+ if (character > max_char_) return;
+ BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
+ info->Set(character);
+ }
+
+ void SetInterval(int map_number, const Interval& interval) {
+ if (interval.from() > max_char_) return;
+ BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
+ if (interval.to() > max_char_) {
+ info->SetInterval(Interval(interval.from(), max_char_));
+ } else {
+ info->SetInterval(interval);
+ }
+ }
+
+ void SetAll(int map_number) {
+ bitmaps_->at(map_number)->SetAll();
+ }
+
+ void SetRest(int from_map) {
+ for (int i = from_map; i < length_; i++) SetAll(i);
+ }
+ bool EmitSkipInstructions(RegExpMacroAssembler* masm);
+
+ private:
+ // This is the value obtained by EatsAtLeast. If we do not have at least this
+ // many characters left in the sample string then the match is bound to fail.
+ // Therefore it is OK to read a character this far ahead of the current match
+ // point.
+ int length_;
+ RegExpCompiler* compiler_;
+ // 0x7f for ASCII, 0xffff for UTF-16.
+ int max_char_;
+ ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
+
+ int GetSkipTable(int min_lookahead,
+ int max_lookahead,
+ Handle<ByteArray> boolean_skip_table);
+ bool FindWorthwhileInterval(int* from, int* to);
+ int FindBestInterval(
+ int max_number_of_chars, int old_biggest_points, int* from, int* to);
+};
+
+
// There are many ways to generate code for a node. This class encapsulates
// the current way we should be generating. In other words it encapsulates
// the current state of the code generator. The effect of this is that we
@@ -1312,12 +1469,13 @@ class Trace {
void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
private:
- int FindAffectedRegisters(OutSet* affected_registers);
+ int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
void PerformDeferredActions(RegExpMacroAssembler* macro,
- int max_register,
- OutSet& affected_registers,
- OutSet* registers_to_pop,
- OutSet* registers_to_clear);
+ int max_register,
+ OutSet& affected_registers,
+ OutSet* registers_to_pop,
+ OutSet* registers_to_clear,
+ Zone* zone);
void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
int max_register,
OutSet& registers_to_pop,
@@ -1350,15 +1508,17 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
// dispatch table of a choice node.
class DispatchTableConstructor: public NodeVisitor {
public:
- DispatchTableConstructor(DispatchTable* table, bool ignore_case)
+ DispatchTableConstructor(DispatchTable* table, bool ignore_case,
+ Zone* zone)
: table_(table),
choice_index_(-1),
- ignore_case_(ignore_case) { }
+ ignore_case_(ignore_case),
+ zone_(zone) { }
void BuildTable(ChoiceNode* node);
void AddRange(CharacterRange range) {
- table()->AddRange(range, choice_index_);
+ table()->AddRange(range, choice_index_, zone_);
}
void AddInverse(ZoneList<CharacterRange>* ranges);
@@ -1375,6 +1535,7 @@ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
DispatchTable* table_;
int choice_index_;
bool ignore_case_;
+ Zone* zone_;
};
@@ -1456,9 +1617,11 @@ class RegExpEngine: public AllStatic {
static CompilationResult Compile(RegExpCompileData* input,
bool ignore_case,
+ bool global,
bool multiline,
Handle<String> pattern,
- bool is_ascii);
+ Handle<String> sample_subject,
+ bool is_ascii, Zone* zone);
static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
};
@@ -1483,7 +1646,8 @@ class OffsetsVector {
inline int* vector() { return vector_; }
inline int length() { return offsets_vector_length_; }
- static const int kStaticOffsetsVectorSize = 50;
+ static const int kStaticOffsetsVectorSize =
+ Isolate::kJSRegexpStaticOffsetsVectorSize;
private:
static Address static_offsets_vector_address(Isolate* isolate) {