diff options
author | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2020-10-12 14:27:29 +0200 |
---|---|---|
committer | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2020-10-13 09:35:20 +0000 |
commit | c30a6232df03e1efbd9f3b226777b07e087a1122 (patch) | |
tree | e992f45784689f373bcc38d1b79a239ebe17ee23 /chromium/third_party/re2 | |
parent | 7b5b123ac58f58ffde0f4f6e488bcd09aa4decd3 (diff) | |
download | qtwebengine-chromium-85-based.tar.gz |
BASELINE: Update Chromium to 85.0.4183.14085-based
Change-Id: Iaa42f4680837c57725b1344f108c0196741f6057
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'chromium/third_party/re2')
25 files changed, 683 insertions, 459 deletions
diff --git a/chromium/third_party/re2/src/BUILD b/chromium/third_party/re2/src/BUILD index 480646fc1a8..8a1a9c0b8f2 100644 --- a/chromium/third_party/re2/src/BUILD +++ b/chromium/third_party/re2/src/BUILD @@ -9,18 +9,18 @@ licenses(["notice"]) exports_files(["LICENSE"]) config_setting( - name = "darwin", + name = "macos", values = {"cpu": "darwin"}, ) config_setting( - name = "windows", - values = {"cpu": "x64_windows"}, + name = "wasm", + values = {"cpu": "wasm"}, ) config_setting( - name = "windows_msvc", - values = {"cpu": "x64_windows_msvc"}, + name = "windows", + values = {"cpu": "x64_windows"}, ) load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test") @@ -75,17 +75,17 @@ cc_library( "re2/stringpiece.h", ], copts = select({ + ":wasm": [], ":windows": [], - ":windows_msvc": [], "//conditions:default": ["-pthread"], }), linkopts = select({ - # Darwin doesn't need `-pthread' when linking and it appears that + # macOS doesn't need `-pthread' when linking and it appears that # older versions of Clang will warn about the unused command line # argument, so just don't pass it. - ":darwin": [], + ":macos": [], + ":wasm": [], ":windows": [], - ":windows_msvc": [], "//conditions:default": ["-pthread"], }), visibility = ["//visibility:public"], diff --git a/chromium/third_party/re2/src/CMakeLists.txt b/chromium/third_party/re2/src/CMakeLists.txt index 5c980f04622..44a4772088a 100644 --- a/chromium/third_party/re2/src/CMakeLists.txt +++ b/chromium/third_party/re2/src/CMakeLists.txt @@ -13,6 +13,11 @@ project(RE2 CXX) include(CTest) include(GNUInstallDirs) +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 11) + set(CMAKE_CXX_STANDARD_REQUIRED ON) +endif() + option(BUILD_SHARED_LIBS "build shared libraries" OFF) option(USEPCRE "use PCRE in tests and benchmarks" OFF) @@ -36,11 +41,6 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") # Without a byte order mark (BOM), Visual Studio assumes that the source # file is encoded using the current user code page, so we specify UTF-8. add_compile_options(/utf-8) -elseif(CYGWIN OR MINGW) - # See https://stackoverflow.com/questions/38139631 for details. - add_compile_options(-std=gnu++11) -elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") - add_compile_options(-std=c++11) endif() if(WIN32) diff --git a/chromium/third_party/re2/src/Makefile b/chromium/third_party/re2/src/Makefile index a550717017a..20b8d0f3ca2 100644 --- a/chromium/third_party/re2/src/Makefile +++ b/chromium/third_party/re2/src/Makefile @@ -68,6 +68,7 @@ SOEXTVER00=$(SOEXT).$(SONAME).0.0 MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols $(RE2_LDFLAGS) $(LDFLAGS) endif +.PHONY: all all: obj/libre2.a obj/so/libre2.$(SOEXT) INSTALL_HFILES=\ @@ -176,40 +177,49 @@ DTESTOFILES=$(patsubst obj/%,obj/dbg/%,$(TESTOFILES)) DTESTS=$(patsubst obj/%,obj/dbg/%,$(TESTS)) DBIGTESTS=$(patsubst obj/%,obj/dbg/%,$(BIGTESTS)) +.PRECIOUS: obj/%.o obj/%.o: %.cc $(HFILES) @mkdir -p $$(dirname $@) $(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc +.PRECIOUS: obj/dbg/%.o obj/dbg/%.o: %.cc $(HFILES) @mkdir -p $$(dirname $@) $(CXX) -c -o $@ $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) $*.cc +.PRECIOUS: obj/so/%.o obj/so/%.o: %.cc $(HFILES) @mkdir -p $$(dirname $@) $(CXX) -c -o $@ -fPIC $(CPPFLAGS) $(RE2_CXXFLAGS) $(CXXFLAGS) -DNDEBUG $*.cc +.PRECIOUS: obj/libre2.a obj/libre2.a: $(OFILES) @mkdir -p obj $(AR) $(ARFLAGS) obj/libre2.a $(OFILES) +.PRECIOUS: obj/dbg/libre2.a obj/dbg/libre2.a: $(DOFILES) @mkdir -p obj/dbg $(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES) +.PRECIOUS: obj/so/libre2.$(SOEXT) obj/so/libre2.$(SOEXT): $(SOFILES) @mkdir -p obj/so $(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES) ln -sf libre2.$(SOEXTVER) $@ +.PRECIOUS: obj/dbg/test/% obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o @mkdir -p obj/dbg/test $(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) +.PRECIOUS: obj/test/% obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o @mkdir -p obj/test $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) # Test the shared lib, falling back to the static lib for private symbols +.PRECIOUS: obj/so/test/% obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o @mkdir -p obj/so/test $(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o -Lobj/so -lre2 obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) @@ -229,64 +239,94 @@ obj/test/re2_fuzzer: obj/libre2.a obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o $(CXX) -o $@ obj/re2/fuzzing/re2_fuzzer.o obj/util/fuzz.o obj/libre2.a $(RE2_LDFLAGS) $(LDFLAGS) ifdef REBUILD_TABLES +.PRECIOUS: re2/perl_groups.cc re2/perl_groups.cc: re2/make_perl_groups.pl perl $< > $@ +.PRECIOUS: re2/unicode_%.cc re2/unicode_%.cc: re2/make_unicode_%.py python $< > $@ - -.PRECIOUS: re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc endif +.PHONY: distclean distclean: clean rm -f re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc +.PHONY: clean clean: rm -rf obj rm -f re2/*.pyc +.PHONY: testofiles testofiles: $(TESTOFILES) +.PHONY: test test: $(DTESTS) $(TESTS) $(STESTS) debug-test static-test shared-test +.PHONY: debug-test debug-test: $(DTESTS) @./runtests $(DTESTS) +.PHONY: static-test static-test: $(TESTS) @./runtests $(TESTS) +.PHONY: shared-test shared-test: $(STESTS) @./runtests -shared-library-path obj/so $(STESTS) +.PHONY: debug-bigtest debug-bigtest: $(DTESTS) $(DBIGTESTS) @./runtests $(DTESTS) $(DBIGTESTS) +.PHONY: static-bigtest static-bigtest: $(TESTS) $(BIGTESTS) @./runtests $(TESTS) $(BIGTESTS) +.PHONY: shared-bigtest shared-bigtest: $(STESTS) $(SBIGTESTS) @./runtests -shared-library-path obj/so $(STESTS) $(SBIGTESTS) +.PHONY: benchmark benchmark: obj/test/regexp_benchmark +.PHONY: fuzz fuzz: obj/test/re2_fuzzer -install: obj/libre2.a obj/so/libre2.$(SOEXT) - mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig - $(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2 +.PHONY: install +install: static-install shared-install + +.PHONY: static +static: obj/libre2.a + +.PHONY: static-install +static-install: obj/libre2.a common-install $(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a + +.PHONY: shared +shared: obj/so/libre2.$(SOEXT) + +.PHONY: shared-install +shared-install: obj/so/libre2.$(SOEXT) common-install $(INSTALL) obj/so/libre2.$(SOEXT) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER00) ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER) ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXT) + +.PHONY: common-install +common-install: + mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig + $(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2 $(INSTALL_DATA) re2.pc $(DESTDIR)$(libdir)/pkgconfig/re2.pc $(SED_INPLACE) -e "s#@includedir@#$(includedir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc $(SED_INPLACE) -e "s#@libdir@#$(libdir)#" $(DESTDIR)$(libdir)/pkgconfig/re2.pc +.PHONY: testinstall testinstall: static-testinstall shared-testinstall @echo @echo Install tests passed. @echo +.PHONY: static-testinstall static-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS) static-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -l:libre2.a $(LDICU) $(LDFLAGS) static-testinstall: @@ -301,6 +341,7 @@ else obj/testinstall endif +.PHONY: shared-testinstall shared-testinstall: CXXFLAGS:=-std=c++11 -pthread -I$(DESTDIR)$(includedir) $(CXXFLAGS) shared-testinstall: LDFLAGS:=-pthread -L$(DESTDIR)$(libdir) -lre2 $(LDICU) $(LDFLAGS) shared-testinstall: @@ -313,19 +354,14 @@ else LD_LIBRARY_PATH="$(DESTDIR)$(libdir):$(LD_LIBRARY_PATH)" obj/testinstall endif +.PHONY: benchlog benchlog: obj/test/regexp_benchmark (echo '==BENCHMARK==' `hostname` `date`; \ (uname -a; $(CXX) --version; git rev-parse --short HEAD; file obj/test/regexp_benchmark) | sed 's/^/# /'; \ echo; \ ./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//') -# Keep gmake from deleting intermediate files it creates. -# This makes repeated builds faster and preserves debug info on OS X. - -.PRECIOUS: obj/%.o obj/dbg/%.o obj/so/%.o obj/libre2.a \ - obj/dbg/libre2.a obj/so/libre2.a \ - obj/test/% obj/so/test/% obj/dbg/test/% - +.PHONY: log log: $(MAKE) clean $(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" \ @@ -341,6 +377,3 @@ log: echo '#' RE2 basic search tests built by make $@ >re2-search.txt echo '#' $$(date) >>re2-search.txt obj/test/search_test |grep -v '^PASS$$' >>re2-search.txt - -x: x.cc obj/libre2.a - g++ -I. -o x x.cc obj/libre2.a diff --git a/chromium/third_party/re2/src/re2/bitstate.cc b/chromium/third_party/re2/src/re2/bitstate.cc index 865b58edc62..320d1eea157 100644 --- a/chromium/third_party/re2/src/re2/bitstate.cc +++ b/chromium/third_party/re2/src/re2/bitstate.cc @@ -332,14 +332,13 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, // This looks like it's quadratic in the size of the text, // but we are not clearing visited_ between calls to TrySearch, // so no work is duplicated and it ends up still being linear. - for (const char* p = text.data(); p <= text.data() + text.size(); p++) { - // Try to use memchr to find the first byte quickly. - int fb = prog_->first_byte(); - if (fb >= 0 && p < text.data() + text.size() && (p[0] & 0xFF) != fb) { - p = reinterpret_cast<const char*>( - memchr(p, fb, text.data() + text.size() - p)); + const char* etext = text.data() + text.size(); + for (const char* p = text.data(); p <= etext; p++) { + // Try to use prefix accel (e.g. memchr) to skip ahead. + if (p < etext && prog_->can_prefix_accel()) { + p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext - p)); if (p == NULL) - p = text.data() + text.size(); + p = etext; } cap_[0] = p; diff --git a/chromium/third_party/re2/src/re2/compile.cc b/chromium/third_party/re2/src/re2/compile.cc index b3e5bdca0ae..7a9de07281c 100644 --- a/chromium/third_party/re2/src/re2/compile.cc +++ b/chromium/third_party/re2/src/re2/compile.cc @@ -30,91 +30,57 @@ namespace re2 { // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. // // Because the out and out1 fields in Inst are no longer pointers, -// we can't use pointers directly here either. Instead, p refers -// to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1). -// p == 0 represents the NULL list. This is okay because instruction #0 +// we can't use pointers directly here either. Instead, head refers +// to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1). +// head == 0 represents the NULL list. This is okay because instruction #0 // is always the fail instruction, which never appears on a list. - struct PatchList { - uint32_t p; - // Returns patch list containing just p. - static PatchList Mk(uint32_t p); + static PatchList Mk(uint32_t p) { + return {p, p}; + } - // Patches all the entries on l to have value v. + // Patches all the entries on l to have value p. // Caller must not ever use patch list again. - static void Patch(Prog::Inst *inst0, PatchList l, uint32_t v); - - // Deref returns the next pointer pointed at by p. - static PatchList Deref(Prog::Inst *inst0, PatchList l); - - // Appends two patch lists and returns result. - static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2); -}; - -static PatchList nullPatchList = { 0 }; - -// Returns patch list containing just p. -PatchList PatchList::Mk(uint32_t p) { - PatchList l; - l.p = p; - return l; -} - -// Returns the next pointer pointed at by l. -PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) { - Prog::Inst* ip = &inst0[l.p>>1]; - if (l.p&1) - l.p = ip->out1(); - else - l.p = ip->out(); - return l; -} - -// Patches all the entries on l to have value v. -void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32_t val) { - while (l.p != 0) { - Prog::Inst* ip = &inst0[l.p>>1]; - if (l.p&1) { - l.p = ip->out1(); - ip->out1_ = val; - } else { - l.p = ip->out(); - ip->set_out(val); + static void Patch(Prog::Inst* inst0, PatchList l, uint32_t p) { + while (l.head != 0) { + Prog::Inst* ip = &inst0[l.head>>1]; + if (l.head&1) { + l.head = ip->out1(); + ip->out1_ = p; + } else { + l.head = ip->out(); + ip->set_out(p); + } } } -} -// Appends two patch lists and returns result. -PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { - if (l1.p == 0) - return l2; - if (l2.p == 0) - return l1; - - PatchList l = l1; - for (;;) { - PatchList next = PatchList::Deref(inst0, l); - if (next.p == 0) - break; - l = next; + // Appends two patch lists and returns result. + static PatchList Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { + if (l1.head == 0) + return l2; + if (l2.head == 0) + return l1; + Prog::Inst* ip = &inst0[l1.tail>>1]; + if (l1.tail&1) + ip->out1_ = l2.head; + else + ip->set_out(l2.head); + return {l1.head, l2.tail}; } - Prog::Inst* ip = &inst0[l.p>>1]; - if (l.p&1) - ip->out1_ = l2.p; - else - ip->set_out(l2.p); + uint32_t head; + uint32_t tail; // for constant-time append +}; - return l1; -} +static const PatchList kNullPatchList = {0, 0}; // Compiled program fragment. struct Frag { uint32_t begin; PatchList end; - Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector + Frag() : begin(0) { end.head = 0; } // needed so Frag can go in vector Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {} }; @@ -212,8 +178,8 @@ class Compiler : public Regexp::Walker<Frag> { int AddSuffixRecursive(int root, int id); // Finds the trie node for the given suffix. Returns a Frag in order to - // distinguish between pointing at the root node directly (end.p == 0) - // and pointing at an Alt's out1 or out (end.p&1 == 1 or 0, respectively). + // distinguish between pointing at the root node directly (end.head == 0) + // and pointing at an Alt's out1 or out (end.head&1 == 1 or 0, respectively). Frag FindByteRange(int root, int id); // Compares two ByteRanges and returns true iff they are equal. @@ -225,8 +191,8 @@ class Compiler : public Regexp::Walker<Frag> { // Single rune. Frag Literal(Rune r, bool foldcase); - void Setup(Regexp::ParseFlags, int64_t, RE2::Anchor); - Prog* Finish(); + void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor); + Prog* Finish(Regexp* re); // Returns .* where dot = any byte Frag DotStar(); @@ -298,7 +264,7 @@ int Compiler::AllocInst(int n) { // Returns an unmatchable fragment. Frag Compiler::NoMatch() { - return Frag(0, nullPatchList); + return Frag(0, kNullPatchList); } // Is a an unmatchable fragment? @@ -314,7 +280,7 @@ Frag Compiler::Cat(Frag a, Frag b) { // Elide no-op. Prog::Inst* begin = &inst_[a.begin]; if (begin->opcode() == kInstNop && - a.end.p == (a.begin << 1) && + a.end.head == (a.begin << 1) && begin->out() == 0) { // in case refs to a somewhere PatchList::Patch(inst_.data(), a.end, b.begin); @@ -419,7 +385,7 @@ Frag Compiler::Match(int32_t match_id) { if (id < 0) return NoMatch(); inst_[id].InitMatch(match_id); - return Frag(id, nullPatchList); + return Frag(id, kNullPatchList); } // Returns a fragment matching a particular empty-width op (like ^ or $) @@ -467,7 +433,7 @@ static int MaxRune(int len) { void Compiler::BeginRange() { rune_cache_.clear(); rune_range_.begin = 0; - rune_range_.end = nullPatchList; + rune_range_.end = kNullPatchList; } int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, @@ -548,9 +514,9 @@ int Compiler::AddSuffixRecursive(int root, int id) { } int br; - if (f.end.p == 0) + if (f.end.head == 0) br = root; - else if (f.end.p&1) + else if (f.end.head&1) br = inst_[f.begin].out1(); else br = inst_[f.begin].out(); @@ -566,9 +532,9 @@ int Compiler::AddSuffixRecursive(int root, int id) { // Ensure that the parent points to the clone, not to the original. // Note that this could leave the head unreachable except via the cache. br = byterange; - if (f.end.p == 0) + if (f.end.head == 0) root = br; - else if (f.end.p&1) + else if (f.end.head&1) inst_[f.begin].out1_ = br; else inst_[f.begin].set_out(br); @@ -601,7 +567,7 @@ bool Compiler::ByteRangeEqual(int id1, int id2) { Frag Compiler::FindByteRange(int root, int id) { if (inst_[root].opcode() == kInstByteRange) { if (ByteRangeEqual(root, id)) - return Frag(root, nullPatchList); + return Frag(root, kNullPatchList); else return NoMatch(); } @@ -1167,10 +1133,10 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) { c.prog_->set_start_unanchored(all.begin); // Hand ownership of prog_ to caller. - return c.Finish(); + return c.Finish(re); } -Prog* Compiler::Finish() { +Prog* Compiler::Finish(Regexp* re) { if (failed_) return NULL; @@ -1186,7 +1152,17 @@ Prog* Compiler::Finish() { prog_->Optimize(); prog_->Flatten(); prog_->ComputeByteMap(); - prog_->ComputeFirstByte(); + + if (!prog_->reversed()) { + std::string prefix; + bool prefix_foldcase; + if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase) && + !prefix_foldcase) { + prog_->prefix_size_ = prefix.size(); + prog_->prefix_front_ = prefix.front(); + prog_->prefix_back_ = prefix.back(); + } + } // Record remaining memory for DFA. if (max_mem_ <= 0) { @@ -1244,7 +1220,7 @@ Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { c.prog_->set_start(all.begin); c.prog_->set_start_unanchored(all.begin); - Prog* prog = c.Finish(); + Prog* prog = c.Finish(re); if (prog == NULL) return NULL; diff --git a/chromium/third_party/re2/src/re2/dfa.cc b/chromium/third_party/re2/src/re2/dfa.cc index 0934f5a936e..b25cc97ff48 100644 --- a/chromium/third_party/re2/src/re2/dfa.cc +++ b/chromium/third_party/re2/src/re2/dfa.cc @@ -167,12 +167,6 @@ class DFA { typedef std::unordered_set<State*, StateHash, StateEqual> StateSet; private: - // Special "first_byte" values for a state. (Values >= 0 denote actual bytes.) - enum { - kFbUnknown = -1, // No analysis has been performed. - kFbNone = -2, // The first-byte trick cannot be used. - }; - enum { // Indices into start_ for unanchored searches. // Add kStartAnchored for anchored searches. @@ -239,25 +233,26 @@ class DFA { struct SearchParams { SearchParams(const StringPiece& text, const StringPiece& context, RWLocker* cache_lock) - : text(text), context(context), + : text(text), + context(context), anchored(false), + can_prefix_accel(false), want_earliest_match(false), run_forward(false), start(NULL), - first_byte(kFbUnknown), cache_lock(cache_lock), failed(false), ep(NULL), - matches(NULL) { } + matches(NULL) {} StringPiece text; StringPiece context; bool anchored; + bool can_prefix_accel; bool want_earliest_match; bool run_forward; State* start; - int first_byte; - RWLocker *cache_lock; + RWLocker* cache_lock; bool failed; // "out" parameter: whether search gave up const char* ep; // "out" parameter: end pointer for match SparseSet* matches; @@ -268,15 +263,13 @@ class DFA { }; // Before each search, the parameters to Search are analyzed by - // AnalyzeSearch to determine the state in which to start and the - // "first_byte" for that state, if any. + // AnalyzeSearch to determine the state in which to start. struct StartInfo { - StartInfo() : start(NULL), first_byte(kFbUnknown) {} - State* start; - std::atomic<int> first_byte; + StartInfo() : start(NULL) {} + std::atomic<State*> start; }; - // Fills in params->start and params->first_byte using + // Fills in params->start and params->can_prefix_accel using // the other search parameters. Returns true on success, // false on failure. // cache_mutex_.r <= L < mutex_ @@ -288,7 +281,7 @@ class DFA { // cache_mutex_.r <= L < mutex_ // Might unlock and relock cache_mutex_ via params->cache_lock. inline bool InlinedSearchLoop(SearchParams* params, - bool have_first_byte, + bool can_prefix_accel, bool want_earliest_match, bool run_forward); @@ -606,7 +599,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // Only ByteRange, EmptyWidth, and Match instructions are useful to keep: // those are the only operators with any effect in // RunWorkqOnEmptyString or RunWorkqOnByte. - int* inst = new int[q->size()]; + PODArray<int> inst(q->size()); int n = 0; uint32_t needflags = 0; // flags needed by kInstEmptyWidth instructions bool sawmatch = false; // whether queue contains guaranteed kInstMatch @@ -636,7 +629,6 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { (it == q->begin() && ip->greedy(prog_))) && (kind_ != Prog::kLongestMatch || !sawmark) && (flag & kFlagMatch)) { - delete[] inst; if (ExtraDebug) fprintf(stderr, " -> FullMatchState\n"); return FullMatchState; @@ -683,7 +675,6 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // the execution loop can stop early. This is only okay // if the state is *not* a matching state. if (n == 0 && flag == 0) { - delete[] inst; if (ExtraDebug) fprintf(stderr, " -> DeadState\n"); return DeadState; @@ -693,7 +684,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // unordered state sets separated by Marks. Sort each set // to canonicalize, to reduce the number of distinct sets stored. if (kind_ == Prog::kLongestMatch) { - int* ip = inst; + int* ip = inst.data(); int* ep = ip + n; while (ip < ep) { int* markp = ip; @@ -710,7 +701,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // we have an unordered set of states (i.e. we don't have Marks) // and sorting will reduce the number of distinct sets stored. if (kind_ == Prog::kManyMatch) { - int* ip = inst; + int* ip = inst.data(); int* ep = ip + n; std::sort(ip, ep); } @@ -729,8 +720,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { // Save the needed empty-width flags in the top bits for use later. flag |= needflags << kFlagNeedShift; - State* state = CachedState(inst, n, flag); - delete[] inst; + State* state = CachedState(inst.data(), n, flag); return state; } @@ -1183,10 +1173,8 @@ void DFA::ResetCache(RWLocker* cache_lock) { }); // Clear the cache, reset the memory budget. - for (int i = 0; i < kMaxStart; i++) { - start_[i].start = NULL; - start_[i].first_byte.store(kFbUnknown, std::memory_order_relaxed); - } + for (int i = 0; i < kMaxStart; i++) + start_[i].start.store(NULL, std::memory_order_relaxed); ClearCache(); mem_budget_ = state_budget_; } @@ -1301,8 +1289,7 @@ DFA::State* DFA::StateSaver::Restore() { // situation, the DFA can do better than executing the simple loop. // Instead, it can call memchr to search very quickly for the byte c. // Whether the start state has this property is determined during a -// pre-compilation pass, and if so, the byte b is passed to the search -// loop as the "first_byte" argument, along with a boolean "have_first_byte". +// pre-compilation pass and the "can_prefix_accel" argument is set. // // Fourth, the desired behavior is to search for the leftmost-best match // (approximately, the same one that Perl would find), which is not @@ -1335,7 +1322,7 @@ DFA::State* DFA::StateSaver::Restore() { // making them function arguments lets the inliner specialize // this function to each combination (see two paragraphs above). inline bool DFA::InlinedSearchLoop(SearchParams* params, - bool have_first_byte, + bool can_prefix_accel, bool want_earliest_match, bool run_forward) { State* start = params->start; @@ -1380,12 +1367,12 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, if (ExtraDebug) fprintf(stderr, "@%td: %s\n", p - bp, DumpState(s).c_str()); - if (run_forward && have_first_byte && s == start) { - // In start state, only way out is to find first_byte, - // so use optimized assembly in memchr to skip ahead. - // If first_byte isn't found, we can skip to the end - // of the string. - if ((p = BytePtr(memchr(p, params->first_byte, ep - p))) == NULL) { + if (can_prefix_accel && s == start) { + // In start state, only way out is to find the prefix, + // so we use prefix accel (e.g. memchr) to skip ahead. + // If not found, we can skip to the end of the string. + p = BytePtr(prog_->PrefixAccel(p, ep - p)); + if (p == NULL) { p = ep; break; } @@ -1589,7 +1576,7 @@ bool DFA::SearchTTT(SearchParams* params) { // For debugging, calls the general code directly. bool DFA::SlowSearchLoop(SearchParams* params) { return InlinedSearchLoop(params, - params->first_byte >= 0, + params->can_prefix_accel, params->want_earliest_match, params->run_forward); } @@ -1610,8 +1597,7 @@ bool DFA::FastSearchLoop(SearchParams* params) { &DFA::SearchTTT, }; - bool have_first_byte = params->first_byte >= 0; - int index = 4 * have_first_byte + + int index = 4 * params->can_prefix_accel + 2 * params->want_earliest_match + 1 * params->run_forward; return (this->*Searches[index])(params); @@ -1703,13 +1689,22 @@ bool DFA::AnalyzeSearch(SearchParams* params) { } } + params->start = info->start.load(std::memory_order_acquire); + + // Even if we could prefix accel, we cannot do so when anchored and, + // less obviously, we cannot do so when we are going to need flags. + // This trick works only when there is a single byte that leads to a + // different state! + if (prog_->can_prefix_accel() && + !params->anchored && + params->start > SpecialStateMax && + params->start->flag_ >> kFlagNeedShift == 0) + params->can_prefix_accel = true; + if (ExtraDebug) - fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s first_byte=%d\n", + fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n", params->anchored, params->run_forward, flags, - DumpState(info->start).c_str(), info->first_byte.load()); - - params->start = info->start; - params->first_byte = info->first_byte.load(std::memory_order_acquire); + DumpState(params->start).c_str(), params->can_prefix_accel); return true; } @@ -1718,47 +1713,25 @@ bool DFA::AnalyzeSearch(SearchParams* params) { bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, uint32_t flags) { // Quick check. - int fb = info->first_byte.load(std::memory_order_acquire); - if (fb != kFbUnknown) + State* start = info->start.load(std::memory_order_acquire); + if (start != NULL) return true; MutexLock l(&mutex_); - fb = info->first_byte.load(std::memory_order_relaxed); - if (fb != kFbUnknown) + start = info->start.load(std::memory_order_relaxed); + if (start != NULL) return true; q0_->clear(); AddToQueue(q0_, params->anchored ? prog_->start() : prog_->start_unanchored(), flags); - info->start = WorkqToCachedState(q0_, NULL, flags); - if (info->start == NULL) + start = WorkqToCachedState(q0_, NULL, flags); + if (start == NULL) return false; - if (info->start == DeadState) { - // Synchronize with "quick check" above. - info->first_byte.store(kFbNone, std::memory_order_release); - return true; - } - - if (info->start == FullMatchState) { - // Synchronize with "quick check" above. - info->first_byte.store(kFbNone, std::memory_order_release); // will be ignored - return true; - } - - // Even if we have a first_byte, we cannot use it when anchored and, - // less obviously, we cannot use it when we are going to need flags. - // This trick works only when there is a single byte that leads to a - // different state! - int first_byte = prog_->first_byte(); - if (first_byte == -1 || - params->anchored || - info->start->flag_ >> kFlagNeedShift != 0) - first_byte = kFbNone; - // Synchronize with "quick check" above. - info->first_byte.store(first_byte, std::memory_order_release); + info->start.store(start, std::memory_order_release); return true; } @@ -1866,13 +1839,13 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, StringPiece context = const_context; if (context.data() == NULL) context = text; - bool carat = anchor_start(); + bool caret = anchor_start(); bool dollar = anchor_end(); if (reversed_) { using std::swap; - swap(carat, dollar); + swap(caret, dollar); } - if (carat && context.begin() != text.begin()) + if (caret && context.begin() != text.begin()) return false; if (dollar && context.end() != text.end()) return false; diff --git a/chromium/third_party/re2/src/re2/filtered_re2.cc b/chromium/third_party/re2/src/re2/filtered_re2.cc index e5d8de5ce62..5df97456e25 100644 --- a/chromium/third_party/re2/src/re2/filtered_re2.cc +++ b/chromium/third_party/re2/src/re2/filtered_re2.cc @@ -6,6 +6,7 @@ #include <stddef.h> #include <string> +#include <utility> #include "util/util.h" #include "util/logging.h" @@ -27,7 +28,22 @@ FilteredRE2::FilteredRE2(int min_atom_len) FilteredRE2::~FilteredRE2() { for (size_t i = 0; i < re2_vec_.size(); i++) delete re2_vec_[i]; - delete prefilter_tree_; +} + +FilteredRE2::FilteredRE2(FilteredRE2&& other) + : re2_vec_(std::move(other.re2_vec_)), + compiled_(other.compiled_), + prefilter_tree_(std::move(other.prefilter_tree_)) { + other.re2_vec_.clear(); + other.re2_vec_.shrink_to_fit(); + other.compiled_ = false; + other.prefilter_tree_.reset(new PrefilterTree()); +} + +FilteredRE2& FilteredRE2::operator=(FilteredRE2&& other) { + this->~FilteredRE2(); + (void) new (this) FilteredRE2(std::move(other)); + return *this; } RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, @@ -38,7 +54,7 @@ RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, if (!re->ok()) { if (options.log_errors()) { LOG(ERROR) << "Couldn't compile regular expression, skipping: " - << re << " due to error " << re->error(); + << pattern << " due to error " << re->error(); } delete re; } else { diff --git a/chromium/third_party/re2/src/re2/filtered_re2.h b/chromium/third_party/re2/src/re2/filtered_re2.h index 86fa58681d2..dd618c70e8b 100644 --- a/chromium/third_party/re2/src/re2/filtered_re2.h +++ b/chromium/third_party/re2/src/re2/filtered_re2.h @@ -21,6 +21,7 @@ // or AllMatches with a vector of indices of strings that were found // in the text to get the actual regexp matches. +#include <memory> #include <string> #include <vector> @@ -36,12 +37,19 @@ class FilteredRE2 { explicit FilteredRE2(int min_atom_len); ~FilteredRE2(); + // Not copyable. + FilteredRE2(const FilteredRE2&) = delete; + FilteredRE2& operator=(const FilteredRE2&) = delete; + // Movable. + FilteredRE2(FilteredRE2&& other); + FilteredRE2& operator=(FilteredRE2&& other); + // Uses RE2 constructor to create a RE2 object (re). Returns // re->error_code(). If error_code is other than NoError, then re is // deleted and not added to re2_vec_. RE2::ErrorCode Add(const StringPiece& pattern, const RE2::Options& options, - int *id); + int* id); // Prepares the regexps added by Add for filtering. Returns a set // of strings that the caller should check for in candidate texts. @@ -98,10 +106,7 @@ class FilteredRE2 { bool compiled_; // An AND-OR tree of string atoms used for filtering regexps. - PrefilterTree* prefilter_tree_; - - FilteredRE2(const FilteredRE2&) = delete; - FilteredRE2& operator=(const FilteredRE2&) = delete; + std::unique_ptr<PrefilterTree> prefilter_tree_; }; } // namespace re2 diff --git a/chromium/third_party/re2/src/re2/nfa.cc b/chromium/third_party/re2/src/re2/nfa.cc index 75ca30654b6..19ee08de215 100644 --- a/chromium/third_party/re2/src/re2/nfa.cc +++ b/chromium/third_party/re2/src/re2/nfa.cc @@ -27,6 +27,7 @@ #include <stdio.h> #include <string.h> #include <algorithm> +#include <deque> #include <string> #include <utility> #include <vector> @@ -107,18 +108,21 @@ class NFA { // Returns text version of capture information, for debugging. std::string FormatCapture(const char** capture); - inline void CopyCapture(const char** dst, const char** src); + void CopyCapture(const char** dst, const char** src) { + memmove(dst, src, ncapture_*sizeof src[0]); + } Prog* prog_; // underlying program int start_; // start instruction in program int ncapture_; // number of submatches to track bool longest_; // whether searching for longest match bool endmatch_; // whether match must end at text.end() - const char* btext_; // beginning of text being matched (for FormatSubmatch) - const char* etext_; // end of text being matched (for endmatch_) + const char* btext_; // beginning of text (for FormatSubmatch) + const char* etext_; // end of text (for endmatch_) Threadq q0_, q1_; // pre-allocated for Search. PODArray<AddState> stack_; // pre-allocated for AddToThreadq - Thread* free_threads_; // free list + std::deque<Thread> arena_; // thread arena + Thread* freelist_; // thread freelist const char** match_; // best match so far bool matched_; // any match so far? @@ -141,31 +145,30 @@ NFA::NFA(Prog* prog) { prog_->inst_count(kInstEmptyWidth) + prog_->inst_count(kInstNop) + 1; // + 1 for start inst stack_ = PODArray<AddState>(nstack); - free_threads_ = NULL; + freelist_ = NULL; match_ = NULL; matched_ = false; } NFA::~NFA() { delete[] match_; - Thread* next; - for (Thread* t = free_threads_; t; t = next) { - next = t->next; - delete[] t->capture; - delete t; - } + for (const Thread& t : arena_) + delete[] t.capture; } NFA::Thread* NFA::AllocThread() { - Thread* t = free_threads_; - if (t == NULL) { - t = new Thread; + Thread* t = freelist_; + if (t != NULL) { + freelist_ = t->next; t->ref = 1; - t->capture = new const char*[ncapture_]; + // We don't need to touch t->capture because + // the caller will immediately overwrite it. return t; } - free_threads_ = t->next; + arena_.emplace_back(); + t = &arena_.back(); t->ref = 1; + t->capture = new const char*[ncapture_]; return t; } @@ -176,21 +179,13 @@ NFA::Thread* NFA::Incref(Thread* t) { } void NFA::Decref(Thread* t) { - if (t == NULL) - return; + DCHECK(t != NULL); t->ref--; if (t->ref > 0) return; DCHECK_EQ(t->ref, 0); - t->next = free_threads_; - free_threads_ = t; -} - -void NFA::CopyCapture(const char** dst, const char** src) { - for (int i = 0; i < ncapture_; i+=2) { - dst[i] = src[i]; - dst[i+1] = src[i+1]; - } + t->next = freelist_; + freelist_ = t; } // Follows all empty arrows from id0 and enqueues all the states reached. @@ -372,8 +367,10 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, matched_ = true; Decref(t); - for (++i; i != runq->end(); ++i) - Decref(i->value()); + for (++i; i != runq->end(); ++i) { + if (i->value() != NULL) + Decref(i->value()); + } runq->clear(); if (ip->greedy(prog_)) return ip->out1(); @@ -416,8 +413,10 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, // worse than the one we just found: don't run the // rest of the current Threadq. Decref(t); - for (++i; i != runq->end(); ++i) - Decref(i->value()); + for (++i; i != runq->end(); ++i) { + if (i->value() != NULL) + Decref(i->value()); + } runq->clear(); return 0; } @@ -489,6 +488,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, } match_ = new const char*[ncapture_]; + memset(match_, 0, ncapture_*sizeof match_[0]); matched_ = false; // For debugging prints. @@ -505,7 +505,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, Threadq* nextq = &q1_; runq->clear(); nextq->clear(); - memset(&match_[0], 0, ncapture_*sizeof match_[0]); // Loop over the text, stepping the machine. for (const char* p = text.data();; p++) { @@ -572,16 +571,14 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, // matches, since it would be to the right of the match // we already found.) if (!matched_ && (!anchored || p == text.data())) { - // If there's a required first byte for an unanchored search - // and we're not in the middle of any possible matches, - // use memchr to search for the byte quickly. - int fb = prog_->first_byte(); + // Try to use prefix accel (e.g. memchr) to skip ahead. + // The search must be unanchored and there must be zero + // possible matches already. if (!anchored && runq->size() == 0 && - fb >= 0 && p < etext_ && (p[0] & 0xFF) != fb) { - p = reinterpret_cast<const char*>(memchr(p, fb, etext_ - p)); - if (p == NULL) { + p < etext_ && prog_->can_prefix_accel()) { + p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext_ - p)); + if (p == NULL) p = etext_; - } } Thread* t = AllocThread(); @@ -612,8 +609,10 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, } } - for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) - Decref(i->value()); + for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { + if (i->value() != NULL) + Decref(i->value()); + } if (matched_) { for (int i = 0; i < nsubmatch; i++) @@ -629,67 +628,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, return false; } -void Prog::ComputeFirstByte() { - SparseSet q(size()); - q.insert(start()); - for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) { - int id = *it; - Prog::Inst* ip = inst(id); - switch (ip->opcode()) { - default: - LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte"; - break; - - case kInstMatch: - // The empty string matches: no first byte. - first_byte_ = -1; - return; - - case kInstByteRange: - if (!ip->last()) - q.insert(id+1); - - // Must match only a single byte. - if (ip->lo() != ip->hi() || - (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')) { - first_byte_ = -1; - return; - } - // If we haven't seen any bytes yet, record it; - // otherwise must match the one we saw before. - if (first_byte_ == -1) { - first_byte_ = ip->lo(); - } else if (first_byte_ != ip->lo()) { - first_byte_ = -1; - return; - } - break; - - case kInstNop: - case kInstCapture: - case kInstEmptyWidth: - if (!ip->last()) - q.insert(id+1); - - // Continue on. - // Ignore ip->empty() flags for kInstEmptyWidth - // in order to be as conservative as possible - // (assume all possible empty-width flags are true). - if (ip->out()) - q.insert(ip->out()); - break; - - case kInstAltMatch: - DCHECK(!ip->last()); - q.insert(id+1); - break; - - case kInstFail: - break; - } - } -} - bool Prog::SearchNFA(const StringPiece& text, const StringPiece& context, Anchor anchor, MatchKind kind, diff --git a/chromium/third_party/re2/src/re2/parse.cc b/chromium/third_party/re2/src/re2/parse.cc index 0a5ff7494f9..3bba6137f4f 100644 --- a/chromium/third_party/re2/src/re2/parse.cc +++ b/chromium/third_party/re2/src/re2/parse.cc @@ -93,7 +93,7 @@ class Regexp::ParseState { bool PushSimpleOp(RegexpOp op); // Pushes a ^ onto the stack. - bool PushCarat(); + bool PushCaret(); // Pushes a \b (word == true) or \B (word == false) onto the stack. bool PushWordBoundary(bool word); @@ -423,7 +423,7 @@ bool Regexp::ParseState::PushLiteral(Rune r) { } // Pushes a ^ onto the stack. -bool Regexp::ParseState::PushCarat() { +bool Regexp::ParseState::PushCaret() { if (flags_ & OneLine) { return PushSimpleOp(kRegexpBeginText); } @@ -685,7 +685,7 @@ bool Regexp::ParseState::DoRightParen() { if ((r1 = stacktop_) == NULL || (r2 = r1->down_) == NULL || r2->op() != kLeftParen) { - status_->set_code(kRegexpMissingParen); + status_->set_code(kRegexpUnexpectedParen); status_->set_error_arg(whole_regexp_); return false; } @@ -1802,14 +1802,13 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, // Convert the UnicodeSet to a URange32 and UGroup that we can add. int nr = uset.getRangeCount(); - URange32* r = new URange32[nr]; + PODArray<URange32> r(nr); for (int i = 0; i < nr; i++) { r[i].lo = uset.getRangeStart(i); r[i].hi = uset.getRangeEnd(i); } - UGroup g = {"", +1, 0, 0, r, nr}; + UGroup g = {"", +1, 0, 0, r.data(), nr}; AddUGroup(cc, &g, sign, parse_flags); - delete[] r; #endif return kParseOk; @@ -2272,7 +2271,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, break; case '^': // Beginning of line. - if (!ps.PushCarat()) + if (!ps.PushCaret()) return NULL; t.remove_prefix(1); // '^' break; diff --git a/chromium/third_party/re2/src/re2/prog.cc b/chromium/third_party/re2/src/re2/prog.cc index 9244319eccf..ac9c0852406 100644 --- a/chromium/third_party/re2/src/re2/prog.cc +++ b/chromium/third_party/re2/src/re2/prog.cc @@ -7,6 +7,12 @@ #include "re2/prog.h" +#if defined(__AVX2__) +#include <immintrin.h> +#ifdef _MSC_VER +#include <intrin.h> +#endif +#endif #include <stdint.h> #include <string.h> #include <algorithm> @@ -109,7 +115,9 @@ Prog::Prog() start_unanchored_(0), size_(0), bytemap_range_(0), - first_byte_(-1), + prefix_size_(0), + prefix_front_(-1), + prefix_back_(-1), list_count_(0), dfa_mem_(0), dfa_first_(NULL), @@ -908,4 +916,73 @@ void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) { } } +#if defined(__AVX2__) +// Finds the least significant non-zero bit in n. +static int FindLSBSet(uint32_t n) { + DCHECK_NE(n, 0); +#if defined(__GNUC__) + return __builtin_ctz(n); +#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) + unsigned long c; + _BitScanForward(&c, n); + return static_cast<int>(c); +#else + int c = 31; + for (int shift = 1 << 4; shift != 0; shift >>= 1) { + uint32_t word = n << shift; + if (word != 0) { + n = word; + c -= shift; + } + } + return c; +#endif +} +#endif + +const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) { + DCHECK_GE(prefix_size_, 2); + if (size < prefix_size_) + return NULL; + // Don't bother searching the last prefix_size_-1 bytes for prefix_front_. + // This also means that probing for prefix_back_ doesn't go out of bounds. + size -= prefix_size_-1; + +#if defined(__AVX2__) + // Use AVX2 to look for prefix_front_ and prefix_back_ 32 bytes at a time. + if (size >= sizeof(__m256i)) { + const __m256i* fp = reinterpret_cast<const __m256i*>( + reinterpret_cast<const char*>(data)); + const __m256i* bp = reinterpret_cast<const __m256i*>( + reinterpret_cast<const char*>(data) + prefix_size_-1); + const __m256i* endfp = fp + size/sizeof(__m256i); + const __m256i f_set1 = _mm256_set1_epi8(prefix_front_); + const __m256i b_set1 = _mm256_set1_epi8(prefix_back_); + while (fp != endfp) { + const __m256i f_loadu = _mm256_loadu_si256(fp++); + const __m256i b_loadu = _mm256_loadu_si256(bp++); + const __m256i f_cmpeq = _mm256_cmpeq_epi8(f_set1, f_loadu); + const __m256i b_cmpeq = _mm256_cmpeq_epi8(b_set1, b_loadu); + const int fb_testz = _mm256_testz_si256(f_cmpeq, b_cmpeq); + if (fb_testz == 0) { // ZF: 1 means zero, 0 means non-zero. + const __m256i fb_and = _mm256_and_si256(f_cmpeq, b_cmpeq); + const int fb_movemask = _mm256_movemask_epi8(fb_and); + const int fb_ctz = FindLSBSet(fb_movemask); + return reinterpret_cast<const char*>(fp-1) + fb_ctz; + } + } + data = fp; + size = size%sizeof(__m256i); + } +#endif + + const char* p0 = reinterpret_cast<const char*>(data); + for (const char* p = p0;; p++) { + DCHECK_GE(size, static_cast<size_t>(p-p0)); + p = reinterpret_cast<const char*>(memchr(p, prefix_front_, size - (p-p0))); + if (p == NULL || p[prefix_size_-1] == prefix_back_) + return p; + } +} + } // namespace re2 diff --git a/chromium/third_party/re2/src/re2/prog.h b/chromium/third_party/re2/src/re2/prog.h index 4306672e9ff..e9ce682d992 100644 --- a/chromium/third_party/re2/src/re2/prog.h +++ b/chromium/third_party/re2/src/re2/prog.h @@ -198,8 +198,8 @@ class Prog { Inst *inst(int id) { return &inst_[id]; } int start() { return start_; } - int start_unanchored() { return start_unanchored_; } void set_start(int start) { start_ = start; } + int start_unanchored() { return start_unanchored_; } void set_start_unanchored(int start) { start_unanchored_ = start; } int size() { return size_; } bool reversed() { return reversed_; } @@ -207,15 +207,27 @@ class Prog { int list_count() { return list_count_; } int inst_count(InstOp op) { return inst_count_[op]; } uint16_t* list_heads() { return list_heads_.data(); } - void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; } int64_t dfa_mem() { return dfa_mem_; } + void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; } bool anchor_start() { return anchor_start_; } void set_anchor_start(bool b) { anchor_start_ = b; } bool anchor_end() { return anchor_end_; } void set_anchor_end(bool b) { anchor_end_ = b; } int bytemap_range() { return bytemap_range_; } const uint8_t* bytemap() { return bytemap_; } - int first_byte() { return first_byte_; } + bool can_prefix_accel() { return prefix_size_ != 0; } + + // Accelerates to the first likely occurrence of the prefix. + // Returns a pointer to the first byte or NULL if not found. + const void* PrefixAccel(const void* data, size_t size) { + DCHECK_GE(prefix_size_, 1); + return prefix_size_ == 1 ? memchr(data, prefix_front_, size) + : PrefixAccel_FrontAndBack(data, size); + } + + // An implementation of prefix accel that looks for prefix_front_ and + // prefix_back_ to return fewer false positives than memchr(3) alone. + const void* PrefixAccel_FrontAndBack(const void* data, size_t size); // Returns string representation of program for debugging. std::string Dump(); @@ -293,9 +305,6 @@ class Prog { // Compute bytemap. void ComputeByteMap(); - // Computes whether all matches must begin with the same first byte. - void ComputeFirstByte(); - // Run peep-hole optimizer on program. void Optimize(); @@ -397,7 +406,9 @@ class Prog { int start_unanchored_; // unanchored entry point for program int size_; // number of instructions int bytemap_range_; // bytemap_[x] < bytemap_range_ - int first_byte_; // required first byte for match, or -1 if none + size_t prefix_size_; // size of prefix (0 if no prefix) + int prefix_front_; // first byte of prefix (-1 if no prefix) + int prefix_back_; // last byte of prefix (-1 if no prefix) int list_count_; // count of lists (see above) int inst_count_[kNumInst]; // count of instructions by opcode diff --git a/chromium/third_party/re2/src/re2/re2.cc b/chromium/third_party/re2/src/re2/re2.cc index d231a21496d..7ec193c190b 100644 --- a/chromium/third_party/re2/src/re2/re2.cc +++ b/chromium/third_party/re2/src/re2/re2.cc @@ -83,6 +83,8 @@ static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) { return RE2::ErrorMissingBracket; case re2::kRegexpMissingParen: return RE2::ErrorMissingParen; + case re2::kRegexpUnexpectedParen: + return RE2::ErrorUnexpectedParen; case re2::kRegexpTrailingBackslash: return RE2::ErrorTrailingBackslash; case re2::kRegexpRepeatArgument: @@ -293,7 +295,7 @@ static int FindMSBSet(uint32_t n) { DCHECK_NE(n, 0); #if defined(__GNUC__) return 31 ^ __builtin_clz(n); -#elif defined(_MSC_VER) +#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) unsigned long c; _BitScanReverse(&c, n); return static_cast<int>(c); @@ -405,6 +407,8 @@ bool RE2::Replace(std::string* str, const StringPiece& rewrite) { StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > 1 + re.NumberOfCapturingGroups()) + return false; if (nvec > static_cast<int>(arraysize(vec))) return false; if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) @@ -425,6 +429,8 @@ int RE2::GlobalReplace(std::string* str, const StringPiece& rewrite) { StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > 1 + re.NumberOfCapturingGroups()) + return false; if (nvec > static_cast<int>(arraysize(vec))) return false; @@ -497,9 +503,10 @@ bool RE2::Extract(const StringPiece& text, std::string* out) { StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); + if (nvec > 1 + re.NumberOfCapturingGroups()) + return false; if (nvec > static_cast<int>(arraysize(vec))) return false; - if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) return false; @@ -1012,8 +1019,8 @@ bool RE2::Rewrite(std::string* out, int n = (c - '0'); if (n >= veclen) { if (options_.log_errors()) { - LOG(ERROR) << "requested group " << n - << " in regexp " << rewrite.data(); + LOG(ERROR) << "invalid substitution \\" << n + << " from " << veclen << " groups"; } return false; } diff --git a/chromium/third_party/re2/src/re2/re2.h b/chromium/third_party/re2/src/re2/re2.h index 32b2718f3c0..9d3496c9abd 100644 --- a/chromium/third_party/re2/src/re2/re2.h +++ b/chromium/third_party/re2/src/re2/re2.h @@ -40,6 +40,9 @@ // R"((?i)hello)" -- (?i) turns on case-insensitive matching // R"(/\*(.*?)\*/)" -- .*? matches . minimum no. of times possible // +// When using UTF-8 encoding, case-insensitive matching will perform +// simple case folding, not full case folding. +// // ----------------------------------------------------------------------- // MATCHING INTERFACE: // @@ -244,6 +247,7 @@ class RE2 { ErrorBadCharRange, // bad character class range ErrorMissingBracket, // missing closing ] ErrorMissingParen, // missing closing ) + ErrorUnexpectedParen, // unexpected closing ) ErrorTrailingBackslash, // trailing \ at end of regexp ErrorRepeatArgument, // repeat argument missing, e.g. "*" ErrorRepeatSize, // bad repetition argument diff --git a/chromium/third_party/re2/src/re2/regexp.cc b/chromium/third_party/re2/src/re2/regexp.cc index 5732166eebd..574780fbea9 100644 --- a/chromium/third_party/re2/src/re2/regexp.cc +++ b/chromium/third_party/re2/src/re2/regexp.cc @@ -20,6 +20,7 @@ #include "util/logging.h" #include "util/mutex.h" #include "util/utf.h" +#include "re2/pod_array.h" #include "re2/stringpiece.h" #include "re2/walker-inl.h" @@ -243,16 +244,15 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, return new Regexp(kRegexpEmptyMatch, flags); } - Regexp** subcopy = NULL; + PODArray<Regexp*> subcopy; if (op == kRegexpAlternate && can_factor) { // Going to edit sub; make a copy so we don't step on caller. - subcopy = new Regexp*[nsub]; - memmove(subcopy, sub, nsub * sizeof sub[0]); - sub = subcopy; + subcopy = PODArray<Regexp*>(nsub); + memmove(subcopy.data(), sub, nsub * sizeof sub[0]); + sub = subcopy.data(); nsub = FactorAlternation(sub, nsub, flags); if (nsub == 1) { Regexp* re = sub[0]; - delete[] subcopy; return re; } } @@ -269,7 +269,6 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, nsub - (nbigsub-1)*kMaxNsub, flags, false); - delete[] subcopy; return re; } @@ -278,8 +277,6 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, Regexp** subs = re->sub(); for (int i = 0; i < nsub; i++) subs[i] = sub[i]; - - delete[] subcopy; return re; } @@ -501,6 +498,7 @@ static const char *kErrorStrings[] = { "invalid character class range", "missing ]", "missing )", + "unexpected )", "trailing \\", "no argument for repetition operator", "invalid repetition size", @@ -658,78 +656,83 @@ std::map<int, std::string>* Regexp::CaptureNames() { return w.TakeMap(); } +void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes, + std::string* bytes) { + if (latin1) { + bytes->resize(nrunes); + for (int i = 0; i < nrunes; i++) + (*bytes)[i] = static_cast<char>(runes[i]); + } else { + bytes->resize(nrunes * UTFmax); // worst case + char* p = &(*bytes)[0]; + for (int i = 0; i < nrunes; i++) + p += runetochar(p, &runes[i]); + bytes->resize(p - &(*bytes)[0]); + bytes->shrink_to_fit(); + } +} + // Determines whether regexp matches must be anchored // with a fixed string prefix. If so, returns the prefix and // the regexp that remains after the prefix. The prefix might // be ASCII case-insensitive. bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase, Regexp** suffix) { + prefix->clear(); + *foldcase = false; + *suffix = NULL; + // No need for a walker: the regexp must be of the form // 1. some number of ^ anchors // 2. a literal char or string // 3. the rest - prefix->clear(); - *foldcase = false; - *suffix = NULL; if (op_ != kRegexpConcat) return false; - - // Some number of anchors, then a literal or concatenation. int i = 0; - Regexp** sub = this->sub(); - while (i < nsub_ && sub[i]->op_ == kRegexpBeginText) + while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText) i++; if (i == 0 || i >= nsub_) return false; - - Regexp* re = sub[i]; - switch (re->op_) { - default: - return false; - - case kRegexpLiteralString: - // Convert to string in proper encoding. - if (re->parse_flags() & Latin1) { - prefix->resize(re->nrunes_); - for (int j = 0; j < re->nrunes_; j++) - (*prefix)[j] = static_cast<char>(re->runes_[j]); - } else { - // Convert to UTF-8 in place. - // Assume worst-case space and then trim. - prefix->resize(re->nrunes_ * UTFmax); - char *p = &(*prefix)[0]; - for (int j = 0; j < re->nrunes_; j++) { - Rune r = re->runes_[j]; - if (r < Runeself) - *p++ = static_cast<char>(r); - else - p += runetochar(p, &r); - } - prefix->resize(p - &(*prefix)[0]); - } - break; - - case kRegexpLiteral: - if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) { - prefix->append(1, static_cast<char>(re->rune_)); - } else { - char buf[UTFmax]; - prefix->append(buf, runetochar(buf, &re->rune_)); - } - break; - } - *foldcase = (sub[i]->parse_flags() & FoldCase) != 0; + Regexp* re = sub()[i]; + if (re->op_ != kRegexpLiteral && + re->op_ != kRegexpLiteralString) + return false; i++; - - // The rest. if (i < nsub_) { for (int j = i; j < nsub_; j++) - sub[j]->Incref(); - re = Concat(sub + i, nsub_ - i, parse_flags()); + sub()[j]->Incref(); + *suffix = Concat(sub() + i, nsub_ - i, parse_flags()); } else { - re = new Regexp(kRegexpEmptyMatch, parse_flags()); + *suffix = new Regexp(kRegexpEmptyMatch, parse_flags()); } - *suffix = re; + + bool latin1 = (re->parse_flags() & Latin1) != 0; + Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_; + int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_; + ConvertRunesToBytes(latin1, runes, nrunes, prefix); + *foldcase = (re->parse_flags() & FoldCase) != 0; + return true; +} + +// Determines whether regexp matches must be unanchored +// with a fixed string prefix. If so, returns the prefix. +// The prefix might be ASCII case-insensitive. +bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) { + prefix->clear(); + *foldcase = false; + + // No need for a walker: the regexp must either begin with or be + // a literal char or string. + Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this; + if (re->op_ != kRegexpLiteral && + re->op_ != kRegexpLiteralString) + return false; + + bool latin1 = (re->parse_flags() & Latin1) != 0; + Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_; + int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_; + ConvertRunesToBytes(latin1, runes, nrunes, prefix); + *foldcase = (re->parse_flags() & FoldCase) != 0; return true; } diff --git a/chromium/third_party/re2/src/re2/regexp.h b/chromium/third_party/re2/src/re2/regexp.h index a5d85c81288..9ea7a07733c 100644 --- a/chromium/third_party/re2/src/re2/regexp.h +++ b/chromium/third_party/re2/src/re2/regexp.h @@ -177,6 +177,7 @@ enum RegexpStatusCode { kRegexpBadCharRange, // bad character class range kRegexpMissingBracket, // missing closing ] kRegexpMissingParen, // missing closing ) + kRegexpUnexpectedParen, // unexpected closing ) kRegexpTrailingBackslash, // at end of regexp kRegexpRepeatArgument, // repeat argument missing, e.g. "*" kRegexpRepeatSize, // bad repetition argument @@ -440,6 +441,13 @@ class Regexp { bool RequiredPrefix(std::string* prefix, bool* foldcase, Regexp** suffix); + // Whether every match of this regexp must be unanchored and + // begin with a non-empty fixed string (perhaps after ASCII + // case-folding). If so, returns the prefix. + // Callers should expect *prefix and *foldcase to be "zeroed" + // regardless of the return value. + bool RequiredPrefixForAccel(std::string* prefix, bool* foldcase); + private: // Constructor allocates vectors as appropriate for operator. explicit Regexp(RegexpOp op, ParseFlags parse_flags); diff --git a/chromium/third_party/re2/src/re2/set.cc b/chromium/third_party/re2/src/re2/set.cc index 87db6b72b11..c3924bfd982 100644 --- a/chromium/third_party/re2/src/re2/set.cc +++ b/chromium/third_party/re2/src/re2/set.cc @@ -7,6 +7,7 @@ #include <stddef.h> #include <algorithm> #include <memory> +#include <utility> #include "util/util.h" #include "util/logging.h" @@ -18,19 +19,37 @@ namespace re2 { -RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) { - options_.Copy(options); +RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) + : options_(options), + anchor_(anchor), + compiled_(false), + size_(0) { options_.set_never_capture(true); // might unblock some optimisations - anchor_ = anchor; - prog_ = NULL; - compiled_ = false; - size_ = 0; } RE2::Set::~Set() { for (size_t i = 0; i < elem_.size(); i++) elem_[i].second->Decref(); - delete prog_; +} + +RE2::Set::Set(Set&& other) + : options_(other.options_), + anchor_(other.anchor_), + elem_(std::move(other.elem_)), + compiled_(other.compiled_), + size_(other.size_), + prog_(std::move(other.prog_)) { + other.elem_.clear(); + other.elem_.shrink_to_fit(); + other.compiled_ = false; + other.size_ = 0; + other.prog_.reset(); +} + +RE2::Set& RE2::Set::operator=(Set&& other) { + this->~Set(); + (void) new (this) Set(std::move(other)); + return *this; } int RE2::Set::Add(const StringPiece& pattern, std::string* error) { @@ -97,9 +116,9 @@ bool RE2::Set::Compile() { options_.ParseFlags()); re2::Regexp* re = re2::Regexp::Alternate(sub.data(), size_, pf); - prog_ = Prog::CompileSet(re, anchor_, options_.max_mem()); + prog_.reset(Prog::CompileSet(re, anchor_, options_.max_mem())); re->Decref(); - return prog_ != NULL; + return prog_ != nullptr; } bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const { diff --git a/chromium/third_party/re2/src/re2/set.h b/chromium/third_party/re2/src/re2/set.h index 59733fd94cd..8d64f30ccd9 100644 --- a/chromium/third_party/re2/src/re2/set.h +++ b/chromium/third_party/re2/src/re2/set.h @@ -5,6 +5,7 @@ #ifndef RE2_SET_H_ #define RE2_SET_H_ +#include <memory> #include <string> #include <utility> #include <vector> @@ -36,6 +37,13 @@ class RE2::Set { Set(const RE2::Options& options, RE2::Anchor anchor); ~Set(); + // Not copyable. + Set(const Set&) = delete; + Set& operator=(const Set&) = delete; + // Movable. + Set(Set&& other); + Set& operator=(Set&& other); + // Adds pattern to the set using the options passed to the constructor. // Returns the index that will identify the regexp in the output of Match(), // or -1 if the regexp cannot be parsed. @@ -67,12 +75,9 @@ class RE2::Set { RE2::Options options_; RE2::Anchor anchor_; std::vector<Elem> elem_; - re2::Prog* prog_; bool compiled_; int size_; - - Set(const Set&) = delete; - Set& operator=(const Set&) = delete; + std::unique_ptr<re2::Prog> prog_; }; } // namespace re2 diff --git a/chromium/third_party/re2/src/re2/testing/backtrack.cc b/chromium/third_party/re2/src/re2/testing/backtrack.cc index 3bab1e7776d..216d259802f 100644 --- a/chromium/third_party/re2/src/re2/testing/backtrack.cc +++ b/chromium/third_party/re2/src/re2/testing/backtrack.cc @@ -29,6 +29,7 @@ #include "util/util.h" #include "util/logging.h" +#include "re2/pod_array.h" #include "re2/prog.h" #include "re2/regexp.h" @@ -53,7 +54,6 @@ namespace re2 { class Backtracker { public: explicit Backtracker(Prog* prog); - ~Backtracker(); bool Search(const StringPiece& text, const StringPiece& context, bool anchored, bool longest, @@ -79,9 +79,8 @@ class Backtracker { int nsubmatch_; // # of submatches to fill in // Search state - const char* cap_[64]; // capture registers - uint32_t *visited_; // bitmap: (Inst*, char*) pairs already backtracked - size_t nvisited_; // # of words in bitmap + const char* cap_[64]; // capture registers + PODArray<uint32_t> visited_; // bitmap: (Inst*, char*) pairs visited Backtracker(const Backtracker&) = delete; Backtracker& operator=(const Backtracker&) = delete; @@ -93,13 +92,7 @@ Backtracker::Backtracker(Prog* prog) longest_(false), endmatch_(false), submatch_(NULL), - nsubmatch_(0), - visited_(NULL), - nvisited_(0) { -} - -Backtracker::~Backtracker() { - delete[] visited_; + nsubmatch_(0) { } // Runs a backtracking search. @@ -133,10 +126,10 @@ bool Backtracker::Search(const StringPiece& text, const StringPiece& context, // Allocate new visited_ bitmap -- size is proportional // to text, so have to reallocate on each call to Search. - delete[] visited_; - nvisited_ = (prog_->size()*(text.size()+1) + 31)/32; - visited_ = new uint32_t[nvisited_]; - memset(visited_, 0, nvisited_*sizeof visited_[0]); + int nvisited = prog_->size() * static_cast<int>(text.size()+1); + nvisited = (nvisited + 31) / 32; + visited_ = PODArray<uint32_t>(nvisited); + memset(visited_.data(), 0, nvisited*sizeof visited_[0]); // Anchored search must start at text.begin(). if (anchored_) { @@ -166,8 +159,9 @@ bool Backtracker::Visit(int id, const char* p) { // either it didn't match or it did but we're hoping for a better match. // Either way, don't go down that road again. CHECK(p <= text_.data() + text_.size()); - size_t n = id*(text_.size()+1) + (p - text_.data()); - CHECK_LT(n/32, nvisited_); + int n = id * static_cast<int>(text_.size()+1) + + static_cast<int>(p-text_.data()); + CHECK_LT(n/32, visited_.size()); if (visited_[n/32] & (1 << (n&31))) return false; visited_[n/32] |= 1 << (n&31); diff --git a/chromium/third_party/re2/src/re2/testing/compile_test.cc b/chromium/third_party/re2/src/re2/testing/compile_test.cc index 4598aa640fb..2096e2f0797 100644 --- a/chromium/third_party/re2/src/re2/testing/compile_test.cc +++ b/chromium/third_party/re2/src/re2/testing/compile_test.cc @@ -236,7 +236,7 @@ TEST(TestCompile, InsufficientMemory) { "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$", Regexp::LikePerl, NULL); EXPECT_TRUE(re != NULL); - Prog* prog = re->CompileToProg(920); + Prog* prog = re->CompileToProg(850); // If the memory budget has been exhausted, compilation should fail // and return NULL instead of trying to do anything with NoMatch(). EXPECT_TRUE(prog == NULL); diff --git a/chromium/third_party/re2/src/re2/testing/filtered_re2_test.cc b/chromium/third_party/re2/src/re2/testing/filtered_re2_test.cc index deef2f87d62..c788fdadc49 100644 --- a/chromium/third_party/re2/src/re2/testing/filtered_re2_test.cc +++ b/chromium/third_party/re2/src/re2/testing/filtered_re2_test.cc @@ -7,6 +7,7 @@ #include <memory> #include <string> #include <vector> +#include <utility> #include "util/test.h" #include "util/logging.h" @@ -291,4 +292,49 @@ TEST(FilteredRE2Test, EmptyStringInStringSetBug) { "EmptyStringInStringSetBug", &v)); } +TEST(FilteredRE2Test, MoveSemantics) { + FilterTestVars v1; + int id; + v1.f.Add("foo\\d+", v1.opts, &id); + EXPECT_EQ(0, id); + v1.f.Compile(&v1.atoms); + EXPECT_EQ(1, v1.atoms.size()); + EXPECT_EQ("foo", v1.atoms[0]); + v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); + EXPECT_EQ(1, v1.matches.size()); + EXPECT_EQ(0, v1.matches[0]); + v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); + EXPECT_EQ(0, v1.matches.size()); + + // The moved-to object should do what the moved-from object did. + FilterTestVars v2; + v2.f = std::move(v1.f); + v2.f.AllMatches("abc foo1 xyz", {0}, &v2.matches); + EXPECT_EQ(1, v2.matches.size()); + EXPECT_EQ(0, v2.matches[0]); + v2.f.AllMatches("abc bar2 xyz", {0}, &v2.matches); + EXPECT_EQ(0, v2.matches.size()); + + // The moved-from object should have been reset and be reusable. + v1.f.Add("bar\\d+", v1.opts, &id); + EXPECT_EQ(0, id); + v1.f.Compile(&v1.atoms); + EXPECT_EQ(1, v1.atoms.size()); + EXPECT_EQ("bar", v1.atoms[0]); + v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); + EXPECT_EQ(0, v1.matches.size()); + v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); + EXPECT_EQ(1, v1.matches.size()); + EXPECT_EQ(0, v1.matches[0]); + + // Verify that "overwriting" works and also doesn't leak memory. + // (The latter will need a leak detector such as LeakSanitizer.) + v1.f = std::move(v2.f); + v1.f.AllMatches("abc foo1 xyz", {0}, &v1.matches); + EXPECT_EQ(1, v1.matches.size()); + EXPECT_EQ(0, v1.matches[0]); + v1.f.AllMatches("abc bar2 xyz", {0}, &v1.matches); + EXPECT_EQ(0, v1.matches.size()); +} + } // namespace re2 diff --git a/chromium/third_party/re2/src/re2/testing/re2_test.cc b/chromium/third_party/re2/src/re2/testing/re2_test.cc index 52e1f6f066e..41fccf68eb5 100644 --- a/chromium/third_party/re2/src/re2/testing/re2_test.cc +++ b/chromium/third_party/re2/src/re2/testing/re2_test.cc @@ -224,6 +224,15 @@ TEST(RE2, Extract) { ASSERT_EQ(s, "'foo'"); } +TEST(RE2, MaxSubmatchTooLarge) { + std::string s; + ASSERT_FALSE(RE2::Extract("foo", "f(o+)", "\\1\\2", &s)); + s = "foo"; + ASSERT_FALSE(RE2::Replace(&s, "f(o+)", "\\1\\2")); + s = "foo"; + ASSERT_FALSE(RE2::GlobalReplace(&s, "f(o+)", "\\1\\2")); +} + TEST(RE2, Consume) { RE2 r("\\s*(\\w+)"); // matches a word, possibly proceeded by whitespace std::string word; @@ -1268,38 +1277,43 @@ TEST(RE2, CL8622304) { EXPECT_EQ(val, "1,0x2F,030,4,5"); } - // Check that RE2 returns correct regexp pieces on error. // In particular, make sure it returns whole runes // and that it always reports invalid UTF-8. // Also check that Perl error flag piece is big enough. static struct ErrorTest { const char *regexp; - const char *error; + RE2::ErrorCode error_code; + const char *error_arg; } error_tests[] = { - { "ab\\αcd", "\\α" }, - { "ef\\x☺01", "\\x☺0" }, - { "gh\\x1☺01", "\\x1☺" }, - { "ij\\x1", "\\x1" }, - { "kl\\x", "\\x" }, - { "uv\\x{0000☺}", "\\x{0000☺" }, - { "wx\\p{ABC", "\\p{ABC" }, - { "yz(?smiUX:abc)", "(?smiUX" }, // used to return (?s but the error is X - { "aa(?sm☺i", "(?sm☺" }, - { "bb[abc", "[abc" }, - - { "mn\\x1\377", "" }, // no argument string returned for invalid UTF-8 - { "op\377qr", "" }, - { "st\\x{00000\377", "" }, - { "zz\\p{\377}", "" }, - { "zz\\x{00\377}", "" }, - { "zz(?P<name\377>abc)", "" }, + { "ab\\αcd", RE2::ErrorBadEscape, "\\α" }, + { "ef\\x☺01", RE2::ErrorBadEscape, "\\x☺0" }, + { "gh\\x1☺01", RE2::ErrorBadEscape, "\\x1☺" }, + { "ij\\x1", RE2::ErrorBadEscape, "\\x1" }, + { "kl\\x", RE2::ErrorBadEscape, "\\x" }, + { "uv\\x{0000☺}", RE2::ErrorBadEscape, "\\x{0000☺" }, + { "wx\\p{ABC", RE2::ErrorBadCharRange, "\\p{ABC" }, + // used to return (?s but the error is X + { "yz(?smiUX:abc)", RE2::ErrorBadPerlOp, "(?smiUX" }, + { "aa(?sm☺i", RE2::ErrorBadPerlOp, "(?sm☺" }, + { "bb[abc", RE2::ErrorMissingBracket, "[abc" }, + { "abc(def", RE2::ErrorMissingParen, "abc(def" }, + { "abc)def", RE2::ErrorUnexpectedParen, "abc)def" }, + + // no argument string returned for invalid UTF-8 + { "mn\\x1\377", RE2::ErrorBadUTF8, "" }, + { "op\377qr", RE2::ErrorBadUTF8, "" }, + { "st\\x{00000\377", RE2::ErrorBadUTF8, "" }, + { "zz\\p{\377}", RE2::ErrorBadUTF8, "" }, + { "zz\\x{00\377}", RE2::ErrorBadUTF8, "" }, + { "zz(?P<name\377>abc)", RE2::ErrorBadUTF8, "" }, }; -TEST(RE2, ErrorArgs) { +TEST(RE2, ErrorCodeAndArg) { for (size_t i = 0; i < arraysize(error_tests); i++) { RE2 re(error_tests[i].regexp, RE2::Quiet); EXPECT_FALSE(re.ok()); - EXPECT_EQ(re.error_arg(), error_tests[i].error) << re.error(); + EXPECT_EQ(re.error_code(), error_tests[i].error_code) << re.error(); + EXPECT_EQ(re.error_arg(), error_tests[i].error_arg) << re.error(); } } diff --git a/chromium/third_party/re2/src/re2/testing/required_prefix_test.cc b/chromium/third_party/re2/src/re2/testing/required_prefix_test.cc index 54600456a88..c00e81234eb 100644 --- a/chromium/third_party/re2/src/re2/testing/required_prefix_test.cc +++ b/chromium/third_party/re2/src/re2/testing/required_prefix_test.cc @@ -6,6 +6,7 @@ #include "util/test.h" #include "util/logging.h" +#include "re2/prog.h" #include "re2/regexp.h" namespace re2 { @@ -19,10 +20,13 @@ struct PrefixTest { }; static PrefixTest tests[] = { - // If the regexp is missing a ^, there's no required prefix. - { "abc", false }, + // Empty cases. { "", false }, { "(?m)^", false }, + { "(?-m)^", false }, + + // If the regexp has no ^, there's no required prefix. + { "abc", false }, // If the regexp immediately goes into // something not a literal match, there's no required prefix. @@ -53,15 +57,15 @@ TEST(RequiredPrefix, SimpleTests) { bool f; Regexp* s; ASSERT_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s)) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf") + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8") << " " << re->Dump(); if (t.return_value) { ASSERT_EQ(p, std::string(t.prefix)) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf"); + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); ASSERT_EQ(f, t.foldcase) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf"); + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); ASSERT_EQ(s->ToString(), std::string(t.suffix)) - << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf"); + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); s->Decref(); } re->Decref(); @@ -69,4 +73,73 @@ TEST(RequiredPrefix, SimpleTests) { } } +static PrefixTest for_accel_tests[] = { + // Empty cases. + { "", false }, + { "(?m)^", false }, + { "(?-m)^", false }, + + // If the regexp has a ^, there's no required prefix. + { "^abc", false }, + + // If the regexp immediately goes into + // something not a literal match, there's no required prefix. + { "(abc)", false }, + { "a*", false }, + + // Otherwise, it should work. + { "abc$", true, "abc", false, }, + { "abc", true, "abc", false, }, + { "(?i)abc", true, "abc", true, }, + { "abcd*", true, "abc", false, }, + { "[Aa][Bb]cd*", true, "ab", true, }, + { "ab[Cc]d*", true, "ab", false, }, + { "☺abc", true, "☺abc", false, }, +}; + +TEST(RequiredPrefixForAccel, SimpleTests) { + for (size_t i = 0; i < arraysize(for_accel_tests); i++) { + const PrefixTest& t = for_accel_tests[i]; + for (size_t j = 0; j < 2; j++) { + Regexp::ParseFlags flags = Regexp::LikePerl; + if (j == 0) + flags = flags | Regexp::Latin1; + Regexp* re = Regexp::Parse(t.regexp, flags, NULL); + ASSERT_TRUE(re != NULL) << " " << t.regexp; + + std::string p; + bool f; + ASSERT_EQ(t.return_value, re->RequiredPrefixForAccel(&p, &f)) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8") + << " " << re->Dump(); + if (t.return_value) { + ASSERT_EQ(p, std::string(t.prefix)) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); + ASSERT_EQ(f, t.foldcase) + << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8"); + } + re->Decref(); + } + } +} + +TEST(PrefixAccel, BasicTest) { + Regexp* re = Regexp::Parse("abc\\d+", Regexp::LikePerl, NULL); + ASSERT_TRUE(re != NULL); + Prog* prog = re->CompileToProg(0); + ASSERT_TRUE(prog != NULL); + for (int i = 0; i < 100; i++) { + std::string text(i, 'a'); + const char* p = reinterpret_cast<const char*>( + prog->PrefixAccel(text.data(), text.size())); + EXPECT_TRUE(p == NULL); + text.append("abc"); + p = reinterpret_cast<const char*>( + prog->PrefixAccel(text.data(), text.size())); + EXPECT_EQ(i, p-text.data()); + } + delete prog; + re->Decref(); +} + } // namespace re2 diff --git a/chromium/third_party/re2/src/re2/testing/set_test.cc b/chromium/third_party/re2/src/re2/testing/set_test.cc index 61d1cf295f3..5a760c4b5e2 100644 --- a/chromium/third_party/re2/src/re2/testing/set_test.cc +++ b/chromium/third_party/re2/src/re2/testing/set_test.cc @@ -5,6 +5,7 @@ #include <stddef.h> #include <string> #include <vector> +#include <utility> #include "util/test.h" #include "util/logging.h" @@ -201,4 +202,29 @@ TEST(Set, Prefix) { ASSERT_EQ(v[0], 0); } +TEST(Set, MoveSemantics) { + RE2::Set s1(RE2::DefaultOptions, RE2::UNANCHORED); + ASSERT_EQ(s1.Add("foo\\d+", NULL), 0); + ASSERT_EQ(s1.Compile(), true); + ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), true); + ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), false); + + // The moved-to object should do what the moved-from object did. + RE2::Set s2 = std::move(s1); + ASSERT_EQ(s2.Match("abc foo1 xyz", NULL), true); + ASSERT_EQ(s2.Match("abc bar2 xyz", NULL), false); + + // The moved-from object should have been reset and be reusable. + ASSERT_EQ(s1.Add("bar\\d+", NULL), 0); + ASSERT_EQ(s1.Compile(), true); + ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), false); + ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), true); + + // Verify that "overwriting" works and also doesn't leak memory. + // (The latter will need a leak detector such as LeakSanitizer.) + s1 = std::move(s2); + ASSERT_EQ(s1.Match("abc foo1 xyz", NULL), true); + ASSERT_EQ(s1.Match("abc bar2 xyz", NULL), false); +} + } // namespace re2 diff --git a/chromium/third_party/re2/src/re2/walker-inl.h b/chromium/third_party/re2/src/re2/walker-inl.h index 310be54bd7e..8cfe79960b1 100644 --- a/chromium/third_party/re2/src/re2/walker-inl.h +++ b/chromium/third_party/re2/src/re2/walker-inl.h @@ -89,7 +89,7 @@ template<typename T> class Regexp::Walker { private: // Walk state for the entire traversal. - std::stack<WalkState<T> >* stack_; + std::stack<WalkState<T>> stack_; bool stopped_early_; int max_visits_; @@ -134,24 +134,22 @@ template<typename T> struct WalkState { }; template<typename T> Regexp::Walker<T>::Walker() { - stack_ = new std::stack<WalkState<T> >; stopped_early_ = false; } template<typename T> Regexp::Walker<T>::~Walker() { Reset(); - delete stack_; } // Clears the stack. Should never be necessary, since // Walk always enters and exits with an empty stack. // Logs DFATAL if stack is not already clear. template<typename T> void Regexp::Walker<T>::Reset() { - if (stack_ && stack_->size() > 0) { + if (!stack_.empty()) { LOG(DFATAL) << "Stack not empty."; - while (stack_->size() > 0) { - delete[] stack_->top().child_args; - stack_->pop(); + while (!stack_.empty()) { + delete[] stack_.top().child_args; + stack_.pop(); } } } @@ -165,12 +163,12 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg, return top_arg; } - stack_->push(WalkState<T>(re, top_arg)); + stack_.push(WalkState<T>(re, top_arg)); WalkState<T>* s; for (;;) { T t; - s = &stack_->top(); + s = &stack_.top(); Regexp* re = s->re; switch (s->n) { case -1: { @@ -201,7 +199,7 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg, s->child_args[s->n] = Copy(s->child_args[s->n - 1]); s->n++; } else { - stack_->push(WalkState<T>(sub[s->n], s->pre_arg)); + stack_.push(WalkState<T>(sub[s->n], s->pre_arg)); } continue; } @@ -214,12 +212,12 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg, } } - // We've finished stack_->top(). + // We've finished stack_.top(). // Update next guy down. - stack_->pop(); - if (stack_->size() == 0) + stack_.pop(); + if (stack_.empty()) return t; - s = &stack_->top(); + s = &stack_.top(); if (s->child_args != NULL) s->child_args[s->n] = t; else |