diff options
Diffstat (limited to 'src/third_party/re2/dist/re2/testing/tester.cc')
-rw-r--r-- | src/third_party/re2/dist/re2/testing/tester.cc | 685 |
1 files changed, 685 insertions, 0 deletions
diff --git a/src/third_party/re2/dist/re2/testing/tester.cc b/src/third_party/re2/dist/re2/testing/tester.cc new file mode 100644 index 00000000000..d2ec4fb9ea1 --- /dev/null +++ b/src/third_party/re2/dist/re2/testing/tester.cc @@ -0,0 +1,685 @@ +// Copyright 2008 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Regular expression engine tester -- test all the implementations against each other. + +#include <stddef.h> +#include <stdint.h> +#include <string.h> +#include <string> + +#include "util/util.h" +#include "util/flags.h" +#include "util/logging.h" +#include "util/strutil.h" +#include "re2/testing/tester.h" +#include "re2/prog.h" +#include "re2/re2.h" +#include "re2/regexp.h" + +DEFINE_FLAG(bool, dump_prog, false, "dump regexp program"); +DEFINE_FLAG(bool, log_okay, false, "log successful runs"); +DEFINE_FLAG(bool, dump_rprog, false, "dump reversed regexp program"); + +DEFINE_FLAG(int, max_regexp_failures, 100, + "maximum number of regexp test failures (-1 = unlimited)"); + +DEFINE_FLAG(std::string, regexp_engines, "", + "pattern to select regexp engines to test"); + +namespace re2 { + +enum { + kMaxSubmatch = 1+16, // $0...$16 +}; + +const char* engine_names[kEngineMax] = { + "Backtrack", + "NFA", + "DFA", + "DFA1", + "OnePass", + "BitState", + "RE2", + "RE2a", + "RE2b", + "PCRE", +}; + +// Returns the name of the engine. +static const char* EngineName(Engine e) { + CHECK_GE(e, 0); + CHECK_LT(e, arraysize(engine_names)); + CHECK(engine_names[e] != NULL); + return engine_names[e]; +} + +// Returns bit mask of engines to use. +static uint32_t Engines() { + static bool did_parse = false; + static uint32_t cached_engines = 0; + + if (did_parse) + return cached_engines; + + if (GetFlag(FLAGS_regexp_engines).empty()) { + cached_engines = ~0; + } else { + for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) + if (GetFlag(FLAGS_regexp_engines).find(EngineName(i)) != std::string::npos) + cached_engines |= 1<<i; + } + + if (cached_engines == 0) + LOG(INFO) << "Warning: no engines enabled."; + if (!UsingPCRE) + cached_engines &= ~(1<<kEnginePCRE); + for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) { + if (cached_engines & (1<<i)) + LOG(INFO) << EngineName(i) << " enabled"; + } + + did_parse = true; + return cached_engines; +} + +// The result of running a match. +struct TestInstance::Result { + Result() + : skipped(false), + matched(false), + untrusted(false), + have_submatch(false), + have_submatch0(false) { + ClearSubmatch(); + } + + void ClearSubmatch() { + for (int i = 0; i < kMaxSubmatch; i++) + submatch[i] = StringPiece(); + } + + bool skipped; // test skipped: wasn't applicable + bool matched; // found a match + bool untrusted; // don't really trust the answer + bool have_submatch; // computed all submatch info + bool have_submatch0; // computed just submatch[0] + StringPiece submatch[kMaxSubmatch]; +}; + +typedef TestInstance::Result Result; + +// Formats a single capture range s in text in the form (a,b) +// where a and b are the starting and ending offsets of s in text. +static std::string FormatCapture(const StringPiece& text, + const StringPiece& s) { + if (s.data() == NULL) + return "(?,?)"; + return StringPrintf("(%td,%td)", + s.begin() - text.begin(), + s.end() - text.begin()); +} + +// Returns whether text contains non-ASCII (>= 0x80) bytes. +static bool NonASCII(const StringPiece& text) { + for (size_t i = 0; i < text.size(); i++) + if ((uint8_t)text[i] >= 0x80) + return true; + return false; +} + +// Returns string representation of match kind. +static std::string FormatKind(Prog::MatchKind kind) { + switch (kind) { + case Prog::kFullMatch: + return "full match"; + case Prog::kLongestMatch: + return "longest match"; + case Prog::kFirstMatch: + return "first match"; + case Prog::kManyMatch: + return "many match"; + } + return "???"; +} + +// Returns string representation of anchor kind. +static std::string FormatAnchor(Prog::Anchor anchor) { + switch (anchor) { + case Prog::kAnchored: + return "anchored"; + case Prog::kUnanchored: + return "unanchored"; + } + return "???"; +} + +struct ParseMode { + Regexp::ParseFlags parse_flags; + std::string desc; +}; + +static const Regexp::ParseFlags single_line = + Regexp::LikePerl; +static const Regexp::ParseFlags multi_line = + static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine); + +static ParseMode parse_modes[] = { + { single_line, "single-line" }, + { single_line|Regexp::Latin1, "single-line, latin1" }, + { multi_line, "multiline" }, + { multi_line|Regexp::NonGreedy, "multiline, nongreedy" }, + { multi_line|Regexp::Latin1, "multiline, latin1" }, +}; + +static std::string FormatMode(Regexp::ParseFlags flags) { + for (size_t i = 0; i < arraysize(parse_modes); i++) + if (parse_modes[i].parse_flags == flags) + return parse_modes[i].desc; + return StringPrintf("%#x", static_cast<uint32_t>(flags)); +} + +// Constructs and saves all the matching engines that +// will be required for the given tests. +TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind, + Regexp::ParseFlags flags) + : regexp_str_(regexp_str), + kind_(kind), + flags_(flags), + error_(false), + regexp_(NULL), + num_captures_(0), + prog_(NULL), + rprog_(NULL), + re_(NULL), + re2_(NULL) { + + VLOG(1) << CEscape(regexp_str); + + // Compile regexp to prog. + // Always required - needed for backtracking (reference implementation). + RegexpStatus status; + regexp_ = Regexp::Parse(regexp_str, flags, &status); + if (regexp_ == NULL) { + LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) + << " mode: " << FormatMode(flags); + error_ = true; + return; + } + num_captures_ = regexp_->NumCaptures(); + prog_ = regexp_->CompileToProg(0); + if (prog_ == NULL) { + LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_); + error_ = true; + return; + } + if (GetFlag(FLAGS_dump_prog)) { + LOG(INFO) << "Prog for " + << " regexp " + << CEscape(regexp_str_) + << " (" << FormatKind(kind_) + << ", " << FormatMode(flags_) + << ")\n" + << prog_->Dump(); + } + + // Compile regexp to reversed prog. Only needed for DFA engines. + if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) { + rprog_ = regexp_->CompileToReverseProg(0); + if (rprog_ == NULL) { + LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_); + error_ = true; + return; + } + if (GetFlag(FLAGS_dump_rprog)) + LOG(INFO) << rprog_->Dump(); + } + + // Create re string that will be used for RE and RE2. + std::string re = std::string(regexp_str); + // Accomodate flags. + // Regexp::Latin1 will be accomodated below. + if (!(flags & Regexp::OneLine)) + re = "(?m)" + re; + if (flags & Regexp::NonGreedy) + re = "(?U)" + re; + if (flags & Regexp::DotNL) + re = "(?s)" + re; + + // Compile regexp to RE2. + if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) { + RE2::Options options; + if (flags & Regexp::Latin1) + options.set_encoding(RE2::Options::EncodingLatin1); + if (kind_ == Prog::kLongestMatch) + options.set_longest_match(true); + re2_ = new RE2(re, options); + if (!re2_->error().empty()) { + LOG(INFO) << "Cannot RE2: " << CEscape(re); + error_ = true; + return; + } + } + + // Compile regexp to RE. + // PCRE as exposed by the RE interface isn't always usable. + // 1. It disagrees about handling of empty-string reptitions + // like matching (a*)* against "b". PCRE treats the (a*) as + // occurring once, while we treat it as occurring not at all. + // 2. It treats $ as this weird thing meaning end of string + // or before the \n at the end of the string. + // 3. It doesn't implement POSIX leftmost-longest matching. + // 4. It lets \s match vertical tab. + // MimicsPCRE() detects 1 and 2. + if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() && + kind_ != Prog::kLongestMatch) { + PCRE_Options o; + o.set_option(PCRE::UTF8); + if (flags & Regexp::Latin1) + o.set_option(PCRE::None); + // PCRE has interface bug keeping us from finding $0, so + // add one more layer of parens. + re_ = new PCRE("("+re+")", o); + if (!re_->error().empty()) { + LOG(INFO) << "Cannot PCRE: " << CEscape(re); + error_ = true; + return; + } + } +} + +TestInstance::~TestInstance() { + if (regexp_) + regexp_->Decref(); + delete prog_; + delete rprog_; + delete re_; + delete re2_; +} + +// Runs a single search using the named engine type. +// This interface hides all the irregularities of the various +// engine interfaces from the rest of this file. +void TestInstance::RunSearch(Engine type, + const StringPiece& orig_text, + const StringPiece& orig_context, + Prog::Anchor anchor, + Result* result) { + if (regexp_ == NULL) { + result->skipped = true; + return; + } + int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0 + if (nsubmatch > kMaxSubmatch) + nsubmatch = kMaxSubmatch; + + StringPiece text = orig_text; + StringPiece context = orig_context; + + switch (type) { + default: + LOG(FATAL) << "Bad RunSearch type: " << (int)type; + + case kEngineBacktrack: + if (prog_ == NULL) { + result->skipped = true; + break; + } + result->matched = + prog_->UnsafeSearchBacktrack(text, context, anchor, kind_, + result->submatch, nsubmatch); + result->have_submatch = true; + break; + + case kEngineNFA: + if (prog_ == NULL) { + result->skipped = true; + break; + } + result->matched = + prog_->SearchNFA(text, context, anchor, kind_, + result->submatch, nsubmatch); + result->have_submatch = true; + break; + + case kEngineDFA: + if (prog_ == NULL) { + result->skipped = true; + break; + } + result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL, + &result->skipped, NULL); + break; + + case kEngineDFA1: + if (prog_ == NULL || rprog_ == NULL) { + result->skipped = true; + break; + } + result->matched = + prog_->SearchDFA(text, context, anchor, kind_, result->submatch, + &result->skipped, NULL); + // If anchored, no need for second run, + // but do it anyway to find more bugs. + if (result->matched) { + if (!rprog_->SearchDFA(result->submatch[0], context, + Prog::kAnchored, Prog::kLongestMatch, + result->submatch, + &result->skipped, NULL)) { + LOG(ERROR) << "Reverse DFA inconsistency: " + << CEscape(regexp_str_) + << " on " << CEscape(text); + result->matched = false; + } + } + result->have_submatch0 = true; + break; + + case kEngineOnePass: + if (prog_ == NULL || + !prog_->IsOnePass() || + anchor == Prog::kUnanchored || + nsubmatch > Prog::kMaxOnePassCapture) { + result->skipped = true; + break; + } + result->matched = prog_->SearchOnePass(text, context, anchor, kind_, + result->submatch, nsubmatch); + result->have_submatch = true; + break; + + case kEngineBitState: + if (prog_ == NULL || + !prog_->CanBitState()) { + result->skipped = true; + break; + } + result->matched = prog_->SearchBitState(text, context, anchor, kind_, + result->submatch, nsubmatch); + result->have_submatch = true; + break; + + case kEngineRE2: + case kEngineRE2a: + case kEngineRE2b: { + if (!re2_ || text.end() != context.end()) { + result->skipped = true; + break; + } + + RE2::Anchor re_anchor; + if (anchor == Prog::kAnchored) + re_anchor = RE2::ANCHOR_START; + else + re_anchor = RE2::UNANCHORED; + if (kind_ == Prog::kFullMatch) + re_anchor = RE2::ANCHOR_BOTH; + + result->matched = re2_->Match( + context, + static_cast<size_t>(text.begin() - context.begin()), + static_cast<size_t>(text.end() - context.begin()), + re_anchor, + result->submatch, + nsubmatch); + result->have_submatch = nsubmatch > 0; + break; + } + + case kEnginePCRE: { + if (!re_ || text.begin() != context.begin() || + text.end() != context.end()) { + result->skipped = true; + break; + } + + // In Perl/PCRE, \v matches any character considered vertical + // whitespace, not just vertical tab. Regexp::MimicsPCRE() is + // unable to handle all cases of this, unfortunately, so just + // catch them here. :( + if (regexp_str_.find("\\v") != StringPiece::npos && + (text.find('\n') != StringPiece::npos || + text.find('\f') != StringPiece::npos || + text.find('\r') != StringPiece::npos)) { + result->skipped = true; + break; + } + + // PCRE 8.34 or so started allowing vertical tab to match \s, + // following a change made in Perl 5.18. RE2 does not. + if ((regexp_str_.find("\\s") != StringPiece::npos || + regexp_str_.find("\\S") != StringPiece::npos) && + text.find('\v') != StringPiece::npos) { + result->skipped = true; + break; + } + + const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch]; + PCRE::Arg *a = new PCRE::Arg[nsubmatch]; + for (int i = 0; i < nsubmatch; i++) { + a[i] = PCRE::Arg(&result->submatch[i]); + argptr[i] = &a[i]; + } + size_t consumed; + PCRE::Anchor pcre_anchor; + if (anchor == Prog::kAnchored) + pcre_anchor = PCRE::ANCHOR_START; + else + pcre_anchor = PCRE::UNANCHORED; + if (kind_ == Prog::kFullMatch) + pcre_anchor = PCRE::ANCHOR_BOTH; + re_->ClearHitLimit(); + result->matched = + re_->DoMatch(text, + pcre_anchor, + &consumed, + argptr, nsubmatch); + if (re_->HitLimit()) { + result->untrusted = true; + delete[] argptr; + delete[] a; + break; + } + result->have_submatch = true; + delete[] argptr; + delete[] a; + break; + } + } + + if (!result->matched) + result->ClearSubmatch(); +} + +// Checks whether r is okay given that correct is the right answer. +// Specifically, r's answers have to match (but it doesn't have to +// claim to have all the answers). +static bool ResultOkay(const Result& r, const Result& correct) { + if (r.skipped) + return true; + if (r.matched != correct.matched) + return false; + if (r.have_submatch || r.have_submatch0) { + for (int i = 0; i < kMaxSubmatch; i++) { + if (correct.submatch[i].data() != r.submatch[i].data() || + correct.submatch[i].size() != r.submatch[i].size()) + return false; + if (!r.have_submatch) + break; + } + } + return true; +} + +// Runs a single test. +bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor) { + // Backtracking is the gold standard. + Result correct; + RunSearch(kEngineBacktrack, text, context, anchor, &correct); + if (correct.skipped) { + if (regexp_ == NULL) + return true; + LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_) + << " " << FormatMode(flags_); + return false; + } + VLOG(1) << "Try: regexp " << CEscape(regexp_str_) + << " text " << CEscape(text) + << " (" << FormatKind(kind_) + << ", " << FormatAnchor(anchor) + << ", " << FormatMode(flags_) + << ")"; + + // Compare the others. + bool all_okay = true; + for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) { + if (!(Engines() & (1<<i))) + continue; + + Result r; + RunSearch(i, text, context, anchor, &r); + if (ResultOkay(r, correct)) { + if (GetFlag(FLAGS_log_okay)) + LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor); + continue; + } + + // We disagree with PCRE on the meaning of some Unicode matches. + // In particular, we treat non-ASCII UTF-8 as non-word characters. + // We also treat "empty" character sets like [^\w\W] as being + // impossible to match, while PCRE apparently excludes some code + // points (e.g., 0x0080) from both \w and \W. + if (i == kEnginePCRE && NonASCII(text)) + continue; + + if (!r.untrusted) + all_okay = false; + + LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text, + context, anchor); + if (r.matched != correct.matched) { + if (r.matched) { + LOG(INFO) << " Should not match (but does)."; + } else { + LOG(INFO) << " Should match (but does not)."; + continue; + } + } + for (int i = 0; i < 1+num_captures_; i++) { + if (r.submatch[i].data() != correct.submatch[i].data() || + r.submatch[i].size() != correct.submatch[i].size()) { + LOG(INFO) << + StringPrintf(" $%d: should be %s is %s", + i, + FormatCapture(text, correct.submatch[i]).c_str(), + FormatCapture(text, r.submatch[i]).c_str()); + } else { + LOG(INFO) << + StringPrintf(" $%d: %s ok", i, + FormatCapture(text, r.submatch[i]).c_str()); + } + } + } + + if (!all_okay) { + // This will be initialised once (after flags have been initialised) + // and that is desirable because we want to enforce a global limit. + static int max_regexp_failures = GetFlag(FLAGS_max_regexp_failures); + if (max_regexp_failures > 0 && --max_regexp_failures == 0) + LOG(QFATAL) << "Too many regexp failures."; + } + + return all_okay; +} + +void TestInstance::LogMatch(const char* prefix, Engine e, + const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor) { + LOG(INFO) << prefix + << EngineName(e) + << " regexp " + << CEscape(regexp_str_) + << " " + << CEscape(regexp_->ToString()) + << " text " + << CEscape(text) + << " (" + << text.begin() - context.begin() + << "," + << text.end() - context.begin() + << ") of context " + << CEscape(context) + << " (" << FormatKind(kind_) + << ", " << FormatAnchor(anchor) + << ", " << FormatMode(flags_) + << ")"; +} + +static Prog::MatchKind kinds[] = { + Prog::kFirstMatch, + Prog::kLongestMatch, + Prog::kFullMatch, +}; + +// Test all possible match kinds and parse modes. +Tester::Tester(const StringPiece& regexp) { + error_ = false; + for (size_t i = 0; i < arraysize(kinds); i++) { + for (size_t j = 0; j < arraysize(parse_modes); j++) { + TestInstance* t = new TestInstance(regexp, kinds[i], + parse_modes[j].parse_flags); + error_ |= t->error(); + v_.push_back(t); + } + } +} + +Tester::~Tester() { + for (size_t i = 0; i < v_.size(); i++) + delete v_[i]; +} + +bool Tester::TestCase(const StringPiece& text, const StringPiece& context, + Prog::Anchor anchor) { + bool okay = true; + for (size_t i = 0; i < v_.size(); i++) + okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor)); + return okay; +} + +static Prog::Anchor anchors[] = { + Prog::kAnchored, + Prog::kUnanchored +}; + +bool Tester::TestInput(const StringPiece& text) { + bool okay = TestInputInContext(text, text); + if (!text.empty()) { + StringPiece sp; + sp = text; + sp.remove_prefix(1); + okay &= TestInputInContext(sp, text); + sp = text; + sp.remove_suffix(1); + okay &= TestInputInContext(sp, text); + } + return okay; +} + +bool Tester::TestInputInContext(const StringPiece& text, + const StringPiece& context) { + bool okay = true; + for (size_t i = 0; i < arraysize(anchors); i++) + okay &= TestCase(text, context, anchors[i]); + return okay; +} + +bool TestRegexpOnText(const StringPiece& regexp, + const StringPiece& text) { + Tester t(regexp); + return t.TestInput(text); +} + +} // namespace re2 |