1 files changed, 685 insertions, 0 deletions
diff --git a/src/third_party/re2/dist/re2/testing/tester.cc b/src/third_party/re2/dist/re2/testing/tester.cc
new file mode 100644
index 00000000000..d2ec4fb9ea1
--- /dev/null
+++ b/src/third_party/re2/dist/re2/testing/tester.cc
@@ -0,0 +1,685 @@
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Regular expression engine tester -- test all the implementations against each other.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <string>
+
+#include "util/util.h"
+#include "util/flags.h"
+#include "util/logging.h"
+#include "util/strutil.h"
+#include "re2/testing/tester.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+
+DEFINE_FLAG(bool, dump_prog, false, "dump regexp program");
+DEFINE_FLAG(bool, log_okay, false, "log successful runs");
+DEFINE_FLAG(bool, dump_rprog, false, "dump reversed regexp program");
+
+DEFINE_FLAG(int, max_regexp_failures, 100,
+            "maximum number of regexp test failures (-1 = unlimited)");
+
+DEFINE_FLAG(std::string, regexp_engines, "",
+            "pattern to select regexp engines to test");
+
+namespace re2 {
+
+enum {
+  kMaxSubmatch = 1+16,  // $0...$16
+};
+
+const char* engine_names[kEngineMax] = {
+  "Backtrack",
+  "NFA",
+  "DFA",
+  "DFA1",
+  "OnePass",
+  "BitState",
+  "RE2",
+  "RE2a",
+  "RE2b",
+  "PCRE",
+};
+
+// Returns the name of the engine.
+static const char* EngineName(Engine e) {
+  CHECK_GE(e, 0);
+  CHECK_LT(e, arraysize(engine_names));
+  CHECK(engine_names[e] != NULL);
+  return engine_names[e];
+}
+
+// Returns bit mask of engines to use.
+static uint32_t Engines() {
+  static bool did_parse = false;
+  static uint32_t cached_engines = 0;
+
+  if (did_parse)
+    return cached_engines;
+
+  if (GetFlag(FLAGS_regexp_engines).empty()) {
+    cached_engines = ~0;
+  } else {
+    for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
+      if (GetFlag(FLAGS_regexp_engines).find(EngineName(i)) != std::string::npos)
+        cached_engines |= 1<<i;
+  }
+
+  if (cached_engines == 0)
+    LOG(INFO) << "Warning: no engines enabled.";
+  if (!UsingPCRE)
+    cached_engines &= ~(1<<kEnginePCRE);
+  for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
+    if (cached_engines & (1<<i))
+      LOG(INFO) << EngineName(i) << " enabled";
+  }
+
+  did_parse = true;
+  return cached_engines;
+}
+
+// The result of running a match.
+struct TestInstance::Result {
+  Result()
+      : skipped(false),
+        matched(false),
+        untrusted(false),
+        have_submatch(false),
+        have_submatch0(false) {
+    ClearSubmatch();
+  }
+
+  void ClearSubmatch() {
+    for (int i = 0; i < kMaxSubmatch; i++)
+      submatch[i] = StringPiece();
+  }
+
+  bool skipped;         // test skipped: wasn't applicable
+  bool matched;         // found a match
+  bool untrusted;       // don't really trust the answer
+  bool have_submatch;   // computed all submatch info
+  bool have_submatch0;  // computed just submatch[0]
+  StringPiece submatch[kMaxSubmatch];
+};
+
+typedef TestInstance::Result Result;
+
+// Formats a single capture range s in text in the form (a,b)
+// where a and b are the starting and ending offsets of s in text.
+static std::string FormatCapture(const StringPiece& text,
+                                 const StringPiece& s) {
+  if (s.data() == NULL)
+    return "(?,?)";
+  return StringPrintf("(%td,%td)",
+                      s.begin() - text.begin(),
+                      s.end() - text.begin());
+}
+
+// Returns whether text contains non-ASCII (>= 0x80) bytes.
+static bool NonASCII(const StringPiece& text) {
+  for (size_t i = 0; i < text.size(); i++)
+    if ((uint8_t)text[i] >= 0x80)
+      return true;
+  return false;
+}
+
+// Returns string representation of match kind.
+static std::string FormatKind(Prog::MatchKind kind) {
+  switch (kind) {
+    case Prog::kFullMatch:
+      return "full match";
+    case Prog::kLongestMatch:
+      return "longest match";
+    case Prog::kFirstMatch:
+      return "first match";
+    case Prog::kManyMatch:
+      return "many match";
+  }
+  return "???";
+}
+
+// Returns string representation of anchor kind.
+static std::string FormatAnchor(Prog::Anchor anchor) {
+  switch (anchor) {
+    case Prog::kAnchored:
+      return "anchored";
+    case Prog::kUnanchored:
+      return "unanchored";
+  }
+  return "???";
+}
+
+struct ParseMode {
+  Regexp::ParseFlags parse_flags;
+  std::string desc;
+};
+
+static const Regexp::ParseFlags single_line =
+  Regexp::LikePerl;
+static const Regexp::ParseFlags multi_line =
+  static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
+
+static ParseMode parse_modes[] = {
+  { single_line,                   "single-line"          },
+  { single_line|Regexp::Latin1,    "single-line, latin1"  },
+  { multi_line,                    "multiline"            },
+  { multi_line|Regexp::NonGreedy,  "multiline, nongreedy" },
+  { multi_line|Regexp::Latin1,     "multiline, latin1"    },
+};
+
+static std::string FormatMode(Regexp::ParseFlags flags) {
+  for (size_t i = 0; i < arraysize(parse_modes); i++)
+    if (parse_modes[i].parse_flags == flags)
+      return parse_modes[i].desc;
+  return StringPrintf("%#x", static_cast<uint32_t>(flags));
+}
+
+// Constructs and saves all the matching engines that
+// will be required for the given tests.
+TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
+                           Regexp::ParseFlags flags)
+  : regexp_str_(regexp_str),
+    kind_(kind),
+    flags_(flags),
+    error_(false),
+    regexp_(NULL),
+    num_captures_(0),
+    prog_(NULL),
+    rprog_(NULL),
+    re_(NULL),
+    re2_(NULL) {
+
+  VLOG(1) << CEscape(regexp_str);
+
+  // Compile regexp to prog.
+  // Always required - needed for backtracking (reference implementation).
+  RegexpStatus status;
+  regexp_ = Regexp::Parse(regexp_str, flags, &status);
+  if (regexp_ == NULL) {
+    LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
+              << " mode: " << FormatMode(flags);
+    error_ = true;
+    return;
+  }
+  num_captures_ = regexp_->NumCaptures();
+  prog_ = regexp_->CompileToProg(0);
+  if (prog_ == NULL) {
+    LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
+    error_ = true;
+    return;
+  }
+  if (GetFlag(FLAGS_dump_prog)) {
+    LOG(INFO) << "Prog for "
+              << " regexp "
+              << CEscape(regexp_str_)
+              << " (" << FormatKind(kind_)
+              << ", " << FormatMode(flags_)
+              << ")\n"
+              << prog_->Dump();
+  }
+
+  // Compile regexp to reversed prog.  Only needed for DFA engines.
+  if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
+    rprog_ = regexp_->CompileToReverseProg(0);
+    if (rprog_ == NULL) {
+      LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
+      error_ = true;
+      return;
+    }
+    if (GetFlag(FLAGS_dump_rprog))
+      LOG(INFO) << rprog_->Dump();
+  }
+
+  // Create re string that will be used for RE and RE2.
+  std::string re = std::string(regexp_str);
+  // Accomodate flags.
+  // Regexp::Latin1 will be accomodated below.
+  if (!(flags & Regexp::OneLine))
+    re = "(?m)" + re;
+  if (flags & Regexp::NonGreedy)
+    re = "(?U)" + re;
+  if (flags & Regexp::DotNL)
+    re = "(?s)" + re;
+
+  // Compile regexp to RE2.
+  if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
+    RE2::Options options;
+    if (flags & Regexp::Latin1)
+      options.set_encoding(RE2::Options::EncodingLatin1);
+    if (kind_ == Prog::kLongestMatch)
+      options.set_longest_match(true);
+    re2_ = new RE2(re, options);
+    if (!re2_->error().empty()) {
+      LOG(INFO) << "Cannot RE2: " << CEscape(re);
+      error_ = true;
+      return;
+    }
+  }
+
+  // Compile regexp to RE.
+  // PCRE as exposed by the RE interface isn't always usable.
+  // 1. It disagrees about handling of empty-string reptitions
+  //    like matching (a*)* against "b".  PCRE treats the (a*) as
+  //    occurring once, while we treat it as occurring not at all.
+  // 2. It treats $ as this weird thing meaning end of string
+  //    or before the \n at the end of the string.
+  // 3. It doesn't implement POSIX leftmost-longest matching.
+  // 4. It lets \s match vertical tab.
+  // MimicsPCRE() detects 1 and 2.
+  if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
+      kind_ != Prog::kLongestMatch) {
+    PCRE_Options o;
+    o.set_option(PCRE::UTF8);
+    if (flags & Regexp::Latin1)
+      o.set_option(PCRE::None);
+    // PCRE has interface bug keeping us from finding $0, so
+    // add one more layer of parens.
+    re_ = new PCRE("("+re+")", o);
+    if (!re_->error().empty()) {
+      LOG(INFO) << "Cannot PCRE: " << CEscape(re);
+      error_ = true;
+      return;
+    }
+  }
+}
+
+TestInstance::~TestInstance() {
+  if (regexp_)
+    regexp_->Decref();
+  delete prog_;
+  delete rprog_;
+  delete re_;
+  delete re2_;
+}
+
+// Runs a single search using the named engine type.
+// This interface hides all the irregularities of the various
+// engine interfaces from the rest of this file.
+void TestInstance::RunSearch(Engine type,
+                             const StringPiece& orig_text,
+                             const StringPiece& orig_context,
+                             Prog::Anchor anchor,
+                             Result* result) {
+  if (regexp_ == NULL) {
+    result->skipped = true;
+    return;
+  }
+  int nsubmatch = 1 + num_captures_;  // NumCaptures doesn't count $0
+  if (nsubmatch > kMaxSubmatch)
+    nsubmatch = kMaxSubmatch;
+
+  StringPiece text = orig_text;
+  StringPiece context = orig_context;
+
+  switch (type) {
+    default:
+      LOG(FATAL) << "Bad RunSearch type: " << (int)type;
+
+    case kEngineBacktrack:
+      if (prog_ == NULL) {
+        result->skipped = true;
+        break;
+      }
+      result->matched =
+        prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
+                                     result->submatch, nsubmatch);
+      result->have_submatch = true;
+      break;
+
+    case kEngineNFA:
+      if (prog_ == NULL) {
+        result->skipped = true;
+        break;
+      }
+      result->matched =
+        prog_->SearchNFA(text, context, anchor, kind_,
+                        result->submatch, nsubmatch);
+      result->have_submatch = true;
+      break;
+
+    case kEngineDFA:
+      if (prog_ == NULL) {
+        result->skipped = true;
+        break;
+      }
+      result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
+                                         &result->skipped, NULL);
+      break;
+
+    case kEngineDFA1:
+      if (prog_ == NULL || rprog_ == NULL) {
+        result->skipped = true;
+        break;
+      }
+      result->matched =
+        prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
+                         &result->skipped, NULL);
+      // If anchored, no need for second run,
+      // but do it anyway to find more bugs.
+      if (result->matched) {
+        if (!rprog_->SearchDFA(result->submatch[0], context,
+                               Prog::kAnchored, Prog::kLongestMatch,
+                               result->submatch,
+                               &result->skipped, NULL)) {
+          LOG(ERROR) << "Reverse DFA inconsistency: "
+                     << CEscape(regexp_str_)
+                     << " on " << CEscape(text);
+          result->matched = false;
+        }
+      }
+      result->have_submatch0 = true;
+      break;
+
+    case kEngineOnePass:
+      if (prog_ == NULL ||
+          !prog_->IsOnePass() ||
+          anchor == Prog::kUnanchored ||
+          nsubmatch > Prog::kMaxOnePassCapture) {
+        result->skipped = true;
+        break;
+      }
+      result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
+                                      result->submatch, nsubmatch);
+      result->have_submatch = true;
+      break;
+
+    case kEngineBitState:
+      if (prog_ == NULL ||
+          !prog_->CanBitState()) {
+        result->skipped = true;
+        break;
+      }
+      result->matched = prog_->SearchBitState(text, context, anchor, kind_,
+                                              result->submatch, nsubmatch);
+      result->have_submatch = true;
+      break;
+
+    case kEngineRE2:
+    case kEngineRE2a:
+    case kEngineRE2b: {
+      if (!re2_ || text.end() != context.end()) {
+        result->skipped = true;
+        break;
+      }
+
+      RE2::Anchor re_anchor;
+      if (anchor == Prog::kAnchored)
+        re_anchor = RE2::ANCHOR_START;
+      else
+        re_anchor = RE2::UNANCHORED;
+      if (kind_ == Prog::kFullMatch)
+        re_anchor = RE2::ANCHOR_BOTH;
+
+      result->matched = re2_->Match(
+          context,
+          static_cast<size_t>(text.begin() - context.begin()),
+          static_cast<size_t>(text.end() - context.begin()),
+          re_anchor,
+          result->submatch,
+          nsubmatch);
+      result->have_submatch = nsubmatch > 0;
+      break;
+    }
+
+    case kEnginePCRE: {
+      if (!re_ || text.begin() != context.begin() ||
+          text.end() != context.end()) {
+        result->skipped = true;
+        break;
+      }
+
+      // In Perl/PCRE, \v matches any character considered vertical
+      // whitespace, not just vertical tab. Regexp::MimicsPCRE() is
+      // unable to handle all cases of this, unfortunately, so just
+      // catch them here. :(
+      if (regexp_str_.find("\\v") != StringPiece::npos &&
+          (text.find('\n') != StringPiece::npos ||
+           text.find('\f') != StringPiece::npos ||
+           text.find('\r') != StringPiece::npos)) {
+        result->skipped = true;
+        break;
+      }
+
+      // PCRE 8.34 or so started allowing vertical tab to match \s,
+      // following a change made in Perl 5.18. RE2 does not.
+      if ((regexp_str_.find("\\s") != StringPiece::npos ||
+           regexp_str_.find("\\S") != StringPiece::npos) &&
+          text.find('\v') != StringPiece::npos) {
+        result->skipped = true;
+        break;
+      }
+
+      const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
+      PCRE::Arg *a = new PCRE::Arg[nsubmatch];
+      for (int i = 0; i < nsubmatch; i++) {
+        a[i] = PCRE::Arg(&result->submatch[i]);
+        argptr[i] = &a[i];
+      }
+      size_t consumed;
+      PCRE::Anchor pcre_anchor;
+      if (anchor == Prog::kAnchored)
+        pcre_anchor = PCRE::ANCHOR_START;
+      else
+        pcre_anchor = PCRE::UNANCHORED;
+      if (kind_ == Prog::kFullMatch)
+        pcre_anchor = PCRE::ANCHOR_BOTH;
+      re_->ClearHitLimit();
+      result->matched =
+        re_->DoMatch(text,
+                     pcre_anchor,
+                     &consumed,
+                     argptr, nsubmatch);
+      if (re_->HitLimit()) {
+        result->untrusted = true;
+        delete[] argptr;
+        delete[] a;
+        break;
+      }
+      result->have_submatch = true;
+      delete[] argptr;
+      delete[] a;
+      break;
+    }
+  }
+
+  if (!result->matched)
+    result->ClearSubmatch();
+}
+
+// Checks whether r is okay given that correct is the right answer.
+// Specifically, r's answers have to match (but it doesn't have to
+// claim to have all the answers).
+static bool ResultOkay(const Result& r, const Result& correct) {
+  if (r.skipped)
+    return true;
+  if (r.matched != correct.matched)
+    return false;
+  if (r.have_submatch || r.have_submatch0) {
+    for (int i = 0; i < kMaxSubmatch; i++) {
+      if (correct.submatch[i].data() != r.submatch[i].data() ||
+          correct.submatch[i].size() != r.submatch[i].size())
+        return false;
+      if (!r.have_submatch)
+        break;
+    }
+  }
+  return true;
+}
+
+// Runs a single test.
+bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
+                           Prog::Anchor anchor) {
+  // Backtracking is the gold standard.
+  Result correct;
+  RunSearch(kEngineBacktrack, text, context, anchor, &correct);
+  if (correct.skipped) {
+    if (regexp_ == NULL)
+      return true;
+    LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
+               << " " << FormatMode(flags_);
+    return false;
+  }
+  VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
+          << " text " << CEscape(text)
+          << " (" << FormatKind(kind_)
+          << ", " << FormatAnchor(anchor)
+          << ", " << FormatMode(flags_)
+          << ")";
+
+  // Compare the others.
+  bool all_okay = true;
+  for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
+    if (!(Engines() & (1<<i)))
+      continue;
+
+    Result r;
+    RunSearch(i, text, context, anchor, &r);
+    if (ResultOkay(r, correct)) {
+      if (GetFlag(FLAGS_log_okay))
+        LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
+      continue;
+    }
+
+    // We disagree with PCRE on the meaning of some Unicode matches.
+    // In particular, we treat non-ASCII UTF-8 as non-word characters.
+    // We also treat "empty" character sets like [^\w\W] as being
+    // impossible to match, while PCRE apparently excludes some code
+    // points (e.g., 0x0080) from both \w and \W.
+    if (i == kEnginePCRE && NonASCII(text))
+      continue;
+
+    if (!r.untrusted)
+      all_okay = false;
+
+    LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
+             context, anchor);
+    if (r.matched != correct.matched) {
+      if (r.matched) {
+        LOG(INFO) << "   Should not match (but does).";
+      } else {
+        LOG(INFO) << "   Should match (but does not).";
+        continue;
+      }
+    }
+    for (int i = 0; i < 1+num_captures_; i++) {
+      if (r.submatch[i].data() != correct.submatch[i].data() ||
+          r.submatch[i].size() != correct.submatch[i].size()) {
+        LOG(INFO) <<
+          StringPrintf("   $%d: should be %s is %s",
+                       i,
+                       FormatCapture(text, correct.submatch[i]).c_str(),
+                       FormatCapture(text, r.submatch[i]).c_str());
+      } else {
+        LOG(INFO) <<
+          StringPrintf("   $%d: %s ok", i,
+                       FormatCapture(text, r.submatch[i]).c_str());
+      }
+    }
+  }
+
+  if (!all_okay) {
+    // This will be initialised once (after flags have been initialised)
+    // and that is desirable because we want to enforce a global limit.
+    static int max_regexp_failures = GetFlag(FLAGS_max_regexp_failures);
+    if (max_regexp_failures > 0 && --max_regexp_failures == 0)
+      LOG(QFATAL) << "Too many regexp failures.";
+  }
+
+  return all_okay;
+}
+
+void TestInstance::LogMatch(const char* prefix, Engine e,
+                            const StringPiece& text, const StringPiece& context,
+                            Prog::Anchor anchor) {
+  LOG(INFO) << prefix
+    << EngineName(e)
+    << " regexp "
+    << CEscape(regexp_str_)
+    << " "
+    << CEscape(regexp_->ToString())
+    << " text "
+    << CEscape(text)
+    << " ("
+    << text.begin() - context.begin()
+    << ","
+    << text.end() - context.begin()
+    << ") of context "
+    << CEscape(context)
+    << " (" << FormatKind(kind_)
+    << ", " << FormatAnchor(anchor)
+    << ", " << FormatMode(flags_)
+    << ")";
+}
+
+static Prog::MatchKind kinds[] = {
+  Prog::kFirstMatch,
+  Prog::kLongestMatch,
+  Prog::kFullMatch,
+};
+
+// Test all possible match kinds and parse modes.
+Tester::Tester(const StringPiece& regexp) {
+  error_ = false;
+  for (size_t i = 0; i < arraysize(kinds); i++) {
+    for (size_t j = 0; j < arraysize(parse_modes); j++) {
+      TestInstance* t = new TestInstance(regexp, kinds[i],
+                                         parse_modes[j].parse_flags);
+      error_ |= t->error();
+      v_.push_back(t);
+    }
+  }
+}
+
+Tester::~Tester() {
+  for (size_t i = 0; i < v_.size(); i++)
+    delete v_[i];
+}
+
+bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
+                         Prog::Anchor anchor) {
+  bool okay = true;
+  for (size_t i = 0; i < v_.size(); i++)
+    okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
+  return okay;
+}
+
+static Prog::Anchor anchors[] = {
+  Prog::kAnchored,
+  Prog::kUnanchored
+};
+
+bool Tester::TestInput(const StringPiece& text) {
+  bool okay = TestInputInContext(text, text);
+  if (!text.empty()) {
+    StringPiece sp;
+    sp = text;
+    sp.remove_prefix(1);
+    okay &= TestInputInContext(sp, text);
+    sp = text;
+    sp.remove_suffix(1);
+    okay &= TestInputInContext(sp, text);
+  }
+  return okay;
+}
+
+bool Tester::TestInputInContext(const StringPiece& text,
+                                const StringPiece& context) {
+  bool okay = true;
+  for (size_t i = 0; i < arraysize(anchors); i++)
+    okay &= TestCase(text, context, anchors[i]);
+  return okay;
+}
+
+bool TestRegexpOnText(const StringPiece& regexp,
+                      const StringPiece& text) {
+  Tester t(regexp);
+  return t.TestInput(text);
+}
+
+}  // namespace re2