1 files changed, 247 insertions, 0 deletions
diff --git a/src/third_party/re2/dist/re2/testing/possible_match_test.cc b/src/third_party/re2/dist/re2/testing/possible_match_test.cc
new file mode 100644
index 00000000000..0ec90ae1e50
--- /dev/null
+++ b/src/third_party/re2/dist/re2/testing/possible_match_test.cc
@@ -0,0 +1,247 @@
+// Copyright 2006-2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <string.h>
+#include <string>
+#include <vector>
+
+#include "util/test.h"
+#include "util/logging.h"
+#include "util/strutil.h"
+#include "re2/prog.h"
+#include "re2/re2.h"
+#include "re2/regexp.h"
+#include "re2/testing/exhaustive_tester.h"
+#include "re2/testing/regexp_generator.h"
+#include "re2/testing/string_generator.h"
+
+namespace re2 {
+
+// Test that C++ strings are compared as uint8s, not int8s.
+// PossibleMatchRange doesn't depend on this, but callers probably will.
+TEST(CplusplusStrings, EightBit) {
+  std::string s = "\x70";
+  std::string t = "\xA0";
+  EXPECT_LT(s, t);
+}
+
+struct PrefixTest {
+  const char* regexp;
+  int maxlen;
+  const char* min;
+  const char* max;
+};
+
+static PrefixTest tests[] = {
+  { "",                  10,  "",           "",        },
+  { "Abcdef",            10,  "Abcdef",     "Abcdef"   },
+  { "abc(def|ghi)",      10,  "abcdef",     "abcghi"   },
+  { "a+hello",           10,  "aa",         "ahello"   },
+  { "a*hello",           10,  "a",          "hello"    },
+  { "def|abc",           10,  "abc",        "def"      },
+  { "a(b)(c)[d]",        10,  "abcd",       "abcd"     },
+  { "ab(cab|cat)",       10,  "abcab",      "abcat"    },
+  { "ab(cab|ca)x",       10,  "abcabx",     "abcax"    },
+  { "(ab|x)(c|de)",      10,  "abc",        "xde"      },
+  { "(ab|x)?(c|z)?",     10,  "",           "z"        },
+  { "[^\\s\\S]",         10,  "",           ""         },
+  { "(abc)+",             5,  "abc",        "abcac"    },
+  { "(abc)+",             2,  "ab",         "ac"       },
+  { "(abc)+",             1,  "a",          "b"        },
+  { "[a\xC3\xA1]",        4,  "a",          "\xC3\xA1" },
+  { "a*",                10,  "",           "ab"       },
+
+  { "(?i)Abcdef",        10,  "ABCDEF",     "abcdef"   },
+  { "(?i)abc(def|ghi)",  10,  "ABCDEF",     "abcghi"   },
+  { "(?i)a+hello",       10,  "AA",         "ahello"   },
+  { "(?i)a*hello",       10,  "A",          "hello"    },
+  { "(?i)def|abc",       10,  "ABC",        "def"      },
+  { "(?i)a(b)(c)[d]",    10,  "ABCD",       "abcd"     },
+  { "(?i)ab(cab|cat)",   10,  "ABCAB",      "abcat"    },
+  { "(?i)ab(cab|ca)x",   10,  "ABCABX",     "abcax"    },
+  { "(?i)(ab|x)(c|de)",  10,  "ABC",        "xde"      },
+  { "(?i)(ab|x)?(c|z)?", 10,  "",           "z"        },
+  { "(?i)[^\\s\\S]",     10,  "",           ""         },
+  { "(?i)(abc)+",         5,  "ABC",        "abcac"    },
+  { "(?i)(abc)+",         2,  "AB",         "ac"       },
+  { "(?i)(abc)+",         1,  "A",          "b"        },
+  { "(?i)[a\xC3\xA1]",    4,  "A",          "\xC3\xA1" },
+  { "(?i)a*",            10,  "",           "ab"       },
+  { "(?i)A*",            10,  "",           "ab"       },
+
+  { "\\AAbcdef",         10,  "Abcdef",     "Abcdef"   },
+  { "\\Aabc(def|ghi)",   10,  "abcdef",     "abcghi"   },
+  { "\\Aa+hello",        10,  "aa",         "ahello"   },
+  { "\\Aa*hello",        10,  "a",          "hello"    },
+  { "\\Adef|abc",        10,  "abc",        "def"      },
+  { "\\Aa(b)(c)[d]",     10,  "abcd",       "abcd"     },
+  { "\\Aab(cab|cat)",    10,  "abcab",      "abcat"    },
+  { "\\Aab(cab|ca)x",    10,  "abcabx",     "abcax"    },
+  { "\\A(ab|x)(c|de)",   10,  "abc",        "xde"      },
+  { "\\A(ab|x)?(c|z)?",  10,  "",           "z"        },
+  { "\\A[^\\s\\S]",      10,  "",           ""         },
+  { "\\A(abc)+",          5,  "abc",        "abcac"    },
+  { "\\A(abc)+",          2,  "ab",         "ac"       },
+  { "\\A(abc)+",          1,  "a",          "b"        },
+  { "\\A[a\xC3\xA1]",     4,  "a",          "\xC3\xA1" },
+  { "\\Aa*",             10,  "",           "ab"       },
+
+  { "(?i)\\AAbcdef",         10,  "ABCDEF",     "abcdef"   },
+  { "(?i)\\Aabc(def|ghi)",   10,  "ABCDEF",     "abcghi"   },
+  { "(?i)\\Aa+hello",        10,  "AA",         "ahello"   },
+  { "(?i)\\Aa*hello",        10,  "A",          "hello"    },
+  { "(?i)\\Adef|abc",        10,  "ABC",        "def"      },
+  { "(?i)\\Aa(b)(c)[d]",     10,  "ABCD",       "abcd"     },
+  { "(?i)\\Aab(cab|cat)",    10,  "ABCAB",      "abcat"    },
+  { "(?i)\\Aab(cab|ca)x",    10,  "ABCABX",     "abcax"    },
+  { "(?i)\\A(ab|x)(c|de)",   10,  "ABC",        "xde"      },
+  { "(?i)\\A(ab|x)?(c|z)?",  10,  "",           "z"        },
+  { "(?i)\\A[^\\s\\S]",      10,  "",           ""         },
+  { "(?i)\\A(abc)+",          5,  "ABC",        "abcac"    },
+  { "(?i)\\A(abc)+",          2,  "AB",         "ac"       },
+  { "(?i)\\A(abc)+",          1,  "A",          "b"        },
+  { "(?i)\\A[a\xC3\xA1]",     4,  "A",          "\xC3\xA1" },
+  { "(?i)\\Aa*",             10,  "",           "ab"       },
+  { "(?i)\\AA*",             10,  "",           "ab"       },
+};
+
+TEST(PossibleMatchRange, HandWritten) {
+  for (size_t i = 0; i < arraysize(tests); i++) {
+    for (size_t j = 0; j < 2; j++) {
+      const PrefixTest& t = tests[i];
+      std::string min, max;
+      if (j == 0) {
+        LOG(INFO) << "Checking regexp=" << CEscape(t.regexp);
+        Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
+        ASSERT_TRUE(re != NULL);
+        Prog* prog = re->CompileToProg(0);
+        ASSERT_TRUE(prog != NULL);
+        ASSERT_TRUE(prog->PossibleMatchRange(&min, &max, t.maxlen))
+          << " " << t.regexp;
+        delete prog;
+        re->Decref();
+      } else {
+        ASSERT_TRUE(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen));
+      }
+      EXPECT_EQ(t.min, min) << t.regexp;
+      EXPECT_EQ(t.max, max) << t.regexp;
+    }
+  }
+}
+
+// Test cases where PossibleMatchRange should return false.
+TEST(PossibleMatchRange, Failures) {
+  std::string min, max;
+
+  // Fails because no room to write max.
+  EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0));
+
+  // Fails because there is no max -- any non-empty string matches
+  // or begins a match.  Have to use Latin-1 input, because there
+  // are no valid UTF-8 strings beginning with byte 0xFF.
+  EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1).
+               PossibleMatchRange(&min, &max, 10))
+      << "min=" << CEscape(min) << ", max=" << CEscape(max);
+  EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1).
+               PossibleMatchRange(&min, &max, 10))
+      << "min=" << CEscape(min) << ", max=" << CEscape(max);
+  EXPECT_FALSE(RE2(".+hello", RE2::Latin1).
+               PossibleMatchRange(&min, &max, 10))
+      << "min=" << CEscape(min) << ", max=" << CEscape(max);
+  EXPECT_FALSE(RE2(".*hello", RE2::Latin1).
+               PossibleMatchRange(&min, &max, 10))
+      << "min=" << CEscape(min) << ", max=" << CEscape(max);
+  EXPECT_FALSE(RE2(".*", RE2::Latin1).
+               PossibleMatchRange(&min, &max, 10))
+      << "min=" << CEscape(min) << ", max=" << CEscape(max);
+  EXPECT_FALSE(RE2("\\C*").
+               PossibleMatchRange(&min, &max, 10))
+      << "min=" << CEscape(min) << ", max=" << CEscape(max);
+
+  // Fails because it's a malformed regexp.
+  EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10))
+      << "min=" << CEscape(min) << ", max=" << CEscape(max);
+}
+
+// Exhaustive test: generate all regexps within parameters,
+// then generate all strings of a given length over a given alphabet,
+// then check that the prefix information agrees with whether
+// the regexp matches each of the strings.
+class PossibleMatchTester : public RegexpGenerator {
+ public:
+  PossibleMatchTester(int maxatoms,
+                      int maxops,
+                      const std::vector<std::string>& alphabet,
+                      const std::vector<std::string>& ops,
+                      int maxstrlen,
+                      const std::vector<std::string>& stralphabet)
+    : RegexpGenerator(maxatoms, maxops, alphabet, ops),
+      strgen_(maxstrlen, stralphabet),
+      regexps_(0), tests_(0) { }
+
+  int regexps()  { return regexps_; }
+  int tests()    { return tests_; }
+
+  // Needed for RegexpGenerator interface.
+  void HandleRegexp(const std::string& regexp);
+
+ private:
+  StringGenerator strgen_;
+
+  int regexps_;   // Number of HandleRegexp calls
+  int tests_;     // Number of regexp tests.
+
+  PossibleMatchTester(const PossibleMatchTester&) = delete;
+  PossibleMatchTester& operator=(const PossibleMatchTester&) = delete;
+};
+
+// Processes a single generated regexp.
+// Checks that all accepted strings agree with the prefix range.
+void PossibleMatchTester::HandleRegexp(const std::string& regexp) {
+  regexps_++;
+
+  VLOG(3) << CEscape(regexp);
+
+  RE2 re(regexp, RE2::Latin1);
+  ASSERT_EQ(re.error(), "");
+
+  std::string min, max;
+  if(!re.PossibleMatchRange(&min, &max, 10)) {
+    // There's no good max for "\\C*".  Can't use strcmp
+    // because sometimes it gets embedded in more
+    // complicated expressions.
+    if(strstr(regexp.c_str(), "\\C*"))
+      return;
+    LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp);
+  }
+
+  strgen_.Reset();
+  while (strgen_.HasNext()) {
+    const StringPiece& s = strgen_.Next();
+    tests_++;
+    if (!RE2::FullMatch(s, re))
+      continue;
+    ASSERT_GE(s, min) << " regexp: " << regexp << " max: " << max;
+    ASSERT_LE(s, max) << " regexp: " << regexp << " min: " << min;
+  }
+}
+
+TEST(PossibleMatchRange, Exhaustive) {
+  int natom = 3;
+  int noperator = 3;
+  int stringlen = 5;
+  if (RE2_DEBUG_MODE) {
+    natom = 2;
+    noperator = 3;
+    stringlen = 3;
+  }
+  PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"),
+                 RegexpGenerator::EgrepOps(),
+                 stringlen, Explode("ab4"));
+  t.Generate();
+  LOG(INFO) << t.regexps() << " regexps, "
+            << t.tests() << " tests";
+}
+
+}  // namespace re2