summaryrefslogtreecommitdiff
path: root/src/third_party/re2/dist/re2/testing/compile_test.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/re2/dist/re2/testing/compile_test.cc')
-rw-r--r--src/third_party/re2/dist/re2/testing/compile_test.cc427
1 files changed, 427 insertions, 0 deletions
diff --git a/src/third_party/re2/dist/re2/testing/compile_test.cc b/src/third_party/re2/dist/re2/testing/compile_test.cc
new file mode 100644
index 00000000000..47188307815
--- /dev/null
+++ b/src/third_party/re2/dist/re2/testing/compile_test.cc
@@ -0,0 +1,427 @@
+// Copyright 2007 The RE2 Authors. All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test prog.cc, compile.cc
+
+#include <string>
+
+#include "util/test.h"
+#include "util/logging.h"
+#include "re2/regexp.h"
+#include "re2/prog.h"
+
+namespace re2 {
+
+// Simple input/output tests checking that
+// the regexp compiles to the expected code.
+// These are just to sanity check the basic implementation.
+// The real confidence tests happen by testing the NFA/DFA
+// that run the compiled code.
+
+struct Test {
+ const char* regexp;
+ const char* code;
+};
+
+static Test tests[] = {
+ { "a",
+ "3. byte [61-61] 0 -> 4\n"
+ "4. match! 0\n" },
+ { "ab",
+ "3. byte [61-61] 0 -> 4\n"
+ "4. byte [62-62] 0 -> 5\n"
+ "5. match! 0\n" },
+ { "a|c",
+ "3+ byte [61-61] 0 -> 5\n"
+ "4. byte [63-63] 0 -> 5\n"
+ "5. match! 0\n" },
+ { "a|b",
+ "3. byte [61-62] 0 -> 4\n"
+ "4. match! 0\n" },
+ { "[ab]",
+ "3. byte [61-62] 0 -> 4\n"
+ "4. match! 0\n" },
+ { "a+",
+ "3. byte [61-61] 0 -> 4\n"
+ "4+ nop -> 3\n"
+ "5. match! 0\n" },
+ { "a+?",
+ "3. byte [61-61] 0 -> 4\n"
+ "4+ match! 0\n"
+ "5. nop -> 3\n" },
+ { "a*",
+ "3+ byte [61-61] 1 -> 3\n"
+ "4. match! 0\n" },
+ { "a*?",
+ "3+ match! 0\n"
+ "4. byte [61-61] 0 -> 3\n" },
+ { "a?",
+ "3+ byte [61-61] 1 -> 5\n"
+ "4. nop -> 5\n"
+ "5. match! 0\n" },
+ { "a??",
+ "3+ nop -> 5\n"
+ "4. byte [61-61] 0 -> 5\n"
+ "5. match! 0\n" },
+ { "a{4}",
+ "3. byte [61-61] 0 -> 4\n"
+ "4. byte [61-61] 0 -> 5\n"
+ "5. byte [61-61] 0 -> 6\n"
+ "6. byte [61-61] 0 -> 7\n"
+ "7. match! 0\n" },
+ { "(a)",
+ "3. capture 2 -> 4\n"
+ "4. byte [61-61] 0 -> 5\n"
+ "5. capture 3 -> 6\n"
+ "6. match! 0\n" },
+ { "(?:a)",
+ "3. byte [61-61] 0 -> 4\n"
+ "4. match! 0\n" },
+ { "",
+ "3. match! 0\n" },
+ { ".",
+ "3+ byte [00-09] 0 -> 5\n"
+ "4. byte [0b-ff] 0 -> 5\n"
+ "5. match! 0\n" },
+ { "[^ab]",
+ "3+ byte [00-09] 0 -> 6\n"
+ "4+ byte [0b-60] 0 -> 6\n"
+ "5. byte [63-ff] 0 -> 6\n"
+ "6. match! 0\n" },
+ { "[Aa]",
+ "3. byte/i [61-61] 0 -> 4\n"
+ "4. match! 0\n" },
+ { "\\C+",
+ "3. byte [00-ff] 0 -> 4\n"
+ "4+ altmatch -> 5 | 6\n"
+ "5+ nop -> 3\n"
+ "6. match! 0\n" },
+ { "\\C*",
+ "3+ altmatch -> 4 | 5\n"
+ "4+ byte [00-ff] 1 -> 3\n"
+ "5. match! 0\n" },
+ { "\\C?",
+ "3+ byte [00-ff] 1 -> 5\n"
+ "4. nop -> 5\n"
+ "5. match! 0\n" },
+ // Issue 20992936
+ { "[[-`]",
+ "3. byte [5b-60] 0 -> 4\n"
+ "4. match! 0\n" },
+ // Issue 310
+ { "(?:|a)*",
+ "3+ nop -> 7\n"
+ "4. nop -> 9\n"
+ "5+ nop -> 7\n"
+ "6. nop -> 9\n"
+ "7+ nop -> 5\n"
+ "8. byte [61-61] 0 -> 5\n"
+ "9. match! 0\n" },
+ { "(?:|a)+",
+ "3+ nop -> 5\n"
+ "4. byte [61-61] 0 -> 5\n"
+ "5+ nop -> 3\n"
+ "6. match! 0\n" },
+};
+
+TEST(TestRegexpCompileToProg, Simple) {
+ int failed = 0;
+ for (size_t i = 0; i < arraysize(tests); i++) {
+ const re2::Test& t = tests[i];
+ Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
+ if (re == NULL) {
+ LOG(ERROR) << "Cannot parse: " << t.regexp;
+ failed++;
+ continue;
+ }
+ Prog* prog = re->CompileToProg(0);
+ if (prog == NULL) {
+ LOG(ERROR) << "Cannot compile: " << t.regexp;
+ re->Decref();
+ failed++;
+ continue;
+ }
+ ASSERT_TRUE(re->CompileToProg(1) == NULL);
+ std::string s = prog->Dump();
+ if (s != t.code) {
+ LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
+ LOG(ERROR) << "Want:\n" << t.code;
+ LOG(ERROR) << "Got:\n" << s;
+ failed++;
+ }
+ delete prog;
+ re->Decref();
+ }
+ EXPECT_EQ(failed, 0);
+}
+
+static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags,
+ std::string* bytemap) {
+ Regexp* re = Regexp::Parse(pattern, flags, NULL);
+ EXPECT_TRUE(re != NULL);
+
+ {
+ Prog* prog = re->CompileToProg(0);
+ EXPECT_TRUE(prog != NULL);
+ *bytemap = prog->DumpByteMap();
+ delete prog;
+ }
+
+ {
+ Prog* prog = re->CompileToReverseProg(0);
+ EXPECT_TRUE(prog != NULL);
+ EXPECT_EQ(*bytemap, prog->DumpByteMap());
+ delete prog;
+ }
+
+ re->Decref();
+}
+
+TEST(TestCompile, Latin1Ranges) {
+ // The distinct byte ranges involved in the Latin-1 dot ([^\n]).
+
+ std::string bytemap;
+
+ DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap);
+ EXPECT_EQ("[00-09] -> 0\n"
+ "[0a-0a] -> 1\n"
+ "[0b-ff] -> 0\n",
+ bytemap);
+}
+
+TEST(TestCompile, OtherByteMapTests) {
+ std::string bytemap;
+
+ // Test that "absent" ranges are mapped to the same byte class.
+ DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap);
+ EXPECT_EQ("[00-2f] -> 0\n"
+ "[30-39] -> 1\n"
+ "[3a-40] -> 0\n"
+ "[41-46] -> 1\n"
+ "[47-60] -> 0\n"
+ "[61-66] -> 1\n"
+ "[67-ff] -> 0\n",
+ bytemap);
+
+ // Test the byte classes for \b.
+ DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap);
+ EXPECT_EQ("[00-2f] -> 0\n"
+ "[30-39] -> 1\n"
+ "[3a-40] -> 0\n"
+ "[41-5a] -> 1\n"
+ "[5b-5e] -> 0\n"
+ "[5f-5f] -> 1\n"
+ "[60-60] -> 0\n"
+ "[61-7a] -> 1\n"
+ "[7b-ff] -> 0\n",
+ bytemap);
+
+ // Bug in the ASCII case-folding optimization created too many byte classes.
+ DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap);
+ EXPECT_EQ("[00-5e] -> 0\n"
+ "[5f-5f] -> 1\n"
+ "[60-ff] -> 0\n",
+ bytemap);
+}
+
+TEST(TestCompile, UTF8Ranges) {
+ // The distinct byte ranges involved in the UTF-8 dot ([^\n]).
+ // Once, erroneously split between 0x3f and 0x40 because it is
+ // a 6-bit boundary.
+
+ std::string bytemap;
+
+ DumpByteMap(".", Regexp::PerlX, &bytemap);
+ EXPECT_EQ("[00-09] -> 0\n"
+ "[0a-0a] -> 1\n"
+ "[0b-7f] -> 0\n"
+ "[80-bf] -> 2\n"
+ "[c0-c1] -> 1\n"
+ "[c2-df] -> 3\n"
+ "[e0-ef] -> 4\n"
+ "[f0-f4] -> 5\n"
+ "[f5-ff] -> 1\n",
+ bytemap);
+}
+
+TEST(TestCompile, InsufficientMemory) {
+ Regexp* re = Regexp::Parse(
+ "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
+ Regexp::LikePerl, NULL);
+ EXPECT_TRUE(re != NULL);
+ Prog* prog = re->CompileToProg(850);
+ // If the memory budget has been exhausted, compilation should fail
+ // and return NULL instead of trying to do anything with NoMatch().
+ EXPECT_TRUE(prog == NULL);
+ re->Decref();
+}
+
+static void Dump(StringPiece pattern, Regexp::ParseFlags flags,
+ std::string* forward, std::string* reverse) {
+ Regexp* re = Regexp::Parse(pattern, flags, NULL);
+ EXPECT_TRUE(re != NULL);
+
+ if (forward != NULL) {
+ Prog* prog = re->CompileToProg(0);
+ EXPECT_TRUE(prog != NULL);
+ *forward = prog->Dump();
+ delete prog;
+ }
+
+ if (reverse != NULL) {
+ Prog* prog = re->CompileToReverseProg(0);
+ EXPECT_TRUE(prog != NULL);
+ *reverse = prog->Dump();
+ delete prog;
+ }
+
+ re->Decref();
+}
+
+TEST(TestCompile, Bug26705922) {
+ // Bug in the compiler caused inefficient bytecode to be generated for Unicode
+ // groups: common suffixes were cached, but common prefixes were not factored.
+
+ std::string forward, reverse;
+
+ Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse);
+ EXPECT_EQ("3. byte [f0-f0] 0 -> 4\n"
+ "4. byte [90-90] 0 -> 5\n"
+ "5. byte [80-80] 0 -> 6\n"
+ "6+ byte [80-80] 0 -> 8\n"
+ "7. byte [90-90] 0 -> 8\n"
+ "8. match! 0\n",
+ forward);
+ EXPECT_EQ("3+ byte [80-80] 0 -> 5\n"
+ "4. byte [90-90] 0 -> 5\n"
+ "5. byte [80-80] 0 -> 6\n"
+ "6. byte [90-90] 0 -> 7\n"
+ "7. byte [f0-f0] 0 -> 8\n"
+ "8. match! 0\n",
+ reverse);
+
+ Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse);
+ EXPECT_EQ("3+ byte [e8-ef] 0 -> 5\n"
+ "4. byte [f0-f0] 0 -> 8\n"
+ "5. byte [80-bf] 0 -> 6\n"
+ "6. byte [80-bf] 0 -> 7\n"
+ "7. match! 0\n"
+ "8. byte [90-90] 0 -> 5\n",
+ forward);
+ EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
+ "4. byte [80-bf] 0 -> 5\n"
+ "5+ byte [e8-ef] 0 -> 7\n"
+ "6. byte [90-90] 0 -> 8\n"
+ "7. match! 0\n"
+ "8. byte [f0-f0] 0 -> 7\n",
+ reverse);
+
+ Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, &forward, &reverse);
+ EXPECT_EQ("3+ byte [c2-df] 0 -> 6\n"
+ "4+ byte [e0-ef] 0 -> 8\n"
+ "5. byte [f0-f4] 0 -> 9\n"
+ "6. byte [80-bf] 0 -> 7\n"
+ "7. match! 0\n"
+ "8. byte [80-bf] 0 -> 6\n"
+ "9. byte [80-bf] 0 -> 8\n",
+ forward);
+ EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
+ "4+ byte [c2-df] 0 -> 6\n"
+ "5. byte [80-bf] 0 -> 7\n"
+ "6. match! 0\n"
+ "7+ byte [e0-ef] 0 -> 6\n"
+ "8. byte [80-bf] 0 -> 9\n"
+ "9. byte [f0-f4] 0 -> 6\n",
+ reverse);
+}
+
+TEST(TestCompile, Bug35237384) {
+ // Bug in the compiler caused inefficient bytecode to be generated for
+ // nested nullable subexpressions.
+
+ std::string forward;
+
+ Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
+ EXPECT_EQ("3+ byte [61-61] 1 -> 3\n"
+ "4. nop -> 5\n"
+ "5+ byte [61-61] 1 -> 5\n"
+ "6. nop -> 7\n"
+ "7+ byte [61-61] 1 -> 7\n"
+ "8. match! 0\n",
+ forward);
+
+ Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
+ EXPECT_EQ("3+ nop -> 28\n"
+ "4. nop -> 30\n"
+ "5+ byte [61-61] 1 -> 5\n"
+ "6. nop -> 32\n"
+ "7+ byte [61-61] 1 -> 7\n"
+ "8. nop -> 26\n"
+ "9+ byte [61-61] 1 -> 9\n"
+ "10. nop -> 20\n"
+ "11+ byte [62-62] 1 -> 11\n"
+ "12. nop -> 20\n"
+ "13+ byte [62-62] 1 -> 13\n"
+ "14. nop -> 26\n"
+ "15+ byte [62-62] 1 -> 15\n"
+ "16. nop -> 32\n"
+ "17+ nop -> 9\n"
+ "18. nop -> 11\n"
+ "19. match! 0\n"
+ "20+ nop -> 17\n"
+ "21. nop -> 19\n"
+ "22+ nop -> 7\n"
+ "23. nop -> 13\n"
+ "24+ nop -> 17\n"
+ "25. nop -> 19\n"
+ "26+ nop -> 22\n"
+ "27. nop -> 24\n"
+ "28+ nop -> 5\n"
+ "29. nop -> 15\n"
+ "30+ nop -> 22\n"
+ "31. nop -> 24\n"
+ "32+ nop -> 28\n"
+ "33. nop -> 30\n",
+ forward);
+
+ Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
+ EXPECT_EQ("3+ nop -> 36\n"
+ "4+ nop -> 31\n"
+ "5. nop -> 33\n"
+ "6+ byte [00-09] 0 -> 8\n"
+ "7. byte [0b-ff] 0 -> 8\n"
+ "8+ nop -> 6\n"
+ "9+ nop -> 29\n"
+ "10. nop -> 28\n"
+ "11+ byte [00-09] 0 -> 13\n"
+ "12. byte [0b-ff] 0 -> 13\n"
+ "13+ nop -> 11\n"
+ "14+ nop -> 26\n"
+ "15. nop -> 28\n"
+ "16+ byte [00-09] 0 -> 18\n"
+ "17. byte [0b-ff] 0 -> 18\n"
+ "18+ nop -> 16\n"
+ "19+ nop -> 36\n"
+ "20. nop -> 33\n"
+ "21+ byte [00-09] 0 -> 23\n"
+ "22. byte [0b-ff] 0 -> 23\n"
+ "23+ nop -> 21\n"
+ "24+ nop -> 31\n"
+ "25. nop -> 33\n"
+ "26+ nop -> 28\n"
+ "27. byte [53-53] 0 -> 11\n"
+ "28. match! 0\n"
+ "29+ nop -> 28\n"
+ "30. byte [53-53] 0 -> 6\n"
+ "31+ nop -> 33\n"
+ "32. byte [53-53] 0 -> 21\n"
+ "33+ nop -> 29\n"
+ "34+ nop -> 26\n"
+ "35. nop -> 28\n"
+ "36+ nop -> 33\n"
+ "37. byte [53-53] 0 -> 16\n",
+ forward);
+}
+
+} // namespace re2